diff --git a/mace/core/registry/op_delegator_registry.cc b/mace/core/registry/op_delegator_registry.cc
index 7aed0cb08254c98cdde3b8faf2d7811e46a80d63..f853527ff23af6398c02909f5472d16df8c0e2db 100644
--- a/mace/core/registry/op_delegator_registry.cc
+++ b/mace/core/registry/op_delegator_registry.cc
@@ -60,6 +60,7 @@ MaceStatus OpDelegatorRegistry::Register(const DelegatorInfo &key,
 DelegatorCreator OpDelegatorRegistry::GetCreator(
     const DelegatorInfo &key) const {
   if (registry_.count(key) > 0) {
+    VLOG(3) << "Found delegator creator: " << key.ToString();
     return registry_.at(key);
   }
diff --git a/mace/ops/BUILD.bazel b/mace/ops/BUILD.bazel
index a3b8ec79e3ea8537f0b8ebc8f002bc0ff2249a23..32226af45b1d7798e9a8abc90f6c6381ad0e6b03 100644
--- a/mace/ops/BUILD.bazel
+++ b/mace/ops/BUILD.bazel
@@ -105,6 +105,7 @@ cc_library(
     name = "arm_neon_kernels",
     srcs = glob(
         [
+            "arm/base/*.cc",
            "arm/fp32/*.cc",
            "arm/fp16/gemv.h",
        ],
@@ -121,6 +122,7 @@ cc_library(
     )),
     hdrs = glob(
         [
+            "arm/base/*.h",
            "arm/fp32/*.h",
        ],
    ) + if_quantize_enabled(glob(
diff --git a/mace/ops/CMakeLists.txt b/mace/ops/CMakeLists.txt
index 7de9661d61d05cd6e4ac9d551cbccbb38904f7d4..61b3b15390caa9413201020066febfc888506035 100644
--- a/mace/ops/CMakeLists.txt
+++ b/mace/ops/CMakeLists.txt
@@ -5,6 +5,9 @@ file(GLOB OPS_REF_Q8_KERNELS_SRCS
   ref/q8/*.cc
 )
+file(GLOB OPS_ARM_NEON_BASE_KERNELS_SRCS
+  arm/base/*.cc
+)
 file(GLOB OPS_ARM_NEON_FP32_KERNELS_SRCS
   arm/fp32/*.cc
 )
@@ -32,7 +35,7 @@ if(MACE_ENABLE_QUANTIZE)
 endif(MACE_ENABLE_QUANTIZE)
 
 if(MACE_ENABLE_NEON)
-  set(OPS_SRCS ${OPS_SRCS} ${OPS_ARM_NEON_FP32_KERNELS_SRCS})
+  set(OPS_SRCS ${OPS_SRCS} ${OPS_ARM_NEON_BASE_KERNELS_SRCS} ${OPS_ARM_NEON_FP32_KERNELS_SRCS})
   if(MACE_ENABLE_QUANTIZE)
     set(OPS_SRCS ${OPS_SRCS} ${OPS_ARM_NEON_Q8_KERNELS_SRCS})
   endif(MACE_ENABLE_QUANTIZE)
diff --git a/mace/ops/arm/base/activation.cc b/mace/ops/arm/base/activation.cc
new file mode 100644
index 0000000000000000000000000000000000000000..6531616ae0ab8b2b749e886a3e2f4431ceb50856
--- /dev/null
+++ b/mace/ops/arm/base/activation.cc
@@ -0,0 +1,91 @@
+// Copyright 2020 The MACE Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "mace/ops/arm/base/activation.h"
+
+namespace mace {
+namespace ops {
+namespace arm {
+
+template<typename T>
+MaceStatus Activation<T>::Compute(const OpContext *context,
+                                  const Tensor *input, Tensor *output) {
+  Tensor::MappingGuard input_guard(input);
+  if (input != output) {
+    MACE_RETURN_IF_ERROR(output->ResizeLike(input));
+    Tensor::MappingGuard output_guard(output);
+    DoActivation(context, input, output);
+  } else {
+    DoActivation(context, input, output);
+  }
+
+  return MaceStatus::MACE_SUCCESS;
+}
+
+template<typename T>
+void Activation<T>::DoActivation(const OpContext *context,
+                                 const Tensor *input,
+                                 Tensor *output) {
+  const T *input_data = input->data<T>();
+  T *output_data = output->mutable_data<T>();
+  const index_t size = input->size();
+
+  utils::ThreadPool &thread_pool =
+      context->device()->cpu_runtime()->thread_pool();
+
+  switch (type_) {
+    case RELU: {
+      ActivateRelu(&thread_pool, input_data, size, output_data);
+      break;
+    }
+
+    case RELUX: {
+      ActivateRelux(&thread_pool, input_data, size, output_data);
+      break;
+    }
+
+    case LEAKYRELU: {
+      ActivateLeakyRelu(&thread_pool, input_data, size, output_data);
+      break;
+    }
+
+    case TANH: {
+      ActivateTanh(&thread_pool, input_data, size, output_data);
+      break;
+    }
+
+    case SIGMOID: {
+      ActivateSigmoid(&thread_pool, input_data, size, output_data);
+      break;
+    }
+
+    case NOOP: {
+      break;
+    }
+
+    default: {
+      MACE_NOT_IMPLEMENTED;
+    }
+  }
+}
+
+void RegisterActivationDelegator(OpDelegatorRegistry *registry) {
+  MACE_REGISTER_DELEGATOR(
+      registry, Activation<float>, delegator::ActivationParam,
+      MACE_DELEGATOR_KEY(Activation, DeviceType::CPU, float, ImplType::NEON));
+}
+
+}  // namespace arm
+}  // namespace ops
+}  // namespace mace
diff --git a/mace/ops/arm/base/activation.h b/mace/ops/arm/base/activation.h
new file mode 100644
index 0000000000000000000000000000000000000000..aac917c9b37642ae8b452331cf86d5b0e51407f4
--- /dev/null
+++ b/mace/ops/arm/base/activation.h
@@ -0,0 +1,54 @@
+// Copyright 2020 The MACE Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef MACE_OPS_ARM_BASE_ACTIVATION_H_
+#define MACE_OPS_ARM_BASE_ACTIVATION_H_
+
+#include "mace/ops/delegator/activation.h"
+
+namespace mace {
+namespace ops {
+namespace arm {
+
+template<typename T>
+class Activation : public delegator::Activation {
+ public:
+  explicit Activation(const delegator::ActivationParam &param)
+      : delegator::Activation(param) {}
+  ~Activation() = default;
+
+  MaceStatus Compute(const OpContext *context,
+                     const Tensor *input, Tensor *output) override;
+
+ private:
+  void DoActivation(const OpContext *context,
+                    const Tensor *input, Tensor *output);
+
+  void ActivateRelu(utils::ThreadPool *thread_pool, const T *input_data,
+                    const index_t input_size, T *output_data);
+  void ActivateRelux(utils::ThreadPool *thread_pool, const T *input_data,
+                     const index_t input_size, T *output_data);
+  void ActivateLeakyRelu(utils::ThreadPool *thread_pool, const T *input_data,
+                         const index_t input_size, T *output_data);
+  void ActivateTanh(utils::ThreadPool *thread_pool, const T *input_data,
+                    const index_t input_size, T *output_data);
+  void ActivateSigmoid(utils::ThreadPool *thread_pool, const T *input_data,
+                       const index_t input_size, T *output_data);
+};
+
+}  // namespace arm
+}  // namespace ops
+}  // namespace mace
+
+#endif  // MACE_OPS_ARM_BASE_ACTIVATION_H_
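The per-type leaf routines declared in activation.h (ActivateRelu and friends) are not defined in this patch; they are expected to be specialized in per-architecture files. As a rough sketch only — assuming the existing utils::ThreadPool::Compute1D tiling API and <algorithm> — a scalar float specialization could look like:

    template <>
    void Activation<float>::ActivateRelu(utils::ThreadPool *thread_pool,
                                         const float *input_data,
                                         const index_t input_size,
                                         float *output_data) {
      // Split the flat element range across worker threads.
      thread_pool->Compute1D(
          [=](index_t start, index_t end, index_t step) {
            for (index_t i = start; i < end; i += step) {
              output_data[i] = std::max(input_data[i], 0.f);  // ReLU
            }
          },
          0, input_size, 1);
    }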
diff --git a/mace/ops/arm/base/bias_add.cc b/mace/ops/arm/base/bias_add.cc
new file mode 100644
index 0000000000000000000000000000000000000000..42357a48e8ce04f5199c39e0c428abcd1562f6e6
--- /dev/null
+++ b/mace/ops/arm/base/bias_add.cc
@@ -0,0 +1,79 @@
+// Copyright 2020 The MACE Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "mace/ops/arm/base/bias_add.h"
+
+namespace mace {
+namespace ops {
+namespace arm {
+
+template<typename T>
+MaceStatus BiasAdd<T>::Compute(const OpContext *context, const Tensor *input,
+                               const Tensor *bias, Tensor *output) {
+  if (input != output) {
+    if (bias == nullptr) {
+      output->Copy(*input);
+    } else {
+      MACE_RETURN_IF_ERROR(output->ResizeLike(input));
+      Tensor::MappingGuard input_guard(input);
+      Tensor::MappingGuard bias_guard(bias);
+      Tensor::MappingGuard output_guard(output);
+      AddBias(context, input, bias, output);
+    }
+  } else {
+    if (bias != nullptr) {
+      Tensor::MappingGuard input_guard(input);
+      Tensor::MappingGuard bias_guard(bias);
+      AddBias(context, input, bias, output);
+    }
+  }
+
+  return MaceStatus::MACE_SUCCESS;
+}
+
+template<typename T>
+void BiasAdd<T>::AddBias(const OpContext *context, const Tensor *input,
+                         const Tensor *bias, mace::Tensor *output) {
+  auto input_data = input->data<T>();
+  auto bias_data = bias->data<T>();
+  auto output_data = output->mutable_data<T>();
+
+  const index_t batch = input->dim(0);
+  const index_t channels = input->dim(1);
+
+  const index_t height = input->dim(2);
+  const index_t width = input->dim(3);
+  const index_t image_size = height * width;
+
+  utils::ThreadPool
+      &thread_pool = context->device()->cpu_runtime()->thread_pool();
+
+  if (bias->dim_size() == 1) {
+    Add1DimBias(&thread_pool, input_data, bias_data,
+                output_data, batch, channels, image_size);
+  } else {
+    Add2DimsBias(&thread_pool, input_data, bias_data,
+                 output_data, batch, channels, image_size);
+  }
+}
+
+void RegisterBiasAddDelegator(OpDelegatorRegistry *registry) {
+  MACE_REGISTER_DELEGATOR(
+      registry, BiasAdd<float>, DelegatorParam,
+      MACE_DELEGATOR_KEY(BiasAdd, DeviceType::CPU, float, ImplType::NEON));
+}
+
+}  // namespace arm
+}  // namespace ops
+}  // namespace mace
diff --git a/mace/ops/arm/base/bias_add.h b/mace/ops/arm/base/bias_add.h
new file mode 100644
index 0000000000000000000000000000000000000000..b0e2e1c09ef19a0d77a817bf55f2992282973b31
--- /dev/null
+++ b/mace/ops/arm/base/bias_add.h
@@ -0,0 +1,52 @@
+// Copyright 2020 The MACE Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef MACE_OPS_ARM_BASE_BIAS_ADD_H_
+#define MACE_OPS_ARM_BASE_BIAS_ADD_H_
+
+#include "mace/ops/delegator/bias_add.h"
+
+namespace mace {
+namespace ops {
+namespace arm {
+
+template<typename T>
+class BiasAdd : public delegator::BiasAdd {
+ public:
+  explicit BiasAdd(const DelegatorParam &param) : delegator::BiasAdd(param) {}
+  ~BiasAdd() = default;
+
+  MaceStatus Compute(const OpContext *context, const Tensor *input,
+                     const Tensor *bias, Tensor *output) override;
+
+ private:
+  void AddBias(const OpContext *context, const Tensor *input,
+               const Tensor *bias, Tensor *output);
+
+  void Add1DimBias(utils::ThreadPool *thread_pool, const T *input_data,
+                   const T *bias_data, T *output_data,
+                   const index_t batch, const index_t channels,
+                   const index_t image_size);
+
+  void Add2DimsBias(utils::ThreadPool *thread_pool, const T *input_data,
+                    const T *bias_data, T *output_data,
+                    const index_t batch, const index_t channels,
+                    const index_t image_size);
+};
+
+}  // namespace arm
+}  // namespace ops
+}  // namespace mace
+
+#endif  // MACE_OPS_ARM_BASE_BIAS_ADD_H_
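Compute() dispatches on bias->dim_size() between a per-channel (1-D) bias and a batched (2-D) bias. A minimal scalar sketch of the 1-D case under the NCHW layout used here (the real NEON kernels live in per-architecture files; Compute2D is the existing MACE thread-pool API):

    template <>
    void BiasAdd<float>::Add1DimBias(
        utils::ThreadPool *thread_pool, const float *input_data,
        const float *bias_data, float *output_data, const index_t batch,
        const index_t channels, const index_t image_size) {
      // One bias value per channel, broadcast over the H*W image.
      thread_pool->Compute2D(
          [=](index_t start0, index_t end0, index_t step0,
              index_t start1, index_t end1, index_t step1) {
            for (index_t b = start0; b < end0; b += step0) {
              for (index_t c = start1; c < end1; c += step1) {
                const index_t offset = (b * channels + c) * image_size;
                for (index_t i = 0; i < image_size; ++i) {
                  output_data[offset + i] =
                      input_data[offset + i] + bias_data[c];
                }
              }
            }
          },
          0, batch, 1, 0, channels, 1);
    }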
diff --git a/mace/ops/arm/fp32/conv_2d.cc b/mace/ops/arm/base/conv_2d.cc
similarity index 67%
rename from mace/ops/arm/fp32/conv_2d.cc
rename to mace/ops/arm/base/conv_2d.cc
index 357b47754b0b9bf814302be042f56651883594a5..c5a69ac9704e1de3ea25537ca30c74eac629ddf3 100644
--- a/mace/ops/arm/fp32/conv_2d.cc
+++ b/mace/ops/arm/base/conv_2d.cc
@@ -1,4 +1,4 @@
-// Copyright 2019 The MACE Authors. All Rights Reserved.
+// Copyright 2020 The MACE Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -12,18 +12,17 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "mace/ops/arm/fp32/conv_2d.h"
+#include "mace/ops/arm/base/conv_2d.h"
 
+#include <algorithm>
 #include <utility>
 #include <vector>
-#include <memory>
 
 #include "mace/utils/memory.h"
 
 namespace mace {
 namespace ops {
 namespace arm {
-namespace fp32 {
 
 void Conv2dBase::CalOutputShapeAndInputPadSize(
     const std::vector<index_t> &input_shape,
@@ -164,10 +163,10 @@ MaceStatus Conv2dBase::ResizeOutAndPadInOut(const OpContext *context,
   auto scratch_buffer = context->device()->scratch_buffer();
   const index_t padded_in_size = MACE_EXTRA_BUFFER_PAD_SIZE
       + (is_in_padded ? PadAlignSize(
-      sizeof(float) * batch * in_channels * padded_in_height
+      type_size_ * batch * in_channels * padded_in_height
           * padded_in_width) : 0);
   const index_t padded_out_size = is_out_padded ? PadAlignSize(
-      sizeof(float) * batch * out_channels * padded_out_height
+      type_size_ * batch * out_channels * padded_out_height
          * padded_out_width) : 0;
   scratch_buffer->Rewind();
@@ -176,7 +175,7 @@ MaceStatus Conv2dBase::ResizeOutAndPadInOut(const OpContext *context,
     std::unique_ptr<Tensor> padded_in =
         make_unique<Tensor>(scratch_buffer->Scratch(padded_in_size),
-                            DataType::DT_FLOAT);
+                            input->dtype());
     padded_in->Resize({batch, in_channels, padded_in_height, padded_in_width});
     PadInput(*input, in_pad_size[0], in_pad_size[2], padded_in.get());
     *padded_input = std::move(padded_in);
@@ -185,7 +184,7 @@ MaceStatus Conv2dBase::ResizeOutAndPadInOut(const OpContext *context,
     std::unique_ptr<Tensor> padded_out =
         make_unique<Tensor>(scratch_buffer->Scratch(padded_out_size),
-                            DataType::DT_FLOAT);
+                            output->dtype());
     padded_out->Resize({batch, out_channels, padded_out_height,
                         padded_out_width});
     *padded_output = std::move(padded_out);
@@ -206,8 +205,8 @@ void Conv2dBase::PadInput(const Tensor &src,
   const index_t padded_width = dst->dim(3);
   const int pad_bottom = static_cast<int>(padded_height - height - pad_top);
   const int pad_right = static_cast<int>(padded_width - width - pad_left);
-  auto in_data = src.data<float>();
-  auto padded_in_data = dst->mutable_data<float>();
+  auto in_data = src.data<uint8_t>();
+  auto padded_in_data = dst->mutable_data<uint8_t>();
 
   const index_t img_size = height * width;
   const index_t padded_img_size = padded_height * padded_width;
@@ -215,25 +214,26 @@
   for (index_t b = 0; b < batch; ++b) {
     for (index_t c = 0; c < channels; ++c) {
       const index_t bc = b * channels + c;
-      const float *in_base = in_data + bc * img_size;
-      float *padded_in_base = padded_in_data + bc * padded_img_size;
+      const uint8_t *in_base = in_data + bc * img_size * type_size_;
+      uint8_t *padded_in_base =
+          padded_in_data + bc * padded_img_size * type_size_;
 
-      memset(padded_in_base, 0, sizeof(float) * pad_top * padded_width);
-      padded_in_base += pad_top * padded_width;
+      memset(padded_in_base, 0, type_size_ * pad_top * padded_width);
+      padded_in_base += pad_top * padded_width * type_size_;
       for (index_t h = 0; h < height; ++h) {
         memset(padded_in_base,
                0,
-               sizeof(float) * pad_left);
-        memcpy(padded_in_base + pad_left,
+               type_size_ * pad_left);
+        memcpy(padded_in_base + pad_left * type_size_,
                in_base,
-               sizeof(float) * width);
-        memset(padded_in_base + pad_left + width,
+               type_size_ * width);
+        memset(padded_in_base + (pad_left + width) * type_size_,
                0,
-               sizeof(float) * pad_right);
-        in_base += width;
-        padded_in_base += padded_width;
+               type_size_ * pad_right);
+        in_base += width * type_size_;
+        padded_in_base += padded_width * type_size_;
       }
-      memset(padded_in_base, 0, sizeof(float) * pad_bottom * padded_width);
+      memset(padded_in_base, 0, type_size_ * pad_bottom * padded_width);
     }
   }
 }
@@ -247,8 +247,8 @@ void Conv2dBase::UnPadOutput(const Tensor &src, Tensor *dst) {
   const index_t padded_height = src.dim(2);
   const index_t padded_width = src.dim(3);
 
-  auto padded_out_data = src.data<float>();
-  auto out_data = dst->mutable_data<float>();
+  auto padded_out_data = src.data<uint8_t>();
+  auto out_data = dst->mutable_data<uint8_t>();
 
   const index_t img_size = height * width;
   const index_t padded_img_size = padded_height * padded_width;
@@ -256,21 +256,93 @@
   for (index_t b = 0; b < batch; ++b) {
     for (index_t c = 0; c < channels; ++c) {
       const index_t bc = (b * channels + c);
-      float *out_base = out_data + bc * img_size;
-      const float *padded_out_base = padded_out_data + bc * padded_img_size;
+      uint8_t *out_base = out_data + bc * img_size * type_size_;
+      const uint8_t *padded_out_base =
+          padded_out_data + bc * padded_img_size * type_size_;
 
       for (index_t h = 0; h < height; ++h) {
-        memcpy(out_base,
-               padded_out_base,
-               sizeof(float) * width);
-        out_base += width;
-        padded_out_base += padded_width;
+        memcpy(out_base, padded_out_base, type_size_ * width);
+        out_base += width * type_size_;
+        padded_out_base += padded_width * type_size_;
       }  // h
     }  // c
   }  // b
 }
 
-}  // namespace fp32
+ConvComputeParam Conv2dBase::PreWorkAndGetConv2DParam(
+    const OpContext *context, const Tensor *in_tensor, Tensor *out_tensor) {
+  auto &in_shape = in_tensor->shape();
+  auto &out_shape = out_tensor->shape();
+
+  const index_t batch = in_shape[0];
+  const index_t in_channels = in_shape[1];
+  const index_t in_height = in_shape[2];
+  const index_t in_width = in_shape[3];
+  const index_t out_channels = out_shape[1];
+  const index_t out_height = out_shape[2];
+  const index_t out_width = out_shape[3];
+
+  const index_t in_image_size = in_height * in_width;
+  const index_t out_image_size = out_height * out_width;
+  const index_t in_batch_size = in_channels * in_image_size;
+  const index_t out_batch_size = out_channels * out_image_size;
+
+  utils::ThreadPool
+      &thread_pool = context->device()->cpu_runtime()->thread_pool();
+
+  return ConvComputeParam(batch, in_channels, in_height, in_width,
+                          out_channels, out_height, out_width,
+                          in_image_size, out_image_size,
+                          in_batch_size, out_batch_size, &thread_pool);
+}
+
+DepthwiseConvComputeParam Conv2dBase::PreWorkAndGetDepthwiseConv2DParam(
+    const OpContext *context, const Tensor *input,
+    const Tensor *filter, Tensor *output) {
+  std::vector<index_t> out_shape(4);
+  std::vector<int> paddings(2);
+  auto &in_shape = input->shape();
+  auto &filter_shape = filter->shape();
+  CalOutputShapeAndInputPadSize(in_shape, filter_shape, &out_shape, &paddings);
+  out_shape[1] *= filter_shape[1];
+  MACE_CHECK(output->Resize(out_shape) == MaceStatus::MACE_SUCCESS,
+             "Resize failed.");
+  output->Clear();
+
+  const int pad_top = paddings[0] / 2;
+  const int pad_left = paddings[1] / 2;
+
+  const index_t batch = in_shape[0];
+  const index_t in_channels = in_shape[1];
+  const index_t in_height = in_shape[2];
+  const index_t in_width = in_shape[3];
+  const index_t out_channels = out_shape[1];
+  const index_t out_height = out_shape[2];
+  const index_t out_width = out_shape[3];
+
+  const index_t in_image_size = in_height * in_width;
+  const index_t out_image_size = out_height * out_width;
+  const index_t in_batch_size = in_channels * in_image_size;
+  const index_t out_batch_size = out_channels * out_image_size;
+  const index_t multiplier = out_channels / in_channels;
+
+  std::vector<index_t> out_bounds;
+  CalOutputBoundaryWithoutUsingInputPad(out_shape, paddings, &out_bounds);
+  const index_t valid_h_start = out_bounds[0];
+  const index_t valid_h_stop = out_bounds[1];
+  const index_t valid_w_start = out_bounds[2];
+  const index_t valid_w_stop = out_bounds[3];
+
+  utils::ThreadPool
+      &thread_pool = context->device()->cpu_runtime()->thread_pool();
+
+  return DepthwiseConvComputeParam(
+      batch, in_channels, in_height, in_width, out_channels, out_height,
+      out_width, in_image_size, out_image_size, in_batch_size, out_batch_size,
+      &thread_pool, pad_top, pad_left, multiplier, valid_h_start, valid_h_stop,
+      valid_w_start, valid_w_stop);
+}
+
 }  // namespace arm
 }  // namespace ops
 }  // namespace mace
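PadInput and UnPadOutput now address tensors as raw bytes (uint8_t plus a type_size_ stride), so a single implementation serves float, fp16, or bf16 data. For illustration, the per-row pattern reduced to a self-contained helper:

    #include <cstdint>
    #include <cstring>

    // Copy one image row into a zero-padded destination row,
    // counting in bytes rather than elements.
    void CopyRowWithPad(const uint8_t *src, uint8_t *dst, int width,
                        int pad_left, int pad_right, int type_size) {
      std::memset(dst, 0, type_size * pad_left);            // left border
      std::memcpy(dst + pad_left * type_size, src,
                  type_size * width);                       // row payload
      std::memset(dst + (pad_left + width) * type_size, 0,
                  type_size * pad_right);                   // right border
    }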
diff --git a/mace/ops/arm/base/conv_2d.h b/mace/ops/arm/base/conv_2d.h
new file mode 100644
index 0000000000000000000000000000000000000000..e1cd0947031952aecdd9799653eff7a1e4989679
--- /dev/null
+++ b/mace/ops/arm/base/conv_2d.h
@@ -0,0 +1,159 @@
+// Copyright 2020 The MACE Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef MACE_OPS_ARM_BASE_CONV_2D_H_
+#define MACE_OPS_ARM_BASE_CONV_2D_H_
+
+#include <memory>
+#include <vector>
+
+#include "mace/core/ops/op_context.h"
+#include "mace/core/tensor.h"
+#include "mace/ops/arm/base/gemm.h"
+#include "mace/ops/common/conv_pool_2d_util.h"
+#include "mace/ops/delegator/conv_2d.h"
+#include "mace/public/mace.h"
+
+namespace mace {
+namespace ops {
+namespace arm {
+
+struct ConvComputeParam {
+  const index_t batch;
+  const index_t in_channels;
+  const index_t in_height;
+  const index_t in_width;
+  const index_t out_channels;
+  const index_t out_height;
+  const index_t out_width;
+
+  const index_t in_image_size;
+  const index_t out_image_size;
+  const index_t in_batch_size;
+  const index_t out_batch_size;
+
+  utils::ThreadPool &thread_pool;
+
+  ConvComputeParam(const index_t b,
+                   const index_t in_c,
+                   const index_t in_h,
+                   const index_t in_w,
+                   const index_t out_c,
+                   const index_t out_h,
+                   const index_t out_w,
+                   const index_t in_size,
+                   const index_t out_size,
+                   const index_t in_b_size,
+                   const index_t out_b_size,
+                   utils::ThreadPool *thrd_pool)
+      : batch(b), in_channels(in_c), in_height(in_h), in_width(in_w),
+        out_channels(out_c), out_height(out_h), out_width(out_w),
+        in_image_size(in_size), out_image_size(out_size),
+        in_batch_size(in_b_size), out_batch_size(out_b_size),
+        thread_pool(*thrd_pool) {}
+};
+
+struct DepthwiseConvComputeParam : public ConvComputeParam {
+  const int pad_top;
+  const int pad_left;
+  const index_t multiplier;
+  const index_t valid_h_start;
+  const index_t valid_h_stop;
+  const index_t valid_w_start;
+  const index_t valid_w_stop;
+  DepthwiseConvComputeParam(const index_t b,
+                            const index_t in_c,
+                            const index_t in_h,
+                            const index_t in_w,
+                            const index_t out_c,
+                            const index_t out_h,
+                            const index_t out_w,
+                            const index_t in_size,
+                            const index_t out_size,
+                            const index_t in_b_size,
+                            const index_t out_b_size,
+                            utils::ThreadPool *thrd_pool,
+                            const int pad_top_data,
+                            const int pad_left_data,
+                            const index_t multiplier_data,
+                            const index_t valid_height_start,
+                            const index_t valid_height_stop,
+                            const index_t valid_width_start,
+                            const index_t valid_width_stop)
+      : ConvComputeParam(b, in_c, in_h, in_w, out_c, out_h, out_w,
+                         in_size, out_size, in_b_size, out_b_size, thrd_pool),
+        pad_top(pad_top_data), pad_left(pad_left_data),
+        multiplier(multiplier_data),
+        valid_h_start(valid_height_start), valid_h_stop(valid_height_stop),
+        valid_w_start(valid_width_start), valid_w_stop(valid_width_stop) {}
+};
+
+class Conv2dBase : public delegator::Conv2d {
+ public:
+  explicit Conv2dBase(const delegator::Conv2dParam &param, int type_size)
+      : delegator::Conv2d(param), type_size_(type_size) {}
+
+  virtual ~Conv2dBase() = default;
+
+ protected:
+  void CalOutputShapeAndInputPadSize(const std::vector<index_t> &input_shape,
+                                     const std::vector<index_t> &filter_shape,
+                                     std::vector<index_t> *output_shape,
+                                     std::vector<int> *in_pad_size);
+
+  void CalOutputBoundaryWithoutUsingInputPad(const std::vector<index_t>
+                                             &output_shape,
+                                             const std::vector<int>
+                                             in_pad_size,
+                                             std::vector<index_t>
+                                             *out_bound);
+
+  void CalOutputShapeAndPadSize(const Tensor *input,
+                                const Tensor *filter,
+                                const int out_tile_height,
+                                const int out_tile_width,
+                                std::vector<index_t> *output_shape,
+                                std::vector<int> *in_pad_size,
+                                std::vector<int> *out_pad_size);
+
+  MaceStatus ResizeOutAndPadInOut(const OpContext *context,
+                                  const Tensor *input,
+                                  const Tensor *filter,
+                                  Tensor *output,
+                                  const int out_tile_height,
+                                  const int out_tile_width,
+                                  std::unique_ptr<Tensor> *padded_input,
+                                  std::unique_ptr<Tensor> *padded_output);
+
+  void PadInput(const Tensor &src,
+                const int pad_top,
+                const int pad_left,
+                Tensor *dst);
+  void UnPadOutput(const Tensor &src, Tensor *dst);
+
+  ConvComputeParam PreWorkAndGetConv2DParam(
+      const OpContext *context, const Tensor *in_tensor, Tensor *out_tensor);
+  DepthwiseConvComputeParam PreWorkAndGetDepthwiseConv2DParam(
+      const OpContext *context, const Tensor *input,
+      const Tensor *filter, Tensor *output);
+
+ private:
+  int type_size_;
+};
+
+}  // namespace arm
+}  // namespace ops
+}  // namespace mace
+
+#endif  // MACE_OPS_ARM_BASE_CONV_2D_H_
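ConvComputeParam caches every shape-derived constant once so the leaf kernels only read them. For an assumed 1x64x56x56 -> 1x128x56x56 convolution, the fields work out to:

    in_image_size  = 56 * 56    = 3136    // elements per input channel
    out_image_size = 56 * 56    = 3136    // elements per output channel
    in_batch_size  = 64 * 3136  = 200704  // elements per input image
    out_batch_size = 128 * 3136 = 401408  // elements per output image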
-#include "mace/ops/arm/fp32/conv_2d.h" -#include "mace/ops/arm/fp32/gemm.h" -#include "mace/ops/delegator/conv_2d.h" +#include "mace/ops/arm/base/conv_2d_1x1.h" + +#include namespace mace { namespace ops { namespace arm { -namespace fp32 { - -class Conv2dK1x1 : public Conv2dBase { - public: - explicit Conv2dK1x1(const delegator::Conv2dParam ¶m) - : Conv2dBase(param), - gemm_(delegator::GemmParam()) {} - virtual ~Conv2dK1x1() {} - - MaceStatus Compute( - const OpContext *context, - const Tensor *input, - const Tensor *filter, - Tensor *output) override; - - private: - Gemm gemm_; -}; -MaceStatus Conv2dK1x1::Compute(const OpContext *context, - const Tensor *input, - const Tensor *filter, - Tensor *output) { +template +MaceStatus Conv2dK1x1::Compute(const OpContext *context, + const Tensor *input, + const Tensor *filter, + Tensor *output) { index_t batch = input->dim(0); index_t in_height = input->dim(2); index_t in_width = input->dim(3); @@ -50,13 +33,8 @@ MaceStatus Conv2dK1x1::Compute(const OpContext *context, std::vector output_shape; std::vector in_pad_size; std::vector out_pad_size; - CalOutputShapeAndPadSize(input, - filter, - 1, - 1, - &output_shape, - &in_pad_size, - &out_pad_size); + CalOutputShapeAndPadSize(input, filter, 1, 1, + &output_shape, &in_pad_size, &out_pad_size); MACE_RETURN_IF_ERROR(output->Resize(output_shape)); const index_t out_channels = output_shape[1]; @@ -70,16 +48,16 @@ MaceStatus Conv2dK1x1::Compute(const OpContext *context, in_height != padded_in_height || in_width != padded_in_width; auto scratch_buffer = context->device()->scratch_buffer(); const index_t padded_in_size = is_in_padded ? PadAlignSize( - sizeof(float) * batch * in_channels * padded_in_height + sizeof(T) * batch * in_channels * padded_in_height * padded_in_width) : 0; const index_t pack_filter_size = - PadAlignSize(sizeof(float) * out_channels * in_channels); + PadAlignSize(sizeof(T) * out_channels * in_channels); const index_t pack_input_size = PadAlignSize( - sizeof(float) * in_channels * padded_in_height * padded_in_width); + sizeof(T) * in_channels * padded_in_height * padded_in_width); const index_t pack_output_size = PadAlignSize( - sizeof(float) * out_channels * padded_in_height * padded_in_width); + sizeof(T) * out_channels * padded_in_height * padded_in_width); const index_t gemm_pack_size = pack_filter_size + pack_input_size + pack_output_size; @@ -115,12 +93,11 @@ MaceStatus Conv2dK1x1::Compute(const OpContext *context, void RegisterConv2dK1x1Delegator(OpDelegatorRegistry *registry) { MACE_REGISTER_DELEGATOR( - registry, Conv2dK1x1, delegator::Conv2dParam, + registry, Conv2dK1x1, delegator::Conv2dParam, MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU, float, ImplType::NEON, K1x1)); } -} // namespace fp32 } // namespace arm } // namespace ops } // namespace mace diff --git a/mace/ops/arm/base/conv_2d_1x1.h b/mace/ops/arm/base/conv_2d_1x1.h new file mode 100644 index 0000000000000000000000000000000000000000..197e98e9464e36fb2a24acb0103ede98605e4a59 --- /dev/null +++ b/mace/ops/arm/base/conv_2d_1x1.h @@ -0,0 +1,47 @@ +// Copyright 2020 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
diff --git a/mace/ops/arm/base/conv_2d_1x1.h b/mace/ops/arm/base/conv_2d_1x1.h
new file mode 100644
index 0000000000000000000000000000000000000000..197e98e9464e36fb2a24acb0103ede98605e4a59
--- /dev/null
+++ b/mace/ops/arm/base/conv_2d_1x1.h
@@ -0,0 +1,47 @@
+// Copyright 2020 The MACE Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef MACE_OPS_ARM_BASE_CONV_2D_1X1_H_
+#define MACE_OPS_ARM_BASE_CONV_2D_1X1_H_
+
+#include "mace/ops/arm/base/conv_2d.h"
+#include "mace/ops/arm/base/gemm.h"
+
+namespace mace {
+namespace ops {
+namespace arm {
+
+template<typename T>
+class Conv2dK1x1 : public Conv2dBase {
+ public:
+  explicit Conv2dK1x1(const delegator::Conv2dParam &param)
+      : Conv2dBase(param, sizeof(T)),
+        gemm_(delegator::GemmParam()) {}
+  virtual ~Conv2dK1x1() {}
+
+  MaceStatus Compute(
+      const OpContext *context,
+      const Tensor *input,
+      const Tensor *filter,
+      Tensor *output) override;
+
+ private:
+  Gemm<T> gemm_;
+};
+
+}  // namespace arm
+}  // namespace ops
+}  // namespace mace
+
+#endif  // MACE_OPS_ARM_BASE_CONV_2D_1X1_H_
diff --git a/mace/ops/arm/base/conv_2d_1xn.cc b/mace/ops/arm/base/conv_2d_1xn.cc
new file mode 100644
index 0000000000000000000000000000000000000000..417bec8a53376378654c34c2932e18424d811efb
--- /dev/null
+++ b/mace/ops/arm/base/conv_2d_1xn.cc
@@ -0,0 +1,45 @@
+// Copyright 2020 The MACE Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "mace/ops/arm/base/conv_2d_1xn.h"
+
+namespace mace {
+namespace ops {
+namespace arm {
+
+void RegisterConv2dK1xNDelegator(OpDelegatorRegistry *registry) {
+  MACE_REGISTER_DELEGATOR(
+      registry, Conv2dK1x7S1<float>, delegator::Conv2dParam,
+      MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU,
+                            float, ImplType::NEON, K1x7S1));
+
+  MACE_REGISTER_DELEGATOR(
+      registry, Conv2dK7x1S1<float>, delegator::Conv2dParam,
+      MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU,
+                            float, ImplType::NEON, K7x1S1));
+
+  MACE_REGISTER_DELEGATOR(
+      registry, Conv2dK1x15S1<float>, delegator::Conv2dParam,
+      MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU,
+                            float, ImplType::NEON, K1x15S1));
+
+  MACE_REGISTER_DELEGATOR(
+      registry, Conv2dK15x1S1<float>, delegator::Conv2dParam,
+      MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU,
+                            float, ImplType::NEON, K15x1S1));
+}
+
+}  // namespace arm
+}  // namespace ops
+}  // namespace mace
diff --git a/mace/ops/arm/fp32/conv_2d_1xn.h b/mace/ops/arm/base/conv_2d_1xn.h
similarity index 50%
rename from mace/ops/arm/fp32/conv_2d_1xn.h
rename to mace/ops/arm/base/conv_2d_1xn.h
index c0a6da637e3ecffd74da458c71730a8646e365c3..ef18e0f7aacfc5f97dab15b70b39b71e8e4870cc 100644
--- a/mace/ops/arm/fp32/conv_2d_1xn.h
+++ b/mace/ops/arm/base/conv_2d_1xn.h
@@ -1,4 +1,4 @@
-// Copyright 2019 The MACE Authors. All Rights Reserved.
+// Copyright 2020 The MACE Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -12,76 +12,66 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#ifndef MACE_OPS_ARM_FP32_CONV_2D_1XN_H_
-#define MACE_OPS_ARM_FP32_CONV_2D_1XN_H_
+#ifndef MACE_OPS_ARM_BASE_CONV_2D_1XN_H_
+#define MACE_OPS_ARM_BASE_CONV_2D_1XN_H_
 
 #include <vector>
 
 #include "mace/core/ops/op_context.h"
 #include "mace/core/tensor.h"
-#include "mace/ops/arm/fp32/conv_2d.h"
+#include "mace/ops/arm/base/conv_2d_mxn.h"
 #include "mace/public/mace.h"
 
 namespace mace {
 namespace ops {
 namespace arm {
-namespace fp32 {
 
-class Conv2dK1x7S1 : public Conv2dBase {
+template<typename T>
+class Conv2dK1x7S1 : public Conv2dKMxN<T> {
  public:
   explicit Conv2dK1x7S1(const delegator::Conv2dParam &param)
-      : Conv2dBase(param) {}
+      : Conv2dKMxN<T>(param, 1, 4) {}
   virtual ~Conv2dK1x7S1() {}
 
-  MaceStatus Compute(
-      const OpContext *context,
-      const Tensor *input,
-      const Tensor *filter,
-      Tensor *output) override;
+  MaceStatus DoCompute(const ConvComputeParam &p, const T *filter,
+                       const T *input_data, T *output_data) override;
 };
 
-class Conv2dK7x1S1 : public Conv2dBase {
+template<typename T>
+class Conv2dK7x1S1 : public Conv2dKMxN<T> {
  public:
   explicit Conv2dK7x1S1(const delegator::Conv2dParam &param)
-      : Conv2dBase(param) {}
+      : Conv2dKMxN<T>(param, 4, 1) {}
   virtual ~Conv2dK7x1S1() {}
 
-  MaceStatus Compute(
-      const OpContext *context,
-      const Tensor *input,
-      const Tensor *filter,
-      Tensor *output) override;
+  MaceStatus DoCompute(const ConvComputeParam &p, const T *filter,
+                       const T *input_data, T *output_data) override;
 };
 
-class Conv2dK1x15S1 : public Conv2dBase {
+template<typename T>
+class Conv2dK1x15S1 : public Conv2dKMxN<T> {
  public:
   explicit Conv2dK1x15S1(const delegator::Conv2dParam &param)
-      : Conv2dBase(param) {}
+      : Conv2dKMxN<T>(param, 1, 4) {}
   virtual ~Conv2dK1x15S1() {}
 
-  MaceStatus Compute(
-      const OpContext *context,
-      const Tensor *input,
-      const Tensor *filter,
-      Tensor *output) override;
+  MaceStatus DoCompute(const ConvComputeParam &p, const T *filter,
+                       const T *input_data, T *output_data) override;
 };
 
-class Conv2dK15x1S1 : public Conv2dBase {
+template<typename T>
+class Conv2dK15x1S1 : public Conv2dKMxN<T> {
  public:
   explicit Conv2dK15x1S1(const delegator::Conv2dParam &param)
-      : Conv2dBase(param) {}
+      : Conv2dKMxN<T>(param, 4, 1) {}
   virtual ~Conv2dK15x1S1() {}
 
-  MaceStatus Compute(
-      const OpContext *context,
-      const Tensor *input,
-      const Tensor *filter,
-      Tensor *output) override;
+  MaceStatus DoCompute(const ConvComputeParam &p, const T *filter,
+                       const T *input_data, T *output_data) override;
 };
 
-}  // namespace fp32
 }  // namespace arm
 }  // namespace ops
 }  // namespace mace
 
-#endif  // MACE_OPS_ARM_FP32_CONV_2D_1XN_H_
+#endif  // MACE_OPS_ARM_BASE_CONV_2D_1XN_H_
+ +#include "mace/ops/arm/base/conv_2d_3x3.h" + +namespace mace { +namespace ops { +namespace arm { + +void RegisterConv2dK3x3Delegator(OpDelegatorRegistry *registry) { + MACE_REGISTER_DELEGATOR( + registry, Conv2dK3x3S1, delegator::Conv2dParam, + MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU, + float, ImplType::NEON, K3x3S1)); + MACE_REGISTER_DELEGATOR( + registry, Conv2dK3x3S2, delegator::Conv2dParam, + MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU, + float, ImplType::NEON, K3x3S2)); +} + +} // namespace arm +} // namespace ops +} // namespace mace diff --git a/mace/ops/arm/fp32/conv_2d_3x3.h b/mace/ops/arm/base/conv_2d_3x3.h similarity index 58% rename from mace/ops/arm/fp32/conv_2d_3x3.h rename to mace/ops/arm/base/conv_2d_3x3.h index e64d061e3e6103f78901c144d9866d047e8dfc96..9aaf66f0b7dc84e95d204b90c9aab2ee658eedec 100644 --- a/mace/ops/arm/fp32/conv_2d_3x3.h +++ b/mace/ops/arm/base/conv_2d_3x3.h @@ -1,4 +1,4 @@ -// Copyright 2019 The MACE Authors. All Rights Reserved. +// Copyright 2020 The MACE Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -12,50 +12,44 @@ // See the License for the specific language governing permissions and // limitations under the License. -#ifndef MACE_OPS_ARM_FP32_CONV_2D_3X3_H_ -#define MACE_OPS_ARM_FP32_CONV_2D_3X3_H_ +#ifndef MACE_OPS_ARM_BASE_CONV_2D_3X3_H_ +#define MACE_OPS_ARM_BASE_CONV_2D_3X3_H_ #include #include "mace/core/ops/op_context.h" #include "mace/core/tensor.h" -#include "mace/ops/arm/fp32/conv_2d.h" +#include "mace/ops/arm/base/conv_2d_mxn.h" #include "mace/public/mace.h" namespace mace { namespace ops { namespace arm { -namespace fp32 { -class Conv2dK3x3S1 : public Conv2dBase { +template +class Conv2dK3x3S1 : public Conv2dKMxN { public: explicit Conv2dK3x3S1(const delegator::Conv2dParam ¶m) - : Conv2dBase(param) {} + : Conv2dKMxN(param, 2, 4) {} virtual ~Conv2dK3x3S1() {} - MaceStatus Compute( - const OpContext *context, - const Tensor *input, - const Tensor *filter, - Tensor *output) override; + MaceStatus DoCompute(const ConvComputeParam &p, const T *filter, + const T *input_data, T *output_data) override; }; -class Conv2dK3x3S2 : public Conv2dBase { +template +class Conv2dK3x3S2 : public Conv2dKMxN { public: explicit Conv2dK3x3S2(const delegator::Conv2dParam ¶m) - : Conv2dBase(param) {} + : Conv2dKMxN(param, 1, 4) {} virtual ~Conv2dK3x3S2() {} - MaceStatus Compute( - const OpContext *context, - const Tensor *input, - const Tensor *filter, - Tensor *output) override; + MaceStatus DoCompute(const ConvComputeParam &p, const T *filter, + const T *input_data, T *output_data) override; }; -} // namespace fp32 } // namespace arm } // namespace ops } // namespace mace -#endif // MACE_OPS_ARM_FP32_CONV_2D_3X3_H_ +#endif // MACE_OPS_ARM_BASE_CONV_2D_3X3_H_ diff --git a/mace/ops/arm/base/conv_2d_5x5.cc b/mace/ops/arm/base/conv_2d_5x5.cc new file mode 100644 index 0000000000000000000000000000000000000000..5db15881552dd2e044353ee118080afd9fc6b54f --- /dev/null +++ b/mace/ops/arm/base/conv_2d_5x5.cc @@ -0,0 +1,30 @@ +// Copyright 2020 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
diff --git a/mace/ops/arm/base/conv_2d_5x5.cc b/mace/ops/arm/base/conv_2d_5x5.cc
new file mode 100644
index 0000000000000000000000000000000000000000..5db15881552dd2e044353ee118080afd9fc6b54f
--- /dev/null
+++ b/mace/ops/arm/base/conv_2d_5x5.cc
@@ -0,0 +1,30 @@
+// Copyright 2020 The MACE Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "mace/ops/arm/base/conv_2d_5x5.h"
+
+namespace mace {
+namespace ops {
+namespace arm {
+
+void RegisterConv2dK5x5Delegator(OpDelegatorRegistry *registry) {
+  MACE_REGISTER_DELEGATOR(
+      registry, Conv2dK5x5S1<float>, delegator::Conv2dParam,
+      MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU,
+                            float, ImplType::NEON, K5x5S1));
+}
+
+}  // namespace arm
+}  // namespace ops
+}  // namespace mace
diff --git a/mace/ops/arm/base/conv_2d_5x5.h b/mace/ops/arm/base/conv_2d_5x5.h
new file mode 100644
index 0000000000000000000000000000000000000000..1528927e39f77a64c673da8c14a6fd1724fa98ac
--- /dev/null
+++ b/mace/ops/arm/base/conv_2d_5x5.h
@@ -0,0 +1,44 @@
+// Copyright 2020 The MACE Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef MACE_OPS_ARM_BASE_CONV_2D_5X5_H_
+#define MACE_OPS_ARM_BASE_CONV_2D_5X5_H_
+
+#include <vector>
+
+#include "mace/core/ops/op_context.h"
+#include "mace/core/tensor.h"
+#include "mace/ops/arm/base/conv_2d_mxn.h"
+#include "mace/public/mace.h"
+
+namespace mace {
+namespace ops {
+namespace arm {
+
+template<typename T>
+class Conv2dK5x5S1 : public Conv2dKMxN<T> {
+ public:
+  explicit Conv2dK5x5S1(const delegator::Conv2dParam &param)
+      : Conv2dKMxN<T>(param, 1, 4) {}
+  virtual ~Conv2dK5x5S1() {}
+
+  MaceStatus DoCompute(const ConvComputeParam &p, const T *filter,
+                       const T *input_data, T *output_data) override;
+};
+
+}  // namespace arm
+}  // namespace ops
+}  // namespace mace
+
+#endif  // MACE_OPS_ARM_BASE_CONV_2D_5X5_H_
+ +#include "mace/ops/arm/base/conv_2d_7x7.h" + +namespace mace { +namespace ops { +namespace arm { + +void RegisterConv2dK7x7Delegator(OpDelegatorRegistry *registry) { + MACE_REGISTER_DELEGATOR( + registry, Conv2dK7x7S1, delegator::Conv2dParam, + MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU, + float, ImplType::NEON, K7x7S1)); + MACE_REGISTER_DELEGATOR( + registry, Conv2dK7x7S2, delegator::Conv2dParam, + MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU, + float, ImplType::NEON, K7x7S2)); + MACE_REGISTER_DELEGATOR( + registry, Conv2dK7x7S3, delegator::Conv2dParam, + MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU, + float, ImplType::NEON, K7x7S3)); +} + +} // namespace arm +} // namespace ops +} // namespace mace diff --git a/mace/ops/arm/fp32/conv_2d_7x7.h b/mace/ops/arm/base/conv_2d_7x7.h similarity index 53% rename from mace/ops/arm/fp32/conv_2d_7x7.h rename to mace/ops/arm/base/conv_2d_7x7.h index 0d0467fc5b38a354bab744503dafbe28b5f180f3..f9b8374287000e7ab391f522b4b447daf65faba3 100644 --- a/mace/ops/arm/fp32/conv_2d_7x7.h +++ b/mace/ops/arm/base/conv_2d_7x7.h @@ -1,4 +1,4 @@ -// Copyright 2019 The MACE Authors. All Rights Reserved. +// Copyright 2020 The MACE Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -12,63 +12,55 @@ // See the License for the specific language governing permissions and // limitations under the License. -#ifndef MACE_OPS_ARM_FP32_CONV_2D_7X7_H_ -#define MACE_OPS_ARM_FP32_CONV_2D_7X7_H_ +#ifndef MACE_OPS_ARM_BASE_CONV_2D_7X7_H_ +#define MACE_OPS_ARM_BASE_CONV_2D_7X7_H_ #include #include "mace/core/ops/op_context.h" #include "mace/core/tensor.h" -#include "mace/ops/arm/fp32/conv_2d.h" +#include "mace/ops/arm/base/conv_2d_mxn.h" #include "mace/public/mace.h" namespace mace { namespace ops { namespace arm { -namespace fp32 { -class Conv2dK7x7S1 : public Conv2dBase { +template +class Conv2dK7x7S1 : public Conv2dKMxN { public: explicit Conv2dK7x7S1(const delegator::Conv2dParam ¶m) - : Conv2dBase(param) {} + : Conv2dKMxN(param, 1, 4) {} virtual ~Conv2dK7x7S1() {} - MaceStatus Compute( - const OpContext *context, - const Tensor *input, - const Tensor *filter, - Tensor *output) override; + MaceStatus DoCompute(const ConvComputeParam &p, const T *filter, + const T *input_data, T *output_data) override; }; -class Conv2dK7x7S2 : public Conv2dBase { +template +class Conv2dK7x7S2 : public Conv2dKMxN { public: explicit Conv2dK7x7S2(const delegator::Conv2dParam ¶m) - : Conv2dBase(param) {} + : Conv2dKMxN(param, 1, 4) {} virtual ~Conv2dK7x7S2() {} - MaceStatus Compute( - const OpContext *context, - const Tensor *input, - const Tensor *filter, - Tensor *output) override; + MaceStatus DoCompute(const ConvComputeParam &p, const T *filter, + const T *input_data, T *output_data) override; }; -class Conv2dK7x7S3 : public Conv2dBase { +template +class Conv2dK7x7S3 : public Conv2dKMxN { public: explicit Conv2dK7x7S3(const delegator::Conv2dParam ¶m) - : Conv2dBase(param) {} + : Conv2dKMxN(param, 1, 4) {} virtual ~Conv2dK7x7S3() {} - MaceStatus Compute( - const OpContext *context, - const Tensor *input, - const Tensor *filter, - Tensor *output) override; + MaceStatus DoCompute(const ConvComputeParam &p, const T *filter, + const T *input_data, T *output_data) override; }; -} // namespace fp32 } // namespace arm } // namespace ops } // namespace mace -#endif // MACE_OPS_ARM_FP32_CONV_2D_7X7_H_ +#endif // MACE_OPS_ARM_BASE_CONV_2D_7X7_H_ diff --git 
diff --git a/mace/ops/arm/base/conv_2d_general.cc b/mace/ops/arm/base/conv_2d_general.cc
new file mode 100644
index 0000000000000000000000000000000000000000..04121b8c003c47d4596c96586ccb3071aeeae171
--- /dev/null
+++ b/mace/ops/arm/base/conv_2d_general.cc
@@ -0,0 +1,68 @@
+// Copyright 2020 The MACE Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "mace/ops/arm/base/conv_2d_general.h"
+
+#include <memory>
+
+namespace mace {
+namespace ops {
+namespace arm {
+
+template<typename T>
+MaceStatus Conv2dGeneral<T>::Compute(const OpContext *context,
+                                     const Tensor *input,
+                                     const Tensor *filter,
+                                     Tensor *output) {
+  std::unique_ptr<Tensor> padded_input;
+  std::unique_ptr<Tensor> padded_output;
+  ResizeOutAndPadInOut(context, input, filter, output, 1, 4,
+                       &padded_input, &padded_output);
+  const Tensor *in_tensor = input;
+  if (padded_input != nullptr) {
+    in_tensor = padded_input.get();
+  }
+  Tensor *out_tensor = output;
+  if (padded_output != nullptr) {
+    out_tensor = padded_output.get();
+  }
+  out_tensor->Clear();
+
+  Tensor::MappingGuard in_guard(input);
+  Tensor::MappingGuard filter_guard(filter);
+  Tensor::MappingGuard out_guard(output);
+
+  const T *filter_data = filter->data<T>();
+  const T *input_data = in_tensor->data<T>();
+  T *output_data = out_tensor->mutable_data<T>();
+
+  const ConvComputeParam p =
+      PreWorkAndGetConv2DParam(context, in_tensor, out_tensor);
+  auto &filter_shape = filter->shape();
+
+  DoCompute(p, filter_data, input_data, output_data, filter_shape);
+
+  UnPadOutput(*out_tensor, output);
+  return MaceStatus::MACE_SUCCESS;
+}
+
+void RegisterConv2dGeneralDelegator(OpDelegatorRegistry *registry) {
+  MACE_REGISTER_DELEGATOR(
+      registry, Conv2dGeneral<float>, delegator::Conv2dParam,
+      MACE_DELEGATOR_KEY(Conv2d, DeviceType::CPU, float, ImplType::NEON));
+}
+
+}  // namespace arm
+}  // namespace ops
+}  // namespace mace
diff --git a/mace/ops/arm/base/conv_2d_general.h b/mace/ops/arm/base/conv_2d_general.h
new file mode 100644
index 0000000000000000000000000000000000000000..f0944d9b1056cb3a6762009c768fb643ef064f2b
--- /dev/null
+++ b/mace/ops/arm/base/conv_2d_general.h
@@ -0,0 +1,50 @@
+// Copyright 2020 The MACE Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef MACE_OPS_ARM_BASE_CONV_2D_GENERAL_H_
+#define MACE_OPS_ARM_BASE_CONV_2D_GENERAL_H_
+
+#include <vector>
+
+#include "mace/core/ops/op_context.h"
+#include "mace/core/tensor.h"
+#include "mace/ops/arm/base/conv_2d.h"
+#include "mace/public/mace.h"
+
+namespace mace {
+namespace ops {
+namespace arm {
+
+template<typename T>
+class Conv2dGeneral : public Conv2dBase {
+ public:
+  explicit Conv2dGeneral(const delegator::Conv2dParam &param)
+      : Conv2dBase(param, sizeof(T)) {}
+  virtual ~Conv2dGeneral() {}
+
+  MaceStatus Compute(const OpContext *context, const Tensor *input,
+                     const Tensor *filter, Tensor *output) override;
+
+ protected:
+  MaceStatus DoCompute(
+      const ConvComputeParam &p, const T *filter_data,
+      const T *input_data, T *output_data,
+      const std::vector<index_t> &filter_shape);
+};
+
+}  // namespace arm
+}  // namespace ops
+}  // namespace mace
+
+#endif  // MACE_OPS_ARM_BASE_CONV_2D_GENERAL_H_
diff --git a/mace/ops/arm/base/conv_2d_mxn.h b/mace/ops/arm/base/conv_2d_mxn.h
new file mode 100644
index 0000000000000000000000000000000000000000..0941cfa71f1f3513612b5b45d8448f23e4b19d51
--- /dev/null
+++ b/mace/ops/arm/base/conv_2d_mxn.h
@@ -0,0 +1,85 @@
+// Copyright 2020 The MACE Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef MACE_OPS_ARM_BASE_CONV_2D_MXN_H_
+#define MACE_OPS_ARM_BASE_CONV_2D_MXN_H_
+
+#include <memory>
+#include <vector>
+
+#include "mace/core/ops/op_context.h"
+#include "mace/core/tensor.h"
+#include "mace/ops/arm/base/conv_2d.h"
+#include "mace/public/mace.h"
+
+namespace mace {
+namespace ops {
+namespace arm {
+
+template<typename T>
+class Conv2dKMxN : public Conv2dBase {
+ public:
+  explicit Conv2dKMxN(const delegator::Conv2dParam &param,
+                      const int tile_h, const int tile_w)
+      : Conv2dBase(param, sizeof(T)),
+        out_tile_h_(tile_h), out_tile_w_(tile_w) {}
+
+  virtual ~Conv2dKMxN() {}
+
+  MaceStatus Compute(const OpContext *context, const Tensor *input,
+                     const Tensor *filter, Tensor *output) override {
+    std::unique_ptr<Tensor> padded_input;
+    std::unique_ptr<Tensor> padded_output;
+    ResizeOutAndPadInOut(context, input, filter, output, out_tile_h_,
+                         out_tile_w_, &padded_input, &padded_output);
+    const Tensor *in_tensor = input;
+    if (padded_input != nullptr) {
+      in_tensor = padded_input.get();
+    }
+    Tensor *out_tensor = output;
+    if (padded_output != nullptr) {
+      out_tensor = padded_output.get();
+    }
+    out_tensor->Clear();
+
+    Tensor::MappingGuard in_guard(input);
+    Tensor::MappingGuard filter_guard(filter);
+    Tensor::MappingGuard out_guard(output);
+
+    const T *filter_data = filter->data<T>();
+    const T *input_data = in_tensor->data<T>();
+    T *output_data = out_tensor->mutable_data<T>();
+
+    const ConvComputeParam p =
+        PreWorkAndGetConv2DParam(context, in_tensor, out_tensor);
+
+    DoCompute(p, filter_data, input_data, output_data);
+
+    UnPadOutput(*out_tensor, output);
+    return MaceStatus::MACE_SUCCESS;
+  }
+
+  virtual MaceStatus DoCompute(const ConvComputeParam &p, const T *filter,
+                               const T *input_data, T *output_data) = 0;
+
+ private:
+  const int out_tile_h_;
+  const int out_tile_w_;
+};
+
+}  // namespace arm
+}  // namespace ops
+}  // namespace mace
+
+#endif  // MACE_OPS_ARM_BASE_CONV_2D_MXN_H_
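Conv2dKMxN is a template-method base: the non-virtual Compute() above fixes the resize/pad, map, DoCompute, unpad sequence, and each kernel supplies only the tiled arithmetic. Adding another tiled kernel then reduces to a declaration plus per-type DoCompute definitions. A hypothetical 9x9/stride-1 kernel, assuming the same 1x4 output tile as K7x7S1:

    template<typename T>
    class Conv2dK9x9S1 : public Conv2dKMxN<T> {
     public:
      explicit Conv2dK9x9S1(const delegator::Conv2dParam &param)
          : Conv2dKMxN<T>(param, 1, 4) {}  // out_tile_h = 1, out_tile_w = 4
      virtual ~Conv2dK9x9S1() {}

      // Only the inner arithmetic is kernel-specific.
      MaceStatus DoCompute(const ConvComputeParam &p, const T *filter,
                           const T *input_data, T *output_data) override;
    };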
diff --git a/mace/ops/arm/fp32/deconv_2d.cc b/mace/ops/arm/base/deconv_2d.cc
similarity index 53%
rename from mace/ops/arm/fp32/deconv_2d.cc
rename to mace/ops/arm/base/deconv_2d.cc
index 41a01a6ca3c653e3412c6c1f27403c0d4c04bd11..1fc14db618cef8468a2e6b6c16c582bae2891afb 100644
--- a/mace/ops/arm/fp32/deconv_2d.cc
+++ b/mace/ops/arm/base/deconv_2d.cc
@@ -1,4 +1,4 @@
-// Copyright 2019 The MACE Authors. All Rights Reserved.
+// Copyright 2020 The MACE Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -12,17 +12,17 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "mace/ops/arm/fp32/deconv_2d.h"
+#include "mace/ops/arm/base/deconv_2d.h"
 
-#include <algorithm>
 #include <functional>
-#include "mace/utils/memory.h"
+#include <utility>
+
 #include "mace/ops/common/conv_pool_2d_util.h"
+#include "mace/utils/memory.h"
 
 namespace mace {
 namespace ops {
 namespace arm {
-namespace fp32 {
 
 MaceStatus Deconv2dBase::ResizeOutAndPadOut(
     const OpContext *context,
@@ -67,7 +67,7 @@ MaceStatus Deconv2dBase::ResizeOutAndPadOut(
       std::accumulate(padded_out_shape.begin(),
                       padded_out_shape.end(),
                       1,
-                      std::multiplies<index_t>()) * sizeof(float);
+                      std::multiplies<index_t>()) * type_size_;
   ScratchBuffer *scratch = context->device()->scratch_buffer();
   scratch->Rewind();
   index_t scratch_size = PadAlignSize(padded_out_size);
@@ -75,7 +75,7 @@ MaceStatus Deconv2dBase::ResizeOutAndPadOut(
 
     std::unique_ptr<Tensor> padded_out
-        (make_unique<Tensor>(scratch->Scratch(scratch_size), DT_FLOAT));
+        (make_unique<Tensor>(scratch->Scratch(scratch_size), output->dtype()));
     padded_out->Reshape(padded_out_shape);
     *padded_output = std::move(padded_out);
   }
@@ -97,24 +97,97 @@ void Deconv2dBase::UnPadOutput(const Tensor &src,
   const index_t padded_height = src.dim(2);
   const index_t padded_width = src.dim(3);
 
-  auto padded_out_data = src.data<float>();
-  auto out_data = dst->mutable_data<float>();
+  auto padded_out_data = src.data<uint8_t>();
+  auto out_data = dst->mutable_data<uint8_t>();
 
   for (index_t i = 0; i < batch; ++i) {
     for (index_t j = 0; j < channels; ++j) {
       for (index_t k = 0; k < height; ++k) {
-        const float *input_base =
+        const uint8_t *input_base =
             padded_out_data + ((i * channels + j) * padded_height
-                + (k + pad_h)) * padded_width;
-        float *output_base =
-            out_data + ((i * channels + j) * height + k) * width;
-        memcpy(output_base, input_base + pad_w, width * sizeof(float));
+                + (k + pad_h)) * padded_width * type_size_;
+        uint8_t *output_base =
+            out_data + ((i * channels + j) * height + k) * width * type_size_;
+        memcpy(output_base,
+               input_base + pad_w * type_size_,
+               width * type_size_);
       }
     }
   }
 }
 
-}  // namespace fp32
+DeconvComputeParam Deconv2dBase::PreWorkAndGetDeconvParam(
+    const OpContext *context, const Tensor *input, Tensor *out_tensor) {
+
+  auto &in_shape = input->shape();
+  auto &out_shape = out_tensor->shape();
+
+  const index_t batch = in_shape[0];
+  const index_t inch = in_shape[1];
+  const index_t h = in_shape[2];
+  const index_t w = in_shape[3];
+
+  const index_t outch = out_shape[1];
+  const index_t outh = out_shape[2];
+  const index_t outw = out_shape[3];
+  const index_t out_img_size = outh * outw;
+
+  utils::ThreadPool
+      &thread_pool = context->device()->cpu_runtime()->thread_pool();
+
+  return DeconvComputeParam(batch, inch, h, w, outch, outh, outw,
+                            out_img_size, &thread_pool);
+}
+
+DepthwiseDeconvComputeParam Deconv2dBase::PreWorkAndGetDepthwiseDeconvParam(
+    const OpContext *context, const Tensor *input, Tensor *out_tensor) {
+  auto &in_shape = input->shape();
+  auto &out_shape = out_tensor->shape();
+
+  const index_t batch = in_shape[0];
+  const index_t channels = in_shape[1];
+  const index_t h = in_shape[2];
+  const index_t w = in_shape[3];
+  const index_t in_img_size = h * w;
+  const index_t outh = out_shape[2];
+  const index_t outw = out_shape[3];
+  const index_t out_img_size = outh * outw;
+
+  utils::ThreadPool
+      &thread_pool = context->device()->cpu_runtime()->thread_pool();
+
+  return DepthwiseDeconvComputeParam(batch, channels, h, w, in_img_size,
+                                     outh, outw, out_img_size, &thread_pool);
+}
+
+GroupDeconvComputeParam Deconv2dBase::PreWorkAndGetGroupDeconvParam(
+    const OpContext *context, const Tensor *input, Tensor *out_tensor) {
+  auto &in_shape = input->shape();
+  auto &out_shape = out_tensor->shape();
+
+  const index_t batch = in_shape[0];
+  const index_t inch = in_shape[1];
+  const index_t h = in_shape[2];
+  const index_t w = in_shape[3];
+
+  const index_t outch = out_shape[1];
+  const index_t outh = out_shape[2];
+  const index_t outw = out_shape[3];
+
+  const index_t in_img_size = h * w;
+  const index_t out_img_size = outh * outw;
+
+  const index_t inch_g = inch / group_;
+  const index_t outch_g = outch / group_;
+
+  utils::ThreadPool
+      &thread_pool = context->device()->cpu_runtime()->thread_pool();
+
+  return GroupDeconvComputeParam(batch, inch, h, w, outch, outh, outw,
+                                 in_img_size, out_img_size, inch_g,
+                                 outch_g, &thread_pool);
+}
+
 }  // namespace arm
 }  // namespace ops
 }  // namespace mace
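The deconv scratch buffer follows the same byte-based sizing: the padded output is accumulate(padded_out_shape) * type_size_ bytes instead of a hard-coded sizeof(float). Worked example for an assumed padded output shape of {1, 32, 130, 130}:

    float (4-byte): 1 * 32 * 130 * 130 * 4 = 2,163,200 bytes
    bf16  (2-byte): 1 * 32 * 130 * 130 * 2 = 1,081,600 bytes

plus PadAlignSize() rounding.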
+ +#ifndef MACE_OPS_ARM_BASE_DECONV_2D_H_ +#define MACE_OPS_ARM_BASE_DECONV_2D_H_ + +#include +#include + +#include "mace/core/ops/op_context.h" +#include "mace/core/tensor.h" +#include "mace/core/types.h" +#include "mace/ops/arm/base/gemm.h" +#include "mace/ops/common/conv_pool_2d_util.h" +#include "mace/ops/delegator/deconv_2d.h" +#include "mace/public/mace.h" + +namespace mace { +namespace ops { +namespace arm { + +struct DeconvComputeParam { + const index_t batch; + const index_t in_channels; + const index_t in_height; + const index_t in_width; + const index_t out_channels; + const index_t out_height; + const index_t out_width; + const index_t out_img_size; + + utils::ThreadPool &thread_pool; + + DeconvComputeParam(const index_t b, + const index_t in_c, + const index_t in_h, + const index_t in_w, + const index_t out_c, + const index_t out_h, + const index_t out_w, + const index_t out_size, + utils::ThreadPool *thrd_pool) + : batch(b), in_channels(in_c), in_height(in_h), in_width(in_w), + out_channels(out_c), out_height(out_h), out_width(out_w), + out_img_size(out_size), thread_pool(*thrd_pool) {} +}; + +struct DepthwiseDeconvComputeParam { + const index_t batch; + const index_t in_channels; + const index_t in_height; + const index_t in_width; + const index_t in_img_size; + const index_t out_height; + const index_t out_width; + const index_t out_img_size; + utils::ThreadPool &thread_pool; + + DepthwiseDeconvComputeParam(const index_t b, + const index_t in_c, + const index_t in_h, + const index_t in_w, + const index_t in_size, + const index_t out_h, + const index_t out_w, + const index_t out_size, + utils::ThreadPool *thrd_pool) + : batch(b), + in_channels(in_c), + in_height(in_h), + in_width(in_w), + in_img_size(in_size), + out_height(out_h), + out_width(out_w), + out_img_size(out_size), + thread_pool(*thrd_pool) {} +}; + +struct GroupDeconvComputeParam { + const index_t batch; + const index_t in_channels; + const index_t in_height; + const index_t in_width; + + const index_t out_channels; + const index_t out_height; + const index_t out_width; + + const index_t in_img_size; + const index_t out_img_size; + + const index_t inch_g; + const index_t outch_g; + utils::ThreadPool &thread_pool; + + GroupDeconvComputeParam(const index_t in_b, + const index_t in_ch, + const index_t in_h, + const index_t in_w, + const index_t out_ch, + const index_t out_h, + const index_t out_w, + const index_t in_size, + const index_t out_size, + const index_t in_ch_g, + const index_t out_ch_g, + utils::ThreadPool *thrd_pool) + : batch(in_b), + in_channels(in_ch), + in_height(in_h), + in_width(in_w), + out_channels(out_ch), + out_height(out_h), + out_width(out_w), + in_img_size(in_size), + out_img_size(out_size), + inch_g(in_ch_g), + outch_g(out_ch_g), + thread_pool(*thrd_pool) {} +}; + +class Deconv2dBase : public delegator::Deconv2d { + public: + explicit Deconv2dBase(const delegator::Deconv2dParam ¶m, int type_size) + : delegator::Deconv2d(param), + group_(param.group_), type_size_(type_size) {} + + virtual ~Deconv2dBase() = default; + + protected: + MaceStatus ResizeOutAndPadOut(const OpContext *context, + const Tensor *input, + const Tensor *filter, + const Tensor *output_shape, + Tensor *output, + std::vector *out_pad_size, + std::unique_ptr *padded_output); + + void UnPadOutput(const Tensor &src, + const std::vector &out_pad_size, + Tensor *dst); + + DeconvComputeParam PreWorkAndGetDeconvParam( + const OpContext *context, const Tensor *input, Tensor *out_tensor); + DepthwiseDeconvComputeParam 
PreWorkAndGetDepthwiseDeconvParam( + const OpContext *context, const Tensor *input, Tensor *out_tensor); + GroupDeconvComputeParam PreWorkAndGetGroupDeconvParam( + const OpContext *context, const Tensor *input, Tensor *out_tensor); + + protected: + index_t group_; + + private: + int type_size_; +}; + +} // namespace arm +} // namespace ops +} // namespace mace + +#endif // MACE_OPS_ARM_BASE_DECONV_2D_H_ diff --git a/mace/ops/arm/base/deconv_2d_2x2.cc b/mace/ops/arm/base/deconv_2d_2x2.cc new file mode 100644 index 0000000000000000000000000000000000000000..d8181034b38835401c9c2047bc3d661311a9b90b --- /dev/null +++ b/mace/ops/arm/base/deconv_2d_2x2.cc @@ -0,0 +1,34 @@ +// Copyright 2020 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "mace/ops/arm/base/deconv_2d_2x2.h" + +namespace mace { +namespace ops { +namespace arm { + +void RegisterDeconv2dK2x2Delegator(OpDelegatorRegistry *registry) { + MACE_REGISTER_DELEGATOR( + registry, Deconv2dK2x2S1, delegator::Deconv2dParam, + MACE_DELEGATOR_KEY_EX(Deconv2d, DeviceType::CPU, + float, ImplType::NEON, K2x2S1)); + MACE_REGISTER_DELEGATOR( + registry, Deconv2dK2x2S2, delegator::Deconv2dParam, + MACE_DELEGATOR_KEY_EX(Deconv2d, DeviceType::CPU, + float, ImplType::NEON, K2x2S2)); +} + +} // namespace arm +} // namespace ops +} // namespace mace diff --git a/mace/ops/arm/fp32/deconv_2d_2x2.h b/mace/ops/arm/base/deconv_2d_2x2.h similarity index 58% rename from mace/ops/arm/fp32/deconv_2d_2x2.h rename to mace/ops/arm/base/deconv_2d_2x2.h index 6fd533444a2e1a1e910c2d527987112940ddb4cc..6d1a416a114f3771f5ed1d120b3be1542b7aa1c2 100644 --- a/mace/ops/arm/fp32/deconv_2d_2x2.h +++ b/mace/ops/arm/base/deconv_2d_2x2.h @@ -1,4 +1,4 @@ -// Copyright 2019 The MACE Authors. All Rights Reserved. +// Copyright 2020 The MACE Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -12,8 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. 
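
RegisterDeconv2dK2x2Delegator above binds each kernel to a composite key built from op name, device, data type, impl type, and a kernel tag such as K2x2S1. A simplified, self-contained sketch of that kind of keyed factory (the types and names below are illustrative stand-ins, not the actual MACE declarations):

#include <functional>
#include <map>
#include <memory>
#include <string>

struct Delegator {};  // stand-in for the delegator base class
using Creator = std::function<std::unique_ptr<Delegator>()>;

class MiniRegistry {
 public:
  void Register(const std::string &key, Creator creator) {
    registry_.emplace(key, std::move(creator));
  }
  Creator GetCreator(const std::string &key) const {
    auto it = registry_.find(key);
    return it != registry_.end() ? it->second : Creator();  // empty if absent
  }
 private:
  std::map<std::string, Creator> registry_;
};
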
-#ifndef MACE_OPS_ARM_FP32_DECONV_2D_2X2_H_ -#define MACE_OPS_ARM_FP32_DECONV_2D_2X2_H_ +#ifndef MACE_OPS_ARM_BASE_DECONV_2D_2X2_H_ +#define MACE_OPS_ARM_BASE_DECONV_2D_2X2_H_ #include #include @@ -21,46 +21,38 @@ #include "mace/core/ops/op_context.h" #include "mace/core/tensor.h" #include "mace/core/types.h" -#include "mace/ops/arm/fp32/deconv_2d.h" +#include "mace/ops/arm/base/deconv_2d_mxn.h" #include "mace/ops/common/conv_pool_2d_util.h" #include "mace/public/mace.h" namespace mace { namespace ops { namespace arm { -namespace fp32 { -class Deconv2dK2x2S1 : public Deconv2dBase { +template +class Deconv2dK2x2S1 : public Deconv2dKMxN { public: explicit Deconv2dK2x2S1(const delegator::Deconv2dParam ¶m) - : Deconv2dBase(param) {} + : Deconv2dKMxN(param) {} virtual ~Deconv2dK2x2S1() {} - MaceStatus Compute( - const OpContext *context, - const Tensor *input, - const Tensor *filter, - const Tensor *output_shape, - Tensor *output) override; + MaceStatus DoCompute(const DeconvComputeParam &p, const T *filter, + const T *input_data, T *padded_out_data) override; }; -class Deconv2dK2x2S2 : public Deconv2dBase { +template +class Deconv2dK2x2S2 : public Deconv2dKMxN { public: explicit Deconv2dK2x2S2(const delegator::Deconv2dParam ¶m) - : Deconv2dBase(param) {} + : Deconv2dKMxN(param) {} virtual ~Deconv2dK2x2S2() {} - MaceStatus Compute( - const OpContext *context, - const Tensor *input, - const Tensor *filter, - const Tensor *output_shape, - Tensor *output) override; + MaceStatus DoCompute(const DeconvComputeParam &p, const T *filter, + const T *input_data, T *padded_out_data) override; }; -} // namespace fp32 } // namespace arm } // namespace ops } // namespace mace -#endif // MACE_OPS_ARM_FP32_DECONV_2D_2X2_H_ +#endif // MACE_OPS_ARM_BASE_DECONV_2D_2X2_H_ diff --git a/mace/ops/arm/base/deconv_2d_3x3.cc b/mace/ops/arm/base/deconv_2d_3x3.cc new file mode 100644 index 0000000000000000000000000000000000000000..05d936c29b793d326d8d30e845a3df94c3a5dec4 --- /dev/null +++ b/mace/ops/arm/base/deconv_2d_3x3.cc @@ -0,0 +1,34 @@ +// Copyright 2020 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
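
Note how the 2x2 kernels above no longer override the five-argument Compute; each implements only DoCompute, with the tensor plumbing hoisted into Deconv2dKMxN. The resulting template-method shape, in miniature (illustrative names, not the MACE classes):

struct Param { int n = 0; };  // stand-in for DeconvComputeParam

class KernelBase {
 public:
  virtual ~KernelBase() = default;
  int Compute() {         // shared path: resize, map, clear, unpad
    Param p;              // bounds and thread pool computed once
    return DoCompute(p);  // only the hot loop is kernel-specific
  }
 protected:
  virtual int DoCompute(const Param &p) = 0;
};

class K2x2S1 : public KernelBase {
 protected:
  int DoCompute(const Param &p) override { return p.n; }  // NEON loop here
};
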
+ +#include "mace/ops/arm/base/deconv_2d_3x3.h" + +namespace mace { +namespace ops { +namespace arm { + +void RegisterDeconv2dK3x3Delegator(OpDelegatorRegistry *registry) { + MACE_REGISTER_DELEGATOR( + registry, Deconv2dK3x3S1, delegator::Deconv2dParam, + MACE_DELEGATOR_KEY_EX(Deconv2d, DeviceType::CPU, + float, ImplType::NEON, K3x3S1)); + MACE_REGISTER_DELEGATOR( + registry, Deconv2dK3x3S2, delegator::Deconv2dParam, + MACE_DELEGATOR_KEY_EX(Deconv2d, DeviceType::CPU, + float, ImplType::NEON, K3x3S2)); +} + +} // namespace arm +} // namespace ops +} // namespace mace diff --git a/mace/ops/arm/fp32/deconv_2d_3x3.h b/mace/ops/arm/base/deconv_2d_3x3.h similarity index 58% rename from mace/ops/arm/fp32/deconv_2d_3x3.h rename to mace/ops/arm/base/deconv_2d_3x3.h index 65cc23e6f365d9809d983c94bc12855760046a17..00b33b429a28af55f802dedc85d67b65c614d82e 100644 --- a/mace/ops/arm/fp32/deconv_2d_3x3.h +++ b/mace/ops/arm/base/deconv_2d_3x3.h @@ -1,4 +1,4 @@ -// Copyright 2019 The MACE Authors. All Rights Reserved. +// Copyright 2020 The MACE Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -12,8 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -#ifndef MACE_OPS_ARM_FP32_DECONV_2D_3X3_H_ -#define MACE_OPS_ARM_FP32_DECONV_2D_3X3_H_ +#ifndef MACE_OPS_ARM_BASE_DECONV_2D_3X3_H_ +#define MACE_OPS_ARM_BASE_DECONV_2D_3X3_H_ #include #include @@ -21,46 +21,38 @@ #include "mace/core/ops/op_context.h" #include "mace/core/tensor.h" #include "mace/core/types.h" -#include "mace/ops/arm/fp32/deconv_2d.h" +#include "mace/ops/arm/base/deconv_2d_mxn.h" #include "mace/ops/common/conv_pool_2d_util.h" #include "mace/public/mace.h" namespace mace { namespace ops { namespace arm { -namespace fp32 { -class Deconv2dK3x3S1 : public Deconv2dBase { +template +class Deconv2dK3x3S1 : public Deconv2dKMxN { public: explicit Deconv2dK3x3S1(const delegator::Deconv2dParam ¶m) - : Deconv2dBase(param) {} + : Deconv2dKMxN(param) {} virtual ~Deconv2dK3x3S1() {} - MaceStatus Compute( - const OpContext *context, - const Tensor *input, - const Tensor *filter, - const Tensor *output_shape, - Tensor *output) override; + MaceStatus DoCompute(const DeconvComputeParam &p, const T *filter, + const T *input_data, T *padded_out_data) override; }; -class Deconv2dK3x3S2 : public Deconv2dBase { +template +class Deconv2dK3x3S2 : public Deconv2dKMxN { public: explicit Deconv2dK3x3S2(const delegator::Deconv2dParam ¶m) - : Deconv2dBase(param) {} + : Deconv2dKMxN(param) {} virtual ~Deconv2dK3x3S2() {} - MaceStatus Compute( - const OpContext *context, - const Tensor *input, - const Tensor *filter, - const Tensor *output_shape, - Tensor *output) override; + MaceStatus DoCompute(const DeconvComputeParam &p, const T *filter, + const T *input_data, T *padded_out_data) override; }; -} // namespace fp32 } // namespace arm } // namespace ops } // namespace mace -#endif // MACE_OPS_ARM_FP32_DECONV_2D_3X3_H_ +#endif // MACE_OPS_ARM_BASE_DECONV_2D_3X3_H_ diff --git a/mace/ops/arm/base/deconv_2d_4x4.cc b/mace/ops/arm/base/deconv_2d_4x4.cc new file mode 100644 index 0000000000000000000000000000000000000000..9cc42b3ddfd12843a405292fc65d283384e38cda --- /dev/null +++ b/mace/ops/arm/base/deconv_2d_4x4.cc @@ -0,0 +1,34 @@ +// Copyright 2020 The MACE Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "mace/ops/arm/base/deconv_2d_4x4.h" + +namespace mace { +namespace ops { +namespace arm { + +void RegisterDeconv2dK4x4Delegator(OpDelegatorRegistry *registry) { + MACE_REGISTER_DELEGATOR( + registry, Deconv2dK4x4S1, delegator::Deconv2dParam, + MACE_DELEGATOR_KEY_EX(Deconv2d, DeviceType::CPU, + float, ImplType::NEON, K4x4S1)); + MACE_REGISTER_DELEGATOR( + registry, Deconv2dK4x4S2, delegator::Deconv2dParam, + MACE_DELEGATOR_KEY_EX(Deconv2d, DeviceType::CPU, + float, ImplType::NEON, K4x4S2)); +} + +} // namespace arm +} // namespace ops +} // namespace mace diff --git a/mace/ops/arm/fp32/deconv_2d_4x4.h b/mace/ops/arm/base/deconv_2d_4x4.h similarity index 58% rename from mace/ops/arm/fp32/deconv_2d_4x4.h rename to mace/ops/arm/base/deconv_2d_4x4.h index bf86a62ab4575ef20072dc6f1fd648f2bd65da14..692ff73865a5ac6e7ef5651634874190ed85c964 100644 --- a/mace/ops/arm/fp32/deconv_2d_4x4.h +++ b/mace/ops/arm/base/deconv_2d_4x4.h @@ -1,4 +1,4 @@ -// Copyright 2019 The MACE Authors. All Rights Reserved. +// Copyright 2020 The MACE Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -12,55 +12,47 @@ // See the License for the specific language governing permissions and // limitations under the License. 
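
With 2x2, 3x3 and 4x4 variants registered under distinct tags, a caller can pick a specialized kernel from filter geometry and fall back to the general one. A hedged sketch of that selection logic (the helper below is illustrative only; the real choice happens inside the deconv op, not through this function):

const char *SelectDeconvKernelTag(int kernel_size, int stride) {
  const bool s1 = (stride == 1), s2 = (stride == 2);
  if (kernel_size == 2 && (s1 || s2)) return s1 ? "K2x2S1" : "K2x2S2";
  if (kernel_size == 3 && (s1 || s2)) return s1 ? "K3x3S1" : "K3x3S2";
  if (kernel_size == 4 && (s1 || s2)) return s1 ? "K4x4S1" : "K4x4S2";
  return "general";  // anything else goes through Deconv2dGeneral
}
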
-#ifndef MACE_OPS_ARM_FP32_DECONV_2D_4X4_H_ -#define MACE_OPS_ARM_FP32_DECONV_2D_4X4_H_ +#ifndef MACE_OPS_ARM_BASE_DECONV_2D_4X4_H_ +#define MACE_OPS_ARM_BASE_DECONV_2D_4X4_H_ -#include #include +#include #include "mace/core/ops/op_context.h" #include "mace/core/tensor.h" #include "mace/core/types.h" -#include "mace/ops/arm/fp32/deconv_2d.h" +#include "mace/ops/arm/base/deconv_2d_mxn.h" #include "mace/ops/common/conv_pool_2d_util.h" #include "mace/public/mace.h" namespace mace { namespace ops { namespace arm { -namespace fp32 { -class Deconv2dK4x4S1 : public Deconv2dBase { +template +class Deconv2dK4x4S1 : public Deconv2dKMxN { public: explicit Deconv2dK4x4S1(const delegator::Deconv2dParam ¶m) - : Deconv2dBase(param) {} + : Deconv2dKMxN(param) {} virtual ~Deconv2dK4x4S1() {} - MaceStatus Compute( - const OpContext *context, - const Tensor *input, - const Tensor *filter, - const Tensor *output_shape, - Tensor *output) override; + MaceStatus DoCompute(const DeconvComputeParam &p, const T *filter, + const T *input_data, T *padded_out_data) override; }; -class Deconv2dK4x4S2 : public Deconv2dBase { +template +class Deconv2dK4x4S2 : public Deconv2dKMxN { public: explicit Deconv2dK4x4S2(const delegator::Deconv2dParam ¶m) - : Deconv2dBase(param) {} + : Deconv2dKMxN(param) {} virtual ~Deconv2dK4x4S2() {} - MaceStatus Compute( - const OpContext *context, - const Tensor *input, - const Tensor *filter, - const Tensor *output_shape, - Tensor *output) override; + MaceStatus DoCompute(const DeconvComputeParam &p, const T *filter, + const T *input_data, T *padded_out_data) override; }; -} // namespace fp32 } // namespace arm } // namespace ops } // namespace mace -#endif // MACE_OPS_ARM_FP32_DECONV_2D_4X4_H_ +#endif // MACE_OPS_ARM_BASE_DECONV_2D_4X4_H_ diff --git a/mace/ops/arm/fp32/deconv_2d_general.cc b/mace/ops/arm/base/deconv_2d_general.cc similarity index 76% rename from mace/ops/arm/fp32/deconv_2d_general.cc rename to mace/ops/arm/base/deconv_2d_general.cc index d090ba23104869712fa2af1e9fc9e6dc203f0276..ec95f186aef7d29f9789d8a53a69e773fca8cb6d 100644 --- a/mace/ops/arm/fp32/deconv_2d_general.cc +++ b/mace/ops/arm/base/deconv_2d_general.cc @@ -1,4 +1,4 @@ -// Copyright 2019 The MACE Authors. All Rights Reserved. +// Copyright 2020 The MACE Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -12,34 +12,21 @@ // See the License for the specific language governing permissions and // limitations under the License. 
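
Every DoCompute above receives its loop bounds through a const DeconvComputeParam built once by PreWorkAndGetDeconvParam, rather than re-deriving shapes inside each kernel. The parameter-object idiom, reduced to its essentials (names and values illustrative):

namespace utils { class ThreadPool {}; }  // stand-in for mace::utils::ThreadPool

struct ComputeParam {
  const int batch, out_channels, out_img_size;
  utils::ThreadPool &thread_pool;  // shared pool, bound once
  ComputeParam(int b, int oc, int osize, utils::ThreadPool *pool)
      : batch(b), out_channels(oc), out_img_size(osize),
        thread_pool(*pool) {}
};

ComputeParam MakeParam(utils::ThreadPool *pool) {
  return ComputeParam(1, 32, 16 * 16, pool);  // bounds computed once, by value
}

Holding the pool by reference keeps the struct cheap to return by value, while the const dimensions make accidental mutation inside a kernel a compile error.
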
-#include "mace/ops/arm/fp32/deconv_2d.h" +#include "mace/ops/arm/base/deconv_2d_general.h" -// TODO(liutuo): optimize it +#include +#include namespace mace { namespace ops { namespace arm { -namespace fp32 { - -class Deconv2dGeneral : public Deconv2dBase { - public: - explicit Deconv2dGeneral(const delegator::Deconv2dParam ¶m) - : Deconv2dBase(param) {} - virtual ~Deconv2dGeneral() {} - - MaceStatus Compute( - const OpContext *context, - const Tensor *input, - const Tensor *filter, - const Tensor *output_shape, - Tensor *output) override; -}; - -MaceStatus Deconv2dGeneral::Compute(const OpContext *context, - const Tensor *input, - const Tensor *filter, - const Tensor *output_shape, - Tensor *output) { + +template +MaceStatus Deconv2dGeneral::Compute(const OpContext *context, + const Tensor *input, + const Tensor *filter, + const Tensor *output_shape, + Tensor *output) { std::unique_ptr padded_out; std::vector out_pad_size; ResizeOutAndPadOut(context, @@ -60,9 +47,9 @@ MaceStatus Deconv2dGeneral::Compute(const OpContext *context, Tensor::MappingGuard filter_mapper(filter); Tensor::MappingGuard output_mapper(output); - auto input_data = input->data(); - auto filter_data = filter->data(); - auto padded_out_data = out_tensor->mutable_data(); + auto input_data = input->data(); + auto filter_data = filter->data(); + auto padded_out_data = out_tensor->mutable_data(); auto &in_shape = input->shape(); auto &out_shape = out_tensor->shape(); @@ -95,7 +82,7 @@ MaceStatus Deconv2dGeneral::Compute(const OpContext *context, index_t start1, index_t end1, index_t step1) { for (index_t b = start0; b < end0; b += step0) { for (index_t oc = start1; oc < end1; oc += step1) { - float *out_base = + T *out_base = padded_out_data + (b * out_channels + oc) * out_img_size; for (index_t i = 0; i < in_height; ++i) { for (index_t j = 0; j < in_width; ++j) { @@ -104,7 +91,7 @@ MaceStatus Deconv2dGeneral::Compute(const OpContext *context, for (int ic = 0; ic < in_channels; ++ic) { const index_t input_idx = (b * in_channels + ic) * in_img_size + i * in_width + j; - const float val = input_data[input_idx]; + const T val = input_data[input_idx]; const index_t kernel_offset = (oc * in_channels + ic) * kernel_size; for (int k = 0; k < kernel_size; ++k) { @@ -126,11 +113,10 @@ MaceStatus Deconv2dGeneral::Compute(const OpContext *context, void RegisterDeconv2dGeneralDelegator(OpDelegatorRegistry *registry) { MACE_REGISTER_DELEGATOR( - registry, Deconv2dGeneral, delegator::Deconv2dParam, + registry, Deconv2dGeneral, delegator::Deconv2dParam, MACE_DELEGATOR_KEY(Deconv2d, DeviceType::CPU, float, ImplType::NEON)); } -} // namespace fp32 } // namespace arm } // namespace ops } // namespace mace diff --git a/mace/ops/arm/base/deconv_2d_general.h b/mace/ops/arm/base/deconv_2d_general.h new file mode 100644 index 0000000000000000000000000000000000000000..fe1786dd96a62447ae8cfe6c4dfa99123c6432fc --- /dev/null +++ b/mace/ops/arm/base/deconv_2d_general.h @@ -0,0 +1,46 @@ +// Copyright 2020 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef MACE_OPS_ARM_BASE_DECONV_2D_GENERAL_H_ +#define MACE_OPS_ARM_BASE_DECONV_2D_GENERAL_H_ + +#include "mace/ops/arm/base/deconv_2d.h" + +// TODO(liutuo): optimize it + +namespace mace { +namespace ops { +namespace arm { + +template +class Deconv2dGeneral : public Deconv2dBase { + public: + explicit Deconv2dGeneral(const delegator::Deconv2dParam ¶m) + : Deconv2dBase(param, sizeof(T)) {} + virtual ~Deconv2dGeneral() {} + + MaceStatus Compute( + const OpContext *context, + const Tensor *input, + const Tensor *filter, + const Tensor *output_shape, + Tensor *output) override; +}; + +} // namespace arm +} // namespace ops +} // namespace mace + +#endif // MACE_OPS_ARM_BASE_DECONV_2D_GENERAL_H_ + diff --git a/mace/ops/arm/base/deconv_2d_mxn.h b/mace/ops/arm/base/deconv_2d_mxn.h new file mode 100644 index 0000000000000000000000000000000000000000..89775dae9d5d159258ef572f56bee96fc12202bc --- /dev/null +++ b/mace/ops/arm/base/deconv_2d_mxn.h @@ -0,0 +1,77 @@ +// Copyright 2020 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef MACE_OPS_ARM_BASE_DECONV_2D_MXN_H_ +#define MACE_OPS_ARM_BASE_DECONV_2D_MXN_H_ + +#include +#include + +#include "mace/core/ops/op_context.h" +#include "mace/core/tensor.h" +#include "mace/ops/arm/base/deconv_2d.h" +#include "mace/public/mace.h" + +namespace mace { +namespace ops { +namespace arm { + +template +class Deconv2dKMxN : public Deconv2dBase { + public: + explicit Deconv2dKMxN(const delegator::Deconv2dParam ¶m) + : Deconv2dBase(param, sizeof(T)) {} + virtual ~Deconv2dKMxN() {} + + MaceStatus Compute(const OpContext *context, + const Tensor *input, + const Tensor *filter, + const Tensor *output_shape, + Tensor *output) { + std::unique_ptr padded_out; + std::vector out_pad_size; + ResizeOutAndPadOut(context, input, filter, output_shape, + output, &out_pad_size, &padded_out); + + Tensor *out_tensor = output; + if (padded_out != nullptr) { + out_tensor = padded_out.get(); + } + out_tensor->Clear(); + + Tensor::MappingGuard input_mapper(input); + Tensor::MappingGuard filter_mapper(filter); + Tensor::MappingGuard output_mapper(output); + + const T *input_data = input->data(); + const T *filter_data = filter->data(); + T *padded_out_data = out_tensor->mutable_data(); + + const DeconvComputeParam p = + PreWorkAndGetDeconvParam(context, input, out_tensor); + DoCompute(p, filter_data, input_data, padded_out_data); + UnPadOutput(*out_tensor, out_pad_size, output); + + return MaceStatus::MACE_SUCCESS; + } + + virtual MaceStatus DoCompute(const DeconvComputeParam &p, const T *filter, + const T *input_data, T *padded_out_data) = 0; +}; + +} // namespace arm +} // namespace ops +} // namespace mace + +#endif // MACE_OPS_ARM_BASE_DECONV_2D_MXN_H_ diff --git a/mace/ops/arm/base/depthwise_conv_2d_3x3.cc b/mace/ops/arm/base/depthwise_conv_2d_3x3.cc new file mode 100644 index 
0000000000000000000000000000000000000000..f94239596acca1027ddfdb685099e3d43d0326f5 --- /dev/null +++ b/mace/ops/arm/base/depthwise_conv_2d_3x3.cc @@ -0,0 +1,34 @@ +// Copyright 2020 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "mace/ops/arm/base/depthwise_conv_2d_3x3.h" + +namespace mace { +namespace ops { +namespace arm { + +void RegisterDepthwiseConv2dK3x3Delegator(OpDelegatorRegistry *registry) { + MACE_REGISTER_DELEGATOR( + registry, DepthwiseConv2dK3x3S1, delegator::DepthwiseConv2dParam, + MACE_DELEGATOR_KEY_EX(DepthwiseConv2d, DeviceType::CPU, + float, ImplType::NEON, K3x3S1)); + MACE_REGISTER_DELEGATOR( + registry, DepthwiseConv2dK3x3S2, delegator::DepthwiseConv2dParam, + MACE_DELEGATOR_KEY_EX(DepthwiseConv2d, DeviceType::CPU, + float, ImplType::NEON, K3x3S2)); +} + +} // namespace arm +} // namespace ops +} // namespace mace diff --git a/mace/ops/arm/fp32/depthwise_conv_2d_3x3.h b/mace/ops/arm/base/depthwise_conv_2d_3x3.h similarity index 58% rename from mace/ops/arm/fp32/depthwise_conv_2d_3x3.h rename to mace/ops/arm/base/depthwise_conv_2d_3x3.h index 49412b808dde686c26fff1b80137ab86c78d65f9..c9edf26e9318de5d4a0baa5eb88a11f97c840e04 100644 --- a/mace/ops/arm/fp32/depthwise_conv_2d_3x3.h +++ b/mace/ops/arm/base/depthwise_conv_2d_3x3.h @@ -1,4 +1,4 @@ -// Copyright 2019 The MACE Authors. All Rights Reserved. +// Copyright 2020 The MACE Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -12,51 +12,47 @@ // See the License for the specific language governing permissions and // limitations under the License. 
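
For orientation: Deconv2dGeneral, a few hunks up, scatters every input pixel into a kernel-sized window of the padded output and accumulates, which is why the padded output is cleared before the loop. The core of that accumulation as a scalar sketch, for one channel with unit stride and no dilation (assumed row-major layout; padded_out must be zero-initialized and at least (in_h + 2) x out_w for a 3x3 filter):

void DeconvScatterOneChannel(const float *input, const float *filter3x3,
                             float *padded_out, int in_h, int in_w,
                             int out_w) {
  for (int i = 0; i < in_h; ++i) {
    for (int j = 0; j < in_w; ++j) {
      const float val = input[i * in_w + j];
      for (int ki = 0; ki < 3; ++ki) {    // scatter into a 3x3 window
        for (int kj = 0; kj < 3; ++kj) {
          padded_out[(i + ki) * out_w + (j + kj)] +=
              val * filter3x3[ki * 3 + kj];
        }
      }
    }
  }
}
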
-#ifndef MACE_OPS_ARM_FP32_DEPTHWISE_CONV_2D_3X3_H_ -#define MACE_OPS_ARM_FP32_DEPTHWISE_CONV_2D_3X3_H_ +#ifndef MACE_OPS_ARM_BASE_DEPTHWISE_CONV_2D_3X3_H_ +#define MACE_OPS_ARM_BASE_DEPTHWISE_CONV_2D_3X3_H_ #include #include "mace/core/ops/op_context.h" #include "mace/core/tensor.h" -#include "mace/ops/arm/fp32/conv_2d.h" +#include "mace/ops/arm/base/depthwise_conv_2d_mxn.h" #include "mace/ops/delegator/depthwise_conv_2d.h" #include "mace/public/mace.h" namespace mace { namespace ops { namespace arm { -namespace fp32 { -class DepthwiseConv2dK3x3S1 : public Conv2dBase { +template +class DepthwiseConv2dK3x3S1 : public DepthwiseConv2dKMxN { public: explicit DepthwiseConv2dK3x3S1(const delegator::DepthwiseConv2dParam ¶m) - : Conv2dBase(param) {} + : DepthwiseConv2dKMxN(param) {} virtual ~DepthwiseConv2dK3x3S1() {} - MaceStatus Compute( - const OpContext *context, - const Tensor *input, - const Tensor *filter, - Tensor *output) override; + MaceStatus DoCompute( + const DepthwiseConvComputeParam &p, const T *filter, + const T *input_data, T *output_data) override; }; -class DepthwiseConv2dK3x3S2 : public Conv2dBase { +template +class DepthwiseConv2dK3x3S2 : public DepthwiseConv2dKMxN { public: explicit DepthwiseConv2dK3x3S2(const delegator::DepthwiseConv2dParam ¶m) - : Conv2dBase(param) {} + : DepthwiseConv2dKMxN(param) {} virtual ~DepthwiseConv2dK3x3S2() {} - MaceStatus Compute( - const OpContext *context, - const Tensor *input, - const Tensor *filter, - Tensor *output) override; + MaceStatus DoCompute( + const DepthwiseConvComputeParam &p, const T *filter, + const T *input_data, T *output_data) override; }; -} // namespace fp32 } // namespace arm } // namespace ops } // namespace mace -#endif // MACE_OPS_ARM_FP32_DEPTHWISE_CONV_2D_3X3_H_ +#endif // MACE_OPS_ARM_BASE_DEPTHWISE_CONV_2D_3X3_H_ diff --git a/mace/ops/arm/base/depthwise_conv_2d_mxn.h b/mace/ops/arm/base/depthwise_conv_2d_mxn.h new file mode 100644 index 0000000000000000000000000000000000000000..5f59802be83bce02e872c2e8836f2609f84eb9e5 --- /dev/null +++ b/mace/ops/arm/base/depthwise_conv_2d_mxn.h @@ -0,0 +1,64 @@ +// Copyright 2020 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
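
Unlike the dense kernels, the depthwise variants convolve each channel with its own single filter plane. A reference scalar version of the 3x3, stride-1 case for one channel (valid region only; this illustrates the semantics, not the NEON path):

void DepthwiseConv3x3S1(const float *in, const float *k3x3, float *out,
                        int in_h, int in_w) {
  const int out_h = in_h - 2, out_w = in_w - 2;  // valid convolution
  for (int y = 0; y < out_h; ++y) {
    for (int x = 0; x < out_w; ++x) {
      float acc = 0.f;
      for (int ky = 0; ky < 3; ++ky)
        for (int kx = 0; kx < 3; ++kx)
          acc += in[(y + ky) * in_w + (x + kx)] * k3x3[ky * 3 + kx];
      out[y * out_w + x] = acc;
    }
  }
}
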
+ +#ifndef MACE_OPS_ARM_BASE_DEPTHWISE_CONV_2D_MXN_H_ +#define MACE_OPS_ARM_BASE_DEPTHWISE_CONV_2D_MXN_H_ + +#include + +#include "mace/core/ops/op_context.h" +#include "mace/core/tensor.h" +#include "mace/ops/arm/base/conv_2d.h" +#include "mace/ops/delegator/depthwise_conv_2d.h" +#include "mace/public/mace.h" + +namespace mace { +namespace ops { +namespace arm { + +template +class DepthwiseConv2dKMxN : public Conv2dBase { + public: + explicit DepthwiseConv2dKMxN(const delegator::DepthwiseConv2dParam ¶m) + : Conv2dBase(param, sizeof(T)) {} + virtual ~DepthwiseConv2dKMxN() {} + + MaceStatus Compute(const OpContext *context, const Tensor *input, + const Tensor *filter, Tensor *output) { + DepthwiseConvComputeParam p = + PreWorkAndGetDepthwiseConv2DParam(context, input, filter, output); + + Tensor::MappingGuard in_guard(input); + Tensor::MappingGuard filter_guard(filter); + Tensor::MappingGuard out_guard(output); + const T *filter_data = filter->data(); + const T *input_data = input->data(); + T *output_data = output->mutable_data(); + + DoCompute(p, filter_data, input_data, output_data); + + return MaceStatus::MACE_SUCCESS; + } + + protected: + virtual MaceStatus DoCompute( + const DepthwiseConvComputeParam &p, const T *filter, + const T *input_data, T *output_data) = 0; +}; + +} // namespace arm +} // namespace ops +} // namespace mace + +#endif // MACE_OPS_ARM_BASE_DEPTHWISE_CONV_2D_MXN_H_ diff --git a/mace/ops/arm/base/depthwise_deconv_2d_3x3.cc b/mace/ops/arm/base/depthwise_deconv_2d_3x3.cc new file mode 100644 index 0000000000000000000000000000000000000000..c9a70467dbaafb39b8d76716c3ffa4ca201ea61b --- /dev/null +++ b/mace/ops/arm/base/depthwise_deconv_2d_3x3.cc @@ -0,0 +1,47 @@ +// Copyright 2020 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
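
DepthwiseConv2dKMxN::Compute above maps all three tensors with Tensor::MappingGuard before taking raw pointers, so buffers stay mapped for exactly the lifetime of the computation. A sketch of the RAII shape such a guard has (Buffer and MappingGuard here are simplified stand-ins, not the MACE classes):

class Buffer {
 public:
  void Map() { /* make the host pointer valid */ }
  void Unmap() { /* release the mapping */ }
};

class MappingGuard {
 public:
  explicit MappingGuard(Buffer *b) : buffer_(b) { buffer_->Map(); }
  ~MappingGuard() { buffer_->Unmap(); }  // unmapped on every exit path
  MappingGuard(const MappingGuard &) = delete;
  MappingGuard &operator=(const MappingGuard &) = delete;
 private:
  Buffer *buffer_;
};
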
+ +#include "mace/ops/arm/base/depthwise_deconv_2d_3x3.h" + +namespace mace { +namespace ops { +namespace arm { + +void RegisterDepthwiseDeconv2dK3x3Delegator(OpDelegatorRegistry *registry) { + MACE_REGISTER_DELEGATOR( + registry, DepthwiseDeconv2dK3x3S1, + delegator::DepthwiseDeconv2dParam, + MACE_DELEGATOR_KEY_EX(DepthwiseDeconv2d, DeviceType::CPU, + float, ImplType::NEON, K3x3S1)); + MACE_REGISTER_DELEGATOR( + registry, DepthwiseDeconv2dK3x3S2, + delegator::DepthwiseDeconv2dParam, + MACE_DELEGATOR_KEY_EX(DepthwiseDeconv2d, DeviceType::CPU, + float, ImplType::NEON, K3x3S2)); +} + +void RegisterGroupDeconv2dK3x3Delegator(OpDelegatorRegistry *registry) { + MACE_REGISTER_DELEGATOR( + registry, GroupDeconv2dK3x3S1, delegator::GroupDeconv2dParam, + MACE_DELEGATOR_KEY_EX(GroupDeconv2d, DeviceType::CPU, + float, ImplType::NEON, K3x3S1)); + MACE_REGISTER_DELEGATOR( + registry, GroupDeconv2dK3x3S2, delegator::GroupDeconv2dParam, + MACE_DELEGATOR_KEY_EX(GroupDeconv2d, DeviceType::CPU, + float, ImplType::NEON, K3x3S2)); +} + +} // namespace arm +} // namespace ops +} // namespace mace diff --git a/mace/ops/arm/fp32/depthwise_deconv_2d_3x3.h b/mace/ops/arm/base/depthwise_deconv_2d_3x3.h similarity index 51% rename from mace/ops/arm/fp32/depthwise_deconv_2d_3x3.h rename to mace/ops/arm/base/depthwise_deconv_2d_3x3.h index eeb21d6c3c5d50502b268e61f3b0726066a963cb..afe9356eb33887ea850d35657605fa8bf2689ed4 100644 --- a/mace/ops/arm/fp32/depthwise_deconv_2d_3x3.h +++ b/mace/ops/arm/base/depthwise_deconv_2d_3x3.h @@ -1,4 +1,4 @@ -// Copyright 2019 The MACE Authors. All Rights Reserved. +// Copyright 2020 The MACE Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -12,8 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#ifndef MACE_OPS_ARM_FP32_DEPTHWISE_DECONV_2D_3X3_H_ -#define MACE_OPS_ARM_FP32_DEPTHWISE_DECONV_2D_3X3_H_ +#ifndef MACE_OPS_ARM_BASE_DEPTHWISE_DECONV_2D_3X3_H_ +#define MACE_OPS_ARM_BASE_DEPTHWISE_DECONV_2D_3X3_H_ #include #include @@ -21,7 +21,7 @@ #include "mace/core/ops/op_context.h" #include "mace/core/tensor.h" #include "mace/core/types.h" -#include "mace/ops/arm/fp32/deconv_2d.h" +#include "mace/ops/arm/base/depthwise_deconv_2d_mxn.h" #include "mace/ops/common/conv_pool_2d_util.h" #include "mace/ops/delegator/depthwise_deconv_2d.h" #include "mace/public/mace.h" @@ -29,70 +29,56 @@ namespace mace { namespace ops { namespace arm { -namespace fp32 { -class DepthwiseDeconv2dK3x3S1 : public Deconv2dBase { +template +class DepthwiseDeconv2dK3x3S1 : public DepthwiseDeconv2dKMxN { public: explicit DepthwiseDeconv2dK3x3S1( const delegator::DepthwiseDeconv2dParam ¶m) - : Deconv2dBase(param) {} + : DepthwiseDeconv2dKMxN(param) {} virtual ~DepthwiseDeconv2dK3x3S1() {} - MaceStatus Compute( - const OpContext *context, - const Tensor *input, - const Tensor *filter, - const Tensor *output_shape, - Tensor *output) override; + MaceStatus DoCompute(const DepthwiseDeconvComputeParam &p, const T *filter, + const T *input_data, T *padded_out_data) override; }; -class DepthwiseDeconv2dK3x3S2 : public Deconv2dBase { +template +class DepthwiseDeconv2dK3x3S2 : public DepthwiseDeconv2dKMxN { public: explicit DepthwiseDeconv2dK3x3S2( const delegator::DepthwiseDeconv2dParam ¶m) - : Deconv2dBase(param) {} + : DepthwiseDeconv2dKMxN(param) {} virtual ~DepthwiseDeconv2dK3x3S2() {} - MaceStatus Compute( - const OpContext *context, - const Tensor *input, - const Tensor *filter, - const Tensor *output_shape, - Tensor *output) override; + MaceStatus DoCompute(const DepthwiseDeconvComputeParam &p, const T *filter, + const T *input_data, T *padded_out_data) override; }; -class GroupDeconv2dK3x3S1 : public Deconv2dBase { +template +class GroupDeconv2dK3x3S1 : public GroupDeconv2dKMxN { public: explicit GroupDeconv2dK3x3S1( const delegator::GroupDeconv2dParam ¶m) - : Deconv2dBase(param) {} + : GroupDeconv2dKMxN(param) {} virtual ~GroupDeconv2dK3x3S1() {} - MaceStatus Compute( - const OpContext *context, - const Tensor *input, - const Tensor *filter, - const Tensor *output_shape, - Tensor *output) override; + MaceStatus DoCompute(const GroupDeconvComputeParam &p, const T *filter, + const T *input_data, T *padded_out_data) override; }; -class GroupDeconv2dK3x3S2 : public Deconv2dBase { +template +class GroupDeconv2dK3x3S2 : public GroupDeconv2dKMxN { public: explicit GroupDeconv2dK3x3S2(const delegator::GroupDeconv2dParam ¶m) - : Deconv2dBase(param) {} + : GroupDeconv2dKMxN(param) {} virtual ~GroupDeconv2dK3x3S2() {} - MaceStatus Compute( - const OpContext *context, - const Tensor *input, - const Tensor *filter, - const Tensor *output_shape, - Tensor *output) override; + MaceStatus DoCompute(const GroupDeconvComputeParam &p, const T *filter, + const T *input_data, T *padded_out_data) override; }; -} // namespace fp32 } // namespace arm } // namespace ops } // namespace mace -#endif // MACE_OPS_ARM_FP32_DEPTHWISE_DECONV_2D_3X3_H_ +#endif // MACE_OPS_ARM_BASE_DEPTHWISE_DECONV_2D_3X3_H_ diff --git a/mace/ops/arm/base/depthwise_deconv_2d_4x4.cc b/mace/ops/arm/base/depthwise_deconv_2d_4x4.cc new file mode 100644 index 0000000000000000000000000000000000000000..c6d0605a4c82e34f4681b9f07b610dd1cd477e1b --- /dev/null +++ b/mace/ops/arm/base/depthwise_deconv_2d_4x4.cc @@ -0,0 +1,48 @@ +// Copyright 2020 The MACE Authors. 
All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "mace/ops/arm/base/depthwise_deconv_2d_4x4.h" + +namespace mace { +namespace ops { +namespace arm { + +void RegisterDepthwiseDeconv2dK4x4Delegator(OpDelegatorRegistry *registry) { + MACE_REGISTER_DELEGATOR( + registry, DepthwiseDeconv2dK4x4S1, + delegator::DepthwiseDeconv2dParam, + MACE_DELEGATOR_KEY_EX(DepthwiseDeconv2d, DeviceType::CPU, + float, ImplType::NEON, K4x4S1)); + MACE_REGISTER_DELEGATOR( + registry, DepthwiseDeconv2dK4x4S2, + delegator::DepthwiseDeconv2dParam, + MACE_DELEGATOR_KEY_EX(DepthwiseDeconv2d, DeviceType::CPU, + float, ImplType::NEON, K4x4S2)); +} + +void RegisterGroupDeconv2dK4x4Delegator(OpDelegatorRegistry *registry) { + MACE_REGISTER_DELEGATOR( + registry, GroupDeconv2dK4x4S1, delegator::GroupDeconv2dParam, + MACE_DELEGATOR_KEY_EX(GroupDeconv2d, DeviceType::CPU, + float, ImplType::NEON, K4x4S1)); + MACE_REGISTER_DELEGATOR( + registry, GroupDeconv2dK4x4S2, delegator::GroupDeconv2dParam, + MACE_DELEGATOR_KEY_EX(GroupDeconv2d, DeviceType::CPU, + float, ImplType::NEON, K4x4S2)); +} + +} // namespace arm +} // namespace ops +} // namespace mace + diff --git a/mace/ops/arm/fp32/depthwise_deconv_2d_4x4.h b/mace/ops/arm/base/depthwise_deconv_2d_4x4.h similarity index 51% rename from mace/ops/arm/fp32/depthwise_deconv_2d_4x4.h rename to mace/ops/arm/base/depthwise_deconv_2d_4x4.h index 31d5bd99ed5cfe287026f99ac89d3721c7fed8bb..c543b94af75910734595e92856649dc836228556 100644 --- a/mace/ops/arm/fp32/depthwise_deconv_2d_4x4.h +++ b/mace/ops/arm/base/depthwise_deconv_2d_4x4.h @@ -1,4 +1,4 @@ -// Copyright 2019 The MACE Authors. All Rights Reserved. +// Copyright 2020 The MACE Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -12,8 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. 
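
Each kernel family exposes its own Register*Delegator entry point; a build then aggregates them into a single registration call. A hypothetical umbrella for the deconv family (the wrapper name is an assumption; MACE assembles its own registration list, and the declarations are repeated here only to keep the sketch standalone):

class OpDelegatorRegistry;
void RegisterDeconv2dK2x2Delegator(OpDelegatorRegistry *registry);
void RegisterDeconv2dK3x3Delegator(OpDelegatorRegistry *registry);
void RegisterDeconv2dK4x4Delegator(OpDelegatorRegistry *registry);
void RegisterDeconv2dGeneralDelegator(OpDelegatorRegistry *registry);

// Hypothetical aggregator calling the entry points defined in this patch.
void RegisterAllDeconvDelegators(OpDelegatorRegistry *registry) {
  RegisterDeconv2dK2x2Delegator(registry);
  RegisterDeconv2dK3x3Delegator(registry);
  RegisterDeconv2dK4x4Delegator(registry);
  RegisterDeconv2dGeneralDelegator(registry);
}
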
-#ifndef MACE_OPS_ARM_FP32_DEPTHWISE_DECONV_2D_4X4_H_ -#define MACE_OPS_ARM_FP32_DEPTHWISE_DECONV_2D_4X4_H_ +#ifndef MACE_OPS_ARM_BASE_DEPTHWISE_DECONV_2D_4X4_H_ +#define MACE_OPS_ARM_BASE_DEPTHWISE_DECONV_2D_4X4_H_ #include #include @@ -21,7 +21,7 @@ #include "mace/core/ops/op_context.h" #include "mace/core/tensor.h" #include "mace/core/types.h" -#include "mace/ops/arm/fp32/deconv_2d.h" +#include "mace/ops/arm/base/depthwise_deconv_2d_mxn.h" #include "mace/ops/common/conv_pool_2d_util.h" #include "mace/ops/delegator/depthwise_deconv_2d.h" #include "mace/public/mace.h" @@ -29,69 +29,55 @@ namespace mace { namespace ops { namespace arm { -namespace fp32 { -class DepthwiseDeconv2dK4x4S1 : public Deconv2dBase { +template +class DepthwiseDeconv2dK4x4S1 : public DepthwiseDeconv2dKMxN { public: explicit DepthwiseDeconv2dK4x4S1( const delegator::DepthwiseDeconv2dParam ¶m) - : Deconv2dBase(param) {} + : DepthwiseDeconv2dKMxN(param) {} virtual ~DepthwiseDeconv2dK4x4S1() {} - MaceStatus Compute( - const OpContext *context, - const Tensor *input, - const Tensor *filter, - const Tensor *output_shape, - Tensor *output) override; + MaceStatus DoCompute(const DepthwiseDeconvComputeParam &p, const T *filter, + const T *input_data, T *padded_out_data) override; }; -class DepthwiseDeconv2dK4x4S2 : public Deconv2dBase { +template +class DepthwiseDeconv2dK4x4S2 : public DepthwiseDeconv2dKMxN { public: explicit DepthwiseDeconv2dK4x4S2( const delegator::DepthwiseDeconv2dParam ¶m) - : Deconv2dBase(param) {} + : DepthwiseDeconv2dKMxN(param) {} virtual ~DepthwiseDeconv2dK4x4S2() {} - MaceStatus Compute( - const OpContext *context, - const Tensor *input, - const Tensor *filter, - const Tensor *output_shape, - Tensor *output) override; + MaceStatus DoCompute(const DepthwiseDeconvComputeParam &p, const T *filter, + const T *input_data, T *padded_out_data) override; }; -class GroupDeconv2dK4x4S1 : public Deconv2dBase { +template +class GroupDeconv2dK4x4S1 : public GroupDeconv2dKMxN { public: explicit GroupDeconv2dK4x4S1(const delegator::GroupDeconv2dParam ¶m) - : Deconv2dBase(param) {} + : GroupDeconv2dKMxN(param) {} virtual ~GroupDeconv2dK4x4S1() {} - MaceStatus Compute( - const OpContext *context, - const Tensor *input, - const Tensor *filter, - const Tensor *output_shape, - Tensor *output) override; + MaceStatus DoCompute(const GroupDeconvComputeParam &p, const T *filter, + const T *input_data, T *padded_out_data) override; }; -class GroupDeconv2dK4x4S2 : public Deconv2dBase { +template +class GroupDeconv2dK4x4S2 : public GroupDeconv2dKMxN { public: explicit GroupDeconv2dK4x4S2(const delegator::GroupDeconv2dParam ¶m) - : Deconv2dBase(param) {} + : GroupDeconv2dKMxN(param) {} virtual ~GroupDeconv2dK4x4S2() {} - MaceStatus Compute( - const OpContext *context, - const Tensor *input, - const Tensor *filter, - const Tensor *output_shape, - Tensor *output) override; + MaceStatus DoCompute(const GroupDeconvComputeParam &p, const T *filter, + const T *input_data, T *padded_out_data) override; }; -} // namespace fp32 } // namespace arm } // namespace ops } // namespace mace -#endif // MACE_OPS_ARM_FP32_DEPTHWISE_DECONV_2D_4X4_H_ +#endif // MACE_OPS_ARM_BASE_DEPTHWISE_DECONV_2D_4X4_H_ diff --git a/mace/ops/arm/fp32/depthwise_deconv_2d_general.cc b/mace/ops/arm/base/depthwise_deconv_2d_general.cc similarity index 84% rename from mace/ops/arm/fp32/depthwise_deconv_2d_general.cc rename to mace/ops/arm/base/depthwise_deconv_2d_general.cc index 33d9cb01a377757358757576564d8131eb3c3e48..222706b56e8f5abb1a67ee820b4aae1d50bbd787 
100644 --- a/mace/ops/arm/fp32/depthwise_deconv_2d_general.cc +++ b/mace/ops/arm/base/depthwise_deconv_2d_general.cc @@ -1,4 +1,4 @@ -// Copyright 2019 The MACE Authors. All Rights Reserved. +// Copyright 2020 The MACE Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -12,18 +12,18 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "mace/ops/arm/fp32/depthwise_deconv_2d_general.h" +#include "mace/ops/arm/base/depthwise_deconv_2d_general.h" namespace mace { namespace ops { namespace arm { -namespace fp32 { -MaceStatus DepthwiseDeconv2dGeneral::Compute(const OpContext *context, - const Tensor *input, - const Tensor *filter, - const Tensor *output_shape, - Tensor *output) { +template +MaceStatus DepthwiseDeconv2dGeneral::Compute(const OpContext *context, + const Tensor *input, + const Tensor *filter, + const Tensor *output_shape, + Tensor *output) { std::unique_ptr padded_out; std::vector out_pad_size; group_ = input->dim(1); @@ -46,9 +46,9 @@ MaceStatus DepthwiseDeconv2dGeneral::Compute(const OpContext *context, Tensor::MappingGuard filter_mapper(filter); Tensor::MappingGuard output_mapper(output); - auto input_data = input->data(); - auto filter_data = filter->data(); - auto padded_out_data = out_tensor->mutable_data(); + const T *input_data = input->data(); + const T *filter_data = filter->data(); + T *padded_out_data = out_tensor->mutable_data(); auto &in_shape = input->shape(); auto &out_shape = out_tensor->shape(); @@ -79,7 +79,7 @@ MaceStatus DepthwiseDeconv2dGeneral::Compute(const OpContext *context, index_t start1, index_t end1, index_t step1) { for (index_t b = start0; b < end0; b += step0) { for (index_t c = start1; c < end1; c += step1) { - float *out_base = + T *out_base = padded_out_data + (b * channels + c) * out_img_size; for (index_t i = 0; i < in_height; ++i) { for (index_t j = 0; j < in_width; ++j) { @@ -105,11 +105,12 @@ MaceStatus DepthwiseDeconv2dGeneral::Compute(const OpContext *context, return MaceStatus::MACE_SUCCESS; } -MaceStatus GroupDeconv2dGeneral::Compute(const OpContext *context, - const Tensor *input, - const Tensor *filter, - const Tensor *output_shape, - Tensor *output) { +template +MaceStatus GroupDeconv2dGeneral::Compute(const OpContext *context, + const Tensor *input, + const Tensor *filter, + const Tensor *output_shape, + Tensor *output) { std::unique_ptr padded_out; std::vector out_pad_size; ResizeOutAndPadOut(context, @@ -131,9 +132,9 @@ MaceStatus GroupDeconv2dGeneral::Compute(const OpContext *context, Tensor::MappingGuard filter_mapper(filter); Tensor::MappingGuard output_mapper(output); - auto input_data = input->data(); - auto filter_data = filter->data(); - auto padded_out_data = out_tensor->mutable_data(); + const T *input_data = input->data(); + const T *filter_data = filter->data(); + T *padded_out_data = out_tensor->mutable_data(); auto &in_shape = input->shape(); auto &out_shape = out_tensor->shape(); @@ -209,19 +210,19 @@ MaceStatus GroupDeconv2dGeneral::Compute(const OpContext *context, void RegisterDepthwiseDeconv2dGeneralDelegator(OpDelegatorRegistry *registry) { MACE_REGISTER_DELEGATOR( - registry, DepthwiseDeconv2dGeneral, delegator::DepthwiseDeconv2dParam, + registry, DepthwiseDeconv2dGeneral, + delegator::DepthwiseDeconv2dParam, MACE_DELEGATOR_KEY(DepthwiseDeconv2d, DeviceType::CPU, float, ImplType::NEON)); } void 
RegisterGroupDeconv2dGeneralDelegator(OpDelegatorRegistry *registry) { MACE_REGISTER_DELEGATOR( - registry, GroupDeconv2dGeneral, delegator::GroupDeconv2dParam, + registry, GroupDeconv2dGeneral, delegator::GroupDeconv2dParam, MACE_DELEGATOR_KEY(GroupDeconv2d, DeviceType::CPU, float, ImplType::NEON)); } -} // namespace fp32 } // namespace arm } // namespace ops } // namespace mace diff --git a/mace/ops/arm/fp32/depthwise_deconv_2d_general.h b/mace/ops/arm/base/depthwise_deconv_2d_general.h similarity index 80% rename from mace/ops/arm/fp32/depthwise_deconv_2d_general.h rename to mace/ops/arm/base/depthwise_deconv_2d_general.h index 924924498301592de6dd1c9af6473eb61d289407..3fa6d4543a0b4432e4c88a5aa1c5df5644e89505 100644 --- a/mace/ops/arm/fp32/depthwise_deconv_2d_general.h +++ b/mace/ops/arm/base/depthwise_deconv_2d_general.h @@ -1,4 +1,4 @@ -// Copyright 2019 The MACE Authors. All Rights Reserved. +// Copyright 2020 The MACE Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -12,8 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -#ifndef MACE_OPS_ARM_FP32_DEPTHWISE_DECONV_2D_GENERAL_H_ -#define MACE_OPS_ARM_FP32_DEPTHWISE_DECONV_2D_GENERAL_H_ +#ifndef MACE_OPS_ARM_BASE_DEPTHWISE_DECONV_2D_GENERAL_H_ +#define MACE_OPS_ARM_BASE_DEPTHWISE_DECONV_2D_GENERAL_H_ #include #include @@ -21,7 +21,7 @@ #include "mace/core/ops/op_context.h" #include "mace/core/tensor.h" #include "mace/core/types.h" -#include "mace/ops/arm/fp32/deconv_2d.h" +#include "mace/ops/arm/base/deconv_2d.h" #include "mace/ops/common/conv_pool_2d_util.h" #include "mace/ops/delegator/depthwise_deconv_2d.h" #include "mace/public/mace.h" @@ -29,13 +29,13 @@ namespace mace { namespace ops { namespace arm { -namespace fp32 { +template class DepthwiseDeconv2dGeneral : public Deconv2dBase { public: explicit DepthwiseDeconv2dGeneral( const delegator::DepthwiseDeconv2dParam ¶m) - : Deconv2dBase(param) {} + : Deconv2dBase(param, sizeof(T)) {} virtual ~DepthwiseDeconv2dGeneral() {} MaceStatus Compute( @@ -46,10 +46,11 @@ class DepthwiseDeconv2dGeneral : public Deconv2dBase { Tensor *output) override; }; +template class GroupDeconv2dGeneral : public Deconv2dBase { public: explicit GroupDeconv2dGeneral(const delegator::GroupDeconv2dParam ¶m) - : Deconv2dBase(param) {} + : Deconv2dBase(param, sizeof(T)) {} virtual ~GroupDeconv2dGeneral() {} MaceStatus Compute( @@ -60,9 +61,8 @@ class GroupDeconv2dGeneral : public Deconv2dBase { Tensor *output) override; }; -} // namespace fp32 } // namespace arm } // namespace ops } // namespace mace -#endif // MACE_OPS_ARM_FP32_DEPTHWISE_DECONV_2D_GENERAL_H_ +#endif // MACE_OPS_ARM_BASE_DEPTHWISE_DECONV_2D_GENERAL_H_ diff --git a/mace/ops/arm/base/depthwise_deconv_2d_mxn.h b/mace/ops/arm/base/depthwise_deconv_2d_mxn.h new file mode 100644 index 0000000000000000000000000000000000000000..416551c88c9845737846706806d8cd5b5b176533 --- /dev/null +++ b/mace/ops/arm/base/depthwise_deconv_2d_mxn.h @@ -0,0 +1,136 @@ +// Copyright 2020 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef MACE_OPS_ARM_BASE_DEPTHWISE_DECONV_2D_MXN_H_ +#define MACE_OPS_ARM_BASE_DEPTHWISE_DECONV_2D_MXN_H_ + +#include +#include + +#include "mace/core/ops/op_context.h" +#include "mace/core/tensor.h" +#include "mace/core/types.h" +#include "mace/ops/arm/base/deconv_2d.h" +#include "mace/ops/common/conv_pool_2d_util.h" +#include "mace/ops/delegator/depthwise_deconv_2d.h" +#include "mace/public/mace.h" + +namespace mace { +namespace ops { +namespace arm { + +template +class DepthwiseDeconv2dKMxN : public Deconv2dBase { + public: + explicit DepthwiseDeconv2dKMxN( + const delegator::DepthwiseDeconv2dParam ¶m) + : Deconv2dBase(param, sizeof(T)) {} + virtual ~DepthwiseDeconv2dKMxN() {} + + MaceStatus Compute( + const OpContext *context, const Tensor *input, const Tensor *filter, + const Tensor *output_shape, Tensor *output) override { + std::unique_ptr padded_out; + std::vector out_pad_size; + group_ = input->dim(1); + ResizeOutAndPadOut(context, + input, + filter, + output_shape, + output, + &out_pad_size, + &padded_out); + + Tensor *out_tensor = output; + if (padded_out != nullptr) { + out_tensor = padded_out.get(); + } + + out_tensor->Clear(); + + Tensor::MappingGuard input_mapper(input); + Tensor::MappingGuard filter_mapper(filter); + Tensor::MappingGuard output_mapper(output); + + const T *input_data = input->data(); + const T *filter_data = filter->data(); + T *padded_out_data = out_tensor->mutable_data(); + + DepthwiseDeconvComputeParam p = + PreWorkAndGetDepthwiseDeconvParam(context, input, out_tensor); + DoCompute(p, filter_data, input_data, padded_out_data); + UnPadOutput(*out_tensor, out_pad_size, output); + + return MaceStatus::MACE_SUCCESS; + } + + virtual MaceStatus DoCompute( + const DepthwiseDeconvComputeParam &p, const T *filter, + const T *input_data, T *padded_out_data) = 0; +}; + +template +class GroupDeconv2dKMxN : public Deconv2dBase { + public: + explicit GroupDeconv2dKMxN( + const delegator::DepthwiseDeconv2dParam ¶m) + : Deconv2dBase(param, sizeof(T)) {} + virtual ~GroupDeconv2dKMxN() {} + + MaceStatus Compute( + const OpContext *context, const Tensor *input, const Tensor *filter, + const Tensor *output_shape, Tensor *output) override { + std::unique_ptr padded_out; + std::vector out_pad_size; + ResizeOutAndPadOut(context, + input, + filter, + output_shape, + output, + &out_pad_size, + &padded_out); + + Tensor *out_tensor = output; + if (padded_out != nullptr) { + out_tensor = padded_out.get(); + } + + out_tensor->Clear(); + + Tensor::MappingGuard input_mapper(input); + Tensor::MappingGuard filter_mapper(filter); + Tensor::MappingGuard output_mapper(output); + + auto input_data = input->data(); + auto filter_data = filter->data(); + auto padded_out_data = out_tensor->mutable_data(); + + GroupDeconvComputeParam p = + PreWorkAndGetGroupDeconvParam(context, input, out_tensor); + DoCompute(p, filter_data, input_data, padded_out_data); + UnPadOutput(*out_tensor, out_pad_size, output); + + return MaceStatus::MACE_SUCCESS; + } + + virtual MaceStatus DoCompute( + const GroupDeconvComputeParam &p, const T *filter, + const T *input_data, T *padded_out_data) = 
0; +}; + +} // namespace arm +} // namespace ops +} // namespace mace + +#endif // MACE_OPS_ARM_BASE_DEPTHWISE_DECONV_2D_MXN_H_ diff --git a/mace/ops/arm/base/gemm.cc b/mace/ops/arm/base/gemm.cc new file mode 100644 index 0000000000000000000000000000000000000000..437f767e6956644473ce865f910841a1df9ccb9f --- /dev/null +++ b/mace/ops/arm/base/gemm.cc @@ -0,0 +1,29 @@ +// Copyright 2020 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "mace/ops/arm/base/gemm.h" + +namespace mace { +namespace ops { +namespace arm { + +void RegisterGemmDelegator(OpDelegatorRegistry *registry) { + MACE_REGISTER_DELEGATOR( + registry, Gemm, delegator::GemmParam, + MACE_DELEGATOR_KEY(Gemm, DeviceType::CPU, float, ImplType::NEON)); +} + +} // namespace arm +} // namespace ops +} // namespace mace diff --git a/mace/ops/arm/fp32/gemm.h b/mace/ops/arm/base/gemm.h similarity index 65% rename from mace/ops/arm/fp32/gemm.h rename to mace/ops/arm/base/gemm.h index 4910ae358347bf94eef076e63934f9365aa1ef79..b2320a71d95842c96fd562413f116516bd0c0c87 100644 --- a/mace/ops/arm/fp32/gemm.h +++ b/mace/ops/arm/base/gemm.h @@ -1,4 +1,4 @@ -// Copyright 2019 The MACE Authors. All Rights Reserved. +// Copyright 2020 The MACE Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -12,8 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -#ifndef MACE_OPS_ARM_FP32_GEMM_H_ -#define MACE_OPS_ARM_FP32_GEMM_H_ +#ifndef MACE_OPS_ARM_BASE_GEMM_H_ +#define MACE_OPS_ARM_BASE_GEMM_H_ #include "mace/core/ops/op_context.h" #include "mace/core/tensor.h" @@ -28,8 +28,10 @@ namespace mace { namespace ops { namespace arm { -namespace fp32 { +enum { kNoCache, kCacheLhs, kCacheRhs }; + +template class Gemm : public delegator::Gemm { public: explicit Gemm(const delegator::GemmParam ¶m) @@ -68,26 +70,49 @@ class Gemm : public delegator::Gemm { const bool transpose_out, const bool lhs_batched, const bool rhs_batched, - Tensor *output) override; + Tensor *output) override { + index_t rows = transpose_lhs ? lhs_cols : lhs_rows; + index_t depth = transpose_lhs ? lhs_rows : lhs_cols; + index_t cols = transpose_rhs ? rhs_rows : rhs_cols; + index_t depth2 = transpose_rhs ? rhs_cols : rhs_rows; + MACE_CHECK(depth == depth2, + "Matrices that multiply have inconsistent depth dim: ", + depth, + " vs. ", + depth2); + + return Compute(context, + lhs, + rhs, + batch, + rows, + cols, + depth, + transpose_lhs ? ColMajor : RowMajor, + transpose_rhs ? ColMajor : RowMajor, + transpose_out ? 
ColMajor : RowMajor, + lhs_batched, + rhs_batched, + output); + } - private: - void ComputeBlock(const float *packed_lhs_data, - const float *packed_rhs_data, + protected: + void ComputeBlock(const T *packed_lhs_data, + const T *packed_rhs_data, const index_t depth_padded, - float *packed_output_data); - - void PackLhs(const MatrixMap &lhs, - float *packed_lhs); + T *packed_output_data); - void PackRhs(const MatrixMap &rhs, - float *packed_rhs); + void PackLhs(const MatrixMap &lhs, + T *packed_lhs); - void UnpackOutput(const float *packed_output, - MatrixMap *output); + void PackRhs(const MatrixMap &rhs, + T *packed_rhs); + void UnpackOutput(const T *packed_output, + MatrixMap *output); template - void Unpack(const float *packed_output, - MatrixMap *output) { + void Unpack(const T *packed_output, + MatrixMap *output) { const index_t rows = output->rows(); const index_t cols = output->cols(); for (index_t r = 0; r < rows; ++r) { @@ -98,9 +123,9 @@ class Gemm : public delegator::Gemm { } template - void Pack(const MatrixMap &matrix, + void Pack(const MatrixMap &matrix, MatrixMajor dst_major, - float *packed_matrix) { + T *packed_matrix) { const index_t rows = matrix.rows(); const index_t cols = matrix.cols(); index_t depth = cols; @@ -109,7 +134,7 @@ class Gemm : public delegator::Gemm { depth = rows; } const index_t depth_padded = RoundUp(depth, static_cast(4)); - memset(packed_matrix, 0, sizeof(float) * WidthBlockSize * depth_padded); + memset(packed_matrix, 0, sizeof(T) * WidthBlockSize * depth_padded); if (dst_major == ColMajor) { for (index_t c = 0; c < cols; ++c) { for (index_t r = 0; r < rows; ++r) { @@ -125,31 +150,14 @@ class Gemm : public delegator::Gemm { } } + private: Buffer pack_cache_; - bool should_cache_pack_; int cached_; }; -template<> -void Gemm::Pack<4, 4>(const MatrixMap &matrix, - MatrixMajor dst_major, - float *packed_matrix); - -template<> -void Gemm::Pack<8, 4>(const MatrixMap &matrix, - MatrixMajor dst_major, - float *packed_matrix); - -template<> -void Gemm::Unpack<4, 8>(const float *packed_output, MatrixMap *output); - -template<> -void Gemm::Unpack<8, 8>(const float *packed_output, MatrixMap *output); - -} // namespace fp32 } // namespace arm } // namespace ops } // namespace mace -#endif // MACE_OPS_ARM_FP32_GEMM_H_ +#endif // MACE_OPS_ARM_BASE_GEMM_H_ diff --git a/mace/ops/arm/base/gemv.cc b/mace/ops/arm/base/gemv.cc new file mode 100644 index 0000000000000000000000000000000000000000..eb62314a6774906cd884175d8e32afe1f75f4438 --- /dev/null +++ b/mace/ops/arm/base/gemv.cc @@ -0,0 +1,30 @@ +// Copyright 2020 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
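
Gemm::Pack above zero-fills a WidthBlockSize x depth_padded panel, with depth rounded up to a multiple of 4, before copying the matrix tile in, so the NEON microkernels can consume fixed 4-deep slices without edge checks. The same idea as a freestanding sketch, simplified to one panel with a column-major destination and row-major source (an assumption for illustration):

#include <cstring>

void PackPanelColMajor(const float *src, int rows, int depth,
                       int depth_padded, float *packed) {
  // packed holds rows * depth_padded floats; the zeroed tail pads depth to 4.
  std::memset(packed, 0, sizeof(float) * rows * depth_padded);
  for (int d = 0; d < depth; ++d)
    for (int r = 0; r < rows; ++r)
      packed[d * rows + r] = src[r * depth + d];
}
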
+ + +#include "mace/ops/arm/base/gemv.h" + +namespace mace { +namespace ops { +namespace arm { + +void RegisterGemvDelegator(OpDelegatorRegistry *registry) { + MACE_REGISTER_DELEGATOR( + registry, Gemv, DelegatorParam, + MACE_DELEGATOR_KEY(Gemv, DeviceType::CPU, float, ImplType::NEON)); +} + +} // namespace arm +} // namespace ops +} // namespace mace diff --git a/mace/ops/arm/fp32/gemv.h b/mace/ops/arm/base/gemv.h similarity index 86% rename from mace/ops/arm/fp32/gemv.h rename to mace/ops/arm/base/gemv.h index 9933cf42b817e20945517588a87dfca2232e7411..b3cbf19ec4e980903114ebb254290f3ab044cad0 100644 --- a/mace/ops/arm/fp32/gemv.h +++ b/mace/ops/arm/base/gemv.h @@ -1,4 +1,4 @@ -// Copyright 2019 The MACE Authors. All Rights Reserved. +// Copyright 2020 The MACE Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -12,8 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -#ifndef MACE_OPS_ARM_FP32_GEMV_H_ -#define MACE_OPS_ARM_FP32_GEMV_H_ +#ifndef MACE_OPS_ARM_BASE_GEMV_H_ +#define MACE_OPS_ARM_BASE_GEMV_H_ #include "mace/core/ops/op_context.h" #include "mace/core/tensor.h" @@ -23,8 +23,8 @@ namespace mace { namespace ops { namespace arm { -namespace fp32 { +template class Gemv : public delegator::Gemv { public: explicit Gemv(const DelegatorParam ¶m) : delegator::Gemv(param) {} @@ -43,9 +43,8 @@ class Gemv : public delegator::Gemv { Tensor *output) override; }; -} // namespace fp32 } // namespace arm } // namespace ops } // namespace mace -#endif // MACE_OPS_ARM_FP32_GEMV_H_ +#endif // MACE_OPS_ARM_BASE_GEMV_H_ diff --git a/mace/ops/arm/fp32/activation.cc b/mace/ops/arm/fp32/activation.cc index 5d8d6984bd04fe7ae1ea9626e409388475505fbb..add68ad01e3b0ea93fcce29ba05768ee3d696ae7 100644 --- a/mace/ops/arm/fp32/activation.cc +++ b/mace/ops/arm/fp32/activation.cc @@ -12,186 +12,139 @@ // See the License for the specific language governing permissions and // limitations under the License. 
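The gemv.h rename above shows the pattern this whole patch applies: the class template moves to arm/base with the type-agnostic interface, the arm/fp32 translation units keep only float specializations of the hot loops, and the registry still registers just the float/NEON instantiation. A condensed sketch of the split, using a hypothetical Scale delegator rather than any class in this patch:

#include <cstdint>

// arm/base/scale.h -- type-agnostic template; control flow only.
template <typename T>
class Scale {
 public:
  void Compute(const T *in, std::int64_t n, T *out) {
    ScaleImpl(in, n, out);  // resolved by a per-type specialization
  }

 private:
  void ScaleImpl(const T *in, std::int64_t n, T *out);
};

// arm/fp32/scale.cc -- body provided only for float; other instantiations
// would fail to link, which matches registering only the float/NEON key.
template <>
void Scale<float>::ScaleImpl(const float *in, std::int64_t n, float *out) {
  for (std::int64_t i = 0; i < n; ++i) out[i] = in[i];  // stand-in for the NEON loop
}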
-#include "mace/ops/delegator/activation.h" - #include #include +#include "mace/ops/arm/base/activation.h" + namespace mace { namespace ops { namespace arm { -namespace fp32 { - -class Activation : public delegator::Activation { - public: - explicit Activation(const delegator::ActivationParam ¶m) - : delegator::Activation(param) {} - ~Activation() = default; - - MaceStatus Compute(const OpContext *context, - const Tensor *input, Tensor *output) override; - - private: - void DoActivation(const OpContext *context, - const Tensor *input, Tensor *output); -}; - -MaceStatus Activation::Compute(const OpContext *context, - const Tensor *input, Tensor *output) { - Tensor::MappingGuard input_guard(input); - if (input != output) { - MACE_RETURN_IF_ERROR(output->ResizeLike(input)); - Tensor::MappingGuard output_guard(output); - DoActivation(context, input, output); - } else { - DoActivation(context, input, output); + +template<> +void Activation::ActivateRelu(utils::ThreadPool *thread_pool, + const float *input_data, + const index_t input_size, + float *output_data) { + const float32x4_t vzero = vdupq_n_f32(0.f); + const index_t block_count = input_size / 4; + + thread_pool->Compute1D( + [=](index_t start, index_t end, index_t step) { + auto input_ptr = input_data + start * 4; + auto output_ptr = output_data + start * 4; + + for (index_t i = start; i < end; i += step) { + float32x4_t v = vld1q_f32(input_ptr); + v = vmaxq_f32(v, vzero); + vst1q_f32(output_ptr, v); + + input_ptr += 4; + output_ptr += 4; + } + }, + 0, block_count, 1); + + // remain + for (index_t i = block_count * 4; i < input_size; ++i) { + output_data[i] = std::max(0.f, input_data[i]); } +} - return MaceStatus::MACE_SUCCESS; +template<> +void Activation::ActivateRelux(utils::ThreadPool *thread_pool, + const float *input_data, + const index_t input_size, + float *output_data) { + const float32x4_t vzero = vdupq_n_f32(0.f); + const float32x4_t vlimit = vdupq_n_f32(limit_); + const index_t block_count = input_size / 4; + + thread_pool->Compute1D( + [=](index_t start, index_t end, index_t step) { + auto input_ptr = input_data + start * 4; + auto output_ptr = output_data + start * 4; + + for (index_t i = start; i < end; i += step) { + float32x4_t v = vld1q_f32(input_ptr); + v = vmaxq_f32(v, vzero); + v = vminq_f32(v, vlimit); + vst1q_f32(output_ptr, v); + + input_ptr += 4; + output_ptr += 4; + } + }, + 0, block_count, 1); + + // remain + for (index_t i = block_count * 4; i < input_size; ++i) { + output_data[i] = std::max(0.f, std::min(limit_, input_data[i])); + } } -void Activation::DoActivation(const OpContext *context, - const Tensor *input, - Tensor *output) { - auto input_data = input->data(); - auto output_data = output->mutable_data(); - const index_t size = input->size(); - - utils::ThreadPool &thread_pool = - context->device()->cpu_runtime()->thread_pool(); - - switch (type_) { - case RELU: { - const float32x4_t vzero = vdupq_n_f32(0.f); - const index_t block_count = size / 4; - - thread_pool.Compute1D( - [=](index_t start, index_t end, index_t step) { - auto input_ptr = input_data + start * 4; - auto output_ptr = output_data + start * 4; - - for (index_t i = start; i < end; i += step) { - float32x4_t v = vld1q_f32(input_ptr); - v = vmaxq_f32(v, vzero); - vst1q_f32(output_ptr, v); - - input_ptr += 4; - output_ptr += 4; - } - }, - 0, block_count, 1); - - // remain - for (index_t i = block_count * 4; i < size; ++i) { - output_data[i] = std::max(0.f, input_data[i]); - } - - break; - } - - case RELUX: { - const float32x4_t vzero = 
vdupq_n_f32(0.f); - const float32x4_t vlimit = vdupq_n_f32(limit_); - const index_t block_count = size / 4; - - thread_pool.Compute1D( - [=](index_t start, index_t end, index_t step) { - auto input_ptr = input_data + start * 4; - auto output_ptr = output_data + start * 4; - - for (index_t i = start; i < end; i += step) { - float32x4_t v = vld1q_f32(input_ptr); - v = vmaxq_f32(v, vzero); - v = vminq_f32(v, vlimit); - vst1q_f32(output_ptr, v); - - input_ptr += 4; - output_ptr += 4; - } - }, - 0, block_count, 1); - - // remain - for (index_t i = block_count * 4; i < size; ++i) { - output_data[i] = std::max(0.f, std::min(limit_, input_data[i])); - } - - break; - } - - case LEAKYRELU: { - const float32x4_t vzero = vdupq_n_f32(0.f); - const float32x4_t valpha = vdupq_n_f32(leakyrelu_coefficient_); - const index_t block_count = size / 4; - - thread_pool.Compute1D( - [=](index_t start, index_t end, index_t step) { - auto input_ptr = input_data + start * 4; - auto output_ptr = output_data + start * 4; - - for (index_t i = start; i < end; i += step) { - float32x4_t v = vld1q_f32(input_ptr); - float32x4_t u = vminq_f32(v, vzero); - v = vmaxq_f32(v, vzero); - v = vmlaq_f32(v, valpha, u); - vst1q_f32(output_ptr, v); - - input_ptr += 4; - output_ptr += 4; - } - }, - 0, block_count, 1); - - // remain - for (index_t i = block_count * 4; i < size; ++i) { - output_data[i] = std::max(input_data[i], 0.f) + - std::min(input_data[i], 0.f) * leakyrelu_coefficient_; - } - - break; - } - - case TANH: { - thread_pool.Compute1D( - [=](index_t start, index_t end, index_t step) { - for (index_t i = start; i < end; i += step) { - output_data[i] = std::tanh(input_data[i]); - } - }, - 0, size, 1); - - break; - } - - case SIGMOID: { - thread_pool.Compute1D( - [=](index_t start, index_t end, index_t step) { - for (index_t i = start; i < end; i += step) { - output_data[i] = 1 / (1 + std::exp(-(input_data[i]))); - } - }, - 0, size, 1); - - break; - } - - case NOOP: { - break; - } - - default: { - MACE_NOT_IMPLEMENTED; - } +template<> +void Activation::ActivateLeakyRelu(utils::ThreadPool *thread_pool, + const float *input_data, + const index_t input_size, + float *output_data) { + const float32x4_t vzero = vdupq_n_f32(0.f); + const float32x4_t valpha = vdupq_n_f32(leakyrelu_coefficient_); + const index_t block_count = input_size / 4; + + thread_pool->Compute1D( + [=](index_t start, index_t end, index_t step) { + auto input_ptr = input_data + start * 4; + auto output_ptr = output_data + start * 4; + + for (index_t i = start; i < end; i += step) { + float32x4_t v = vld1q_f32(input_ptr); + float32x4_t u = vminq_f32(v, vzero); + v = vmaxq_f32(v, vzero); + v = vmlaq_f32(v, valpha, u); + vst1q_f32(output_ptr, v); + + input_ptr += 4; + output_ptr += 4; + } + }, + 0, block_count, 1); + + // remain + for (index_t i = block_count * 4; i < input_size; ++i) { + output_data[i] = std::max(input_data[i], 0.f) + + std::min(input_data[i], 0.f) * leakyrelu_coefficient_; } } -void RegisterActivationDelegator(OpDelegatorRegistry *registry) { - MACE_REGISTER_DELEGATOR( - registry, Activation, delegator::ActivationParam, - MACE_DELEGATOR_KEY(Activation, DeviceType::CPU, float, ImplType::NEON)); +template<> +void Activation::ActivateTanh(utils::ThreadPool *thread_pool, + const float *input_data, + const index_t input_size, + float *output_data) { + thread_pool->Compute1D( + [=](index_t start, index_t end, index_t step) { + for (index_t i = start; i < end; i += step) { + output_data[i] = std::tanh(input_data[i]); + } + }, + 0, input_size, 1); +} + 
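Every float specialization above follows the same shape: split the tensor into size / 4 four-lane blocks, hand the block range to Compute1D for threading, then sweep the fewer-than-four leftover elements with scalar code. LeakyRelu additionally uses the branch-free identity f(x) = max(x, 0) + alpha * min(x, 0), which maps directly onto vmaxq/vminq/vmlaq. A single-threaded sketch of that block-plus-remainder structure (hypothetical free function, no thread pool):

#include <algorithm>
#include <arm_neon.h>
#include <cstdint>

void LeakyReluNeon(const float *in, std::int64_t size, float alpha,
                   float *out) {
  const float32x4_t vzero = vdupq_n_f32(0.f);
  const float32x4_t valpha = vdupq_n_f32(alpha);
  const std::int64_t block_count = size / 4;

  for (std::int64_t i = 0; i < block_count; ++i) {
    float32x4_t v = vld1q_f32(in + i * 4);
    float32x4_t u = vminq_f32(v, vzero);  // negative part
    v = vmaxq_f32(v, vzero);              // positive part
    v = vmlaq_f32(v, valpha, u);          // v += alpha * u
    vst1q_f32(out + i * 4, v);
  }
  // remainder, exactly as in the specializations above
  for (std::int64_t i = block_count * 4; i < size; ++i) {
    out[i] = std::max(in[i], 0.f) + alpha * std::min(in[i], 0.f);
  }
}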
+template<> +void Activation::ActivateSigmoid(utils::ThreadPool *thread_pool, + const float *input_data, + const index_t input_size, + float *output_data) { + thread_pool->Compute1D( + [=](index_t start, index_t end, index_t step) { + for (index_t i = start; i < end; i += step) { + output_data[i] = 1 / (1 + std::exp(-(input_data[i]))); + } + }, + 0, input_size, 1); } -} // namespace fp32 } // namespace arm } // namespace ops } // namespace mace diff --git a/mace/ops/arm/fp32/bias_add.cc b/mace/ops/arm/fp32/bias_add.cc index 7edafec327692d736cc66ec22e82808031819e05..042d306d8475ca850ee61cdc0d14185038543ecb 100644 --- a/mace/ops/arm/fp32/bias_add.cc +++ b/mace/ops/arm/fp32/bias_add.cc @@ -13,129 +13,81 @@ // limitations under the License. #include -#include "mace/ops/delegator/bias_add.h" + +#include "mace/ops/arm/base/bias_add.h" namespace mace { namespace ops { namespace arm { -namespace fp32 { - -class BiasAdd : public delegator::BiasAdd { - public: - explicit BiasAdd(const DelegatorParam ¶m) : delegator::BiasAdd(param) {} - ~BiasAdd() = default; - - MaceStatus Compute(const OpContext *context, const Tensor *input, - const Tensor *bias, Tensor *output) override; - private: - void AddBias(const OpContext *context, const Tensor *input, - const Tensor *bias, Tensor *output); -}; - -MaceStatus BiasAdd::Compute(const OpContext *context, - const Tensor *input, - const Tensor *bias, - Tensor *output) { - Tensor::MappingGuard input_guard(input); - Tensor::MappingGuard bias_guard(bias); - if (input != output) { - MACE_RETURN_IF_ERROR(output->ResizeLike(input)); - if (bias == nullptr) { - output->Copy(*input); - } else { - Tensor::MappingGuard output_guard(output); - AddBias(context, input, bias, output); - } - } else { - if (bias != nullptr) { - AddBias(context, input, bias, output); +template<> +void BiasAdd::Add1DimBias( + utils::ThreadPool *thread_pool, const float *input_data, + const float *bias_data, float *output_data, const index_t batch, + const index_t channels, const index_t image_size) { + const index_t block_count = image_size / 4; + const index_t remain = image_size % 4; + thread_pool->Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { + for (index_t b = start0; b < end0; b += step0) { + const index_t b_offset = b * channels; + for (index_t c = start1; c < end1; c += step1) { + const index_t offset = (b_offset + c) * image_size; + auto input_ptr = input_data + offset; + auto output_ptr = output_data + offset; + const float bias = bias_data[c]; + float32x4_t vbias = vdupq_n_f32(bias); + + for (index_t i = 0; i < block_count; ++i) { + float32x4_t v = vld1q_f32(input_ptr); + v = vaddq_f32(v, vbias); + vst1q_f32(output_ptr, v); + + input_ptr += 4; + output_ptr += 4; + } + for (index_t i = 0; i < remain; ++i) { + (*output_ptr++) = (*input_ptr++) + bias; + } + } } - } - - return MaceStatus::MACE_SUCCESS; + }, 0, batch, 1, 0, channels, 1); } -void BiasAdd::AddBias(const OpContext *context, - const Tensor *input, - const Tensor *bias, - mace::Tensor *output) { - auto input_data = input->data(); - auto bias_data = bias->data(); - auto output_data = output->mutable_data(); - - const index_t batch = input->dim(0); - const index_t channels = input->dim(1); - const index_t height = output->dim(2); - const index_t width = output->dim(3); - const index_t image_size = height * width; +template<> +void BiasAdd::Add2DimsBias( + utils::ThreadPool *thread_pool, const float *input_data, + const float *bias_data, float *output_data, const index_t 
batch, + const index_t channels, const index_t image_size) { const index_t block_count = image_size / 4; const index_t remain = image_size % 4; - - utils::ThreadPool - &thread_pool = context->device()->cpu_runtime()->thread_pool(); - if (bias->dim_size() == 1) { - thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, - index_t start1, index_t end1, index_t step1) { - for (index_t b = start0; b < end0; b += step0) { - const index_t b_offset = b * channels; - for (index_t c = start1; c < end1; c += step1) { - const index_t offset = (b_offset + c) * image_size; - auto input_ptr = input_data + offset; - auto output_ptr = output_data + offset; - const float bias = bias_data[c]; - float32x4_t vbias = vdupq_n_f32(bias); - - for (index_t i = 0; i < block_count; ++i) { - float32x4_t v = vld1q_f32(input_ptr); - v = vaddq_f32(v, vbias); - vst1q_f32(output_ptr, v); - - input_ptr += 4; - output_ptr += 4; - } - for (index_t i = 0; i < remain; ++i) { - (*output_ptr++) = (*input_ptr++) + bias; - } + thread_pool->Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { + for (index_t b = start0; b < end0; b += step0) { + const index_t b_offset = b * channels; + for (index_t c = start1; c < end1; c += step1) { + const index_t offset = (b_offset + c) * image_size; + auto input_ptr = input_data + offset; + auto output_ptr = output_data + offset; + const float bias = bias_data[b * channels + c]; + float32x4_t vbias = vdupq_n_f32(bias); + + for (index_t i = 0; i < block_count; ++i) { + float32x4_t v = vld1q_f32(input_ptr); + v = vaddq_f32(v, vbias); + vst1q_f32(output_ptr, v); + + input_ptr += 4; + output_ptr += 4; } - } - }, 0, batch, 1, 0, channels, 1); - } else { - thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, - index_t start1, index_t end1, index_t step1) { - for (index_t b = start0; b < end0; b += step0) { - const index_t b_offset = b * channels; - for (index_t c = start1; c < end1; c += step1) { - const index_t offset = (b_offset + c) * image_size; - auto input_ptr = input_data + offset; - auto output_ptr = output_data + offset; - const float bias = bias_data[b * channels + c]; - float32x4_t vbias = vdupq_n_f32(bias); - - for (index_t i = 0; i < block_count; ++i) { - float32x4_t v = vld1q_f32(input_ptr); - v = vaddq_f32(v, vbias); - vst1q_f32(output_ptr, v); - - input_ptr += 4; - output_ptr += 4; - } - for (index_t i = 0; i < remain; ++i) { - (*output_ptr++) = (*input_ptr++) + bias; - } + for (index_t i = 0; i < remain; ++i) { + (*output_ptr++) = (*input_ptr++) + bias; } } - }, 0, batch, 1, 0, channels, 1); - } -} - -void RegisterBiasAddDelegator(OpDelegatorRegistry *registry) { - MACE_REGISTER_DELEGATOR( - registry, BiasAdd, DelegatorParam, - MACE_DELEGATOR_KEY(BiasAdd, DeviceType::CPU, float, ImplType::NEON)); + } + }, 0, batch, 1, 0, channels, 1); } -} // namespace fp32 } // namespace arm } // namespace ops } // namespace mace diff --git a/mace/ops/arm/fp32/common_neon.h b/mace/ops/arm/fp32/common_neon.h index d4e61add21872e991c1947307f733ac404136738..502ffc393c0601259ee60bd4a7e0b8bcae4e73b2 100644 --- a/mace/ops/arm/fp32/common_neon.h +++ b/mace/ops/arm/fp32/common_neon.h @@ -21,7 +21,6 @@ namespace mace { namespace ops { namespace arm { -namespace fp32 { inline float32x4_t neon_vfma_lane_0(float32x4_t a, float32x4_t b, @@ -63,7 +62,6 @@ inline float32x4_t neon_vfma_lane_3(float32x4_t a, #endif } -} // namespace fp32 } // namespace arm } // namespace ops } // namespace mace diff --git 
a/mace/ops/arm/fp32/conv_2d.h b/mace/ops/arm/fp32/conv_2d.h deleted file mode 100644 index a143f5f84c2092c614d60576e27e26ec69d7e3a3..0000000000000000000000000000000000000000 --- a/mace/ops/arm/fp32/conv_2d.h +++ /dev/null @@ -1,82 +0,0 @@ -// Copyright 2019 The MACE Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef MACE_OPS_ARM_FP32_CONV_2D_H_ -#define MACE_OPS_ARM_FP32_CONV_2D_H_ - -#include -#include - -#include "mace/core/ops/op_context.h" -#include "mace/core/tensor.h" -#include "mace/ops/delegator/conv_2d.h" -#include "mace/ops/arm/fp32/gemm.h" -#include "mace/ops/common/conv_pool_2d_util.h" -#include "mace/public/mace.h" - -namespace mace { -namespace ops { -namespace arm { -namespace fp32 { - -class Conv2dBase : public delegator::Conv2d { - public: - explicit Conv2dBase(const delegator::Conv2dParam ¶m) - : delegator::Conv2d(param) {} - - virtual ~Conv2dBase() = default; - - protected: - void CalOutputShapeAndInputPadSize(const std::vector &input_shape, - const std::vector &filter_shape, - std::vector *output_shape, - std::vector *in_pad_size); - - void CalOutputBoundaryWithoutUsingInputPad(const std::vector - &output_shape, - const std::vector - in_pad_size, - std::vector - *out_bound); - - void CalOutputShapeAndPadSize(const Tensor *input, - const Tensor *filter, - const int out_tile_height, - const int out_tile_width, - std::vector *output_shape, - std::vector *in_pad_size, - std::vector *out_pad_size); - - MaceStatus ResizeOutAndPadInOut(const OpContext *context, - const Tensor *input, - const Tensor *filter, - Tensor *output, - const int out_tile_height, - const int out_tile_width, - std::unique_ptr *padded_input, - std::unique_ptr *padded_output); - - void PadInput(const Tensor &src, - const int pad_top, - const int pad_left, - Tensor *dst); - void UnPadOutput(const Tensor &src, Tensor *dst); -}; - -} // namespace fp32 -} // namespace arm -} // namespace ops -} // namespace mace - -#endif // MACE_OPS_ARM_FP32_CONV_2D_H_ diff --git a/mace/ops/arm/fp32/conv_2d_1xn.cc b/mace/ops/arm/fp32/conv_2d_1xn.cc index 0b5d335a69753c705a49180c5e005f6bbff125b2..527ac0980caf189765322c470227d3d32c189e9d 100644 --- a/mace/ops/arm/fp32/conv_2d_1xn.cc +++ b/mace/ops/arm/fp32/conv_2d_1xn.cc @@ -1,4 +1,4 @@ -// Copyright 2019 The MACE Authors. All Rights Reserved. +// Copyright 2020 The MACE Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -12,93 +12,44 @@ // See the License for the specific language governing permissions and // limitations under the License. 
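The rewritten kernels below take a const ConvComputeParam &p instead of raw tensors: padding, output resizing, and shape bookkeeping move into the shared base class, so each DoCompute specialization is reduced to its NEON loops. ConvComputeParam itself is defined outside this diff; judging by the fields the kernels read, it presumably bundles roughly the following (a sketch, not the actual definition):

// Presumed contents of ConvComputeParam: the per-call constants that every
// Conv2dK*::Compute used to derive by hand, plus the thread pool.
struct ConvComputeParam {
  const index_t batch;
  const index_t in_channels, in_height, in_width;
  const index_t out_channels, out_height, out_width;
  const index_t in_image_size;   // in_height * in_width
  const index_t out_image_size;  // out_height * out_width
  const index_t in_batch_size;   // in_channels * in_image_size
  const index_t out_batch_size;  // out_channels * out_image_size
  utils::ThreadPool &thread_pool;
};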
-#include "mace/ops/arm/fp32/conv_2d_1xn.h" - #include #include +#include "mace/ops/arm/base/conv_2d_1xn.h" #include "mace/ops/delegator/conv_2d.h" namespace mace { namespace ops { namespace arm { -namespace fp32 { - -MaceStatus Conv2dK1x7S1::Compute(const OpContext *context, - const Tensor *input, - const Tensor *filter, - Tensor *output) { - std::unique_ptr padded_input; - std::unique_ptr padded_output; - - ResizeOutAndPadInOut(context, - input, - filter, - output, - 1, - 4, - &padded_input, - &padded_output); - const Tensor *in_tensor = input; - if (padded_input != nullptr) { - in_tensor = padded_input.get(); - } - Tensor *out_tensor = output; - if (padded_output != nullptr) { - out_tensor = padded_output.get(); - } - out_tensor->Clear(); - - Tensor::MappingGuard in_guard(input); - Tensor::MappingGuard filter_guard(filter); - Tensor::MappingGuard out_guard(output); - auto filter_data = filter->data(); - auto input_data = in_tensor->data(); - auto output_data = out_tensor->mutable_data(); - - auto &in_shape = in_tensor->shape(); - auto &out_shape = out_tensor->shape(); - - const index_t batch = in_shape[0]; - const index_t in_channels = in_shape[1]; - const index_t in_height = in_shape[2]; - const index_t in_width = in_shape[3]; - const index_t out_channels = out_shape[1]; - const index_t out_height = out_shape[2]; - const index_t out_width = out_shape[3]; - - const index_t in_image_size = in_height * in_width; - const index_t out_image_size = out_height * out_width; - const index_t in_batch_size = in_channels * in_image_size; - const index_t out_batch_size = out_channels * out_image_size; - - utils::ThreadPool - &thread_pool = context->device()->cpu_runtime()->thread_pool(); - - thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, - index_t start1, index_t end1, index_t step1) { + +template<> +MaceStatus Conv2dK1x7S1::DoCompute( + const ConvComputeParam &p, const float *filter_data, + const float *input_data, float *output_data) { + p.thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { for (index_t b = start0; b < end0; b += step0) { for (index_t m = start1; m < end1; m += step1) { - if (m + 3 < out_channels) { + if (m + 3 < p.out_channels) { float *out_ptr0_base = - output_data + b * out_batch_size + m * out_image_size; + output_data + b * p.out_batch_size + m * p.out_image_size; float *out_ptr1_base = - output_data + b * out_batch_size + (m + 1) * out_image_size; + output_data + b * p.out_batch_size + (m + 1) * p.out_image_size; float *out_ptr2_base = - output_data + b * out_batch_size + (m + 2) * out_image_size; + output_data + b * p.out_batch_size + (m + 2) * p.out_image_size; float *out_ptr3_base = - output_data + b * out_batch_size + (m + 3) * out_image_size; - for (index_t c = 0; c < in_channels; ++c) { + output_data + b * p.out_batch_size + (m + 3) * p.out_image_size; + for (index_t c = 0; c < p.in_channels; ++c) { const float *in_ptr_base = - input_data + b * in_batch_size + c * in_image_size; - const float - *filter_ptr0 = filter_data + m * in_channels * 7 + c * 7; - const float - *filter_ptr1 = filter_data + (m + 1) * in_channels * 7 + c * 7; - const float - *filter_ptr2 = filter_data + (m + 2) * in_channels * 7 + c * 7; - const float - *filter_ptr3 = filter_data + (m + 3) * in_channels * 7 + c * 7; + input_data + b * p.in_batch_size + c * p.in_image_size; + const float *filter_ptr0 = + filter_data + m * p.in_channels * 7 + c * 7; + const float *filter_ptr1 = + filter_data + (m + 1) * 
p.in_channels * 7 + c * 7; + const float *filter_ptr2 = + filter_data + (m + 2) * p.in_channels * 7 + c * 7; + const float *filter_ptr3 = + filter_data + (m + 3) * p.in_channels * 7 + c * 7; /* load filter (4 outch x 1 height x 4 width) */ float32x4_t vf00, vf01; float32x4_t vf10, vf11; @@ -113,12 +64,12 @@ MaceStatus Conv2dK1x7S1::Compute(const OpContext *context, vf30 = vld1q_f32(filter_ptr3); vf31 = vld1q_f32(filter_ptr3 + 3); - for (index_t h = 0; h < out_height; ++h) { - for (index_t w = 0; w + 3 < out_width; w += 4) { + for (index_t h = 0; h < p.out_height; ++h) { + for (index_t w = 0; w + 3 < p.out_width; w += 4) { // output (4 outch x 1 height x 4 width): vo_outch_height float32x4_t vo0, vo1, vo2, vo3; // load output - index_t out_offset = h * out_width + w; + index_t out_offset = h * p.out_width + w; vo0 = vld1q_f32(out_ptr0_base + out_offset); vo1 = vld1q_f32(out_ptr1_base + out_offset); vo2 = vld1q_f32(out_ptr2_base + out_offset); @@ -127,7 +78,7 @@ MaceStatus Conv2dK1x7S1::Compute(const OpContext *context, // input (3 slide) float32x4_t vi0, vi1, vi2, vi3, vi4, vi5, vi6, vi8; // input offset - index_t in_offset = h * in_width + w; + index_t in_offset = h * p.in_width + w; // load input vi0 = vld1q_f32(in_ptr_base + in_offset); vi4 = vld1q_f32(in_ptr_base + in_offset + 4); @@ -214,31 +165,31 @@ MaceStatus Conv2dK1x7S1::Compute(const OpContext *context, } // h } // c } else { - for (index_t mm = m; mm < out_channels; ++mm) { + for (index_t mm = m; mm < p.out_channels; ++mm) { float *out_ptr0_base = - output_data + b * out_batch_size + mm * out_image_size; - for (index_t c = 0; c < in_channels; ++c) { + output_data + b * p.out_batch_size + mm * p.out_image_size; + for (index_t c = 0; c < p.in_channels; ++c) { const float *in_ptr_base = - input_data + b * in_batch_size + c * in_image_size; - const float - *filter_ptr0 = filter_data + mm * in_channels * 7 + c * 7; + input_data + b * p.in_batch_size + c * p.in_image_size; + const float *filter_ptr0 = + filter_data + mm * p.in_channels * 7 + c * 7; /* load filter (1 outch x 1 height x 4 width) */ float32x4_t vf00, vf01; vf00 = vld1q_f32(filter_ptr0); vf01 = vld1q_f32(filter_ptr0 + 3); - for (index_t h = 0; h < out_height; ++h) { - for (index_t w = 0; w + 3 < out_width; w += 4) { + for (index_t h = 0; h < p.out_height; ++h) { + for (index_t w = 0; w + 3 < p.out_width; w += 4) { // output (1 outch x 1 height x 4 width): vo_outch_height float32x4_t vo0; // load output - index_t out_offset = h * out_width + w; + index_t out_offset = h * p.out_width + w; vo0 = vld1q_f32(out_ptr0_base + out_offset); // input (3 slide) float32x4_t vi0, vi1, vi2, vi3, vi4, vi5, vi6, vi8; // input offset - index_t in_offset = h * in_width + w; + index_t in_offset = h * p.in_width + w; // load input vi0 = vld1q_f32(in_ptr_base + in_offset); vi4 = vld1q_f32(in_ptr_base + in_offset + 4); @@ -275,87 +226,39 @@ MaceStatus Conv2dK1x7S1::Compute(const OpContext *context, } // if } // m } // b - }, 0, batch, 1, 0, out_channels, 4); + }, 0, p.batch, 1, 0, p.out_channels, 4); - UnPadOutput(*out_tensor, output); return MaceStatus::MACE_SUCCESS; } -MaceStatus Conv2dK7x1S1::Compute(const OpContext *context, - const Tensor *input, - const Tensor *filter, - Tensor *output) { - std::unique_ptr padded_input; - std::unique_ptr padded_output; - - ResizeOutAndPadInOut(context, - input, - filter, - output, - 4, - 1, - &padded_input, - &padded_output); - const Tensor *in_tensor = input; - if (padded_input != nullptr) { - in_tensor = padded_input.get(); - } - Tensor *out_tensor = 
output; - if (padded_output != nullptr) { - out_tensor = padded_output.get(); - } - out_tensor->Clear(); - - Tensor::MappingGuard in_guard(input); - Tensor::MappingGuard filter_guard(filter); - Tensor::MappingGuard out_guard(output); - auto filter_data = filter->data(); - auto input_data = in_tensor->data(); - auto output_data = out_tensor->mutable_data(); - - auto &in_shape = in_tensor->shape(); - auto &out_shape = out_tensor->shape(); - - const index_t batch = in_shape[0]; - const index_t in_channels = in_shape[1]; - const index_t in_height = in_shape[2]; - const index_t in_width = in_shape[3]; - const index_t out_channels = out_shape[1]; - const index_t out_height = out_shape[2]; - const index_t out_width = out_shape[3]; - - const index_t in_image_size = in_height * in_width; - const index_t out_image_size = out_height * out_width; - const index_t in_batch_size = in_channels * in_image_size; - const index_t out_batch_size = out_channels * out_image_size; - - utils::ThreadPool - &thread_pool = context->device()->cpu_runtime()->thread_pool(); - - thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, - index_t start1, index_t end1, index_t step1) { +template<> +MaceStatus Conv2dK7x1S1::DoCompute( + const ConvComputeParam &p, const float *filter_data, + const float *input_data, float *output_data) { + p.thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { for (index_t b = start0; b < end0; b += step0) { for (index_t m = start1; m < end1; m += step1) { - if (m + 3 < out_channels) { + if (m + 3 < p.out_channels) { float *out_ptr0_base = - output_data + b * out_batch_size + m * out_image_size; + output_data + b * p.out_batch_size + m * p.out_image_size; float *out_ptr1_base = - output_data + b * out_batch_size + (m + 1) * out_image_size; + output_data + b * p.out_batch_size + (m + 1) * p.out_image_size; float *out_ptr2_base = - output_data + b * out_batch_size + (m + 2) * out_image_size; + output_data + b * p.out_batch_size + (m + 2) * p.out_image_size; float *out_ptr3_base = - output_data + b * out_batch_size + (m + 3) * out_image_size; - for (index_t c = 0; c < in_channels; ++c) { + output_data + b * p.out_batch_size + (m + 3) * p.out_image_size; + for (index_t c = 0; c < p.in_channels; ++c) { const float *in_ptr_base = - input_data + b * in_batch_size + c * in_image_size; - const float - *filter_ptr0 = filter_data + m * in_channels * 7 + c * 7; - const float - *filter_ptr1 = filter_data + (m + 1) * in_channels * 7 + c * 7; - const float - *filter_ptr2 = filter_data + (m + 2) * in_channels * 7 + c * 7; - const float - *filter_ptr3 = filter_data + (m + 3) * in_channels * 7 + c * 7; + input_data + b * p.in_batch_size + c * p.in_image_size; + const float *filter_ptr0 = + filter_data + m * p.in_channels * 7 + c * 7; + const float *filter_ptr1 = + filter_data + (m + 1) * p.in_channels * 7 + c * 7; + const float *filter_ptr2 = + filter_data + (m + 2) * p.in_channels * 7 + c * 7; + const float *filter_ptr3 = + filter_data + (m + 3) * p.in_channels * 7 + c * 7; /* load filter (4 outch x 4 height x 1 width) */ float32x4_t vf00, vf01; float32x4_t vf10, vf11; @@ -370,41 +273,41 @@ MaceStatus Conv2dK7x1S1::Compute(const OpContext *context, vf30 = vld1q_f32(filter_ptr3); vf31 = vld1q_f32(filter_ptr3 + 3); - for (index_t h = 0; h + 3 < out_height; h += 4) { - for (index_t w = 0; w < out_width; ++w) { + for (index_t h = 0; h + 3 < p.out_height; h += 4) { + for (index_t w = 0; w < p.out_width; ++w) { // load output - index_t 
out_offset = h * out_width + w; + index_t out_offset = h * p.out_width + w; // output (4 outch x 4 height x 1 width): vo_outch_height float32x4_t vo0 = {out_ptr0_base[out_offset], - out_ptr0_base[out_offset + out_width], - out_ptr0_base[out_offset + 2 * out_width], - out_ptr0_base[out_offset + 3 * out_width]}; + out_ptr0_base[out_offset + p.out_width], + out_ptr0_base[out_offset + 2 * p.out_width], + out_ptr0_base[out_offset + 3 * p.out_width]}; float32x4_t vo1 = {out_ptr1_base[out_offset], - out_ptr1_base[out_offset + out_width], - out_ptr1_base[out_offset + 2 * out_width], - out_ptr1_base[out_offset + 3 * out_width]}; + out_ptr1_base[out_offset + p.out_width], + out_ptr1_base[out_offset + 2 * p.out_width], + out_ptr1_base[out_offset + 3 * p.out_width]}; float32x4_t vo2 = {out_ptr2_base[out_offset], - out_ptr2_base[out_offset + out_width], - out_ptr2_base[out_offset + 2 * out_width], - out_ptr2_base[out_offset + 3 * out_width]}; + out_ptr2_base[out_offset + p.out_width], + out_ptr2_base[out_offset + 2 * p.out_width], + out_ptr2_base[out_offset + 3 * p.out_width]}; float32x4_t vo3 = {out_ptr3_base[out_offset], - out_ptr3_base[out_offset + out_width], - out_ptr3_base[out_offset + 2 * out_width], - out_ptr3_base[out_offset + 3 * out_width]}; + out_ptr3_base[out_offset + p.out_width], + out_ptr3_base[out_offset + 2 * p.out_width], + out_ptr3_base[out_offset + 3 * p.out_width]}; // input offset - index_t in_offset = h * in_width + w; + index_t in_offset = h * p.in_width + w; // input (3 slide) float32x4_t vi0 = {in_ptr_base[in_offset], - in_ptr_base[in_offset + in_width], - in_ptr_base[in_offset + 2 * in_width], - in_ptr_base[in_offset + 3 * in_width]}; - float32x4_t vi4 = {in_ptr_base[in_offset + 4 * in_width], - in_ptr_base[in_offset + 5 * in_width], - in_ptr_base[in_offset + 6 * in_width], - in_ptr_base[in_offset + 7 * in_width]}; - float32x4_t vi8 = {in_ptr_base[in_offset + 8 * in_width], - in_ptr_base[in_offset + 9 * in_width]}; + in_ptr_base[in_offset + p.in_width], + in_ptr_base[in_offset + 2 * p.in_width], + in_ptr_base[in_offset + 3 * p.in_width]}; + float32x4_t vi4 = {in_ptr_base[in_offset + 4 * p.in_width], + in_ptr_base[in_offset + 5 * p.in_width], + in_ptr_base[in_offset + 6 * p.in_width], + in_ptr_base[in_offset + 7 * p.in_width]}; + float32x4_t vi8 = {in_ptr_base[in_offset + 8 * p.in_width], + in_ptr_base[in_offset + 9 * p.in_width]}; float32x4_t vi1 = vextq_f32(vi0, vi4, 1); float32x4_t vi2 = vextq_f32(vi0, vi4, 2); float32x4_t vi3 = vextq_f32(vi0, vi4, 3); @@ -480,63 +383,65 @@ MaceStatus Conv2dK7x1S1::Compute(const OpContext *context, #endif out_ptr0_base[out_offset] = vo0[0]; - out_ptr0_base[out_offset + out_width] = vo0[1]; - out_ptr0_base[out_offset + 2 * out_width] = vo0[2]; - out_ptr0_base[out_offset + 3 * out_width] = vo0[3]; + out_ptr0_base[out_offset + p.out_width] = vo0[1]; + out_ptr0_base[out_offset + 2 * p.out_width] = vo0[2]; + out_ptr0_base[out_offset + 3 * p.out_width] = vo0[3]; out_ptr1_base[out_offset] = vo1[0]; - out_ptr1_base[out_offset + out_width] = vo1[1]; - out_ptr1_base[out_offset + 2 * out_width] = vo1[2]; - out_ptr1_base[out_offset + 3 * out_width] = vo1[3]; + out_ptr1_base[out_offset + p.out_width] = vo1[1]; + out_ptr1_base[out_offset + 2 * p.out_width] = vo1[2]; + out_ptr1_base[out_offset + 3 * p.out_width] = vo1[3]; out_ptr2_base[out_offset] = vo2[0]; - out_ptr2_base[out_offset + out_width] = vo2[1]; - out_ptr2_base[out_offset + 2 * out_width] = vo2[2]; - out_ptr2_base[out_offset + 3 * out_width] = vo2[3]; + out_ptr2_base[out_offset + p.out_width] 
= vo2[1]; + out_ptr2_base[out_offset + 2 * p.out_width] = vo2[2]; + out_ptr2_base[out_offset + 3 * p.out_width] = vo2[3]; out_ptr3_base[out_offset] = vo3[0]; - out_ptr3_base[out_offset + out_width] = vo3[1]; - out_ptr3_base[out_offset + 2 * out_width] = vo3[2]; - out_ptr3_base[out_offset + 3 * out_width] = vo3[3]; + out_ptr3_base[out_offset + p.out_width] = vo3[1]; + out_ptr3_base[out_offset + 2 * p.out_width] = vo3[2]; + out_ptr3_base[out_offset + 3 * p.out_width] = vo3[3]; } // w } // h } // c } else { - for (index_t mm = m; mm < out_channels; ++mm) { + for (index_t mm = m; mm < p.out_channels; ++mm) { float *out_ptr0_base = - output_data + b * out_batch_size + mm * out_image_size; - for (index_t c = 0; c < in_channels; ++c) { + output_data + b * p.out_batch_size + mm * p.out_image_size; + for (index_t c = 0; c < p.in_channels; ++c) { const float *in_ptr_base = - input_data + b * in_batch_size + c * in_image_size; - const float - *filter_ptr0 = filter_data + mm * in_channels * 7 + c * 7; + input_data + b * p.in_batch_size + c * p.in_image_size; + const float *filter_ptr0 = + filter_data + mm * p.in_channels * 7 + c * 7; /* load filter (1 outch x 4 height x 1 width) */ float32x4_t vf00, vf01; vf00 = vld1q_f32(filter_ptr0); vf01 = vld1q_f32(filter_ptr0 + 3); - for (index_t h = 0; h + 3 < out_height; h += 4) { - for (index_t w = 0; w < out_width; ++w) { + for (index_t h = 0; h + 3 < p.out_height; h += 4) { + for (index_t w = 0; w < p.out_width; ++w) { // load output - index_t out_offset = h * out_width + w; + index_t out_offset = h * p.out_width + w; // output (1 outch x 4 height x 1 width): vo_outch_height float32x4_t vo0 = {out_ptr0_base[out_offset], - out_ptr0_base[out_offset + out_width], - out_ptr0_base[out_offset + 2 * out_width], - out_ptr0_base[out_offset + 3 * out_width]}; + out_ptr0_base[out_offset + p.out_width], + out_ptr0_base[out_offset + + 2 * p.out_width], + out_ptr0_base[out_offset + + 3 * p.out_width]}; // input offset - index_t in_offset = h * in_width + w; + index_t in_offset = h * p.in_width + w; // input (3 slide) float32x4_t vi0 = {in_ptr_base[in_offset], - in_ptr_base[in_offset + in_width], - in_ptr_base[in_offset + 2 * in_width], - in_ptr_base[in_offset + 3 * in_width]}; - float32x4_t vi4 = {in_ptr_base[in_offset + 4 * in_width], - in_ptr_base[in_offset + 5 * in_width], - in_ptr_base[in_offset + 6 * in_width], - in_ptr_base[in_offset + 7 * in_width]}; - float32x4_t vi8 = {in_ptr_base[in_offset + 8 * in_width], - in_ptr_base[in_offset + 9 * in_width], - in_ptr_base[in_offset + 10 * in_width], - in_ptr_base[in_offset + 11 * in_width]}; + in_ptr_base[in_offset + p.in_width], + in_ptr_base[in_offset + 2 * p.in_width], + in_ptr_base[in_offset + 3 * p.in_width]}; + float32x4_t vi4 = {in_ptr_base[in_offset + 4 * p.in_width], + in_ptr_base[in_offset + 5 * p.in_width], + in_ptr_base[in_offset + 6 * p.in_width], + in_ptr_base[in_offset + 7 * p.in_width]}; + float32x4_t vi8 = {in_ptr_base[in_offset + 8 * p.in_width], + in_ptr_base[in_offset + 9 * p.in_width], + in_ptr_base[in_offset + 10 * p.in_width], + in_ptr_base[in_offset + 11 * p.in_width]}; float32x4_t vi1 = vextq_f32(vi0, vi4, 1); float32x4_t vi2 = vextq_f32(vi0, vi4, 2); float32x4_t vi3 = vextq_f32(vi0, vi4, 3); @@ -562,9 +467,9 @@ MaceStatus Conv2dK7x1S1::Compute(const OpContext *context, #endif out_ptr0_base[out_offset] = vo0[0]; - out_ptr0_base[out_offset + out_width] = vo0[1]; - out_ptr0_base[out_offset + 2 * out_width] = vo0[2]; - out_ptr0_base[out_offset + 3 * out_width] = vo0[3]; + out_ptr0_base[out_offset + 
p.out_width] = vo0[1]; + out_ptr0_base[out_offset + 2 * p.out_width] = vo0[2]; + out_ptr0_base[out_offset + 3 * p.out_width] = vo0[3]; } // w } // h } // c @@ -572,78 +477,30 @@ MaceStatus Conv2dK7x1S1::Compute(const OpContext *context, } // if } // m } // b - }, 0, batch, 1, 0, out_channels, 4); + }, 0, p.batch, 1, 0, p.out_channels, 4); - UnPadOutput(*out_tensor, output); return MaceStatus::MACE_SUCCESS; } -MaceStatus Conv2dK1x15S1::Compute(const OpContext *context, - const Tensor *input, - const Tensor *filter, - Tensor *output) { - std::unique_ptr padded_input; - std::unique_ptr padded_output; - - ResizeOutAndPadInOut(context, - input, - filter, - output, - 1, - 4, - &padded_input, - &padded_output); - const Tensor *in_tensor = input; - if (padded_input.get() != nullptr) { - in_tensor = padded_input.get(); - } - Tensor *out_tensor = output; - if (padded_output.get() != nullptr) { - out_tensor = padded_output.get(); - } - out_tensor->Clear(); - - Tensor::MappingGuard in_guard(input); - Tensor::MappingGuard filter_guard(filter); - Tensor::MappingGuard out_guard(output); - auto filter_data = filter->data(); - auto input_data = in_tensor->data(); - auto output_data = out_tensor->mutable_data(); - - auto &in_shape = in_tensor->shape(); - auto &out_shape = out_tensor->shape(); - - const index_t batch = in_shape[0]; - const index_t in_channels = in_shape[1]; - const index_t in_height = in_shape[2]; - const index_t in_width = in_shape[3]; - const index_t out_channels = out_shape[1]; - const index_t out_height = out_shape[2]; - const index_t out_width = out_shape[3]; - - const index_t in_image_size = in_height * in_width; - const index_t out_image_size = out_height * out_width; - const index_t in_batch_size = in_channels * in_image_size; - const index_t out_batch_size = out_channels * out_image_size; - +template<> +MaceStatus Conv2dK1x15S1::DoCompute( + const ConvComputeParam &p, const float *filter_data, + const float *input_data, float *output_data) { const index_t tile_height = - out_channels < 4 ? RoundUpDiv4(out_height) : out_height; - - utils::ThreadPool - &thread_pool = context->device()->cpu_runtime()->thread_pool(); + p.out_channels < 4 ? 
RoundUpDiv4(p.out_height) : p.out_height; - thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, - index_t start1, index_t end1, index_t step1) { + p.thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { for (index_t b = start0; b < end0; b += step0) { for (index_t m = start1; m < end1; m += step1) { - for (index_t h = 0; h < out_height; h += tile_height) { + for (index_t h = 0; h < p.out_height; h += tile_height) { float *out_ptr_base = - output_data + b * out_batch_size + m * out_image_size; - for (index_t c = 0; c < in_channels; ++c) { + output_data + b * p.out_batch_size + m * p.out_image_size; + for (index_t c = 0; c < p.in_channels; ++c) { const float *in_ptr_base = - input_data + b * in_batch_size + c * in_image_size; - const float - *filter_ptr = filter_data + m * in_channels * 15 + c * 15; + input_data + b * p.in_batch_size + c * p.in_image_size; + const float *filter_ptr = + filter_data + m * p.in_channels * 15 + c * 15; /* load filter (1 outch x 4 height x 1 width) */ float32x4_t vf0, vf1, vf2, vf3; vf0 = vld1q_f32(filter_ptr); @@ -651,20 +508,20 @@ MaceStatus Conv2dK1x15S1::Compute(const OpContext *context, vf2 = vld1q_f32(filter_ptr + 8); vf3 = vld1q_f32(filter_ptr + 11); - for (index_t ht = 0; ht < tile_height && h + ht < out_height; + for (index_t ht = 0; ht < tile_height && h + ht < p.out_height; ++ht) { - for (index_t w = 0; w + 3 < out_width; w += 4) { + for (index_t w = 0; w + 3 < p.out_width; w += 4) { // output (1 outch x 1 height x 4 width): vo_outch_height float32x4_t vo; // load output - index_t out_offset = (h + ht) * out_width + w; + index_t out_offset = (h + ht) * p.out_width + w; vo = vld1q_f32(out_ptr_base + out_offset); // input (3 slide) float32x4_t vi0, vi1, vi2, vi3, vi4, vi5, vi6, vi7, vi8, vi9, vi10, vi11, vi12, vi13, vi14, vi16; // input offset - index_t in_offset = (h + ht) * in_width + w; + index_t in_offset = (h + ht) * p.in_width + w; // load input vi0 = vld1q_f32(in_ptr_base + in_offset); vi4 = vld1q_f32(in_ptr_base + in_offset + 4); @@ -706,78 +563,30 @@ MaceStatus Conv2dK1x15S1::Compute(const OpContext *context, } // h } // m } // b - }, 0, batch, 1, 0, out_channels, 1); + }, 0, p.batch, 1, 0, p.out_channels, 1); - UnPadOutput(*out_tensor, output); return MaceStatus::MACE_SUCCESS; } -MaceStatus Conv2dK15x1S1::Compute(const OpContext *context, - const Tensor *input, - const Tensor *filter, - Tensor *output) { - std::unique_ptr padded_input; - std::unique_ptr padded_output; - - ResizeOutAndPadInOut(context, - input, - filter, - output, - 4, - 1, - &padded_input, - &padded_output); - const Tensor *in_tensor = input; - if (padded_input.get() != nullptr) { - in_tensor = padded_input.get(); - } - Tensor *out_tensor = output; - if (padded_output.get() != nullptr) { - out_tensor = padded_output.get(); - } - out_tensor->Clear(); - - Tensor::MappingGuard in_guard(input); - Tensor::MappingGuard filter_guard(filter); - Tensor::MappingGuard out_guard(output); - auto filter_data = filter->data(); - auto input_data = in_tensor->data(); - auto output_data = out_tensor->mutable_data(); - - auto &in_shape = in_tensor->shape(); - auto &out_shape = out_tensor->shape(); - - const index_t batch = in_shape[0]; - const index_t in_channels = in_shape[1]; - const index_t in_height = in_shape[2]; - const index_t in_width = in_shape[3]; - const index_t out_channels = out_shape[1]; - const index_t out_height = out_shape[2]; - const index_t out_width = out_shape[3]; - - const index_t 
in_image_size = in_height * in_width; - const index_t out_image_size = out_height * out_width; - const index_t in_batch_size = in_channels * in_image_size; - const index_t out_batch_size = out_channels * out_image_size; +template<> +MaceStatus Conv2dK15x1S1::DoCompute( + const ConvComputeParam &p, const float *filter_data, + const float *input_data, float *output_data) { const index_t tile_width = - out_channels < 4 ? RoundUpDiv4(out_width) : out_width; - - utils::ThreadPool - &thread_pool = context->device()->cpu_runtime()->thread_pool(); - - thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, - index_t start1, index_t end1, index_t step1) { + p.out_channels < 4 ? RoundUpDiv4(p.out_width) : p.out_width; + p.thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { for (index_t b = start0; b < end0; b += step0) { for (index_t m = start1; m < end1; m += step1) { - for (index_t w = 0; w < out_width; w += tile_width) { + for (index_t w = 0; w < p.out_width; w += tile_width) { float *out_ptr_base = - output_data + b * out_batch_size + m * out_image_size; - for (index_t c = 0; c < in_channels; ++c) { + output_data + b * p.out_batch_size + m * p.out_image_size; + for (index_t c = 0; c < p.in_channels; ++c) { const float *in_ptr_base = - input_data + b * in_batch_size + c * in_image_size; - const float - *filter_ptr = filter_data + m * in_channels * 15 + c * 15; + input_data + b * p.in_batch_size + c * p.in_image_size; + const float *filter_ptr = + filter_data + m * p.in_channels * 15 + c * 15; /* load filter (1 outch x 4 height x 1 width) */ float32x4_t vf0, vf1, vf2, vf3; vf0 = vld1q_f32(filter_ptr); @@ -785,38 +594,38 @@ MaceStatus Conv2dK15x1S1::Compute(const OpContext *context, vf2 = vld1q_f32(filter_ptr + 8); vf3 = vld1q_f32(filter_ptr + 11); - for (index_t h = 0; h + 3 < out_height; h += 4) { - for (index_t wt = 0; wt < tile_width && w + wt < out_width; + for (index_t h = 0; h + 3 < p.out_height; h += 4) { + for (index_t wt = 0; wt < tile_width && w + wt < p.out_width; ++wt) { // load output - index_t out_offset = h * out_width + w + wt; + index_t out_offset = h * p.out_width + w + wt; // output (1 outch x 4 height x 1 width): vo_outch_height float32x4_t vo = {out_ptr_base[out_offset], - out_ptr_base[out_offset + out_width], - out_ptr_base[out_offset + 2 * out_width], - out_ptr_base[out_offset + 3 * out_width]}; + out_ptr_base[out_offset + p.out_width], + out_ptr_base[out_offset + 2 * p.out_width], + out_ptr_base[out_offset + 3 * p.out_width]}; // input offset - index_t in_offset = h * in_width + w + wt; + index_t in_offset = h * p.in_width + w + wt; // input (3 slide) float32x4_t vi0 = {in_ptr_base[in_offset], - in_ptr_base[in_offset + in_width], - in_ptr_base[in_offset + 2 * in_width], - in_ptr_base[in_offset + 3 * in_width]}; - float32x4_t vi4 = {in_ptr_base[in_offset + 4 * in_width], - in_ptr_base[in_offset + 5 * in_width], - in_ptr_base[in_offset + 6 * in_width], - in_ptr_base[in_offset + 7 * in_width]}; - float32x4_t vi8 = {in_ptr_base[in_offset + 8 * in_width], - in_ptr_base[in_offset + 9 * in_width], - in_ptr_base[in_offset + 10 * in_width], - in_ptr_base[in_offset + 11 * in_width]}; - float32x4_t vi12 = {in_ptr_base[in_offset + 12 * in_width], - in_ptr_base[in_offset + 13 * in_width], - in_ptr_base[in_offset + 14 * in_width], - in_ptr_base[in_offset + 15 * in_width]}; - float32x4_t vi16 = {in_ptr_base[in_offset + 16 * in_width], - in_ptr_base[in_offset + 17 * in_width]}; + in_ptr_base[in_offset + 
p.in_width], + in_ptr_base[in_offset + 2 * p.in_width], + in_ptr_base[in_offset + 3 * p.in_width]}; + float32x4_t vi4 = {in_ptr_base[in_offset + 4 * p.in_width], + in_ptr_base[in_offset + 5 * p.in_width], + in_ptr_base[in_offset + 6 * p.in_width], + in_ptr_base[in_offset + 7 * p.in_width]}; + float32x4_t vi8 = {in_ptr_base[in_offset + 8 * p.in_width], + in_ptr_base[in_offset + 9 * p.in_width], + in_ptr_base[in_offset + 10 * p.in_width], + in_ptr_base[in_offset + 11 * p.in_width]}; + float32x4_t vi12 = {in_ptr_base[in_offset + 12 * p.in_width], + in_ptr_base[in_offset + 13 * p.in_width], + in_ptr_base[in_offset + 14 * p.in_width], + in_ptr_base[in_offset + 15 * p.in_width]}; + float32x4_t vi16 = {in_ptr_base[in_offset + 16 * p.in_width], + in_ptr_base[in_offset + 17 * p.in_width]}; float32x4_t vi1 = vextq_f32(vi0, vi4, 1); float32x4_t vi2 = vextq_f32(vi0, vi4, 2); float32x4_t vi3 = vextq_f32(vi0, vi4, 3); @@ -846,44 +655,20 @@ MaceStatus Conv2dK15x1S1::Compute(const OpContext *context, vo = vmlaq_lane_f32(vo, vi14, vget_high_f32(vf3), 1); out_ptr_base[out_offset] = vo[0]; - out_ptr_base[out_offset + out_width] = vo[1]; - out_ptr_base[out_offset + 2 * out_width] = vo[2]; - out_ptr_base[out_offset + 3 * out_width] = vo[3]; + out_ptr_base[out_offset + p.out_width] = vo[1]; + out_ptr_base[out_offset + 2 * p.out_width] = vo[2]; + out_ptr_base[out_offset + 3 * p.out_width] = vo[3]; } // wt } // h } // c } // w } // m } // b - }, 0, batch, 1, 0, out_channels, 1); + }, 0, p.batch, 1, 0, p.out_channels, 1); - UnPadOutput(*out_tensor, output); return MaceStatus::MACE_SUCCESS; } -void RegisterConv2dK1xNDelegator(OpDelegatorRegistry *registry) { - MACE_REGISTER_DELEGATOR( - registry, Conv2dK1x7S1, delegator::Conv2dParam, - MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU, - float, ImplType::NEON, K1x7S1)); - - MACE_REGISTER_DELEGATOR( - registry, Conv2dK7x1S1, delegator::Conv2dParam, - MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU, - float, ImplType::NEON, K7x1S1)); - - MACE_REGISTER_DELEGATOR( - registry, Conv2dK1x15S1, delegator::Conv2dParam, - MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU, - float, ImplType::NEON, K1x15S1)); - - MACE_REGISTER_DELEGATOR( - registry, Conv2dK15x1S1, delegator::Conv2dParam, - MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU, - float, ImplType::NEON, K15x1S1)); -} - -} // namespace fp32 } // namespace arm } // namespace ops } // namespace mace diff --git a/mace/ops/arm/fp32/conv_2d_3x3.cc b/mace/ops/arm/fp32/conv_2d_3x3.cc index 84635c7cac26d7c76bd82cd181716c2f5b987ecd..d058e0780b0cd621f8fb348c268717e2445257b0 100644 --- a/mace/ops/arm/fp32/conv_2d_3x3.cc +++ b/mace/ops/arm/fp32/conv_2d_3x3.cc @@ -1,4 +1,4 @@ -// Copyright 2019 The MACE Authors. All Rights Reserved. +// Copyright 2020 The MACE Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -12,95 +12,47 @@ // See the License for the specific language governing permissions and // limitations under the License. 
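The 1xN/Nx1 kernels above, and the 3x3 kernels that follow, lean on one register-level idiom: load two adjacent four-lane vectors, then build every shifted input window with vextq_f32 instead of issuing overlapping loads (the stride-2 3x3 kernel instead de-interleaves even/odd columns in a single vld2q_f32). A minimal sketch of the sliding window for a 3-tap horizontal filter (hypothetical helper, not from this patch):

#include <arm_neon.h>

// Four outputs of a 3-tap filter from just two loads: vextq_f32 derives the
// x+1 and x+2 windows in registers. Taps live in lanes 0..2 of vf.
inline float32x4_t Filter3TapX4(const float *in, float32x4_t vf) {
  float32x4_t vi0 = vld1q_f32(in);      // x[0..3]
  float32x4_t vi4 = vld1q_f32(in + 4);  // x[4..7]
  float32x4_t vi1 = vextq_f32(vi0, vi4, 1);  // x[1..4]
  float32x4_t vi2 = vextq_f32(vi0, vi4, 2);  // x[2..5]
  float32x4_t vo = vdupq_n_f32(0.f);
  vo = vmlaq_lane_f32(vo, vi0, vget_low_f32(vf), 0);   // + tap0 * x[i]
  vo = vmlaq_lane_f32(vo, vi1, vget_low_f32(vf), 1);   // + tap1 * x[i+1]
  vo = vmlaq_lane_f32(vo, vi2, vget_high_f32(vf), 0);  // + tap2 * x[i+2]
  return vo;
}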
-#include "mace/ops/arm/fp32/conv_2d_3x3.h" - #include #include +#include "mace/ops/arm/base/conv_2d_3x3.h" #include "mace/ops/delegator/conv_2d.h" namespace mace { namespace ops { namespace arm { -namespace fp32 { - -MaceStatus Conv2dK3x3S1::Compute(const OpContext *context, - const Tensor *input, - const Tensor *filter, - Tensor *output) { - std::unique_ptr padded_input; - std::unique_ptr padded_output; - ResizeOutAndPadInOut(context, - input, - filter, - output, - 2, - 4, - &padded_input, - &padded_output); - const Tensor *in_tensor = input; - if (padded_input != nullptr) { - in_tensor = padded_input.get(); - } - Tensor *out_tensor = output; - if (padded_output != nullptr) { - out_tensor = padded_output.get(); - } - out_tensor->Clear(); - - Tensor::MappingGuard in_guard(input); - Tensor::MappingGuard filter_guard(filter); - Tensor::MappingGuard out_guard(output); - auto filter_data = filter->data(); - auto input_data = in_tensor->data(); - auto output_data = out_tensor->mutable_data(); - - auto &in_shape = in_tensor->shape(); - auto &out_shape = out_tensor->shape(); - - const index_t batch = in_shape[0]; - const index_t in_channels = in_shape[1]; - const index_t in_height = in_shape[2]; - const index_t in_width = in_shape[3]; - const index_t out_channels = out_shape[1]; - const index_t out_height = out_shape[2]; - const index_t out_width = out_shape[3]; - - const index_t in_image_size = in_height * in_width; - const index_t out_image_size = out_height * out_width; - const index_t in_batch_size = in_channels * in_image_size; - const index_t out_batch_size = out_channels * out_image_size; - - utils::ThreadPool - &thread_pool = context->device()->cpu_runtime()->thread_pool(); - - thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, - index_t start1, index_t end1, index_t step1) { + +template<> +MaceStatus Conv2dK3x3S1::DoCompute( + const ConvComputeParam &p, const float *filter_data, + const float *input_data, float *output_data) { + p.thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { for (index_t b = start0; b < end0; b += step0) { for (index_t m = start1; m < end1; m += step1) { - if (m + 1 < out_channels) { + if (m + 1 < p.out_channels) { float *out_ptr0_base = - output_data + b * out_batch_size + m * out_image_size; + output_data + b * p.out_batch_size + m * p.out_image_size; float *out_ptr1_base = - output_data + b * out_batch_size + (m + 1) * out_image_size; - for (index_t c = 0; c < in_channels; ++c) { - const float - *in_ptr0 = input_data + b * in_batch_size + c * in_image_size; + output_data + b * p.out_batch_size + (m + 1) * p.out_image_size; + for (index_t c = 0; c < p.in_channels; ++c) { + const float *in_ptr0 = + input_data + b * p.in_batch_size + c * p.in_image_size; const float - *filter_ptr0 = filter_data + m * in_channels * 9 + c * 9; + *filter_ptr0 = filter_data + m * p.in_channels * 9 + c * 9; float *out_ptr1 = out_ptr1_base; const float *in_ptr1 = - input_data + b * in_batch_size + c * in_image_size - + 1 * in_width; + input_data + b * p.in_batch_size + c * p.in_image_size + + 1 * p.in_width; const float *in_ptr2 = - input_data + b * in_batch_size + c * in_image_size - + 2 * in_width; + input_data + b * p.in_batch_size + c * p.in_image_size + + 2 * p.in_width; const float *in_ptr3 = - input_data + b * in_batch_size + c * in_image_size - + 3 * in_width; - const float - *filter_ptr1 = filter_data + (m + 1) * in_channels * 9 + c * 9; + input_data + b * p.in_batch_size + c * 
p.in_image_size + + 3 * p.in_width; + const float *filter_ptr1 = + filter_data + (m + 1) * p.in_channels * 9 + c * 9; #if defined(__aarch64__) float *out_ptr0 = out_ptr0_base; @@ -116,8 +68,8 @@ MaceStatus Conv2dK3x3S1::Compute(const OpContext *context, vf11 = vld1q_f32(filter_ptr1 + 3); vf12 = vld1q_f32(filter_ptr1 + 6); - for (index_t h = 0; h + 1 < out_height; h += 2) { - for (index_t w = 0; w + 3 < out_width; w += 4) { + for (index_t h = 0; h + 1 < p.out_height; h += 2) { + for (index_t w = 0; w + 3 < p.out_width; w += 4) { // input (4 height x 3 slide): vi_height_slide float32x4_t vi00, vi01, vi02; // reg count: 14 float32x4_t vi10, vi11, vi12; @@ -150,9 +102,9 @@ MaceStatus Conv2dK3x3S1::Compute(const OpContext *context, // load ouptut vo00 = vld1q_f32(out_ptr0); - vo01 = vld1q_f32(out_ptr0 + out_width); + vo01 = vld1q_f32(out_ptr0 + p.out_width); vo10 = vld1q_f32(out_ptr1); - vo11 = vld1q_f32(out_ptr1 + out_width); + vo11 = vld1q_f32(out_ptr1 + p.out_width); // outch 0, height 0 vo00 = vfmaq_laneq_f32(vo00, vi00, vf00, 0); // reg count: 18 @@ -199,9 +151,9 @@ MaceStatus Conv2dK3x3S1::Compute(const OpContext *context, vo11 = vfmaq_laneq_f32(vo11, vi32, vf12, 2); vst1q_f32(out_ptr0, vo00); - vst1q_f32(out_ptr0 + out_width, vo01); + vst1q_f32(out_ptr0 + p.out_width, vo01); vst1q_f32(out_ptr1, vo10); - vst1q_f32(out_ptr1 + out_width, vo11); + vst1q_f32(out_ptr1 + p.out_width, vo11); in_ptr0 += 4; in_ptr1 += 4; @@ -212,13 +164,13 @@ MaceStatus Conv2dK3x3S1::Compute(const OpContext *context, out_ptr1 += 4; } // w - in_ptr0 += 2 + in_width; - in_ptr1 += 2 + in_width; - in_ptr2 += 2 + in_width; - in_ptr3 += 2 + in_width; + in_ptr0 += 2 + p.in_width; + in_ptr1 += 2 + p.in_width; + in_ptr2 += 2 + p.in_width; + in_ptr3 += 2 + p.in_width; - out_ptr0 += out_width; - out_ptr1 += out_width; + out_ptr0 += p.out_width; + out_ptr1 += p.out_width; } // h #else // arm v7 float *out_ptr0 = out_ptr0_base; @@ -238,8 +190,8 @@ MaceStatus Conv2dK3x3S1::Compute(const OpContext *context, vf167 = vld1_f32(filter_ptr1 + 6); vf189 = vld1_f32(filter_ptr1 + 8); - for (index_t h = 0; h + 1 < out_height; h += 2) { - for (index_t w = 0; w + 3 < out_width; w += 4) { + for (index_t h = 0; h + 1 < p.out_height; h += 2) { + for (index_t w = 0; w + 3 < p.out_width; w += 4) { // input (4 height x 3 slide): vi_height_slide float32x4_t vi00, vi01, vi02; // reg count: 14 float32x4_t vi10, vi11, vi12; @@ -272,9 +224,9 @@ MaceStatus Conv2dK3x3S1::Compute(const OpContext *context, // load ouptut vo00 = vld1q_f32(out_ptr0); - vo01 = vld1q_f32(out_ptr0 + out_width); + vo01 = vld1q_f32(out_ptr0 + p.out_width); vo10 = vld1q_f32(out_ptr1); - vo11 = vld1q_f32(out_ptr1 + out_width); + vo11 = vld1q_f32(out_ptr1 + p.out_width); // outch 0, height 0 vo00 = vmlaq_lane_f32(vo00, vi00, vf001, 0); @@ -321,9 +273,9 @@ MaceStatus Conv2dK3x3S1::Compute(const OpContext *context, vo11 = vmlaq_lane_f32(vo11, vi32, vf189, 0); vst1q_f32(out_ptr0, vo00); - vst1q_f32(out_ptr0 + out_width, vo01); + vst1q_f32(out_ptr0 + p.out_width, vo01); vst1q_f32(out_ptr1, vo10); - vst1q_f32(out_ptr1 + out_width, vo11); + vst1q_f32(out_ptr1 + p.out_width, vo11); in_ptr0 += 4; in_ptr1 += 4; @@ -334,34 +286,34 @@ MaceStatus Conv2dK3x3S1::Compute(const OpContext *context, out_ptr1 += 4; } // w - in_ptr0 += 2 + in_width; - in_ptr1 += 2 + in_width; - in_ptr2 += 2 + in_width; - in_ptr3 += 2 + in_width; + in_ptr0 += 2 + p.in_width; + in_ptr1 += 2 + p.in_width; + in_ptr2 += 2 + p.in_width; + in_ptr3 += 2 + p.in_width; - out_ptr0 += out_width; - out_ptr1 += out_width; + 
out_ptr0 += p.out_width; + out_ptr1 += p.out_width; } // h #endif } // c } else { - for (index_t mm = m; mm < out_channels; ++mm) { + for (index_t mm = m; mm < p.out_channels; ++mm) { float *out_ptr0_base = - output_data + b * out_batch_size + mm * out_image_size; - for (index_t c = 0; c < in_channels; ++c) { + output_data + b * p.out_batch_size + mm * p.out_image_size; + for (index_t c = 0; c < p.in_channels; ++c) { const float *in_ptr0 = - input_data + b * in_batch_size + c * in_image_size; + input_data + b * p.in_batch_size + c * p.in_image_size; const float *in_ptr1 = - input_data + b * in_batch_size + c * in_image_size - + 1 * in_width; + input_data + b * p.in_batch_size + c * p.in_image_size + + 1 * p.in_width; const float *in_ptr2 = - input_data + b * in_batch_size + c * in_image_size - + 2 * in_width; + input_data + b * p.in_batch_size + c * p.in_image_size + + 2 * p.in_width; const float *in_ptr3 = - input_data + b * in_batch_size + c * in_image_size - + 3 * in_width; + input_data + b * p.in_batch_size + c * p.in_image_size + + 3 * p.in_width; const float - *filter_ptr0 = filter_data + mm * in_channels * 9 + c * 9; + *filter_ptr0 = filter_data + mm * p.in_channels * 9 + c * 9; #if defined(__aarch64__) float *out_ptr0 = out_ptr0_base; @@ -372,8 +324,8 @@ MaceStatus Conv2dK3x3S1::Compute(const OpContext *context, vf01 = vld1q_f32(filter_ptr0 + 3); vf02 = vld1q_f32(filter_ptr0 + 5); - for (index_t h = 0; h + 1 < out_height; h += 2) { - for (index_t w = 0; w + 3 < out_width; w += 4) { + for (index_t h = 0; h + 1 < p.out_height; h += 2) { + for (index_t w = 0; w + 3 < p.out_width; w += 4) { // input (4 height x 3 slide): vi_height_slide float32x4_t vi00, vi01, vi02, vi0n; float32x4_t vi10, vi11, vi12, vi1n; @@ -404,7 +356,7 @@ MaceStatus Conv2dK3x3S1::Compute(const OpContext *context, // load ouptut vo00 = vld1q_f32(out_ptr0); - vo01 = vld1q_f32(out_ptr0 + out_width); + vo01 = vld1q_f32(out_ptr0 + p.out_width); // outch 0, height 0 vo00 = vfmaq_laneq_f32(vo00, vi00, vf00, 0); @@ -429,7 +381,7 @@ MaceStatus Conv2dK3x3S1::Compute(const OpContext *context, vo01 = vfmaq_laneq_f32(vo01, vi32, vf02, 3); vst1q_f32(out_ptr0, vo00); - vst1q_f32(out_ptr0 + out_width, vo01); + vst1q_f32(out_ptr0 + p.out_width, vo01); in_ptr0 += 4; in_ptr1 += 4; @@ -439,12 +391,12 @@ MaceStatus Conv2dK3x3S1::Compute(const OpContext *context, out_ptr0 += 4; } // w - in_ptr0 += 2 + in_width; - in_ptr1 += 2 + in_width; - in_ptr2 += 2 + in_width; - in_ptr3 += 2 + in_width; + in_ptr0 += 2 + p.in_width; + in_ptr1 += 2 + p.in_width; + in_ptr2 += 2 + p.in_width; + in_ptr3 += 2 + p.in_width; - out_ptr0 += out_width; + out_ptr0 += p.out_width; } // h #else // arm v7 float *out_ptr0 = out_ptr0_base; @@ -457,8 +409,8 @@ MaceStatus Conv2dK3x3S1::Compute(const OpContext *context, vf67 = vld1_f32(filter_ptr0 + 6); vf78 = vld1_f32(filter_ptr0 + 7); - for (index_t h = 0; h + 1 < out_height; h += 2) { - for (index_t w = 0; w + 3 < out_width; w += 4) { + for (index_t h = 0; h + 1 < p.out_height; h += 2) { + for (index_t w = 0; w + 3 < p.out_width; w += 4) { // input (4 height x 3 slide): vi_height_slide float32x4_t vi00, vi01, vi02, vi0n; float32x4_t vi10, vi11, vi12, vi1n; @@ -489,7 +441,7 @@ MaceStatus Conv2dK3x3S1::Compute(const OpContext *context, // load ouptut vo00 = vld1q_f32(out_ptr0); - vo01 = vld1q_f32(out_ptr0 + out_width); + vo01 = vld1q_f32(out_ptr0 + p.out_width); // outch 0, height 0 vo00 = vmlaq_lane_f32(vo00, vi00, vf01, 0); @@ -514,7 +466,7 @@ MaceStatus Conv2dK3x3S1::Compute(const OpContext *context, vo01 = 
vmlaq_lane_f32(vo01, vi32, vf78, 1); vst1q_f32(out_ptr0, vo00); - vst1q_f32(out_ptr0 + out_width, vo01); + vst1q_f32(out_ptr0 + p.out_width, vo01); in_ptr0 += 4; in_ptr1 += 4; @@ -524,12 +476,12 @@ MaceStatus Conv2dK3x3S1::Compute(const OpContext *context, out_ptr0 += 4; } // w - in_ptr0 += 2 + in_width; - in_ptr1 += 2 + in_width; - in_ptr2 += 2 + in_width; - in_ptr3 += 2 + in_width; + in_ptr0 += 2 + p.in_width; + in_ptr1 += 2 + p.in_width; + in_ptr2 += 2 + p.in_width; + in_ptr3 += 2 + p.in_width; - out_ptr0 += out_width; + out_ptr0 += p.out_width; } // h #endif } // c @@ -537,73 +489,25 @@ MaceStatus Conv2dK3x3S1::Compute(const OpContext *context, } // if } // m } // b - }, 0, batch, 1, 0, out_channels, 2); + }, 0, p.batch, 1, 0, p.out_channels, 2); - UnPadOutput(*out_tensor, output); return MaceStatus::MACE_SUCCESS; } -MaceStatus Conv2dK3x3S2::Compute(const OpContext *context, - const Tensor *input, - const Tensor *filter, - Tensor *output) { - std::unique_ptr padded_input; - std::unique_ptr padded_output; - - ResizeOutAndPadInOut(context, - input, - filter, - output, - 1, - 4, - &padded_input, - &padded_output); - const Tensor *in_tensor = input; - if (padded_input != nullptr) { - in_tensor = padded_input.get(); - } - Tensor *out_tensor = output; - if (padded_output != nullptr) { - out_tensor = padded_output.get(); - } - out_tensor->Clear(); - - Tensor::MappingGuard in_guard(input); - Tensor::MappingGuard filter_guard(filter); - Tensor::MappingGuard out_guard(output); - auto filter_data = filter->data(); - auto input_data = in_tensor->data(); - auto output_data = out_tensor->mutable_data(); - - auto &in_shape = in_tensor->shape(); - auto &out_shape = out_tensor->shape(); - - const index_t batch = in_shape[0]; - const index_t in_channels = in_shape[1]; - const index_t in_height = in_shape[2]; - const index_t in_width = in_shape[3]; - const index_t out_channels = out_shape[1]; - const index_t out_height = out_shape[2]; - const index_t out_width = out_shape[3]; - - const index_t in_image_size = in_height * in_width; - const index_t out_image_size = out_height * out_width; - const index_t in_batch_size = in_channels * in_image_size; - const index_t out_batch_size = out_channels * out_image_size; - - utils::ThreadPool - &thread_pool = context->device()->cpu_runtime()->thread_pool(); - - thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, - index_t start1, index_t end1, index_t step1) { +template<> +MaceStatus Conv2dK3x3S2::DoCompute( + const ConvComputeParam &p, const float *filter_data, + const float *input_data, float *output_data) { + p.thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { for (index_t b = start0; b < end0; b += step0) { for (index_t m = start1; m < end1; m += step1) { - for (index_t c = 0; c < in_channels; ++c) { + for (index_t c = 0; c < p.in_channels; ++c) { const float - *in_base = input_data + b * in_batch_size + c * in_image_size; - const float *filter_ptr = filter_data + m * in_channels * 9 + c * 9; - float - *out_base = output_data + b * out_batch_size + m * out_image_size; + *in_base = input_data + b * p.in_batch_size + c * p.in_image_size; + const float *filter_ptr = filter_data + m * p.in_channels * 9 + c * 9; + float *out_base = + output_data + b * p.out_batch_size + m * p.out_image_size; #if defined(__aarch64__) // load filter (1 outch x 3 height x 3 width): vf_outch_height @@ -612,8 +516,8 @@ MaceStatus Conv2dK3x3S2::Compute(const OpContext *context, vf01 = 
vld1q_f32(filter_ptr + 3); vf02 = vld1q_f32(filter_ptr + 5); - for (index_t h = 0; h < out_height; ++h) { - for (index_t w = 0; w + 3 < out_width; w += 4) { + for (index_t h = 0; h < p.out_height; ++h) { + for (index_t w = 0; w + 3 < p.out_width; w += 4) { float32x4x2_t vi0, vi1, vi2; float32x4_t vi0n, vi1n, vi2n; @@ -628,17 +532,17 @@ MaceStatus Conv2dK3x3S2::Compute(const OpContext *context, // load input index_t in_h = h * 2; index_t in_w = w * 2; - index_t in_offset = in_h * in_width + in_w; + index_t in_offset = in_h * p.in_width + in_w; vi0 = vld2q_f32(in_base + in_offset); // [0.2.4.6, 1.3.5.7] - vi1 = vld2q_f32(in_base + in_offset + in_width); - vi2 = vld2q_f32(in_base + in_offset + 2 * in_width); + vi1 = vld2q_f32(in_base + in_offset + p.in_width); + vi2 = vld2q_f32(in_base + in_offset + 2 * p.in_width); vi0n = vld1q_f32(in_base + in_offset + 8); // [8.9.10.11] - vi1n = vld1q_f32(in_base + in_offset + in_width + 8); - vi2n = vld1q_f32(in_base + in_offset + 2 * in_width + 8); + vi1n = vld1q_f32(in_base + in_offset + p.in_width + 8); + vi2n = vld1q_f32(in_base + in_offset + 2 * p.in_width + 8); // load ouptut - index_t out_offset = h * out_width + w; + index_t out_offset = h * p.out_width + w; vo = vld1q_f32(out_base + out_offset); vi00 = vi0.val[0]; // [0.2.4.6] @@ -674,8 +578,8 @@ MaceStatus Conv2dK3x3S2::Compute(const OpContext *context, vf67 = vld1_f32(filter_ptr + 6); vf78 = vld1_f32(filter_ptr + 7); - for (index_t h = 0; h < out_height; ++h) { - for (index_t w = 0; w + 3 < out_width; w += 4) { + for (index_t h = 0; h < p.out_height; ++h) { + for (index_t w = 0; w + 3 < p.out_width; w += 4) { float32x4x2_t vi0, vi1, vi2; float32x4_t vi0n, vi1n, vi2n; @@ -690,17 +594,17 @@ MaceStatus Conv2dK3x3S2::Compute(const OpContext *context, // load input index_t in_h = h * 2; index_t in_w = w * 2; - index_t in_offset = in_h * in_width + in_w; + index_t in_offset = in_h * p.in_width + in_w; vi0 = vld2q_f32(in_base + in_offset); // [0.2.4.6, 1.3.5.7] - vi1 = vld2q_f32(in_base + in_offset + in_width); - vi2 = vld2q_f32(in_base + in_offset + 2 * in_width); + vi1 = vld2q_f32(in_base + in_offset + p.in_width); + vi2 = vld2q_f32(in_base + in_offset + 2 * p.in_width); vi0n = vld1q_f32(in_base + in_offset + 8); // [8.9.10.11] - vi1n = vld1q_f32(in_base + in_offset + in_width + 8); - vi2n = vld1q_f32(in_base + in_offset + 2 * in_width + 8); + vi1n = vld1q_f32(in_base + in_offset + p.in_width + 8); + vi2n = vld1q_f32(in_base + in_offset + 2 * p.in_width + 8); // load ouptut - index_t out_offset = h * out_width + w; + index_t out_offset = h * p.out_width + w; vo = vld1q_f32(out_base + out_offset); vi00 = vi0.val[0]; // [0.2.4.6] @@ -731,24 +635,11 @@ MaceStatus Conv2dK3x3S2::Compute(const OpContext *context, } // c } // m } // b - }, 0, batch, 1, 0, out_channels, 1); + }, 0, p.batch, 1, 0, p.out_channels, 1); - UnPadOutput(*out_tensor, output); return MaceStatus::MACE_SUCCESS; } -void RegisterConv2dK3x3Delegator(OpDelegatorRegistry *registry) { - MACE_REGISTER_DELEGATOR( - registry, Conv2dK3x3S1, delegator::Conv2dParam, - MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU, - float, ImplType::NEON, K3x3S1)); - MACE_REGISTER_DELEGATOR( - registry, Conv2dK3x3S2, delegator::Conv2dParam, - MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU, - float, ImplType::NEON, K3x3S2)); -} - -} // namespace fp32 } // namespace arm } // namespace ops } // namespace mace diff --git a/mace/ops/arm/fp32/conv_2d_3x3_winograd.cc b/mace/ops/arm/fp32/conv_2d_3x3_winograd.cc index 
1ec5205735e9564e5c7516768c77491a394c391d..051d558797730c6e42389db03275c99e2e03c655 100644 --- a/mace/ops/arm/fp32/conv_2d_3x3_winograd.cc +++ b/mace/ops/arm/fp32/conv_2d_3x3_winograd.cc @@ -18,8 +18,8 @@ #include "mace/ops/common/conv_pool_2d_util.h" #include "mace/ops/delegator/conv_2d.h" -#include "mace/utils/memory.h" #include "mace/utils/math.h" +#include "mace/utils/memory.h" namespace mace { namespace ops { diff --git a/mace/ops/arm/fp32/conv_2d_3x3_winograd.h b/mace/ops/arm/fp32/conv_2d_3x3_winograd.h index ec4db81bb2d552615430b81e330ef0ff862c563f..513cc99d4a9eb6538aecf299f5a7e6aaf8b8a309 100644 --- a/mace/ops/arm/fp32/conv_2d_3x3_winograd.h +++ b/mace/ops/arm/fp32/conv_2d_3x3_winograd.h @@ -20,8 +20,8 @@ #include "mace/core/ops/op_context.h" #include "mace/core/tensor.h" -#include "mace/ops/arm/fp32/conv_2d.h" -#include "mace/ops/arm/fp32/gemm.h" +#include "mace/ops/arm/base/conv_2d.h" +#include "mace/ops/arm/base/gemm.h" #include "mace/public/mace.h" namespace mace { @@ -32,7 +32,7 @@ namespace fp32 { class Conv2dK3x3Winograd : public Conv2dBase { public: explicit Conv2dK3x3Winograd(const delegator::Conv2dParam ¶m) - : Conv2dBase(param), + : Conv2dBase(param, sizeof(float)), gemm_(delegator::GemmParam()), transformed_filter_(nullptr), out_tile_size_(0) {} @@ -94,7 +94,7 @@ class Conv2dK3x3Winograd : public Conv2dBase { index_t tile_count, float *output); - Gemm gemm_; + Gemm gemm_; std::unique_ptr transformed_filter_; index_t out_tile_size_; }; diff --git a/mace/ops/arm/fp32/conv_2d_5x5.cc b/mace/ops/arm/fp32/conv_2d_5x5.cc index 2bfb762520f49cf0a5b5cb82dea11bc2f55fc6a0..4751b0d15439fcd724d2562f96cd33bc07d9d600 100644 --- a/mace/ops/arm/fp32/conv_2d_5x5.cc +++ b/mace/ops/arm/fp32/conv_2d_5x5.cc @@ -15,26 +15,12 @@ #include #include -#include "mace/ops/arm/fp32/conv_2d.h" +#include "mace/ops/arm/base/conv_2d_5x5.h" #include "mace/ops/delegator/conv_2d.h" namespace mace { namespace ops { namespace arm { -namespace fp32 { - -class Conv2dK5x5S1 : public Conv2dBase { - public: - explicit Conv2dK5x5S1(const delegator::Conv2dParam ¶m) - : Conv2dBase(param) {} - virtual ~Conv2dK5x5S1() {} - - MaceStatus Compute( - const OpContext *context, - const Tensor *input, - const Tensor *filter, - Tensor *output) override; -}; #define MACE_Conv2dNeonK5x5SnLoadCalc4 \ /* load filter (4 outch x 1 height x 4 width) */ \ @@ -91,89 +77,43 @@ class Conv2dK5x5S1 : public Conv2dBase { vo0 = vmlaq_lane_f32(vo0, vi3, vget_high_f32(vf00), 1); \ vo0 = vmlaq_lane_f32(vo0, vi4, vf01, 1); -MaceStatus Conv2dK5x5S1::Compute(const OpContext *context, - const Tensor *input, - const Tensor *filter, - Tensor *output) { - std::unique_ptr padded_input; - std::unique_ptr padded_output; - ResizeOutAndPadInOut(context, - input, - filter, - output, - 1, - 4, - &padded_input, - &padded_output); - const Tensor *in_tensor = input; - if (padded_input != nullptr) { - in_tensor = padded_input.get(); - } - Tensor *out_tensor = output; - if (padded_output != nullptr) { - out_tensor = padded_output.get(); - } - out_tensor->Clear(); - - Tensor::MappingGuard in_guard(input); - Tensor::MappingGuard filter_guard(filter); - Tensor::MappingGuard out_guard(output); - auto filter_data = filter->data(); - auto input_data = in_tensor->data(); - auto output_data = out_tensor->mutable_data(); - - auto &in_shape = in_tensor->shape(); - auto &out_shape = out_tensor->shape(); - - const index_t batch = in_shape[0]; - const index_t in_channels = in_shape[1]; - const index_t in_height = in_shape[2]; - const index_t in_width = in_shape[3]; - const 
index_t out_channels = out_shape[1]; - const index_t out_height = out_shape[2]; - const index_t out_width = out_shape[3]; - - const index_t in_image_size = in_height * in_width; - const index_t out_image_size = out_height * out_width; - const index_t in_batch_size = in_channels * in_image_size; - const index_t out_batch_size = out_channels * out_image_size; - - utils::ThreadPool - &thread_pool = context->device()->cpu_runtime()->thread_pool(); - - thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, - index_t start1, index_t end1, index_t step1) { +template<> +MaceStatus Conv2dK5x5S1::DoCompute( + const ConvComputeParam &p, const float *filter_data, + const float *input_data, float *output_data) { + p.thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { for (index_t b = start0; b < end0; b += step0) { for (index_t m = start1; m < end1; m += step1) { - if (m + 3 < out_channels) { + if (m + 3 < p.out_channels) { float *out_ptr0_base = - output_data + b * out_batch_size + m * out_image_size; + output_data + b * p.out_batch_size + m * p.out_image_size; float *out_ptr1_base = - output_data + b * out_batch_size + (m + 1) * out_image_size; + output_data + b * p.out_batch_size + (m + 1) * p.out_image_size; float *out_ptr2_base = - output_data + b * out_batch_size + (m + 2) * out_image_size; + output_data + b * p.out_batch_size + (m + 2) * p.out_image_size; float *out_ptr3_base = - output_data + b * out_batch_size + (m + 3) * out_image_size; + output_data + b * p.out_batch_size + (m + 3) * p.out_image_size; - for (index_t c = 0; c < in_channels; ++c) { + for (index_t c = 0; c < p.in_channels; ++c) { const float *in_ptr_base = - input_data + b * in_batch_size + c * in_image_size; + input_data + b * p.in_batch_size + c * p.in_image_size; const float - *filter_ptr0 = filter_data + m * in_channels * 25 + c * 25; + *filter_ptr0 = filter_data + m * p.in_channels * 25 + c * 25; const float *filter_ptr1 = - filter_data + (m + 1) * in_channels * 25 + c * 25; + filter_data + (m + 1) * p.in_channels * 25 + c * 25; const float *filter_ptr2 = - filter_data + (m + 2) * in_channels * 25 + c * 25; + filter_data + (m + 2) * p.in_channels * 25 + c * 25; const float *filter_ptr3 = - filter_data + (m + 3) * in_channels * 25 + c * 25; - for (index_t h = 0; h < out_height; ++h) { - for (index_t w = 0; w + 3 < out_width; w += 4) { + filter_data + (m + 3) * p.in_channels * 25 + c * 25; + for (index_t h = 0; h < p.out_height; ++h) { + for (index_t w = 0; w + 3 < p.out_width; w += 4) { // input offset - index_t in_offset = h * in_width + w; + index_t in_offset = h * p.in_width + w; // output (4 outch x 1 height x 4 width): vo_outch_height float32x4_t vo0, vo1, vo2, vo3; // load output - index_t out_offset = h * out_width + w; + index_t out_offset = h * p.out_width + w; vo0 = vld1q_f32(out_ptr0_base + out_offset); vo1 = vld1q_f32(out_ptr1_base + out_offset); vo2 = vld1q_f32(out_ptr2_base + out_offset); @@ -190,7 +130,7 @@ MaceStatus Conv2dK5x5S1::Compute(const OpContext *context, MACE_Conv2dNeonK5x5SnLoadCalc4; - in_offset += in_width; + in_offset += p.in_width; filter_ptr0 += 5; filter_ptr1 += 5; filter_ptr2 += 5; @@ -210,22 +150,22 @@ MaceStatus Conv2dK5x5S1::Compute(const OpContext *context, } // h } // c } else { - for (index_t mm = m; mm < out_channels; ++mm) { + for (index_t mm = m; mm < p.out_channels; ++mm) { float *out_ptr0_base = - output_data + b * out_batch_size + mm * out_image_size; - for (index_t c = 0; c < in_channels; ++c) { 
+ output_data + b * p.out_batch_size + mm * p.out_image_size; + for (index_t c = 0; c < p.in_channels; ++c) { const float *in_ptr_base = - input_data + b * in_batch_size + c * in_image_size; + input_data + b * p.in_batch_size + c * p.in_image_size; const float - *filter_ptr0 = filter_data + mm * in_channels * 25 + c * 25; - for (index_t h = 0; h < out_height; ++h) { - for (index_t w = 0; w + 3 < out_width; w += 4) { + *filter_ptr0 = filter_data + mm * p.in_channels * 25 + c * 25; + for (index_t h = 0; h < p.out_height; ++h) { + for (index_t w = 0; w + 3 < p.out_width; w += 4) { // input offset - index_t in_offset = h * in_width + w; + index_t in_offset = h * p.in_width + w; // output (1 outch x 1 height x 4 width): vo_outch_height float32x4_t vo0; // load output - index_t out_offset = h * out_width + w; + index_t out_offset = h * p.out_width + w; vo0 = vld1q_f32(out_ptr0_base + out_offset); for (index_t r = 0; r < 5; ++r) { // input (3 slide) @@ -239,7 +179,7 @@ MaceStatus Conv2dK5x5S1::Compute(const OpContext *context, MACE_Conv2dNeonK5x5SnLoadCalc1; - in_offset += in_width; + in_offset += p.in_width; filter_ptr0 += 5; } // r @@ -252,20 +192,11 @@ MaceStatus Conv2dK5x5S1::Compute(const OpContext *context, } // if } // m } // b - }, 0, batch, 1, 0, out_channels, 4); + }, 0, p.batch, 1, 0, p.out_channels, 4); - UnPadOutput(*out_tensor, output); return MaceStatus::MACE_SUCCESS; } -void RegisterConv2dK5x5Delegator(OpDelegatorRegistry *registry) { - MACE_REGISTER_DELEGATOR( - registry, Conv2dK5x5S1, delegator::Conv2dParam, - MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU, - float, ImplType::NEON, K5x5S1)); -} - -} // namespace fp32 } // namespace arm } // namespace ops } // namespace mace diff --git a/mace/ops/arm/fp32/conv_2d_7x7.cc b/mace/ops/arm/fp32/conv_2d_7x7.cc index d1f69967a21dd7393dafb196fd02b0c9e0322e4b..1ebb052113388d00bbb1e2191c91580ce3a3e299 100644 --- a/mace/ops/arm/fp32/conv_2d_7x7.cc +++ b/mace/ops/arm/fp32/conv_2d_7x7.cc @@ -12,17 +12,15 @@ // See the License for the specific language governing permissions and // limitations under the License. 
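The conv hunks above all follow one refactoring: each per-kernel Compute() body, which repeated the pad/resize, tensor mapping, and shape bookkeeping before its NEON loops, collapses into a DoCompute() that receives a prepared ConvComputeParam. The struct itself is defined in mace/ops/arm/base/conv_2d.h, which is not part of this diff; the sketch below reconstructs its likely shape purely from the p.* accesses in these hunks, so field order and exact types are assumptions.

    // Reconstructed from usage only (p.batch, p.in_channels, p.in_width,
    // p.in_image_size, p.in_batch_size, p.out_channels, p.out_height,
    // p.out_width, p.out_image_size, p.out_batch_size, p.thread_pool).
    // The authoritative definition lives in mace/ops/arm/base/conv_2d.h.
    struct ConvComputeParam {
      const index_t batch;
      const index_t in_channels;
      const index_t in_height;       // implied, though unread in these hunks
      const index_t in_width;
      const index_t out_channels;
      const index_t out_height;
      const index_t out_width;
      const index_t in_image_size;   // in_height * in_width
      const index_t out_image_size;  // out_height * out_width
      const index_t in_batch_size;   // in_channels * in_image_size
      const index_t out_batch_size;  // out_channels * out_image_size
      utils::ThreadPool &thread_pool;
    };

Each kernel previously recomputed these values from in_shape/out_shape; hoisting them into the base removes roughly forty lines of identical boilerplate per Compute() body, as the deletions above show.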
-#include "mace/ops/arm/fp32/conv_2d_7x7.h" - #include #include +#include "mace/ops/arm/base/conv_2d_7x7.h" #include "mace/ops/delegator/conv_2d.h" namespace mace { namespace ops { namespace arm { -namespace fp32 { #define MACE_Conv2dArmv8NeonK7x7SnLoadCalc4 \ /* load filter (4 outch x 1 height x 4 width) */ \ @@ -156,88 +154,43 @@ namespace fp32 { vo0 = vmlaq_lane_f32(vo0, vi5, vget_high_f32(vf01), 0); \ vo0 = vmlaq_lane_f32(vo0, vi6, vget_high_f32(vf01), 1); -MaceStatus Conv2dK7x7S1::Compute(const OpContext *context, - const Tensor *input, - const Tensor *filter, - Tensor *output) { - std::unique_ptr padded_input; - std::unique_ptr padded_output; - ResizeOutAndPadInOut(context, - input, - filter, - output, - 1, - 4, - &padded_input, - &padded_output); - const Tensor *in_tensor = input; - if (padded_input != nullptr) { - in_tensor = padded_input.get(); - } - Tensor *out_tensor = output; - if (padded_output != nullptr) { - out_tensor = padded_output.get(); - } - out_tensor->Clear(); - - Tensor::MappingGuard in_guard(input); - Tensor::MappingGuard filter_guard(filter); - Tensor::MappingGuard out_guard(output); - auto filter_data = filter->data(); - auto input_data = in_tensor->data(); - auto output_data = out_tensor->mutable_data(); - - auto &in_shape = in_tensor->shape(); - auto &out_shape = out_tensor->shape(); - - const index_t batch = in_shape[0]; - const index_t in_channels = in_shape[1]; - const index_t in_height = in_shape[2]; - const index_t in_width = in_shape[3]; - const index_t out_channels = out_shape[1]; - const index_t out_height = out_shape[2]; - const index_t out_width = out_shape[3]; - - const index_t in_image_size = in_height * in_width; - const index_t out_image_size = out_height * out_width; - const index_t in_batch_size = in_channels * in_image_size; - const index_t out_batch_size = out_channels * out_image_size; - - utils::ThreadPool - &thread_pool = context->device()->cpu_runtime()->thread_pool(); - - thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, - index_t start1, index_t end1, index_t step1) { +template<> +MaceStatus Conv2dK7x7S1::DoCompute( + const ConvComputeParam &p, const float *filter_data, + const float *input_data, float *output_data) { + + p.thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { for (index_t b = start0; b < end0; b += step0) { for (index_t m = start1; m < end1; m += step1) { - if (m + 3 < out_channels) { + if (m + 3 < p.out_channels) { float *out_ptr0_base = - output_data + b * out_batch_size + m * out_image_size; + output_data + b * p.out_batch_size + m * p.out_image_size; float *out_ptr1_base = - output_data + b * out_batch_size + (m + 1) * out_image_size; + output_data + b * p.out_batch_size + (m + 1) * p.out_image_size; float *out_ptr2_base = - output_data + b * out_batch_size + (m + 2) * out_image_size; + output_data + b * p.out_batch_size + (m + 2) * p.out_image_size; float *out_ptr3_base = - output_data + b * out_batch_size + (m + 3) * out_image_size; - for (index_t c = 0; c < in_channels; ++c) { + output_data + b * p.out_batch_size + (m + 3) * p.out_image_size; + for (index_t c = 0; c < p.in_channels; ++c) { const float *in_ptr_base = - input_data + b * in_batch_size + c * in_image_size; + input_data + b * p.in_batch_size + c * p.in_image_size; const float - *filter_ptr0 = filter_data + m * in_channels * 49 + c * 49; + *filter_ptr0 = filter_data + m * p.in_channels * 49 + c * 49; const float *filter_ptr1 = - filter_data + (m + 1) * in_channels * 
49 + c * 49; + filter_data + (m + 1) * p.in_channels * 49 + c * 49; const float *filter_ptr2 = - filter_data + (m + 2) * in_channels * 49 + c * 49; + filter_data + (m + 2) * p.in_channels * 49 + c * 49; const float *filter_ptr3 = - filter_data + (m + 3) * in_channels * 49 + c * 49; - for (index_t h = 0; h < out_height; ++h) { - for (index_t w = 0; w + 3 < out_width; w += 4) { + filter_data + (m + 3) * p.in_channels * 49 + c * 49; + for (index_t h = 0; h < p.out_height; ++h) { + for (index_t w = 0; w + 3 < p.out_width; w += 4) { // input offset - index_t in_offset = h * in_width + w; + index_t in_offset = h * p.in_width + w; // output (4 outch x 1 height x 4 width): vo_outch_height float32x4_t vo0, vo1, vo2, vo3; // load output - index_t out_offset = h * out_width + w; + index_t out_offset = h * p.out_width + w; vo0 = vld1q_f32(out_ptr0_base + out_offset); vo1 = vld1q_f32(out_ptr1_base + out_offset); vo2 = vld1q_f32(out_ptr2_base + out_offset); @@ -262,7 +215,7 @@ MaceStatus Conv2dK7x7S1::Compute(const OpContext *context, MACE_Conv2dArmv7NeonK7x7SnLoadCalc4; #endif - in_offset += in_width; + in_offset += p.in_width; filter_ptr0 += 7; filter_ptr1 += 7; filter_ptr2 += 7; @@ -282,22 +235,22 @@ MaceStatus Conv2dK7x7S1::Compute(const OpContext *context, } // h } // c } else { - for (index_t mm = m; mm < out_channels; ++mm) { + for (index_t mm = m; mm < p.out_channels; ++mm) { float *out_ptr0_base = - output_data + b * out_batch_size + mm * out_image_size; - for (index_t c = 0; c < in_channels; ++c) { + output_data + b * p.out_batch_size + mm * p.out_image_size; + for (index_t c = 0; c < p.in_channels; ++c) { const float *in_ptr_base = - input_data + b * in_batch_size + c * in_image_size; + input_data + b * p.in_batch_size + c * p.in_image_size; const float - *filter_ptr0 = filter_data + mm * in_channels * 49 + c * 49; - for (index_t h = 0; h < out_height; ++h) { - for (index_t w = 0; w + 3 < out_width; w += 4) { + *filter_ptr0 = filter_data + mm * p.in_channels * 49 + c * 49; + for (index_t h = 0; h < p.out_height; ++h) { + for (index_t w = 0; w + 3 < p.out_width; w += 4) { // input offset - index_t in_offset = h * in_width + w; + index_t in_offset = h * p.in_width + w; // output (1 outch x 1 height x 4 width): vo_outch_height float32x4_t vo0; // load output - index_t out_offset = h * out_width + w; + index_t out_offset = h * p.out_width + w; vo0 = vld1q_f32(out_ptr0_base + out_offset); for (index_t r = 0; r < 7; ++r) { // input (3 slide) @@ -319,7 +272,7 @@ MaceStatus Conv2dK7x7S1::Compute(const OpContext *context, MACE_Conv2dArmv7NeonK7x7SnLoadCalc1; #endif - in_offset += in_width; + in_offset += p.in_width; filter_ptr0 += 7; } // r @@ -332,96 +285,49 @@ MaceStatus Conv2dK7x7S1::Compute(const OpContext *context, } // if } // m } // b - }, 0, batch, 1, 0, out_channels, 4); + }, 0, p.batch, 1, 0, p.out_channels, 4); - UnPadOutput(*out_tensor, output); return MaceStatus::MACE_SUCCESS; } -MaceStatus Conv2dK7x7S2::Compute(const OpContext *context, - const Tensor *input, - const Tensor *filter, - Tensor *output) { - std::unique_ptr padded_input; - std::unique_ptr padded_output; - ResizeOutAndPadInOut(context, - input, - filter, - output, - 1, - 4, - &padded_input, - &padded_output); - const Tensor *in_tensor = input; - if (padded_input != nullptr) { - in_tensor = padded_input.get(); - } - Tensor *out_tensor = output; - if (padded_output != nullptr) { - out_tensor = padded_output.get(); - } - out_tensor->Clear(); - - Tensor::MappingGuard in_guard(input); - Tensor::MappingGuard filter_guard(filter); - 
Tensor::MappingGuard out_guard(output); - auto filter_data = filter->data(); - auto input_data = in_tensor->data(); - auto output_data = out_tensor->mutable_data(); - - auto &in_shape = in_tensor->shape(); - auto &out_shape = out_tensor->shape(); - - const index_t batch = in_shape[0]; - const index_t in_channels = in_shape[1]; - const index_t in_height = in_shape[2]; - const index_t in_width = in_shape[3]; - const index_t out_channels = out_shape[1]; - const index_t out_height = out_shape[2]; - const index_t out_width = out_shape[3]; - - const index_t in_image_size = in_height * in_width; - const index_t out_image_size = out_height * out_width; - const index_t in_batch_size = in_channels * in_image_size; - const index_t out_batch_size = out_channels * out_image_size; - - utils::ThreadPool - &thread_pool = context->device()->cpu_runtime()->thread_pool(); - - thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, - index_t start1, index_t end1, index_t step1) { +template<> +MaceStatus Conv2dK7x7S2::DoCompute( + const ConvComputeParam &p, const float *filter_data, + const float *input_data, float *output_data) { + p.thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { for (index_t b = start0; b < end0; b += step0) { for (index_t m = start1; m < end1; m += step1) { - if (m + 3 < out_channels) { + if (m + 3 < p.out_channels) { float *out_ptr0_base = - output_data + b * out_batch_size + m * out_image_size; + output_data + b * p.out_batch_size + m * p.out_image_size; float *out_ptr1_base = - output_data + b * out_batch_size + (m + 1) * out_image_size; + output_data + b * p.out_batch_size + (m + 1) * p.out_image_size; float *out_ptr2_base = - output_data + b * out_batch_size + (m + 2) * out_image_size; + output_data + b * p.out_batch_size + (m + 2) * p.out_image_size; float *out_ptr3_base = - output_data + b * out_batch_size + (m + 3) * out_image_size; - for (index_t c = 0; c < in_channels; ++c) { + output_data + b * p.out_batch_size + (m + 3) * p.out_image_size; + for (index_t c = 0; c < p.in_channels; ++c) { const float *in_ptr_base = - input_data + b * in_batch_size + c * in_image_size; + input_data + b * p.in_batch_size + c * p.in_image_size; const float - *filter_ptr0 = filter_data + m * in_channels * 49 + c * 49; + *filter_ptr0 = filter_data + m * p.in_channels * 49 + c * 49; const float *filter_ptr1 = - filter_data + (m + 1) * in_channels * 49 + c * 49; + filter_data + (m + 1) * p.in_channels * 49 + c * 49; const float *filter_ptr2 = - filter_data + (m + 2) * in_channels * 49 + c * 49; + filter_data + (m + 2) * p.in_channels * 49 + c * 49; const float *filter_ptr3 = - filter_data + (m + 3) * in_channels * 49 + c * 49; - for (index_t h = 0; h < out_height; ++h) { - for (index_t w = 0; w + 3 < out_width; w += 4) { + filter_data + (m + 3) * p.in_channels * 49 + c * 49; + for (index_t h = 0; h < p.out_height; ++h) { + for (index_t w = 0; w + 3 < p.out_width; w += 4) { // input offset index_t in_h = h * 2; index_t in_w = w * 2; - index_t in_offset = in_h * in_width + in_w; + index_t in_offset = in_h * p.in_width + in_w; // output (4 outch x 1 height x 4 width): vo_outch_height float32x4_t vo0, vo1, vo2, vo3; // load output - index_t out_offset = h * out_width + w; + index_t out_offset = h * p.out_width + w; vo0 = vld1q_f32(out_ptr0_base + out_offset); vo1 = vld1q_f32(out_ptr1_base + out_offset); vo2 = vld1q_f32(out_ptr2_base + out_offset); @@ -449,7 +355,7 @@ MaceStatus Conv2dK7x7S2::Compute(const OpContext 
*context, MACE_Conv2dArmv7NeonK7x7SnLoadCalc4; #endif - in_offset += in_width; + in_offset += p.in_width; filter_ptr0 += 7; filter_ptr1 += 7; filter_ptr2 += 7; @@ -469,24 +375,24 @@ MaceStatus Conv2dK7x7S2::Compute(const OpContext *context, } // h } // c } else { - for (index_t mm = m; mm < out_channels; ++mm) { + for (index_t mm = m; mm < p.out_channels; ++mm) { float *out_ptr0_base = - output_data + b * out_batch_size + mm * out_image_size; - for (index_t c = 0; c < in_channels; ++c) { + output_data + b * p.out_batch_size + mm * p.out_image_size; + for (index_t c = 0; c < p.in_channels; ++c) { const float *in_ptr_base = - input_data + b * in_batch_size + c * in_image_size; + input_data + b * p.in_batch_size + c * p.in_image_size; const float - *filter_ptr0 = filter_data + mm * in_channels * 49 + c * 49; - for (index_t h = 0; h < out_height; ++h) { - for (index_t w = 0; w + 3 < out_width; w += 4) { + *filter_ptr0 = filter_data + mm * p.in_channels * 49 + c * 49; + for (index_t h = 0; h < p.out_height; ++h) { + for (index_t w = 0; w + 3 < p.out_width; w += 4) { // input offset index_t in_h = h * 2; index_t in_w = w * 2; - index_t in_offset = in_h * in_width + in_w; + index_t in_offset = in_h * p.in_width + in_w; // output (1 outch x 1 height x 4 width): vo_outch_height float32x4_t vo0; // load ouput - index_t out_offset = h * out_width + w; + index_t out_offset = h * p.out_width + w; vo0 = vld1q_f32(out_ptr0_base + out_offset); for (index_t r = 0; r < 7; ++r) { // input (3 slide) @@ -511,7 +417,7 @@ MaceStatus Conv2dK7x7S2::Compute(const OpContext *context, MACE_Conv2dArmv7NeonK7x7SnLoadCalc1; #endif - in_offset += in_width; + in_offset += p.in_width; filter_ptr0 += 7; } // r @@ -524,96 +430,49 @@ MaceStatus Conv2dK7x7S2::Compute(const OpContext *context, } // if } // m } // b - }, 0, batch, 1, 0, out_channels, 4); + }, 0, p.batch, 1, 0, p.out_channels, 4); - UnPadOutput(*out_tensor, output); return MaceStatus::MACE_SUCCESS; } -MaceStatus Conv2dK7x7S3::Compute(const OpContext *context, - const Tensor *input, - const Tensor *filter, - Tensor *output) { - std::unique_ptr padded_input; - std::unique_ptr padded_output; - ResizeOutAndPadInOut(context, - input, - filter, - output, - 1, - 4, - &padded_input, - &padded_output); - const Tensor *in_tensor = input; - if (padded_input != nullptr) { - in_tensor = padded_input.get(); - } - Tensor *out_tensor = output; - if (padded_output != nullptr) { - out_tensor = padded_output.get(); - } - out_tensor->Clear(); - - Tensor::MappingGuard in_guard(input); - Tensor::MappingGuard filter_guard(filter); - Tensor::MappingGuard out_guard(output); - auto filter_data = filter->data(); - auto input_data = in_tensor->data(); - auto output_data = out_tensor->mutable_data(); - - auto &in_shape = in_tensor->shape(); - auto &out_shape = out_tensor->shape(); - - const index_t batch = in_shape[0]; - const index_t in_channels = in_shape[1]; - const index_t in_height = in_shape[2]; - const index_t in_width = in_shape[3]; - const index_t out_channels = out_shape[1]; - const index_t out_height = out_shape[2]; - const index_t out_width = out_shape[3]; - - const index_t in_image_size = in_height * in_width; - const index_t out_image_size = out_height * out_width; - const index_t in_batch_size = in_channels * in_image_size; - const index_t out_batch_size = out_channels * out_image_size; - - utils::ThreadPool - &thread_pool = context->device()->cpu_runtime()->thread_pool(); - - thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, - index_t start1, index_t 
end1, index_t step1) { +template<> +MaceStatus Conv2dK7x7S3::DoCompute( + const ConvComputeParam &p, const float *filter_data, + const float *input_data, float *output_data) { + p.thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { for (index_t b = start0; b < end0; b += step0) { for (index_t m = start1; m < end1; m += step1) { - if (m + 3 < out_channels) { + if (m + 3 < p.out_channels) { float *out_ptr0_base = - output_data + b * out_batch_size + m * out_image_size; + output_data + b * p.out_batch_size + m * p.out_image_size; float *out_ptr1_base = - output_data + b * out_batch_size + (m + 1) * out_image_size; + output_data + b * p.out_batch_size + (m + 1) * p.out_image_size; float *out_ptr2_base = - output_data + b * out_batch_size + (m + 2) * out_image_size; + output_data + b * p.out_batch_size + (m + 2) * p.out_image_size; float *out_ptr3_base = - output_data + b * out_batch_size + (m + 3) * out_image_size; - for (index_t c = 0; c < in_channels; ++c) { + output_data + b * p.out_batch_size + (m + 3) * p.out_image_size; + for (index_t c = 0; c < p.in_channels; ++c) { const float *in_ptr_base = - input_data + b * in_batch_size + c * in_image_size; + input_data + b * p.in_batch_size + c * p.in_image_size; const float - *filter_ptr0 = filter_data + m * in_channels * 49 + c * 49; + *filter_ptr0 = filter_data + m * p.in_channels * 49 + c * 49; const float *filter_ptr1 = - filter_data + (m + 1) * in_channels * 49 + c * 49; + filter_data + (m + 1) * p.in_channels * 49 + c * 49; const float *filter_ptr2 = - filter_data + (m + 2) * in_channels * 49 + c * 49; + filter_data + (m + 2) * p.in_channels * 49 + c * 49; const float *filter_ptr3 = - filter_data + (m + 3) * in_channels * 49 + c * 49; - for (index_t h = 0; h < out_height; ++h) { - for (index_t w = 0; w + 3 < out_width; w += 4) { + filter_data + (m + 3) * p.in_channels * 49 + c * 49; + for (index_t h = 0; h < p.out_height; ++h) { + for (index_t w = 0; w + 3 < p.out_width; w += 4) { // input offset index_t in_h = h * 3; index_t in_w = w * 3; - index_t in_offset = in_h * in_width + in_w; + index_t in_offset = in_h * p.in_width + in_w; // output (4 outch x 1 height x 4 width): vo_outch_height float32x4_t vo0, vo1, vo2, vo3; // load output - index_t out_offset = h * out_width + w; + index_t out_offset = h * p.out_width + w; vo0 = vld1q_f32(out_ptr0_base + out_offset); vo1 = vld1q_f32(out_ptr1_base + out_offset); vo2 = vld1q_f32(out_ptr2_base + out_offset); @@ -641,7 +500,7 @@ MaceStatus Conv2dK7x7S3::Compute(const OpContext *context, MACE_Conv2dArmv7NeonK7x7SnLoadCalc4; #endif - in_offset += in_width; + in_offset += p.in_width; filter_ptr0 += 7; filter_ptr1 += 7; filter_ptr2 += 7; @@ -661,24 +520,24 @@ MaceStatus Conv2dK7x7S3::Compute(const OpContext *context, } // h } // c } else { - for (index_t mm = m; mm < out_channels; ++mm) { + for (index_t mm = m; mm < p.out_channels; ++mm) { float *out_ptr0_base = - output_data + b * out_batch_size + mm * out_image_size; - for (index_t c = 0; c < in_channels; ++c) { + output_data + b * p.out_batch_size + mm * p.out_image_size; + for (index_t c = 0; c < p.in_channels; ++c) { const float *in_ptr_base = - input_data + b * in_batch_size + c * in_image_size; + input_data + b * p.in_batch_size + c * p.in_image_size; const float - *filter_ptr0 = filter_data + mm * in_channels * 49 + c * 49; - for (index_t h = 0; h < out_height; ++h) { - for (index_t w = 0; w + 3 < out_width; w += 4) { + *filter_ptr0 = filter_data + mm * p.in_channels * 49 + c * 
49; + for (index_t h = 0; h < p.out_height; ++h) { + for (index_t w = 0; w + 3 < p.out_width; w += 4) { // input offset index_t in_h = h * 3; index_t in_w = w * 3; - index_t in_offset = in_h * in_width + in_w; + index_t in_offset = in_h * p.in_width + in_w; // output (1 outch x 1 height x 4 width): vo_outch_height float32x4_t vo0; // load output - index_t out_offset = h * out_width + w; + index_t out_offset = h * p.out_width + w; vo0 = vld1q_f32(out_ptr0_base + out_offset); for (index_t r = 0; r < 7; ++r) { // input (3 slide) @@ -703,7 +562,7 @@ MaceStatus Conv2dK7x7S3::Compute(const OpContext *context, MACE_Conv2dArmv7NeonK7x7SnLoadCalc1; #endif - in_offset += in_width; + in_offset += p.in_width; filter_ptr0 += 7; } // r @@ -716,28 +575,11 @@ MaceStatus Conv2dK7x7S3::Compute(const OpContext *context, } // if } // m } // b - }, 0, batch, 1, 0, out_channels, 4); + }, 0, p.batch, 1, 0, p.out_channels, 4); - UnPadOutput(*out_tensor, output); return MaceStatus::MACE_SUCCESS; } -void RegisterConv2dK7x7Delegator(OpDelegatorRegistry *registry) { - MACE_REGISTER_DELEGATOR( - registry, Conv2dK7x7S1, delegator::Conv2dParam, - MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU, - float, ImplType::NEON, K7x7S1)); - MACE_REGISTER_DELEGATOR( - registry, Conv2dK7x7S2, delegator::Conv2dParam, - MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU, - float, ImplType::NEON, K7x7S2)); - MACE_REGISTER_DELEGATOR( - registry, Conv2dK7x7S3, delegator::Conv2dParam, - MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU, - float, ImplType::NEON, K7x7S3)); -} - -} // namespace fp32 } // namespace arm } // namespace ops } // namespace mace diff --git a/mace/ops/arm/fp32/conv_general.cc b/mace/ops/arm/fp32/conv_2d_general.cc similarity index 61% rename from mace/ops/arm/fp32/conv_general.cc rename to mace/ops/arm/fp32/conv_2d_general.cc index d58a1725e507e27af12bcb0b0d64821c36769829..6f6a1ff5693a60c3cdf6a754a531fbac14d4be01 100644 --- a/mace/ops/arm/fp32/conv_general.cc +++ b/mace/ops/arm/fp32/conv_2d_general.cc @@ -12,118 +12,59 @@ // See the License for the specific language governing permissions and // limitations under the License. 
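With the boilerplate hoisted, adding a new stride variant reduces to one specialization plus the NEON loop body. A minimal skeleton follows, assuming the kernel classes are now templated on the element type with these hunks supplying the float specializations; Conv2dK9x9S1 is a made-up name for illustration, not part of this patch.

    template <>
    MaceStatus Conv2dK9x9S1<float>::DoCompute(
        const ConvComputeParam &p, const float *filter_data,
        const float *input_data, float *output_data) {
      p.thread_pool.Compute2D(
          [=](index_t start0, index_t end0, index_t step0,
              index_t start1, index_t end1, index_t step1) {
            for (index_t b = start0; b < end0; b += step0) {
              for (index_t m = start1; m < end1; m += step1) {
                // NEON inner loops go here, indexing through the p.*
                // strides exactly as the K3x3/K5x5/K7x7 kernels above do.
              }
            }
          },
          0, p.batch, 1, 0, p.out_channels, 4);  // tile 4 output channels
      return MaceStatus::MACE_SUCCESS;
    }

Conv2dGeneral, refactored next, is the one exception: since its filter extent is not fixed at compile time, its DoCompute additionally receives filter_shape and derives filter_height, filter_width, and filter_size from it.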
-#include "mace/ops/arm/fp32/conv_2d.h" - #include +#include "mace/ops/arm/base/conv_2d_general.h" #include "mace/ops/delegator/conv_2d.h" namespace mace { namespace ops { namespace arm { -namespace fp32 { - -class Conv2dGeneral : public Conv2dBase { - public: - explicit Conv2dGeneral(const delegator::Conv2dParam ¶m) - : Conv2dBase(param) {} - virtual ~Conv2dGeneral() {} - - MaceStatus Compute( - const OpContext *context, - const Tensor *input, - const Tensor *filter, - Tensor *output) override; -}; - -MaceStatus Conv2dGeneral::Compute(const OpContext *context, - const Tensor *input, - const Tensor *filter, - Tensor *output) { - std::unique_ptr padded_input; - std::unique_ptr padded_output; - - ResizeOutAndPadInOut(context, - input, - filter, - output, - 1, - 4, - &padded_input, - &padded_output); - - const Tensor *in_tensor = input; - if (padded_input != nullptr) { - in_tensor = padded_input.get(); - } - Tensor *out_tensor = output; - if (padded_output != nullptr) { - out_tensor = padded_output.get(); - } - out_tensor->Clear(); - Tensor::MappingGuard in_guard(input); - Tensor::MappingGuard filter_guard(filter); - Tensor::MappingGuard out_guard(output); - auto filter_data = filter->data(); - auto input_data = in_tensor->data(); - auto output_data = out_tensor->mutable_data(); - - auto &in_shape = in_tensor->shape(); - auto &out_shape = out_tensor->shape(); - auto &filter_shape = filter->shape(); - - const index_t batch = in_shape[0]; - const index_t in_channels = in_shape[1]; - const index_t in_height = in_shape[2]; - const index_t in_width = in_shape[3]; - const index_t out_channels = out_shape[1]; - const index_t out_height = out_shape[2]; - const index_t out_width = out_shape[3]; +template<> +MaceStatus Conv2dGeneral::DoCompute( + const ConvComputeParam &p, const float *filter_data, + const float *input_data, float *output_data, + const std::vector &filter_shape) { const index_t filter_height = filter_shape[2]; const index_t filter_width = filter_shape[3]; - - const index_t in_image_size = in_height * in_width; - const index_t out_image_size = out_height * out_width; - const index_t in_batch_size = in_channels * in_image_size; - const index_t out_batch_size = out_channels * out_image_size; const index_t filter_size = filter_height * filter_width; - utils::ThreadPool - &thread_pool = context->device()->cpu_runtime()->thread_pool(); - - thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, - index_t start1, index_t end1, index_t step1) { + p.thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { for (index_t b = start0; b < end0; b += step0) { for (index_t m = start1; m < end1; m += step1) { const int stride_h = strides_[0]; const int stride_w = strides_[1]; const int dilation_h = dilations_[0]; const int dilation_w = dilations_[1]; - if (m + 3 < out_channels) { + if (m + 3 < p.out_channels) { float *out_ptr0_base = - output_data + b * out_batch_size + m * out_image_size; - float *out_ptr1_base = out_ptr0_base + out_image_size; - float *out_ptr2_base = out_ptr1_base + out_image_size; - float *out_ptr3_base = out_ptr2_base + out_image_size; - for (index_t c = 0; c < in_channels; ++c) { + output_data + b * p.out_batch_size + m * p.out_image_size; + float *out_ptr1_base = out_ptr0_base + p.out_image_size; + float *out_ptr2_base = out_ptr1_base + p.out_image_size; + float *out_ptr3_base = out_ptr2_base + p.out_image_size; + for (index_t c = 0; c < p.in_channels; ++c) { const float *in_ptr_base = - input_data + b 
* in_batch_size + c * in_image_size; + input_data + b * p.in_batch_size + c * p.in_image_size; const float *filter_ptr0 = - filter_data + m * in_channels * filter_size + c * filter_size; - const float *filter_ptr1 = filter_ptr0 + in_channels * filter_size; - const float *filter_ptr2 = filter_ptr1 + in_channels * filter_size; - const float *filter_ptr3 = filter_ptr2 + in_channels * filter_size; - for (index_t h = 0; h < out_height; ++h) { - for (index_t w = 0; w + 3 < out_width; w += 4) { + filter_data + m * p.in_channels * filter_size + c * filter_size; + const float *filter_ptr1 = + filter_ptr0 + p.in_channels * filter_size; + const float *filter_ptr2 = + filter_ptr1 + p.in_channels * filter_size; + const float *filter_ptr3 = + filter_ptr2 + p.in_channels * filter_size; + for (index_t h = 0; h < p.out_height; ++h) { + for (index_t w = 0; w + 3 < p.out_width; w += 4) { // input offset index_t ih = h * stride_h; index_t iw = w * stride_w; - index_t in_offset = ih * in_width + iw; + index_t in_offset = ih * p.in_width + iw; // output (4 outch x 1 height x 4 width): vo_outch_height float vo0[4], vo1[4], vo2[4], vo3[4]; // load output - index_t out_offset = h * out_width + w; + index_t out_offset = h * p.out_width + w; for (index_t ow = 0; ow < 4; ++ow) { vo0[ow] = out_ptr0_base[out_offset + ow]; vo1[ow] = out_ptr1_base[out_offset + ow]; @@ -171,7 +112,7 @@ MaceStatus Conv2dGeneral::Compute(const OpContext *context, + kw * dilation_w] * filter_ptr3[kw]; } // kw - in_offset += dilation_h * in_width; + in_offset += dilation_h * p.in_width; filter_ptr0 += filter_width; filter_ptr1 += filter_width; filter_ptr2 += filter_width; @@ -193,26 +134,26 @@ MaceStatus Conv2dGeneral::Compute(const OpContext *context, } // h } // c } else { - for (index_t mm = m; mm < out_channels; ++mm) { + for (index_t mm = m; mm < p.out_channels; ++mm) { float *out_ptr0_base = - output_data + b * out_batch_size + mm * out_image_size; - for (index_t c = 0; c < in_channels; ++c) { + output_data + b * p.out_batch_size + mm * p.out_image_size; + for (index_t c = 0; c < p.in_channels; ++c) { const float *in_ptr_base = - input_data + b * in_batch_size + c * in_image_size; + input_data + b * p.in_batch_size + c * p.in_image_size; const float *filter_ptr0 = - filter_data + mm * in_channels * filter_size + filter_data + mm * p.in_channels * filter_size + c * filter_size; - for (index_t h = 0; h < out_height; ++h) { - for (index_t w = 0; w + 3 < out_width; w += 4) { + for (index_t h = 0; h < p.out_height; ++h) { + for (index_t w = 0; w + 3 < p.out_width; w += 4) { // input offset index_t ih = h * stride_h; index_t iw = w * stride_w; - index_t in_offset = ih * in_width + iw; + index_t in_offset = ih * p.in_width + iw; // output (1 outch x 1 height x 4 width): vo_outch_height float vo0[4]; // load output - index_t out_offset = h * out_width + w; + index_t out_offset = h * p.out_width + w; for (index_t ow = 0; ow < 4; ++ow) { vo0[ow] = out_ptr0_base[out_offset + ow]; } @@ -231,7 +172,7 @@ MaceStatus Conv2dGeneral::Compute(const OpContext *context, + kw * dilation_w] * filter_ptr0[kw]; } // kw - in_offset += dilation_h * in_width; + in_offset += dilation_h * p.in_width; filter_ptr0 += filter_width; } // kh @@ -246,19 +187,11 @@ MaceStatus Conv2dGeneral::Compute(const OpContext *context, } // if } // m } // b - }, 0, batch, 1, 0, out_channels, 4); + }, 0, p.batch, 1, 0, p.out_channels, 4); - UnPadOutput(*out_tensor, output); return MaceStatus::MACE_SUCCESS; } -void RegisterConv2dGeneralDelegator(OpDelegatorRegistry *registry) { - 
MACE_REGISTER_DELEGATOR( - registry, Conv2dGeneral, delegator::Conv2dParam, - MACE_DELEGATOR_KEY(Conv2d, DeviceType::CPU, float, ImplType::NEON)); -} - -} // namespace fp32 } // namespace arm } // namespace ops } // namespace mace diff --git a/mace/ops/arm/fp32/deconv_2d.h b/mace/ops/arm/fp32/deconv_2d.h deleted file mode 100644 index 128d5858beee4a8530ed3f775536fb3d1652c44b..0000000000000000000000000000000000000000 --- a/mace/ops/arm/fp32/deconv_2d.h +++ /dev/null @@ -1,62 +0,0 @@ -// Copyright 2019 The MACE Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef MACE_OPS_ARM_FP32_DECONV_2D_H_ -#define MACE_OPS_ARM_FP32_DECONV_2D_H_ - -#include -#include - -#include "mace/core/ops/op_context.h" -#include "mace/core/tensor.h" -#include "mace/core/types.h" -#include "mace/ops/arm/fp32/gemm.h" -#include "mace/ops/common/conv_pool_2d_util.h" -#include "mace/ops/delegator/deconv_2d.h" -#include "mace/public/mace.h" - -namespace mace { -namespace ops { -namespace arm { -namespace fp32 { - -class Deconv2dBase : public delegator::Deconv2d { - public: - explicit Deconv2dBase(const delegator::Deconv2dParam ¶m) - : delegator::Deconv2d(param), - group_(param.group_) {} - - virtual ~Deconv2dBase() = default; - - protected: - MaceStatus ResizeOutAndPadOut(const OpContext *context, - const Tensor *input, - const Tensor *filter, - const Tensor *output_shape, - Tensor *output, - std::vector *out_pad_size, - std::unique_ptr *padded_output); - - void UnPadOutput(const Tensor &src, - const std::vector &out_pad_size, - Tensor *dst); - index_t group_; -}; - -} // namespace fp32 -} // namespace arm -} // namespace ops -} // namespace mace - -#endif // MACE_OPS_ARM_FP32_DECONV_2D_H_ diff --git a/mace/ops/arm/fp32/deconv_2d_2x2.cc b/mace/ops/arm/fp32/deconv_2d_2x2.cc index 57784e638f0da27575020b50a63e3080674c5c6f..2a6ca40d624e27a1d0cd531745685a45de1c5264 100644 --- a/mace/ops/arm/fp32/deconv_2d_2x2.cc +++ b/mace/ops/arm/fp32/deconv_2d_2x2.cc @@ -12,74 +12,33 @@ // See the License for the specific language governing permissions and // limitations under the License. 
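Each fp32 translation unit also loses its Register*Delegator function (RegisterConv2dK3x3Delegator, RegisterConv2dK5x5Delegator, RegisterConv2dK7x7Delegator, and RegisterConv2dGeneralDelegator in the deletions above). The diff does not show where these registrations land; a consolidated registrar under the new arm/base layer would plausibly look like the sketch below, where the function name and location are assumptions and only the macro invocations are taken from the deleted code.

    // Hypothetical consolidated registrar; only the MACE_REGISTER_DELEGATOR /
    // MACE_DELEGATOR_KEY(_EX) calls are carried over from the deletions above.
    void RegisterConv2dDelegators(OpDelegatorRegistry *registry) {
      MACE_REGISTER_DELEGATOR(
          registry, Conv2dGeneral<float>, delegator::Conv2dParam,
          MACE_DELEGATOR_KEY(Conv2d, DeviceType::CPU, float, ImplType::NEON));
      MACE_REGISTER_DELEGATOR(
          registry, Conv2dK3x3S1<float>, delegator::Conv2dParam,
          MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU,
                                float, ImplType::NEON, K3x3S1));
      // ... likewise for K3x3S2, K5x5S1, K7x7S1, K7x7S2, K7x7S3.
    }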
-#include "mace/ops/arm/fp32/deconv_2d_2x2.h" - #include + +#include "mace/ops/arm/base/deconv_2d_2x2.h" #include "mace/ops/arm/fp32/common_neon.h" namespace mace { namespace ops { namespace arm { -namespace fp32 { - -MaceStatus Deconv2dK2x2S1::Compute(const OpContext *context, - const Tensor *input, - const Tensor *filter, - const Tensor *output_shape, - Tensor *output) { - std::unique_ptr padded_out; - std::vector out_pad_size; - ResizeOutAndPadOut(context, - input, - filter, - output_shape, - output, - &out_pad_size, - &padded_out); - - Tensor *out_tensor = output; - if (padded_out != nullptr) { - out_tensor = padded_out.get(); - } - - out_tensor->Clear(); - - Tensor::MappingGuard input_mapper(input); - Tensor::MappingGuard filter_mapper(filter); - Tensor::MappingGuard output_mapper(output); - - auto input_data = input->data(); - auto filter_data = filter->data(); - auto padded_out_data = out_tensor->mutable_data(); - - auto &in_shape = input->shape(); - auto &out_shape = out_tensor->shape(); - - const index_t batch = in_shape[0]; - const index_t inch = in_shape[1]; - const index_t h = in_shape[2]; - const index_t w = in_shape[3]; - - const index_t outch = out_shape[1]; - const index_t outh = out_shape[2]; - const index_t outw = out_shape[3]; - - const index_t out_img_size = outh * outw; - - utils::ThreadPool - &thread_pool = context->device()->cpu_runtime()->thread_pool(); - - thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, - index_t start1, index_t end1, index_t step1) { + +template<> +MaceStatus Deconv2dK2x2S1::DoCompute( + const DeconvComputeParam &p, const float *filter_data, + const float *input_data, float *padded_out_data) { + p.thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { for (index_t b = start0; b < end0; b += step0) { for (index_t oc = start1; oc < end1; oc += step1) { - if (oc + 1 < outch) { - float *out_base0 = padded_out_data + (b * outch + oc) * out_img_size; - float *out_base1 = out_base0 + out_img_size; - for (index_t ic = 0; ic < inch; ++ic) { - const float *input_base = input_data + (b * inch + ic) * h * w; - const float *kernel_base0 = filter_data + (oc * inch + ic) * 4; - const float *kernel_base1 = kernel_base0 + inch * 4; + if (oc + 1 < p.out_channels) { + float *out_base0 = + padded_out_data + (b * p.out_channels + oc) * p.out_img_size; + float *out_base1 = out_base0 + p.out_img_size; + for (index_t ic = 0; ic < p.in_channels; ++ic) { + const float *input_base = input_data + + (b * p.in_channels + ic) * p.in_height * p.in_width; + const float *kernel_base0 = + filter_data + (oc * p.in_channels + ic) * 4; + const float *kernel_base1 = kernel_base0 + p.in_channels * 4; const float *in = input_base; // output channel 0 const float *k0 = kernel_base0; @@ -89,18 +48,18 @@ MaceStatus Deconv2dK2x2S1::Compute(const OpContext *context, float32x4_t k0_vec = vld1q_f32(k0); float32x4_t k1_vec = vld1q_f32(k1); - for (index_t i = 0; i < h; ++i) { - float *out_row_base0 = out_base0 + i * outw; + for (index_t i = 0; i < p.in_height; ++i) { + float *out_row_base0 = out_base0 + i * p.out_width; float *out_row0_0 = out_row_base0; - float *out_row0_1 = out_row_base0 + outw; + float *out_row0_1 = out_row_base0 + p.out_width; - float *out_row_base1 = out_base1 + i * outw; + float *out_row_base1 = out_base1 + i * p.out_width; float *out_row1_0 = out_row_base1; - float *out_row1_1 = out_row_base1 + outw; + float *out_row1_1 = out_row_base1 + p.out_width; index_t j = 0; - for (; j + 3 < w; j 
+= 4) { + for (; j + 3 < p.in_width; j += 4) { float32x4_t in_vec = vld1q_f32(in); float32x4_t out00, out01, out02, out03; @@ -145,7 +104,7 @@ MaceStatus Deconv2dK2x2S1::Compute(const OpContext *context, out_row1_1 += 4; } - for (; j < w; ++j) { + for (; j < p.in_width; ++j) { float val = in[0]; for (int k = 0; k < 2; ++k) { out_row0_0[k] += val * k0[k]; @@ -162,23 +121,26 @@ MaceStatus Deconv2dK2x2S1::Compute(const OpContext *context, } } } else { - float *out_base0 = padded_out_data + (b * outch + oc) * outh * outw; - for (index_t ic = 0; ic < inch; ++ic) { - const float *input_base = input_data + (b * inch + ic) * h * w; - const float *kernel_base0 = filter_data + (oc * inch + ic) * 4; + float *out_base0 = padded_out_data + + (b * p.out_channels + oc) * p.out_height * p.out_width; + for (index_t ic = 0; ic < p.in_channels; ++ic) { + const float *input_base = input_data + + (b * p.in_channels + ic) * p.in_height * p.in_width; + const float *kernel_base0 = + filter_data + (oc * p.in_channels + ic) * 4; const float *in = input_base; const float *k0 = kernel_base0; // load filter float32x4_t k0_vec = vld1q_f32(k0); - for (index_t i = 0; i < h; ++i) { - float *out_row_base0 = out_base0 + i * outw; + for (index_t i = 0; i < p.in_height; ++i) { + float *out_row_base0 = out_base0 + i * p.out_width; float *out_row0_0 = out_row_base0; - float *out_row0_1 = out_row_base0 + outw; + float *out_row0_1 = out_row_base0 + p.out_width; index_t j = 0; - for (; j + 3 < w; j += 4) { + for (; j + 3 < p.in_width; j += 4) { float32x4_t in_vec = vld1q_f32(in); float32x4_t out00, out01, out02, out03; @@ -203,7 +165,7 @@ MaceStatus Deconv2dK2x2S1::Compute(const OpContext *context, out_row0_1 += 4; } - for (; j < w; ++j) { + for (; j < p.in_width; ++j) { float val = in[0]; for (int k = 0; k < 2; ++k) { out_row0_0[k] += val * k0[k]; @@ -218,79 +180,39 @@ MaceStatus Deconv2dK2x2S1::Compute(const OpContext *context, } } } - }, 0, batch, 1, 0, outch, 2); - - UnPadOutput(*out_tensor, out_pad_size, output); + }, 0, p.batch, 1, 0, p.out_channels, 2); return MaceStatus::MACE_SUCCESS; } -MaceStatus Deconv2dK2x2S2::Compute(const OpContext *context, - const Tensor *input, - const Tensor *filter, - const Tensor *output_shape, - Tensor *output) { - std::unique_ptr padded_out; - std::vector out_pad_size; - ResizeOutAndPadOut(context, - input, - filter, - output_shape, - output, - &out_pad_size, - &padded_out); - - Tensor *out_tensor = output; - if (padded_out != nullptr) { - out_tensor = padded_out.get(); - } - - out_tensor->Clear(); - - Tensor::MappingGuard input_mapper(input); - Tensor::MappingGuard filter_mapper(filter); - Tensor::MappingGuard output_mapper(output); - - auto input_data = input->data(); - auto filter_data = filter->data(); - auto padded_out_data = out_tensor->mutable_data(); - - auto &in_shape = input->shape(); - auto &out_shape = out_tensor->shape(); - - const index_t batch = in_shape[0]; - const index_t inch = in_shape[1]; - const index_t h = in_shape[2]; - const index_t w = in_shape[3]; - - const index_t outch = out_shape[1]; - const index_t outh = out_shape[2]; - const index_t outw = out_shape[3]; - const index_t out_img_size = outh * outw; - - utils::ThreadPool - &thread_pool = context->device()->cpu_runtime()->thread_pool(); - - thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, - index_t start1, index_t end1, index_t step1) { +template<> +MaceStatus Deconv2dK2x2S2::DoCompute( + const DeconvComputeParam &p, const float *filter_data, + const float *input_data, float *padded_out_data) 
{ + + p.thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { for (index_t b = start0; b < end0; b += step0) { for (index_t oc = start1; oc < end1; oc += step1) { - float *out_base = padded_out_data + (b * outch + oc) * out_img_size; - for (index_t ic = 0; ic < inch; ++ic) { - const float *input_base = input_data + (b * inch + ic) * h * w; - const float *kernel_base = filter_data + (oc * inch + ic) * 4; + float *out_base = + padded_out_data + (b * p.out_channels + oc) * p.out_img_size; + for (index_t ic = 0; ic < p.in_channels; ++ic) { + const float *input_base = input_data + + (b * p.in_channels + ic) * p.in_height * p.in_width; + const float *kernel_base = + filter_data + (oc * p.in_channels + ic) * 4; const float *in = input_base; const float *k0 = kernel_base; float32x4_t k0_vec = vld1q_f32(k0); - for (index_t i = 0; i < h; ++i) { - float *out_row_base = out_base + i * 2 * outw; + for (index_t i = 0; i < p.in_height; ++i) { + float *out_row_base = out_base + i * 2 * p.out_width; float *out_row_0 = out_row_base; - float *out_row_1 = out_row_0 + outw; + float *out_row_1 = out_row_0 + p.out_width; index_t j = 0; - for (; j + 3 < w; j += 4) { + for (; j + 3 < p.in_width; j += 4) { float32x4_t in_vec = vld1q_f32(in); // out row 0 @@ -314,7 +236,7 @@ MaceStatus Deconv2dK2x2S2::Compute(const OpContext *context, out_row_1 += 8; } - for (; j < w; ++j) { + for (; j < p.in_width; ++j) { float val = in[0]; for (int k = 0; k < 2; ++k) { out_row_0[k] += val * k0[k]; @@ -328,25 +250,11 @@ MaceStatus Deconv2dK2x2S2::Compute(const OpContext *context, } } } - }, 0, batch, 1, 0, outch, 1); - - UnPadOutput(*out_tensor, out_pad_size, output); + }, 0, p.batch, 1, 0, p.out_channels, 1); return MaceStatus::MACE_SUCCESS; } -void RegisterDeconv2dK2x2Delegator(OpDelegatorRegistry *registry) { - MACE_REGISTER_DELEGATOR( - registry, Deconv2dK2x2S1, delegator::Deconv2dParam, - MACE_DELEGATOR_KEY_EX(Deconv2d, DeviceType::CPU, - float, ImplType::NEON, K2x2S1)); - MACE_REGISTER_DELEGATOR( - registry, Deconv2dK2x2S2, delegator::Deconv2dParam, - MACE_DELEGATOR_KEY_EX(Deconv2d, DeviceType::CPU, - float, ImplType::NEON, K2x2S2)); -} - -} // namespace fp32 } // namespace arm } // namespace ops } // namespace mace diff --git a/mace/ops/arm/fp32/deconv_2d_3x3.cc b/mace/ops/arm/fp32/deconv_2d_3x3.cc index d0b49e0d296d89ca2dc12757dd8feda69ef25a67..4c00f07d28634254b6deef1479070054d07074c3 100644 --- a/mace/ops/arm/fp32/deconv_2d_3x3.cc +++ b/mace/ops/arm/fp32/deconv_2d_3x3.cc @@ -12,73 +12,33 @@ // See the License for the specific language governing permissions and // limitations under the License. 
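The deconvolution kernels get the same treatment through a DeconvComputeParam. As with the conv struct, its definition sits outside this diff; the sketch below infers it from the p.* accesses in the Deconv2dK2x2 hunks above. Note that deconv iterates over the input extent, so p.in_height and p.in_width are read in the loop bounds, and p.out_img_size replaces each kernel's local outh * outw.

    // Inferred from usage; the authoritative definition presumably sits in
    // mace/ops/arm/base/deconv_2d.h, replacing the deleted fp32 header.
    struct DeconvComputeParam {
      const index_t batch;
      const index_t in_channels;
      const index_t in_height;
      const index_t in_width;
      const index_t out_channels;
      const index_t out_height;
      const index_t out_width;
      const index_t out_img_size;  // out_height * out_width
      utils::ThreadPool &thread_pool;
    };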
-#include "mace/ops/arm/fp32/deconv_2d_3x3.h" - #include + +#include "mace/ops/arm/base/deconv_2d_3x3.h" #include "mace/ops/arm/fp32/common_neon.h" namespace mace { namespace ops { namespace arm { -namespace fp32 { - -MaceStatus Deconv2dK3x3S1::Compute(const OpContext *context, - const Tensor *input, - const Tensor *filter, - const Tensor *output_shape, - Tensor *output) { - std::unique_ptr padded_out; - std::vector out_pad_size; - ResizeOutAndPadOut(context, - input, - filter, - output_shape, - output, - &out_pad_size, - &padded_out); - Tensor *out_tensor = output; - if (padded_out != nullptr) { - out_tensor = padded_out.get(); - } - - out_tensor->Clear(); - - Tensor::MappingGuard input_mapper(input); - Tensor::MappingGuard filter_mapper(filter); - Tensor::MappingGuard output_mapper(output); - - auto input_data = input->data(); - auto filter_data = filter->data(); - auto padded_out_data = out_tensor->mutable_data(); - - auto &in_shape = input->shape(); - auto &out_shape = out_tensor->shape(); - - const index_t batch = out_shape[0]; - const index_t inch = in_shape[1]; - const index_t h = in_shape[2]; - const index_t w = in_shape[3]; - - const index_t outch = out_shape[1]; - const index_t outh = out_shape[2]; - const index_t outw = out_shape[3]; - - const index_t out_img_size = outh * outw; - - utils::ThreadPool - &thread_pool = context->device()->cpu_runtime()->thread_pool(); - - thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, - index_t start1, index_t end1, index_t step1) { + +template<> +MaceStatus Deconv2dK3x3S1::DoCompute( + const DeconvComputeParam &p, const float *filter_data, + const float *input_data, float *padded_out_data) { + p.thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { for (index_t b = start0; b < end0; b += step0) { for (index_t oc = start1; oc < end1; oc += step1) { - if (oc + 1 < outch) { - float *out_base0 = padded_out_data + (b * outch + oc) * out_img_size; - float *out_base1 = out_base0 + out_img_size; - for (index_t ic = 0; ic < inch; ++ic) { - const float *input_base = input_data + (b * inch + ic) * h * w; - const float *kernel_base0 = filter_data + (oc * inch + ic) * 9; - const float *kernel_base1 = kernel_base0 + inch * 9; + if (oc + 1 < p.out_channels) { + float *out_base0 = + padded_out_data + (b * p.out_channels + oc) * p.out_img_size; + float *out_base1 = out_base0 + p.out_img_size; + for (index_t ic = 0; ic < p.in_channels; ++ic) { + const float *input_base = input_data + + (b * p.in_channels + ic) * p.in_height * p.in_width; + const float *kernel_base0 = + filter_data + (oc * p.in_channels + ic) * 9; + const float *kernel_base1 = kernel_base0 + p.in_channels * 9; const float *in = input_base; // output channel 0 @@ -102,20 +62,20 @@ MaceStatus Deconv2dK3x3S1::Compute(const OpContext *context, k11_vec = vld1q_f32(k1_1); k12_vec = vld1q_f32(k1_2); - for (index_t i = 0; i < h; ++i) { - float *out_row_base0 = out_base0 + i * outw; + for (index_t i = 0; i < p.in_height; ++i) { + float *out_row_base0 = out_base0 + i * p.out_width; float *out_row0_0 = out_row_base0; - float *out_row0_1 = out_row_base0 + outw; - float *out_row0_2 = out_row_base0 + 2 * outw; + float *out_row0_1 = out_row_base0 + p.out_width; + float *out_row0_2 = out_row_base0 + 2 * p.out_width; - float *out_row_base1 = out_base1 + i * outw; + float *out_row_base1 = out_base1 + i * p.out_width; float *out_row1_0 = out_row_base1; - float *out_row1_1 = out_row_base1 + outw; - float *out_row1_2 = 
out_row_base1 + 2 * outw; + float *out_row1_1 = out_row_base1 + p.out_width; + float *out_row1_2 = out_row_base1 + 2 * p.out_width; index_t j = 0; - for (; j + 3 < w; j += 4) { + for (; j + 3 < p.in_width; j += 4) { float32x4_t in_vec = vld1q_f32(in); float32x4_t out00, out01, out02; @@ -203,7 +163,7 @@ MaceStatus Deconv2dK3x3S1::Compute(const OpContext *context, out_row1_2 += 4; } - for (; j < w; ++j) { + for (; j < p.in_width; ++j) { float val = in[0]; for (int k = 0; k < 3; ++k) { out_row0_0[k] += val * k0_0[k]; @@ -224,10 +184,13 @@ MaceStatus Deconv2dK3x3S1::Compute(const OpContext *context, } } } else { - float *out_base0 = padded_out_data + (b * outch + oc) * outh * outw; - for (index_t ic = 0; ic < inch; ++ic) { - const float *input_base = input_data + (b * inch + ic) * h * w; - const float *kernel_base0 = filter_data + (oc * inch + ic) * 9; + float *out_base0 = padded_out_data + + (b * p.out_channels + oc) * p.out_height * p.out_width; + for (index_t ic = 0; ic < p.in_channels; ++ic) { + const float *input_base = input_data + + (b * p.in_channels + ic) * p.in_height * p.in_width; + const float *kernel_base0 = + filter_data + (oc * p.in_channels + ic) * 9; const float *in = input_base; const float *k0_0 = kernel_base0; const float *k0_1 = kernel_base0 + 3; @@ -238,14 +201,14 @@ MaceStatus Deconv2dK3x3S1::Compute(const OpContext *context, float32x4_t k01_vec = vld1q_f32(k0_1); float32x4_t k02_vec = vld1q_f32(k0_2); - for (index_t i = 0; i < h; ++i) { - float *out_row_base0 = out_base0 + i * outw; + for (index_t i = 0; i < p.in_height; ++i) { + float *out_row_base0 = out_base0 + i * p.out_width; float *out_row0_0 = out_row_base0; - float *out_row0_1 = out_row_base0 + outw; - float *out_row0_2 = out_row_base0 + 2 * outw; + float *out_row0_1 = out_row_base0 + p.out_width; + float *out_row0_2 = out_row_base0 + 2 * p.out_width; index_t j = 0; - for (; j + 3 < w; j += 4) { + for (; j + 3 < p.in_width; j += 4) { float32x4_t in_vec = vld1q_f32(in); float32x4_t out00, out01, out02; @@ -294,7 +257,7 @@ MaceStatus Deconv2dK3x3S1::Compute(const OpContext *context, out_row0_2 += 4; } - for (; j < w; ++j) { + for (; j < p.in_width; ++j) { float val = in[0]; for (int k = 0; k < 3; ++k) { out_row0_0[k] += val * k0_0[k]; @@ -311,67 +274,26 @@ MaceStatus Deconv2dK3x3S1::Compute(const OpContext *context, } } } - }, 0, batch, 1, 0, outch, 2); - - UnPadOutput(*out_tensor, out_pad_size, output); + }, 0, p.batch, 1, 0, p.out_channels, 2); return MaceStatus::MACE_SUCCESS; } -MaceStatus Deconv2dK3x3S2::Compute(const OpContext *context, - const Tensor *input, - const Tensor *filter, - const Tensor *output_shape, - Tensor *output) { - std::unique_ptr padded_out; - std::vector out_pad_size; - ResizeOutAndPadOut(context, - input, - filter, - output_shape, - output, - &out_pad_size, - &padded_out); - - Tensor *out_tensor = output; - if (padded_out != nullptr) { - out_tensor = padded_out.get(); - } - - out_tensor->Clear(); - - Tensor::MappingGuard input_mapper(input); - Tensor::MappingGuard filter_mapper(filter); - Tensor::MappingGuard output_mapper(output); - - auto input_data = input->data(); - auto filter_data = filter->data(); - auto padded_out_data = out_tensor->mutable_data(); - - auto &in_shape = input->shape(); - auto &out_shape = out_tensor->shape(); - - const index_t batch = in_shape[0]; - const index_t inch = in_shape[1]; - const index_t h = in_shape[2]; - const index_t w = in_shape[3]; - - const index_t outch = out_shape[1]; - const index_t outh = out_shape[2]; - const index_t outw = out_shape[3]; - 
const index_t out_img_size = outh * outw; - - utils::ThreadPool - &thread_pool = context->device()->cpu_runtime()->thread_pool(); - - thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, - index_t start1, index_t end1, index_t step1) { +template<> +MaceStatus Deconv2dK3x3S2::DoCompute( + const DeconvComputeParam &p, const float *filter_data, + const float *input_data, float *padded_out_data) { + p.thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { for (index_t b = start0; b < end0; b += step0) { for (index_t oc = start1; oc < end1; oc += step1) { - float *out_base = padded_out_data + (b * outch + oc) * out_img_size; - for (index_t ic = 0; ic < inch; ++ic) { - const float *input_base = input_data + (b * inch + ic) * h * w; - const float *kernel_base = filter_data + (oc * inch + ic) * 9; + float *out_base = + padded_out_data + (b * p.out_channels + oc) * p.out_img_size; + for (index_t ic = 0; ic < p.in_channels; ++ic) { + const float *input_base = + input_data + (b * p.in_channels + ic) * p.in_height * p.in_width; + const float *kernel_base = + filter_data + (oc * p.in_channels + ic) * 9; const float *in = input_base; const float *k0 = kernel_base; @@ -382,15 +304,15 @@ MaceStatus Deconv2dK3x3S2::Compute(const OpContext *context, float32x4_t k1_vec = vld1q_f32(k1); float32x4_t k2_vec = vld1q_f32(k2); - for (index_t i = 0; i < h; ++i) { - float *out_row_base = out_base + i * 2 * outw; + for (index_t i = 0; i < p.in_height; ++i) { + float *out_row_base = out_base + i * 2 * p.out_width; float *out_row_0 = out_row_base; - float *out_row_1 = out_row_0 + outw; - float *out_row_2 = out_row_1 + outw; + float *out_row_1 = out_row_0 + p.out_width; + float *out_row_2 = out_row_1 + p.out_width; index_t j = 0; - for (index_t n = 0; n + 9 < outw; n += 8) { + for (index_t n = 0; n + 9 < p.out_width; n += 8) { float32x4_t in_vec = vld1q_f32(in); // out row 0 @@ -439,7 +361,7 @@ MaceStatus Deconv2dK3x3S2::Compute(const OpContext *context, j += 4; } - for (; j < w; ++j) { + for (; j < p.in_width; ++j) { float val = in[0]; for (int k = 0; k < 3; ++k) { @@ -457,25 +379,11 @@ MaceStatus Deconv2dK3x3S2::Compute(const OpContext *context, } } } - }, 0, batch, 1, 0, outch, 1); - - UnPadOutput(*out_tensor, out_pad_size, output); + }, 0, p.batch, 1, 0, p.out_channels, 1); return MaceStatus::MACE_SUCCESS; } -void RegisterDeconv2dK3x3Delegator(OpDelegatorRegistry *registry) { - MACE_REGISTER_DELEGATOR( - registry, Deconv2dK3x3S1, delegator::Deconv2dParam, - MACE_DELEGATOR_KEY_EX(Deconv2d, DeviceType::CPU, - float, ImplType::NEON, K3x3S1)); - MACE_REGISTER_DELEGATOR( - registry, Deconv2dK3x3S2, delegator::Deconv2dParam, - MACE_DELEGATOR_KEY_EX(Deconv2d, DeviceType::CPU, - float, ImplType::NEON, K3x3S2)); -} - -} // namespace fp32 } // namespace arm } // namespace ops } // namespace mace diff --git a/mace/ops/arm/fp32/deconv_2d_4x4.cc b/mace/ops/arm/fp32/deconv_2d_4x4.cc index 4a84e0394bf07764103c7c2c6c23f8cc79a31d5b..2dbe4d3e9f226c71a5ba0c3c26f6ef0f0e40210b 100644 --- a/mace/ops/arm/fp32/deconv_2d_4x4.cc +++ b/mace/ops/arm/fp32/deconv_2d_4x4.cc @@ -12,78 +12,39 @@ // See the License for the specific language governing permissions and // limitations under the License. 
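For reference, every NEON body in these deconv kernels computes the same scatter-add: each input pixel is multiplied by the full KxK filter and accumulated into a KxK window of the padded output, with the window origin advanced by the stride; the vector loops merely process four input columns (and, on the two-channel paths, two output channels) per iteration. A plain scalar sketch of the 3x3 stride-1 case, equivalent to the per-element tail loops above (index_t is MACE's int64 tensor index type):

inline void Deconv2dK3x3S1Scalar(const float *in, const float *filter3x3,
                                 index_t in_height, index_t in_width,
                                 index_t out_width, float *padded_out) {
  for (index_t i = 0; i < in_height; ++i) {
    for (index_t j = 0; j < in_width; ++j) {
      const float val = in[i * in_width + j];
      // One input pixel contributes val * filter to the 3x3 output window
      // whose top-left corner is (i, j); stride 2 would place it at (2i, 2j).
      for (int r = 0; r < 3; ++r) {
        for (int c = 0; c < 3; ++c) {
          padded_out[(i + r) * out_width + (j + c)] +=
              val * filter3x3[r * 3 + c];
        }
      }
    }
  }
}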
-#include "mace/ops/arm/fp32/deconv_2d_4x4.h" - #include + +#include "mace/ops/arm/base/deconv_2d_4x4.h" #include "mace/ops/arm/fp32/common_neon.h" namespace mace { namespace ops { namespace arm { -namespace fp32 { - -MaceStatus Deconv2dK4x4S1::Compute(const OpContext *context, - const Tensor *input, - const Tensor *filter, - const Tensor *output_shape, - Tensor *output) { - std::unique_ptr padded_out; - std::vector out_pad_size; - ResizeOutAndPadOut(context, - input, - filter, - output_shape, - output, - &out_pad_size, - &padded_out); - Tensor *out_tensor = output; - if (padded_out != nullptr) { - out_tensor = padded_out.get(); - } - out_tensor->Clear(); - - Tensor::MappingGuard input_mapper(input); - Tensor::MappingGuard filter_mapper(filter); - Tensor::MappingGuard output_mapper(output); - - auto input_data = input->data(); - auto filter_data = filter->data(); - auto padded_out_data = out_tensor->mutable_data(); - - auto &in_shape = input->shape(); - auto &out_shape = out_tensor->shape(); - - const index_t batch = in_shape[0]; - const index_t inch = in_shape[1]; - const index_t h = in_shape[2]; - const index_t w = in_shape[3]; - - const index_t outch = out_shape[1]; - const index_t outh = out_shape[2]; - const index_t outw = out_shape[3]; - - const index_t out_img_size = outh * outw; - - utils::ThreadPool - &thread_pool = context->device()->cpu_runtime()->thread_pool(); - - thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, - index_t start1, index_t end1, index_t step1) { + +template<> +MaceStatus Deconv2dK4x4S1::DoCompute( + const DeconvComputeParam &p, const float *filter_data, + const float *input_data, float *padded_out_data) { + p.thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { for (index_t b = start0; b < end0; b += step0) { for (index_t oc = start1; oc < end1; oc += step1) { - if (oc + 1 < outch) { - float *out_base = padded_out_data + (b * outch + oc) * out_img_size; - float *out_base1 = out_base + out_img_size; - for (index_t q = 0; q < inch; q++) { - const float *input_base = input_data + (b * inch + q) * h * w; + if (oc + 1 < p.out_channels) { + float *out_base = + padded_out_data + (b * p.out_channels + oc) * p.out_img_size; + float *out_base1 = out_base + p.out_img_size; + for (index_t q = 0; q < p.in_channels; q++) { + const float *input_base = input_data + + (b * p.in_channels + q) * p.in_height * p.in_width; const float *in = input_base; - const float *kernel_base = filter_data + (oc * inch + q) * 16; + const float *kernel_base = + filter_data + (oc * p.in_channels + q) * 16; const float *k0 = kernel_base; const float *k1 = kernel_base + 4; const float *k2 = kernel_base + 8; const float *k3 = kernel_base + 12; - const float *kernel_base1 = kernel_base + inch * 16; + const float *kernel_base1 = kernel_base + p.in_channels * 16; const float *k10 = kernel_base1; const float *k11 = kernel_base1 + 4; const float *k12 = kernel_base1 + 8; @@ -99,24 +60,24 @@ MaceStatus Deconv2dK4x4S1::Compute(const OpContext *context, float32x4_t k12_vec = vld1q_f32(k12); float32x4_t k13_vec = vld1q_f32(k13); - for (index_t i = 0; i < h; i++) { - float *out_row = out_base + i * outw; + for (index_t i = 0; i < p.in_height; i++) { + float *out_row = out_base + i * p.out_width; float *out_row_0 = out_row; - float *out_row_1 = out_row_0 + outw; - float *out_row_2 = out_row_1 + outw; - float *out_row_3 = out_row_2 + outw; + float *out_row_1 = out_row_0 + p.out_width; + float *out_row_2 = out_row_1 + 
p.out_width; + float *out_row_3 = out_row_2 + p.out_width; - float *out_row1 = out_base1 + i * outw; + float *out_row1 = out_base1 + i * p.out_width; float *out_row1_0 = out_row1; - float *out_row1_1 = out_row1_0 + outw; - float *out_row1_2 = out_row1_1 + outw; - float *out_row1_3 = out_row1_2 + outw; + float *out_row1_1 = out_row1_0 + p.out_width; + float *out_row1_2 = out_row1_1 + p.out_width; + float *out_row1_3 = out_row1_2 + p.out_width; index_t j = 0; - for (; j + 3 < w; j += 4) { + for (; j + 3 < p.in_width; j += 4) { float32x4_t in_vec = vld1q_f32(in); float32x4_t out00, out01, out02, out03; float32x4_t out10, out11, out12, out13; @@ -260,7 +221,7 @@ MaceStatus Deconv2dK4x4S1::Compute(const OpContext *context, out_row1_3 += 4; } - for (; j < w; j++) { + for (; j < p.in_width; j++) { float val = in[0]; for (int k = 0; k < 4; ++k) { out_row_0[k] += val * k0[k]; @@ -285,10 +246,13 @@ MaceStatus Deconv2dK4x4S1::Compute(const OpContext *context, } } } else { - float *out_base = padded_out_data + (b * outch + oc) * out_img_size; - for (index_t q = 0; q < inch; q++) { - const float *input_base = input_data + (b * inch + q) * h * w; - const float *kernel_base = filter_data + (oc * inch + q) * 16; + float *out_base = + padded_out_data + (b * p.out_channels + oc) * p.out_img_size; + for (index_t q = 0; q < p.in_channels; q++) { + const float *input_base = input_data + + (b * p.in_channels + q) * p.in_height * p.in_width; + const float *kernel_base = + filter_data + (oc * p.in_channels + q) * 16; const float *in = input_base; const float *k0 = kernel_base; const float *k1 = kernel_base + 4; @@ -300,15 +264,15 @@ MaceStatus Deconv2dK4x4S1::Compute(const OpContext *context, float32x4_t k2_vec = vld1q_f32(k2); float32x4_t k3_vec = vld1q_f32(k3); - for (index_t i = 0; i < h; i++) { - float *out_row = out_base + i * outw; + for (index_t i = 0; i < p.in_height; i++) { + float *out_row = out_base + i * p.out_width; float *out_row_0 = out_row; - float *out_row_1 = out_row_0 + outw; - float *out_row_2 = out_row_1 + outw; - float *out_row_3 = out_row_2 + outw; + float *out_row_1 = out_row_0 + p.out_width; + float *out_row_2 = out_row_1 + p.out_width; + float *out_row_3 = out_row_2 + p.out_width; int j = 0; - for (; j + 3 < w; j += 4) { + for (; j + 3 < p.in_width; j += 4) { float32x4_t in_vec = vld1q_f32(in); float32x4_t out00 = vld1q_f32(out_row_0); @@ -382,7 +346,7 @@ MaceStatus Deconv2dK4x4S1::Compute(const OpContext *context, out_row_3 += 4; } - for (; j < w; j++) { + for (; j < p.in_width; j++) { float val = in[0]; for (int k = 0; k < 4; ++k) { out_row_0[k] += val * k0[k]; @@ -401,65 +365,25 @@ MaceStatus Deconv2dK4x4S1::Compute(const OpContext *context, } } } - }, 0, batch, 1, 0, outch, 2); - - UnPadOutput(*out_tensor, out_pad_size, output); + }, 0, p.batch, 1, 0, p.out_channels, 2); return MaceStatus::MACE_SUCCESS; } -MaceStatus Deconv2dK4x4S2::Compute(const OpContext *context, - const Tensor *input, - const Tensor *filter, - const Tensor *output_shape, - Tensor *output) { - std::unique_ptr padded_out; - std::vector out_pad_size; - ResizeOutAndPadOut(context, - input, - filter, - output_shape, - output, - &out_pad_size, - &padded_out); - Tensor *out_tensor = output; - if (padded_out != nullptr) { - out_tensor = padded_out.get(); - } - out_tensor->Clear(); - - Tensor::MappingGuard input_mapper(input); - Tensor::MappingGuard filter_mapper(filter); - Tensor::MappingGuard output_mapper(output); - - auto input_data = input->data(); - auto filter_data = filter->data(); - auto padded_out_data = 
out_tensor->mutable_data(); - - auto &in_shape = input->shape(); - auto &out_shape = out_tensor->shape(); - - const index_t batch = in_shape[0]; - const index_t inch = in_shape[1]; - const index_t h = in_shape[2]; - const index_t w = in_shape[3]; - - const index_t outch = out_shape[1]; - const index_t outh = out_shape[2]; - const index_t outw = out_shape[3]; - const index_t out_img_size = outh * outw; - - utils::ThreadPool - &thread_pool = context->device()->cpu_runtime()->thread_pool(); - - thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, - index_t start1, index_t end1, index_t step1) { +template<> +MaceStatus Deconv2dK4x4S2::DoCompute( + const DeconvComputeParam &p, const float *filter_data, + const float *input_data, float *padded_out_data) { + p.thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { for (index_t b = start0; b < end0; b += step0) { - for (index_t p = start1; p < end1; p += step1) { - float *out_base = padded_out_data + (b * outch + p) * out_img_size; - for (index_t q = 0; q < inch; q++) { - const float *input_base = input_data + (b * inch + q) * h * w; - const float *kernel_base = filter_data + (p * inch + q) * 16; + for (index_t k = start1; k < end1; k += step1) { + float *out_base = + padded_out_data + (b * p.out_channels + k) * p.out_img_size; + for (index_t q = 0; q < p.in_channels; q++) { + const float *input_base = input_data + + (b * p.in_channels + q) * p.in_height * p.in_width; + const float *kernel_base = filter_data + (k * p.in_channels + q) * 16; const float *in = input_base; const float *k0 = kernel_base; @@ -472,17 +396,17 @@ MaceStatus Deconv2dK4x4S2::Compute(const OpContext *context, float32x4_t k2_vec = vld1q_f32(k2); float32x4_t k3_vec = vld1q_f32(k3); - for (index_t i = 0; i < h; i++) { - float *out_row = out_base + 2 * i * outw; + for (index_t i = 0; i < p.in_height; i++) { + float *out_row = out_base + 2 * i * p.out_width; float *out_row_0 = out_row; - float *out_row_1 = out_row_0 + outw; - float *out_row_2 = out_row_1 + outw; - float *out_row_3 = out_row_2 + outw; + float *out_row_1 = out_row_0 + p.out_width; + float *out_row_2 = out_row_1 + p.out_width; + float *out_row_3 = out_row_2 + p.out_width; index_t j = 0; - for (index_t n = 0; n + 9 < outw; n += 8) { + for (index_t n = 0; n + 9 < p.out_width; n += 8) { float32x4_t in_vec = vld1q_f32(in); // row 0 @@ -549,7 +473,7 @@ MaceStatus Deconv2dK4x4S2::Compute(const OpContext *context, j += 4; } - for (; j < w; j++) { + for (; j < p.in_width; j++) { float val = in[0]; for (int k = 0; k < 4; ++k) { out_row_0[k] += val * k0[k]; @@ -567,25 +491,11 @@ MaceStatus Deconv2dK4x4S2::Compute(const OpContext *context, } } } - }, 0, batch, 1, 0, outch, 1); - - UnPadOutput(*out_tensor, out_pad_size, output); + }, 0, p.batch, 1, 0, p.out_channels, 1); return MaceStatus::MACE_SUCCESS; } -void RegisterDeconv2dK4x4Delegator(OpDelegatorRegistry *registry) { - MACE_REGISTER_DELEGATOR( - registry, Deconv2dK4x4S1, delegator::Deconv2dParam, - MACE_DELEGATOR_KEY_EX(Deconv2d, DeviceType::CPU, - float, ImplType::NEON, K4x4S1)); - MACE_REGISTER_DELEGATOR( - registry, Deconv2dK4x4S2, delegator::Deconv2dParam, - MACE_DELEGATOR_KEY_EX(Deconv2d, DeviceType::CPU, - float, ImplType::NEON, K4x4S2)); -} - -} // namespace fp32 } // namespace arm } // namespace ops } // namespace mace diff --git a/mace/ops/arm/fp32/depthwise_conv_2d_3x3.cc b/mace/ops/arm/fp32/depthwise_conv_2d_3x3.cc index 
cc0ab45a02425f5917eb9edc44d4d20122b57296..fa850e562b6a9ebf8def69b4ce8a193e1d929602 100644 --- a/mace/ops/arm/fp32/depthwise_conv_2d_3x3.cc +++ b/mace/ops/arm/fp32/depthwise_conv_2d_3x3.cc @@ -12,14 +12,13 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "mace/ops/arm/fp32/depthwise_conv_2d_3x3.h" - #include +#include "mace/ops/arm/base/depthwise_conv_2d_3x3.h" + namespace mace { namespace ops { namespace arm { -namespace fp32 { namespace { void DepthwiseConv2dPixel(const float *in_base, @@ -48,79 +47,36 @@ void DepthwiseConv2dPixel(const float *in_base, } } // namespace -MaceStatus DepthwiseConv2dK3x3S1::Compute(const mace::OpContext *context, - const mace::Tensor *input, - const mace::Tensor *filter, - mace::Tensor *output) { - MACE_UNUSED(context); - std::vector out_shape(4); - std::vector paddings(2); - auto &in_shape = input->shape(); - auto &filter_shape = filter->shape(); - CalOutputShapeAndInputPadSize(in_shape, filter_shape, &out_shape, &paddings); - out_shape[1] *= filter_shape[1]; - MACE_RETURN_IF_ERROR(output->Resize(out_shape)); - output->Clear(); - - const int pad_top = paddings[0] / 2; - const int pad_left = paddings[1] / 2; - - const index_t batch = in_shape[0]; - const index_t in_channels = in_shape[1]; - const index_t in_height = in_shape[2]; - const index_t in_width = in_shape[3]; - const index_t out_channels = out_shape[1]; - const index_t out_height = out_shape[2]; - const index_t out_width = out_shape[3]; - - const index_t in_image_size = in_height * in_width; - const index_t out_image_size = out_height * out_width; - const index_t in_batch_size = in_channels * in_image_size; - const index_t out_batch_size = out_channels * out_image_size; - const index_t multiplier = out_channels / in_channels; - - std::vector out_bounds; - CalOutputBoundaryWithoutUsingInputPad(out_shape, paddings, &out_bounds); - const index_t valid_h_start = out_bounds[0]; - const index_t valid_h_stop = out_bounds[1]; - const index_t valid_w_start = out_bounds[2]; - const index_t valid_w_stop = out_bounds[3]; - - Tensor::MappingGuard in_guard(input); - Tensor::MappingGuard filter_guard(filter); - Tensor::MappingGuard out_guard(output); - auto filter_data = filter->data(); - auto input_data = input->data(); - auto output_data = output->mutable_data(); - - utils::ThreadPool - &thread_pool = context->device()->cpu_runtime()->thread_pool(); - - thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, - index_t start1, index_t end1, index_t step1) { +template<> +MaceStatus DepthwiseConv2dK3x3S1::DoCompute( + const DepthwiseConvComputeParam &p, const float *filter_data, + const float *input_data, float *output_data) { + p.thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { for (index_t b = start0; b < end0; b += step0) { for (index_t m = start1; m < end1; m += step1) { - const index_t c = m / multiplier; - const index_t multi_index = m % multiplier; + const index_t c = m / p.multiplier; + const index_t multi_index = m % p.multiplier; const float - *in_base = input_data + b * in_batch_size + c * in_image_size; + *in_base = input_data + b * p.in_batch_size + c * p.in_image_size; const float - *filter_ptr = filter_data + multi_index * in_channels * 9 + c * 9; - float *out_base = output_data + b * out_batch_size + m * out_image_size; + *filter_ptr = filter_data + multi_index * p.in_channels * 9 + c * 9; + float *out_base = + output_data + b * p.out_batch_size + 
m * p.out_image_size; index_t h, w; // top - for (h = 0; h < valid_h_start; ++h) { - for (w = 0; w < out_width; ++w) { + for (h = 0; h < p.valid_h_start; ++h) { + for (w = 0; w < p.out_width; ++w) { DepthwiseConv2dPixel(in_base, filter_ptr, h, w, - h - pad_top, - w - pad_left, - out_width, - in_height, - in_width, + h - p.pad_top, + w - p.pad_left, + p.out_width, + p.in_height, + p.in_width, 3, 3, out_base); @@ -133,18 +89,18 @@ MaceStatus DepthwiseConv2dK3x3S1::Compute(const mace::OpContext *context, vf01 = vld1q_f32(filter_ptr + 3); vf02 = vld1q_f32(filter_ptr + 5); - for (h = valid_h_start; h + 1 < valid_h_stop; h += 2) { + for (h = p.valid_h_start; h + 1 < p.valid_h_stop; h += 2) { // left - for (w = 0; w < valid_w_start; ++w) { + for (w = 0; w < p.valid_w_start; ++w) { DepthwiseConv2dPixel(in_base, filter_ptr, h, w, - h - pad_top, - w - pad_left, - out_width, - in_height, - in_width, + h - p.pad_top, + w - p.pad_left, + p.out_width, + p.in_height, + p.in_width, 3, 3, out_base); @@ -152,17 +108,17 @@ MaceStatus DepthwiseConv2dK3x3S1::Compute(const mace::OpContext *context, filter_ptr, h + 1, w, - h + 1 - pad_top, - w - pad_left, - out_width, - in_height, - in_width, + h + 1 - p.pad_top, + w - p.pad_left, + p.out_width, + p.in_height, + p.in_width, 3, 3, out_base); } - for (w = valid_w_start; w + 3 < valid_w_stop; w += 4) { + for (w = p.valid_w_start; w + 3 < p.valid_w_stop; w += 4) { // input (4 height x 3 slide): vi_height_slide float32x4_t vi00, vi01, vi02, vi0n; float32x4_t vi10, vi11, vi12, vi1n; @@ -173,17 +129,17 @@ MaceStatus DepthwiseConv2dK3x3S1::Compute(const mace::OpContext *context, float32x4_t vo00, vo01; // load input - index_t in_h = h - pad_top; - index_t in_w = w - pad_left; - index_t in_offset = in_h * in_width + in_w; + index_t in_h = h - p.pad_top; + index_t in_w = w - p.pad_left; + index_t in_offset = in_h * p.in_width + in_w; vi00 = vld1q_f32(in_base + in_offset); vi0n = vld1q_f32(in_base + in_offset + 4); - vi10 = vld1q_f32(in_base + in_offset + in_width); - vi1n = vld1q_f32(in_base + in_offset + in_width + 4); - vi20 = vld1q_f32(in_base + in_offset + 2 * in_width); - vi2n = vld1q_f32(in_base + in_offset + 2 * in_width + 4); - vi30 = vld1q_f32(in_base + in_offset + 3 * in_width); - vi3n = vld1q_f32(in_base + in_offset + 3 * in_width + 4); + vi10 = vld1q_f32(in_base + in_offset + p.in_width); + vi1n = vld1q_f32(in_base + in_offset + p.in_width + 4); + vi20 = vld1q_f32(in_base + in_offset + 2 * p.in_width); + vi2n = vld1q_f32(in_base + in_offset + 2 * p.in_width + 4); + vi30 = vld1q_f32(in_base + in_offset + 3 * p.in_width); + vi3n = vld1q_f32(in_base + in_offset + 3 * p.in_width + 4); vi01 = vextq_f32(vi00, vi0n, 1); vi02 = vextq_f32(vi00, vi0n, 2); @@ -195,9 +151,9 @@ MaceStatus DepthwiseConv2dK3x3S1::Compute(const mace::OpContext *context, vi32 = vextq_f32(vi30, vi3n, 2); // load ouptut - index_t out_offset = h * out_width + w; + index_t out_offset = h * p.out_width + w; vo00 = vld1q_f32(out_base + out_offset); - vo01 = vld1q_f32(out_base + out_offset + out_width); + vo01 = vld1q_f32(out_base + out_offset + p.out_width); #if defined(__aarch64__) // outch 0, height 0 @@ -245,20 +201,20 @@ MaceStatus DepthwiseConv2dK3x3S1::Compute(const mace::OpContext *context, vo01 = vmlaq_lane_f32(vo01, vi32, vget_high_f32(vf02), 1); #endif vst1q_f32(out_base + out_offset, vo00); - vst1q_f32(out_base + out_offset + out_width, vo01); + vst1q_f32(out_base + out_offset + p.out_width, vo01); } // w // right - for (; w < out_width; ++w) { + for (; w < p.out_width; ++w) { 
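// Scalar tail for the right edge: w resumes where the 4-wide vector loop
// stopped, finishing both the vector remainder inside the valid region and
// the right padding columns, pixel by pixel for rows h and h + 1.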
DepthwiseConv2dPixel(in_base, filter_ptr, h, w, - h - pad_top, - w - pad_left, - out_width, - in_height, - in_width, + h - p.pad_top, + w - p.pad_left, + p.out_width, + p.in_height, + p.in_width, 3, 3, out_base); @@ -266,11 +222,11 @@ MaceStatus DepthwiseConv2dK3x3S1::Compute(const mace::OpContext *context, filter_ptr, h + 1, w, - h + 1 - pad_top, - w - pad_left, - out_width, - in_height, - in_width, + h + 1 - p.pad_top, + w - p.pad_left, + p.out_width, + p.in_height, + p.in_width, 3, 3, out_base); @@ -279,17 +235,17 @@ MaceStatus DepthwiseConv2dK3x3S1::Compute(const mace::OpContext *context, // bottom - for (; h < out_height; ++h) { - for (w = 0; w < out_width; ++w) { + for (; h < p.out_height; ++h) { + for (w = 0; w < p.out_width; ++w) { DepthwiseConv2dPixel(in_base, filter_ptr, h, w, - h - pad_top, - w - pad_left, - out_width, - in_height, - in_width, + h - p.pad_top, + w - p.pad_left, + p.out_width, + p.in_height, + p.in_width, 3, 3, out_base); @@ -297,86 +253,41 @@ MaceStatus DepthwiseConv2dK3x3S1::Compute(const mace::OpContext *context, } } // m } // b - }, 0, batch, 1, 0, out_channels, 1); // threadpool + }, 0, p.batch, 1, 0, p.out_channels, 1); // threadpool return MaceStatus::MACE_SUCCESS; } -MaceStatus DepthwiseConv2dK3x3S2::Compute(const mace::OpContext *context, - const mace::Tensor *input, - const mace::Tensor *filter, - mace::Tensor *output) { - MACE_UNUSED(context); - - std::vector out_shape(4); - std::vector paddings(2); - auto &in_shape = input->shape(); - auto &filter_shape = filter->shape(); - - CalOutputShapeAndInputPadSize(in_shape, filter_shape, &out_shape, &paddings); - out_shape[1] *= in_shape[1]; - MACE_RETURN_IF_ERROR(output->Resize(out_shape)); - output->Clear(); - - const int pad_top = paddings[0] / 2; - const int pad_left = paddings[1] / 2; - - const index_t batch = in_shape[0]; - const index_t in_channels = in_shape[1]; - const index_t in_height = in_shape[2]; - const index_t in_width = in_shape[3]; - const index_t out_channels = out_shape[1]; - const index_t out_height = out_shape[2]; - const index_t out_width = out_shape[3]; - - const index_t in_image_size = in_height * in_width; - const index_t out_image_size = out_height * out_width; - const index_t in_batch_size = in_channels * in_image_size; - const index_t out_batch_size = out_channels * out_image_size; - const index_t multiplier = out_channels / in_channels; - - std::vector out_bounds; - CalOutputBoundaryWithoutUsingInputPad(out_shape, paddings, &out_bounds); - const index_t valid_h_start = out_bounds[0]; - const index_t valid_h_stop = out_bounds[1]; - const index_t valid_w_start = out_bounds[2]; - const index_t valid_w_stop = out_bounds[3]; - - Tensor::MappingGuard in_guard(input); - Tensor::MappingGuard filter_guard(filter); - Tensor::MappingGuard out_guard(output); - auto filter_data = filter->data(); - auto input_data = input->data(); - auto output_data = output->mutable_data(); - - utils::ThreadPool - &thread_pool = context->device()->cpu_runtime()->thread_pool(); - - thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, - index_t start1, index_t end1, index_t step1) { +template<> +MaceStatus DepthwiseConv2dK3x3S2::DoCompute( + const DepthwiseConvComputeParam &p, const float *filter_data, + const float *input_data, float *output_data) { + p.thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { for (index_t b = start0; b < end0; b += step0) { for (index_t m = start1; m < end1; m += step1) { - index_t c = m / 
multiplier; - index_t multi_index = m % multiplier; + index_t c = m / p.multiplier; + index_t multi_index = m % p.multiplier; const float - *in_base = input_data + b * in_batch_size + c * in_image_size; + *in_base = input_data + b * p.in_batch_size + c * p.in_image_size; const float - *filter_ptr = filter_data + multi_index * in_channels * 9 + c * 9; - float *out_base = output_data + b * out_batch_size + m * out_image_size; + *filter_ptr = filter_data + multi_index * p.in_channels * 9 + c * 9; + float *out_base = + output_data + b * p.out_batch_size + m * p.out_image_size; index_t h, w; // top - for (h = 0; h < valid_h_start; ++h) { - for (w = 0; w < out_width; ++w) { + for (h = 0; h < p.valid_h_start; ++h) { + for (w = 0; w < p.out_width; ++w) { DepthwiseConv2dPixel(in_base, filter_ptr, h, w, - h * 2 - pad_top, - w * 2 - pad_left, - out_width, - in_height, - in_width, + h * 2 - p.pad_top, + w * 2 - p.pad_left, + p.out_width, + p.in_height, + p.in_width, 3, 3, out_base); @@ -389,24 +300,24 @@ MaceStatus DepthwiseConv2dK3x3S2::Compute(const mace::OpContext *context, vf01 = vld1q_f32(filter_ptr + 3); vf02 = vld1q_f32(filter_ptr + 5); - for (h = valid_h_start; h < valid_h_stop; ++h) { + for (h = p.valid_h_start; h < p.valid_h_stop; ++h) { // left - for (w = 0; w < valid_w_start; ++w) { + for (w = 0; w < p.valid_w_start; ++w) { DepthwiseConv2dPixel(in_base, filter_ptr, h, w, - h * 2 - pad_top, - w * 2 - pad_left, - out_width, - in_height, - in_width, + h * 2 - p.pad_top, + w * 2 - p.pad_left, + p.out_width, + p.in_height, + p.in_width, 3, 3, out_base); } - for (w = valid_w_start; w + 3 < valid_w_stop; w += 4) { + for (w = p.valid_w_start; w + 3 < p.valid_w_stop; w += 4) { float32x4x2_t vi0, vi1, vi2; float32x4_t vi0n, vi1n, vi2n; @@ -419,19 +330,19 @@ MaceStatus DepthwiseConv2dK3x3S2::Compute(const mace::OpContext *context, float32x4_t vo; // load input - index_t in_h = h * 2 - pad_top; - index_t in_w = w * 2 - pad_left; - index_t in_offset = in_h * in_width + in_w; + index_t in_h = h * 2 - p.pad_top; + index_t in_w = w * 2 - p.pad_left; + index_t in_offset = in_h * p.in_width + in_w; vi0 = vld2q_f32(in_base + in_offset); // [0.2.4.6, 1.3.5.7] - vi1 = vld2q_f32(in_base + in_offset + in_width); - vi2 = vld2q_f32(in_base + in_offset + 2 * in_width); + vi1 = vld2q_f32(in_base + in_offset + p.in_width); + vi2 = vld2q_f32(in_base + in_offset + 2 * p.in_width); vi0n = vld1q_f32(in_base + in_offset + 8); // [8.9.10.11] - vi1n = vld1q_f32(in_base + in_offset + in_width + 8); - vi2n = vld1q_f32(in_base + in_offset + 2 * in_width + 8); + vi1n = vld1q_f32(in_base + in_offset + p.in_width + 8); + vi2n = vld1q_f32(in_base + in_offset + 2 * p.in_width + 8); // load ouptut - index_t out_offset = h * out_width + w; + index_t out_offset = h * p.out_width + w; vo = vld1q_f32(out_base + out_offset); vi00 = vi0.val[0]; // [0.2.4.6] @@ -471,16 +382,16 @@ MaceStatus DepthwiseConv2dK3x3S2::Compute(const mace::OpContext *context, } // w // right - for (; w < out_width; ++w) { + for (; w < p.out_width; ++w) { DepthwiseConv2dPixel(in_base, filter_ptr, h, w, - h * 2 - pad_top, - w * 2 - pad_left, - out_width, - in_height, - in_width, + h * 2 - p.pad_top, + w * 2 - p.pad_left, + p.out_width, + p.in_height, + p.in_width, 3, 3, out_base); @@ -489,17 +400,17 @@ MaceStatus DepthwiseConv2dK3x3S2::Compute(const mace::OpContext *context, // bottom - for (; h < out_height; ++h) { - for (w = 0; w < out_width; ++w) { + for (; h < p.out_height; ++h) { + for (w = 0; w < p.out_width; ++w) { DepthwiseConv2dPixel(in_base, filter_ptr, 
h, w, - h * 2 - pad_top, - w * 2 - pad_left, - out_width, - in_height, - in_width, + h * 2 - p.pad_top, + w * 2 - p.pad_left, + p.out_width, + p.in_height, + p.in_width, 3, 3, out_base); @@ -507,23 +418,11 @@ MaceStatus DepthwiseConv2dK3x3S2::Compute(const mace::OpContext *context, } } // m } // b - }, 0, batch, 1, 0, out_channels, 1); + }, 0, p.batch, 1, 0, p.out_channels, 1); return MaceStatus::MACE_SUCCESS; } -void RegisterDepthwiseConv2dK3x3Delegator(OpDelegatorRegistry *registry) { - MACE_REGISTER_DELEGATOR( - registry, DepthwiseConv2dK3x3S1, delegator::DepthwiseConv2dParam, - MACE_DELEGATOR_KEY_EX(DepthwiseConv2d, DeviceType::CPU, - float, ImplType::NEON, K3x3S1)); - MACE_REGISTER_DELEGATOR( - registry, DepthwiseConv2dK3x3S2, delegator::DepthwiseConv2dParam, - MACE_DELEGATOR_KEY_EX(DepthwiseConv2d, DeviceType::CPU, - float, ImplType::NEON, K3x3S2)); -} - -} // namespace fp32 } // namespace arm } // namespace ops } // namespace mace diff --git a/mace/ops/arm/fp32/depthwise_deconv_2d_3x3.cc b/mace/ops/arm/fp32/depthwise_deconv_2d_3x3.cc index 875e08fa5ed271d599b33d490b0211dcd1360254..99e9c9eb018a8817ce7096544bb565bb0c5e6e03 100644 --- a/mace/ops/arm/fp32/depthwise_deconv_2d_3x3.cc +++ b/mace/ops/arm/fp32/depthwise_deconv_2d_3x3.cc @@ -12,69 +12,26 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "mace/ops/arm/fp32/depthwise_deconv_2d_3x3.h" - #include + +#include "mace/ops/arm/base/depthwise_deconv_2d_3x3.h" #include "mace/ops/arm/fp32/common_neon.h" namespace mace { namespace ops { namespace arm { -namespace fp32 { - -MaceStatus DepthwiseDeconv2dK3x3S1::Compute(const OpContext *context, - const Tensor *input, - const Tensor *filter, - const Tensor *output_shape, - Tensor *output) { - std::unique_ptr padded_out; - std::vector out_pad_size; - group_ = input->dim(1); - ResizeOutAndPadOut(context, - input, - filter, - output_shape, - output, - &out_pad_size, - &padded_out); - - Tensor *out_tensor = output; - if (padded_out != nullptr) { - out_tensor = padded_out.get(); - } - - out_tensor->Clear(); - - Tensor::MappingGuard input_mapper(input); - Tensor::MappingGuard filter_mapper(filter); - Tensor::MappingGuard output_mapper(output); - - auto input_data = input->data(); - auto filter_data = filter->data(); - auto padded_out_data = out_tensor->mutable_data(); - - auto &in_shape = input->shape(); - auto &out_shape = out_tensor->shape(); - - const index_t batch = in_shape[0]; - const index_t channels = in_shape[1]; - const index_t h = in_shape[2]; - const index_t w = in_shape[3]; - const index_t in_img_size = h * w; - const index_t outh = out_shape[2]; - const index_t outw = out_shape[3]; - const index_t out_img_size = outh * outw; - - utils::ThreadPool - &thread_pool = context->device()->cpu_runtime()->thread_pool(); - - thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, - index_t start1, index_t end1, index_t step1) { + +template<> +MaceStatus DepthwiseDeconv2dK3x3S1::DoCompute( + const DepthwiseDeconvComputeParam &p, const float *filter_data, + const float *input_data, float *padded_out_data) { + p.thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { for (index_t b = start0; b < end0; b += step0) { for (index_t c = start1; c < end1; c += step1) { - const index_t offset = b * channels + c; - float *out_base = padded_out_data + offset * out_img_size; - const float *input_base = input_data + offset * in_img_size; + const index_t offset = b 
* p.in_channels + c; + float *out_base = padded_out_data + offset * p.out_img_size; + const float *input_base = input_data + offset * p.in_img_size; const float *kernel_base = filter_data + c * 9; const float *in = input_base; const float *k0 = kernel_base; @@ -86,14 +43,14 @@ MaceStatus DepthwiseDeconv2dK3x3S1::Compute(const OpContext *context, float32x4_t k1_vec = vld1q_f32(k1); float32x4_t k2_vec = vld1q_f32(k2); - for (index_t i = 0; i < h; ++i) { - float *out_row_base = out_base + i * outw; + for (index_t i = 0; i < p.in_height; ++i) { + float *out_row_base = out_base + i * p.out_width; float *out_row0 = out_row_base; - float *out_row1 = out_row_base + outw; - float *out_row2 = out_row_base + 2 * outw; + float *out_row1 = out_row_base + p.out_width; + float *out_row2 = out_row_base + 2 * p.out_width; index_t j = 0; - for (; j + 3 < w; j += 4) { + for (; j + 3 < p.in_width; j += 4) { float32x4_t in_vec = vld1q_f32(in); float32x4_t out00, out01, out02; @@ -142,7 +99,7 @@ MaceStatus DepthwiseDeconv2dK3x3S1::Compute(const OpContext *context, out_row2 += 4; } - for (; j < w; ++j) { + for (; j < p.in_width; ++j) { float val = in[0]; for (int k = 0; k < 3; ++k) { out_row0[k] += val * k0[k]; @@ -157,66 +114,22 @@ MaceStatus DepthwiseDeconv2dK3x3S1::Compute(const OpContext *context, } } } - }, 0, batch, 1, 0, channels, 1); - - UnPadOutput(*out_tensor, out_pad_size, output); + }, 0, p.batch, 1, 0, p.in_channels, 1); return MaceStatus::MACE_SUCCESS; } -MaceStatus DepthwiseDeconv2dK3x3S2::Compute(const OpContext *context, - const Tensor *input, - const Tensor *filter, - const Tensor *output_shape, - Tensor *output) { - std::unique_ptr padded_out; - std::vector out_pad_size; - group_ = input->dim(1); - ResizeOutAndPadOut(context, - input, - filter, - output_shape, - output, - &out_pad_size, - &padded_out); - - Tensor *out_tensor = output; - if (padded_out != nullptr) { - out_tensor = padded_out.get(); - } - - out_tensor->Clear(); - - Tensor::MappingGuard input_mapper(input); - Tensor::MappingGuard filter_mapper(filter); - Tensor::MappingGuard output_mapper(output); - - auto input_data = input->data(); - auto filter_data = filter->data(); - auto padded_out_data = out_tensor->mutable_data(); - - auto &in_shape = input->shape(); - auto &out_shape = out_tensor->shape(); - - const index_t batch = in_shape[0]; - const index_t channels = in_shape[1]; - const index_t h = in_shape[2]; - const index_t w = in_shape[3]; - const index_t in_img_size = h * w; - const index_t outh = out_shape[2]; - const index_t outw = out_shape[3]; - const index_t out_img_size = outh * outw; - - utils::ThreadPool - &thread_pool = context->device()->cpu_runtime()->thread_pool(); - - thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, - index_t start1, index_t end1, index_t step1) { +template<> +MaceStatus DepthwiseDeconv2dK3x3S2::DoCompute( + const DepthwiseDeconvComputeParam &p, const float *filter_data, + const float *input_data, float *padded_out_data) { + p.thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { for (index_t b = start0; b < end0; b += step0) { for (index_t c = start1; c < end1; c += step1) { - const index_t offset = b * channels + c; - float *out_base = padded_out_data + offset * out_img_size; - const float *input_base = input_data + offset * in_img_size; + const index_t offset = b * p.in_channels + c; + float *out_base = padded_out_data + offset * p.out_img_size; + const float *input_base = input_data + offset * p.in_img_size; 
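// Depthwise: channel c uses its own 3x3 filter, so the weight offset is
// simply c * 9 and there is no accumulation across input channels.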
const float *kernel_base = filter_data + c * 9; const float *in = input_base; @@ -228,15 +141,15 @@ MaceStatus DepthwiseDeconv2dK3x3S2::Compute(const OpContext *context, float32x4_t k1_vec = vld1q_f32(k1); float32x4_t k2_vec = vld1q_f32(k2); - for (index_t i = 0; i < h; ++i) { - float *out_row_base = out_base + i * 2 * outw; + for (index_t i = 0; i < p.in_height; ++i) { + float *out_row_base = out_base + i * 2 * p.out_width; float *out_row_0 = out_row_base; - float *out_row_1 = out_row_0 + outw; - float *out_row_2 = out_row_1 + outw; + float *out_row_1 = out_row_0 + p.out_width; + float *out_row_2 = out_row_1 + p.out_width; index_t j = 0; - for (index_t n = 0; n + 9 < outw; n += 8) { + for (index_t n = 0; n + 9 < p.out_width; n += 8) { float32x4_t in_vec = vld1q_f32(in); // out row 0 @@ -285,7 +198,7 @@ MaceStatus DepthwiseDeconv2dK3x3S2::Compute(const OpContext *context, j += 4; } - for (; j < w; ++j) { + for (; j < p.in_width; ++j) { float val = in[0]; for (int k = 0; k < 3; ++k) { @@ -302,80 +215,31 @@ MaceStatus DepthwiseDeconv2dK3x3S2::Compute(const OpContext *context, } } } - }, 0, batch, 1, 0, channels, 1); - - UnPadOutput(*out_tensor, out_pad_size, output); + }, 0, p.batch, 1, 0, p.in_channels, 1); return MaceStatus::MACE_SUCCESS; } -MaceStatus GroupDeconv2dK3x3S1::Compute(const OpContext *context, - const Tensor *input, - const Tensor *filter, - const Tensor *output_shape, - Tensor *output) { - std::unique_ptr padded_out; - std::vector out_pad_size; - ResizeOutAndPadOut(context, - input, - filter, - output_shape, - output, - &out_pad_size, - &padded_out); - - Tensor *out_tensor = output; - if (padded_out != nullptr) { - out_tensor = padded_out.get(); - } - - out_tensor->Clear(); - - Tensor::MappingGuard input_mapper(input); - Tensor::MappingGuard filter_mapper(filter); - Tensor::MappingGuard output_mapper(output); - - auto input_data = input->data(); - auto filter_data = filter->data(); - auto padded_out_data = out_tensor->mutable_data(); - - auto &in_shape = input->shape(); - auto &out_shape = out_tensor->shape(); - - const index_t batch = in_shape[0]; - const index_t inch = in_shape[1]; - const index_t h = in_shape[2]; - const index_t w = in_shape[3]; - - const index_t outch = out_shape[1]; - const index_t outh = out_shape[2]; - const index_t outw = out_shape[3]; - - const index_t in_img_size = h * w; - const index_t out_img_size = outh * outw; - - const index_t inch_g = inch / group_; - const index_t outch_g = outch / group_; - - utils::ThreadPool - &thread_pool = context->device()->cpu_runtime()->thread_pool(); - - thread_pool.Compute3D([=](index_t start0, index_t end0, index_t step0, - index_t start1, index_t end1, index_t step1, - index_t start2, index_t end2, index_t step2) { +template<> +MaceStatus GroupDeconv2dK3x3S1::DoCompute( + const GroupDeconvComputeParam &p, const float *filter_data, + const float *input_data, float *padded_out_data) { + p.thread_pool.Compute3D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1, + index_t start2, index_t end2, index_t step2) { for (index_t b = start0; b < end0; b += step0) { for (index_t g = start1; g < end1; g += step1) { for (index_t oc = start2; oc < end2; oc += step2) { - if (oc + 1 < outch_g) { - const index_t out_offset = b * outch + outch_g * g + oc; - float *out_base0 = padded_out_data + out_offset * out_img_size; - float *out_base1 = out_base0 + out_img_size; - for (index_t ic = 0; ic < inch_g; ++ic) { - const index_t in_offset = b * inch + inch_g * g + ic; - const float 
*input_base = input_data + in_offset * in_img_size; - const index_t kernel_offset = (oc * group_ + g) * inch_g + ic; + if (oc + 1 < p.outch_g) { + const index_t out_offset = b * p.out_channels + p.outch_g * g + oc; + float *out_base0 = padded_out_data + out_offset * p.out_img_size; + float *out_base1 = out_base0 + p.out_img_size; + for (index_t ic = 0; ic < p.inch_g; ++ic) { + const index_t in_offset = b * p.in_channels + p.inch_g * g + ic; + const float *input_base = input_data + in_offset * p.in_img_size; + const index_t kernel_offset = (oc * group_ + g) * p.inch_g + ic; const float *kernel_base0 = filter_data + kernel_offset * 9; - const float *kernel_base1 = kernel_base0 + inch * 9; + const float *kernel_base1 = kernel_base0 + p.in_channels * 9; const float *in = input_base; // output channel 0 @@ -399,20 +263,20 @@ MaceStatus GroupDeconv2dK3x3S1::Compute(const OpContext *context, k11_vec = vld1q_f32(k1_1); k12_vec = vld1q_f32(k1_2); - for (index_t i = 0; i < h; ++i) { - float *out_row_base0 = out_base0 + i * outw; + for (index_t i = 0; i < p.in_height; ++i) { + float *out_row_base0 = out_base0 + i * p.out_width; float *out_row0_0 = out_row_base0; - float *out_row0_1 = out_row_base0 + outw; - float *out_row0_2 = out_row_base0 + 2 * outw; + float *out_row0_1 = out_row_base0 + p.out_width; + float *out_row0_2 = out_row_base0 + 2 * p.out_width; - float *out_row_base1 = out_base1 + i * outw; + float *out_row_base1 = out_base1 + i * p.out_width; float *out_row1_0 = out_row_base1; - float *out_row1_1 = out_row_base1 + outw; - float *out_row1_2 = out_row_base1 + 2 * outw; + float *out_row1_1 = out_row_base1 + p.out_width; + float *out_row1_2 = out_row_base1 + 2 * p.out_width; index_t j = 0; - for (; j + 3 < w; j += 4) { + for (; j + 3 < p.in_width; j += 4) { float32x4_t in_vec = vld1q_f32(in); float32x4_t out00, out01, out02; @@ -500,7 +364,7 @@ MaceStatus GroupDeconv2dK3x3S1::Compute(const OpContext *context, out_row1_2 += 4; } - for (; j < w; ++j) { + for (; j < p.in_width; ++j) { float val = in[0]; for (int k = 0; k < 3; ++k) { out_row0_0[k] += val * k0_0[k]; @@ -521,12 +385,12 @@ MaceStatus GroupDeconv2dK3x3S1::Compute(const OpContext *context, } } } else { - const index_t out_offset = b * outch + outch_g * g + oc; - float *out_base0 = padded_out_data + out_offset * out_img_size; - for (index_t ic = 0; ic < inch_g; ++ic) { - const index_t in_offset = (b * group_ + g) * inch_g + ic; - const float *input_base = input_data + in_offset * in_img_size; - const index_t kernel_offset = (oc * group_ + g) * inch_g + ic; + const index_t out_offset = b * p.out_channels + p.outch_g * g + oc; + float *out_base0 = padded_out_data + out_offset * p.out_img_size; + for (index_t ic = 0; ic < p.inch_g; ++ic) { + const index_t in_offset = (b * group_ + g) * p.inch_g + ic; + const float *input_base = input_data + in_offset * p.in_img_size; + const index_t kernel_offset = (oc * group_ + g) * p.inch_g + ic; const float *kernel_base0 = filter_data + kernel_offset * 9; const float *in = input_base; const float *k0_0 = kernel_base0; @@ -538,14 +402,14 @@ MaceStatus GroupDeconv2dK3x3S1::Compute(const OpContext *context, float32x4_t k01_vec = vld1q_f32(k0_1); float32x4_t k02_vec = vld1q_f32(k0_2); - for (index_t i = 0; i < h; ++i) { - float *out_row_base0 = out_base0 + i * outw; + for (index_t i = 0; i < p.in_height; ++i) { + float *out_row_base0 = out_base0 + i * p.out_width; float *out_row0_0 = out_row_base0; - float *out_row0_1 = out_row_base0 + outw; - float *out_row0_2 = out_row_base0 + 2 * outw; + float 
*out_row0_1 = out_row_base0 + p.out_width; + float *out_row0_2 = out_row_base0 + 2 * p.out_width; index_t j = 0; - for (; j + 3 < w; j += 4) { + for (; j + 3 < p.in_width; j += 4) { float32x4_t in_vec = vld1q_f32(in); float32x4_t out00, out01, out02; @@ -594,7 +458,7 @@ MaceStatus GroupDeconv2dK3x3S1::Compute(const OpContext *context, out_row0_2 += 4; } - for (; j < w; ++j) { + for (; j < p.in_width; ++j) { float val = in[0]; for (int k = 0; k < 3; ++k) { out_row0_0[k] += val * k0_0[k]; @@ -612,76 +476,27 @@ MaceStatus GroupDeconv2dK3x3S1::Compute(const OpContext *context, } } } - }, 0, batch, 1, 0, group_, 1, 0, outch_g, 2); - - UnPadOutput(*out_tensor, out_pad_size, output); + }, 0, p.batch, 1, 0, group_, 1, 0, p.outch_g, 2); return MaceStatus::MACE_SUCCESS; } -MaceStatus GroupDeconv2dK3x3S2::Compute(const OpContext *context, - const Tensor *input, - const Tensor *filter, - const Tensor *output_shape, - Tensor *output) { - std::unique_ptr padded_out; - std::vector out_pad_size; - ResizeOutAndPadOut(context, - input, - filter, - output_shape, - output, - &out_pad_size, - &padded_out); - - Tensor *out_tensor = output; - if (padded_out != nullptr) { - out_tensor = padded_out.get(); - } - - out_tensor->Clear(); - - Tensor::MappingGuard input_mapper(input); - Tensor::MappingGuard filter_mapper(filter); - Tensor::MappingGuard output_mapper(output); - - auto input_data = input->data(); - auto filter_data = filter->data(); - auto padded_out_data = out_tensor->mutable_data(); - - auto &in_shape = input->shape(); - auto &out_shape = out_tensor->shape(); - - const index_t batch = in_shape[0]; - const index_t inch = in_shape[1]; - const index_t h = in_shape[2]; - const index_t w = in_shape[3]; - - const index_t outch = out_shape[1]; - const index_t outh = out_shape[2]; - const index_t outw = out_shape[3]; - - const index_t in_img_size = h * w; - const index_t out_img_size = outh * outw; - - const index_t inch_g = inch / group_; - const index_t outch_g = outch / group_; - - utils::ThreadPool - &thread_pool = context->device()->cpu_runtime()->thread_pool(); - - thread_pool.Compute3D([=](index_t start0, index_t end0, index_t step0, - index_t start1, index_t end1, index_t step1, - index_t start2, index_t end2, index_t step2) { +template<> +MaceStatus GroupDeconv2dK3x3S2::DoCompute( + const GroupDeconvComputeParam &p, const float *filter_data, + const float *input_data, float *padded_out_data) { + p.thread_pool.Compute3D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1, + index_t start2, index_t end2, index_t step2) { for (index_t b = start0; b < end0; b += step0) { for (index_t g = start1; g < end1; g += step1) { for (index_t oc = start2; oc < end2; oc += step2) { - const index_t out_offset = b * outch + outch_g * g + oc; - float *out_base = padded_out_data + out_offset * out_img_size; - for (index_t ic = 0; ic < inch_g; ++ic) { - const index_t in_offset = b * inch + inch_g * g + ic; - const float *input_base = input_data + in_offset * in_img_size; - const index_t kernel_offset = (oc * group_ + g) * inch_g + ic; + const index_t out_offset = b * p.out_channels + p.outch_g * g + oc; + float *out_base = padded_out_data + out_offset * p.out_img_size; + for (index_t ic = 0; ic < p.inch_g; ++ic) { + const index_t in_offset = b * p.in_channels + p.inch_g * g + ic; + const float *input_base = input_data + in_offset * p.in_img_size; + const index_t kernel_offset = (oc * group_ + g) * p.inch_g + ic; const float *kernel_base = filter_data + kernel_offset * 9; const 
float *in = input_base; @@ -693,15 +508,15 @@ MaceStatus GroupDeconv2dK3x3S2::Compute(const OpContext *context, float32x4_t k1_vec = vld1q_f32(k1); float32x4_t k2_vec = vld1q_f32(k2); - for (index_t i = 0; i < h; ++i) { - float *out_row_base = out_base + i * 2 * outw; + for (index_t i = 0; i < p.in_height; ++i) { + float *out_row_base = out_base + i * 2 * p.out_width; float *out_row_0 = out_row_base; - float *out_row_1 = out_row_0 + outw; - float *out_row_2 = out_row_1 + outw; + float *out_row_1 = out_row_0 + p.out_width; + float *out_row_2 = out_row_1 + p.out_width; index_t j = 0; - for (index_t n = 0; n + 9 < outw; n += 8) { + for (index_t n = 0; n + 9 < p.out_width; n += 8) { float32x4_t in_vec = vld1q_f32(in); // out row 0 @@ -750,7 +565,7 @@ MaceStatus GroupDeconv2dK3x3S2::Compute(const OpContext *context, j += 4; } - for (; j < w; ++j) { + for (; j < p.in_width; ++j) { float val = in[0]; for (int k = 0; k < 3; ++k) { @@ -769,36 +584,11 @@ MaceStatus GroupDeconv2dK3x3S2::Compute(const OpContext *context, } } } - }, 0, batch, 1, 0, group_, 1, 0, outch_g, 1); - - UnPadOutput(*out_tensor, out_pad_size, output); + }, 0, p.batch, 1, 0, group_, 1, 0, p.outch_g, 1); return MaceStatus::MACE_SUCCESS; } -void RegisterDepthwiseDeconv2dK3x3Delegator(OpDelegatorRegistry *registry) { - MACE_REGISTER_DELEGATOR( - registry, DepthwiseDeconv2dK3x3S1, delegator::DepthwiseDeconv2dParam, - MACE_DELEGATOR_KEY_EX(DepthwiseDeconv2d, DeviceType::CPU, - float, ImplType::NEON, K3x3S1)); - MACE_REGISTER_DELEGATOR( - registry, DepthwiseDeconv2dK3x3S2, delegator::DepthwiseDeconv2dParam, - MACE_DELEGATOR_KEY_EX(DepthwiseDeconv2d, DeviceType::CPU, - float, ImplType::NEON, K3x3S2)); -} - -void RegisterGroupDeconv2dK3x3Delegator(OpDelegatorRegistry *registry) { - MACE_REGISTER_DELEGATOR( - registry, GroupDeconv2dK3x3S1, delegator::GroupDeconv2dParam, - MACE_DELEGATOR_KEY_EX(GroupDeconv2d, DeviceType::CPU, - float, ImplType::NEON, K3x3S1)); - MACE_REGISTER_DELEGATOR( - registry, GroupDeconv2dK3x3S2, delegator::GroupDeconv2dParam, - MACE_DELEGATOR_KEY_EX(GroupDeconv2d, DeviceType::CPU, - float, ImplType::NEON, K3x3S2)); -} - -} // namespace fp32 } // namespace arm } // namespace ops } // namespace mace diff --git a/mace/ops/arm/fp32/depthwise_deconv_2d_4x4.cc b/mace/ops/arm/fp32/depthwise_deconv_2d_4x4.cc index 6f313c591212008b0c614cfebbf24d5dfebdc1a1..529b728fcb6baf2d6b04585d59025bea552d6ef5 100644 --- a/mace/ops/arm/fp32/depthwise_deconv_2d_4x4.cc +++ b/mace/ops/arm/fp32/depthwise_deconv_2d_4x4.cc @@ -12,69 +12,26 @@ // See the License for the specific language governing permissions and // limitations under the License. 
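With the registration blocks (RegisterDepthwiseDeconv2dK3x3Delegator, RegisterGroupDeconv2dK3x3Delegator) and the duplicated Compute() bodies removed, this file keeps only the template<> float specializations of DoCompute; the class itself is declared once as a template in the new arm/base header. A sketch of what that declaration plausibly looks like — the header path, base-class name, and constructor are inferred from the signatures in this diff, not shown here:

// Assumed shape of mace/ops/arm/base/depthwise_deconv_2d_3x3.h.
template <typename T>
class DepthwiseDeconv2dK3x3S1 : public DepthwiseDeconv2d<T> {  // base assumed
 public:
  explicit DepthwiseDeconv2dK3x3S1(
      const delegator::DepthwiseDeconv2dParam &param)
      : DepthwiseDeconv2d<T>(param) {}

 protected:
  // Specialized for float (NEON) in this file; p carries batch, in_channels,
  // in_height/in_width, in_img_size/out_img_size and the thread pool.
  MaceStatus DoCompute(const DepthwiseDeconvComputeParam &p,
                       const T *filter_data, const T *input_data,
                       T *padded_out_data) override;
};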
-#include "mace/ops/arm/fp32/depthwise_deconv_2d_4x4.h" - #include + +#include "mace/ops/arm/base/depthwise_deconv_2d_4x4.h" #include "mace/ops/arm/fp32/common_neon.h" namespace mace { namespace ops { namespace arm { -namespace fp32 { - -MaceStatus DepthwiseDeconv2dK4x4S1::Compute(const OpContext *context, - const Tensor *input, - const Tensor *filter, - const Tensor *output_shape, - Tensor *output) { - std::unique_ptr padded_out; - std::vector out_pad_size; - group_ = input->dim(1); - ResizeOutAndPadOut(context, - input, - filter, - output_shape, - output, - &out_pad_size, - &padded_out); - - Tensor *out_tensor = output; - if (padded_out != nullptr) { - out_tensor = padded_out.get(); - } - - out_tensor->Clear(); - - Tensor::MappingGuard input_mapper(input); - Tensor::MappingGuard filter_mapper(filter); - Tensor::MappingGuard output_mapper(output); - - auto input_data = input->data(); - auto filter_data = filter->data(); - auto padded_out_data = out_tensor->mutable_data(); - - auto &in_shape = input->shape(); - auto &out_shape = out_tensor->shape(); - - const index_t batch = in_shape[0]; - const index_t channels = in_shape[1]; - const index_t h = in_shape[2]; - const index_t w = in_shape[3]; - const index_t in_img_size = h * w; - const index_t outh = out_shape[2]; - const index_t outw = out_shape[3]; - const index_t out_img_size = outh * outw; - - utils::ThreadPool - &thread_pool = context->device()->cpu_runtime()->thread_pool(); - - thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, - index_t start1, index_t end1, index_t step1) { + +template<> +MaceStatus DepthwiseDeconv2dK4x4S1::DoCompute( + const DepthwiseDeconvComputeParam &p, const float *filter_data, + const float *input_data, float *padded_out_data) { + p.thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { for (index_t b = start0; b < end0; b += step0) { for (index_t c = start1; c < end1; c += step1) { - const index_t offset = b * channels + c; - float *out_base = padded_out_data + offset * out_img_size; - const float *input_base = input_data + offset * in_img_size; + const index_t offset = b * p.in_channels + c; + float *out_base = padded_out_data + offset * p.out_img_size; + const float *input_base = input_data + offset * p.in_img_size; const float *kernel_base = filter_data + c * 16; const float *in = input_base; const float *k0 = kernel_base; @@ -87,15 +44,15 @@ MaceStatus DepthwiseDeconv2dK4x4S1::Compute(const OpContext *context, float32x4_t k2_vec = vld1q_f32(k2); float32x4_t k3_vec = vld1q_f32(k3); - for (index_t i = 0; i < h; i++) { - float *out_row = out_base + i * outw; + for (index_t i = 0; i < p.in_height; i++) { + float *out_row = out_base + i * p.out_width; float *out_row_0 = out_row; - float *out_row_1 = out_row_0 + outw; - float *out_row_2 = out_row_1 + outw; - float *out_row_3 = out_row_2 + outw; + float *out_row_1 = out_row_0 + p.out_width; + float *out_row_2 = out_row_1 + p.out_width; + float *out_row_3 = out_row_2 + p.out_width; index_t j = 0; - for (; j + 3 < w; j += 4) { + for (; j + 3 < p.in_width; j += 4) { float32x4_t in_vec = vld1q_f32(in); float32x4_t out00 = vld1q_f32(out_row_0); @@ -172,7 +129,7 @@ MaceStatus DepthwiseDeconv2dK4x4S1::Compute(const OpContext *context, out_row_3 += 4; } - for (; j < w; j++) { + for (; j < p.in_width; j++) { float val = in[0]; for (int k = 0; k < 4; ++k) { out_row_0[k] += val * k0[k]; @@ -189,66 +146,22 @@ MaceStatus DepthwiseDeconv2dK4x4S1::Compute(const OpContext *context, } } } - }, 
0, batch, 1, 0, channels, 1); - - UnPadOutput(*out_tensor, out_pad_size, output); + }, 0, p.batch, 1, 0, p.in_channels, 1); return MaceStatus::MACE_SUCCESS; } -MaceStatus DepthwiseDeconv2dK4x4S2::Compute(const OpContext *context, - const Tensor *input, - const Tensor *filter, - const Tensor *output_shape, - Tensor *output) { - std::unique_ptr padded_out; - std::vector out_pad_size; - group_ = input->dim(1); - ResizeOutAndPadOut(context, - input, - filter, - output_shape, - output, - &out_pad_size, - &padded_out); - - Tensor *out_tensor = output; - if (padded_out != nullptr) { - out_tensor = padded_out.get(); - } - - out_tensor->Clear(); - - Tensor::MappingGuard input_mapper(input); - Tensor::MappingGuard filter_mapper(filter); - Tensor::MappingGuard output_mapper(output); - - auto input_data = input->data(); - auto filter_data = filter->data(); - auto padded_out_data = out_tensor->mutable_data(); - - auto &in_shape = input->shape(); - auto &out_shape = out_tensor->shape(); - - const index_t batch = in_shape[0]; - const index_t channels = in_shape[1]; - const index_t h = in_shape[2]; - const index_t w = in_shape[3]; - const index_t in_img_size = h * w; - const index_t outh = out_shape[2]; - const index_t outw = out_shape[3]; - const index_t out_img_size = outh * outw; - - utils::ThreadPool - &thread_pool = context->device()->cpu_runtime()->thread_pool(); - - thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, - index_t start1, index_t end1, index_t step1) { +template<> +MaceStatus DepthwiseDeconv2dK4x4S2::DoCompute( + const DepthwiseDeconvComputeParam &p, const float *filter_data, + const float *input_data, float *padded_out_data) { + p.thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1) { for (index_t b = start0; b < end0; b += step0) { for (index_t c = start1; c < end1; c += step1) { - const index_t offset = b * channels + c; - float *out_base = padded_out_data + offset * out_img_size; - const float *input_base = input_data + offset * in_img_size; + const index_t offset = b * p.in_channels + c; + float *out_base = padded_out_data + offset * p.out_img_size; + const float *input_base = input_data + offset * p.in_img_size; const float *kernel_base = filter_data + c * 16; const float *in = input_base; @@ -262,17 +175,17 @@ MaceStatus DepthwiseDeconv2dK4x4S2::Compute(const OpContext *context, float32x4_t k2_vec = vld1q_f32(k2); float32x4_t k3_vec = vld1q_f32(k3); - for (index_t i = 0; i < h; i++) { - float *out_row = out_base + 2 * i * outw; + for (index_t i = 0; i < p.in_height; i++) { + float *out_row = out_base + 2 * i * p.out_width; float *out_row_0 = out_row; - float *out_row_1 = out_row_0 + outw; - float *out_row_2 = out_row_1 + outw; - float *out_row_3 = out_row_2 + outw; + float *out_row_1 = out_row_0 + p.out_width; + float *out_row_2 = out_row_1 + p.out_width; + float *out_row_3 = out_row_2 + p.out_width; index_t j = 0; - for (index_t n = 0; n + 9 < outw; n += 8) { + for (index_t n = 0; n + 9 < p.out_width; n += 8) { float32x4_t in_vec = vld1q_f32(in); // row 0 @@ -339,7 +252,7 @@ MaceStatus DepthwiseDeconv2dK4x4S2::Compute(const OpContext *context, j += 4; } - for (; j < w; j++) { + for (; j < p.in_width; j++) { float val = in[0]; for (int k = 0; k < 4; ++k) { out_row_0[k] += val * k0[k]; @@ -356,89 +269,40 @@ MaceStatus DepthwiseDeconv2dK4x4S2::Compute(const OpContext *context, } } } - }, 0, batch, 1, 0, channels, 1); - - UnPadOutput(*out_tensor, out_pad_size, output); + }, 0, p.batch, 1, 0, 
p.in_channels, 1); return MaceStatus::MACE_SUCCESS; } -MaceStatus GroupDeconv2dK4x4S1::Compute(const OpContext *context, - const Tensor *input, - const Tensor *filter, - const Tensor *output_shape, - Tensor *output) { - std::unique_ptr padded_out; - std::vector out_pad_size; - ResizeOutAndPadOut(context, - input, - filter, - output_shape, - output, - &out_pad_size, - &padded_out); - - Tensor *out_tensor = output; - if (padded_out != nullptr) { - out_tensor = padded_out.get(); - } - - out_tensor->Clear(); - - Tensor::MappingGuard input_mapper(input); - Tensor::MappingGuard filter_mapper(filter); - Tensor::MappingGuard output_mapper(output); - - auto input_data = input->data(); - auto filter_data = filter->data(); - auto padded_out_data = out_tensor->mutable_data(); - - auto &in_shape = input->shape(); - auto &out_shape = out_tensor->shape(); - - const index_t batch = in_shape[0]; - const index_t inch = in_shape[1]; - const index_t h = in_shape[2]; - const index_t w = in_shape[3]; - - const index_t outch = out_shape[1]; - const index_t outh = out_shape[2]; - const index_t outw = out_shape[3]; - - const index_t in_img_size = h * w; - const index_t out_img_size = outh * outw; - - const index_t inch_g = inch / group_; - const index_t outch_g = outch / group_; - - utils::ThreadPool - &thread_pool = context->device()->cpu_runtime()->thread_pool(); - - thread_pool.Compute3D([=](index_t start0, index_t end0, index_t step0, - index_t start1, index_t end1, index_t step1, - index_t start2, index_t end2, index_t step2) { +template<> +MaceStatus GroupDeconv2dK4x4S1::DoCompute( + const GroupDeconvComputeParam &p, const float *filter_data, + const float *input_data, float *padded_out_data) { + p.thread_pool.Compute3D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1, + index_t start2, index_t end2, index_t step2) { for (index_t b = start0; b < end0; b += step0) { for (index_t g = start1; g < end1; g += step1) { for (index_t oc = start2; oc < end2; oc += step2) { - if (oc + 1 < outch_g) { + if (oc + 1 < p.outch_g) { const index_t out_offset = - (b * outch + outch_g * g + oc) * out_img_size; + (b * p.out_channels + p.outch_g * g + oc) * p.out_img_size; float *out_base = padded_out_data + out_offset; - float *out_base1 = out_base + out_img_size; - for (index_t ic = 0; ic < inch_g; ic++) { + float *out_base1 = out_base + p.out_img_size; + for (index_t ic = 0; ic < p.inch_g; ic++) { const index_t in_offset = - (b * inch + inch_g * g + ic) * in_img_size; + (b * p.in_channels + p.inch_g * g + ic) * p.in_img_size; const float *input_base = input_data + in_offset; const float *in = input_base; const index_t kernel_offset = - ((oc * group_ + g) * inch_g + ic) * 16; + ((oc * group_ + g) * p.inch_g + ic) * 16; const float *kernel_base = filter_data + kernel_offset; const float *k0 = kernel_base; const float *k1 = kernel_base + 4; const float *k2 = kernel_base + 8; const float *k3 = kernel_base + 12; - const float *kernel_base1 = kernel_base + inch * 16; + const float *kernel_base1 = kernel_base + p.in_channels * 16; const float *k10 = kernel_base1; const float *k11 = kernel_base1 + 4; const float *k12 = kernel_base1 + 8; @@ -454,24 +318,24 @@ MaceStatus GroupDeconv2dK4x4S1::Compute(const OpContext *context, float32x4_t k12_vec = vld1q_f32(k12); float32x4_t k13_vec = vld1q_f32(k13); - for (index_t i = 0; i < h; i++) { - float *out_row = out_base + i * outw; + for (index_t i = 0; i < p.in_height; i++) { + float *out_row = out_base + i * p.out_width; float *out_row_0 = 
out_row; - float *out_row_1 = out_row_0 + outw; - float *out_row_2 = out_row_1 + outw; - float *out_row_3 = out_row_2 + outw; + float *out_row_1 = out_row_0 + p.out_width; + float *out_row_2 = out_row_1 + p.out_width; + float *out_row_3 = out_row_2 + p.out_width; - float *out_row1 = out_base1 + i * outw; + float *out_row1 = out_base1 + i * p.out_width; float *out_row1_0 = out_row1; - float *out_row1_1 = out_row1_0 + outw; - float *out_row1_2 = out_row1_1 + outw; - float *out_row1_3 = out_row1_2 + outw; + float *out_row1_1 = out_row1_0 + p.out_width; + float *out_row1_2 = out_row1_1 + p.out_width; + float *out_row1_3 = out_row1_2 + p.out_width; index_t j = 0; - for (; j + 3 < w; j += 4) { + for (; j + 3 < p.in_width; j += 4) { float32x4_t in_vec = vld1q_f32(in); float32x4_t out00, out01, out02, out03; float32x4_t out10, out11, out12, out13; @@ -618,7 +482,7 @@ MaceStatus GroupDeconv2dK4x4S1::Compute(const OpContext *context, out_row1_3 += 4; } - for (; j < w; j++) { + for (; j < p.in_width; j++) { float val = in[0]; for (int k = 0; k < 4; ++k) { out_row_0[k] += val * k0[k]; @@ -644,13 +508,13 @@ MaceStatus GroupDeconv2dK4x4S1::Compute(const OpContext *context, } } else { const index_t out_offset = - (b * outch + outch_g * g + oc) * out_img_size; + (b * p.out_channels + p.outch_g * g + oc) * p.out_img_size; float *out_base = padded_out_data + out_offset; - for (index_t ic = 0; ic < inch_g; ++ic) { + for (index_t ic = 0; ic < p.inch_g; ++ic) { const index_t in_offset = - (b * inch + inch_g * g + ic) * in_img_size; + (b * p.in_channels + p.inch_g * g + ic) * p.in_img_size; const index_t kernel_offset = - ((oc * group_ + g) * inch_g + ic) * 16; + ((oc * group_ + g) * p.inch_g + ic) * 16; const float *input_base = input_data + in_offset; const float *kernel_base = filter_data + kernel_offset; @@ -665,15 +529,15 @@ MaceStatus GroupDeconv2dK4x4S1::Compute(const OpContext *context, float32x4_t k2_vec = vld1q_f32(k2); float32x4_t k3_vec = vld1q_f32(k3); - for (index_t i = 0; i < h; i++) { - float *out_row = out_base + i * outw; + for (index_t i = 0; i < p.in_height; i++) { + float *out_row = out_base + i * p.out_width; float *out_row_0 = out_row; - float *out_row_1 = out_row_0 + outw; - float *out_row_2 = out_row_1 + outw; - float *out_row_3 = out_row_2 + outw; + float *out_row_1 = out_row_0 + p.out_width; + float *out_row_2 = out_row_1 + p.out_width; + float *out_row_3 = out_row_2 + p.out_width; index_t j = 0; - for (; j + 3 < w; j += 4) { + for (; j + 3 < p.in_width; j += 4) { float32x4_t in_vec = vld1q_f32(in); float32x4_t out00 = vld1q_f32(out_row_0); @@ -750,7 +614,7 @@ MaceStatus GroupDeconv2dK4x4S1::Compute(const OpContext *context, out_row_3 += 4; } - for (; j < w; j++) { + for (; j < p.in_width; j++) { float val = in[0]; for (int k = 0; k < 4; ++k) { out_row_0[k] += val * k0[k]; @@ -770,78 +634,29 @@ MaceStatus GroupDeconv2dK4x4S1::Compute(const OpContext *context, } } } - }, 0, batch, 1, 0, group_, 1, 0, outch_g, 2); - - UnPadOutput(*out_tensor, out_pad_size, output); + }, 0, p.batch, 1, 0, group_, 1, 0, p.outch_g, 2); return MaceStatus::MACE_SUCCESS; } -MaceStatus GroupDeconv2dK4x4S2::Compute(const OpContext *context, - const Tensor *input, - const Tensor *filter, - const Tensor *output_shape, - Tensor *output) { - std::unique_ptr padded_out; - std::vector out_pad_size; - ResizeOutAndPadOut(context, - input, - filter, - output_shape, - output, - &out_pad_size, - &padded_out); - - Tensor *out_tensor = output; - if (padded_out != nullptr) { - out_tensor = padded_out.get(); - } - - 
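
The Compute() boilerplate deleted in these hunks (resize/pad the output, Clear(), map the tensors, derive the batch/channel/image sizes) is not lost: judging from the DoCompute() specializations added in this file, it presumably now lives once in the shared arm/base deconv delegator, which fills a DepthwiseDeconvComputeParam / GroupDeconvComputeParam and then dispatches to the kernel-specific inner loop. A minimal, self-contained sketch of that template-method split, using simplified stand-in types rather than MACE's real signatures:

    #include <cstdio>

    // Simplified stand-in for MACE's *DeconvComputeParam (not the real type).
    struct DeconvComputeParam {
      int batch, in_channels, in_height, in_width;
      int out_height, out_width, in_img_size, out_img_size;
    };

    class DeconvKernel {
     public:
      virtual ~DeconvKernel() = default;
      // Shared wrapper: do the size bookkeeping once (the real code also
      // resizes, pads, clears, and maps the tensors here), then hand the
      // precomputed parameter bundle to the kernel-specific inner loop.
      int Compute(const float *filter, const float *input, float *output,
                  DeconvComputeParam p) {
        p.in_img_size = p.in_height * p.in_width;
        p.out_img_size = p.out_height * p.out_width;
        return DoCompute(p, filter, input, output);
      }

     protected:
      virtual int DoCompute(const DeconvComputeParam &p, const float *filter,
                            const float *input, float *padded_out) = 0;
    };

    class DepthwiseDeconvK4x4S1 : public DeconvKernel {
     protected:
      int DoCompute(const DeconvComputeParam &p, const float *, const float *,
                    float *) override {
        std::printf("4x4/s1 kernel over %dx%d input\n",
                    p.in_height, p.in_width);
        return 0;
      }
    };
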
out_tensor->Clear(); - - Tensor::MappingGuard input_mapper(input); - Tensor::MappingGuard filter_mapper(filter); - Tensor::MappingGuard output_mapper(output); - - auto input_data = input->data(); - auto filter_data = filter->data(); - auto padded_out_data = out_tensor->mutable_data(); - - auto &in_shape = input->shape(); - auto &out_shape = out_tensor->shape(); - - const index_t batch = in_shape[0]; - const index_t inch = in_shape[1]; - const index_t h = in_shape[2]; - const index_t w = in_shape[3]; - - const index_t outch = out_shape[1]; - const index_t outh = out_shape[2]; - const index_t outw = out_shape[3]; - - const index_t in_img_size = h * w; - const index_t out_img_size = outh * outw; - - const index_t inch_g = inch / group_; - const index_t outch_g = outch / group_; - - utils::ThreadPool - &thread_pool = context->device()->cpu_runtime()->thread_pool(); - - thread_pool.Compute3D([=](index_t start0, index_t end0, index_t step0, - index_t start1, index_t end1, index_t step1, - index_t start2, index_t end2, index_t step2) { +template<> +MaceStatus GroupDeconv2dK4x4S2::DoCompute( + const GroupDeconvComputeParam &p, const float *filter_data, + const float *input_data, float *padded_out_data) { + p.thread_pool.Compute3D([=](index_t start0, index_t end0, index_t step0, + index_t start1, index_t end1, index_t step1, + index_t start2, index_t end2, index_t step2) { for (index_t b = start0; b < end0; b += step0) { for (index_t g = start1; g < end1; g += step1) { for (index_t oc = start2; oc < end2; oc += step2) { const index_t out_offset = - (b * outch + outch_g * g + oc) * out_img_size; + (b * p.out_channels + p.outch_g * g + oc) * p.out_img_size; float *out_base = padded_out_data + out_offset; - for (index_t ic = 0; ic < inch_g; ic++) { + for (index_t ic = 0; ic < p.inch_g; ic++) { const index_t in_offset = - (b * inch + inch_g * g + ic) * in_img_size; + (b * p.in_channels + p.inch_g * g + ic) * p.in_img_size; const index_t kernel_offset = - ((oc * group_ + g) * inch_g + ic) * 16; + ((oc * group_ + g) * p.inch_g + ic) * 16; const float *input_base = input_data + in_offset; const float *kernel_base = filter_data + kernel_offset; const float *in = input_base; @@ -856,17 +671,17 @@ MaceStatus GroupDeconv2dK4x4S2::Compute(const OpContext *context, float32x4_t k2_vec = vld1q_f32(k2); float32x4_t k3_vec = vld1q_f32(k3); - for (index_t i = 0; i < h; i++) { - float *out_row = out_base + 2 * i * outw; + for (index_t i = 0; i < p.in_height; i++) { + float *out_row = out_base + 2 * i * p.out_width; float *out_row_0 = out_row; - float *out_row_1 = out_row_0 + outw; - float *out_row_2 = out_row_1 + outw; - float *out_row_3 = out_row_2 + outw; + float *out_row_1 = out_row_0 + p.out_width; + float *out_row_2 = out_row_1 + p.out_width; + float *out_row_3 = out_row_2 + p.out_width; index_t j = 0; - for (index_t n = 0; n + 9 < outw; n += 8) { + for (index_t n = 0; n + 9 < p.out_width; n += 8) { float32x4_t in_vec = vld1q_f32(in); // row 0 @@ -933,7 +748,7 @@ MaceStatus GroupDeconv2dK4x4S2::Compute(const OpContext *context, j += 4; } - for (; j < w; j++) { + for (; j < p.in_width; j++) { float val = in[0]; for (int k = 0; k < 4; ++k) { out_row_0[k] += val * k0[k]; @@ -952,36 +767,11 @@ MaceStatus GroupDeconv2dK4x4S2::Compute(const OpContext *context, } } } - }, 0, batch, 1, 0, group_, 1, 0, outch_g, 1); - - UnPadOutput(*out_tensor, out_pad_size, output); + }, 0, p.batch, 1, 0, group_, 1, 0, p.outch_g, 1); return MaceStatus::MACE_SUCCESS; } -void RegisterDepthwiseDeconv2dK4x4Delegator(OpDelegatorRegistry 
*registry) { - MACE_REGISTER_DELEGATOR( - registry, DepthwiseDeconv2dK4x4S1, delegator::DepthwiseDeconv2dParam, - MACE_DELEGATOR_KEY_EX(DepthwiseDeconv2d, DeviceType::CPU, - float, ImplType::NEON, K4x4S1)); - MACE_REGISTER_DELEGATOR( - registry, DepthwiseDeconv2dK4x4S2, delegator::DepthwiseDeconv2dParam, - MACE_DELEGATOR_KEY_EX(DepthwiseDeconv2d, DeviceType::CPU, - float, ImplType::NEON, K4x4S2)); -} - -void RegisterGroupDeconv2dK4x4Delegator(OpDelegatorRegistry *registry) { - MACE_REGISTER_DELEGATOR( - registry, GroupDeconv2dK4x4S1, delegator::GroupDeconv2dParam, - MACE_DELEGATOR_KEY_EX(GroupDeconv2d, DeviceType::CPU, - float, ImplType::NEON, K4x4S1)); - MACE_REGISTER_DELEGATOR( - registry, GroupDeconv2dK4x4S2, delegator::GroupDeconv2dParam, - MACE_DELEGATOR_KEY_EX(GroupDeconv2d, DeviceType::CPU, - float, ImplType::NEON, K4x4S2)); -} - -} // namespace fp32 } // namespace arm } // namespace ops } // namespace mace diff --git a/mace/ops/arm/fp32/gemm.cc b/mace/ops/arm/fp32/gemm.cc index d506d8b1dbec75121dc4d025b7e89eaf22da1ecf..123e3aaee0e46ded600cf8ff6182846eb23394b8 100644 --- a/mace/ops/arm/fp32/gemm.cc +++ b/mace/ops/arm/fp32/gemm.cc @@ -12,687 +12,498 @@ // See the License for the specific language governing permissions and // limitations under the License. - -#include "mace/ops/arm/fp32/gemm.h" - #include #include #include +#include "mace/ops/arm/base/gemm.h" #include "mace/port/env.h" namespace mace { namespace ops { namespace arm { -namespace fp32 { - -enum { kNoCache, kCacheLhs, kCacheRhs }; - -MaceStatus Gemm::Compute(const OpContext *context, - const Tensor *lhs, - const Tensor *rhs, - const index_t batch, - const index_t rows, - const index_t cols, - const index_t depth, - const MatrixMajor lhs_major, - const MatrixMajor rhs_major, - const MatrixMajor output_major, - const bool lhs_batched, - const bool rhs_batched, - Tensor *output) { - MACE_CHECK(output->size() == batch * rows * cols, - "Need resize output tensor before call gemm."); - Tensor::MappingGuard lhs_guard(lhs); - Tensor::MappingGuard rhs_guard(rhs); - Tensor::MappingGuard output_guard(output); - const float *lhs_data = lhs->data(); - const float *rhs_data = rhs->data(); - float *output_data = output->mutable_data(); - -#ifdef __aarch64__ - const index_t row_block_size = 8; -#else - const index_t row_block_size = 4; -#endif - const index_t col_block_size = 8; - const index_t depth_block_size = 4; - const index_t row_block_count = RoundUpDiv(rows, row_block_size); - const index_t col_block_count = RoundUpDiv(cols, col_block_size); - const index_t rows_padded = RoundUp(rows, row_block_size); - const index_t cols_padded = RoundUp(cols, col_block_size); - const index_t depth_padded = RoundUp(depth, depth_block_size); - - ScratchBuffer *scratch = context->device()->scratch_buffer(); - index_t packed_lhs_size = - PadAlignSize(sizeof(float) * rows_padded * depth_padded); - index_t packed_rhs_size = - PadAlignSize(sizeof(float) * depth_padded * cols_padded); - index_t packed_output_size = - PadAlignSize(sizeof(float) * rows_padded * cols_padded); - // resize to the total size of lhs & rhs & output anyway, - // in case we do not cache const tensor for saving memory - MACE_RETURN_IF_ERROR(scratch->GrowSize( - packed_lhs_size + packed_rhs_size + packed_output_size)); - float *packed_lhs_data = - scratch->Scratch(packed_lhs_size).mutable_data(); - float *packed_rhs_data = - scratch->Scratch(packed_rhs_size).mutable_data(); - float *packed_output_data = - scratch->Scratch(packed_output_size).mutable_data(); +template<> 
+template<> +void Gemm::Pack<4, 4>(const MatrixMap &matrix, + MatrixMajor dst_major, + float *packed_matrix) { + const index_t rows = matrix.rows(); + const index_t cols = matrix.cols(); - int cache_side = kNoCache; - if (cached_ == kCacheLhs) { - packed_lhs_data = pack_cache_.mutable_data(); - } else if (cached_ == kCacheRhs) { - packed_rhs_data = pack_cache_.mutable_data(); - } else if (should_cache_pack_) { - if (lhs->is_weight() && (!lhs_batched || batch == 1)) { - cache_side = kCacheLhs; - pack_cache_.Resize(packed_lhs_size); - packed_lhs_data = pack_cache_.mutable_data(); - } else if (rhs->is_weight() && (!rhs_batched || batch == 1)) { - cache_side = kCacheRhs; - pack_cache_.Resize(packed_rhs_size); - packed_rhs_data = pack_cache_.mutable_data(); - } + // use the same terminology as GemmLowp: + // depth is depth, width is the opposite dim other than depth + // lhs + index_t width = rows; + index_t depth = cols; + index_t width_stride = matrix.rows_stride(); + index_t depth_stride = matrix.cols_stride(); + if (dst_major == RowMajor) { + // rhs + std::swap(width, depth); + std::swap(width_stride, depth_stride); } + const float *data = matrix.data(); + float *packed_ptr = packed_matrix; - utils::ThreadPool - &thread_pool = context->device()->cpu_runtime()->thread_pool(); - - for (index_t b = 0; b < batch; ++b) { - MatrixMap - lhs_matrix - (lhs_data + static_cast(lhs_batched) * b * rows * depth, - lhs_major, - rows, - depth); - MatrixMap - rhs_matrix - (rhs_data + static_cast(rhs_batched) * b * depth * cols, - rhs_major, - depth, - cols); - MatrixMap output_matrix - (output_data + b * rows * cols, output_major, rows, cols); + const index_t block_size = 4; + const index_t depth_padded = RoundUp(depth, static_cast(4)); - // pack lhs - if (cached_ != kCacheLhs) { - thread_pool.Compute1D([=, &lhs_matrix](index_t start, - index_t end, - index_t step) { - for (index_t row_block_idx = start; row_block_idx < end; - row_block_idx += step) { - const index_t start_row = row_block_idx * row_block_size; - const index_t - row_block_len = std::min(row_block_size, rows - start_row); - float *packed_lhs_data_block = - packed_lhs_data + row_block_idx * row_block_size * depth_padded; - PackLhs(lhs_matrix.block(start_row, 0, row_block_len, depth), - packed_lhs_data_block); - } - }, 0, row_block_count, 1); + if (depth_padded > depth) { + memset(packed_ptr + depth * block_size, + 0, + sizeof(float) * (depth_padded - depth) * block_size); + } - if (cache_side == kCacheLhs) { - cached_ = kCacheLhs; - if (lhs->UnderlyingBuffer()->OnHost()) { - AdviseFree(reinterpret_cast(const_cast(lhs->data< - float>())), - lhs->raw_size()); - } + if (dst_major == matrix.matrix_major()) { + if (width < block_size) { + const index_t width_remain = block_size - width; + for (index_t d = 0; d < depth; ++d) { + memcpy(packed_ptr, data, sizeof(float) * width); + memset(packed_ptr + width, 0, sizeof(float) * width_remain); + data += depth_stride; + packed_ptr += block_size; + } + } else { + for (index_t d = 0; d < depth; ++d) { + float32x4_t vi = vld1q_f32(data); + vst1q_f32(packed_ptr, vi); + data += depth_stride; + packed_ptr += block_size; } } + } else { + if (width < block_size) { + const index_t width_remain = block_size - width; + for (index_t d = 0; d < depth; ++d) { + for (index_t w = 0; w < width; ++w) { + packed_ptr[w] = data[w * width_stride + d]; + } // w + memset(packed_ptr + width, 0, sizeof(float) * width_remain); + packed_ptr += block_size; + } // d + } else { + const float *data0 = data; + const float *data1 = data + 
width_stride; + const float *data2 = data1 + width_stride; + const float *data3 = data2 + width_stride; - // pack rhs - if (cached_ != kCacheRhs) { - thread_pool.Compute1D([=, &rhs_matrix](index_t start, - index_t end, - index_t step) { - for (index_t col_block_idx = start; col_block_idx < end; - col_block_idx += step) { - const index_t start_col = col_block_idx * col_block_size; - const index_t - col_block_len = std::min(col_block_size, cols - start_col); - float *packed_rhs_data_block = - packed_rhs_data + col_block_idx * col_block_size * depth_padded; - PackRhs(rhs_matrix.block(0, start_col, depth, col_block_len), - packed_rhs_data_block); - } - }, 0, col_block_count, 1); + const index_t depth_block = depth / 4; + const index_t depth_remain = depth - depth_block * 4; + for (index_t depth_block_idx = 0; depth_block_idx < depth_block; + ++depth_block_idx) { + float32x4_t v0 = vld1q_f32(data0); + float32x4_t v1 = vld1q_f32(data1); + float32x4_t v2 = vld1q_f32(data2); + float32x4_t v3 = vld1q_f32(data3); + float32x4x2_t v02_intertwined = vzipq_f32(v0, v2); + float32x4x2_t v13_intertwined = vzipq_f32(v1, v3); + float32x4x2_t v0123_intertwined = + vzipq_f32(v02_intertwined.val[0], v13_intertwined.val[0]); + float32x4x2_t v0123n_intertwined = + vzipq_f32(v02_intertwined.val[1], v13_intertwined.val[1]); - if (cache_side == kCacheRhs) { - cached_ = kCacheRhs; - if (rhs->UnderlyingBuffer()->OnHost()) { - AdviseFree(reinterpret_cast(const_cast(rhs->data< - float>())), - rhs->raw_size()); - } - } - } + vst1q_f32(packed_ptr, v0123_intertwined.val[0]); + packed_ptr += 4; - // multiply lhs and rhs - thread_pool.Compute1D([=, &output_matrix](index_t start, - index_t end, - index_t step) { - for (index_t row_block_idx = start; row_block_idx < end; - row_block_idx += step) { - const index_t start_row = row_block_idx * row_block_size; - const index_t - row_block_len = std::min(row_block_size, rows - start_row); - const float *packed_lhs_data_block = - packed_lhs_data + row_block_idx * row_block_size * depth_padded; + vst1q_f32(packed_ptr, v0123_intertwined.val[1]); + packed_ptr += 4; - for (index_t col_block_idx = 0; col_block_idx < col_block_count; - ++col_block_idx) { - const index_t start_col = col_block_idx * col_block_size; - const index_t - col_block_len = std::min(col_block_size, cols - start_col); - const float *packed_rhs_data_block = - packed_rhs_data + col_block_idx * col_block_size * depth_padded; - float *packed_output_data_block = - packed_output_data + row_block_idx * row_block_size * cols_padded - + col_block_idx * col_block_size; - ComputeBlock(packed_lhs_data_block, - packed_rhs_data_block, - depth_padded, - packed_output_data_block); - MatrixMap output_block = output_matrix.block(start_row, - start_col, - row_block_len, - col_block_len); - UnpackOutput(packed_output_data_block, &output_block); - } // col_block_idx - } // row_block_idx - }, 0, row_block_count, 1); - } // b + vst1q_f32(packed_ptr, v0123n_intertwined.val[0]); + packed_ptr += 4; - return MaceStatus::MACE_SUCCESS; -} + vst1q_f32(packed_ptr, v0123n_intertwined.val[1]); + packed_ptr += 4; -void Gemm::ComputeBlock(const float *packed_lhs_data, - const float *packed_rhs_data, - const index_t depth_padded, - float *packed_output_data) { - /* Ref: - for (index_t r = 0; r < block_size; ++r) { - for (index_t c = 0; c < block_size; ++c) { - float sum = 0; - for (index_t d = 0; d < depth; ++d) { - // (r, d) * (d, c) - sum += packed_lhs_data[d * r_block_size + r] - * packed_rhs_data[d * c_block_size + c]; + data0 += 4; + data1 += 4; + 
data2 += 4; + data3 += 4; } - packed_output_data[r * c_block_size + c] = sum; - } - } - */ - const float *lhs_ptr = packed_lhs_data; - const float *rhs_ptr = packed_rhs_data; + for (index_t d = 0; d < depth_remain; ++d) { + float32x4_t vi = {*data0, *data1, *data2, *data3}; + vst1q_f32(packed_ptr, vi); + packed_ptr += 4; - const index_t depth_block_count = depth_padded / 4; + ++data0; + ++data1; + ++data2; + ++data3; + } // d + } + } +} -#ifdef __aarch64__ - // Register layout: (8x4) x (4,8) - // - // +--------+--------+ - // | v8 ... | v9 ... | - // Rhs +--------+--------+ - // | v10... | v11... | - // +--------+--------+ - // | v12... | v13... | - // +--------+--------+ - // | v14... | v15... | - // +--------+--------+ - // - // Lhs - // - // +----+----+----+----+ - - +--------+--------+ - // | v0 | v2 | v4 | v6 | | v16... | v17... | - // | . | | | | | v18... | v19... | - // | . | | | | | v20... | v21... | - // | . | | | | | v22... | v23... | - // +----+----|----+----+ +--------+--------+ - // | v1 | v3 | v5 | v7 | | v24... | v25... | - // | . | | | | | v26... | v27... | - // | . | | | | | v28... | v29... | - // | . | | | | | v30... | v31... | - // +----+----|----+----+ +--------+--------+ - // - // Accumulator - // +template<> +template<> +void Gemm::Pack<8, 4>(const MatrixMap &matrix, + MatrixMajor dst_major, + float *packed_matrix) { + const index_t rows = matrix.rows(); + const index_t cols = matrix.cols(); - if (depth_block_count > 0) { - index_t r_depth_block_count = depth_block_count; - // just make compiler happy - MACE_UNUSED(r_depth_block_count); + // use the same terminology as GemmLowp: + // depth is depth, width is the opposite dim other than depth + // lhs + index_t width = rows; + index_t depth = cols; + index_t width_stride = matrix.rows_stride(); + index_t depth_stride = matrix.cols_stride(); + if (dst_major == RowMajor) { + // rhs + std::swap(width, depth); + std::swap(width_stride, depth_stride); + } + const float *data = matrix.data(); + float *packed_ptr = packed_matrix; - asm volatile( - "dup v16.4s, wzr \n" - "dup v17.4s, wzr \n" - "dup v18.4s, wzr \n" - "dup v19.4s, wzr \n" - "dup v20.4s, wzr \n" - "dup v21.4s, wzr \n" - "dup v22.4s, wzr \n" - "dup v23.4s, wzr \n" - "dup v24.4s, wzr \n" - "dup v25.4s, wzr \n" - "dup v26.4s, wzr \n" - "dup v27.4s, wzr \n" - "dup v28.4s, wzr \n" - "dup v29.4s, wzr \n" - "dup v30.4s, wzr \n" - "dup v31.4s, wzr \n" + const index_t block_size = 8; + const index_t depth_padded = RoundUp(depth, static_cast(4)); - // prelogue - "ld1 {v0.4s}, [%[lhs_ptr]], #16 \n" - "ld1 {v1.4s}, [%[lhs_ptr]], #16 \n" - "ld1 {v2.4s}, [%[lhs_ptr]], #16 \n" - "ld1 {v3.4s}, [%[lhs_ptr]], #16 \n" - "ld1 {v4.4s}, [%[lhs_ptr]], #16 \n" - "ld1 {v5.4s}, [%[lhs_ptr]], #16 \n" - "ld1 {v6.4s}, [%[lhs_ptr]], #16 \n" - "ld1 {v7.4s}, [%[lhs_ptr]], #16 \n" + if (depth_padded > depth) { + memset(packed_ptr + depth * block_size, + 0, + sizeof(float) * (depth_padded - depth) * block_size); + } - "ld1 {v8.4s}, [%[rhs_ptr]], #16 \n" - "ld1 {v9.4s}, [%[rhs_ptr]], #16 \n" - "ld1 {v10.4s}, [%[rhs_ptr]], #16 \n" - "ld1 {v11.4s}, [%[rhs_ptr]], #16 \n" - "ld1 {v12.4s}, [%[rhs_ptr]], #16 \n" - "ld1 {v13.4s}, [%[rhs_ptr]], #16 \n" - "ld1 {v14.4s}, [%[rhs_ptr]], #16 \n" - "ld1 {v15.4s}, [%[rhs_ptr]], #16 \n" + if (dst_major == matrix.matrix_major()) { + if (width < block_size) { + const index_t width_remain = block_size - width; + for (index_t d = 0; d < depth; ++d) { + memcpy(packed_ptr, data, sizeof(float) * width); + memset(packed_ptr + width, 0, sizeof(float) * width_remain); + 
data += depth_stride; + packed_ptr += block_size; + } + } else { + for (index_t d = 0; d < depth; ++d) { + float32x4_t vi = vld1q_f32(data); + vst1q_f32(packed_ptr, vi); + float32x4_t vin = vld1q_f32(data + 4); + vst1q_f32(packed_ptr + 4, vin); + data += depth_stride; + packed_ptr += block_size; + } + } + } else { + if (width < block_size) { + const index_t width_remain = block_size - width; + for (index_t d = 0; d < depth; ++d) { + for (index_t w = 0; w < width; ++w) { + packed_ptr[w] = data[w * width_stride + d]; + } // w + memset(packed_ptr + width, 0, sizeof(float) * width_remain); + packed_ptr += block_size; + } // d + } else { + const float *data0 = data; + const float *data1 = data + width_stride; + const float *data2 = data1 + width_stride; + const float *data3 = data2 + width_stride; + const float *data4 = data3 + width_stride; + const float *data5 = data4 + width_stride; + const float *data6 = data5 + width_stride; + const float *data7 = data6 + width_stride; - "subs %[r_depth_block_count], %[r_depth_block_count], #1 \n" - "beq 1f\n" + const index_t depth_block = depth / 4; + const index_t depth_remain = depth - depth_block * 4; + for (index_t depth_block_idx = 0; depth_block_idx < depth_block; + ++depth_block_idx) { + float32x4_t v0 = vld1q_f32(data0); + float32x4_t v1 = vld1q_f32(data1); + float32x4_t v2 = vld1q_f32(data2); + float32x4_t v3 = vld1q_f32(data3); + float32x4x2_t v02_intertwined = vzipq_f32(v0, v2); + float32x4x2_t v13_intertwined = vzipq_f32(v1, v3); + float32x4x2_t v0123_intertwined = + vzipq_f32(v02_intertwined.val[0], v13_intertwined.val[0]); + float32x4x2_t v0123n_intertwined = + vzipq_f32(v02_intertwined.val[1], v13_intertwined.val[1]); - "0: \n" - "fmla v16.4s, v8.4s, v0.s[0] \n" - "fmla v17.4s, v9.4s, v0.s[0] \n" - "fmla v18.4s, v8.4s, v0.s[1] \n" - "fmla v19.4s, v9.4s, v0.s[1] \n" - "fmla v20.4s, v8.4s, v0.s[2] \n" - "fmla v21.4s, v9.4s, v0.s[2] \n" - "fmla v22.4s, v8.4s, v0.s[3] \n" - "fmla v23.4s, v9.4s, v0.s[3] \n" + float32x4_t v4 = vld1q_f32(data4); + float32x4_t v5 = vld1q_f32(data5); + float32x4_t v6 = vld1q_f32(data6); + float32x4_t v7 = vld1q_f32(data7); + float32x4x2_t v46_intertwined = vzipq_f32(v4, v6); + float32x4x2_t v57_intertwined = vzipq_f32(v5, v7); + float32x4x2_t v4567_intertwined = + vzipq_f32(v46_intertwined.val[0], v57_intertwined.val[0]); + float32x4x2_t v4567n_intertwined = + vzipq_f32(v46_intertwined.val[1], v57_intertwined.val[1]); - "ld1 {v0.4s}, [%[lhs_ptr]], #16 \n" + vst1q_f32(packed_ptr, v0123_intertwined.val[0]); + packed_ptr += 4; - "fmla v24.4s, v8.4s, v1.s[0] \n" - "fmla v25.4s, v9.4s, v1.s[0] \n" - "fmla v26.4s, v8.4s, v1.s[1] \n" - "fmla v27.4s, v9.4s, v1.s[1] \n" - "fmla v28.4s, v8.4s, v1.s[2] \n" - "fmla v29.4s, v9.4s, v1.s[2] \n" - "fmla v30.4s, v8.4s, v1.s[3] \n" - "fmla v31.4s, v9.4s, v1.s[3] \n" + vst1q_f32(packed_ptr, v4567_intertwined.val[0]); + packed_ptr += 4; - "ld1 {v1.4s}, [%[lhs_ptr]], #16 \n" - "ld1 {v8.4s}, [%[rhs_ptr]], #16 \n" - "ld1 {v9.4s}, [%[rhs_ptr]], #16 \n" + vst1q_f32(packed_ptr, v0123_intertwined.val[1]); + packed_ptr += 4; - "fmla v16.4s, v10.4s, v2.s[0] \n" - "fmla v17.4s, v11.4s, v2.s[0] \n" - "fmla v18.4s, v10.4s, v2.s[1] \n" - "fmla v19.4s, v11.4s, v2.s[1] \n" - "fmla v20.4s, v10.4s, v2.s[2] \n" - "fmla v21.4s, v11.4s, v2.s[2] \n" - "fmla v22.4s, v10.4s, v2.s[3] \n" - "fmla v23.4s, v11.4s, v2.s[3] \n" + vst1q_f32(packed_ptr, v4567_intertwined.val[1]); + packed_ptr += 4; - "ld1 {v2.4s}, [%[lhs_ptr]], #16 \n" + vst1q_f32(packed_ptr, v0123n_intertwined.val[0]); + packed_ptr += 4; - "fmla 
v24.4s, v10.4s, v3.s[0] \n" - "fmla v25.4s, v11.4s, v3.s[0] \n" - "fmla v26.4s, v10.4s, v3.s[1] \n" - "fmla v27.4s, v11.4s, v3.s[1] \n" - "fmla v28.4s, v10.4s, v3.s[2] \n" - "fmla v29.4s, v11.4s, v3.s[2] \n" - "fmla v30.4s, v10.4s, v3.s[3] \n" - "fmla v31.4s, v11.4s, v3.s[3] \n" + vst1q_f32(packed_ptr, v4567n_intertwined.val[0]); + packed_ptr += 4; - "ld1 {v3.4s}, [%[lhs_ptr]], #16 \n" - "ld1 {v10.4s}, [%[rhs_ptr]], #16 \n" - "ld1 {v11.4s}, [%[rhs_ptr]], #16 \n" + vst1q_f32(packed_ptr, v0123n_intertwined.val[1]); + packed_ptr += 4; - "fmla v16.4s, v12.4s, v4.s[0] \n" - "fmla v17.4s, v13.4s, v4.s[0] \n" - "fmla v18.4s, v12.4s, v4.s[1] \n" - "fmla v19.4s, v13.4s, v4.s[1] \n" - "fmla v20.4s, v12.4s, v4.s[2] \n" - "fmla v21.4s, v13.4s, v4.s[2] \n" - "fmla v22.4s, v12.4s, v4.s[3] \n" - "fmla v23.4s, v13.4s, v4.s[3] \n" + vst1q_f32(packed_ptr, v4567n_intertwined.val[1]); + packed_ptr += 4; - "ld1 {v4.4s}, [%[lhs_ptr]], #16 \n" + data0 += 4; + data1 += 4; + data2 += 4; + data3 += 4; + data4 += 4; + data5 += 4; + data6 += 4; + data7 += 4; + } + for (index_t d = 0; d < depth_remain; ++d) { + float32x4_t vi = {*data0, *data1, *data2, *data3}; + vst1q_f32(packed_ptr, vi); + packed_ptr += 4; - "fmla v24.4s, v12.4s, v5.s[0] \n" - "fmla v25.4s, v13.4s, v5.s[0] \n" - "fmla v26.4s, v12.4s, v5.s[1] \n" - "fmla v27.4s, v13.4s, v5.s[1] \n" - "fmla v28.4s, v12.4s, v5.s[2] \n" - "fmla v29.4s, v13.4s, v5.s[2] \n" - "fmla v30.4s, v12.4s, v5.s[3] \n" - "fmla v31.4s, v13.4s, v5.s[3] \n" + float32x4_t vin = {*data4, *data5, *data6, *data7}; + vst1q_f32(packed_ptr, vin); + packed_ptr += 4; - "ld1 {v5.4s}, [%[lhs_ptr]], #16 \n" - "ld1 {v12.4s}, [%[rhs_ptr]], #16 \n" - "ld1 {v13.4s}, [%[rhs_ptr]], #16 \n" - - "fmla v16.4s, v14.4s, v6.s[0] \n" - "fmla v17.4s, v15.4s, v6.s[0] \n" - "fmla v18.4s, v14.4s, v6.s[1] \n" - "fmla v19.4s, v15.4s, v6.s[1] \n" - "fmla v20.4s, v14.4s, v6.s[2] \n" - "fmla v21.4s, v15.4s, v6.s[2] \n" - "fmla v22.4s, v14.4s, v6.s[3] \n" - "fmla v23.4s, v15.4s, v6.s[3] \n" + ++data0; + ++data1; + ++data2; + ++data3; + ++data4; + ++data5; + ++data6; + ++data7; + } // d + } + } +} - "ld1 {v6.4s}, [%[lhs_ptr]], #16 \n" +template<> +template<> +void Gemm::Unpack<4, 8>(const float *packed_output, + MatrixMap *output) { + const index_t rows = output->rows(); + const index_t cols = output->cols(); + index_t row_stride = output->rows_stride(); + index_t col_stride = output->cols_stride(); - "subs %[r_depth_block_count], %[r_depth_block_count], #1 \n" + float *output_ptr = output->data(); + const float *packed_ptr = packed_output; - "fmla v24.4s, v14.4s, v7.s[0] \n" - "fmla v25.4s, v15.4s, v7.s[0] \n" - "fmla v26.4s, v14.4s, v7.s[1] \n" - "fmla v27.4s, v15.4s, v7.s[1] \n" - "fmla v28.4s, v14.4s, v7.s[2] \n" - "fmla v29.4s, v15.4s, v7.s[2] \n" - "fmla v30.4s, v14.4s, v7.s[3] \n" - "fmla v31.4s, v15.4s, v7.s[3] \n" + const index_t block_size = 8; - "ld1 {v7.4s}, [%[lhs_ptr]], #16 \n" - "ld1 {v14.4s}, [%[rhs_ptr]], #16 \n" - "ld1 {v15.4s}, [%[rhs_ptr]], #16 \n" + // packed_output always has row-major + if (output->matrix_major() == RowMajor) { + if (cols < block_size) { + for (index_t r = 0; r < rows; ++r) { + memcpy(output_ptr, packed_ptr, sizeof(float) * cols); + output_ptr += row_stride; + packed_ptr += block_size; + } + } else { + for (index_t r = 0; r < rows; ++r) { + float32x4_t vi = vld1q_f32(packed_ptr); + vst1q_f32(output_ptr, vi); + float32x4_t vin = vld1q_f32(packed_ptr + 4); + vst1q_f32(output_ptr + 4, vin); - "bne 0b \n" + output_ptr += row_stride; + packed_ptr += block_size; + } + } + } else { + // 
ColMajor + if (rows < block_size) { + for (index_t c = 0; c < cols; ++c) { + for (index_t r = 0; r < rows; ++r) { + output_ptr[c * col_stride + r] = packed_ptr[r * block_size + c]; + } // r + } // c + } else { + const float *data0 = packed_ptr; + const float *data1 = data0 + block_size; + const float *data2 = data1 + block_size; + const float *data3 = data2 + block_size; - // prologue - "1:\n" - "fmla v16.4s, v8.4s, v0.s[0] \n" - "fmla v17.4s, v9.4s, v0.s[0] \n" - "fmla v18.4s, v8.4s, v0.s[1] \n" - "fmla v19.4s, v9.4s, v0.s[1] \n" - "fmla v20.4s, v8.4s, v0.s[2] \n" - "fmla v21.4s, v9.4s, v0.s[2] \n" - "fmla v22.4s, v8.4s, v0.s[3] \n" - "fmla v23.4s, v9.4s, v0.s[3] \n" + index_t col_block = cols / 4; + index_t col_remain = cols - col_block * 4; + for (index_t col_block_idx = 0; col_block_idx < col_block; + ++col_block_idx) { + float32x4_t v0 = vld1q_f32(data0); + float32x4_t v1 = vld1q_f32(data1); + float32x4_t v2 = vld1q_f32(data2); + float32x4_t v3 = vld1q_f32(data3); + float32x4x2_t v02_intertwined = vzipq_f32(v0, v2); + float32x4x2_t v13_intertwined = vzipq_f32(v1, v3); + float32x4x2_t v0123_intertwined = + vzipq_f32(v02_intertwined.val[0], v13_intertwined.val[0]); + float32x4x2_t v0123n_intertwined = + vzipq_f32(v02_intertwined.val[1], v13_intertwined.val[1]); - "fmla v24.4s, v8.4s, v1.s[0] \n" - "fmla v25.4s, v9.4s, v1.s[0] \n" - "fmla v26.4s, v8.4s, v1.s[1] \n" - "fmla v27.4s, v9.4s, v1.s[1] \n" - "fmla v28.4s, v8.4s, v1.s[2] \n" - "fmla v29.4s, v9.4s, v1.s[2] \n" - "fmla v30.4s, v8.4s, v1.s[3] \n" - "fmla v31.4s, v9.4s, v1.s[3] \n" + vst1q_f32(output_ptr, v0123_intertwined.val[0]); + output_ptr += col_stride; - "fmla v16.4s, v10.4s, v2.s[0] \n" - "fmla v17.4s, v11.4s, v2.s[0] \n" - "fmla v18.4s, v10.4s, v2.s[1] \n" - "fmla v19.4s, v11.4s, v2.s[1] \n" - "fmla v20.4s, v10.4s, v2.s[2] \n" - "fmla v21.4s, v11.4s, v2.s[2] \n" - "fmla v22.4s, v10.4s, v2.s[3] \n" - "fmla v23.4s, v11.4s, v2.s[3] \n" + vst1q_f32(output_ptr, v0123_intertwined.val[1]); + output_ptr += col_stride; - "fmla v24.4s, v10.4s, v3.s[0] \n" - "fmla v25.4s, v11.4s, v3.s[0] \n" - "fmla v26.4s, v10.4s, v3.s[1] \n" - "fmla v27.4s, v11.4s, v3.s[1] \n" - "fmla v28.4s, v10.4s, v3.s[2] \n" - "fmla v29.4s, v11.4s, v3.s[2] \n" - "fmla v30.4s, v10.4s, v3.s[3] \n" - "fmla v31.4s, v11.4s, v3.s[3] \n" + vst1q_f32(output_ptr, v0123n_intertwined.val[0]); + output_ptr += col_stride; - "fmla v16.4s, v12.4s, v4.s[0] \n" - "fmla v17.4s, v13.4s, v4.s[0] \n" - "fmla v18.4s, v12.4s, v4.s[1] \n" - "fmla v19.4s, v13.4s, v4.s[1] \n" - "fmla v20.4s, v12.4s, v4.s[2] \n" - "fmla v21.4s, v13.4s, v4.s[2] \n" - "fmla v22.4s, v12.4s, v4.s[3] \n" - "fmla v23.4s, v13.4s, v4.s[3] \n" + vst1q_f32(output_ptr, v0123n_intertwined.val[1]); + output_ptr += col_stride; - "fmla v24.4s, v12.4s, v5.s[0] \n" - "fmla v25.4s, v13.4s, v5.s[0] \n" - "fmla v26.4s, v12.4s, v5.s[1] \n" - "fmla v27.4s, v13.4s, v5.s[1] \n" - "fmla v28.4s, v12.4s, v5.s[2] \n" - "fmla v29.4s, v13.4s, v5.s[2] \n" - "fmla v30.4s, v12.4s, v5.s[3] \n" - "fmla v31.4s, v13.4s, v5.s[3] \n" + data0 += 4; + data1 += 4; + data2 += 4; + data3 += 4; + } + for (index_t c = 0; c < col_remain; ++c) { + float32x4_t vi = {*data0, *data1, *data2, *data3}; + vst1q_f32(output_ptr, vi); + output_ptr += col_stride; - "fmla v16.4s, v14.4s, v6.s[0] \n" - "fmla v17.4s, v15.4s, v6.s[0] \n" - "fmla v18.4s, v14.4s, v6.s[1] \n" - "fmla v19.4s, v15.4s, v6.s[1] \n" - "fmla v20.4s, v14.4s, v6.s[2] \n" - "fmla v21.4s, v15.4s, v6.s[2] \n" - "fmla v22.4s, v14.4s, v6.s[3] \n" - "fmla v23.4s, v15.4s, v6.s[3] \n" + ++data0; + 
++data1; + ++data2; + ++data3; + } // d + } + } +} - "fmla v24.4s, v14.4s, v7.s[0] \n" - "fmla v25.4s, v15.4s, v7.s[0] \n" - "fmla v26.4s, v14.4s, v7.s[1] \n" - "fmla v27.4s, v15.4s, v7.s[1] \n" - "fmla v28.4s, v14.4s, v7.s[2] \n" - "fmla v29.4s, v15.4s, v7.s[2] \n" - "fmla v30.4s, v14.4s, v7.s[3] \n" - "fmla v31.4s, v15.4s, v7.s[3] \n" +template<> +template<> +void Gemm::Unpack<8, 8>(const float *packed_output, + MatrixMap *output) { + const index_t rows = output->rows(); + const index_t cols = output->cols(); + index_t row_stride = output->rows_stride(); + index_t col_stride = output->cols_stride(); - "st1 {v16.4s}, [%[packed_output_data]], #16 \n" - "st1 {v17.4s}, [%[packed_output_data]], #16 \n" - "st1 {v18.4s}, [%[packed_output_data]], #16 \n" - "st1 {v19.4s}, [%[packed_output_data]], #16 \n" - "st1 {v20.4s}, [%[packed_output_data]], #16 \n" - "st1 {v21.4s}, [%[packed_output_data]], #16 \n" - "st1 {v22.4s}, [%[packed_output_data]], #16 \n" - "st1 {v23.4s}, [%[packed_output_data]], #16 \n" - "st1 {v24.4s}, [%[packed_output_data]], #16 \n" - "st1 {v25.4s}, [%[packed_output_data]], #16 \n" - "st1 {v26.4s}, [%[packed_output_data]], #16 \n" - "st1 {v27.4s}, [%[packed_output_data]], #16 \n" - "st1 {v28.4s}, [%[packed_output_data]], #16 \n" - "st1 {v29.4s}, [%[packed_output_data]], #16 \n" - "st1 {v30.4s}, [%[packed_output_data]], #16 \n" - "st1 {v31.4s}, [%[packed_output_data]], #16 \n" - : // outputs - [lhs_ptr] "+r"(lhs_ptr), - [rhs_ptr] "+r"(rhs_ptr), - [packed_output_data] "+r"(packed_output_data), - [r_depth_block_count] "+r"(r_depth_block_count) - : // inputs - : // clabbers - "cc", "memory", - "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", - "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", - "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", - "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); - } -#else // armeabi-v7a - // Register layout: (4x4) x (4,8) - // - // +--------+--------+ - // | q4 ... | q5 ... | - // Rhs +--------+--------+ - // | q6 ... | q7 ... | - // +--------+--------+ - // | q4 ... | q5 ... | - // +--------+--------+ - // | q6 ... | q7 ... | - // +--------+--------+ - // - // Lhs - // - // +----+----+----+----+ - - +--------+--------+ - // | q0 | q1 | q2 | q3 | | q8... | q9... | - // | . | | | | | q10... | q11... | - // | . | | | | | q12... | q13... | - // | . | | | | | q14... | q15... | - // +----+----+----+----+ +--------+--------+ - // - // Accumulator - // + float *output_ptr = output->data(); + const float *packed_ptr = packed_output; - if (depth_block_count > 0) { - index_t r_depth_block_count = depth_block_count; - // just make compiler happy - MACE_UNUSED(r_depth_block_count); + const index_t block_size = 8; - asm volatile( - "mov r0, #0\n" - "vdup.f32 q8, r0 \n" - "vdup.f32 q9, r0 \n" - "vdup.f32 q10, r0 \n" - "vdup.f32 q11, r0 \n" - "vdup.f32 q12, r0 \n" - "vdup.f32 q13, r0 \n" - "vdup.f32 q14, r0 \n" - "vdup.f32 q15, r0 \n" - - // prelogue - "vld1.f32 {d0-d1}, [%[lhs_ptr]]! \n" - "vld1.f32 {d2-d3}, [%[lhs_ptr]]! \n" - "vld1.f32 {d4-d5}, [%[lhs_ptr]]! \n" - "vld1.f32 {d6-d7}, [%[lhs_ptr]]! \n" - - "vld1.f32 {d8-d9}, [%[rhs_ptr]]! \n" - "vld1.f32 {d10-d11}, [%[rhs_ptr]]! \n" - "vld1.f32 {d12-d13}, [%[rhs_ptr]]! \n" - "vld1.f32 {d14-d15}, [%[rhs_ptr]]! 
\n" - - "subs %[r_depth_block_count], %[r_depth_block_count], #1 \n" - "beq 1f\n" - - "0: \n" - - "vmla.f32 q8, q4, d0[0] \n" - "vmla.f32 q9, q5, d0[0] \n" - "vmla.f32 q10, q4, d0[1] \n" - "vmla.f32 q11, q5, d0[1] \n" - "vmla.f32 q12, q4, d1[0] \n" - "vmla.f32 q13, q5, d1[0] \n" - "vmla.f32 q14, q4, d1[1] \n" - "vmla.f32 q15, q5, d1[1] \n" - - "vld1.f32 {d0-d1}, [%[lhs_ptr]]! \n" - "vld1.f32 {d8-d9}, [%[rhs_ptr]]! \n" - "vld1.f32 {d10-d11}, [%[rhs_ptr]]! \n" - - "vmla.f32 q8, q6, d2[0] \n" - "vmla.f32 q9, q7, d2[0] \n" - "vmla.f32 q10, q6, d2[1] \n" - "vmla.f32 q11, q7, d2[1] \n" - "vmla.f32 q12, q6, d3[0] \n" - "vmla.f32 q13, q7, d3[0] \n" - "vmla.f32 q14, q6, d3[1] \n" - "vmla.f32 q15, q7, d3[1] \n" - - "vld1.f32 {d2-d3}, [%[lhs_ptr]]! \n" - "vld1.f32 {d12-d13}, [%[rhs_ptr]]! \n" - "vld1.f32 {d14-d15}, [%[rhs_ptr]]! \n" - - "vmla.f32 q8, q4, d4[0] \n" - "vmla.f32 q9, q5, d4[0] \n" - "vmla.f32 q10, q4, d4[1] \n" - "vmla.f32 q11, q5, d4[1] \n" - "vmla.f32 q12, q4, d5[0] \n" - "vmla.f32 q13, q5, d5[0] \n" - "vmla.f32 q14, q4, d5[1] \n" - "vmla.f32 q15, q5, d5[1] \n" - - "vld1.f32 {d4-d5}, [%[lhs_ptr]]! \n" - "vld1.f32 {d8-d9}, [%[rhs_ptr]]! \n" - "vld1.f32 {d10-d11}, [%[rhs_ptr]]! \n" - - "subs %[r_depth_block_count], %[r_depth_block_count], #1 \n" - - "vmla.f32 q8, q6, d6[0] \n" - "vmla.f32 q9, q7, d6[0] \n" - "vmla.f32 q10, q6, d6[1] \n" - "vmla.f32 q11, q7, d6[1] \n" - "vmla.f32 q12, q6, d7[0] \n" - "vmla.f32 q13, q7, d7[0] \n" - "vmla.f32 q14, q6, d7[1] \n" - "vmla.f32 q15, q7, d7[1] \n" + // packed_output always has row-major + if (output->matrix_major() == RowMajor) { + if (cols < block_size) { + for (index_t r = 0; r < rows; ++r) { + memcpy(output_ptr, packed_ptr, sizeof(float) * cols); + output_ptr += row_stride; + packed_ptr += block_size; + } + } else { + for (index_t r = 0; r < rows; ++r) { + float32x4_t vi = vld1q_f32(packed_ptr); + vst1q_f32(output_ptr, vi); + float32x4_t vin = vld1q_f32(packed_ptr + 4); + vst1q_f32(output_ptr + 4, vin); - "vld1.f32 {d6-d7}, [%[lhs_ptr]]! \n" - "vld1.f32 {d12-d13}, [%[rhs_ptr]]! \n" - "vld1.f32 {d14-d15}, [%[rhs_ptr]]! 
\n" + output_ptr += row_stride; + packed_ptr += block_size; + } + } + } else { + // ColMajor + if (rows < block_size) { + for (index_t c = 0; c < cols; ++c) { + for (index_t r = 0; r < rows; ++r) { + output_ptr[c * col_stride + r] = packed_ptr[r * block_size + c]; + } // r + } // c + } else { + const float *data0 = packed_ptr; + const float *data1 = data0 + block_size; + const float *data2 = data1 + block_size; + const float *data3 = data2 + block_size; + const float *data4 = data3 + block_size; + const float *data5 = data4 + block_size; + const float *data6 = data5 + block_size; + const float *data7 = data6 + block_size; - "bne 0b \n" + index_t col_block = cols / 4; + index_t col_remain = cols - col_block * 4; + for (index_t col_block_idx = 0; col_block_idx < col_block; + ++col_block_idx) { + float32x4_t v0 = vld1q_f32(data0); + float32x4_t v1 = vld1q_f32(data1); + float32x4_t v2 = vld1q_f32(data2); + float32x4_t v3 = vld1q_f32(data3); + float32x4x2_t v02_intertwined = vzipq_f32(v0, v2); + float32x4x2_t v13_intertwined = vzipq_f32(v1, v3); + float32x4x2_t v0123_intertwined = + vzipq_f32(v02_intertwined.val[0], v13_intertwined.val[0]); + float32x4x2_t v0123n_intertwined = + vzipq_f32(v02_intertwined.val[1], v13_intertwined.val[1]); - // prologue - "1:\n" - "vmla.f32 q8, q4, d0[0] \n" - "vmla.f32 q9, q5, d0[0] \n" - "vmla.f32 q10, q4, d0[1] \n" - "vmla.f32 q11, q5, d0[1] \n" - "vmla.f32 q12, q4, d1[0] \n" - "vmla.f32 q13, q5, d1[0] \n" - "vmla.f32 q14, q4, d1[1] \n" - "vmla.f32 q15, q5, d1[1] \n" + float32x4_t v4 = vld1q_f32(data4); + float32x4_t v5 = vld1q_f32(data5); + float32x4_t v6 = vld1q_f32(data6); + float32x4_t v7 = vld1q_f32(data7); + float32x4x2_t v46_intertwined = vzipq_f32(v4, v6); + float32x4x2_t v57_intertwined = vzipq_f32(v5, v7); + float32x4x2_t v4567_intertwined = + vzipq_f32(v46_intertwined.val[0], v57_intertwined.val[0]); + float32x4x2_t v4567n_intertwined = + vzipq_f32(v46_intertwined.val[1], v57_intertwined.val[1]); - "vld1.f32 {d8-d9}, [%[rhs_ptr]]! \n" - "vld1.f32 {d10-d11}, [%[rhs_ptr]]! \n" + vst1q_f32(output_ptr, v0123_intertwined.val[0]); + vst1q_f32(output_ptr + 4, v4567_intertwined.val[0]); + output_ptr += col_stride; - "vmla.f32 q8, q6, d2[0] \n" - "vmla.f32 q9, q7, d2[0] \n" - "vmla.f32 q10, q6, d2[1] \n" - "vmla.f32 q11, q7, d2[1] \n" - "vmla.f32 q12, q6, d3[0] \n" - "vmla.f32 q13, q7, d3[0] \n" - "vmla.f32 q14, q6, d3[1] \n" - "vmla.f32 q15, q7, d3[1] \n" + vst1q_f32(output_ptr, v0123_intertwined.val[1]); + vst1q_f32(output_ptr + 4, v4567_intertwined.val[1]); + output_ptr += col_stride; - "vld1.f32 {d12-d13}, [%[rhs_ptr]]! \n" - "vld1.f32 {d14-d15}, [%[rhs_ptr]]! 
\n" + vst1q_f32(output_ptr, v0123n_intertwined.val[0]); + vst1q_f32(output_ptr + 4, v4567n_intertwined.val[0]); + output_ptr += col_stride; - "vmla.f32 q8, q4, d4[0] \n" - "vmla.f32 q9, q5, d4[0] \n" - "vmla.f32 q10, q4, d4[1] \n" - "vmla.f32 q11, q5, d4[1] \n" - "vmla.f32 q12, q4, d5[0] \n" - "vmla.f32 q13, q5, d5[0] \n" - "vmla.f32 q14, q4, d5[1] \n" - "vmla.f32 q15, q5, d5[1] \n" + vst1q_f32(output_ptr, v0123n_intertwined.val[1]); + vst1q_f32(output_ptr + 4, v4567n_intertwined.val[1]); + output_ptr += col_stride; - "vmla.f32 q8, q6, d6[0] \n" - "vmla.f32 q9, q7, d6[0] \n" - "vmla.f32 q10, q6, d6[1] \n" - "vmla.f32 q11, q7, d6[1] \n" - "vmla.f32 q12, q6, d7[0] \n" - "vmla.f32 q13, q7, d7[0] \n" - "vmla.f32 q14, q6, d7[1] \n" - "vmla.f32 q15, q7, d7[1] \n" + data0 += 4; + data1 += 4; + data2 += 4; + data3 += 4; + data4 += 4; + data5 += 4; + data6 += 4; + data7 += 4; + } + for (index_t c = 0; c < col_remain; ++c) { + float32x4_t vi = {*data0, *data1, *data2, *data3}; + vst1q_f32(output_ptr, vi); + float32x4_t vin = {*data4, *data5, *data6, *data7}; + vst1q_f32(output_ptr + 4, vin); + output_ptr += col_stride; - "vst1.f32 {d16-d17}, [%[packed_output_data]]! \n" - "vst1.f32 {d18-d19}, [%[packed_output_data]]! \n" - "vst1.f32 {d20-d21}, [%[packed_output_data]]! \n" - "vst1.f32 {d22-d23}, [%[packed_output_data]]! \n" - "vst1.f32 {d24-d25}, [%[packed_output_data]]! \n" - "vst1.f32 {d26-d27}, [%[packed_output_data]]! \n" - "vst1.f32 {d28-d29}, [%[packed_output_data]]! \n" - "vst1.f32 {d30-d31}, [%[packed_output_data]]! \n" - : // outputs - [lhs_ptr] "+r"(lhs_ptr), - [rhs_ptr] "+r"(rhs_ptr), - [packed_output_data] "+r"(packed_output_data), - [r_depth_block_count] "+r"(r_depth_block_count) - : // inputs - : // clabbers - "cc", "memory", "r0", - "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", - "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"); + ++data0; + ++data1; + ++data2; + ++data3; + ++data4; + ++data5; + ++data6; + ++data7; + } // d + } } -#endif } -void Gemm::PackLhs(const MatrixMap &lhs, - float *packed_lhs) { +template<> +void Gemm::PackLhs(const MatrixMap &lhs, + float *packed_lhs) { #ifdef __aarch64__ Pack<8, 4>(lhs, ColMajor, packed_lhs); #else @@ -700,12 +511,15 @@ void Gemm::PackLhs(const MatrixMap &lhs, #endif } -void Gemm::PackRhs(const MatrixMap &rhs, - float *packed_rhs) { +template<> +void Gemm::PackRhs(const MatrixMap &rhs, + float *packed_rhs) { Pack<8, 4>(rhs, RowMajor, packed_rhs); } -void Gemm::UnpackOutput(const float *packed_output, MatrixMap *output) { +template<> +void Gemm::UnpackOutput(const float *packed_output, + MatrixMap *output) { #ifdef __aarch64__ Unpack<8, 8>(packed_output, output); #else @@ -714,523 +528,670 @@ void Gemm::UnpackOutput(const float *packed_output, MatrixMap *output) { } template<> -void Gemm::Pack<4, 4>(const MatrixMap &matrix, - MatrixMajor dst_major, - float *packed_matrix) { - const index_t rows = matrix.rows(); - const index_t cols = matrix.cols(); - - // use the same terminology as GemmLowp: - // depth is depth, width is the opposite dim other than depth - // lhs - index_t width = rows; - index_t depth = cols; - index_t width_stride = matrix.rows_stride(); - index_t depth_stride = matrix.cols_stride(); - if (dst_major == RowMajor) { - // rhs - std::swap(width, depth); - std::swap(width_stride, depth_stride); +void Gemm::ComputeBlock(const float *packed_lhs_data, + const float *packed_rhs_data, + const index_t depth_padded, + float *packed_output_data) { + /* Ref: + for (index_t r = 0; r < block_size; ++r) { + for (index_t c = 0; c < 
block_size; ++c) { + float sum = 0; + for (index_t d = 0; d < depth; ++d) { + // (r, d) * (d, c) + sum += packed_lhs_data[d * r_block_size + r] + * packed_rhs_data[d * c_block_size + c]; + } + packed_output_data[r * c_block_size + c] = sum; + } } - const float *data = matrix.data(); - float *packed_ptr = packed_matrix; - - const index_t block_size = 4; - const index_t depth_padded = RoundUp(depth, static_cast(4)); + */ + const float *lhs_ptr = packed_lhs_data; + const float *rhs_ptr = packed_rhs_data; - if (depth_padded > depth) { - memset(packed_ptr + depth * block_size, - 0, - sizeof(float) * (depth_padded - depth) * block_size); - } + const index_t depth_block_count = depth_padded / 4; - if (dst_major == matrix.matrix_major()) { - if (width < block_size) { - const index_t width_remain = block_size - width; - for (index_t d = 0; d < depth; ++d) { - memcpy(packed_ptr, data, sizeof(float) * width); - memset(packed_ptr + width, 0, sizeof(float) * width_remain); - data += depth_stride; - packed_ptr += block_size; - } - } else { - for (index_t d = 0; d < depth; ++d) { - float32x4_t vi = vld1q_f32(data); - vst1q_f32(packed_ptr, vi); - data += depth_stride; - packed_ptr += block_size; - } - } - } else { - if (width < block_size) { - const index_t width_remain = block_size - width; - for (index_t d = 0; d < depth; ++d) { - for (index_t w = 0; w < width; ++w) { - packed_ptr[w] = data[w * width_stride + d]; - } // w - memset(packed_ptr + width, 0, sizeof(float) * width_remain); - packed_ptr += block_size; - } // d - } else { - const float *data0 = data; - const float *data1 = data + width_stride; - const float *data2 = data1 + width_stride; - const float *data3 = data2 + width_stride; +#ifdef __aarch64__ + // Register layout: (8x4) x (4,8) + // + // +--------+--------+ + // | v8 ... | v9 ... | + // Rhs +--------+--------+ + // | v10... | v11... | + // +--------+--------+ + // | v12... | v13... | + // +--------+--------+ + // | v14... | v15... | + // +--------+--------+ + // + // Lhs + // + // +----+----+----+----+ - - +--------+--------+ + // | v0 | v2 | v4 | v6 | | v16... | v17... | + // | . | | | | | v18... | v19... | + // | . | | | | | v20... | v21... | + // | . | | | | | v22... | v23... | + // +----+----|----+----+ +--------+--------+ + // | v1 | v3 | v5 | v7 | | v24... | v25... | + // | . | | | | | v26... | v27... | + // | . | | | | | v28... | v29... | + // | . | | | | | v30... | v31... 
| + // +----+----|----+----+ +--------+--------+ + // + // Accumulator + // - const index_t depth_block = depth / 4; - const index_t depth_remain = depth - depth_block * 4; - for (index_t depth_block_idx = 0; depth_block_idx < depth_block; - ++depth_block_idx) { - float32x4_t v0 = vld1q_f32(data0); - float32x4_t v1 = vld1q_f32(data1); - float32x4_t v2 = vld1q_f32(data2); - float32x4_t v3 = vld1q_f32(data3); - float32x4x2_t v02_intertwined = vzipq_f32(v0, v2); - float32x4x2_t v13_intertwined = vzipq_f32(v1, v3); - float32x4x2_t v0123_intertwined = - vzipq_f32(v02_intertwined.val[0], v13_intertwined.val[0]); - float32x4x2_t v0123n_intertwined = - vzipq_f32(v02_intertwined.val[1], v13_intertwined.val[1]); + if (depth_block_count > 0) { + index_t r_depth_block_count = depth_block_count; + // just make compiler happy + MACE_UNUSED(r_depth_block_count); - vst1q_f32(packed_ptr, v0123_intertwined.val[0]); - packed_ptr += 4; + asm volatile( + "dup v16.4s, wzr \n" + "dup v17.4s, wzr \n" + "dup v18.4s, wzr \n" + "dup v19.4s, wzr \n" + "dup v20.4s, wzr \n" + "dup v21.4s, wzr \n" + "dup v22.4s, wzr \n" + "dup v23.4s, wzr \n" + "dup v24.4s, wzr \n" + "dup v25.4s, wzr \n" + "dup v26.4s, wzr \n" + "dup v27.4s, wzr \n" + "dup v28.4s, wzr \n" + "dup v29.4s, wzr \n" + "dup v30.4s, wzr \n" + "dup v31.4s, wzr \n" - vst1q_f32(packed_ptr, v0123_intertwined.val[1]); - packed_ptr += 4; + // prelogue + "ld1 {v0.4s}, [%[lhs_ptr]], #16 \n" + "ld1 {v1.4s}, [%[lhs_ptr]], #16 \n" + "ld1 {v2.4s}, [%[lhs_ptr]], #16 \n" + "ld1 {v3.4s}, [%[lhs_ptr]], #16 \n" + "ld1 {v4.4s}, [%[lhs_ptr]], #16 \n" + "ld1 {v5.4s}, [%[lhs_ptr]], #16 \n" + "ld1 {v6.4s}, [%[lhs_ptr]], #16 \n" + "ld1 {v7.4s}, [%[lhs_ptr]], #16 \n" - vst1q_f32(packed_ptr, v0123n_intertwined.val[0]); - packed_ptr += 4; + "ld1 {v8.4s}, [%[rhs_ptr]], #16 \n" + "ld1 {v9.4s}, [%[rhs_ptr]], #16 \n" + "ld1 {v10.4s}, [%[rhs_ptr]], #16 \n" + "ld1 {v11.4s}, [%[rhs_ptr]], #16 \n" + "ld1 {v12.4s}, [%[rhs_ptr]], #16 \n" + "ld1 {v13.4s}, [%[rhs_ptr]], #16 \n" + "ld1 {v14.4s}, [%[rhs_ptr]], #16 \n" + "ld1 {v15.4s}, [%[rhs_ptr]], #16 \n" - vst1q_f32(packed_ptr, v0123n_intertwined.val[1]); - packed_ptr += 4; + "subs %[r_depth_block_count], %[r_depth_block_count], #1 \n" + "beq 1f\n" - data0 += 4; - data1 += 4; - data2 += 4; - data3 += 4; - } - for (index_t d = 0; d < depth_remain; ++d) { - float32x4_t vi = {*data0, *data1, *data2, *data3}; - vst1q_f32(packed_ptr, vi); - packed_ptr += 4; + "0: \n" + "fmla v16.4s, v8.4s, v0.s[0] \n" + "fmla v17.4s, v9.4s, v0.s[0] \n" + "fmla v18.4s, v8.4s, v0.s[1] \n" + "fmla v19.4s, v9.4s, v0.s[1] \n" + "fmla v20.4s, v8.4s, v0.s[2] \n" + "fmla v21.4s, v9.4s, v0.s[2] \n" + "fmla v22.4s, v8.4s, v0.s[3] \n" + "fmla v23.4s, v9.4s, v0.s[3] \n" - ++data0; - ++data1; - ++data2; - ++data3; - } // d - } - } -} + "ld1 {v0.4s}, [%[lhs_ptr]], #16 \n" -template<> -void Gemm::Pack<8, 4>(const MatrixMap &matrix, - MatrixMajor dst_major, - float *packed_matrix) { - const index_t rows = matrix.rows(); - const index_t cols = matrix.cols(); + "fmla v24.4s, v8.4s, v1.s[0] \n" + "fmla v25.4s, v9.4s, v1.s[0] \n" + "fmla v26.4s, v8.4s, v1.s[1] \n" + "fmla v27.4s, v9.4s, v1.s[1] \n" + "fmla v28.4s, v8.4s, v1.s[2] \n" + "fmla v29.4s, v9.4s, v1.s[2] \n" + "fmla v30.4s, v8.4s, v1.s[3] \n" + "fmla v31.4s, v9.4s, v1.s[3] \n" - // use the same terminology as GemmLowp: - // depth is depth, width is the opposite dim other than depth - // lhs - index_t width = rows; - index_t depth = cols; - index_t width_stride = matrix.rows_stride(); - index_t depth_stride = matrix.cols_stride(); 
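
The Pack<4, 4>/Pack<8, 4> bodies relocated above lay a width x depth panel out depth-major: block_size contiguous floats per depth step, zero-padded in both width and depth so ComputeBlock can always issue full vld1q_f32 loads. A scalar reference of the transposed slow path, for a single panel with width <= block_size (illustrative sketch only; PackPanelRef is our name, not a MACE API):

    #include <algorithm>
    #include <vector>

    // Packs one width x depth panel from a width-major source into the
    // depth-major layout the NEON block kernel consumes: block_size
    // contiguous floats per depth step, zero-padded on both axes.
    std::vector<float> PackPanelRef(const float *data, int width, int depth,
                                    int width_stride, int block_size) {
      const int depth_padded = (depth + 3) / 4 * 4;  // depth rounded up to 4
      std::vector<float> packed(block_size * depth_padded, 0.0f);
      for (int d = 0; d < depth; ++d) {
        for (int w = 0; w < std::min(width, block_size); ++w) {
          // element (w, d) of the source panel
          packed[d * block_size + w] = data[w * width_stride + d];
        }
      }
      return packed;
    }
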
- if (dst_major == RowMajor) { - // rhs - std::swap(width, depth); - std::swap(width_stride, depth_stride); - } - const float *data = matrix.data(); - float *packed_ptr = packed_matrix; + "ld1 {v1.4s}, [%[lhs_ptr]], #16 \n" + "ld1 {v8.4s}, [%[rhs_ptr]], #16 \n" + "ld1 {v9.4s}, [%[rhs_ptr]], #16 \n" - const index_t block_size = 8; - const index_t depth_padded = RoundUp(depth, static_cast(4)); + "fmla v16.4s, v10.4s, v2.s[0] \n" + "fmla v17.4s, v11.4s, v2.s[0] \n" + "fmla v18.4s, v10.4s, v2.s[1] \n" + "fmla v19.4s, v11.4s, v2.s[1] \n" + "fmla v20.4s, v10.4s, v2.s[2] \n" + "fmla v21.4s, v11.4s, v2.s[2] \n" + "fmla v22.4s, v10.4s, v2.s[3] \n" + "fmla v23.4s, v11.4s, v2.s[3] \n" - if (depth_padded > depth) { - memset(packed_ptr + depth * block_size, - 0, - sizeof(float) * (depth_padded - depth) * block_size); + "ld1 {v2.4s}, [%[lhs_ptr]], #16 \n" + + "fmla v24.4s, v10.4s, v3.s[0] \n" + "fmla v25.4s, v11.4s, v3.s[0] \n" + "fmla v26.4s, v10.4s, v3.s[1] \n" + "fmla v27.4s, v11.4s, v3.s[1] \n" + "fmla v28.4s, v10.4s, v3.s[2] \n" + "fmla v29.4s, v11.4s, v3.s[2] \n" + "fmla v30.4s, v10.4s, v3.s[3] \n" + "fmla v31.4s, v11.4s, v3.s[3] \n" + + "ld1 {v3.4s}, [%[lhs_ptr]], #16 \n" + "ld1 {v10.4s}, [%[rhs_ptr]], #16 \n" + "ld1 {v11.4s}, [%[rhs_ptr]], #16 \n" + + "fmla v16.4s, v12.4s, v4.s[0] \n" + "fmla v17.4s, v13.4s, v4.s[0] \n" + "fmla v18.4s, v12.4s, v4.s[1] \n" + "fmla v19.4s, v13.4s, v4.s[1] \n" + "fmla v20.4s, v12.4s, v4.s[2] \n" + "fmla v21.4s, v13.4s, v4.s[2] \n" + "fmla v22.4s, v12.4s, v4.s[3] \n" + "fmla v23.4s, v13.4s, v4.s[3] \n" + + "ld1 {v4.4s}, [%[lhs_ptr]], #16 \n" + + "fmla v24.4s, v12.4s, v5.s[0] \n" + "fmla v25.4s, v13.4s, v5.s[0] \n" + "fmla v26.4s, v12.4s, v5.s[1] \n" + "fmla v27.4s, v13.4s, v5.s[1] \n" + "fmla v28.4s, v12.4s, v5.s[2] \n" + "fmla v29.4s, v13.4s, v5.s[2] \n" + "fmla v30.4s, v12.4s, v5.s[3] \n" + "fmla v31.4s, v13.4s, v5.s[3] \n" + + "ld1 {v5.4s}, [%[lhs_ptr]], #16 \n" + "ld1 {v12.4s}, [%[rhs_ptr]], #16 \n" + "ld1 {v13.4s}, [%[rhs_ptr]], #16 \n" + + "fmla v16.4s, v14.4s, v6.s[0] \n" + "fmla v17.4s, v15.4s, v6.s[0] \n" + "fmla v18.4s, v14.4s, v6.s[1] \n" + "fmla v19.4s, v15.4s, v6.s[1] \n" + "fmla v20.4s, v14.4s, v6.s[2] \n" + "fmla v21.4s, v15.4s, v6.s[2] \n" + "fmla v22.4s, v14.4s, v6.s[3] \n" + "fmla v23.4s, v15.4s, v6.s[3] \n" + + "ld1 {v6.4s}, [%[lhs_ptr]], #16 \n" + + "subs %[r_depth_block_count], %[r_depth_block_count], #1 \n" + + "fmla v24.4s, v14.4s, v7.s[0] \n" + "fmla v25.4s, v15.4s, v7.s[0] \n" + "fmla v26.4s, v14.4s, v7.s[1] \n" + "fmla v27.4s, v15.4s, v7.s[1] \n" + "fmla v28.4s, v14.4s, v7.s[2] \n" + "fmla v29.4s, v15.4s, v7.s[2] \n" + "fmla v30.4s, v14.4s, v7.s[3] \n" + "fmla v31.4s, v15.4s, v7.s[3] \n" + + "ld1 {v7.4s}, [%[lhs_ptr]], #16 \n" + "ld1 {v14.4s}, [%[rhs_ptr]], #16 \n" + "ld1 {v15.4s}, [%[rhs_ptr]], #16 \n" + + "bne 0b \n" + + // prologue + "1:\n" + "fmla v16.4s, v8.4s, v0.s[0] \n" + "fmla v17.4s, v9.4s, v0.s[0] \n" + "fmla v18.4s, v8.4s, v0.s[1] \n" + "fmla v19.4s, v9.4s, v0.s[1] \n" + "fmla v20.4s, v8.4s, v0.s[2] \n" + "fmla v21.4s, v9.4s, v0.s[2] \n" + "fmla v22.4s, v8.4s, v0.s[3] \n" + "fmla v23.4s, v9.4s, v0.s[3] \n" + + "fmla v24.4s, v8.4s, v1.s[0] \n" + "fmla v25.4s, v9.4s, v1.s[0] \n" + "fmla v26.4s, v8.4s, v1.s[1] \n" + "fmla v27.4s, v9.4s, v1.s[1] \n" + "fmla v28.4s, v8.4s, v1.s[2] \n" + "fmla v29.4s, v9.4s, v1.s[2] \n" + "fmla v30.4s, v8.4s, v1.s[3] \n" + "fmla v31.4s, v9.4s, v1.s[3] \n" + + "fmla v16.4s, v10.4s, v2.s[0] \n" + "fmla v17.4s, v11.4s, v2.s[0] \n" + "fmla v18.4s, v10.4s, v2.s[1] \n" + "fmla v19.4s, v11.4s, v2.s[1] \n" + 
"fmla v20.4s, v10.4s, v2.s[2] \n" + "fmla v21.4s, v11.4s, v2.s[2] \n" + "fmla v22.4s, v10.4s, v2.s[3] \n" + "fmla v23.4s, v11.4s, v2.s[3] \n" + + "fmla v24.4s, v10.4s, v3.s[0] \n" + "fmla v25.4s, v11.4s, v3.s[0] \n" + "fmla v26.4s, v10.4s, v3.s[1] \n" + "fmla v27.4s, v11.4s, v3.s[1] \n" + "fmla v28.4s, v10.4s, v3.s[2] \n" + "fmla v29.4s, v11.4s, v3.s[2] \n" + "fmla v30.4s, v10.4s, v3.s[3] \n" + "fmla v31.4s, v11.4s, v3.s[3] \n" + + "fmla v16.4s, v12.4s, v4.s[0] \n" + "fmla v17.4s, v13.4s, v4.s[0] \n" + "fmla v18.4s, v12.4s, v4.s[1] \n" + "fmla v19.4s, v13.4s, v4.s[1] \n" + "fmla v20.4s, v12.4s, v4.s[2] \n" + "fmla v21.4s, v13.4s, v4.s[2] \n" + "fmla v22.4s, v12.4s, v4.s[3] \n" + "fmla v23.4s, v13.4s, v4.s[3] \n" + + "fmla v24.4s, v12.4s, v5.s[0] \n" + "fmla v25.4s, v13.4s, v5.s[0] \n" + "fmla v26.4s, v12.4s, v5.s[1] \n" + "fmla v27.4s, v13.4s, v5.s[1] \n" + "fmla v28.4s, v12.4s, v5.s[2] \n" + "fmla v29.4s, v13.4s, v5.s[2] \n" + "fmla v30.4s, v12.4s, v5.s[3] \n" + "fmla v31.4s, v13.4s, v5.s[3] \n" + + "fmla v16.4s, v14.4s, v6.s[0] \n" + "fmla v17.4s, v15.4s, v6.s[0] \n" + "fmla v18.4s, v14.4s, v6.s[1] \n" + "fmla v19.4s, v15.4s, v6.s[1] \n" + "fmla v20.4s, v14.4s, v6.s[2] \n" + "fmla v21.4s, v15.4s, v6.s[2] \n" + "fmla v22.4s, v14.4s, v6.s[3] \n" + "fmla v23.4s, v15.4s, v6.s[3] \n" + + "fmla v24.4s, v14.4s, v7.s[0] \n" + "fmla v25.4s, v15.4s, v7.s[0] \n" + "fmla v26.4s, v14.4s, v7.s[1] \n" + "fmla v27.4s, v15.4s, v7.s[1] \n" + "fmla v28.4s, v14.4s, v7.s[2] \n" + "fmla v29.4s, v15.4s, v7.s[2] \n" + "fmla v30.4s, v14.4s, v7.s[3] \n" + "fmla v31.4s, v15.4s, v7.s[3] \n" + + "st1 {v16.4s}, [%[packed_output_data]], #16 \n" + "st1 {v17.4s}, [%[packed_output_data]], #16 \n" + "st1 {v18.4s}, [%[packed_output_data]], #16 \n" + "st1 {v19.4s}, [%[packed_output_data]], #16 \n" + "st1 {v20.4s}, [%[packed_output_data]], #16 \n" + "st1 {v21.4s}, [%[packed_output_data]], #16 \n" + "st1 {v22.4s}, [%[packed_output_data]], #16 \n" + "st1 {v23.4s}, [%[packed_output_data]], #16 \n" + "st1 {v24.4s}, [%[packed_output_data]], #16 \n" + "st1 {v25.4s}, [%[packed_output_data]], #16 \n" + "st1 {v26.4s}, [%[packed_output_data]], #16 \n" + "st1 {v27.4s}, [%[packed_output_data]], #16 \n" + "st1 {v28.4s}, [%[packed_output_data]], #16 \n" + "st1 {v29.4s}, [%[packed_output_data]], #16 \n" + "st1 {v30.4s}, [%[packed_output_data]], #16 \n" + "st1 {v31.4s}, [%[packed_output_data]], #16 \n" + : // outputs + [lhs_ptr] "+r"(lhs_ptr), + [rhs_ptr] "+r"(rhs_ptr), + [packed_output_data] "+r"(packed_output_data), + [r_depth_block_count] "+r"(r_depth_block_count) + : // inputs + : // clabbers + "cc", "memory", + "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", + "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", + "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); } +#else // armeabi-v7a + // Register layout: (4x4) x (4,8) + // + // +--------+--------+ + // | q4 ... | q5 ... | + // Rhs +--------+--------+ + // | q6 ... | q7 ... | + // +--------+--------+ + // | q4 ... | q5 ... | + // +--------+--------+ + // | q6 ... | q7 ... | + // +--------+--------+ + // + // Lhs + // + // +----+----+----+----+ - - +--------+--------+ + // | q0 | q1 | q2 | q3 | | q8... | q9... | + // | . | | | | | q10... | q11... | + // | . | | | | | q12... | q13... | + // | . | | | | | q14... | q15... 
| + // +----+----+----+----+ +--------+--------+ + // + // Accumulator + // - if (dst_major == matrix.matrix_major()) { - if (width < block_size) { - const index_t width_remain = block_size - width; - for (index_t d = 0; d < depth; ++d) { - memcpy(packed_ptr, data, sizeof(float) * width); - memset(packed_ptr + width, 0, sizeof(float) * width_remain); - data += depth_stride; - packed_ptr += block_size; - } - } else { - for (index_t d = 0; d < depth; ++d) { - float32x4_t vi = vld1q_f32(data); - vst1q_f32(packed_ptr, vi); - float32x4_t vin = vld1q_f32(data + 4); - vst1q_f32(packed_ptr + 4, vin); - data += depth_stride; - packed_ptr += block_size; - } - } - } else { - if (width < block_size) { - const index_t width_remain = block_size - width; - for (index_t d = 0; d < depth; ++d) { - for (index_t w = 0; w < width; ++w) { - packed_ptr[w] = data[w * width_stride + d]; - } // w - memset(packed_ptr + width, 0, sizeof(float) * width_remain); - packed_ptr += block_size; - } // d - } else { - const float *data0 = data; - const float *data1 = data + width_stride; - const float *data2 = data1 + width_stride; - const float *data3 = data2 + width_stride; - const float *data4 = data3 + width_stride; - const float *data5 = data4 + width_stride; - const float *data6 = data5 + width_stride; - const float *data7 = data6 + width_stride; + if (depth_block_count > 0) { + index_t r_depth_block_count = depth_block_count; + // just make compiler happy + MACE_UNUSED(r_depth_block_count); - const index_t depth_block = depth / 4; - const index_t depth_remain = depth - depth_block * 4; - for (index_t depth_block_idx = 0; depth_block_idx < depth_block; - ++depth_block_idx) { - float32x4_t v0 = vld1q_f32(data0); - float32x4_t v1 = vld1q_f32(data1); - float32x4_t v2 = vld1q_f32(data2); - float32x4_t v3 = vld1q_f32(data3); - float32x4x2_t v02_intertwined = vzipq_f32(v0, v2); - float32x4x2_t v13_intertwined = vzipq_f32(v1, v3); - float32x4x2_t v0123_intertwined = - vzipq_f32(v02_intertwined.val[0], v13_intertwined.val[0]); - float32x4x2_t v0123n_intertwined = - vzipq_f32(v02_intertwined.val[1], v13_intertwined.val[1]); + asm volatile( + "mov r0, #0\n" + "vdup.f32 q8, r0 \n" + "vdup.f32 q9, r0 \n" + "vdup.f32 q10, r0 \n" + "vdup.f32 q11, r0 \n" + "vdup.f32 q12, r0 \n" + "vdup.f32 q13, r0 \n" + "vdup.f32 q14, r0 \n" + "vdup.f32 q15, r0 \n" - float32x4_t v4 = vld1q_f32(data4); - float32x4_t v5 = vld1q_f32(data5); - float32x4_t v6 = vld1q_f32(data6); - float32x4_t v7 = vld1q_f32(data7); - float32x4x2_t v46_intertwined = vzipq_f32(v4, v6); - float32x4x2_t v57_intertwined = vzipq_f32(v5, v7); - float32x4x2_t v4567_intertwined = - vzipq_f32(v46_intertwined.val[0], v57_intertwined.val[0]); - float32x4x2_t v4567n_intertwined = - vzipq_f32(v46_intertwined.val[1], v57_intertwined.val[1]); + // prelogue + "vld1.f32 {d0-d1}, [%[lhs_ptr]]! \n" + "vld1.f32 {d2-d3}, [%[lhs_ptr]]! \n" + "vld1.f32 {d4-d5}, [%[lhs_ptr]]! \n" + "vld1.f32 {d6-d7}, [%[lhs_ptr]]! \n" - vst1q_f32(packed_ptr, v0123_intertwined.val[0]); - packed_ptr += 4; + "vld1.f32 {d8-d9}, [%[rhs_ptr]]! \n" + "vld1.f32 {d10-d11}, [%[rhs_ptr]]! \n" + "vld1.f32 {d12-d13}, [%[rhs_ptr]]! \n" + "vld1.f32 {d14-d15}, [%[rhs_ptr]]! 
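+        // main loop: one depth block of 4 per iteration, with the operand
+        // loads for the next block interleaved between the MAC groups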
\n" - vst1q_f32(packed_ptr, v4567_intertwined.val[0]); - packed_ptr += 4; + "subs %[r_depth_block_count], %[r_depth_block_count], #1 \n" + "beq 1f\n" - vst1q_f32(packed_ptr, v0123_intertwined.val[1]); - packed_ptr += 4; + "0: \n" - vst1q_f32(packed_ptr, v4567_intertwined.val[1]); - packed_ptr += 4; + "vmla.f32 q8, q4, d0[0] \n" + "vmla.f32 q9, q5, d0[0] \n" + "vmla.f32 q10, q4, d0[1] \n" + "vmla.f32 q11, q5, d0[1] \n" + "vmla.f32 q12, q4, d1[0] \n" + "vmla.f32 q13, q5, d1[0] \n" + "vmla.f32 q14, q4, d1[1] \n" + "vmla.f32 q15, q5, d1[1] \n" - vst1q_f32(packed_ptr, v0123n_intertwined.val[0]); - packed_ptr += 4; + "vld1.f32 {d0-d1}, [%[lhs_ptr]]! \n" + "vld1.f32 {d8-d9}, [%[rhs_ptr]]! \n" + "vld1.f32 {d10-d11}, [%[rhs_ptr]]! \n" - vst1q_f32(packed_ptr, v4567n_intertwined.val[0]); - packed_ptr += 4; + "vmla.f32 q8, q6, d2[0] \n" + "vmla.f32 q9, q7, d2[0] \n" + "vmla.f32 q10, q6, d2[1] \n" + "vmla.f32 q11, q7, d2[1] \n" + "vmla.f32 q12, q6, d3[0] \n" + "vmla.f32 q13, q7, d3[0] \n" + "vmla.f32 q14, q6, d3[1] \n" + "vmla.f32 q15, q7, d3[1] \n" - vst1q_f32(packed_ptr, v0123n_intertwined.val[1]); - packed_ptr += 4; + "vld1.f32 {d2-d3}, [%[lhs_ptr]]! \n" + "vld1.f32 {d12-d13}, [%[rhs_ptr]]! \n" + "vld1.f32 {d14-d15}, [%[rhs_ptr]]! \n" - vst1q_f32(packed_ptr, v4567n_intertwined.val[1]); - packed_ptr += 4; + "vmla.f32 q8, q4, d4[0] \n" + "vmla.f32 q9, q5, d4[0] \n" + "vmla.f32 q10, q4, d4[1] \n" + "vmla.f32 q11, q5, d4[1] \n" + "vmla.f32 q12, q4, d5[0] \n" + "vmla.f32 q13, q5, d5[0] \n" + "vmla.f32 q14, q4, d5[1] \n" + "vmla.f32 q15, q5, d5[1] \n" - data0 += 4; - data1 += 4; - data2 += 4; - data3 += 4; - data4 += 4; - data5 += 4; - data6 += 4; - data7 += 4; - } - for (index_t d = 0; d < depth_remain; ++d) { - float32x4_t vi = {*data0, *data1, *data2, *data3}; - vst1q_f32(packed_ptr, vi); - packed_ptr += 4; + "vld1.f32 {d4-d5}, [%[lhs_ptr]]! \n" + "vld1.f32 {d8-d9}, [%[rhs_ptr]]! \n" + "vld1.f32 {d10-d11}, [%[rhs_ptr]]! \n" - float32x4_t vin = {*data4, *data5, *data6, *data7}; - vst1q_f32(packed_ptr, vin); - packed_ptr += 4; + "subs %[r_depth_block_count], %[r_depth_block_count], #1 \n" - ++data0; - ++data1; - ++data2; - ++data3; - ++data4; - ++data5; - ++data6; - ++data7; - } // d - } - } -} + "vmla.f32 q8, q6, d6[0] \n" + "vmla.f32 q9, q7, d6[0] \n" + "vmla.f32 q10, q6, d6[1] \n" + "vmla.f32 q11, q7, d6[1] \n" + "vmla.f32 q12, q6, d7[0] \n" + "vmla.f32 q13, q7, d7[0] \n" + "vmla.f32 q14, q6, d7[1] \n" + "vmla.f32 q15, q7, d7[1] \n" -template<> -void Gemm::Unpack<4, 8>(const float *packed_output, MatrixMap *output) { - const index_t rows = output->rows(); - const index_t cols = output->cols(); - index_t row_stride = output->rows_stride(); - index_t col_stride = output->cols_stride(); + "vld1.f32 {d6-d7}, [%[lhs_ptr]]! \n" + "vld1.f32 {d12-d13}, [%[rhs_ptr]]! \n" + "vld1.f32 {d14-d15}, [%[rhs_ptr]]! 
\n" - float *output_ptr = output->data(); - const float *packed_ptr = packed_output; + "bne 0b \n" - const index_t block_size = 8; + // prologue + "1:\n" + "vmla.f32 q8, q4, d0[0] \n" + "vmla.f32 q9, q5, d0[0] \n" + "vmla.f32 q10, q4, d0[1] \n" + "vmla.f32 q11, q5, d0[1] \n" + "vmla.f32 q12, q4, d1[0] \n" + "vmla.f32 q13, q5, d1[0] \n" + "vmla.f32 q14, q4, d1[1] \n" + "vmla.f32 q15, q5, d1[1] \n" - // packed_output always has row-major - if (output->matrix_major() == RowMajor) { - if (cols < block_size) { - for (index_t r = 0; r < rows; ++r) { - memcpy(output_ptr, packed_ptr, sizeof(float) * cols); - output_ptr += row_stride; - packed_ptr += block_size; - } - } else { - for (index_t r = 0; r < rows; ++r) { - float32x4_t vi = vld1q_f32(packed_ptr); - vst1q_f32(output_ptr, vi); - float32x4_t vin = vld1q_f32(packed_ptr + 4); - vst1q_f32(output_ptr + 4, vin); + "vld1.f32 {d8-d9}, [%[rhs_ptr]]! \n" + "vld1.f32 {d10-d11}, [%[rhs_ptr]]! \n" - output_ptr += row_stride; - packed_ptr += block_size; - } - } - } else { - // ColMajor - if (rows < block_size) { - for (index_t c = 0; c < cols; ++c) { - for (index_t r = 0; r < rows; ++r) { - output_ptr[c * col_stride + r] = packed_ptr[r * block_size + c]; - } // r - } // c - } else { - const float *data0 = packed_ptr; - const float *data1 = data0 + block_size; - const float *data2 = data1 + block_size; - const float *data3 = data2 + block_size; + "vmla.f32 q8, q6, d2[0] \n" + "vmla.f32 q9, q7, d2[0] \n" + "vmla.f32 q10, q6, d2[1] \n" + "vmla.f32 q11, q7, d2[1] \n" + "vmla.f32 q12, q6, d3[0] \n" + "vmla.f32 q13, q7, d3[0] \n" + "vmla.f32 q14, q6, d3[1] \n" + "vmla.f32 q15, q7, d3[1] \n" - index_t col_block = cols / 4; - index_t col_remain = cols - col_block * 4; - for (index_t col_block_idx = 0; col_block_idx < col_block; - ++col_block_idx) { - float32x4_t v0 = vld1q_f32(data0); - float32x4_t v1 = vld1q_f32(data1); - float32x4_t v2 = vld1q_f32(data2); - float32x4_t v3 = vld1q_f32(data3); - float32x4x2_t v02_intertwined = vzipq_f32(v0, v2); - float32x4x2_t v13_intertwined = vzipq_f32(v1, v3); - float32x4x2_t v0123_intertwined = - vzipq_f32(v02_intertwined.val[0], v13_intertwined.val[0]); - float32x4x2_t v0123n_intertwined = - vzipq_f32(v02_intertwined.val[1], v13_intertwined.val[1]); + "vld1.f32 {d12-d13}, [%[rhs_ptr]]! \n" + "vld1.f32 {d14-d15}, [%[rhs_ptr]]! \n" - vst1q_f32(output_ptr, v0123_intertwined.val[0]); - output_ptr += col_stride; + "vmla.f32 q8, q4, d4[0] \n" + "vmla.f32 q9, q5, d4[0] \n" + "vmla.f32 q10, q4, d4[1] \n" + "vmla.f32 q11, q5, d4[1] \n" + "vmla.f32 q12, q4, d5[0] \n" + "vmla.f32 q13, q5, d5[0] \n" + "vmla.f32 q14, q4, d5[1] \n" + "vmla.f32 q15, q5, d5[1] \n" - vst1q_f32(output_ptr, v0123_intertwined.val[1]); - output_ptr += col_stride; + "vmla.f32 q8, q6, d6[0] \n" + "vmla.f32 q9, q7, d6[0] \n" + "vmla.f32 q10, q6, d6[1] \n" + "vmla.f32 q11, q7, d6[1] \n" + "vmla.f32 q12, q6, d7[0] \n" + "vmla.f32 q13, q7, d7[0] \n" + "vmla.f32 q14, q6, d7[1] \n" + "vmla.f32 q15, q7, d7[1] \n" - vst1q_f32(output_ptr, v0123n_intertwined.val[0]); - output_ptr += col_stride; + "vst1.f32 {d16-d17}, [%[packed_output_data]]! \n" + "vst1.f32 {d18-d19}, [%[packed_output_data]]! \n" + "vst1.f32 {d20-d21}, [%[packed_output_data]]! \n" + "vst1.f32 {d22-d23}, [%[packed_output_data]]! \n" + "vst1.f32 {d24-d25}, [%[packed_output_data]]! \n" + "vst1.f32 {d26-d27}, [%[packed_output_data]]! \n" + "vst1.f32 {d28-d29}, [%[packed_output_data]]! \n" + "vst1.f32 {d30-d31}, [%[packed_output_data]]! 
\n" + : // outputs + [lhs_ptr] "+r"(lhs_ptr), + [rhs_ptr] "+r"(rhs_ptr), + [packed_output_data] "+r"(packed_output_data), + [r_depth_block_count] "+r"(r_depth_block_count) + : // inputs + : // clabbers + "cc", "memory", "r0", + "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"); + } +#endif +} - vst1q_f32(output_ptr, v0123n_intertwined.val[1]); - output_ptr += col_stride; +template<> +MaceStatus Gemm::Compute(const OpContext *context, + const Tensor *lhs, + const Tensor *rhs, + const index_t batch, + const index_t rows, + const index_t cols, + const index_t depth, + const MatrixMajor lhs_major, + const MatrixMajor rhs_major, + const MatrixMajor output_major, + const bool lhs_batched, + const bool rhs_batched, + Tensor *output) { + MACE_CHECK(output->size() == batch * rows * cols, + "Need resize output tensor before call gemm."); + Tensor::MappingGuard lhs_guard(lhs); + Tensor::MappingGuard rhs_guard(rhs); + Tensor::MappingGuard output_guard(output); + const float *lhs_data = lhs->data(); + const float *rhs_data = rhs->data(); + float *output_data = output->mutable_data(); + +#ifdef __aarch64__ + const index_t row_block_size = 8; +#else + const index_t row_block_size = 4; +#endif + const index_t col_block_size = 8; + const index_t depth_block_size = 4; + const index_t row_block_count = RoundUpDiv(rows, row_block_size); + const index_t col_block_count = RoundUpDiv(cols, col_block_size); + const index_t rows_padded = RoundUp(rows, row_block_size); + const index_t cols_padded = RoundUp(cols, col_block_size); + const index_t depth_padded = RoundUp(depth, depth_block_size); - data0 += 4; - data1 += 4; - data2 += 4; - data3 += 4; - } - for (index_t c = 0; c < col_remain; ++c) { - float32x4_t vi = {*data0, *data1, *data2, *data3}; - vst1q_f32(output_ptr, vi); - output_ptr += col_stride; + ScratchBuffer *scratch = context->device()->scratch_buffer(); - ++data0; - ++data1; - ++data2; - ++data3; - } // d + index_t packed_lhs_size = + PadAlignSize(sizeof(float) * rows_padded * depth_padded); + index_t packed_rhs_size = + PadAlignSize(sizeof(float) * depth_padded * cols_padded); + index_t packed_output_size = + PadAlignSize(sizeof(float) * rows_padded * cols_padded); + // resize to the total size of lhs & rhs & output anyway, + // in case we do not cache const tensor for saving memory + MACE_RETURN_IF_ERROR(scratch->GrowSize( + packed_lhs_size + packed_rhs_size + packed_output_size)); + float *packed_lhs_data = + scratch->Scratch(packed_lhs_size).mutable_data(); + float *packed_rhs_data = + scratch->Scratch(packed_rhs_size).mutable_data(); + float *packed_output_data = + scratch->Scratch(packed_output_size).mutable_data(); + + int cache_side = kNoCache; + if (cached_ == kCacheLhs) { + packed_lhs_data = pack_cache_.mutable_data(); + } else if (cached_ == kCacheRhs) { + packed_rhs_data = pack_cache_.mutable_data(); + } else if (should_cache_pack_) { + if (lhs->is_weight() && (!lhs_batched || batch == 1)) { + cache_side = kCacheLhs; + pack_cache_.Resize(packed_lhs_size); + packed_lhs_data = pack_cache_.mutable_data(); + } else if (rhs->is_weight() && (!rhs_batched || batch == 1)) { + cache_side = kCacheRhs; + pack_cache_.Resize(packed_rhs_size); + packed_rhs_data = pack_cache_.mutable_data(); } } -} - -template<> -void Gemm::Unpack<8, 8>(const float *packed_output, MatrixMap *output) { - const index_t rows = output->rows(); - const index_t cols = output->cols(); - index_t row_stride = output->rows_stride(); - index_t col_stride = output->cols_stride(); 
+  int cache_side = kNoCache;
+  if (cached_ == kCacheLhs) {
+    packed_lhs_data = pack_cache_.mutable_data<float>();
+  } else if (cached_ == kCacheRhs) {
+    packed_rhs_data = pack_cache_.mutable_data<float>();
+  } else if (should_cache_pack_) {
+    if (lhs->is_weight() && (!lhs_batched || batch == 1)) {
+      cache_side = kCacheLhs;
+      pack_cache_.Resize(packed_lhs_size);
+      packed_lhs_data = pack_cache_.mutable_data<float>();
+    } else if (rhs->is_weight() && (!rhs_batched || batch == 1)) {
+      cache_side = kCacheRhs;
+      pack_cache_.Resize(packed_rhs_size);
+      packed_rhs_data = pack_cache_.mutable_data<float>();
+    }
+  }
-}
-
-template<>
-void Gemm::Unpack<8, 8>(const float *packed_output, MatrixMap<float> *output) {
-  const index_t rows = output->rows();
-  const index_t cols = output->cols();
-  index_t row_stride = output->rows_stride();
-  index_t col_stride = output->cols_stride();
-  float *output_ptr = output->data();
-  const float *packed_ptr = packed_output;
+  utils::ThreadPool &thread_pool =
+      context->device()->cpu_runtime()->thread_pool();
-  const index_t block_size = 8;
+  for (index_t b = 0; b < batch; ++b) {
+    MatrixMap<const float> lhs_matrix(
+        lhs_data + static_cast<index_t>(lhs_batched) * b * rows * depth,
+        lhs_major,
+        rows,
+        depth);
+    MatrixMap<const float> rhs_matrix(
+        rhs_data + static_cast<index_t>(rhs_batched) * b * depth * cols,
+        rhs_major,
+        depth,
+        cols);
+    MatrixMap<float> output_matrix(
+        output_data + b * rows * cols, output_major, rows, cols);
-  // packed_output always has row-major
-  if (output->matrix_major() == RowMajor) {
-    if (cols < block_size) {
-      for (index_t r = 0; r < rows; ++r) {
-        memcpy(output_ptr, packed_ptr, sizeof(float) * cols);
-        output_ptr += row_stride;
-        packed_ptr += block_size;
-      }
-    } else {
-      for (index_t r = 0; r < rows; ++r) {
-        float32x4_t vi = vld1q_f32(packed_ptr);
-        vst1q_f32(output_ptr, vi);
-        float32x4_t vin = vld1q_f32(packed_ptr + 4);
-        vst1q_f32(output_ptr + 4, vin);
+    // pack lhs
+    if (cached_ != kCacheLhs) {
+      thread_pool.Compute1D([=, &lhs_matrix](index_t start,
+                                             index_t end,
+                                             index_t step) {
+        for (index_t row_block_idx = start; row_block_idx < end;
+             row_block_idx += step) {
+          const index_t start_row = row_block_idx * row_block_size;
+          const index_t row_block_len =
+              std::min(row_block_size, rows - start_row);
+          float *packed_lhs_data_block =
+              packed_lhs_data + row_block_idx * row_block_size * depth_padded;
+          PackLhs(lhs_matrix.block(start_row, 0, row_block_len, depth),
+                  packed_lhs_data_block);
+        }
+      }, 0, row_block_count, 1);
-        output_ptr += row_stride;
-        packed_ptr += block_size;
+      if (cache_side == kCacheLhs) {
+        cached_ = kCacheLhs;
+        if (lhs->UnderlyingBuffer()->OnHost()) {
+          AdviseFree(reinterpret_cast<void *>(
+                         const_cast<float *>(lhs->data<float>())),
+                     lhs->raw_size());
+        }
+      }
+    }
-      }
-    }
-  } else {
-    // ColMajor
-    if (rows < block_size) {
-      for (index_t c = 0; c < cols; ++c) {
-        for (index_t r = 0; r < rows; ++r) {
-          output_ptr[c * col_stride + r] = packed_ptr[r * block_size + c];
-        }  // r
-      }  // c
-    } else {
-      const float *data0 = packed_ptr;
-      const float *data1 = data0 + block_size;
-      const float *data2 = data1 + block_size;
-      const float *data3 = data2 + block_size;
-      const float *data4 = data3 + block_size;
-      const float *data5 = data4 + block_size;
-      const float *data6 = data5 + block_size;
-      const float *data7 = data6 + block_size;
-
-      index_t col_block = cols / 4;
-      index_t col_remain = cols - col_block * 4;
-      for (index_t col_block_idx = 0; col_block_idx < col_block;
-           ++col_block_idx) {
-        float32x4_t v0 = vld1q_f32(data0);
-        float32x4_t v1 = vld1q_f32(data1);
-        float32x4_t v2 = vld1q_f32(data2);
-        float32x4_t v3 = vld1q_f32(data3);
-        float32x4x2_t v02_intertwined = vzipq_f32(v0, v2);
-        float32x4x2_t v13_intertwined = vzipq_f32(v1, v3);
-        float32x4x2_t v0123_intertwined =
-            vzipq_f32(v02_intertwined.val[0], v13_intertwined.val[0]);
-        float32x4x2_t v0123n_intertwined =
-            vzipq_f32(v02_intertwined.val[1], v13_intertwined.val[1]);
-
-        float32x4_t v4 = vld1q_f32(data4);
-        float32x4_t v5 = vld1q_f32(data5);
-        float32x4_t v6 = vld1q_f32(data6);
-        float32x4_t v7 = vld1q_f32(data7);
-        float32x4x2_t v46_intertwined = vzipq_f32(v4, v6);
-        float32x4x2_t v57_intertwined = vzipq_f32(v5, v7);
-        float32x4x2_t v4567_intertwined =
-            vzipq_f32(v46_intertwined.val[0], v57_intertwined.val[0]);
-        float32x4x2_t v4567n_intertwined =
-            vzipq_f32(v46_intertwined.val[1], v57_intertwined.val[1]);
-
-        vst1q_f32(output_ptr, v0123_intertwined.val[0]);
-        vst1q_f32(output_ptr + 4, v4567_intertwined.val[0]);
-        output_ptr += col_stride;
-
-        vst1q_f32(output_ptr, v0123_intertwined.val[1]);
-        vst1q_f32(output_ptr + 4, v4567_intertwined.val[1]);
-        output_ptr += col_stride;
-
-        vst1q_f32(output_ptr, v0123n_intertwined.val[0]);
-        vst1q_f32(output_ptr + 4, v4567n_intertwined.val[0]);
-        output_ptr += col_stride;
-        vst1q_f32(output_ptr, v0123n_intertwined.val[1]);
-        vst1q_f32(output_ptr + 4, v4567n_intertwined.val[1]);
-        output_ptr += col_stride;
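+    // rhs is packed per column block, mirroring the per-row-block lhs
+    // packing above; a previously cached pack skips this step entirely.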
+    // pack rhs
+    if (cached_ != kCacheRhs) {
+      thread_pool.Compute1D([=, &rhs_matrix](index_t start,
+                                             index_t end,
+                                             index_t step) {
+        for (index_t col_block_idx = start; col_block_idx < end;
+             col_block_idx += step) {
+          const index_t start_col = col_block_idx * col_block_size;
+          const index_t col_block_len =
+              std::min(col_block_size, cols - start_col);
+          float *packed_rhs_data_block =
+              packed_rhs_data + col_block_idx * col_block_size * depth_padded;
+          PackRhs(rhs_matrix.block(0, start_col, depth, col_block_len),
+                  packed_rhs_data_block);
+        }
+      }, 0, col_block_count, 1);
-        data0 += 4;
-        data1 += 4;
-        data2 += 4;
-        data3 += 4;
-        data4 += 4;
-        data5 += 4;
-        data6 += 4;
-        data7 += 4;
+      if (cache_side == kCacheRhs) {
+        cached_ = kCacheRhs;
+        if (rhs->UnderlyingBuffer()->OnHost()) {
+          AdviseFree(reinterpret_cast<void *>(
+                         const_cast<float *>(rhs->data<float>())),
+                     rhs->raw_size());
+        }
+      }
-      for (index_t c = 0; c < col_remain; ++c) {
-        float32x4_t vi = {*data0, *data1, *data2, *data3};
-        vst1q_f32(output_ptr, vi);
-        float32x4_t vin = {*data4, *data5, *data6, *data7};
-        vst1q_f32(output_ptr + 4, vin);
-        output_ptr += col_stride;
-
-        ++data0;
-        ++data1;
-        ++data2;
-        ++data3;
-        ++data4;
-        ++data5;
-        ++data6;
-        ++data7;
-      }  // d
+    }
-    }
-  }
-}
-MaceStatus Gemm::Compute(const OpContext *context,
-                         const Tensor *lhs,
-                         const Tensor *rhs,
-                         const index_t batch,
-                         const index_t lhs_rows,
-                         const index_t lhs_cols,
-                         const index_t rhs_rows,
-                         const index_t rhs_cols,
-                         const bool transpose_lhs,
-                         const bool transpose_rhs,
-                         const bool transpose_out,
-                         const bool lhs_batched,
-                         const bool rhs_batched,
-                         Tensor *output) {
-  index_t rows = transpose_lhs ? lhs_cols : lhs_rows;
-  index_t depth = transpose_lhs ? lhs_rows : lhs_cols;
-  index_t cols = transpose_rhs ? rhs_rows : rhs_cols;
-  index_t depth2 = transpose_rhs ? rhs_cols : rhs_rows;
-  MACE_CHECK(depth == depth2,
-             "Matrices that multiply have inconsistent depth dim: ",
-             depth,
-             " vs. ",
-             depth2);
-
-  return Compute(context,
-                 lhs,
-                 rhs,
-                 batch,
-                 rows,
-                 cols,
-                 depth,
-                 transpose_lhs ? ColMajor : RowMajor,
-                 transpose_rhs ? ColMajor : RowMajor,
-                 transpose_out ? ColMajor : RowMajor,
-                 lhs_batched,
-                 rhs_batched,
-                 output);
-}
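+    // Row blocks are distributed over the thread pool: each task multiplies
+    // its packed lhs panel against every packed rhs panel, then unpacks the
+    // finished tile straight into the output matrix.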
+    // multiply lhs and rhs
+    thread_pool.Compute1D([=, &output_matrix](index_t start,
+                                              index_t end,
+                                              index_t step) {
+      for (index_t row_block_idx = start; row_block_idx < end;
+           row_block_idx += step) {
+        const index_t start_row = row_block_idx * row_block_size;
+        const index_t row_block_len =
+            std::min(row_block_size, rows - start_row);
+        const float *packed_lhs_data_block =
+            packed_lhs_data + row_block_idx * row_block_size * depth_padded;
+
+        for (index_t col_block_idx = 0; col_block_idx < col_block_count;
+             ++col_block_idx) {
+          const index_t start_col = col_block_idx * col_block_size;
+          const index_t col_block_len =
+              std::min(col_block_size, cols - start_col);
+          const float *packed_rhs_data_block =
+              packed_rhs_data + col_block_idx * col_block_size * depth_padded;
+          float *packed_output_data_block =
+              packed_output_data + row_block_idx * row_block_size * cols_padded
+              + col_block_idx * col_block_size;
+          ComputeBlock(packed_lhs_data_block,
+                       packed_rhs_data_block,
+                       depth_padded,
+                       packed_output_data_block);
+          MatrixMap<float> output_block = output_matrix.block(start_row,
+                                                              start_col,
+                                                              row_block_len,
+                                                              col_block_len);
+          UnpackOutput(packed_output_data_block, &output_block);
+        }  // col_block_idx
+      }  // row_block_idx
+    }, 0, row_block_count, 1);
+  }  // b
-void RegisterGemmDelegator(OpDelegatorRegistry *registry) {
-  MACE_REGISTER_DELEGATOR(
-      registry, Gemm, delegator::GemmParam,
-      MACE_DELEGATOR_KEY(Gemm, DeviceType::CPU, float, ImplType::NEON));
+
+  return MaceStatus::MACE_SUCCESS;
 }
-}  // namespace fp32
 }  // namespace arm
 }  // namespace ops
 }  // namespace mace
diff --git a/mace/ops/arm/fp32/gemv.cc b/mace/ops/arm/fp32/gemv.cc
index 57f2f248ebbbf738793bd3df1cc509f88ffcf3e6..257b1665a39c68c76016630c2f5394f03542cd15 100644
--- a/mace/ops/arm/fp32/gemv.cc
+++ b/mace/ops/arm/fp32/gemv.cc
@@ -12,12 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
-
-#include "mace/ops/arm/fp32/gemv.h"
-
 #include <arm_neon.h>
 #include <algorithm>
+#include "mace/ops/arm/base/gemv.h"
 #include "mace/utils/math.h"
 
 #if !defined(__aarch64__)
@@ -34,18 +32,18 @@ float vaddvq_f32(float32x4_t v) {
 namespace mace {
 namespace ops {
 namespace arm {
-namespace fp32 {
-
-MaceStatus Gemv::Compute(const OpContext *context,
-                         const Tensor *lhs,
-                         const Tensor *rhs,
-                         const Tensor *bias,
-                         const index_t batch,
-                         const index_t lhs_height,
-                         const index_t lhs_width,
-                         const bool lhs_batched,
-                         const bool rhs_batched,
-                         Tensor *output) {
+
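+// float specialization of the Gemv delegator template now shared through
+// mace/ops/arm/base/gemv.h; its registration moves out of this file.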
- -#include "mace/ops/arm/fp32/gemv.h" - #include #include +#include "mace/ops/arm/base/gemv.h" #include "mace/utils/math.h" #if !defined(__aarch64__) @@ -34,18 +32,18 @@ float vaddvq_f32(float32x4_t v) { namespace mace { namespace ops { namespace arm { -namespace fp32 { - -MaceStatus Gemv::Compute(const OpContext *context, - const Tensor *lhs, - const Tensor *rhs, - const Tensor *bias, - const index_t batch, - const index_t lhs_height, - const index_t lhs_width, - const bool lhs_batched, - const bool rhs_batched, - Tensor *output) { + +template<> +MaceStatus Gemv::Compute(const OpContext *context, + const Tensor *lhs, + const Tensor *rhs, + const Tensor *bias, + const index_t batch, + const index_t lhs_height, + const index_t lhs_width, + const bool lhs_batched, + const bool rhs_batched, + Tensor *output) { MACE_UNUSED(context); MACE_CHECK(output->size() == batch * lhs_height, @@ -378,13 +376,6 @@ MaceStatus Gemv::Compute(const OpContext *context, #undef vaddvq_f32 #endif -void RegisterGemvDelegator(OpDelegatorRegistry *registry) { - MACE_REGISTER_DELEGATOR( - registry, Gemv, DelegatorParam, - MACE_DELEGATOR_KEY(Gemv, DeviceType::CPU, float, ImplType::NEON)); -} - -} // namespace fp32 } // namespace arm } // namespace ops } // namespace mace diff --git a/mace/ops/registry/op_delegators_registry.cc b/mace/ops/registry/op_delegators_registry.cc index 4aac7282edae65c83211a50b16bfb641c18c7881..9ef615d584a1277c02a1507041e477f616d6e716 100644 --- a/mace/ops/registry/op_delegators_registry.cc +++ b/mace/ops/registry/op_delegators_registry.cc @@ -38,13 +38,15 @@ extern void RegisterGemvDelegator(OpDelegatorRegistry *registry); #ifdef MACE_ENABLE_NEON namespace arm { namespace fp32 { +extern void RegisterConv2dK3x3WinogradDelegator(OpDelegatorRegistry *registry); +} // namespace fp32 + extern void RegisterActivationDelegator(OpDelegatorRegistry *registry); extern void RegisterBiasAddDelegator(OpDelegatorRegistry *registry); extern void RegisterConv2dK1x1Delegator(OpDelegatorRegistry *registry); extern void RegisterConv2dK1xNDelegator(OpDelegatorRegistry *registry); extern void RegisterConv2dK3x3Delegator(OpDelegatorRegistry *registry); -extern void RegisterConv2dK3x3WinogradDelegator(OpDelegatorRegistry *registry); extern void RegisterConv2dK5x5Delegator(OpDelegatorRegistry *registry); extern void RegisterConv2dK7x7Delegator(OpDelegatorRegistry *registry); extern void RegisterConv2dGeneralDelegator(OpDelegatorRegistry *registry); @@ -69,7 +71,6 @@ extern void RegisterGroupDeconv2dGeneralDelegator( extern void RegisterGemmDelegator(OpDelegatorRegistry *registry); extern void RegisterGemvDelegator(OpDelegatorRegistry *registry); -} // namespace fp32 #ifdef MACE_ENABLE_QUANTIZE namespace q8 { @@ -97,32 +98,33 @@ void RegisterAllOpDelegators(OpDelegatorRegistry *registry) { #endif // MACE_ENABLE_QUANTIZE #ifdef MACE_ENABLE_NEON - arm::fp32::RegisterActivationDelegator(registry); - arm::fp32::RegisterBiasAddDelegator(registry); - - arm::fp32::RegisterConv2dK1x1Delegator(registry); - arm::fp32::RegisterConv2dK1xNDelegator(registry); - arm::fp32::RegisterConv2dK3x3Delegator(registry); arm::fp32::RegisterConv2dK3x3WinogradDelegator(registry); - arm::fp32::RegisterConv2dK5x5Delegator(registry); - arm::fp32::RegisterConv2dK7x7Delegator(registry); - arm::fp32::RegisterConv2dGeneralDelegator(registry); - - arm::fp32::RegisterDeconv2dK2x2Delegator(registry); - arm::fp32::RegisterDeconv2dK3x3Delegator(registry); - arm::fp32::RegisterDeconv2dK4x4Delegator(registry); - 
+extern void RegisterConv2dK3x3WinogradDelegator(OpDelegatorRegistry *registry);
+}  // namespace fp32
+
 extern void RegisterActivationDelegator(OpDelegatorRegistry *registry);
 extern void RegisterBiasAddDelegator(OpDelegatorRegistry *registry);
 extern void RegisterConv2dK1x1Delegator(OpDelegatorRegistry *registry);
 extern void RegisterConv2dK1xNDelegator(OpDelegatorRegistry *registry);
 extern void RegisterConv2dK3x3Delegator(OpDelegatorRegistry *registry);
-extern void RegisterConv2dK3x3WinogradDelegator(OpDelegatorRegistry *registry);
 extern void RegisterConv2dK5x5Delegator(OpDelegatorRegistry *registry);
 extern void RegisterConv2dK7x7Delegator(OpDelegatorRegistry *registry);
 extern void RegisterConv2dGeneralDelegator(OpDelegatorRegistry *registry);
@@ -69,7 +71,6 @@ extern void RegisterGroupDeconv2dGeneralDelegator(
 extern void RegisterGemmDelegator(OpDelegatorRegistry *registry);
 extern void RegisterGemvDelegator(OpDelegatorRegistry *registry);
 
-}  // namespace fp32
 
 #ifdef MACE_ENABLE_QUANTIZE
 namespace q8 {
@@ -97,32 +98,33 @@ void RegisterAllOpDelegators(OpDelegatorRegistry *registry) {
 #endif  // MACE_ENABLE_QUANTIZE
 
 #ifdef MACE_ENABLE_NEON
-  arm::fp32::RegisterActivationDelegator(registry);
-  arm::fp32::RegisterBiasAddDelegator(registry);
-
-  arm::fp32::RegisterConv2dK1x1Delegator(registry);
-  arm::fp32::RegisterConv2dK1xNDelegator(registry);
-  arm::fp32::RegisterConv2dK3x3Delegator(registry);
   arm::fp32::RegisterConv2dK3x3WinogradDelegator(registry);
-  arm::fp32::RegisterConv2dK5x5Delegator(registry);
-  arm::fp32::RegisterConv2dK7x7Delegator(registry);
-  arm::fp32::RegisterConv2dGeneralDelegator(registry);
-
-  arm::fp32::RegisterDeconv2dK2x2Delegator(registry);
-  arm::fp32::RegisterDeconv2dK3x3Delegator(registry);
-  arm::fp32::RegisterDeconv2dK4x4Delegator(registry);
-  arm::fp32::RegisterDeconv2dGeneralDelegator(registry);
-
-  arm::fp32::RegisterDepthwiseConv2dK3x3Delegator(registry);
-  arm::fp32::RegisterDepthwiseDeconv2dK3x3Delegator(registry);
-  arm::fp32::RegisterGroupDeconv2dK3x3Delegator(registry);
-  arm::fp32::RegisterDepthwiseDeconv2dK4x4Delegator(registry);
-  arm::fp32::RegisterGroupDeconv2dK4x4Delegator(registry);
-  arm::fp32::RegisterDepthwiseDeconv2dGeneralDelegator(registry);
-  arm::fp32::RegisterGroupDeconv2dGeneralDelegator(registry);
-
-  arm::fp32::RegisterGemmDelegator(registry);
-  arm::fp32::RegisterGemvDelegator(registry);
+
+  arm::RegisterActivationDelegator(registry);
+  arm::RegisterBiasAddDelegator(registry);
+
+  arm::RegisterConv2dK1x1Delegator(registry);
+  arm::RegisterConv2dK1xNDelegator(registry);
+  arm::RegisterConv2dK3x3Delegator(registry);
+  arm::RegisterConv2dK5x5Delegator(registry);
+  arm::RegisterConv2dK7x7Delegator(registry);
+  arm::RegisterConv2dGeneralDelegator(registry);
+
+  arm::RegisterDeconv2dK2x2Delegator(registry);
+  arm::RegisterDeconv2dK3x3Delegator(registry);
+  arm::RegisterDeconv2dK4x4Delegator(registry);
+  arm::RegisterDeconv2dGeneralDelegator(registry);
+
+  arm::RegisterDepthwiseConv2dK3x3Delegator(registry);
+  arm::RegisterDepthwiseDeconv2dK3x3Delegator(registry);
+  arm::RegisterGroupDeconv2dK3x3Delegator(registry);
+  arm::RegisterDepthwiseDeconv2dK4x4Delegator(registry);
+  arm::RegisterGroupDeconv2dK4x4Delegator(registry);
+  arm::RegisterDepthwiseDeconv2dGeneralDelegator(registry);
+  arm::RegisterGroupDeconv2dGeneralDelegator(registry);
+
+  arm::RegisterGemmDelegator(registry);
+  arm::RegisterGemvDelegator(registry);
 
 #ifdef MACE_ENABLE_QUANTIZE
   arm::q8::RegisterEltwiseDelegator(registry);