Commit 9aff3c14 authored by luxuhui

refactor: refactor the delegators for arm

N/A
Signed-off-by: Luxuhui <luxuhui@xiaomi.com>
Parent fbd0ff09
@@ -60,6 +60,7 @@ MaceStatus OpDelegatorRegistry::Register(const DelegatorInfo &key,
DelegatorCreator OpDelegatorRegistry::GetCreator(
    const DelegatorInfo &key) const {
  if (registry_.count(key) > 0) {
    VLOG(3) << "find delegator creator: " << key.ToString();
    return registry_.at(key);
  }
...
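For reference, the registry touched above is essentially a creator map keyed by delegator info (op, device, data type, implementation). The standalone sketch below is not from this commit and uses invented names (DelegatorKey, CreatorRegistry) rather than MACE's real DelegatorInfo/OpDelegatorRegistry; it only mirrors the lookup pattern the hunk adds logging to.

// Standalone sketch of a keyed creator registry; names are illustrative only.
#include <functional>
#include <iostream>
#include <map>
#include <memory>
#include <string>
#include <tuple>
#include <utility>

struct DelegatorKey {
  std::string op, device, dtype, impl;
  bool operator<(const DelegatorKey &o) const {
    return std::tie(op, device, dtype, impl) <
           std::tie(o.op, o.device, o.dtype, o.impl);
  }
  std::string ToString() const {
    return op + "/" + device + "/" + dtype + "/" + impl;
  }
};

struct Delegator { virtual ~Delegator() = default; };
using Creator = std::function<std::unique_ptr<Delegator>()>;

class CreatorRegistry {
 public:
  bool Register(const DelegatorKey &key, Creator creator) {
    return registry_.emplace(key, std::move(creator)).second;
  }
  Creator GetCreator(const DelegatorKey &key) const {
    auto it = registry_.find(key);
    if (it != registry_.end()) {
      std::cout << "find delegator creator: " << key.ToString() << "\n";
      return it->second;
    }
    return nullptr;  // the real registry handles a miss elsewhere (not shown in this hunk)
  }
 private:
  std::map<DelegatorKey, Creator> registry_;
};

int main() {
  CreatorRegistry registry;
  registry.Register({"Activation", "CPU", "float", "NEON"},
                    [] { return std::unique_ptr<Delegator>(new Delegator()); });
  Creator creator = registry.GetCreator({"Activation", "CPU", "float", "NEON"});
  return creator ? 0 : 1;
}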
@@ -105,6 +105,7 @@ cc_library(
    name = "arm_neon_kernels",
    srcs = glob(
        [
            "arm/base/*.cc",
            "arm/fp32/*.cc",
            "arm/fp16/gemv.h",
        ],
@@ -121,6 +122,7 @@ cc_library(
    )),
    hdrs = glob(
        [
            "arm/base/*.h",
            "arm/fp32/*.h",
        ],
    ) + if_quantize_enabled(glob(
...
@@ -5,6 +5,9 @@ file(GLOB OPS_REF_Q8_KERNELS_SRCS
  ref/q8/*.cc
)
file(GLOB OPS_ARM_NEON_BASE_KERNELS_SRCS
  arm/base/*.cc
)
file(GLOB OPS_ARM_NEON_FP32_KERNELS_SRCS
  arm/fp32/*.cc
)
@@ -32,7 +35,7 @@ if(MACE_ENABLE_QUANTIZE)
endif(MACE_ENABLE_QUANTIZE)
if(MACE_ENABLE_NEON)
-  set(OPS_SRCS ${OPS_SRCS} ${OPS_ARM_NEON_FP32_KERNELS_SRCS})
  set(OPS_SRCS ${OPS_SRCS} ${OPS_ARM_NEON_BASE_KERNELS_SRCS} ${OPS_ARM_NEON_FP32_KERNELS_SRCS})
  if(MACE_ENABLE_QUANTIZE)
    set(OPS_SRCS ${OPS_SRCS} ${OPS_ARM_NEON_Q8_KERNELS_SRCS})
  endif(MACE_ENABLE_QUANTIZE)
...
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/arm/base/activation.h"
namespace mace {
namespace ops {
namespace arm {
template<typename T>
MaceStatus Activation<T>::Compute(const OpContext *context,
const Tensor *input, Tensor *output) {
Tensor::MappingGuard input_guard(input);
if (input != output) {
MACE_RETURN_IF_ERROR(output->ResizeLike(input));
Tensor::MappingGuard output_guard(output);
DoActivation(context, input, output);
} else {
DoActivation(context, input, output);
}
return MaceStatus::MACE_SUCCESS;
}
template<typename T>
void Activation<T>::DoActivation(const OpContext *context,
const Tensor *input,
Tensor *output) {
const T *input_data = input->data<T>();
T *output_data = output->mutable_data<T>();
const index_t size = input->size();
utils::ThreadPool &thread_pool =
context->device()->cpu_runtime()->thread_pool();
switch (type_) {
case RELU: {
ActivateRelu(&thread_pool, input_data, size, output_data);
break;
}
case RELUX: {
ActivateRelux(&thread_pool, input_data, size, output_data);
break;
}
case LEAKYRELU: {
ActivateLeakyRelu(&thread_pool, input_data, size, output_data);
break;
}
case TANH: {
ActivateTanh(&thread_pool, input_data, size, output_data);
break;
}
case SIGMOID: {
ActivateSigmoid(&thread_pool, input_data, size, output_data);
break;
}
case NOOP: {
break;
}
default: {
MACE_NOT_IMPLEMENTED;
}
}
}
void RegisterActivationDelegator(OpDelegatorRegistry *registry) {
MACE_REGISTER_DELEGATOR(
registry, Activation<float>, delegator::ActivationParam,
MACE_DELEGATOR_KEY(Activation, DeviceType::CPU, float, ImplType::NEON));
}
} // namespace arm
} // namespace ops
} // namespace mace
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_OPS_ARM_BASE_ACTIVATION_H_
#define MACE_OPS_ARM_BASE_ACTIVATION_H_
#include "mace/ops/delegator/activation.h"
namespace mace {
namespace ops {
namespace arm {
template<typename T>
class Activation : public delegator::Activation {
public:
explicit Activation(const delegator::ActivationParam &param)
: delegator::Activation(param) {}
~Activation() = default;
MaceStatus Compute(const OpContext *context,
const Tensor *input, Tensor *output) override;
private:
void DoActivation(const OpContext *context,
const Tensor *input, Tensor *output);
void ActivateRelu(utils::ThreadPool *thread_pool, const T *input_data,
const index_t input_size, T *output_data);
void ActivateRelux(utils::ThreadPool *thread_pool, const T *input_data,
const index_t input_size, T *output_data);
void ActivateLeakyRelu(utils::ThreadPool *thread_pool, const T *input_data,
const index_t input_size, T *output_data);
void ActivateTanh(utils::ThreadPool *thread_pool, const T *input_data,
const index_t input_size, T *output_data);
void ActivateSigmoid(utils::ThreadPool *thread_pool, const T *input_data,
const index_t input_size, T *output_data);
};
} // namespace arm
} // namespace ops
} // namespace mace
#endif // MACE_OPS_ARM_BASE_ACTIVATION_H_
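The new arm/base layer keeps the type-independent control flow (mapping, resize, dispatch on the activation type) in Activation<T>, while the per-type Activate* bodies stay in the fp32/fp16 files. The standalone sketch below is not MACE code: the thread pool and NEON intrinsics are simplified away, and the class and enum names are invented; it only mirrors the dispatch shape.

// Illustrative dispatch-only sketch of a templated elementwise activation.
#include <algorithm>
#include <cstddef>
#include <vector>

enum class ActType { RELU, LEAKYRELU, NOOP };

template <typename T>
class SimpleActivation {
 public:
  explicit SimpleActivation(ActType type, T alpha = T(0.01))
      : type_(type), alpha_(alpha) {}

  void Compute(const T *input, std::size_t size, T *output) const {
    switch (type_) {
      case ActType::RELU:
        for (std::size_t i = 0; i < size; ++i)
          output[i] = std::max(input[i], T(0));
        break;
      case ActType::LEAKYRELU:
        for (std::size_t i = 0; i < size; ++i)
          output[i] = input[i] > T(0) ? input[i] : alpha_ * input[i];
        break;
      case ActType::NOOP:
        // The committed kernel simply falls through here; this sketch copies.
        std::copy(input, input + size, output);
        break;
    }
  }

 private:
  ActType type_;
  T alpha_;
};

int main() {
  std::vector<float> in = {-1.f, 0.f, 2.f}, out(3);
  SimpleActivation<float>(ActType::RELU).Compute(in.data(), in.size(), out.data());
  return (out[0] == 0.f && out[2] == 2.f) ? 0 : 1;
}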
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/arm/base/bias_add.h"
namespace mace {
namespace ops {
namespace arm {
template<typename T>
MaceStatus BiasAdd<T>::Compute(const OpContext *context, const Tensor *input,
const Tensor *bias, Tensor *output) {
if (input != output) {
if (bias == nullptr) {
output->Copy(*input);
} else {
MACE_RETURN_IF_ERROR(output->ResizeLike(input));
Tensor::MappingGuard input_guard(input);
Tensor::MappingGuard bias_guard(bias);
Tensor::MappingGuard output_guard(output);
AddBias(context, input, bias, output);
}
} else {
if (bias != nullptr) {
Tensor::MappingGuard input_guard(input);
Tensor::MappingGuard bias_guard(bias);
AddBias(context, input, bias, output);
}
}
return MaceStatus::MACE_SUCCESS;
}
template<typename T>
void BiasAdd<T>::AddBias(const OpContext *context, const Tensor *input,
const Tensor *bias, mace::Tensor *output) {
auto input_data = input->data<T>();
auto bias_data = bias->data<T>();
auto output_data = output->mutable_data<T>();
const index_t batch = input->dim(0);
const index_t channels = input->dim(1);
const index_t height = input->dim(2);
const index_t width = input->dim(3);
const index_t image_size = height * width;
utils::ThreadPool
&thread_pool = context->device()->cpu_runtime()->thread_pool();
if (bias->dim_size() == 1) {
Add1DimBias(&thread_pool, input_data, bias_data,
output_data, batch, channels, image_size);
} else {
Add2DimsBias(&thread_pool, input_data, bias_data,
output_data, batch, channels, image_size);
}
}
void RegisterBiasAddDelegator(OpDelegatorRegistry *registry) {
MACE_REGISTER_DELEGATOR(
registry, BiasAdd<float>, DelegatorParam,
MACE_DELEGATOR_KEY(BiasAdd, DeviceType::CPU, float, ImplType::NEON));
}
} // namespace arm
} // namespace ops
} // namespace mace
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_OPS_ARM_BASE_BIAS_ADD_H_
#define MACE_OPS_ARM_BASE_BIAS_ADD_H_
#include "mace/ops/delegator/bias_add.h"
namespace mace {
namespace ops {
namespace arm {
template<typename T>
class BiasAdd : public delegator::BiasAdd {
public:
explicit BiasAdd(const DelegatorParam &param) : delegator::BiasAdd(param) {}
~BiasAdd() = default;
MaceStatus Compute(const OpContext *context, const Tensor *input,
const Tensor *bias, Tensor *output) override;
private:
void AddBias(const OpContext *context, const Tensor *input,
const Tensor *bias, Tensor *output);
void Add1DimBias(utils::ThreadPool *thread_pool, const T *input_data,
const T *bias_data, T *output_data,
const index_t batch, const index_t channels,
const index_t image_size);
void Add2DimsBias(utils::ThreadPool *thread_pool, const T *input_data,
const T *bias_data, T *output_data,
const index_t batch, const index_t channels,
const index_t image_size);
};
} // namespace arm
} // namespace ops
} // namespace mace
#endif // MACE_OPS_ARM_BASE_BIAS_ADD_H_
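BiasAdd<T> distinguishes a 1-D bias (one value per output channel) from a 2-D bias which, judging by the batch index passed to Add2DimsBias, is assumed here to carry a separate value per (batch, channel) pair. A standalone NCHW sketch of both layouts, not from this commit (the real kernels additionally vectorize the inner image loop with NEON):

// Assumption: a 2-D bias is indexed as bias[b * channels + c].
#include <cstdint>
#include <vector>

using index_t = std::int64_t;

template <typename T>
void AddBiasNCHW(const T *input, const T *bias, T *output,
                 index_t batch, index_t channels, index_t image_size,
                 bool bias_per_batch) {
  for (index_t b = 0; b < batch; ++b) {
    for (index_t c = 0; c < channels; ++c) {
      const T beta = bias_per_batch ? bias[b * channels + c] : bias[c];
      const T *in = input + (b * channels + c) * image_size;
      T *out = output + (b * channels + c) * image_size;
      for (index_t i = 0; i < image_size; ++i) out[i] = in[i] + beta;
    }
  }
}

int main() {
  std::vector<float> in(2 * 3 * 4, 1.f), bias = {0.f, 1.f, 2.f}, out(in.size());
  AddBiasNCHW(in.data(), bias.data(), out.data(), 2, 3, 4, /*bias_per_batch=*/false);
  return out[4] == 2.f ? 0 : 1;  // channel 1 of batch 0 gets +1
}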
-// Copyright 2019 The MACE Authors. All Rights Reserved.
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@@ -12,18 +12,17 @@
// See the License for the specific language governing permissions and
// limitations under the License.

-#include "mace/ops/arm/fp32/conv_2d.h"
#include "mace/ops/arm/base/conv_2d.h"

#include <algorithm>
#include <memory>
#include <utility>
-#include <algorithm>

#include "mace/utils/memory.h"

namespace mace {
namespace ops {
namespace arm {
-namespace fp32 {

void Conv2dBase::CalOutputShapeAndInputPadSize(
    const std::vector<index_t> &input_shape,
@@ -164,10 +163,10 @@ MaceStatus Conv2dBase::ResizeOutAndPadInOut(const OpContext *context,
  auto scratch_buffer = context->device()->scratch_buffer();
  const index_t padded_in_size =
      MACE_EXTRA_BUFFER_PAD_SIZE + (is_in_padded ? PadAlignSize(
-          sizeof(float) * batch * in_channels * padded_in_height
          type_size_ * batch * in_channels * padded_in_height
              * padded_in_width) : 0);
  const index_t padded_out_size = is_out_padded ? PadAlignSize(
-      sizeof(float) * batch * out_channels * padded_out_height
      type_size_ * batch * out_channels * padded_out_height
          * padded_out_width) : 0;
  scratch_buffer->Rewind();
@@ -176,7 +175,7 @@ MaceStatus Conv2dBase::ResizeOutAndPadInOut(const OpContext *context,
    std::unique_ptr<Tensor>
        padded_in =
        make_unique<Tensor>(scratch_buffer->Scratch(padded_in_size),
-                           DataType::DT_FLOAT);
                            input->dtype());
    padded_in->Resize({batch, in_channels, padded_in_height, padded_in_width});
    PadInput(*input, in_pad_size[0], in_pad_size[2], padded_in.get());
    *padded_input = std::move(padded_in);
@@ -185,7 +184,7 @@ MaceStatus Conv2dBase::ResizeOutAndPadInOut(const OpContext *context,
    std::unique_ptr<Tensor>
        padded_out =
        make_unique<Tensor>(scratch_buffer->Scratch(padded_out_size),
-                           DataType::DT_FLOAT);
                            output->dtype());
    padded_out->Resize({batch, out_channels, padded_out_height,
                        padded_out_width});
    *padded_output = std::move(padded_out);
@@ -206,8 +205,8 @@ void Conv2dBase::PadInput(const Tensor &src,
  const index_t padded_width = dst->dim(3);
  const int pad_bottom = static_cast<int>(padded_height - height - pad_top);
  const int pad_right = static_cast<int>(padded_width - width - pad_left);
-  auto in_data = src.data<float>();
-  auto padded_in_data = dst->mutable_data<float>();
  auto in_data = src.data<uint8_t>();
  auto padded_in_data = dst->mutable_data<uint8_t>();
  const index_t img_size = height * width;
  const index_t padded_img_size = padded_height * padded_width;
@@ -215,25 +214,26 @@ void Conv2dBase::PadInput(const Tensor &src,
  for (index_t b = 0; b < batch; ++b) {
    for (index_t c = 0; c < channels; ++c) {
      const index_t bc = b * channels + c;
-      const float *in_base = in_data + bc * img_size;
-      float *padded_in_base = padded_in_data + bc * padded_img_size;
      const uint8_t *in_base = in_data + bc * img_size * type_size_;
      uint8_t *padded_in_base =
          padded_in_data + bc * padded_img_size * type_size_;
-      memset(padded_in_base, 0, sizeof(float) * pad_top * padded_width);
-      padded_in_base += pad_top * padded_width;
      memset(padded_in_base, 0, type_size_ * pad_top * padded_width);
      padded_in_base += pad_top * padded_width * type_size_;
      for (index_t h = 0; h < height; ++h) {
        memset(padded_in_base,
               0,
-               sizeof(float) * pad_left);
               type_size_ * pad_left);
-        memcpy(padded_in_base + pad_left,
        memcpy(padded_in_base + pad_left * type_size_,
               in_base,
-               sizeof(float) * width);
               type_size_ * width);
-        memset(padded_in_base + pad_left + width,
        memset(padded_in_base + (pad_left + width) * type_size_,
               0,
-               sizeof(float) * pad_right);
               type_size_ * pad_right);
-        in_base += width;
-        padded_in_base += padded_width;
        in_base += width * type_size_;
        padded_in_base += padded_width * type_size_;
      }
-      memset(padded_in_base, 0, sizeof(float) * pad_bottom * padded_width);
      memset(padded_in_base, 0, type_size_ * pad_bottom * padded_width);
    }
  }
}
@@ -247,8 +247,8 @@ void Conv2dBase::UnPadOutput(const Tensor &src, Tensor *dst) {
  const index_t padded_height = src.dim(2);
  const index_t padded_width = src.dim(3);
-  auto padded_out_data = src.data<float>();
-  auto out_data = dst->mutable_data<float>();
  auto padded_out_data = src.data<uint8_t>();
  auto out_data = dst->mutable_data<uint8_t>();
  const index_t img_size = height * width;
  const index_t padded_img_size = padded_height * padded_width;
@@ -256,21 +256,93 @@ void Conv2dBase::UnPadOutput(const Tensor &src, Tensor *dst) {
  for (index_t b = 0; b < batch; ++b) {
    for (index_t c = 0; c < channels; ++c) {
      const index_t bc = (b * channels + c);
-      float *out_base = out_data + bc * img_size;
-      const float *padded_out_base = padded_out_data + bc * padded_img_size;
      uint8_t *out_base = out_data + bc * img_size * type_size_;
      const uint8_t *padded_out_base =
          padded_out_data + bc * padded_img_size * type_size_;
      for (index_t h = 0; h < height; ++h) {
-        memcpy(out_base,
-               padded_out_base,
-               sizeof(float) * width);
-        out_base += width;
-        padded_out_base += padded_width;
        memcpy(out_base, padded_out_base, type_size_ * width);
        out_base += width * type_size_;
        padded_out_base += padded_width * type_size_;
      }  // h
    }  // c
  }  // b
}

-} // namespace fp32
ConvComputeParam Conv2dBase::PreWorkAndGetConv2DParam(
const OpContext *context, const Tensor *in_tensor, Tensor *out_tensor) {
auto &in_shape = in_tensor->shape();
auto &out_shape = out_tensor->shape();
const index_t batch = in_shape[0];
const index_t in_channels = in_shape[1];
const index_t in_height = in_shape[2];
const index_t in_width = in_shape[3];
const index_t out_channels = out_shape[1];
const index_t out_height = out_shape[2];
const index_t out_width = out_shape[3];
const index_t in_image_size = in_height * in_width;
const index_t out_image_size = out_height * out_width;
const index_t in_batch_size = in_channels * in_image_size;
const index_t out_batch_size = out_channels * out_image_size;
utils::ThreadPool
&thread_pool = context->device()->cpu_runtime()->thread_pool();
return ConvComputeParam(batch, in_channels, in_height, in_width,
out_channels, out_height, out_width,
in_image_size, out_image_size,
in_batch_size, out_batch_size, &thread_pool);
}
DepthwiseConvComputeParam Conv2dBase::PreWorkAndGetDepthwiseConv2DParam(
const OpContext *context, const Tensor *input,
const Tensor *filter, Tensor *output) {
std::vector<index_t> out_shape(4);
std::vector<int> paddings(2);
auto &in_shape = input->shape();
auto &filter_shape = filter->shape();
CalOutputShapeAndInputPadSize(in_shape, filter_shape, &out_shape, &paddings);
out_shape[1] *= filter_shape[1];
MACE_CHECK(output->Resize(out_shape) == MaceStatus::MACE_SUCCESS,
"Resize failed.");
output->Clear();
const int pad_top = paddings[0] / 2;
const int pad_left = paddings[1] / 2;
const index_t batch = in_shape[0];
const index_t in_channels = in_shape[1];
const index_t in_height = in_shape[2];
const index_t in_width = in_shape[3];
const index_t out_channels = out_shape[1];
const index_t out_height = out_shape[2];
const index_t out_width = out_shape[3];
const index_t in_image_size = in_height * in_width;
const index_t out_image_size = out_height * out_width;
const index_t in_batch_size = in_channels * in_image_size;
const index_t out_batch_size = out_channels * out_image_size;
const index_t multiplier = out_channels / in_channels;
std::vector<index_t> out_bounds;
CalOutputBoundaryWithoutUsingInputPad(out_shape, paddings, &out_bounds);
const index_t valid_h_start = out_bounds[0];
const index_t valid_h_stop = out_bounds[1];
const index_t valid_w_start = out_bounds[2];
const index_t valid_w_stop = out_bounds[3];
utils::ThreadPool
&thread_pool = context->device()->cpu_runtime()->thread_pool();
return DepthwiseConvComputeParam(
batch, in_channels, in_height, in_width, out_channels, out_height,
out_width, in_image_size, out_image_size, in_batch_size, out_batch_size,
&thread_pool, pad_top, pad_left, multiplier, valid_h_start, valid_h_stop,
valid_w_start, valid_w_stop);
}
} // namespace arm
} // namespace ops
} // namespace mace
...
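The main change in Conv2dBase above is that padding and unpadding now work on raw bytes scaled by a runtime type_size_ instead of a hard-coded sizeof(float), so one code path can serve fp32 and fp16 tensors. Below is a standalone sketch of that byte-wise plane padding, not from this commit; the helper name is invented and type_size stands in for Conv2dBase::type_size_.

// Sketch: zero-pad one H x W plane of elements of size type_size bytes.
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <vector>

void PadPlaneBytes(const uint8_t *src, uint8_t *dst,
                   int height, int width, int padded_width,
                   int pad_top, int pad_bottom, int pad_left, int pad_right,
                   int type_size) {
  // Top border rows.
  std::memset(dst, 0, static_cast<std::size_t>(type_size) * pad_top * padded_width);
  dst += static_cast<std::size_t>(pad_top) * padded_width * type_size;
  for (int h = 0; h < height; ++h) {
    std::memset(dst, 0, static_cast<std::size_t>(type_size) * pad_left);
    std::memcpy(dst + static_cast<std::size_t>(pad_left) * type_size, src,
                static_cast<std::size_t>(type_size) * width);
    std::memset(dst + static_cast<std::size_t>(pad_left + width) * type_size, 0,
                static_cast<std::size_t>(type_size) * pad_right);
    src += static_cast<std::size_t>(width) * type_size;
    dst += static_cast<std::size_t>(padded_width) * type_size;
  }
  // Bottom border rows.
  std::memset(dst, 0, static_cast<std::size_t>(type_size) * pad_bottom * padded_width);
}

int main() {
  const int h = 2, w = 2, pad = 1, pw = w + 2 * pad, ph = h + 2 * pad;
  std::vector<float> in = {1.f, 2.f, 3.f, 4.f}, out(ph * pw, -1.f);
  PadPlaneBytes(reinterpret_cast<const uint8_t *>(in.data()),
                reinterpret_cast<uint8_t *>(out.data()),
                h, w, pw, pad, pad, pad, pad, sizeof(float));
  return (out[0] == 0.f && out[pw + 1] == 1.f) ? 0 : 1;
}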
-// Copyright 2019 The MACE Authors. All Rights Reserved.
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@@ -12,28 +12,97 @@
// See the License for the specific language governing permissions and
// limitations under the License.

-#ifndef MACE_OPS_ARM_FP32_CONV_2D_H_
#ifndef MACE_OPS_ARM_BASE_CONV_2D_H_
-#define MACE_OPS_ARM_FP32_CONV_2D_H_
#define MACE_OPS_ARM_BASE_CONV_2D_H_

-#include <vector>
#include <memory>
#include <vector>

#include "mace/core/ops/op_context.h"
#include "mace/core/tensor.h"
-#include "mace/ops/delegator/conv_2d.h"
#include "mace/ops/arm/base/gemm.h"
-#include "mace/ops/arm/fp32/gemm.h"
#include "mace/ops/common/conv_pool_2d_util.h"
#include "mace/ops/delegator/conv_2d.h"
#include "mace/public/mace.h"

namespace mace {
namespace ops {
namespace arm {
-namespace fp32 {
struct ConvComputeParam {
const index_t batch;
const index_t in_channels;
const index_t in_height;
const index_t in_width;
const index_t out_channels;
const index_t out_height;
const index_t out_width;
const index_t in_image_size;
const index_t out_image_size;
const index_t in_batch_size;
const index_t out_batch_size;
utils::ThreadPool &thread_pool;
ConvComputeParam(const index_t b,
const index_t in_c,
const index_t in_h,
const index_t in_w,
const index_t out_c,
const index_t out_h,
const index_t out_w,
const index_t in_size,
const index_t out_size,
const index_t in_b_size,
const index_t out_b_size,
utils::ThreadPool *thrd_pool)
: batch(b), in_channels(in_c), in_height(in_h), in_width(in_w),
out_channels(out_c), out_height(out_h), out_width(out_w),
in_image_size(in_size), out_image_size(out_size),
in_batch_size(in_b_size), out_batch_size(out_b_size),
thread_pool(*thrd_pool) {}
};
struct DepthwiseConvComputeParam : public ConvComputeParam {
const int pad_top;
const int pad_left;
const index_t multiplier;
const index_t valid_h_start;
const index_t valid_h_stop;
const index_t valid_w_start;
const index_t valid_w_stop;
DepthwiseConvComputeParam(const index_t b,
const index_t in_c,
const index_t in_h,
const index_t in_w,
const index_t out_c,
const index_t out_h,
const index_t out_w,
const index_t in_size,
const index_t out_size,
const index_t in_b_size,
const index_t out_b_size,
utils::ThreadPool *thrd_pool,
const int pad_top_data,
const int pad_left_data,
const index_t multiplier_data,
const index_t valid_height_start,
const index_t valid_height_stop,
const index_t valid_width_start,
const index_t valid_width_stop)
: ConvComputeParam(b, in_c, in_h, in_w, out_c, out_h, out_w,
in_size, out_size, in_b_size, out_b_size, thrd_pool),
pad_top(pad_top_data), pad_left(pad_left_data),
multiplier(multiplier_data),
valid_h_start(valid_height_start), valid_h_stop(valid_height_stop),
valid_w_start(valid_width_start), valid_w_stop(valid_width_stop) {}
};
class Conv2dBase : public delegator::Conv2d {
 public:
-  explicit Conv2dBase(const delegator::Conv2dParam &param)
-      : delegator::Conv2d(param) {}
  explicit Conv2dBase(const delegator::Conv2dParam &param, int type_size)
      : delegator::Conv2d(param), type_size_(type_size) {}
  virtual ~Conv2dBase() = default;
@@ -72,11 +141,19 @@ class Conv2dBase : public delegator::Conv2d {
                const int pad_left,
                Tensor *dst);
  void UnPadOutput(const Tensor &src, Tensor *dst);
ConvComputeParam PreWorkAndGetConv2DParam(
const OpContext *context, const Tensor *in_tensor, Tensor *out_tensor);
DepthwiseConvComputeParam PreWorkAndGetDepthwiseConv2DParam(
const OpContext *context, const Tensor *input,
const Tensor *filter, Tensor *output);
private:
int type_size_;
};

-} // namespace fp32
} // namespace arm
} // namespace ops
} // namespace mace

-#endif // MACE_OPS_ARM_FP32_CONV_2D_H_
#endif // MACE_OPS_ARM_BASE_CONV_2D_H_
-// Copyright 2019 The MACE Authors. All Rights Reserved.
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@@ -12,33 +12,16 @@
// See the License for the specific language governing permissions and
// limitations under the License.

-#include "mace/ops/arm/fp32/conv_2d.h"
#include "mace/ops/arm/base/conv_2d_1x1.h"
-#include "mace/ops/arm/fp32/gemm.h"
-#include "mace/ops/delegator/conv_2d.h"

#include <vector>

namespace mace {
namespace ops {
namespace arm {
-namespace fp32 {

-class Conv2dK1x1 : public Conv2dBase {
- public:
-  explicit Conv2dK1x1(const delegator::Conv2dParam &param)
-      : Conv2dBase(param),
-        gemm_(delegator::GemmParam()) {}
-  virtual ~Conv2dK1x1() {}
-  MaceStatus Compute(
-      const OpContext *context,
-      const Tensor *input,
-      const Tensor *filter,
-      Tensor *output) override;
- private:
-  Gemm gemm_;
-};
-
-MaceStatus Conv2dK1x1::Compute(const OpContext *context,
template<typename T>
MaceStatus Conv2dK1x1<T>::Compute(const OpContext *context,
                                  const Tensor *input,
                                  const Tensor *filter,
                                  Tensor *output) {
@@ -50,13 +33,8 @@ MaceStatus Conv2dK1x1::Compute(const OpContext *context,
  std::vector<index_t> output_shape;
  std::vector<int> in_pad_size;
  std::vector<int> out_pad_size;
-  CalOutputShapeAndPadSize(input,
-                           filter,
-                           1,
-                           1,
-                           &output_shape,
-                           &in_pad_size,
-                           &out_pad_size);
  CalOutputShapeAndPadSize(input, filter, 1, 1,
                           &output_shape, &in_pad_size, &out_pad_size);
  MACE_RETURN_IF_ERROR(output->Resize(output_shape));
  const index_t out_channels = output_shape[1];
@@ -70,16 +48,16 @@ MaceStatus Conv2dK1x1::Compute(const OpContext *context,
      in_height != padded_in_height || in_width != padded_in_width;
  auto scratch_buffer = context->device()->scratch_buffer();
  const index_t padded_in_size = is_in_padded ? PadAlignSize(
-      sizeof(float) * batch * in_channels * padded_in_height
      sizeof(T) * batch * in_channels * padded_in_height
          * padded_in_width) : 0;
  const index_t pack_filter_size =
-      PadAlignSize(sizeof(float) * out_channels * in_channels);
      PadAlignSize(sizeof(T) * out_channels * in_channels);
  const index_t pack_input_size =
      PadAlignSize(
-          sizeof(float) * in_channels * padded_in_height * padded_in_width);
          sizeof(T) * in_channels * padded_in_height * padded_in_width);
  const index_t pack_output_size =
      PadAlignSize(
-          sizeof(float) * out_channels * padded_in_height * padded_in_width);
          sizeof(T) * out_channels * padded_in_height * padded_in_width);
  const index_t gemm_pack_size =
      pack_filter_size + pack_input_size + pack_output_size;
@@ -115,12 +93,11 @@ MaceStatus Conv2dK1x1::Compute(const OpContext *context,

void RegisterConv2dK1x1Delegator(OpDelegatorRegistry *registry) {
  MACE_REGISTER_DELEGATOR(
-      registry, Conv2dK1x1, delegator::Conv2dParam,
      registry, Conv2dK1x1<float>, delegator::Conv2dParam,
      MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU,
                            float, ImplType::NEON, K1x1));
}

-} // namespace fp32
} // namespace arm
} // namespace ops
} // namespace mace
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_OPS_ARM_BASE_CONV_2D_1X1_H_
#define MACE_OPS_ARM_BASE_CONV_2D_1X1_H_
#include "mace/ops/arm/base/conv_2d.h"
#include "mace/ops/arm/base/gemm.h"
namespace mace {
namespace ops {
namespace arm {
template<typename T>
class Conv2dK1x1 : public Conv2dBase {
public:
explicit Conv2dK1x1(const delegator::Conv2dParam &param)
: Conv2dBase(param, sizeof(T)),
gemm_(delegator::GemmParam()) {}
virtual ~Conv2dK1x1() {}
MaceStatus Compute(
const OpContext *context,
const Tensor *input,
const Tensor *filter,
Tensor *output) override;
private:
Gemm<T> gemm_;
};
} // namespace arm
} // namespace ops
} // namespace mace
#endif // MACE_OPS_ARM_BASE_CONV_2D_1X1_H_
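Conv2dK1x1 delegates its inner loop to Gemm<T> because a 1x1 convolution over NCHW data is exactly a matrix multiply: output[oc][h*w] = sum over ic of filter[oc][ic] * input[ic][h*w]. The naive standalone sketch below (no packing, padding, or threading, names invented) is not MACE code; it only demonstrates that equivalence.

// Naive reference: 1x1 convolution on one NCHW batch as a
// (out_channels x in_channels) * (in_channels x image_size) GEMM.
#include <vector>

template <typename T>
void Conv1x1AsGemm(const T *filter,  // [out_channels, in_channels]
                   const T *input,   // [in_channels, image_size]
                   T *output,        // [out_channels, image_size]
                   int out_channels, int in_channels, int image_size) {
  for (int oc = 0; oc < out_channels; ++oc) {
    for (int i = 0; i < image_size; ++i) {
      T acc = T(0);
      for (int ic = 0; ic < in_channels; ++ic) {
        acc += filter[oc * in_channels + ic] * input[ic * image_size + i];
      }
      output[oc * image_size + i] = acc;
    }
  }
}

int main() {
  // 2 input channels, 1 output channel, 2x2 image.
  std::vector<float> filter = {1.f, 2.f};
  std::vector<float> input = {1.f, 1.f, 1.f, 1.f,   // channel 0
                              2.f, 2.f, 2.f, 2.f};  // channel 1
  std::vector<float> output(4, 0.f);
  Conv1x1AsGemm(filter.data(), input.data(), output.data(), 1, 2, 4);
  return output[0] == 5.f ? 0 : 1;  // 1*1 + 2*2
}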
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/arm/base/conv_2d_1xn.h"
namespace mace {
namespace ops {
namespace arm {
void RegisterConv2dK1xNDelegator(OpDelegatorRegistry *registry) {
MACE_REGISTER_DELEGATOR(
registry, Conv2dK1x7S1<float>, delegator::Conv2dParam,
MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU,
float, ImplType::NEON, K1x7S1));
MACE_REGISTER_DELEGATOR(
registry, Conv2dK7x1S1<float>, delegator::Conv2dParam,
MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU,
float, ImplType::NEON, K7x1S1));
MACE_REGISTER_DELEGATOR(
registry, Conv2dK1x15S1<float>, delegator::Conv2dParam,
MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU,
float, ImplType::NEON, K1x15S1));
MACE_REGISTER_DELEGATOR(
registry, Conv2dK15x1S1<float>, delegator::Conv2dParam,
MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU,
float, ImplType::NEON, K15x1S1));
}
} // namespace arm
} // namespace ops
} // namespace mace
-// Copyright 2019 The MACE Authors. All Rights Reserved.
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@@ -12,76 +12,66 @@
// See the License for the specific language governing permissions and
// limitations under the License.

-#ifndef MACE_OPS_ARM_FP32_CONV_2D_1XN_H_
#ifndef MACE_OPS_ARM_BASE_CONV_2D_1XN_H_
-#define MACE_OPS_ARM_FP32_CONV_2D_1XN_H_
#define MACE_OPS_ARM_BASE_CONV_2D_1XN_H_

#include <vector>

#include "mace/core/ops/op_context.h"
#include "mace/core/tensor.h"
-#include "mace/ops/arm/fp32/conv_2d.h"
#include "mace/ops/arm/base/conv_2d_mxn.h"
#include "mace/public/mace.h"

namespace mace {
namespace ops {
namespace arm {
-namespace fp32 {

-class Conv2dK1x7S1 : public Conv2dBase {
template<typename T>
class Conv2dK1x7S1 : public Conv2dKMxN<T> {
 public:
  explicit Conv2dK1x7S1(const delegator::Conv2dParam &param)
-      : Conv2dBase(param) {}
      : Conv2dKMxN<T>(param, 1, 4) {}
  virtual ~Conv2dK1x7S1() {}
-  MaceStatus Compute(
-      const OpContext *context,
-      const Tensor *input,
-      const Tensor *filter,
-      Tensor *output) override;
  MaceStatus DoCompute(const ConvComputeParam &p, const T *filter,
                       const T *input_data, T *output_data) override;
};

-class Conv2dK7x1S1 : public Conv2dBase {
template<typename T>
class Conv2dK7x1S1 : public Conv2dKMxN<T> {
 public:
  explicit Conv2dK7x1S1(const delegator::Conv2dParam &param)
-      : Conv2dBase(param) {}
      : Conv2dKMxN<T>(param, 4, 1) {}
  virtual ~Conv2dK7x1S1() {}
-  MaceStatus Compute(
-      const OpContext *context,
-      const Tensor *input,
-      const Tensor *filter,
-      Tensor *output) override;
  MaceStatus DoCompute(const ConvComputeParam &p, const T *filter,
                       const T *input_data, T *output_data) override;
};

-class Conv2dK1x15S1 : public Conv2dBase {
template<typename T>
class Conv2dK1x15S1 : public Conv2dKMxN<T> {
 public:
  explicit Conv2dK1x15S1(const delegator::Conv2dParam &param)
-      : Conv2dBase(param) {}
      : Conv2dKMxN<T>(param, 1, 4) {}
  virtual ~Conv2dK1x15S1() {}
-  MaceStatus Compute(
-      const OpContext *context,
-      const Tensor *input,
-      const Tensor *filter,
-      Tensor *output) override;
  MaceStatus DoCompute(const ConvComputeParam &p, const T *filter,
                       const T *input_data, T *output_data) override;
};

-class Conv2dK15x1S1 : public Conv2dBase {
template<typename T>
class Conv2dK15x1S1 : public Conv2dKMxN<T> {
 public:
  explicit Conv2dK15x1S1(const delegator::Conv2dParam &param)
-      : Conv2dBase(param) {}
      : Conv2dKMxN<T>(param, 4, 1) {}
  virtual ~Conv2dK15x1S1() {}
-  MaceStatus Compute(
-      const OpContext *context,
-      const Tensor *input,
-      const Tensor *filter,
-      Tensor *output) override;
  MaceStatus DoCompute(const ConvComputeParam &p, const T *filter,
                       const T *input_data, T *output_data) override;
};

-} // namespace fp32
} // namespace arm
} // namespace ops
} // namespace mace

-#endif // MACE_OPS_ARM_FP32_CONV_2D_1XN_H_
#endif // MACE_OPS_ARM_BASE_CONV_2D_1XN_H_
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/arm/base/conv_2d_3x3.h"
namespace mace {
namespace ops {
namespace arm {
void RegisterConv2dK3x3Delegator(OpDelegatorRegistry *registry) {
MACE_REGISTER_DELEGATOR(
registry, Conv2dK3x3S1<float>, delegator::Conv2dParam,
MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU,
float, ImplType::NEON, K3x3S1));
MACE_REGISTER_DELEGATOR(
registry, Conv2dK3x3S2<float>, delegator::Conv2dParam,
MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU,
float, ImplType::NEON, K3x3S2));
}
} // namespace arm
} // namespace ops
} // namespace mace
-// Copyright 2019 The MACE Authors. All Rights Reserved.
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@@ -12,50 +12,44 @@
// See the License for the specific language governing permissions and
// limitations under the License.

-#ifndef MACE_OPS_ARM_FP32_CONV_2D_3X3_H_
#ifndef MACE_OPS_ARM_BASE_CONV_2D_3X3_H_
-#define MACE_OPS_ARM_FP32_CONV_2D_3X3_H_
#define MACE_OPS_ARM_BASE_CONV_2D_3X3_H_

#include <vector>

#include "mace/core/ops/op_context.h"
#include "mace/core/tensor.h"
-#include "mace/ops/arm/fp32/conv_2d.h"
#include "mace/ops/arm/base/conv_2d_mxn.h"
#include "mace/public/mace.h"

namespace mace {
namespace ops {
namespace arm {
-namespace fp32 {

-class Conv2dK3x3S1 : public Conv2dBase {
template<typename T>
class Conv2dK3x3S1 : public Conv2dKMxN<T> {
 public:
  explicit Conv2dK3x3S1(const delegator::Conv2dParam &param)
-      : Conv2dBase(param) {}
      : Conv2dKMxN<T>(param, 2, 4) {}
  virtual ~Conv2dK3x3S1() {}
-  MaceStatus Compute(
-      const OpContext *context,
-      const Tensor *input,
-      const Tensor *filter,
-      Tensor *output) override;
  MaceStatus DoCompute(const ConvComputeParam &p, const T *filter,
                       const T *input_data, T *output_data) override;
};

-class Conv2dK3x3S2 : public Conv2dBase {
template<typename T>
class Conv2dK3x3S2 : public Conv2dKMxN<T> {
 public:
  explicit Conv2dK3x3S2(const delegator::Conv2dParam &param)
-      : Conv2dBase(param) {}
      : Conv2dKMxN<T>(param, 1, 4) {}
  virtual ~Conv2dK3x3S2() {}
-  MaceStatus Compute(
-      const OpContext *context,
-      const Tensor *input,
-      const Tensor *filter,
-      Tensor *output) override;
  MaceStatus DoCompute(const ConvComputeParam &p, const T *filter,
                       const T *input_data, T *output_data) override;
};

-} // namespace fp32
} // namespace arm
} // namespace ops
} // namespace mace

-#endif // MACE_OPS_ARM_FP32_CONV_2D_3X3_H_
#endif // MACE_OPS_ARM_BASE_CONV_2D_3X3_H_
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/arm/base/conv_2d_5x5.h"
namespace mace {
namespace ops {
namespace arm {
void RegisterConv2dK5x5Delegator(OpDelegatorRegistry *registry) {
MACE_REGISTER_DELEGATOR(
registry, Conv2dK5x5S1<float>, delegator::Conv2dParam,
MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU,
float, ImplType::NEON, K5x5S1));
}
} // namespace arm
} // namespace ops
} // namespace mace
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_OPS_ARM_BASE_CONV_2D_5X5_H_
#define MACE_OPS_ARM_BASE_CONV_2D_5X5_H_
#include <vector>
#include "mace/core/ops/op_context.h"
#include "mace/core/tensor.h"
#include "mace/ops/arm/base/conv_2d_mxn.h"
#include "mace/public/mace.h"
namespace mace {
namespace ops {
namespace arm {
template<typename T>
class Conv2dK5x5S1 : public Conv2dKMxN<T> {
public:
explicit Conv2dK5x5S1(const delegator::Conv2dParam &param)
: Conv2dKMxN<T>(param, 1, 4) {}
virtual ~Conv2dK5x5S1() {}
MaceStatus DoCompute(const ConvComputeParam &p, const T *filter,
const T *input_data, T *output_data) override;
};
} // namespace arm
} // namespace ops
} // namespace mace
#endif // MACE_OPS_ARM_BASE_CONV_2D_5X5_H_
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/arm/base/conv_2d_7x7.h"
namespace mace {
namespace ops {
namespace arm {
void RegisterConv2dK7x7Delegator(OpDelegatorRegistry *registry) {
MACE_REGISTER_DELEGATOR(
registry, Conv2dK7x7S1<float>, delegator::Conv2dParam,
MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU,
float, ImplType::NEON, K7x7S1));
MACE_REGISTER_DELEGATOR(
registry, Conv2dK7x7S2<float>, delegator::Conv2dParam,
MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU,
float, ImplType::NEON, K7x7S2));
MACE_REGISTER_DELEGATOR(
registry, Conv2dK7x7S3<float>, delegator::Conv2dParam,
MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU,
float, ImplType::NEON, K7x7S3));
}
} // namespace arm
} // namespace ops
} // namespace mace
-// Copyright 2019 The MACE Authors. All Rights Reserved.
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@@ -12,63 +12,55 @@
// See the License for the specific language governing permissions and
// limitations under the License.

-#ifndef MACE_OPS_ARM_FP32_CONV_2D_7X7_H_
#ifndef MACE_OPS_ARM_BASE_CONV_2D_7X7_H_
-#define MACE_OPS_ARM_FP32_CONV_2D_7X7_H_
#define MACE_OPS_ARM_BASE_CONV_2D_7X7_H_

#include <vector>

#include "mace/core/ops/op_context.h"
#include "mace/core/tensor.h"
-#include "mace/ops/arm/fp32/conv_2d.h"
#include "mace/ops/arm/base/conv_2d_mxn.h"
#include "mace/public/mace.h"

namespace mace {
namespace ops {
namespace arm {
-namespace fp32 {

-class Conv2dK7x7S1 : public Conv2dBase {
template<typename T>
class Conv2dK7x7S1 : public Conv2dKMxN<T> {
 public:
  explicit Conv2dK7x7S1(const delegator::Conv2dParam &param)
-      : Conv2dBase(param) {}
      : Conv2dKMxN<T>(param, 1, 4) {}
  virtual ~Conv2dK7x7S1() {}
-  MaceStatus Compute(
-      const OpContext *context,
-      const Tensor *input,
-      const Tensor *filter,
-      Tensor *output) override;
  MaceStatus DoCompute(const ConvComputeParam &p, const T *filter,
                       const T *input_data, T *output_data) override;
};

-class Conv2dK7x7S2 : public Conv2dBase {
template<typename T>
class Conv2dK7x7S2 : public Conv2dKMxN<T> {
 public:
  explicit Conv2dK7x7S2(const delegator::Conv2dParam &param)
-      : Conv2dBase(param) {}
      : Conv2dKMxN<T>(param, 1, 4) {}
  virtual ~Conv2dK7x7S2() {}
-  MaceStatus Compute(
-      const OpContext *context,
-      const Tensor *input,
-      const Tensor *filter,
-      Tensor *output) override;
  MaceStatus DoCompute(const ConvComputeParam &p, const T *filter,
                       const T *input_data, T *output_data) override;
};

-class Conv2dK7x7S3 : public Conv2dBase {
template<typename T>
class Conv2dK7x7S3 : public Conv2dKMxN<T> {
 public:
  explicit Conv2dK7x7S3(const delegator::Conv2dParam &param)
-      : Conv2dBase(param) {}
      : Conv2dKMxN<T>(param, 1, 4) {}
  virtual ~Conv2dK7x7S3() {}
-  MaceStatus Compute(
-      const OpContext *context,
-      const Tensor *input,
-      const Tensor *filter,
-      Tensor *output) override;
  MaceStatus DoCompute(const ConvComputeParam &p, const T *filter,
                       const T *input_data, T *output_data) override;
};

-} // namespace fp32
} // namespace arm
} // namespace ops
} // namespace mace

-#endif // MACE_OPS_ARM_FP32_CONV_2D_7X7_H_
#endif // MACE_OPS_ARM_BASE_CONV_2D_7X7_H_
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/arm/base/conv_2d_general.h"
#include <memory>
namespace mace {
namespace ops {
namespace arm {
template<typename T>
MaceStatus Conv2dGeneral<T>::Compute(const OpContext *context,
const Tensor *input,
const Tensor *filter,
Tensor *output) {
std::unique_ptr<const Tensor> padded_input;
std::unique_ptr<Tensor> padded_output;
ResizeOutAndPadInOut(context, input, filter, output, 1, 4,
&padded_input, &padded_output);
const Tensor *in_tensor = input;
if (padded_input != nullptr) {
in_tensor = padded_input.get();
}
Tensor *out_tensor = output;
if (padded_output != nullptr) {
out_tensor = padded_output.get();
}
out_tensor->Clear();
Tensor::MappingGuard in_guard(input);
Tensor::MappingGuard filter_guard(filter);
Tensor::MappingGuard out_guard(output);
const T *filter_data = filter->data<T>();
const T *input_data = in_tensor->data<T>();
T *output_data = out_tensor->mutable_data<T>();
const ConvComputeParam p =
PreWorkAndGetConv2DParam(context, in_tensor, out_tensor);
auto &filter_shape = filter->shape();
DoCompute(p, filter_data, input_data, output_data, filter_shape);
UnPadOutput(*out_tensor, output);
return MaceStatus::MACE_SUCCESS;
}
void RegisterConv2dGeneralDelegator(OpDelegatorRegistry *registry) {
MACE_REGISTER_DELEGATOR(
registry, Conv2dGeneral<float>, delegator::Conv2dParam,
MACE_DELEGATOR_KEY(Conv2d, DeviceType::CPU, float, ImplType::NEON));
}
} // namespace arm
} // namespace ops
} // namespace mace
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_OPS_ARM_BASE_CONV_2D_GENERAL_H_
#define MACE_OPS_ARM_BASE_CONV_2D_GENERAL_H_
#include <vector>
#include "mace/core/ops/op_context.h"
#include "mace/core/tensor.h"
#include "mace/ops/arm/base/conv_2d.h"
#include "mace/public/mace.h"
namespace mace {
namespace ops {
namespace arm {
template<typename T>
class Conv2dGeneral : public Conv2dBase {
public:
explicit Conv2dGeneral(const delegator::Conv2dParam &param)
: Conv2dBase(param, sizeof(T)) {}
virtual ~Conv2dGeneral() {}
MaceStatus Compute(const OpContext *context, const Tensor *input,
const Tensor *filter, Tensor *output) override;
protected:
MaceStatus DoCompute(
const ConvComputeParam &p, const T *filter_data,
const T *input_data, T *output_data,
const std::vector<index_t> &filter_shape);
};
} // namespace arm
} // namespace ops
} // namespace mace
#endif // MACE_OPS_ARM_BASE_CONV_2D_GENERAL_H_
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_OPS_ARM_BASE_CONV_2D_MXN_H_
#define MACE_OPS_ARM_BASE_CONV_2D_MXN_H_
#include <memory>
#include <vector>
#include "mace/core/ops/op_context.h"
#include "mace/core/tensor.h"
#include "mace/ops/arm/base/conv_2d.h"
#include "mace/public/mace.h"
namespace mace {
namespace ops {
namespace arm {
template<typename T>
class Conv2dKMxN : public Conv2dBase {
public:
explicit Conv2dKMxN(const delegator::Conv2dParam &param,
const int tile_h, const int tile_w)
: Conv2dBase(param, sizeof(T)),
out_tile_h_(tile_h), out_tile_w_(tile_w) {}
virtual ~Conv2dKMxN() {}
MaceStatus Compute(const OpContext *context, const Tensor *input,
const Tensor *filter, Tensor *output) override {
std::unique_ptr<const Tensor> padded_input;
std::unique_ptr<Tensor> padded_output;
ResizeOutAndPadInOut(context, input, filter, output, out_tile_h_,
out_tile_w_, &padded_input, &padded_output);
const Tensor *in_tensor = input;
if (padded_input != nullptr) {
in_tensor = padded_input.get();
}
Tensor *out_tensor = output;
if (padded_output != nullptr) {
out_tensor = padded_output.get();
}
out_tensor->Clear();
Tensor::MappingGuard in_guard(input);
Tensor::MappingGuard filter_guard(filter);
Tensor::MappingGuard out_guard(output);
const T *filter_data = filter->data<T>();
const T *input_data = in_tensor->data<T>();
T *output_data = out_tensor->mutable_data<T>();
const ConvComputeParam p =
PreWorkAndGetConv2DParam(context, in_tensor, out_tensor);
DoCompute(p, filter_data, input_data, output_data);
UnPadOutput(*out_tensor, output);
return MaceStatus::MACE_SUCCESS;
}
virtual MaceStatus DoCompute(const ConvComputeParam &p, const T *filter,
const T *input_data, T *output_data) = 0;
private:
const int out_tile_h_;
const int out_tile_w_;
};
} // namespace arm
} // namespace ops
} // namespace mace
#endif // MACE_OPS_ARM_BASE_CONV_2D_MXN_H_
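Conv2dKMxN is a template-method base: the shared Compute() handles resizing, padding, mapping and parameter setup, then hands the hot loop to a kernel-specific DoCompute(), and the (tile_h, tile_w) pair passed by each subclass records how many output rows and columns that kernel produces per iteration. The stripped-down sketch below is not MACE code; ComputeParam, TiledKernelBase and KernelK3x3S1 are invented stand-ins that only show the structure.

// Shape of the Conv2dKMxN pattern: fixed driver, virtual tile kernel.
#include <cstdio>

struct ComputeParam { int out_height, out_width; };

class TiledKernelBase {
 public:
  TiledKernelBase(int tile_h, int tile_w) : tile_h_(tile_h), tile_w_(tile_w) {}
  virtual ~TiledKernelBase() = default;

  // Shared driver: pre/post work around the kernel-specific inner loop.
  void Compute(const ComputeParam &p, const float *in, float *out) {
    // ... resize/pad/map would happen here in the real code ...
    for (int h = 0; h < p.out_height; h += tile_h_) {
      for (int w = 0; w < p.out_width; w += tile_w_) {
        DoComputeTile(p, h, w, in, out);
      }
    }
    // ... unpad would happen here ...
  }

 protected:
  virtual void DoComputeTile(const ComputeParam &p, int h, int w,
                             const float *in, float *out) = 0;
  const int tile_h_, tile_w_;
};

// A "3x3 stride-1"-style kernel that claims 2 rows x 4 columns per tile.
class KernelK3x3S1 : public TiledKernelBase {
 public:
  KernelK3x3S1() : TiledKernelBase(2, 4) {}

 protected:
  void DoComputeTile(const ComputeParam &, int h, int w,
                     const float *, float *) override {
    std::printf("tile at (%d, %d)\n", h, w);
  }
};

int main() {
  KernelK3x3S1 kernel;
  ComputeParam p{4, 8};
  kernel.Compute(p, nullptr, nullptr);
  return 0;
}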
-// Copyright 2019 The MACE Authors. All Rights Reserved.
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@@ -12,17 +12,17 @@
// See the License for the specific language governing permissions and
// limitations under the License.

-#include "mace/ops/arm/fp32/deconv_2d.h"
#include "mace/ops/arm/base/deconv_2d.h"

-#include <utility>
#include <functional>
-#include "mace/utils/memory.h"
#include <utility>

#include "mace/ops/common/conv_pool_2d_util.h"
#include "mace/utils/memory.h"

namespace mace {
namespace ops {
namespace arm {
-namespace fp32 {

MaceStatus Deconv2dBase::ResizeOutAndPadOut(
    const OpContext *context,
@@ -67,7 +67,7 @@ MaceStatus Deconv2dBase::ResizeOutAndPadOut(
        std::accumulate(padded_out_shape.begin(),
                        padded_out_shape.end(),
                        1,
-                        std::multiplies<index_t>()) * sizeof(float);
                        std::multiplies<index_t>()) * type_size_;
    ScratchBuffer *scratch = context->device()->scratch_buffer();
    scratch->Rewind();
    index_t scratch_size = PadAlignSize(padded_out_size);
@@ -75,7 +75,7 @@ MaceStatus Deconv2dBase::ResizeOutAndPadOut(
    std::unique_ptr<Tensor>
        padded_out
-        (make_unique<Tensor>(scratch->Scratch(scratch_size), DT_FLOAT));
        (make_unique<Tensor>(scratch->Scratch(scratch_size), output->dtype()));
    padded_out->Reshape(padded_out_shape);
    *padded_output = std::move(padded_out);
  }
@@ -97,24 +97,97 @@ void Deconv2dBase::UnPadOutput(const Tensor &src,
  const index_t padded_height = src.dim(2);
  const index_t padded_width = src.dim(3);
-  auto padded_out_data = src.data<float>();
-  auto out_data = dst->mutable_data<float>();
  auto padded_out_data = src.data<uint8_t>();
  auto out_data = dst->mutable_data<uint8_t>();

  for (index_t i = 0; i < batch; ++i) {
    for (index_t j = 0; j < channels; ++j) {
      for (index_t k = 0; k < height; ++k) {
-        const float *input_base =
        const uint8_t *input_base =
            padded_out_data + ((i * channels + j) * padded_height
-                + (k + pad_h)) * padded_width;
                + (k + pad_h)) * padded_width * type_size_;
-        float *output_base =
        uint8_t *output_base =
-            out_data + ((i * channels + j) * height + k) * width;
            out_data + ((i * channels + j) * height + k) * width * type_size_;
-        memcpy(output_base, input_base + pad_w, width * sizeof(float));
        memcpy(output_base,
               input_base + pad_w * type_size_,
               width * type_size_);
      }
    }
  }
}

-} // namespace fp32
DeconvComputeParam Deconv2dBase::PreWorkAndGetDeconvParam(
const OpContext *context, const Tensor *input, Tensor *out_tensor) {
auto &in_shape = input->shape();
auto &out_shape = out_tensor->shape();
const index_t batch = in_shape[0];
const index_t inch = in_shape[1];
const index_t h = in_shape[2];
const index_t w = in_shape[3];
const index_t outch = out_shape[1];
const index_t outh = out_shape[2];
const index_t outw = out_shape[3];
const index_t out_img_size = outh * outw;
utils::ThreadPool
&thread_pool = context->device()->cpu_runtime()->thread_pool();
return DeconvComputeParam(batch, inch, h, w, outch, outh, outw,
out_img_size, &thread_pool);
}
DepthwiseDeconvComputeParam Deconv2dBase::PreWorkAndGetDepthwiseDeconvParam(
const OpContext *context, const Tensor *input, Tensor *out_tensor) {
auto &in_shape = input->shape();
auto &out_shape = out_tensor->shape();
const index_t batch = in_shape[0];
const index_t channels = in_shape[1];
const index_t h = in_shape[2];
const index_t w = in_shape[3];
const index_t in_img_size = h * w;
const index_t outh = out_shape[2];
const index_t outw = out_shape[3];
const index_t out_img_size = outh * outw;
utils::ThreadPool
&thread_pool = context->device()->cpu_runtime()->thread_pool();
return DepthwiseDeconvComputeParam(batch, channels, h, w, in_img_size,
outh, outw, out_img_size, &thread_pool);
}
GroupDeconvComputeParam Deconv2dBase::PreWorkAndGetGroupDeconvParam(
const OpContext *context, const Tensor *input, Tensor *out_tensor) {
auto &in_shape = input->shape();
auto &out_shape = out_tensor->shape();
const index_t batch = in_shape[0];
const index_t inch = in_shape[1];
const index_t h = in_shape[2];
const index_t w = in_shape[3];
const index_t outch = out_shape[1];
const index_t outh = out_shape[2];
const index_t outw = out_shape[3];
const index_t in_img_size = h * w;
const index_t out_img_size = outh * outw;
const index_t inch_g = inch / group_;
const index_t outch_g = outch / group_;
utils::ThreadPool
&thread_pool = context->device()->cpu_runtime()->thread_pool();
return GroupDeconvComputeParam(batch, inch, h, w, outch, outh, outw,
in_img_size, out_img_size, inch_g,
outch_g, &thread_pool);
}
} // namespace arm } // namespace arm
} // namespace ops } // namespace ops
} // namespace mace } // namespace mace
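The rewritten UnPadOutput above walks the padded tensor through raw uint8_t pointers and scales every offset by type_size_, so one copy loop now serves any element width instead of hard-coding sizeof(float). A minimal standalone sketch of that byte arithmetic (values assumed purely for illustration; this is not MACE code):

#include <cstdint>
#include <cstring>

int main() {
  const int type_size = 2;                 // e.g. fp16; the float build uses 4
  const int width = 8, pad_w = 1, padded_width = 10;
  uint8_t padded_row[10 * 2] = {};         // one padded output row, 2 bytes per element
  uint8_t out_row[8 * 2];
  // Skip pad_w elements (= pad_w * type_size bytes), then copy width elements
  // (= width * type_size bytes); no float-specific sizeof appears anywhere.
  std::memcpy(out_row, padded_row + pad_w * type_size, width * type_size);
  (void)padded_width;
  return 0;
}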
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_OPS_ARM_BASE_DECONV_2D_H_
#define MACE_OPS_ARM_BASE_DECONV_2D_H_
#include <memory>
#include <vector>
#include "mace/core/ops/op_context.h"
#include "mace/core/tensor.h"
#include "mace/core/types.h"
#include "mace/ops/arm/base/gemm.h"
#include "mace/ops/common/conv_pool_2d_util.h"
#include "mace/ops/delegator/deconv_2d.h"
#include "mace/public/mace.h"
namespace mace {
namespace ops {
namespace arm {
struct DeconvComputeParam {
const index_t batch;
const index_t in_channels;
const index_t in_height;
const index_t in_width;
const index_t out_channels;
const index_t out_height;
const index_t out_width;
const index_t out_img_size;
utils::ThreadPool &thread_pool;
DeconvComputeParam(const index_t b,
const index_t in_c,
const index_t in_h,
const index_t in_w,
const index_t out_c,
const index_t out_h,
const index_t out_w,
const index_t out_size,
utils::ThreadPool *thrd_pool)
: batch(b), in_channels(in_c), in_height(in_h), in_width(in_w),
out_channels(out_c), out_height(out_h), out_width(out_w),
out_img_size(out_size), thread_pool(*thrd_pool) {}
};
struct DepthwiseDeconvComputeParam {
const index_t batch;
const index_t in_channels;
const index_t in_height;
const index_t in_width;
const index_t in_img_size;
const index_t out_height;
const index_t out_width;
const index_t out_img_size;
utils::ThreadPool &thread_pool;
DepthwiseDeconvComputeParam(const index_t b,
const index_t in_c,
const index_t in_h,
const index_t in_w,
const index_t in_size,
const index_t out_h,
const index_t out_w,
const index_t out_size,
utils::ThreadPool *thrd_pool)
: batch(b),
in_channels(in_c),
in_height(in_h),
in_width(in_w),
in_img_size(in_size),
out_height(out_h),
out_width(out_w),
out_img_size(out_size),
thread_pool(*thrd_pool) {}
};
struct GroupDeconvComputeParam {
const index_t batch;
const index_t in_channels;
const index_t in_height;
const index_t in_width;
const index_t out_channels;
const index_t out_height;
const index_t out_width;
const index_t in_img_size;
const index_t out_img_size;
const index_t inch_g;
const index_t outch_g;
utils::ThreadPool &thread_pool;
GroupDeconvComputeParam(const index_t in_b,
const index_t in_ch,
const index_t in_h,
const index_t in_w,
const index_t out_ch,
const index_t out_h,
const index_t out_w,
const index_t in_size,
const index_t out_size,
const index_t in_ch_g,
const index_t out_ch_g,
utils::ThreadPool *thrd_pool)
: batch(in_b),
in_channels(in_ch),
in_height(in_h),
in_width(in_w),
out_channels(out_ch),
out_height(out_h),
out_width(out_w),
in_img_size(in_size),
out_img_size(out_size),
inch_g(in_ch_g),
outch_g(out_ch_g),
thread_pool(*thrd_pool) {}
};
class Deconv2dBase : public delegator::Deconv2d {
public:
explicit Deconv2dBase(const delegator::Deconv2dParam &param, int type_size)
: delegator::Deconv2d(param),
group_(param.group_), type_size_(type_size) {}
virtual ~Deconv2dBase() = default;
protected:
MaceStatus ResizeOutAndPadOut(const OpContext *context,
const Tensor *input,
const Tensor *filter,
const Tensor *output_shape,
Tensor *output,
std::vector<int> *out_pad_size,
std::unique_ptr<Tensor> *padded_output);
void UnPadOutput(const Tensor &src,
const std::vector<int> &out_pad_size,
Tensor *dst);
DeconvComputeParam PreWorkAndGetDeconvParam(
const OpContext *context, const Tensor *input, Tensor *out_tensor);
DepthwiseDeconvComputeParam PreWorkAndGetDepthwiseDeconvParam(
const OpContext *context, const Tensor *input, Tensor *out_tensor);
GroupDeconvComputeParam PreWorkAndGetGroupDeconvParam(
const OpContext *context, const Tensor *input, Tensor *out_tensor);
protected:
index_t group_;
private:
int type_size_;
};
} // namespace arm
} // namespace ops
} // namespace mace
#endif // MACE_OPS_ARM_BASE_DECONV_2D_H_
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/arm/base/deconv_2d_2x2.h"
namespace mace {
namespace ops {
namespace arm {
void RegisterDeconv2dK2x2Delegator(OpDelegatorRegistry *registry) {
MACE_REGISTER_DELEGATOR(
registry, Deconv2dK2x2S1<float>, delegator::Deconv2dParam,
MACE_DELEGATOR_KEY_EX(Deconv2d, DeviceType::CPU,
float, ImplType::NEON, K2x2S1));
MACE_REGISTER_DELEGATOR(
registry, Deconv2dK2x2S2<float>, delegator::Deconv2dParam,
MACE_DELEGATOR_KEY_EX(Deconv2d, DeviceType::CPU,
float, ImplType::NEON, K2x2S2));
}
} // namespace arm
} // namespace ops
} // namespace mace
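With the kernels templated on the element type, each of these registration files stays a thin shim: one MACE_REGISTER_DELEGATOR call per kernel/stride variant, keyed by op, device, data type, impl type and kernel tag. As a hedged illustration, a hypothetical 5x5 stride-1 variant (not part of this change) would be wired up the same way:

void RegisterDeconv2dK5x5Delegator(OpDelegatorRegistry *registry) {
  // Deconv2dK5x5S1 and the K5x5S1 tag are assumed names, used only to show the
  // registration pattern; no such kernel exists in this patch.
  MACE_REGISTER_DELEGATOR(
      registry, Deconv2dK5x5S1<float>, delegator::Deconv2dParam,
      MACE_DELEGATOR_KEY_EX(Deconv2d, DeviceType::CPU,
                            float, ImplType::NEON, K5x5S1));
}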
// Copyright 2019 The MACE Authors. All Rights Reserved. // Copyright 2020 The MACE Authors. All Rights Reserved.
// //
// Licensed under the Apache License, Version 2.0 (the "License"); // Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License. // you may not use this file except in compliance with the License.
...@@ -12,8 +12,8 @@ ...@@ -12,8 +12,8 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#ifndef MACE_OPS_ARM_FP32_DECONV_2D_2X2_H_ #ifndef MACE_OPS_ARM_BASE_DECONV_2D_2X2_H_
#define MACE_OPS_ARM_FP32_DECONV_2D_2X2_H_ #define MACE_OPS_ARM_BASE_DECONV_2D_2X2_H_
#include <vector> #include <vector>
#include <memory> #include <memory>
...@@ -21,46 +21,38 @@ ...@@ -21,46 +21,38 @@
#include "mace/core/ops/op_context.h" #include "mace/core/ops/op_context.h"
#include "mace/core/tensor.h" #include "mace/core/tensor.h"
#include "mace/core/types.h" #include "mace/core/types.h"
#include "mace/ops/arm/fp32/deconv_2d.h" #include "mace/ops/arm/base/deconv_2d_mxn.h"
#include "mace/ops/common/conv_pool_2d_util.h" #include "mace/ops/common/conv_pool_2d_util.h"
#include "mace/public/mace.h" #include "mace/public/mace.h"
namespace mace { namespace mace {
namespace ops { namespace ops {
namespace arm { namespace arm {
namespace fp32 {
class Deconv2dK2x2S1 : public Deconv2dBase { template<typename T>
class Deconv2dK2x2S1 : public Deconv2dKMxN<T> {
public: public:
explicit Deconv2dK2x2S1(const delegator::Deconv2dParam &param) explicit Deconv2dK2x2S1(const delegator::Deconv2dParam &param)
: Deconv2dBase(param) {} : Deconv2dKMxN<T>(param) {}
virtual ~Deconv2dK2x2S1() {} virtual ~Deconv2dK2x2S1() {}
MaceStatus Compute( MaceStatus DoCompute(const DeconvComputeParam &p, const T *filter,
const OpContext *context, const T *input_data, T *padded_out_data) override;
const Tensor *input,
const Tensor *filter,
const Tensor *output_shape,
Tensor *output) override;
}; };
class Deconv2dK2x2S2 : public Deconv2dBase { template<typename T>
class Deconv2dK2x2S2 : public Deconv2dKMxN<T> {
public: public:
explicit Deconv2dK2x2S2(const delegator::Deconv2dParam &param) explicit Deconv2dK2x2S2(const delegator::Deconv2dParam &param)
: Deconv2dBase(param) {} : Deconv2dKMxN<T>(param) {}
virtual ~Deconv2dK2x2S2() {} virtual ~Deconv2dK2x2S2() {}
MaceStatus Compute( MaceStatus DoCompute(const DeconvComputeParam &p, const T *filter,
const OpContext *context, const T *input_data, T *padded_out_data) override;
const Tensor *input,
const Tensor *filter,
const Tensor *output_shape,
Tensor *output) override;
}; };
} // namespace fp32
} // namespace arm } // namespace arm
} // namespace ops } // namespace ops
} // namespace mace } // namespace mace
#endif // MACE_OPS_ARM_FP32_DECONV_2D_2X2_H_ #endif // MACE_OPS_ARM_BASE_DECONV_2D_2X2_H_
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/arm/base/deconv_2d_3x3.h"
namespace mace {
namespace ops {
namespace arm {
void RegisterDeconv2dK3x3Delegator(OpDelegatorRegistry *registry) {
MACE_REGISTER_DELEGATOR(
registry, Deconv2dK3x3S1<float>, delegator::Deconv2dParam,
MACE_DELEGATOR_KEY_EX(Deconv2d, DeviceType::CPU,
float, ImplType::NEON, K3x3S1));
MACE_REGISTER_DELEGATOR(
registry, Deconv2dK3x3S2<float>, delegator::Deconv2dParam,
MACE_DELEGATOR_KEY_EX(Deconv2d, DeviceType::CPU,
float, ImplType::NEON, K3x3S2));
}
} // namespace arm
} // namespace ops
} // namespace mace
// Copyright 2019 The MACE Authors. All Rights Reserved. // Copyright 2020 The MACE Authors. All Rights Reserved.
// //
// Licensed under the Apache License, Version 2.0 (the "License"); // Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License. // you may not use this file except in compliance with the License.
...@@ -12,8 +12,8 @@ ...@@ -12,8 +12,8 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#ifndef MACE_OPS_ARM_FP32_DECONV_2D_3X3_H_ #ifndef MACE_OPS_ARM_BASE_DECONV_2D_3X3_H_
#define MACE_OPS_ARM_FP32_DECONV_2D_3X3_H_ #define MACE_OPS_ARM_BASE_DECONV_2D_3X3_H_
#include <vector> #include <vector>
#include <memory> #include <memory>
...@@ -21,46 +21,38 @@ ...@@ -21,46 +21,38 @@
#include "mace/core/ops/op_context.h" #include "mace/core/ops/op_context.h"
#include "mace/core/tensor.h" #include "mace/core/tensor.h"
#include "mace/core/types.h" #include "mace/core/types.h"
#include "mace/ops/arm/fp32/deconv_2d.h" #include "mace/ops/arm/base/deconv_2d_mxn.h"
#include "mace/ops/common/conv_pool_2d_util.h" #include "mace/ops/common/conv_pool_2d_util.h"
#include "mace/public/mace.h" #include "mace/public/mace.h"
namespace mace { namespace mace {
namespace ops { namespace ops {
namespace arm { namespace arm {
namespace fp32 {
class Deconv2dK3x3S1 : public Deconv2dBase { template<typename T>
class Deconv2dK3x3S1 : public Deconv2dKMxN<T> {
public: public:
explicit Deconv2dK3x3S1(const delegator::Deconv2dParam &param) explicit Deconv2dK3x3S1(const delegator::Deconv2dParam &param)
: Deconv2dBase(param) {} : Deconv2dKMxN<T>(param) {}
virtual ~Deconv2dK3x3S1() {} virtual ~Deconv2dK3x3S1() {}
MaceStatus Compute( MaceStatus DoCompute(const DeconvComputeParam &p, const T *filter,
const OpContext *context, const T *input_data, T *padded_out_data) override;
const Tensor *input,
const Tensor *filter,
const Tensor *output_shape,
Tensor *output) override;
}; };
class Deconv2dK3x3S2 : public Deconv2dBase { template<typename T>
class Deconv2dK3x3S2 : public Deconv2dKMxN<T> {
public: public:
explicit Deconv2dK3x3S2(const delegator::Deconv2dParam &param) explicit Deconv2dK3x3S2(const delegator::Deconv2dParam &param)
: Deconv2dBase(param) {} : Deconv2dKMxN<T>(param) {}
virtual ~Deconv2dK3x3S2() {} virtual ~Deconv2dK3x3S2() {}
MaceStatus Compute( MaceStatus DoCompute(const DeconvComputeParam &p, const T *filter,
const OpContext *context, const T *input_data, T *padded_out_data) override;
const Tensor *input,
const Tensor *filter,
const Tensor *output_shape,
Tensor *output) override;
}; };
} // namespace fp32
} // namespace arm } // namespace arm
} // namespace ops } // namespace ops
} // namespace mace } // namespace mace
#endif // MACE_OPS_ARM_FP32_DECONV_2D_3X3_H_ #endif // MACE_OPS_ARM_BASE_DECONV_2D_3X3_H_
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/arm/base/deconv_2d_4x4.h"
namespace mace {
namespace ops {
namespace arm {
void RegisterDeconv2dK4x4Delegator(OpDelegatorRegistry *registry) {
MACE_REGISTER_DELEGATOR(
registry, Deconv2dK4x4S1<float>, delegator::Deconv2dParam,
MACE_DELEGATOR_KEY_EX(Deconv2d, DeviceType::CPU,
float, ImplType::NEON, K4x4S1));
MACE_REGISTER_DELEGATOR(
registry, Deconv2dK4x4S2<float>, delegator::Deconv2dParam,
MACE_DELEGATOR_KEY_EX(Deconv2d, DeviceType::CPU,
float, ImplType::NEON, K4x4S2));
}
} // namespace arm
} // namespace ops
} // namespace mace
// Copyright 2019 The MACE Authors. All Rights Reserved. // Copyright 2020 The MACE Authors. All Rights Reserved.
// //
// Licensed under the Apache License, Version 2.0 (the "License"); // Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License. // you may not use this file except in compliance with the License.
...@@ -12,55 +12,47 @@ ...@@ -12,55 +12,47 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#ifndef MACE_OPS_ARM_FP32_DECONV_2D_4X4_H_ #ifndef MACE_OPS_ARM_BASE_DECONV_2D_4X4_H_
#define MACE_OPS_ARM_FP32_DECONV_2D_4X4_H_ #define MACE_OPS_ARM_BASE_DECONV_2D_4X4_H_
#include <vector>
#include <memory> #include <memory>
#include <vector>
#include "mace/core/ops/op_context.h" #include "mace/core/ops/op_context.h"
#include "mace/core/tensor.h" #include "mace/core/tensor.h"
#include "mace/core/types.h" #include "mace/core/types.h"
#include "mace/ops/arm/fp32/deconv_2d.h" #include "mace/ops/arm/base/deconv_2d_mxn.h"
#include "mace/ops/common/conv_pool_2d_util.h" #include "mace/ops/common/conv_pool_2d_util.h"
#include "mace/public/mace.h" #include "mace/public/mace.h"
namespace mace { namespace mace {
namespace ops { namespace ops {
namespace arm { namespace arm {
namespace fp32 {
class Deconv2dK4x4S1 : public Deconv2dBase { template<typename T>
class Deconv2dK4x4S1 : public Deconv2dKMxN<T> {
public: public:
explicit Deconv2dK4x4S1(const delegator::Deconv2dParam &param) explicit Deconv2dK4x4S1(const delegator::Deconv2dParam &param)
: Deconv2dBase(param) {} : Deconv2dKMxN<T>(param) {}
virtual ~Deconv2dK4x4S1() {} virtual ~Deconv2dK4x4S1() {}
MaceStatus Compute( MaceStatus DoCompute(const DeconvComputeParam &p, const T *filter,
const OpContext *context, const T *input_data, T *padded_out_data) override;
const Tensor *input,
const Tensor *filter,
const Tensor *output_shape,
Tensor *output) override;
}; };
class Deconv2dK4x4S2 : public Deconv2dBase { template<typename T>
class Deconv2dK4x4S2 : public Deconv2dKMxN<T> {
public: public:
explicit Deconv2dK4x4S2(const delegator::Deconv2dParam &param) explicit Deconv2dK4x4S2(const delegator::Deconv2dParam &param)
: Deconv2dBase(param) {} : Deconv2dKMxN<T>(param) {}
virtual ~Deconv2dK4x4S2() {} virtual ~Deconv2dK4x4S2() {}
MaceStatus Compute( MaceStatus DoCompute(const DeconvComputeParam &p, const T *filter,
const OpContext *context, const T *input_data, T *padded_out_data) override;
const Tensor *input,
const Tensor *filter,
const Tensor *output_shape,
Tensor *output) override;
}; };
} // namespace fp32
} // namespace arm } // namespace arm
} // namespace ops } // namespace ops
} // namespace mace } // namespace mace
#endif // MACE_OPS_ARM_FP32_DECONV_2D_4X4_H_ #endif // MACE_OPS_ARM_BASE_DECONV_2D_4X4_H_
// Copyright 2019 The MACE Authors. All Rights Reserved. // Copyright 2020 The MACE Authors. All Rights Reserved.
// //
// Licensed under the Apache License, Version 2.0 (the "License"); // Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License. // you may not use this file except in compliance with the License.
...@@ -12,30 +12,17 @@ ...@@ -12,30 +12,17 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "mace/ops/arm/fp32/deconv_2d.h" #include "mace/ops/arm/base/deconv_2d_general.h"
// TODO(liutuo): optimize it #include <memory>
#include <vector>
namespace mace { namespace mace {
namespace ops { namespace ops {
namespace arm { namespace arm {
namespace fp32 {
class Deconv2dGeneral : public Deconv2dBase { template<typename T>
public: MaceStatus Deconv2dGeneral<T>::Compute(const OpContext *context,
explicit Deconv2dGeneral(const delegator::Deconv2dParam &param)
: Deconv2dBase(param) {}
virtual ~Deconv2dGeneral() {}
MaceStatus Compute(
const OpContext *context,
const Tensor *input,
const Tensor *filter,
const Tensor *output_shape,
Tensor *output) override;
};
MaceStatus Deconv2dGeneral::Compute(const OpContext *context,
const Tensor *input, const Tensor *input,
const Tensor *filter, const Tensor *filter,
const Tensor *output_shape, const Tensor *output_shape,
...@@ -60,9 +47,9 @@ MaceStatus Deconv2dGeneral::Compute(const OpContext *context, ...@@ -60,9 +47,9 @@ MaceStatus Deconv2dGeneral::Compute(const OpContext *context,
Tensor::MappingGuard filter_mapper(filter); Tensor::MappingGuard filter_mapper(filter);
Tensor::MappingGuard output_mapper(output); Tensor::MappingGuard output_mapper(output);
auto input_data = input->data<float>(); auto input_data = input->data<T>();
auto filter_data = filter->data<float>(); auto filter_data = filter->data<T>();
auto padded_out_data = out_tensor->mutable_data<float>(); auto padded_out_data = out_tensor->mutable_data<T>();
auto &in_shape = input->shape(); auto &in_shape = input->shape();
auto &out_shape = out_tensor->shape(); auto &out_shape = out_tensor->shape();
...@@ -95,7 +82,7 @@ MaceStatus Deconv2dGeneral::Compute(const OpContext *context, ...@@ -95,7 +82,7 @@ MaceStatus Deconv2dGeneral::Compute(const OpContext *context,
index_t start1, index_t end1, index_t step1) { index_t start1, index_t end1, index_t step1) {
for (index_t b = start0; b < end0; b += step0) { for (index_t b = start0; b < end0; b += step0) {
for (index_t oc = start1; oc < end1; oc += step1) { for (index_t oc = start1; oc < end1; oc += step1) {
float *out_base = T *out_base =
padded_out_data + (b * out_channels + oc) * out_img_size; padded_out_data + (b * out_channels + oc) * out_img_size;
for (index_t i = 0; i < in_height; ++i) { for (index_t i = 0; i < in_height; ++i) {
for (index_t j = 0; j < in_width; ++j) { for (index_t j = 0; j < in_width; ++j) {
...@@ -104,7 +91,7 @@ MaceStatus Deconv2dGeneral::Compute(const OpContext *context, ...@@ -104,7 +91,7 @@ MaceStatus Deconv2dGeneral::Compute(const OpContext *context,
for (int ic = 0; ic < in_channels; ++ic) { for (int ic = 0; ic < in_channels; ++ic) {
const index_t input_idx = const index_t input_idx =
(b * in_channels + ic) * in_img_size + i * in_width + j; (b * in_channels + ic) * in_img_size + i * in_width + j;
const float val = input_data[input_idx]; const T val = input_data[input_idx];
const index_t kernel_offset = const index_t kernel_offset =
(oc * in_channels + ic) * kernel_size; (oc * in_channels + ic) * kernel_size;
for (int k = 0; k < kernel_size; ++k) { for (int k = 0; k < kernel_size; ++k) {
...@@ -126,11 +113,10 @@ MaceStatus Deconv2dGeneral::Compute(const OpContext *context, ...@@ -126,11 +113,10 @@ MaceStatus Deconv2dGeneral::Compute(const OpContext *context,
void RegisterDeconv2dGeneralDelegator(OpDelegatorRegistry *registry) { void RegisterDeconv2dGeneralDelegator(OpDelegatorRegistry *registry) {
MACE_REGISTER_DELEGATOR( MACE_REGISTER_DELEGATOR(
registry, Deconv2dGeneral, delegator::Deconv2dParam, registry, Deconv2dGeneral<float>, delegator::Deconv2dParam,
MACE_DELEGATOR_KEY(Deconv2d, DeviceType::CPU, float, ImplType::NEON)); MACE_DELEGATOR_KEY(Deconv2d, DeviceType::CPU, float, ImplType::NEON));
} }
} // namespace fp32
} // namespace arm } // namespace arm
} // namespace ops } // namespace ops
} // namespace mace } // namespace mace
......
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_OPS_ARM_BASE_DECONV_2D_GENERAL_H_
#define MACE_OPS_ARM_BASE_DECONV_2D_GENERAL_H_
#include "mace/ops/arm/base/deconv_2d.h"
// TODO(liutuo): optimize it
namespace mace {
namespace ops {
namespace arm {
template<typename T>
class Deconv2dGeneral : public Deconv2dBase {
public:
explicit Deconv2dGeneral(const delegator::Deconv2dParam &param)
: Deconv2dBase(param, sizeof(T)) {}
virtual ~Deconv2dGeneral() {}
MaceStatus Compute(
const OpContext *context,
const Tensor *input,
const Tensor *filter,
const Tensor *output_shape,
Tensor *output) override;
};
} // namespace arm
} // namespace ops
} // namespace mace
#endif // MACE_OPS_ARM_BASE_DECONV_2D_GENERAL_H_
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_OPS_ARM_BASE_DECONV_2D_MXN_H_
#define MACE_OPS_ARM_BASE_DECONV_2D_MXN_H_
#include <memory>
#include <vector>
#include "mace/core/ops/op_context.h"
#include "mace/core/tensor.h"
#include "mace/ops/arm/base/deconv_2d.h"
#include "mace/public/mace.h"
namespace mace {
namespace ops {
namespace arm {
template<typename T>
class Deconv2dKMxN : public Deconv2dBase {
public:
explicit Deconv2dKMxN(const delegator::Deconv2dParam &param)
: Deconv2dBase(param, sizeof(T)) {}
virtual ~Deconv2dKMxN() {}
MaceStatus Compute(const OpContext *context,
const Tensor *input,
const Tensor *filter,
const Tensor *output_shape,
Tensor *output) {
std::unique_ptr<Tensor> padded_out;
std::vector<int> out_pad_size;
ResizeOutAndPadOut(context, input, filter, output_shape,
output, &out_pad_size, &padded_out);
Tensor *out_tensor = output;
if (padded_out != nullptr) {
out_tensor = padded_out.get();
}
out_tensor->Clear();
Tensor::MappingGuard input_mapper(input);
Tensor::MappingGuard filter_mapper(filter);
Tensor::MappingGuard output_mapper(output);
const T *input_data = input->data<T>();
const T *filter_data = filter->data<T>();
T *padded_out_data = out_tensor->mutable_data<T>();
const DeconvComputeParam p =
PreWorkAndGetDeconvParam(context, input, out_tensor);
DoCompute(p, filter_data, input_data, padded_out_data);
UnPadOutput(*out_tensor, out_pad_size, output);
return MaceStatus::MACE_SUCCESS;
}
virtual MaceStatus DoCompute(const DeconvComputeParam &p, const T *filter,
const T *input_data, T *padded_out_data) = 0;
};
} // namespace arm
} // namespace ops
} // namespace mace
#endif // MACE_OPS_ARM_BASE_DECONV_2D_MXN_H_
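Deconv2dKMxN turns the per-kernel classes into a template-method pattern: Compute performs the shared resize/pad, clear, mapping and UnPadOutput steps, and each kernel overrides only DoCompute with its arithmetic. A simplified scalar sketch of such an override, using a hypothetical 1x1 stride-1 kernel (it assumes the no-extra-padding case where the padded output matches the input spatial size; the real K2x2/K3x3/K4x4 kernels instead parallelize over batch and output channels with p.thread_pool and use NEON intrinsics):

template<typename T>
class Deconv2dK1x1S1 : public Deconv2dKMxN<T> {  // illustrative only, not in this patch
 public:
  explicit Deconv2dK1x1S1(const delegator::Deconv2dParam &param)
      : Deconv2dKMxN<T>(param) {}
  virtual ~Deconv2dK1x1S1() {}

  MaceStatus DoCompute(const DeconvComputeParam &p, const T *filter,
                       const T *input_data, T *padded_out_data) override {
    // Assumes p.out_height == p.in_height and p.out_width == p.in_width
    // (1x1 kernel, stride 1, no extra padding).
    const index_t in_img_size = p.in_height * p.in_width;
    for (index_t b = 0; b < p.batch; ++b) {
      for (index_t oc = 0; oc < p.out_channels; ++oc) {
        T *out_base =
            padded_out_data + (b * p.out_channels + oc) * p.out_img_size;
        for (index_t ic = 0; ic < p.in_channels; ++ic) {
          // Filter layout is [out_channels][in_channels][kh][kw]; the 1x1 case
          // collapses the spatial part, so this is a per-pixel channel mix
          // accumulated into the already-cleared padded output.
          const T w = filter[oc * p.in_channels + ic];
          const T *in_base =
              input_data + (b * p.in_channels + ic) * in_img_size;
          for (index_t i = 0; i < in_img_size; ++i) {
            out_base[i] += w * in_base[i];
          }
        }
      }
    }
    return MaceStatus::MACE_SUCCESS;
  }
};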
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/arm/base/depthwise_conv_2d_3x3.h"
namespace mace {
namespace ops {
namespace arm {
void RegisterDepthwiseConv2dK3x3Delegator(OpDelegatorRegistry *registry) {
MACE_REGISTER_DELEGATOR(
registry, DepthwiseConv2dK3x3S1<float>, delegator::DepthwiseConv2dParam,
MACE_DELEGATOR_KEY_EX(DepthwiseConv2d, DeviceType::CPU,
float, ImplType::NEON, K3x3S1));
MACE_REGISTER_DELEGATOR(
registry, DepthwiseConv2dK3x3S2<float>, delegator::DepthwiseConv2dParam,
MACE_DELEGATOR_KEY_EX(DepthwiseConv2d, DeviceType::CPU,
float, ImplType::NEON, K3x3S2));
}
} // namespace arm
} // namespace ops
} // namespace mace
// Copyright 2019 The MACE Authors. All Rights Reserved. // Copyright 2020 The MACE Authors. All Rights Reserved.
// //
// Licensed under the Apache License, Version 2.0 (the "License"); // Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License. // you may not use this file except in compliance with the License.
...@@ -12,51 +12,47 @@ ...@@ -12,51 +12,47 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#ifndef MACE_OPS_ARM_FP32_DEPTHWISE_CONV_2D_3X3_H_ #ifndef MACE_OPS_ARM_BASE_DEPTHWISE_CONV_2D_3X3_H_
#define MACE_OPS_ARM_FP32_DEPTHWISE_CONV_2D_3X3_H_ #define MACE_OPS_ARM_BASE_DEPTHWISE_CONV_2D_3X3_H_
#include <vector> #include <vector>
#include "mace/core/ops/op_context.h" #include "mace/core/ops/op_context.h"
#include "mace/core/tensor.h" #include "mace/core/tensor.h"
#include "mace/ops/arm/fp32/conv_2d.h" #include "mace/ops/arm/base/depthwise_conv_2d_mxn.h"
#include "mace/ops/delegator/depthwise_conv_2d.h" #include "mace/ops/delegator/depthwise_conv_2d.h"
#include "mace/public/mace.h" #include "mace/public/mace.h"
namespace mace { namespace mace {
namespace ops { namespace ops {
namespace arm { namespace arm {
namespace fp32 {
class DepthwiseConv2dK3x3S1 : public Conv2dBase { template<typename T>
class DepthwiseConv2dK3x3S1 : public DepthwiseConv2dKMxN<T> {
public: public:
explicit DepthwiseConv2dK3x3S1(const delegator::DepthwiseConv2dParam &param) explicit DepthwiseConv2dK3x3S1(const delegator::DepthwiseConv2dParam &param)
: Conv2dBase(param) {} : DepthwiseConv2dKMxN<T>(param) {}
virtual ~DepthwiseConv2dK3x3S1() {} virtual ~DepthwiseConv2dK3x3S1() {}
MaceStatus Compute( MaceStatus DoCompute(
const OpContext *context, const DepthwiseConvComputeParam &p, const T *filter,
const Tensor *input, const T *input_data, T *output_data) override;
const Tensor *filter,
Tensor *output) override;
}; };
class DepthwiseConv2dK3x3S2 : public Conv2dBase { template<typename T>
class DepthwiseConv2dK3x3S2 : public DepthwiseConv2dKMxN<T> {
public: public:
explicit DepthwiseConv2dK3x3S2(const delegator::DepthwiseConv2dParam &param) explicit DepthwiseConv2dK3x3S2(const delegator::DepthwiseConv2dParam &param)
: Conv2dBase(param) {} : DepthwiseConv2dKMxN<T>(param) {}
virtual ~DepthwiseConv2dK3x3S2() {} virtual ~DepthwiseConv2dK3x3S2() {}
MaceStatus Compute( MaceStatus DoCompute(
const OpContext *context, const DepthwiseConvComputeParam &p, const T *filter,
const Tensor *input, const T *input_data, T *output_data) override;
const Tensor *filter,
Tensor *output) override;
}; };
} // namespace fp32
} // namespace arm } // namespace arm
} // namespace ops } // namespace ops
} // namespace mace } // namespace mace
#endif // MACE_OPS_ARM_FP32_DEPTHWISE_CONV_2D_3X3_H_ #endif // MACE_OPS_ARM_BASE_DEPTHWISE_CONV_2D_3X3_H_
// Copyright 2019 The MACE Authors. All Rights Reserved. // Copyright 2020 The MACE Authors. All Rights Reserved.
// //
// Licensed under the Apache License, Version 2.0 (the "License"); // Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License. // you may not use this file except in compliance with the License.
...@@ -12,51 +12,53 @@ ...@@ -12,51 +12,53 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#ifndef MACE_OPS_ARM_FP32_DECONV_2D_H_ #ifndef MACE_OPS_ARM_BASE_DEPTHWISE_CONV_2D_MXN_H_
#define MACE_OPS_ARM_FP32_DECONV_2D_H_ #define MACE_OPS_ARM_BASE_DEPTHWISE_CONV_2D_MXN_H_
#include <vector> #include <vector>
#include <memory>
#include "mace/core/ops/op_context.h" #include "mace/core/ops/op_context.h"
#include "mace/core/tensor.h" #include "mace/core/tensor.h"
#include "mace/core/types.h" #include "mace/ops/arm/base/conv_2d.h"
#include "mace/ops/arm/fp32/gemm.h" #include "mace/ops/delegator/depthwise_conv_2d.h"
#include "mace/ops/common/conv_pool_2d_util.h"
#include "mace/ops/delegator/deconv_2d.h"
#include "mace/public/mace.h" #include "mace/public/mace.h"
namespace mace { namespace mace {
namespace ops { namespace ops {
namespace arm { namespace arm {
namespace fp32 {
class Deconv2dBase : public delegator::Deconv2d { template<typename T>
class DepthwiseConv2dKMxN : public Conv2dBase {
public: public:
explicit Deconv2dBase(const delegator::Deconv2dParam &param) explicit DepthwiseConv2dKMxN(const delegator::DepthwiseConv2dParam &param)
: delegator::Deconv2d(param), : Conv2dBase(param, sizeof(T)) {}
group_(param.group_) {} virtual ~DepthwiseConv2dKMxN() {}
virtual ~Deconv2dBase() = default; MaceStatus Compute(const OpContext *context, const Tensor *input,
const Tensor *filter, Tensor *output) {
DepthwiseConvComputeParam p =
PreWorkAndGetDepthwiseConv2DParam(context, input, filter, output);
Tensor::MappingGuard in_guard(input);
Tensor::MappingGuard filter_guard(filter);
Tensor::MappingGuard out_guard(output);
const T *filter_data = filter->data<T>();
const T *input_data = input->data<T>();
T *output_data = output->mutable_data<T>();
DoCompute(p, filter_data, input_data, output_data);
return MaceStatus::MACE_SUCCESS;
}
protected: protected:
MaceStatus ResizeOutAndPadOut(const OpContext *context, virtual MaceStatus DoCompute(
const Tensor *input, const DepthwiseConvComputeParam &p, const T *filter,
const Tensor *filter, const T *input_data, T *output_data) = 0;
const Tensor *output_shape,
Tensor *output,
std::vector<int> *out_pad_size,
std::unique_ptr<Tensor> *padded_output);
void UnPadOutput(const Tensor &src,
const std::vector<int> &out_pad_size,
Tensor *dst);
index_t group_;
}; };
} // namespace fp32
} // namespace arm } // namespace arm
} // namespace ops } // namespace ops
} // namespace mace } // namespace mace
#endif // MACE_OPS_ARM_FP32_DECONV_2D_H_ #endif // MACE_OPS_ARM_BASE_DEPTHWISE_CONV_2D_MXN_H_
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/arm/base/depthwise_deconv_2d_3x3.h"
namespace mace {
namespace ops {
namespace arm {
void RegisterDepthwiseDeconv2dK3x3Delegator(OpDelegatorRegistry *registry) {
MACE_REGISTER_DELEGATOR(
registry, DepthwiseDeconv2dK3x3S1<float>,
delegator::DepthwiseDeconv2dParam,
MACE_DELEGATOR_KEY_EX(DepthwiseDeconv2d, DeviceType::CPU,
float, ImplType::NEON, K3x3S1));
MACE_REGISTER_DELEGATOR(
registry, DepthwiseDeconv2dK3x3S2<float>,
delegator::DepthwiseDeconv2dParam,
MACE_DELEGATOR_KEY_EX(DepthwiseDeconv2d, DeviceType::CPU,
float, ImplType::NEON, K3x3S2));
}
void RegisterGroupDeconv2dK3x3Delegator(OpDelegatorRegistry *registry) {
MACE_REGISTER_DELEGATOR(
registry, GroupDeconv2dK3x3S1<float>, delegator::GroupDeconv2dParam,
MACE_DELEGATOR_KEY_EX(GroupDeconv2d, DeviceType::CPU,
float, ImplType::NEON, K3x3S1));
MACE_REGISTER_DELEGATOR(
registry, GroupDeconv2dK3x3S2<float>, delegator::GroupDeconv2dParam,
MACE_DELEGATOR_KEY_EX(GroupDeconv2d, DeviceType::CPU,
float, ImplType::NEON, K3x3S2));
}
} // namespace arm
} // namespace ops
} // namespace mace
// Copyright 2019 The MACE Authors. All Rights Reserved. // Copyright 2020 The MACE Authors. All Rights Reserved.
// //
// Licensed under the Apache License, Version 2.0 (the "License"); // Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License. // you may not use this file except in compliance with the License.
...@@ -12,8 +12,8 @@ ...@@ -12,8 +12,8 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#ifndef MACE_OPS_ARM_FP32_DEPTHWISE_DECONV_2D_3X3_H_ #ifndef MACE_OPS_ARM_BASE_DEPTHWISE_DECONV_2D_3X3_H_
#define MACE_OPS_ARM_FP32_DEPTHWISE_DECONV_2D_3X3_H_ #define MACE_OPS_ARM_BASE_DEPTHWISE_DECONV_2D_3X3_H_
#include <vector> #include <vector>
#include <memory> #include <memory>
...@@ -21,7 +21,7 @@ ...@@ -21,7 +21,7 @@
#include "mace/core/ops/op_context.h" #include "mace/core/ops/op_context.h"
#include "mace/core/tensor.h" #include "mace/core/tensor.h"
#include "mace/core/types.h" #include "mace/core/types.h"
#include "mace/ops/arm/fp32/deconv_2d.h" #include "mace/ops/arm/base/depthwise_deconv_2d_mxn.h"
#include "mace/ops/common/conv_pool_2d_util.h" #include "mace/ops/common/conv_pool_2d_util.h"
#include "mace/ops/delegator/depthwise_deconv_2d.h" #include "mace/ops/delegator/depthwise_deconv_2d.h"
#include "mace/public/mace.h" #include "mace/public/mace.h"
...@@ -29,70 +29,56 @@ ...@@ -29,70 +29,56 @@
namespace mace { namespace mace {
namespace ops { namespace ops {
namespace arm { namespace arm {
namespace fp32 {
class DepthwiseDeconv2dK3x3S1 : public Deconv2dBase { template<typename T>
class DepthwiseDeconv2dK3x3S1 : public DepthwiseDeconv2dKMxN<T> {
public: public:
explicit DepthwiseDeconv2dK3x3S1( explicit DepthwiseDeconv2dK3x3S1(
const delegator::DepthwiseDeconv2dParam &param) const delegator::DepthwiseDeconv2dParam &param)
: Deconv2dBase(param) {} : DepthwiseDeconv2dKMxN<T>(param) {}
virtual ~DepthwiseDeconv2dK3x3S1() {} virtual ~DepthwiseDeconv2dK3x3S1() {}
MaceStatus Compute( MaceStatus DoCompute(const DepthwiseDeconvComputeParam &p, const T *filter,
const OpContext *context, const T *input_data, T *padded_out_data) override;
const Tensor *input,
const Tensor *filter,
const Tensor *output_shape,
Tensor *output) override;
}; };
class DepthwiseDeconv2dK3x3S2 : public Deconv2dBase { template<typename T>
class DepthwiseDeconv2dK3x3S2 : public DepthwiseDeconv2dKMxN<T> {
public: public:
explicit DepthwiseDeconv2dK3x3S2( explicit DepthwiseDeconv2dK3x3S2(
const delegator::DepthwiseDeconv2dParam &param) const delegator::DepthwiseDeconv2dParam &param)
: Deconv2dBase(param) {} : DepthwiseDeconv2dKMxN<T>(param) {}
virtual ~DepthwiseDeconv2dK3x3S2() {} virtual ~DepthwiseDeconv2dK3x3S2() {}
MaceStatus Compute( MaceStatus DoCompute(const DepthwiseDeconvComputeParam &p, const T *filter,
const OpContext *context, const T *input_data, T *padded_out_data) override;
const Tensor *input,
const Tensor *filter,
const Tensor *output_shape,
Tensor *output) override;
}; };
class GroupDeconv2dK3x3S1 : public Deconv2dBase { template<typename T>
class GroupDeconv2dK3x3S1 : public GroupDeconv2dKMxN<T> {
public: public:
explicit GroupDeconv2dK3x3S1( explicit GroupDeconv2dK3x3S1(
const delegator::GroupDeconv2dParam &param) const delegator::GroupDeconv2dParam &param)
: Deconv2dBase(param) {} : GroupDeconv2dKMxN<T>(param) {}
virtual ~GroupDeconv2dK3x3S1() {} virtual ~GroupDeconv2dK3x3S1() {}
MaceStatus Compute( MaceStatus DoCompute(const GroupDeconvComputeParam &p, const T *filter,
const OpContext *context, const T *input_data, T *padded_out_data) override;
const Tensor *input,
const Tensor *filter,
const Tensor *output_shape,
Tensor *output) override;
}; };
class GroupDeconv2dK3x3S2 : public Deconv2dBase { template<typename T>
class GroupDeconv2dK3x3S2 : public GroupDeconv2dKMxN<T> {
public: public:
explicit GroupDeconv2dK3x3S2(const delegator::GroupDeconv2dParam &param) explicit GroupDeconv2dK3x3S2(const delegator::GroupDeconv2dParam &param)
: Deconv2dBase(param) {} : GroupDeconv2dKMxN<T>(param) {}
virtual ~GroupDeconv2dK3x3S2() {} virtual ~GroupDeconv2dK3x3S2() {}
MaceStatus Compute( MaceStatus DoCompute(const GroupDeconvComputeParam &p, const T *filter,
const OpContext *context, const T *input_data, T *padded_out_data) override;
const Tensor *input,
const Tensor *filter,
const Tensor *output_shape,
Tensor *output) override;
}; };
} // namespace fp32
} // namespace arm } // namespace arm
} // namespace ops } // namespace ops
} // namespace mace } // namespace mace
#endif // MACE_OPS_ARM_FP32_DEPTHWISE_DECONV_2D_3X3_H_ #endif // MACE_OPS_ARM_BASE_DEPTHWISE_DECONV_2D_3X3_H_
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/arm/base/depthwise_deconv_2d_4x4.h"
namespace mace {
namespace ops {
namespace arm {
void RegisterDepthwiseDeconv2dK4x4Delegator(OpDelegatorRegistry *registry) {
MACE_REGISTER_DELEGATOR(
registry, DepthwiseDeconv2dK4x4S1<float>,
delegator::DepthwiseDeconv2dParam,
MACE_DELEGATOR_KEY_EX(DepthwiseDeconv2d, DeviceType::CPU,
float, ImplType::NEON, K4x4S1));
MACE_REGISTER_DELEGATOR(
registry, DepthwiseDeconv2dK4x4S2<float>,
delegator::DepthwiseDeconv2dParam,
MACE_DELEGATOR_KEY_EX(DepthwiseDeconv2d, DeviceType::CPU,
float, ImplType::NEON, K4x4S2));
}
void RegisterGroupDeconv2dK4x4Delegator(OpDelegatorRegistry *registry) {
MACE_REGISTER_DELEGATOR(
registry, GroupDeconv2dK4x4S1<float>, delegator::GroupDeconv2dParam,
MACE_DELEGATOR_KEY_EX(GroupDeconv2d, DeviceType::CPU,
float, ImplType::NEON, K4x4S1));
MACE_REGISTER_DELEGATOR(
registry, GroupDeconv2dK4x4S2<float>, delegator::GroupDeconv2dParam,
MACE_DELEGATOR_KEY_EX(GroupDeconv2d, DeviceType::CPU,
float, ImplType::NEON, K4x4S2));
}
} // namespace arm
} // namespace ops
} // namespace mace
// Copyright 2019 The MACE Authors. All Rights Reserved. // Copyright 2020 The MACE Authors. All Rights Reserved.
// //
// Licensed under the Apache License, Version 2.0 (the "License"); // Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License. // you may not use this file except in compliance with the License.
...@@ -12,8 +12,8 @@ ...@@ -12,8 +12,8 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#ifndef MACE_OPS_ARM_FP32_DEPTHWISE_DECONV_2D_4X4_H_ #ifndef MACE_OPS_ARM_BASE_DEPTHWISE_DECONV_2D_4X4_H_
#define MACE_OPS_ARM_FP32_DEPTHWISE_DECONV_2D_4X4_H_ #define MACE_OPS_ARM_BASE_DEPTHWISE_DECONV_2D_4X4_H_
#include <vector> #include <vector>
#include <memory> #include <memory>
...@@ -21,7 +21,7 @@ ...@@ -21,7 +21,7 @@
#include "mace/core/ops/op_context.h" #include "mace/core/ops/op_context.h"
#include "mace/core/tensor.h" #include "mace/core/tensor.h"
#include "mace/core/types.h" #include "mace/core/types.h"
#include "mace/ops/arm/fp32/deconv_2d.h" #include "mace/ops/arm/base/depthwise_deconv_2d_mxn.h"
#include "mace/ops/common/conv_pool_2d_util.h" #include "mace/ops/common/conv_pool_2d_util.h"
#include "mace/ops/delegator/depthwise_deconv_2d.h" #include "mace/ops/delegator/depthwise_deconv_2d.h"
#include "mace/public/mace.h" #include "mace/public/mace.h"
...@@ -29,69 +29,55 @@ ...@@ -29,69 +29,55 @@
namespace mace { namespace mace {
namespace ops { namespace ops {
namespace arm { namespace arm {
namespace fp32 {
class DepthwiseDeconv2dK4x4S1 : public Deconv2dBase { template<typename T>
class DepthwiseDeconv2dK4x4S1 : public DepthwiseDeconv2dKMxN<T> {
public: public:
explicit DepthwiseDeconv2dK4x4S1( explicit DepthwiseDeconv2dK4x4S1(
const delegator::DepthwiseDeconv2dParam &param) const delegator::DepthwiseDeconv2dParam &param)
: Deconv2dBase(param) {} : DepthwiseDeconv2dKMxN<T>(param) {}
virtual ~DepthwiseDeconv2dK4x4S1() {} virtual ~DepthwiseDeconv2dK4x4S1() {}
MaceStatus Compute( MaceStatus DoCompute(const DepthwiseDeconvComputeParam &p, const T *filter,
const OpContext *context, const T *input_data, T *padded_out_data) override;
const Tensor *input,
const Tensor *filter,
const Tensor *output_shape,
Tensor *output) override;
}; };
class DepthwiseDeconv2dK4x4S2 : public Deconv2dBase { template<typename T>
class DepthwiseDeconv2dK4x4S2 : public DepthwiseDeconv2dKMxN<T> {
public: public:
explicit DepthwiseDeconv2dK4x4S2( explicit DepthwiseDeconv2dK4x4S2(
const delegator::DepthwiseDeconv2dParam &param) const delegator::DepthwiseDeconv2dParam &param)
: Deconv2dBase(param) {} : DepthwiseDeconv2dKMxN<T>(param) {}
virtual ~DepthwiseDeconv2dK4x4S2() {} virtual ~DepthwiseDeconv2dK4x4S2() {}
MaceStatus Compute( MaceStatus DoCompute(const DepthwiseDeconvComputeParam &p, const T *filter,
const OpContext *context, const T *input_data, T *padded_out_data) override;
const Tensor *input,
const Tensor *filter,
const Tensor *output_shape,
Tensor *output) override;
}; };
class GroupDeconv2dK4x4S1 : public Deconv2dBase { template<typename T>
class GroupDeconv2dK4x4S1 : public GroupDeconv2dKMxN<T> {
public: public:
explicit GroupDeconv2dK4x4S1(const delegator::GroupDeconv2dParam &param) explicit GroupDeconv2dK4x4S1(const delegator::GroupDeconv2dParam &param)
: Deconv2dBase(param) {} : GroupDeconv2dKMxN<T>(param) {}
virtual ~GroupDeconv2dK4x4S1() {} virtual ~GroupDeconv2dK4x4S1() {}
MaceStatus Compute( MaceStatus DoCompute(const GroupDeconvComputeParam &p, const T *filter,
const OpContext *context, const T *input_data, T *padded_out_data) override;
const Tensor *input,
const Tensor *filter,
const Tensor *output_shape,
Tensor *output) override;
}; };
class GroupDeconv2dK4x4S2 : public Deconv2dBase { template<typename T>
class GroupDeconv2dK4x4S2 : public GroupDeconv2dKMxN<T> {
public: public:
explicit GroupDeconv2dK4x4S2(const delegator::GroupDeconv2dParam &param) explicit GroupDeconv2dK4x4S2(const delegator::GroupDeconv2dParam &param)
: Deconv2dBase(param) {} : GroupDeconv2dKMxN<T>(param) {}
virtual ~GroupDeconv2dK4x4S2() {} virtual ~GroupDeconv2dK4x4S2() {}
MaceStatus Compute( MaceStatus DoCompute(const GroupDeconvComputeParam &p, const T *filter,
const OpContext *context, const T *input_data, T *padded_out_data) override;
const Tensor *input,
const Tensor *filter,
const Tensor *output_shape,
Tensor *output) override;
}; };
} // namespace fp32
} // namespace arm } // namespace arm
} // namespace ops } // namespace ops
} // namespace mace } // namespace mace
#endif // MACE_OPS_ARM_FP32_DEPTHWISE_DECONV_2D_4X4_H_ #endif // MACE_OPS_ARM_BASE_DEPTHWISE_DECONV_2D_4X4_H_
// Copyright 2019 The MACE Authors. All Rights Reserved. // Copyright 2020 The MACE Authors. All Rights Reserved.
// //
// Licensed under the Apache License, Version 2.0 (the "License"); // Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License. // you may not use this file except in compliance with the License.
...@@ -12,14 +12,14 @@ ...@@ -12,14 +12,14 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "mace/ops/arm/fp32/depthwise_deconv_2d_general.h" #include "mace/ops/arm/base/depthwise_deconv_2d_general.h"
namespace mace { namespace mace {
namespace ops { namespace ops {
namespace arm { namespace arm {
namespace fp32 {
MaceStatus DepthwiseDeconv2dGeneral::Compute(const OpContext *context, template<typename T>
MaceStatus DepthwiseDeconv2dGeneral<T>::Compute(const OpContext *context,
const Tensor *input, const Tensor *input,
const Tensor *filter, const Tensor *filter,
const Tensor *output_shape, const Tensor *output_shape,
...@@ -46,9 +46,9 @@ MaceStatus DepthwiseDeconv2dGeneral::Compute(const OpContext *context, ...@@ -46,9 +46,9 @@ MaceStatus DepthwiseDeconv2dGeneral::Compute(const OpContext *context,
Tensor::MappingGuard filter_mapper(filter); Tensor::MappingGuard filter_mapper(filter);
Tensor::MappingGuard output_mapper(output); Tensor::MappingGuard output_mapper(output);
auto input_data = input->data<float>(); const T *input_data = input->data<T>();
auto filter_data = filter->data<float>(); const T *filter_data = filter->data<T>();
auto padded_out_data = out_tensor->mutable_data<float>(); T *padded_out_data = out_tensor->mutable_data<T>();
auto &in_shape = input->shape(); auto &in_shape = input->shape();
auto &out_shape = out_tensor->shape(); auto &out_shape = out_tensor->shape();
...@@ -79,7 +79,7 @@ MaceStatus DepthwiseDeconv2dGeneral::Compute(const OpContext *context, ...@@ -79,7 +79,7 @@ MaceStatus DepthwiseDeconv2dGeneral::Compute(const OpContext *context,
index_t start1, index_t end1, index_t step1) { index_t start1, index_t end1, index_t step1) {
for (index_t b = start0; b < end0; b += step0) { for (index_t b = start0; b < end0; b += step0) {
for (index_t c = start1; c < end1; c += step1) { for (index_t c = start1; c < end1; c += step1) {
float *out_base = T *out_base =
padded_out_data + (b * channels + c) * out_img_size; padded_out_data + (b * channels + c) * out_img_size;
for (index_t i = 0; i < in_height; ++i) { for (index_t i = 0; i < in_height; ++i) {
for (index_t j = 0; j < in_width; ++j) { for (index_t j = 0; j < in_width; ++j) {
...@@ -105,7 +105,8 @@ MaceStatus DepthwiseDeconv2dGeneral::Compute(const OpContext *context, ...@@ -105,7 +105,8 @@ MaceStatus DepthwiseDeconv2dGeneral::Compute(const OpContext *context,
return MaceStatus::MACE_SUCCESS; return MaceStatus::MACE_SUCCESS;
} }
MaceStatus GroupDeconv2dGeneral::Compute(const OpContext *context, template<typename T>
MaceStatus GroupDeconv2dGeneral<T>::Compute(const OpContext *context,
const Tensor *input, const Tensor *input,
const Tensor *filter, const Tensor *filter,
const Tensor *output_shape, const Tensor *output_shape,
...@@ -131,9 +132,9 @@ MaceStatus GroupDeconv2dGeneral::Compute(const OpContext *context, ...@@ -131,9 +132,9 @@ MaceStatus GroupDeconv2dGeneral::Compute(const OpContext *context,
Tensor::MappingGuard filter_mapper(filter); Tensor::MappingGuard filter_mapper(filter);
Tensor::MappingGuard output_mapper(output); Tensor::MappingGuard output_mapper(output);
auto input_data = input->data<float>(); const T *input_data = input->data<T>();
auto filter_data = filter->data<float>(); const T *filter_data = filter->data<T>();
auto padded_out_data = out_tensor->mutable_data<float>(); T *padded_out_data = out_tensor->mutable_data<T>();
auto &in_shape = input->shape(); auto &in_shape = input->shape();
auto &out_shape = out_tensor->shape(); auto &out_shape = out_tensor->shape();
...@@ -209,19 +210,19 @@ MaceStatus GroupDeconv2dGeneral::Compute(const OpContext *context, ...@@ -209,19 +210,19 @@ MaceStatus GroupDeconv2dGeneral::Compute(const OpContext *context,
void RegisterDepthwiseDeconv2dGeneralDelegator(OpDelegatorRegistry *registry) { void RegisterDepthwiseDeconv2dGeneralDelegator(OpDelegatorRegistry *registry) {
MACE_REGISTER_DELEGATOR( MACE_REGISTER_DELEGATOR(
registry, DepthwiseDeconv2dGeneral, delegator::DepthwiseDeconv2dParam, registry, DepthwiseDeconv2dGeneral<float>,
delegator::DepthwiseDeconv2dParam,
MACE_DELEGATOR_KEY(DepthwiseDeconv2d, DeviceType::CPU, MACE_DELEGATOR_KEY(DepthwiseDeconv2d, DeviceType::CPU,
float, ImplType::NEON)); float, ImplType::NEON));
} }
void RegisterGroupDeconv2dGeneralDelegator(OpDelegatorRegistry *registry) { void RegisterGroupDeconv2dGeneralDelegator(OpDelegatorRegistry *registry) {
MACE_REGISTER_DELEGATOR( MACE_REGISTER_DELEGATOR(
registry, GroupDeconv2dGeneral, delegator::GroupDeconv2dParam, registry, GroupDeconv2dGeneral<float>, delegator::GroupDeconv2dParam,
MACE_DELEGATOR_KEY(GroupDeconv2d, DeviceType::CPU, MACE_DELEGATOR_KEY(GroupDeconv2d, DeviceType::CPU,
float, ImplType::NEON)); float, ImplType::NEON));
} }
} // namespace fp32
} // namespace arm } // namespace arm
} // namespace ops } // namespace ops
} // namespace mace } // namespace mace
// Copyright 2019 The MACE Authors. All Rights Reserved. // Copyright 2020 The MACE Authors. All Rights Reserved.
// //
// Licensed under the Apache License, Version 2.0 (the "License"); // Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License. // you may not use this file except in compliance with the License.
...@@ -12,8 +12,8 @@ ...@@ -12,8 +12,8 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#ifndef MACE_OPS_ARM_FP32_DEPTHWISE_DECONV_2D_GENERAL_H_ #ifndef MACE_OPS_ARM_BASE_DEPTHWISE_DECONV_2D_GENERAL_H_
#define MACE_OPS_ARM_FP32_DEPTHWISE_DECONV_2D_GENERAL_H_ #define MACE_OPS_ARM_BASE_DEPTHWISE_DECONV_2D_GENERAL_H_
#include <vector> #include <vector>
#include <memory> #include <memory>
...@@ -21,7 +21,7 @@ ...@@ -21,7 +21,7 @@
#include "mace/core/ops/op_context.h" #include "mace/core/ops/op_context.h"
#include "mace/core/tensor.h" #include "mace/core/tensor.h"
#include "mace/core/types.h" #include "mace/core/types.h"
#include "mace/ops/arm/fp32/deconv_2d.h" #include "mace/ops/arm/base/deconv_2d.h"
#include "mace/ops/common/conv_pool_2d_util.h" #include "mace/ops/common/conv_pool_2d_util.h"
#include "mace/ops/delegator/depthwise_deconv_2d.h" #include "mace/ops/delegator/depthwise_deconv_2d.h"
#include "mace/public/mace.h" #include "mace/public/mace.h"
...@@ -29,13 +29,13 @@ ...@@ -29,13 +29,13 @@
namespace mace { namespace mace {
namespace ops { namespace ops {
namespace arm { namespace arm {
namespace fp32 {
template<typename T>
class DepthwiseDeconv2dGeneral : public Deconv2dBase { class DepthwiseDeconv2dGeneral : public Deconv2dBase {
public: public:
explicit DepthwiseDeconv2dGeneral( explicit DepthwiseDeconv2dGeneral(
const delegator::DepthwiseDeconv2dParam &param) const delegator::DepthwiseDeconv2dParam &param)
: Deconv2dBase(param) {} : Deconv2dBase(param, sizeof(T)) {}
virtual ~DepthwiseDeconv2dGeneral() {} virtual ~DepthwiseDeconv2dGeneral() {}
MaceStatus Compute( MaceStatus Compute(
...@@ -46,10 +46,11 @@ class DepthwiseDeconv2dGeneral : public Deconv2dBase { ...@@ -46,10 +46,11 @@ class DepthwiseDeconv2dGeneral : public Deconv2dBase {
Tensor *output) override; Tensor *output) override;
}; };
template<typename T>
class GroupDeconv2dGeneral : public Deconv2dBase { class GroupDeconv2dGeneral : public Deconv2dBase {
public: public:
explicit GroupDeconv2dGeneral(const delegator::GroupDeconv2dParam &param) explicit GroupDeconv2dGeneral(const delegator::GroupDeconv2dParam &param)
: Deconv2dBase(param) {} : Deconv2dBase(param, sizeof(T)) {}
virtual ~GroupDeconv2dGeneral() {} virtual ~GroupDeconv2dGeneral() {}
MaceStatus Compute( MaceStatus Compute(
...@@ -60,9 +61,8 @@ class GroupDeconv2dGeneral : public Deconv2dBase { ...@@ -60,9 +61,8 @@ class GroupDeconv2dGeneral : public Deconv2dBase {
Tensor *output) override; Tensor *output) override;
}; };
} // namespace fp32
} // namespace arm } // namespace arm
} // namespace ops } // namespace ops
} // namespace mace } // namespace mace
#endif // MACE_OPS_ARM_FP32_DEPTHWISE_DECONV_2D_GENERAL_H_ #endif // MACE_OPS_ARM_BASE_DEPTHWISE_DECONV_2D_GENERAL_H_
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_OPS_ARM_BASE_DEPTHWISE_DECONV_2D_MXN_H_
#define MACE_OPS_ARM_BASE_DEPTHWISE_DECONV_2D_MXN_H_
#include <vector>
#include <memory>
#include "mace/core/ops/op_context.h"
#include "mace/core/tensor.h"
#include "mace/core/types.h"
#include "mace/ops/arm/base/deconv_2d.h"
#include "mace/ops/common/conv_pool_2d_util.h"
#include "mace/ops/delegator/depthwise_deconv_2d.h"
#include "mace/public/mace.h"
namespace mace {
namespace ops {
namespace arm {
template<typename T>
class DepthwiseDeconv2dKMxN : public Deconv2dBase {
public:
explicit DepthwiseDeconv2dKMxN(
const delegator::DepthwiseDeconv2dParam &param)
: Deconv2dBase(param, sizeof(T)) {}
virtual ~DepthwiseDeconv2dKMxN() {}
MaceStatus Compute(
const OpContext *context, const Tensor *input, const Tensor *filter,
const Tensor *output_shape, Tensor *output) override {
std::unique_ptr<Tensor> padded_out;
std::vector<int> out_pad_size;
group_ = input->dim(1);
ResizeOutAndPadOut(context,
input,
filter,
output_shape,
output,
&out_pad_size,
&padded_out);
Tensor *out_tensor = output;
if (padded_out != nullptr) {
out_tensor = padded_out.get();
}
out_tensor->Clear();
Tensor::MappingGuard input_mapper(input);
Tensor::MappingGuard filter_mapper(filter);
Tensor::MappingGuard output_mapper(output);
const T *input_data = input->data<T>();
const T *filter_data = filter->data<T>();
T *padded_out_data = out_tensor->mutable_data<T>();
DepthwiseDeconvComputeParam p =
PreWorkAndGetDepthwiseDeconvParam(context, input, out_tensor);
DoCompute(p, filter_data, input_data, padded_out_data);
UnPadOutput(*out_tensor, out_pad_size, output);
return MaceStatus::MACE_SUCCESS;
}
virtual MaceStatus DoCompute(
const DepthwiseDeconvComputeParam &p, const T *filter,
const T *input_data, T *padded_out_data) = 0;
};
template<typename T>
class GroupDeconv2dKMxN : public Deconv2dBase {
public:
explicit GroupDeconv2dKMxN(
const delegator::GroupDeconv2dParam &param)
: Deconv2dBase(param, sizeof(T)) {}
virtual ~GroupDeconv2dKMxN() {}
MaceStatus Compute(
const OpContext *context, const Tensor *input, const Tensor *filter,
const Tensor *output_shape, Tensor *output) override {
std::unique_ptr<Tensor> padded_out;
std::vector<int> out_pad_size;
ResizeOutAndPadOut(context,
input,
filter,
output_shape,
output,
&out_pad_size,
&padded_out);
Tensor *out_tensor = output;
if (padded_out != nullptr) {
out_tensor = padded_out.get();
}
out_tensor->Clear();
Tensor::MappingGuard input_mapper(input);
Tensor::MappingGuard filter_mapper(filter);
Tensor::MappingGuard output_mapper(output);
const T *input_data = input->data<T>();
const T *filter_data = filter->data<T>();
T *padded_out_data = out_tensor->mutable_data<T>();
GroupDeconvComputeParam p =
PreWorkAndGetGroupDeconvParam(context, input, out_tensor);
DoCompute(p, filter_data, input_data, padded_out_data);
UnPadOutput(*out_tensor, out_pad_size, output);
return MaceStatus::MACE_SUCCESS;
}
virtual MaceStatus DoCompute(
const GroupDeconvComputeParam &p, const T *filter,
const T *input_data, T *padded_out_data) = 0;
};
} // namespace arm
} // namespace ops
} // namespace mace
#endif // MACE_OPS_ARM_BASE_DEPTHWISE_DECONV_2D_MXN_H_
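The templated bases above own all of the output resizing, padding and un-padding; a concrete kernel only supplies the pure-virtual DoCompute. For illustration only (this class is not part of the hunk shown here, and the name is hypothetical), a stride-1 3x3 depthwise kernel built on this base would look roughly like:

template<typename T>
class DepthwiseDeconv2dK3x3S1 : public DepthwiseDeconv2dKMxN<T> {
 public:
  explicit DepthwiseDeconv2dK3x3S1(
      const delegator::DepthwiseDeconv2dParam &param)
      : DepthwiseDeconv2dKMxN<T>(param) {}

  // Only the NEON inner loop is kernel-specific; everything else is inherited.
  MaceStatus DoCompute(const DepthwiseDeconvComputeParam &p, const T *filter,
                       const T *input_data, T *padded_out_data) override;
};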
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/arm/base/gemm.h"
namespace mace {
namespace ops {
namespace arm {
void RegisterGemmDelegator(OpDelegatorRegistry *registry) {
MACE_REGISTER_DELEGATOR(
registry, Gemm<float>, delegator::GemmParam,
MACE_DELEGATOR_KEY(Gemm, DeviceType::CPU, float, ImplType::NEON));
}
} // namespace arm
} // namespace ops
} // namespace mace
// Copyright 2019 The MACE Authors. All Rights Reserved. // Copyright 2020 The MACE Authors. All Rights Reserved.
// //
// Licensed under the Apache License, Version 2.0 (the "License"); // Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License. // you may not use this file except in compliance with the License.
...@@ -12,8 +12,8 @@ ...@@ -12,8 +12,8 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#ifndef MACE_OPS_ARM_FP32_GEMM_H_ #ifndef MACE_OPS_ARM_BASE_GEMM_H_
#define MACE_OPS_ARM_FP32_GEMM_H_ #define MACE_OPS_ARM_BASE_GEMM_H_
#include "mace/core/ops/op_context.h" #include "mace/core/ops/op_context.h"
#include "mace/core/tensor.h" #include "mace/core/tensor.h"
...@@ -28,8 +28,10 @@ ...@@ -28,8 +28,10 @@
namespace mace { namespace mace {
namespace ops { namespace ops {
namespace arm { namespace arm {
namespace fp32 {
enum { kNoCache, kCacheLhs, kCacheRhs };
template<typename T>
class Gemm : public delegator::Gemm { class Gemm : public delegator::Gemm {
public: public:
explicit Gemm(const delegator::GemmParam &param) explicit Gemm(const delegator::GemmParam &param)
...@@ -68,26 +70,49 @@ class Gemm : public delegator::Gemm { ...@@ -68,26 +70,49 @@ class Gemm : public delegator::Gemm {
const bool transpose_out, const bool transpose_out,
const bool lhs_batched, const bool lhs_batched,
const bool rhs_batched, const bool rhs_batched,
Tensor *output) override; Tensor *output) override {
index_t rows = transpose_lhs ? lhs_cols : lhs_rows;
index_t depth = transpose_lhs ? lhs_rows : lhs_cols;
index_t cols = transpose_rhs ? rhs_rows : rhs_cols;
index_t depth2 = transpose_rhs ? rhs_cols : rhs_rows;
MACE_CHECK(depth == depth2,
"Matrices that multiply have inconsistent depth dim: ",
depth,
" vs. ",
depth2);
return Compute(context,
lhs,
rhs,
batch,
rows,
cols,
depth,
transpose_lhs ? ColMajor : RowMajor,
transpose_rhs ? ColMajor : RowMajor,
transpose_out ? ColMajor : RowMajor,
lhs_batched,
rhs_batched,
output);
}
private: protected:
void ComputeBlock(const float *packed_lhs_data, void ComputeBlock(const T *packed_lhs_data,
const float *packed_rhs_data, const T *packed_rhs_data,
const index_t depth_padded, const index_t depth_padded,
float *packed_output_data); T *packed_output_data);
void PackLhs(const MatrixMap<const float> &lhs, void PackLhs(const MatrixMap<const T> &lhs,
float *packed_lhs); T *packed_lhs);
void PackRhs(const MatrixMap<const float> &rhs, void PackRhs(const MatrixMap<const T> &rhs,
float *packed_rhs); T *packed_rhs);
void UnpackOutput(const float *packed_output,
MatrixMap<float> *output);
void UnpackOutput(const T *packed_output,
MatrixMap<T> *output);
template<int RowBlockSize, int ColBlockSize> template<int RowBlockSize, int ColBlockSize>
void Unpack(const float *packed_output, void Unpack(const T *packed_output,
MatrixMap<float> *output) { MatrixMap<T> *output) {
const index_t rows = output->rows(); const index_t rows = output->rows();
const index_t cols = output->cols(); const index_t cols = output->cols();
for (index_t r = 0; r < rows; ++r) { for (index_t r = 0; r < rows; ++r) {
...@@ -98,9 +123,9 @@ class Gemm : public delegator::Gemm { ...@@ -98,9 +123,9 @@ class Gemm : public delegator::Gemm {
} }
template<int WidthBlockSize, int DepthBlockSize> template<int WidthBlockSize, int DepthBlockSize>
void Pack(const MatrixMap<const float> &matrix, void Pack(const MatrixMap<const T> &matrix,
MatrixMajor dst_major, MatrixMajor dst_major,
float *packed_matrix) { T *packed_matrix) {
const index_t rows = matrix.rows(); const index_t rows = matrix.rows();
const index_t cols = matrix.cols(); const index_t cols = matrix.cols();
index_t depth = cols; index_t depth = cols;
...@@ -109,7 +134,7 @@ class Gemm : public delegator::Gemm { ...@@ -109,7 +134,7 @@ class Gemm : public delegator::Gemm {
depth = rows; depth = rows;
} }
const index_t depth_padded = RoundUp(depth, static_cast<index_t>(4)); const index_t depth_padded = RoundUp(depth, static_cast<index_t>(4));
memset(packed_matrix, 0, sizeof(float) * WidthBlockSize * depth_padded); memset(packed_matrix, 0, sizeof(T) * WidthBlockSize * depth_padded);
if (dst_major == ColMajor) { if (dst_major == ColMajor) {
for (index_t c = 0; c < cols; ++c) { for (index_t c = 0; c < cols; ++c) {
for (index_t r = 0; r < rows; ++r) { for (index_t r = 0; r < rows; ++r) {
...@@ -125,31 +150,14 @@ class Gemm : public delegator::Gemm { ...@@ -125,31 +150,14 @@ class Gemm : public delegator::Gemm {
} }
} }
private:
Buffer pack_cache_; Buffer pack_cache_;
bool should_cache_pack_; bool should_cache_pack_;
int cached_; int cached_;
}; };
template<>
void Gemm::Pack<4, 4>(const MatrixMap<const float> &matrix,
MatrixMajor dst_major,
float *packed_matrix);
template<>
void Gemm::Pack<8, 4>(const MatrixMap<const float> &matrix,
MatrixMajor dst_major,
float *packed_matrix);
template<>
void Gemm::Unpack<4, 8>(const float *packed_output, MatrixMap<float> *output);
template<>
void Gemm::Unpack<8, 8>(const float *packed_output, MatrixMap<float> *output);
} // namespace fp32
} // namespace arm } // namespace arm
} // namespace ops } // namespace ops
} // namespace mace } // namespace mace
#endif // MACE_OPS_ARM_FP32_GEMM_H_ #endif // MACE_OPS_ARM_BASE_GEMM_H_
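As a quick sanity check of the dimension mapping done by the new Compute overload above (illustrative only; the numbers are made up):

#include <cassert>
#include <cstdint>

void CheckGemmDims() {
  // LHS stored transposed as 128x64, RHS stored as-is as 128x256.
  const int64_t lhs_rows = 128, lhs_cols = 64;
  const int64_t rhs_rows = 128, rhs_cols = 256;
  const bool transpose_lhs = true, transpose_rhs = false;
  const int64_t rows = transpose_lhs ? lhs_cols : lhs_rows;    // 64
  const int64_t depth = transpose_lhs ? lhs_rows : lhs_cols;   // 128
  const int64_t cols = transpose_rhs ? rhs_rows : rhs_cols;    // 256
  const int64_t depth2 = transpose_rhs ? rhs_cols : rhs_rows;  // 128
  assert(depth == depth2);  // mirrors the MACE_CHECK in the wrapper
  (void)rows; (void)cols;
}

With these flags the call is forwarded with ColMajor for the LHS and RowMajor for the RHS and output.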
// Copyright 2020 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/arm/base/gemv.h"
namespace mace {
namespace ops {
namespace arm {
void RegisterGemvDelegator(OpDelegatorRegistry *registry) {
MACE_REGISTER_DELEGATOR(
registry, Gemv<float>, DelegatorParam,
MACE_DELEGATOR_KEY(Gemv, DeviceType::CPU, float, ImplType::NEON));
}
} // namespace arm
} // namespace ops
} // namespace mace
// Copyright 2019 The MACE Authors. All Rights Reserved. // Copyright 2020 The MACE Authors. All Rights Reserved.
// //
// Licensed under the Apache License, Version 2.0 (the "License"); // Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License. // you may not use this file except in compliance with the License.
...@@ -12,8 +12,8 @@ ...@@ -12,8 +12,8 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#ifndef MACE_OPS_ARM_FP32_GEMV_H_ #ifndef MACE_OPS_ARM_BASE_GEMV_H_
#define MACE_OPS_ARM_FP32_GEMV_H_ #define MACE_OPS_ARM_BASE_GEMV_H_
#include "mace/core/ops/op_context.h" #include "mace/core/ops/op_context.h"
#include "mace/core/tensor.h" #include "mace/core/tensor.h"
...@@ -23,8 +23,8 @@ ...@@ -23,8 +23,8 @@
namespace mace { namespace mace {
namespace ops { namespace ops {
namespace arm { namespace arm {
namespace fp32 {
template<typename T>
class Gemv : public delegator::Gemv { class Gemv : public delegator::Gemv {
public: public:
explicit Gemv(const DelegatorParam &param) : delegator::Gemv(param) {} explicit Gemv(const DelegatorParam &param) : delegator::Gemv(param) {}
...@@ -43,9 +43,8 @@ class Gemv : public delegator::Gemv { ...@@ -43,9 +43,8 @@ class Gemv : public delegator::Gemv {
Tensor *output) override; Tensor *output) override;
}; };
} // namespace fp32
} // namespace arm } // namespace arm
} // namespace ops } // namespace ops
} // namespace mace } // namespace mace
#endif // MACE_OPS_ARM_FP32_GEMV_H_ #endif // MACE_OPS_ARM_BASE_GEMV_H_
...@@ -12,60 +12,24 @@ ...@@ -12,60 +12,24 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "mace/ops/delegator/activation.h"
#include <arm_neon.h> #include <arm_neon.h>
#include <algorithm> #include <algorithm>
#include "mace/ops/arm/base/activation.h"
namespace mace { namespace mace {
namespace ops { namespace ops {
namespace arm { namespace arm {
namespace fp32 {
class Activation : public delegator::Activation {
public:
explicit Activation(const delegator::ActivationParam &param)
: delegator::Activation(param) {}
~Activation() = default;
MaceStatus Compute(const OpContext *context,
const Tensor *input, Tensor *output) override;
private:
void DoActivation(const OpContext *context,
const Tensor *input, Tensor *output);
};
MaceStatus Activation::Compute(const OpContext *context,
const Tensor *input, Tensor *output) {
Tensor::MappingGuard input_guard(input);
if (input != output) {
MACE_RETURN_IF_ERROR(output->ResizeLike(input));
Tensor::MappingGuard output_guard(output);
DoActivation(context, input, output);
} else {
DoActivation(context, input, output);
}
return MaceStatus::MACE_SUCCESS;
}
void Activation::DoActivation(const OpContext *context,
const Tensor *input,
Tensor *output) {
auto input_data = input->data<float>();
auto output_data = output->mutable_data<float>();
const index_t size = input->size();
utils::ThreadPool &thread_pool =
context->device()->cpu_runtime()->thread_pool();
switch (type_) { template<>
case RELU: { void Activation<float>::ActivateRelu(utils::ThreadPool *thread_pool,
const float *input_data,
const index_t input_size,
float *output_data) {
const float32x4_t vzero = vdupq_n_f32(0.f); const float32x4_t vzero = vdupq_n_f32(0.f);
const index_t block_count = size / 4; const index_t block_count = input_size / 4;
thread_pool.Compute1D( thread_pool->Compute1D(
[=](index_t start, index_t end, index_t step) { [=](index_t start, index_t end, index_t step) {
auto input_ptr = input_data + start * 4; auto input_ptr = input_data + start * 4;
auto output_ptr = output_data + start * 4; auto output_ptr = output_data + start * 4;
...@@ -82,19 +46,21 @@ void Activation::DoActivation(const OpContext *context, ...@@ -82,19 +46,21 @@ void Activation::DoActivation(const OpContext *context,
0, block_count, 1); 0, block_count, 1);
// remain // remain
for (index_t i = block_count * 4; i < size; ++i) { for (index_t i = block_count * 4; i < input_size; ++i) {
output_data[i] = std::max(0.f, input_data[i]); output_data[i] = std::max(0.f, input_data[i]);
} }
}
break; template<>
} void Activation<float>::ActivateRelux(utils::ThreadPool *thread_pool,
const float *input_data,
case RELUX: { const index_t input_size,
float *output_data) {
const float32x4_t vzero = vdupq_n_f32(0.f); const float32x4_t vzero = vdupq_n_f32(0.f);
const float32x4_t vlimit = vdupq_n_f32(limit_); const float32x4_t vlimit = vdupq_n_f32(limit_);
const index_t block_count = size / 4; const index_t block_count = input_size / 4;
thread_pool.Compute1D( thread_pool->Compute1D(
[=](index_t start, index_t end, index_t step) { [=](index_t start, index_t end, index_t step) {
auto input_ptr = input_data + start * 4; auto input_ptr = input_data + start * 4;
auto output_ptr = output_data + start * 4; auto output_ptr = output_data + start * 4;
...@@ -112,19 +78,21 @@ void Activation::DoActivation(const OpContext *context, ...@@ -112,19 +78,21 @@ void Activation::DoActivation(const OpContext *context,
0, block_count, 1); 0, block_count, 1);
// remain // remain
for (index_t i = block_count * 4; i < size; ++i) { for (index_t i = block_count * 4; i < input_size; ++i) {
output_data[i] = std::max(0.f, std::min(limit_, input_data[i])); output_data[i] = std::max(0.f, std::min(limit_, input_data[i]));
} }
}
break; template<>
} void Activation<float>::ActivateLeakyRelu(utils::ThreadPool *thread_pool,
const float *input_data,
case LEAKYRELU: { const index_t input_size,
float *output_data) {
const float32x4_t vzero = vdupq_n_f32(0.f); const float32x4_t vzero = vdupq_n_f32(0.f);
const float32x4_t valpha = vdupq_n_f32(leakyrelu_coefficient_); const float32x4_t valpha = vdupq_n_f32(leakyrelu_coefficient_);
const index_t block_count = size / 4; const index_t block_count = input_size / 4;
thread_pool.Compute1D( thread_pool->Compute1D(
[=](index_t start, index_t end, index_t step) { [=](index_t start, index_t end, index_t step) {
auto input_ptr = input_data + start * 4; auto input_ptr = input_data + start * 4;
auto output_ptr = output_data + start * 4; auto output_ptr = output_data + start * 4;
...@@ -143,55 +111,40 @@ void Activation::DoActivation(const OpContext *context, ...@@ -143,55 +111,40 @@ void Activation::DoActivation(const OpContext *context,
0, block_count, 1); 0, block_count, 1);
// remain // remain
for (index_t i = block_count * 4; i < size; ++i) { for (index_t i = block_count * 4; i < input_size; ++i) {
output_data[i] = std::max(input_data[i], 0.f) + output_data[i] = std::max(input_data[i], 0.f) +
std::min(input_data[i], 0.f) * leakyrelu_coefficient_; std::min(input_data[i], 0.f) * leakyrelu_coefficient_;
} }
}
break; template<>
} void Activation<float>::ActivateTanh(utils::ThreadPool *thread_pool,
const float *input_data,
case TANH: { const index_t input_size,
thread_pool.Compute1D( float *output_data) {
thread_pool->Compute1D(
[=](index_t start, index_t end, index_t step) { [=](index_t start, index_t end, index_t step) {
for (index_t i = start; i < end; i += step) { for (index_t i = start; i < end; i += step) {
output_data[i] = std::tanh(input_data[i]); output_data[i] = std::tanh(input_data[i]);
} }
}, },
0, size, 1); 0, input_size, 1);
}
break;
}
case SIGMOID: { template<>
thread_pool.Compute1D( void Activation<float>::ActivateSigmoid(utils::ThreadPool *thread_pool,
const float *input_data,
const index_t input_size,
float *output_data) {
thread_pool->Compute1D(
[=](index_t start, index_t end, index_t step) { [=](index_t start, index_t end, index_t step) {
for (index_t i = start; i < end; i += step) { for (index_t i = start; i < end; i += step) {
output_data[i] = 1 / (1 + std::exp(-(input_data[i]))); output_data[i] = 1 / (1 + std::exp(-(input_data[i])));
} }
}, },
0, size, 1); 0, input_size, 1);
break;
}
case NOOP: {
break;
}
default: {
MACE_NOT_IMPLEMENTED;
}
}
}
void RegisterActivationDelegator(OpDelegatorRegistry *registry) {
MACE_REGISTER_DELEGATOR(
registry, Activation, delegator::ActivationParam,
MACE_DELEGATOR_KEY(Activation, DeviceType::CPU, float, ImplType::NEON));
} }
} // namespace fp32
} // namespace arm } // namespace arm
} // namespace ops } // namespace ops
} // namespace mace } // namespace mace
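The scalar tail of the leaky-relu specialization above computes max(x, 0) + min(x, 0) * alpha, which is the usual piecewise definition written branch-free; a reference comparison (illustrative only, not part of the patch):

#include <algorithm>

inline float LeakyReluRef(float x, float alpha) {
  return x > 0.f ? x : alpha * x;  // piecewise definition
}

inline float LeakyReluBranchFree(float x, float alpha) {
  return std::max(x, 0.f) + std::min(x, 0.f) * alpha;  // same value, no branch
}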
...@@ -13,69 +13,21 @@ ...@@ -13,69 +13,21 @@
// limitations under the License. // limitations under the License.
#include <arm_neon.h> #include <arm_neon.h>
#include "mace/ops/delegator/bias_add.h"
#include "mace/ops/arm/base/bias_add.h"
namespace mace { namespace mace {
namespace ops { namespace ops {
namespace arm { namespace arm {
namespace fp32 {
class BiasAdd : public delegator::BiasAdd {
public:
explicit BiasAdd(const DelegatorParam &param) : delegator::BiasAdd(param) {}
~BiasAdd() = default;
MaceStatus Compute(const OpContext *context, const Tensor *input,
const Tensor *bias, Tensor *output) override;
private:
void AddBias(const OpContext *context, const Tensor *input,
const Tensor *bias, Tensor *output);
};
MaceStatus BiasAdd::Compute(const OpContext *context,
const Tensor *input,
const Tensor *bias,
Tensor *output) {
Tensor::MappingGuard input_guard(input);
Tensor::MappingGuard bias_guard(bias);
if (input != output) {
MACE_RETURN_IF_ERROR(output->ResizeLike(input));
if (bias == nullptr) {
output->Copy(*input);
} else {
Tensor::MappingGuard output_guard(output);
AddBias(context, input, bias, output);
}
} else {
if (bias != nullptr) {
AddBias(context, input, bias, output);
}
}
return MaceStatus::MACE_SUCCESS;
}
void BiasAdd::AddBias(const OpContext *context, template<>
const Tensor *input, void BiasAdd<float>::Add1DimBias(
const Tensor *bias, utils::ThreadPool *thread_pool, const float *input_data,
mace::Tensor *output) { const float *bias_data, float *output_data, const index_t batch,
auto input_data = input->data<float>(); const index_t channels, const index_t image_size) {
auto bias_data = bias->data<float>();
auto output_data = output->mutable_data<float>();
const index_t batch = input->dim(0);
const index_t channels = input->dim(1);
const index_t height = output->dim(2);
const index_t width = output->dim(3);
const index_t image_size = height * width;
const index_t block_count = image_size / 4; const index_t block_count = image_size / 4;
const index_t remain = image_size % 4; const index_t remain = image_size % 4;
thread_pool->Compute2D([=](index_t start0, index_t end0, index_t step0,
utils::ThreadPool
&thread_pool = context->device()->cpu_runtime()->thread_pool();
if (bias->dim_size() == 1) {
thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0,
index_t start1, index_t end1, index_t step1) { index_t start1, index_t end1, index_t step1) {
for (index_t b = start0; b < end0; b += step0) { for (index_t b = start0; b < end0; b += step0) {
const index_t b_offset = b * channels; const index_t b_offset = b * channels;
...@@ -100,8 +52,16 @@ void BiasAdd::AddBias(const OpContext *context, ...@@ -100,8 +52,16 @@ void BiasAdd::AddBias(const OpContext *context,
} }
} }
}, 0, batch, 1, 0, channels, 1); }, 0, batch, 1, 0, channels, 1);
} else { }
thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0,
template<>
void BiasAdd<float>::Add2DimsBias(
utils::ThreadPool *thread_pool, const float *input_data,
const float *bias_data, float *output_data, const index_t batch,
const index_t channels, const index_t image_size) {
const index_t block_count = image_size / 4;
const index_t remain = image_size % 4;
thread_pool->Compute2D([=](index_t start0, index_t end0, index_t step0,
index_t start1, index_t end1, index_t step1) { index_t start1, index_t end1, index_t step1) {
for (index_t b = start0; b < end0; b += step0) { for (index_t b = start0; b < end0; b += step0) {
const index_t b_offset = b * channels; const index_t b_offset = b * channels;
...@@ -126,16 +86,8 @@ void BiasAdd::AddBias(const OpContext *context, ...@@ -126,16 +86,8 @@ void BiasAdd::AddBias(const OpContext *context,
} }
} }
}, 0, batch, 1, 0, channels, 1); }, 0, batch, 1, 0, channels, 1);
}
}
void RegisterBiasAddDelegator(OpDelegatorRegistry *registry) {
MACE_REGISTER_DELEGATOR(
registry, BiasAdd, DelegatorParam,
MACE_DELEGATOR_KEY(BiasAdd, DeviceType::CPU, float, ImplType::NEON));
} }
} // namespace fp32
} // namespace arm } // namespace arm
} // namespace ops } // namespace ops
} // namespace mace } // namespace mace
...@@ -21,7 +21,6 @@ ...@@ -21,7 +21,6 @@
namespace mace { namespace mace {
namespace ops { namespace ops {
namespace arm { namespace arm {
namespace fp32 {
inline float32x4_t neon_vfma_lane_0(float32x4_t a, inline float32x4_t neon_vfma_lane_0(float32x4_t a,
float32x4_t b, float32x4_t b,
...@@ -63,7 +62,6 @@ inline float32x4_t neon_vfma_lane_3(float32x4_t a, ...@@ -63,7 +62,6 @@ inline float32x4_t neon_vfma_lane_3(float32x4_t a,
#endif #endif
} }
} // namespace fp32
} // namespace arm } // namespace arm
} // namespace ops } // namespace ops
} // namespace mace } // namespace mace
......
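Judging from their names, signatures and the compile-time branch closed by the #endif above, the neon_vfma_lane_N helpers accumulate b multiplied by lane N of c into a. A scalar reference of the lane-0 variant, for illustration only (the real helpers are expected to map to vfmaq_laneq_f32 / vmlaq_lane_f32):

// result[i] = a[i] + b[i] * c[0] for each of the four lanes.
inline void NeonVfmaLane0Ref(const float a[4], const float b[4],
                             const float c[4], float result[4]) {
  for (int i = 0; i < 4; ++i) {
    result[i] = a[i] + b[i] * c[0];
  }
}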
// Copyright 2019 The MACE Authors. All Rights Reserved. // Copyright 2020 The MACE Authors. All Rights Reserved.
// //
// Licensed under the Apache License, Version 2.0 (the "License"); // Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License. // you may not use this file except in compliance with the License.
...@@ -12,93 +12,44 @@ ...@@ -12,93 +12,44 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "mace/ops/arm/fp32/conv_2d_1xn.h"
#include <arm_neon.h> #include <arm_neon.h>
#include <memory> #include <memory>
#include "mace/ops/arm/base/conv_2d_1xn.h"
#include "mace/ops/delegator/conv_2d.h" #include "mace/ops/delegator/conv_2d.h"
namespace mace { namespace mace {
namespace ops { namespace ops {
namespace arm { namespace arm {
namespace fp32 {
template<>
MaceStatus Conv2dK1x7S1::Compute(const OpContext *context, MaceStatus Conv2dK1x7S1<float>::DoCompute(
const Tensor *input, const ConvComputeParam &p, const float *filter_data,
const Tensor *filter, const float *input_data, float *output_data) {
Tensor *output) { p.thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0,
std::unique_ptr<const Tensor> padded_input;
std::unique_ptr<Tensor> padded_output;
ResizeOutAndPadInOut(context,
input,
filter,
output,
1,
4,
&padded_input,
&padded_output);
const Tensor *in_tensor = input;
if (padded_input != nullptr) {
in_tensor = padded_input.get();
}
Tensor *out_tensor = output;
if (padded_output != nullptr) {
out_tensor = padded_output.get();
}
out_tensor->Clear();
Tensor::MappingGuard in_guard(input);
Tensor::MappingGuard filter_guard(filter);
Tensor::MappingGuard out_guard(output);
auto filter_data = filter->data<float>();
auto input_data = in_tensor->data<float>();
auto output_data = out_tensor->mutable_data<float>();
auto &in_shape = in_tensor->shape();
auto &out_shape = out_tensor->shape();
const index_t batch = in_shape[0];
const index_t in_channels = in_shape[1];
const index_t in_height = in_shape[2];
const index_t in_width = in_shape[3];
const index_t out_channels = out_shape[1];
const index_t out_height = out_shape[2];
const index_t out_width = out_shape[3];
const index_t in_image_size = in_height * in_width;
const index_t out_image_size = out_height * out_width;
const index_t in_batch_size = in_channels * in_image_size;
const index_t out_batch_size = out_channels * out_image_size;
utils::ThreadPool
&thread_pool = context->device()->cpu_runtime()->thread_pool();
thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0,
index_t start1, index_t end1, index_t step1) { index_t start1, index_t end1, index_t step1) {
for (index_t b = start0; b < end0; b += step0) { for (index_t b = start0; b < end0; b += step0) {
for (index_t m = start1; m < end1; m += step1) { for (index_t m = start1; m < end1; m += step1) {
if (m + 3 < out_channels) { if (m + 3 < p.out_channels) {
float *out_ptr0_base = float *out_ptr0_base =
output_data + b * out_batch_size + m * out_image_size; output_data + b * p.out_batch_size + m * p.out_image_size;
float *out_ptr1_base = float *out_ptr1_base =
output_data + b * out_batch_size + (m + 1) * out_image_size; output_data + b * p.out_batch_size + (m + 1) * p.out_image_size;
float *out_ptr2_base = float *out_ptr2_base =
output_data + b * out_batch_size + (m + 2) * out_image_size; output_data + b * p.out_batch_size + (m + 2) * p.out_image_size;
float *out_ptr3_base = float *out_ptr3_base =
output_data + b * out_batch_size + (m + 3) * out_image_size; output_data + b * p.out_batch_size + (m + 3) * p.out_image_size;
for (index_t c = 0; c < in_channels; ++c) { for (index_t c = 0; c < p.in_channels; ++c) {
const float *in_ptr_base = const float *in_ptr_base =
input_data + b * in_batch_size + c * in_image_size; input_data + b * p.in_batch_size + c * p.in_image_size;
const float const float *filter_ptr0 =
*filter_ptr0 = filter_data + m * in_channels * 7 + c * 7; filter_data + m * p.in_channels * 7 + c * 7;
const float const float *filter_ptr1 =
*filter_ptr1 = filter_data + (m + 1) * in_channels * 7 + c * 7; filter_data + (m + 1) * p.in_channels * 7 + c * 7;
const float const float *filter_ptr2 =
*filter_ptr2 = filter_data + (m + 2) * in_channels * 7 + c * 7; filter_data + (m + 2) * p.in_channels * 7 + c * 7;
const float const float *filter_ptr3 =
*filter_ptr3 = filter_data + (m + 3) * in_channels * 7 + c * 7; filter_data + (m + 3) * p.in_channels * 7 + c * 7;
/* load filter (4 outch x 1 height x 4 width) */ /* load filter (4 outch x 1 height x 4 width) */
float32x4_t vf00, vf01; float32x4_t vf00, vf01;
float32x4_t vf10, vf11; float32x4_t vf10, vf11;
...@@ -113,12 +64,12 @@ MaceStatus Conv2dK1x7S1::Compute(const OpContext *context, ...@@ -113,12 +64,12 @@ MaceStatus Conv2dK1x7S1::Compute(const OpContext *context,
vf30 = vld1q_f32(filter_ptr3); vf30 = vld1q_f32(filter_ptr3);
vf31 = vld1q_f32(filter_ptr3 + 3); vf31 = vld1q_f32(filter_ptr3 + 3);
for (index_t h = 0; h < out_height; ++h) { for (index_t h = 0; h < p.out_height; ++h) {
for (index_t w = 0; w + 3 < out_width; w += 4) { for (index_t w = 0; w + 3 < p.out_width; w += 4) {
// output (4 outch x 1 height x 4 width): vo_outch_height // output (4 outch x 1 height x 4 width): vo_outch_height
float32x4_t vo0, vo1, vo2, vo3; float32x4_t vo0, vo1, vo2, vo3;
// load output // load output
index_t out_offset = h * out_width + w; index_t out_offset = h * p.out_width + w;
vo0 = vld1q_f32(out_ptr0_base + out_offset); vo0 = vld1q_f32(out_ptr0_base + out_offset);
vo1 = vld1q_f32(out_ptr1_base + out_offset); vo1 = vld1q_f32(out_ptr1_base + out_offset);
vo2 = vld1q_f32(out_ptr2_base + out_offset); vo2 = vld1q_f32(out_ptr2_base + out_offset);
...@@ -127,7 +78,7 @@ MaceStatus Conv2dK1x7S1::Compute(const OpContext *context, ...@@ -127,7 +78,7 @@ MaceStatus Conv2dK1x7S1::Compute(const OpContext *context,
// input (3 slide) // input (3 slide)
float32x4_t vi0, vi1, vi2, vi3, vi4, vi5, vi6, vi8; float32x4_t vi0, vi1, vi2, vi3, vi4, vi5, vi6, vi8;
// input offset // input offset
index_t in_offset = h * in_width + w; index_t in_offset = h * p.in_width + w;
// load input // load input
vi0 = vld1q_f32(in_ptr_base + in_offset); vi0 = vld1q_f32(in_ptr_base + in_offset);
vi4 = vld1q_f32(in_ptr_base + in_offset + 4); vi4 = vld1q_f32(in_ptr_base + in_offset + 4);
...@@ -214,31 +165,31 @@ MaceStatus Conv2dK1x7S1::Compute(const OpContext *context, ...@@ -214,31 +165,31 @@ MaceStatus Conv2dK1x7S1::Compute(const OpContext *context,
} // h } // h
} // c } // c
} else { } else {
for (index_t mm = m; mm < out_channels; ++mm) { for (index_t mm = m; mm < p.out_channels; ++mm) {
float *out_ptr0_base = float *out_ptr0_base =
output_data + b * out_batch_size + mm * out_image_size; output_data + b * p.out_batch_size + mm * p.out_image_size;
for (index_t c = 0; c < in_channels; ++c) { for (index_t c = 0; c < p.in_channels; ++c) {
const float *in_ptr_base = const float *in_ptr_base =
input_data + b * in_batch_size + c * in_image_size; input_data + b * p.in_batch_size + c * p.in_image_size;
const float const float *filter_ptr0 =
*filter_ptr0 = filter_data + mm * in_channels * 7 + c * 7; filter_data + mm * p.in_channels * 7 + c * 7;
/* load filter (1 outch x 1 height x 4 width) */ /* load filter (1 outch x 1 height x 4 width) */
float32x4_t vf00, vf01; float32x4_t vf00, vf01;
vf00 = vld1q_f32(filter_ptr0); vf00 = vld1q_f32(filter_ptr0);
vf01 = vld1q_f32(filter_ptr0 + 3); vf01 = vld1q_f32(filter_ptr0 + 3);
for (index_t h = 0; h < out_height; ++h) { for (index_t h = 0; h < p.out_height; ++h) {
for (index_t w = 0; w + 3 < out_width; w += 4) { for (index_t w = 0; w + 3 < p.out_width; w += 4) {
// output (1 outch x 1 height x 4 width): vo_outch_height // output (1 outch x 1 height x 4 width): vo_outch_height
float32x4_t vo0; float32x4_t vo0;
// load output // load output
index_t out_offset = h * out_width + w; index_t out_offset = h * p.out_width + w;
vo0 = vld1q_f32(out_ptr0_base + out_offset); vo0 = vld1q_f32(out_ptr0_base + out_offset);
// input (3 slide) // input (3 slide)
float32x4_t vi0, vi1, vi2, vi3, vi4, vi5, vi6, vi8; float32x4_t vi0, vi1, vi2, vi3, vi4, vi5, vi6, vi8;
// input offset // input offset
index_t in_offset = h * in_width + w; index_t in_offset = h * p.in_width + w;
// load input // load input
vi0 = vld1q_f32(in_ptr_base + in_offset); vi0 = vld1q_f32(in_ptr_base + in_offset);
vi4 = vld1q_f32(in_ptr_base + in_offset + 4); vi4 = vld1q_f32(in_ptr_base + in_offset + 4);
...@@ -275,87 +226,39 @@ MaceStatus Conv2dK1x7S1::Compute(const OpContext *context, ...@@ -275,87 +226,39 @@ MaceStatus Conv2dK1x7S1::Compute(const OpContext *context,
} // if } // if
} // m } // m
} // b } // b
}, 0, batch, 1, 0, out_channels, 4); }, 0, p.batch, 1, 0, p.out_channels, 4);
UnPadOutput(*out_tensor, output);
return MaceStatus::MACE_SUCCESS; return MaceStatus::MACE_SUCCESS;
} }
MaceStatus Conv2dK7x1S1::Compute(const OpContext *context, template<>
const Tensor *input, MaceStatus Conv2dK7x1S1<float>::DoCompute(
const Tensor *filter, const ConvComputeParam &p, const float *filter_data,
Tensor *output) { const float *input_data, float *output_data) {
std::unique_ptr<const Tensor> padded_input; p.thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0,
std::unique_ptr<Tensor> padded_output;
ResizeOutAndPadInOut(context,
input,
filter,
output,
4,
1,
&padded_input,
&padded_output);
const Tensor *in_tensor = input;
if (padded_input != nullptr) {
in_tensor = padded_input.get();
}
Tensor *out_tensor = output;
if (padded_output != nullptr) {
out_tensor = padded_output.get();
}
out_tensor->Clear();
Tensor::MappingGuard in_guard(input);
Tensor::MappingGuard filter_guard(filter);
Tensor::MappingGuard out_guard(output);
auto filter_data = filter->data<float>();
auto input_data = in_tensor->data<float>();
auto output_data = out_tensor->mutable_data<float>();
auto &in_shape = in_tensor->shape();
auto &out_shape = out_tensor->shape();
const index_t batch = in_shape[0];
const index_t in_channels = in_shape[1];
const index_t in_height = in_shape[2];
const index_t in_width = in_shape[3];
const index_t out_channels = out_shape[1];
const index_t out_height = out_shape[2];
const index_t out_width = out_shape[3];
const index_t in_image_size = in_height * in_width;
const index_t out_image_size = out_height * out_width;
const index_t in_batch_size = in_channels * in_image_size;
const index_t out_batch_size = out_channels * out_image_size;
utils::ThreadPool
&thread_pool = context->device()->cpu_runtime()->thread_pool();
thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0,
index_t start1, index_t end1, index_t step1) { index_t start1, index_t end1, index_t step1) {
for (index_t b = start0; b < end0; b += step0) { for (index_t b = start0; b < end0; b += step0) {
for (index_t m = start1; m < end1; m += step1) { for (index_t m = start1; m < end1; m += step1) {
if (m + 3 < out_channels) { if (m + 3 < p.out_channels) {
float *out_ptr0_base = float *out_ptr0_base =
output_data + b * out_batch_size + m * out_image_size; output_data + b * p.out_batch_size + m * p.out_image_size;
float *out_ptr1_base = float *out_ptr1_base =
output_data + b * out_batch_size + (m + 1) * out_image_size; output_data + b * p.out_batch_size + (m + 1) * p.out_image_size;
float *out_ptr2_base = float *out_ptr2_base =
output_data + b * out_batch_size + (m + 2) * out_image_size; output_data + b * p.out_batch_size + (m + 2) * p.out_image_size;
float *out_ptr3_base = float *out_ptr3_base =
output_data + b * out_batch_size + (m + 3) * out_image_size; output_data + b * p.out_batch_size + (m + 3) * p.out_image_size;
for (index_t c = 0; c < in_channels; ++c) { for (index_t c = 0; c < p.in_channels; ++c) {
const float *in_ptr_base = const float *in_ptr_base =
input_data + b * in_batch_size + c * in_image_size; input_data + b * p.in_batch_size + c * p.in_image_size;
const float const float *filter_ptr0 =
*filter_ptr0 = filter_data + m * in_channels * 7 + c * 7; filter_data + m * p.in_channels * 7 + c * 7;
const float const float *filter_ptr1 =
*filter_ptr1 = filter_data + (m + 1) * in_channels * 7 + c * 7; filter_data + (m + 1) * p.in_channels * 7 + c * 7;
const float const float *filter_ptr2 =
*filter_ptr2 = filter_data + (m + 2) * in_channels * 7 + c * 7; filter_data + (m + 2) * p.in_channels * 7 + c * 7;
const float const float *filter_ptr3 =
*filter_ptr3 = filter_data + (m + 3) * in_channels * 7 + c * 7; filter_data + (m + 3) * p.in_channels * 7 + c * 7;
/* load filter (4 outch x 4 height x 1 width) */ /* load filter (4 outch x 4 height x 1 width) */
float32x4_t vf00, vf01; float32x4_t vf00, vf01;
float32x4_t vf10, vf11; float32x4_t vf10, vf11;
...@@ -370,41 +273,41 @@ MaceStatus Conv2dK7x1S1::Compute(const OpContext *context, ...@@ -370,41 +273,41 @@ MaceStatus Conv2dK7x1S1::Compute(const OpContext *context,
vf30 = vld1q_f32(filter_ptr3); vf30 = vld1q_f32(filter_ptr3);
vf31 = vld1q_f32(filter_ptr3 + 3); vf31 = vld1q_f32(filter_ptr3 + 3);
for (index_t h = 0; h + 3 < out_height; h += 4) { for (index_t h = 0; h + 3 < p.out_height; h += 4) {
for (index_t w = 0; w < out_width; ++w) { for (index_t w = 0; w < p.out_width; ++w) {
// load output // load output
index_t out_offset = h * out_width + w; index_t out_offset = h * p.out_width + w;
// output (4 outch x 4 height x 1 width): vo_outch_height // output (4 outch x 4 height x 1 width): vo_outch_height
float32x4_t vo0 = {out_ptr0_base[out_offset], float32x4_t vo0 = {out_ptr0_base[out_offset],
out_ptr0_base[out_offset + out_width], out_ptr0_base[out_offset + p.out_width],
out_ptr0_base[out_offset + 2 * out_width], out_ptr0_base[out_offset + 2 * p.out_width],
out_ptr0_base[out_offset + 3 * out_width]}; out_ptr0_base[out_offset + 3 * p.out_width]};
float32x4_t vo1 = {out_ptr1_base[out_offset], float32x4_t vo1 = {out_ptr1_base[out_offset],
out_ptr1_base[out_offset + out_width], out_ptr1_base[out_offset + p.out_width],
out_ptr1_base[out_offset + 2 * out_width], out_ptr1_base[out_offset + 2 * p.out_width],
out_ptr1_base[out_offset + 3 * out_width]}; out_ptr1_base[out_offset + 3 * p.out_width]};
float32x4_t vo2 = {out_ptr2_base[out_offset], float32x4_t vo2 = {out_ptr2_base[out_offset],
out_ptr2_base[out_offset + out_width], out_ptr2_base[out_offset + p.out_width],
out_ptr2_base[out_offset + 2 * out_width], out_ptr2_base[out_offset + 2 * p.out_width],
out_ptr2_base[out_offset + 3 * out_width]}; out_ptr2_base[out_offset + 3 * p.out_width]};
float32x4_t vo3 = {out_ptr3_base[out_offset], float32x4_t vo3 = {out_ptr3_base[out_offset],
out_ptr3_base[out_offset + out_width], out_ptr3_base[out_offset + p.out_width],
out_ptr3_base[out_offset + 2 * out_width], out_ptr3_base[out_offset + 2 * p.out_width],
out_ptr3_base[out_offset + 3 * out_width]}; out_ptr3_base[out_offset + 3 * p.out_width]};
// input offset // input offset
index_t in_offset = h * in_width + w; index_t in_offset = h * p.in_width + w;
// input (3 slide) // input (3 slide)
float32x4_t vi0 = {in_ptr_base[in_offset], float32x4_t vi0 = {in_ptr_base[in_offset],
in_ptr_base[in_offset + in_width], in_ptr_base[in_offset + p.in_width],
in_ptr_base[in_offset + 2 * in_width], in_ptr_base[in_offset + 2 * p.in_width],
in_ptr_base[in_offset + 3 * in_width]}; in_ptr_base[in_offset + 3 * p.in_width]};
float32x4_t vi4 = {in_ptr_base[in_offset + 4 * in_width], float32x4_t vi4 = {in_ptr_base[in_offset + 4 * p.in_width],
in_ptr_base[in_offset + 5 * in_width], in_ptr_base[in_offset + 5 * p.in_width],
in_ptr_base[in_offset + 6 * in_width], in_ptr_base[in_offset + 6 * p.in_width],
in_ptr_base[in_offset + 7 * in_width]}; in_ptr_base[in_offset + 7 * p.in_width]};
float32x4_t vi8 = {in_ptr_base[in_offset + 8 * in_width], float32x4_t vi8 = {in_ptr_base[in_offset + 8 * p.in_width],
in_ptr_base[in_offset + 9 * in_width]}; in_ptr_base[in_offset + 9 * p.in_width]};
float32x4_t vi1 = vextq_f32(vi0, vi4, 1); float32x4_t vi1 = vextq_f32(vi0, vi4, 1);
float32x4_t vi2 = vextq_f32(vi0, vi4, 2); float32x4_t vi2 = vextq_f32(vi0, vi4, 2);
float32x4_t vi3 = vextq_f32(vi0, vi4, 3); float32x4_t vi3 = vextq_f32(vi0, vi4, 3);
...@@ -480,63 +383,65 @@ MaceStatus Conv2dK7x1S1::Compute(const OpContext *context, ...@@ -480,63 +383,65 @@ MaceStatus Conv2dK7x1S1::Compute(const OpContext *context,
#endif #endif
out_ptr0_base[out_offset] = vo0[0]; out_ptr0_base[out_offset] = vo0[0];
out_ptr0_base[out_offset + out_width] = vo0[1]; out_ptr0_base[out_offset + p.out_width] = vo0[1];
out_ptr0_base[out_offset + 2 * out_width] = vo0[2]; out_ptr0_base[out_offset + 2 * p.out_width] = vo0[2];
out_ptr0_base[out_offset + 3 * out_width] = vo0[3]; out_ptr0_base[out_offset + 3 * p.out_width] = vo0[3];
out_ptr1_base[out_offset] = vo1[0]; out_ptr1_base[out_offset] = vo1[0];
out_ptr1_base[out_offset + out_width] = vo1[1]; out_ptr1_base[out_offset + p.out_width] = vo1[1];
out_ptr1_base[out_offset + 2 * out_width] = vo1[2]; out_ptr1_base[out_offset + 2 * p.out_width] = vo1[2];
out_ptr1_base[out_offset + 3 * out_width] = vo1[3]; out_ptr1_base[out_offset + 3 * p.out_width] = vo1[3];
out_ptr2_base[out_offset] = vo2[0]; out_ptr2_base[out_offset] = vo2[0];
out_ptr2_base[out_offset + out_width] = vo2[1]; out_ptr2_base[out_offset + p.out_width] = vo2[1];
out_ptr2_base[out_offset + 2 * out_width] = vo2[2]; out_ptr2_base[out_offset + 2 * p.out_width] = vo2[2];
out_ptr2_base[out_offset + 3 * out_width] = vo2[3]; out_ptr2_base[out_offset + 3 * p.out_width] = vo2[3];
out_ptr3_base[out_offset] = vo3[0]; out_ptr3_base[out_offset] = vo3[0];
out_ptr3_base[out_offset + out_width] = vo3[1]; out_ptr3_base[out_offset + p.out_width] = vo3[1];
out_ptr3_base[out_offset + 2 * out_width] = vo3[2]; out_ptr3_base[out_offset + 2 * p.out_width] = vo3[2];
out_ptr3_base[out_offset + 3 * out_width] = vo3[3]; out_ptr3_base[out_offset + 3 * p.out_width] = vo3[3];
} // w } // w
} // h } // h
} // c } // c
} else { } else {
for (index_t mm = m; mm < out_channels; ++mm) { for (index_t mm = m; mm < p.out_channels; ++mm) {
float *out_ptr0_base = float *out_ptr0_base =
output_data + b * out_batch_size + mm * out_image_size; output_data + b * p.out_batch_size + mm * p.out_image_size;
for (index_t c = 0; c < in_channels; ++c) { for (index_t c = 0; c < p.in_channels; ++c) {
const float *in_ptr_base = const float *in_ptr_base =
input_data + b * in_batch_size + c * in_image_size; input_data + b * p.in_batch_size + c * p.in_image_size;
const float const float *filter_ptr0 =
*filter_ptr0 = filter_data + mm * in_channels * 7 + c * 7; filter_data + mm * p.in_channels * 7 + c * 7;
/* load filter (1 outch x 4 height x 1 width) */ /* load filter (1 outch x 4 height x 1 width) */
float32x4_t vf00, vf01; float32x4_t vf00, vf01;
vf00 = vld1q_f32(filter_ptr0); vf00 = vld1q_f32(filter_ptr0);
vf01 = vld1q_f32(filter_ptr0 + 3); vf01 = vld1q_f32(filter_ptr0 + 3);
for (index_t h = 0; h + 3 < out_height; h += 4) { for (index_t h = 0; h + 3 < p.out_height; h += 4) {
for (index_t w = 0; w < out_width; ++w) { for (index_t w = 0; w < p.out_width; ++w) {
// load output // load output
index_t out_offset = h * out_width + w; index_t out_offset = h * p.out_width + w;
// output (1 outch x 4 height x 1 width): vo_outch_height // output (1 outch x 4 height x 1 width): vo_outch_height
float32x4_t vo0 = {out_ptr0_base[out_offset], float32x4_t vo0 = {out_ptr0_base[out_offset],
out_ptr0_base[out_offset + out_width], out_ptr0_base[out_offset + p.out_width],
out_ptr0_base[out_offset + 2 * out_width], out_ptr0_base[out_offset
out_ptr0_base[out_offset + 3 * out_width]}; + 2 * p.out_width],
out_ptr0_base[out_offset
+ 3 * p.out_width]};
// input offset // input offset
index_t in_offset = h * in_width + w; index_t in_offset = h * p.in_width + w;
// input (3 slide) // input (3 slide)
float32x4_t vi0 = {in_ptr_base[in_offset], float32x4_t vi0 = {in_ptr_base[in_offset],
in_ptr_base[in_offset + in_width], in_ptr_base[in_offset + p.in_width],
in_ptr_base[in_offset + 2 * in_width], in_ptr_base[in_offset + 2 * p.in_width],
in_ptr_base[in_offset + 3 * in_width]}; in_ptr_base[in_offset + 3 * p.in_width]};
float32x4_t vi4 = {in_ptr_base[in_offset + 4 * in_width], float32x4_t vi4 = {in_ptr_base[in_offset + 4 * p.in_width],
in_ptr_base[in_offset + 5 * in_width], in_ptr_base[in_offset + 5 * p.in_width],
in_ptr_base[in_offset + 6 * in_width], in_ptr_base[in_offset + 6 * p.in_width],
in_ptr_base[in_offset + 7 * in_width]}; in_ptr_base[in_offset + 7 * p.in_width]};
float32x4_t vi8 = {in_ptr_base[in_offset + 8 * in_width], float32x4_t vi8 = {in_ptr_base[in_offset + 8 * p.in_width],
in_ptr_base[in_offset + 9 * in_width], in_ptr_base[in_offset + 9 * p.in_width],
in_ptr_base[in_offset + 10 * in_width], in_ptr_base[in_offset + 10 * p.in_width],
in_ptr_base[in_offset + 11 * in_width]}; in_ptr_base[in_offset + 11 * p.in_width]};
float32x4_t vi1 = vextq_f32(vi0, vi4, 1); float32x4_t vi1 = vextq_f32(vi0, vi4, 1);
float32x4_t vi2 = vextq_f32(vi0, vi4, 2); float32x4_t vi2 = vextq_f32(vi0, vi4, 2);
float32x4_t vi3 = vextq_f32(vi0, vi4, 3); float32x4_t vi3 = vextq_f32(vi0, vi4, 3);
...@@ -562,9 +467,9 @@ MaceStatus Conv2dK7x1S1::Compute(const OpContext *context, ...@@ -562,9 +467,9 @@ MaceStatus Conv2dK7x1S1::Compute(const OpContext *context,
#endif #endif
out_ptr0_base[out_offset] = vo0[0]; out_ptr0_base[out_offset] = vo0[0];
out_ptr0_base[out_offset + out_width] = vo0[1]; out_ptr0_base[out_offset + p.out_width] = vo0[1];
out_ptr0_base[out_offset + 2 * out_width] = vo0[2]; out_ptr0_base[out_offset + 2 * p.out_width] = vo0[2];
out_ptr0_base[out_offset + 3 * out_width] = vo0[3]; out_ptr0_base[out_offset + 3 * p.out_width] = vo0[3];
} // w } // w
} // h } // h
} // c } // c
...@@ -572,78 +477,30 @@ MaceStatus Conv2dK7x1S1::Compute(const OpContext *context, ...@@ -572,78 +477,30 @@ MaceStatus Conv2dK7x1S1::Compute(const OpContext *context,
} // if } // if
} // m } // m
} // b } // b
}, 0, batch, 1, 0, out_channels, 4); }, 0, p.batch, 1, 0, p.out_channels, 4);
UnPadOutput(*out_tensor, output);
return MaceStatus::MACE_SUCCESS; return MaceStatus::MACE_SUCCESS;
} }
MaceStatus Conv2dK1x15S1::Compute(const OpContext *context, template<>
const Tensor *input, MaceStatus Conv2dK1x15S1<float>::DoCompute(
const Tensor *filter, const ConvComputeParam &p, const float *filter_data,
Tensor *output) { const float *input_data, float *output_data) {
std::unique_ptr<const Tensor> padded_input;
std::unique_ptr<Tensor> padded_output;
ResizeOutAndPadInOut(context,
input,
filter,
output,
1,
4,
&padded_input,
&padded_output);
const Tensor *in_tensor = input;
if (padded_input.get() != nullptr) {
in_tensor = padded_input.get();
}
Tensor *out_tensor = output;
if (padded_output.get() != nullptr) {
out_tensor = padded_output.get();
}
out_tensor->Clear();
Tensor::MappingGuard in_guard(input);
Tensor::MappingGuard filter_guard(filter);
Tensor::MappingGuard out_guard(output);
auto filter_data = filter->data<float>();
auto input_data = in_tensor->data<float>();
auto output_data = out_tensor->mutable_data<float>();
auto &in_shape = in_tensor->shape();
auto &out_shape = out_tensor->shape();
const index_t batch = in_shape[0];
const index_t in_channels = in_shape[1];
const index_t in_height = in_shape[2];
const index_t in_width = in_shape[3];
const index_t out_channels = out_shape[1];
const index_t out_height = out_shape[2];
const index_t out_width = out_shape[3];
const index_t in_image_size = in_height * in_width;
const index_t out_image_size = out_height * out_width;
const index_t in_batch_size = in_channels * in_image_size;
const index_t out_batch_size = out_channels * out_image_size;
const index_t tile_height = const index_t tile_height =
out_channels < 4 ? RoundUpDiv4(out_height) : out_height; p.out_channels < 4 ? RoundUpDiv4(p.out_height) : p.out_height;
utils::ThreadPool
&thread_pool = context->device()->cpu_runtime()->thread_pool();
thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0, p.thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0,
index_t start1, index_t end1, index_t step1) { index_t start1, index_t end1, index_t step1) {
for (index_t b = start0; b < end0; b += step0) { for (index_t b = start0; b < end0; b += step0) {
for (index_t m = start1; m < end1; m += step1) { for (index_t m = start1; m < end1; m += step1) {
for (index_t h = 0; h < out_height; h += tile_height) { for (index_t h = 0; h < p.out_height; h += tile_height) {
float *out_ptr_base = float *out_ptr_base =
output_data + b * out_batch_size + m * out_image_size; output_data + b * p.out_batch_size + m * p.out_image_size;
for (index_t c = 0; c < in_channels; ++c) { for (index_t c = 0; c < p.in_channels; ++c) {
const float *in_ptr_base = const float *in_ptr_base =
input_data + b * in_batch_size + c * in_image_size; input_data + b * p.in_batch_size + c * p.in_image_size;
const float const float *filter_ptr =
*filter_ptr = filter_data + m * in_channels * 15 + c * 15; filter_data + m * p.in_channels * 15 + c * 15;
/* load filter (1 outch x 4 height x 1 width) */ /* load filter (1 outch x 4 height x 1 width) */
float32x4_t vf0, vf1, vf2, vf3; float32x4_t vf0, vf1, vf2, vf3;
vf0 = vld1q_f32(filter_ptr); vf0 = vld1q_f32(filter_ptr);
...@@ -651,20 +508,20 @@ MaceStatus Conv2dK1x15S1::Compute(const OpContext *context, ...@@ -651,20 +508,20 @@ MaceStatus Conv2dK1x15S1::Compute(const OpContext *context,
vf2 = vld1q_f32(filter_ptr + 8); vf2 = vld1q_f32(filter_ptr + 8);
vf3 = vld1q_f32(filter_ptr + 11); vf3 = vld1q_f32(filter_ptr + 11);
for (index_t ht = 0; ht < tile_height && h + ht < out_height; for (index_t ht = 0; ht < tile_height && h + ht < p.out_height;
++ht) { ++ht) {
for (index_t w = 0; w + 3 < out_width; w += 4) { for (index_t w = 0; w + 3 < p.out_width; w += 4) {
// output (1 outch x 1 height x 4 width): vo_outch_height // output (1 outch x 1 height x 4 width): vo_outch_height
float32x4_t vo; float32x4_t vo;
// load output // load output
index_t out_offset = (h + ht) * out_width + w; index_t out_offset = (h + ht) * p.out_width + w;
vo = vld1q_f32(out_ptr_base + out_offset); vo = vld1q_f32(out_ptr_base + out_offset);
// input (3 slide) // input (3 slide)
float32x4_t vi0, vi1, vi2, vi3, vi4, vi5, vi6, vi7, vi8, vi9, float32x4_t vi0, vi1, vi2, vi3, vi4, vi5, vi6, vi7, vi8, vi9,
vi10, vi11, vi12, vi13, vi14, vi16; vi10, vi11, vi12, vi13, vi14, vi16;
// input offset // input offset
index_t in_offset = (h + ht) * in_width + w; index_t in_offset = (h + ht) * p.in_width + w;
// load input // load input
vi0 = vld1q_f32(in_ptr_base + in_offset); vi0 = vld1q_f32(in_ptr_base + in_offset);
vi4 = vld1q_f32(in_ptr_base + in_offset + 4); vi4 = vld1q_f32(in_ptr_base + in_offset + 4);
...@@ -706,78 +563,30 @@ MaceStatus Conv2dK1x15S1::Compute(const OpContext *context, ...@@ -706,78 +563,30 @@ MaceStatus Conv2dK1x15S1::Compute(const OpContext *context,
} // h } // h
} // m } // m
} // b } // b
}, 0, batch, 1, 0, out_channels, 1); }, 0, p.batch, 1, 0, p.out_channels, 1);
UnPadOutput(*out_tensor, output);
return MaceStatus::MACE_SUCCESS; return MaceStatus::MACE_SUCCESS;
} }
MaceStatus Conv2dK15x1S1::Compute(const OpContext *context, template<>
const Tensor *input, MaceStatus Conv2dK15x1S1<float>::DoCompute(
const Tensor *filter, const ConvComputeParam &p, const float *filter_data,
Tensor *output) { const float *input_data, float *output_data) {
std::unique_ptr<const Tensor> padded_input;
std::unique_ptr<Tensor> padded_output;
ResizeOutAndPadInOut(context,
input,
filter,
output,
4,
1,
&padded_input,
&padded_output);
const Tensor *in_tensor = input;
if (padded_input.get() != nullptr) {
in_tensor = padded_input.get();
}
Tensor *out_tensor = output;
if (padded_output.get() != nullptr) {
out_tensor = padded_output.get();
}
out_tensor->Clear();
Tensor::MappingGuard in_guard(input);
Tensor::MappingGuard filter_guard(filter);
Tensor::MappingGuard out_guard(output);
auto filter_data = filter->data<float>();
auto input_data = in_tensor->data<float>();
auto output_data = out_tensor->mutable_data<float>();
auto &in_shape = in_tensor->shape();
auto &out_shape = out_tensor->shape();
const index_t batch = in_shape[0];
const index_t in_channels = in_shape[1];
const index_t in_height = in_shape[2];
const index_t in_width = in_shape[3];
const index_t out_channels = out_shape[1];
const index_t out_height = out_shape[2];
const index_t out_width = out_shape[3];
const index_t in_image_size = in_height * in_width;
const index_t out_image_size = out_height * out_width;
const index_t in_batch_size = in_channels * in_image_size;
const index_t out_batch_size = out_channels * out_image_size;
const index_t tile_width = const index_t tile_width =
out_channels < 4 ? RoundUpDiv4(out_width) : out_width; p.out_channels < 4 ? RoundUpDiv4(p.out_width) : p.out_width;
p.thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0,
utils::ThreadPool
&thread_pool = context->device()->cpu_runtime()->thread_pool();
thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0,
index_t start1, index_t end1, index_t step1) { index_t start1, index_t end1, index_t step1) {
for (index_t b = start0; b < end0; b += step0) { for (index_t b = start0; b < end0; b += step0) {
for (index_t m = start1; m < end1; m += step1) { for (index_t m = start1; m < end1; m += step1) {
for (index_t w = 0; w < out_width; w += tile_width) { for (index_t w = 0; w < p.out_width; w += tile_width) {
float *out_ptr_base = float *out_ptr_base =
output_data + b * out_batch_size + m * out_image_size; output_data + b * p.out_batch_size + m * p.out_image_size;
for (index_t c = 0; c < in_channels; ++c) { for (index_t c = 0; c < p.in_channels; ++c) {
const float *in_ptr_base = const float *in_ptr_base =
input_data + b * in_batch_size + c * in_image_size; input_data + b * p.in_batch_size + c * p.in_image_size;
const float const float *filter_ptr =
*filter_ptr = filter_data + m * in_channels * 15 + c * 15; filter_data + m * p.in_channels * 15 + c * 15;
/* load filter (1 outch x 4 height x 1 width) */ /* load filter (1 outch x 4 height x 1 width) */
float32x4_t vf0, vf1, vf2, vf3; float32x4_t vf0, vf1, vf2, vf3;
vf0 = vld1q_f32(filter_ptr); vf0 = vld1q_f32(filter_ptr);
...@@ -785,38 +594,38 @@ MaceStatus Conv2dK15x1S1::Compute(const OpContext *context, ...@@ -785,38 +594,38 @@ MaceStatus Conv2dK15x1S1::Compute(const OpContext *context,
vf2 = vld1q_f32(filter_ptr + 8); vf2 = vld1q_f32(filter_ptr + 8);
vf3 = vld1q_f32(filter_ptr + 11); vf3 = vld1q_f32(filter_ptr + 11);
for (index_t h = 0; h + 3 < out_height; h += 4) { for (index_t h = 0; h + 3 < p.out_height; h += 4) {
for (index_t wt = 0; wt < tile_width && w + wt < out_width; for (index_t wt = 0; wt < tile_width && w + wt < p.out_width;
++wt) { ++wt) {
// load output // load output
index_t out_offset = h * out_width + w + wt; index_t out_offset = h * p.out_width + w + wt;
// output (1 outch x 4 height x 1 width): vo_outch_height // output (1 outch x 4 height x 1 width): vo_outch_height
float32x4_t vo = {out_ptr_base[out_offset], float32x4_t vo = {out_ptr_base[out_offset],
out_ptr_base[out_offset + out_width], out_ptr_base[out_offset + p.out_width],
out_ptr_base[out_offset + 2 * out_width], out_ptr_base[out_offset + 2 * p.out_width],
out_ptr_base[out_offset + 3 * out_width]}; out_ptr_base[out_offset + 3 * p.out_width]};
// input offset // input offset
index_t in_offset = h * in_width + w + wt; index_t in_offset = h * p.in_width + w + wt;
// input (3 slide) // input (3 slide)
float32x4_t vi0 = {in_ptr_base[in_offset], float32x4_t vi0 = {in_ptr_base[in_offset],
in_ptr_base[in_offset + in_width], in_ptr_base[in_offset + p.in_width],
in_ptr_base[in_offset + 2 * in_width], in_ptr_base[in_offset + 2 * p.in_width],
in_ptr_base[in_offset + 3 * in_width]}; in_ptr_base[in_offset + 3 * p.in_width]};
float32x4_t vi4 = {in_ptr_base[in_offset + 4 * in_width], float32x4_t vi4 = {in_ptr_base[in_offset + 4 * p.in_width],
in_ptr_base[in_offset + 5 * in_width], in_ptr_base[in_offset + 5 * p.in_width],
in_ptr_base[in_offset + 6 * in_width], in_ptr_base[in_offset + 6 * p.in_width],
in_ptr_base[in_offset + 7 * in_width]}; in_ptr_base[in_offset + 7 * p.in_width]};
float32x4_t vi8 = {in_ptr_base[in_offset + 8 * in_width], float32x4_t vi8 = {in_ptr_base[in_offset + 8 * p.in_width],
in_ptr_base[in_offset + 9 * in_width], in_ptr_base[in_offset + 9 * p.in_width],
in_ptr_base[in_offset + 10 * in_width], in_ptr_base[in_offset + 10 * p.in_width],
in_ptr_base[in_offset + 11 * in_width]}; in_ptr_base[in_offset + 11 * p.in_width]};
float32x4_t vi12 = {in_ptr_base[in_offset + 12 * in_width], float32x4_t vi12 = {in_ptr_base[in_offset + 12 * p.in_width],
in_ptr_base[in_offset + 13 * in_width], in_ptr_base[in_offset + 13 * p.in_width],
in_ptr_base[in_offset + 14 * in_width], in_ptr_base[in_offset + 14 * p.in_width],
in_ptr_base[in_offset + 15 * in_width]}; in_ptr_base[in_offset + 15 * p.in_width]};
float32x4_t vi16 = {in_ptr_base[in_offset + 16 * in_width], float32x4_t vi16 = {in_ptr_base[in_offset + 16 * p.in_width],
in_ptr_base[in_offset + 17 * in_width]}; in_ptr_base[in_offset + 17 * p.in_width]};
float32x4_t vi1 = vextq_f32(vi0, vi4, 1); float32x4_t vi1 = vextq_f32(vi0, vi4, 1);
float32x4_t vi2 = vextq_f32(vi0, vi4, 2); float32x4_t vi2 = vextq_f32(vi0, vi4, 2);
float32x4_t vi3 = vextq_f32(vi0, vi4, 3); float32x4_t vi3 = vextq_f32(vi0, vi4, 3);
...@@ -846,44 +655,20 @@ MaceStatus Conv2dK15x1S1::Compute(const OpContext *context, ...@@ -846,44 +655,20 @@ MaceStatus Conv2dK15x1S1::Compute(const OpContext *context,
vo = vmlaq_lane_f32(vo, vi14, vget_high_f32(vf3), 1); vo = vmlaq_lane_f32(vo, vi14, vget_high_f32(vf3), 1);
out_ptr_base[out_offset] = vo[0]; out_ptr_base[out_offset] = vo[0];
out_ptr_base[out_offset + out_width] = vo[1]; out_ptr_base[out_offset + p.out_width] = vo[1];
out_ptr_base[out_offset + 2 * out_width] = vo[2]; out_ptr_base[out_offset + 2 * p.out_width] = vo[2];
out_ptr_base[out_offset + 3 * out_width] = vo[3]; out_ptr_base[out_offset + 3 * p.out_width] = vo[3];
} // wt } // wt
} // h } // h
} // c } // c
} // w } // w
} // m } // m
} // b } // b
}, 0, batch, 1, 0, out_channels, 1); }, 0, p.batch, 1, 0, p.out_channels, 1);
UnPadOutput(*out_tensor, output);
return MaceStatus::MACE_SUCCESS; return MaceStatus::MACE_SUCCESS;
} }
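Note on the sliding-window loads above: vi1 = vextq_f32(vi0, vi4, 1) and its siblings build the shifted windows the 15-tap kernel needs without extra loads, since vextq_f32(a, b, n) concatenates the last 4-n lanes of a with the first n lanes of b. A minimal standalone sketch of that trick, assuming an ARM build (with a scalar fallback so it also compiles elsewhere); it is an illustration only, not MACE code:

#include <cstdio>
#if defined(__ARM_NEON) || defined(__ARM_NEON__)
#include <arm_neon.h>
#endif

int main() {
  float row[8] = {0, 1, 2, 3, 4, 5, 6, 7};
#if defined(__ARM_NEON) || defined(__ARM_NEON__)
  float32x4_t vi0 = vld1q_f32(row);          // {0,1,2,3}
  float32x4_t vi4 = vld1q_f32(row + 4);      // {4,5,6,7}
  float32x4_t vi1 = vextq_f32(vi0, vi4, 1);  // window shifted by one: {1,2,3,4}
  float out[4];
  vst1q_f32(out, vi1);
  std::printf("%.0f %.0f %.0f %.0f\n", out[0], out[1], out[2], out[3]);
#else
  // Scalar equivalent of the shifted window, for non-ARM builds.
  std::printf("%.0f %.0f %.0f %.0f\n", row[1], row[2], row[3], row[4]);
#endif
  return 0;
}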
void RegisterConv2dK1xNDelegator(OpDelegatorRegistry *registry) {
MACE_REGISTER_DELEGATOR(
registry, Conv2dK1x7S1, delegator::Conv2dParam,
MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU,
float, ImplType::NEON, K1x7S1));
MACE_REGISTER_DELEGATOR(
registry, Conv2dK7x1S1, delegator::Conv2dParam,
MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU,
float, ImplType::NEON, K7x1S1));
MACE_REGISTER_DELEGATOR(
registry, Conv2dK1x15S1, delegator::Conv2dParam,
MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU,
float, ImplType::NEON, K1x15S1));
MACE_REGISTER_DELEGATOR(
registry, Conv2dK15x1S1, delegator::Conv2dParam,
MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU,
float, ImplType::NEON, K15x1S1));
}
} // namespace fp32
} // namespace arm } // namespace arm
} // namespace ops } // namespace ops
} // namespace mace } // namespace mace
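The change repeated across these kernel files follows one shape: the fp32-specific Compute(context, input, filter, output) bodies, together with their per-file padding, tensor mapping and Register* functions, give way to template<> ... DoCompute(const ConvComputeParam &p, ...) specializations driven from shared code. A simplified, self-contained sketch of that shape; apart from the DoCompute/ConvComputeParam names taken from the diff, every name here is a hypothetical stand-in, not MACE's actual classes:

#include <cstdio>
#include <vector>

struct ConvComputeParam {  // hypothetical stand-in for the real parameter bundle
  int out_width;
};

// Type-agnostic entry point: in the real delegator the shared setup
// (padding, mapping, clearing the output) would live here, done once.
template <typename T>
class Conv2dK3x3S1 {
 public:
  void Compute(const ConvComputeParam &p, const std::vector<T> &filter,
               const std::vector<T> &input, std::vector<T> *output) {
    DoCompute(p, filter.data(), input.data(), output->data());
  }

 private:
  void DoCompute(const ConvComputeParam &p, const T *filter, const T *input,
                 T *output);
};

// Per-type inner loop, mirroring the "template<> ... DoCompute" bodies this
// commit introduces (NEON intrinsics replaced by plain C++ for portability).
template <>
void Conv2dK3x3S1<float>::DoCompute(const ConvComputeParam &p,
                                    const float *filter, const float *input,
                                    float *output) {
  for (int w = 0; w < p.out_width; ++w) output[w] = filter[0] * input[w];
}

int main() {
  ConvComputeParam p{4};
  std::vector<float> filter{2.0f}, input{1.0f, 2.0f, 3.0f, 4.0f}, output(4, 0.0f);
  Conv2dK3x3S1<float> conv;
  conv.Compute(p, filter, input, &output);
  std::printf("%.1f %.1f %.1f %.1f\n", output[0], output[1], output[2], output[3]);
  return 0;
}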
// Copyright 2019 The MACE Authors. All Rights Reserved. // Copyright 2020 The MACE Authors. All Rights Reserved.
// //
// Licensed under the Apache License, Version 2.0 (the "License"); // Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License. // you may not use this file except in compliance with the License.
...@@ -12,95 +12,47 @@ ...@@ -12,95 +12,47 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "mace/ops/arm/fp32/conv_2d_3x3.h"
#include <arm_neon.h> #include <arm_neon.h>
#include <memory> #include <memory>
#include "mace/ops/arm/base/conv_2d_3x3.h"
#include "mace/ops/delegator/conv_2d.h" #include "mace/ops/delegator/conv_2d.h"
namespace mace { namespace mace {
namespace ops { namespace ops {
namespace arm { namespace arm {
namespace fp32 {
template<>
MaceStatus Conv2dK3x3S1::Compute(const OpContext *context, MaceStatus Conv2dK3x3S1<float>::DoCompute(
const Tensor *input, const ConvComputeParam &p, const float *filter_data,
const Tensor *filter, const float *input_data, float *output_data) {
Tensor *output) { p.thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0,
std::unique_ptr<const Tensor> padded_input;
std::unique_ptr<Tensor> padded_output;
ResizeOutAndPadInOut(context,
input,
filter,
output,
2,
4,
&padded_input,
&padded_output);
const Tensor *in_tensor = input;
if (padded_input != nullptr) {
in_tensor = padded_input.get();
}
Tensor *out_tensor = output;
if (padded_output != nullptr) {
out_tensor = padded_output.get();
}
out_tensor->Clear();
Tensor::MappingGuard in_guard(input);
Tensor::MappingGuard filter_guard(filter);
Tensor::MappingGuard out_guard(output);
auto filter_data = filter->data<float>();
auto input_data = in_tensor->data<float>();
auto output_data = out_tensor->mutable_data<float>();
auto &in_shape = in_tensor->shape();
auto &out_shape = out_tensor->shape();
const index_t batch = in_shape[0];
const index_t in_channels = in_shape[1];
const index_t in_height = in_shape[2];
const index_t in_width = in_shape[3];
const index_t out_channels = out_shape[1];
const index_t out_height = out_shape[2];
const index_t out_width = out_shape[3];
const index_t in_image_size = in_height * in_width;
const index_t out_image_size = out_height * out_width;
const index_t in_batch_size = in_channels * in_image_size;
const index_t out_batch_size = out_channels * out_image_size;
utils::ThreadPool
&thread_pool = context->device()->cpu_runtime()->thread_pool();
thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0,
index_t start1, index_t end1, index_t step1) { index_t start1, index_t end1, index_t step1) {
for (index_t b = start0; b < end0; b += step0) { for (index_t b = start0; b < end0; b += step0) {
for (index_t m = start1; m < end1; m += step1) { for (index_t m = start1; m < end1; m += step1) {
if (m + 1 < out_channels) { if (m + 1 < p.out_channels) {
float *out_ptr0_base = float *out_ptr0_base =
output_data + b * out_batch_size + m * out_image_size; output_data + b * p.out_batch_size + m * p.out_image_size;
float *out_ptr1_base = float *out_ptr1_base =
output_data + b * out_batch_size + (m + 1) * out_image_size; output_data + b * p.out_batch_size + (m + 1) * p.out_image_size;
for (index_t c = 0; c < in_channels; ++c) { for (index_t c = 0; c < p.in_channels; ++c) {
const float const float *in_ptr0 =
*in_ptr0 = input_data + b * in_batch_size + c * in_image_size; input_data + b * p.in_batch_size + c * p.in_image_size;
const float const float
*filter_ptr0 = filter_data + m * in_channels * 9 + c * 9; *filter_ptr0 = filter_data + m * p.in_channels * 9 + c * 9;
float *out_ptr1 = out_ptr1_base; float *out_ptr1 = out_ptr1_base;
const float *in_ptr1 = const float *in_ptr1 =
input_data + b * in_batch_size + c * in_image_size input_data + b * p.in_batch_size + c * p.in_image_size
+ 1 * in_width; + 1 * p.in_width;
const float *in_ptr2 = const float *in_ptr2 =
input_data + b * in_batch_size + c * in_image_size input_data + b * p.in_batch_size + c * p.in_image_size
+ 2 * in_width; + 2 * p.in_width;
const float *in_ptr3 = const float *in_ptr3 =
input_data + b * in_batch_size + c * in_image_size input_data + b * p.in_batch_size + c * p.in_image_size
+ 3 * in_width; + 3 * p.in_width;
const float const float *filter_ptr1 =
*filter_ptr1 = filter_data + (m + 1) * in_channels * 9 + c * 9; filter_data + (m + 1) * p.in_channels * 9 + c * 9;
#if defined(__aarch64__) #if defined(__aarch64__)
float *out_ptr0 = out_ptr0_base; float *out_ptr0 = out_ptr0_base;
...@@ -116,8 +68,8 @@ MaceStatus Conv2dK3x3S1::Compute(const OpContext *context, ...@@ -116,8 +68,8 @@ MaceStatus Conv2dK3x3S1::Compute(const OpContext *context,
vf11 = vld1q_f32(filter_ptr1 + 3); vf11 = vld1q_f32(filter_ptr1 + 3);
vf12 = vld1q_f32(filter_ptr1 + 6); vf12 = vld1q_f32(filter_ptr1 + 6);
for (index_t h = 0; h + 1 < out_height; h += 2) { for (index_t h = 0; h + 1 < p.out_height; h += 2) {
for (index_t w = 0; w + 3 < out_width; w += 4) { for (index_t w = 0; w + 3 < p.out_width; w += 4) {
// input (4 height x 3 slide): vi_height_slide // input (4 height x 3 slide): vi_height_slide
float32x4_t vi00, vi01, vi02; // reg count: 14 float32x4_t vi00, vi01, vi02; // reg count: 14
float32x4_t vi10, vi11, vi12; float32x4_t vi10, vi11, vi12;
...@@ -150,9 +102,9 @@ MaceStatus Conv2dK3x3S1::Compute(const OpContext *context, ...@@ -150,9 +102,9 @@ MaceStatus Conv2dK3x3S1::Compute(const OpContext *context,
// load output // load output
vo00 = vld1q_f32(out_ptr0); vo00 = vld1q_f32(out_ptr0);
vo01 = vld1q_f32(out_ptr0 + out_width); vo01 = vld1q_f32(out_ptr0 + p.out_width);
vo10 = vld1q_f32(out_ptr1); vo10 = vld1q_f32(out_ptr1);
vo11 = vld1q_f32(out_ptr1 + out_width); vo11 = vld1q_f32(out_ptr1 + p.out_width);
// outch 0, height 0 // outch 0, height 0
vo00 = vfmaq_laneq_f32(vo00, vi00, vf00, 0); // reg count: 18 vo00 = vfmaq_laneq_f32(vo00, vi00, vf00, 0); // reg count: 18
...@@ -199,9 +151,9 @@ MaceStatus Conv2dK3x3S1::Compute(const OpContext *context, ...@@ -199,9 +151,9 @@ MaceStatus Conv2dK3x3S1::Compute(const OpContext *context,
vo11 = vfmaq_laneq_f32(vo11, vi32, vf12, 2); vo11 = vfmaq_laneq_f32(vo11, vi32, vf12, 2);
vst1q_f32(out_ptr0, vo00); vst1q_f32(out_ptr0, vo00);
vst1q_f32(out_ptr0 + out_width, vo01); vst1q_f32(out_ptr0 + p.out_width, vo01);
vst1q_f32(out_ptr1, vo10); vst1q_f32(out_ptr1, vo10);
vst1q_f32(out_ptr1 + out_width, vo11); vst1q_f32(out_ptr1 + p.out_width, vo11);
in_ptr0 += 4; in_ptr0 += 4;
in_ptr1 += 4; in_ptr1 += 4;
...@@ -212,13 +164,13 @@ MaceStatus Conv2dK3x3S1::Compute(const OpContext *context, ...@@ -212,13 +164,13 @@ MaceStatus Conv2dK3x3S1::Compute(const OpContext *context,
out_ptr1 += 4; out_ptr1 += 4;
} // w } // w
in_ptr0 += 2 + in_width; in_ptr0 += 2 + p.in_width;
in_ptr1 += 2 + in_width; in_ptr1 += 2 + p.in_width;
in_ptr2 += 2 + in_width; in_ptr2 += 2 + p.in_width;
in_ptr3 += 2 + in_width; in_ptr3 += 2 + p.in_width;
out_ptr0 += out_width; out_ptr0 += p.out_width;
out_ptr1 += out_width; out_ptr1 += p.out_width;
} // h } // h
#else // arm v7 #else // arm v7
float *out_ptr0 = out_ptr0_base; float *out_ptr0 = out_ptr0_base;
...@@ -238,8 +190,8 @@ MaceStatus Conv2dK3x3S1::Compute(const OpContext *context, ...@@ -238,8 +190,8 @@ MaceStatus Conv2dK3x3S1::Compute(const OpContext *context,
vf167 = vld1_f32(filter_ptr1 + 6); vf167 = vld1_f32(filter_ptr1 + 6);
vf189 = vld1_f32(filter_ptr1 + 8); vf189 = vld1_f32(filter_ptr1 + 8);
for (index_t h = 0; h + 1 < out_height; h += 2) { for (index_t h = 0; h + 1 < p.out_height; h += 2) {
for (index_t w = 0; w + 3 < out_width; w += 4) { for (index_t w = 0; w + 3 < p.out_width; w += 4) {
// input (4 height x 3 slide): vi_height_slide // input (4 height x 3 slide): vi_height_slide
float32x4_t vi00, vi01, vi02; // reg count: 14 float32x4_t vi00, vi01, vi02; // reg count: 14
float32x4_t vi10, vi11, vi12; float32x4_t vi10, vi11, vi12;
...@@ -272,9 +224,9 @@ MaceStatus Conv2dK3x3S1::Compute(const OpContext *context, ...@@ -272,9 +224,9 @@ MaceStatus Conv2dK3x3S1::Compute(const OpContext *context,
// load output // load output
vo00 = vld1q_f32(out_ptr0); vo00 = vld1q_f32(out_ptr0);
vo01 = vld1q_f32(out_ptr0 + out_width); vo01 = vld1q_f32(out_ptr0 + p.out_width);
vo10 = vld1q_f32(out_ptr1); vo10 = vld1q_f32(out_ptr1);
vo11 = vld1q_f32(out_ptr1 + out_width); vo11 = vld1q_f32(out_ptr1 + p.out_width);
// outch 0, height 0 // outch 0, height 0
vo00 = vmlaq_lane_f32(vo00, vi00, vf001, 0); vo00 = vmlaq_lane_f32(vo00, vi00, vf001, 0);
...@@ -321,9 +273,9 @@ MaceStatus Conv2dK3x3S1::Compute(const OpContext *context, ...@@ -321,9 +273,9 @@ MaceStatus Conv2dK3x3S1::Compute(const OpContext *context,
vo11 = vmlaq_lane_f32(vo11, vi32, vf189, 0); vo11 = vmlaq_lane_f32(vo11, vi32, vf189, 0);
vst1q_f32(out_ptr0, vo00); vst1q_f32(out_ptr0, vo00);
vst1q_f32(out_ptr0 + out_width, vo01); vst1q_f32(out_ptr0 + p.out_width, vo01);
vst1q_f32(out_ptr1, vo10); vst1q_f32(out_ptr1, vo10);
vst1q_f32(out_ptr1 + out_width, vo11); vst1q_f32(out_ptr1 + p.out_width, vo11);
in_ptr0 += 4; in_ptr0 += 4;
in_ptr1 += 4; in_ptr1 += 4;
...@@ -334,34 +286,34 @@ MaceStatus Conv2dK3x3S1::Compute(const OpContext *context, ...@@ -334,34 +286,34 @@ MaceStatus Conv2dK3x3S1::Compute(const OpContext *context,
out_ptr1 += 4; out_ptr1 += 4;
} // w } // w
in_ptr0 += 2 + in_width; in_ptr0 += 2 + p.in_width;
in_ptr1 += 2 + in_width; in_ptr1 += 2 + p.in_width;
in_ptr2 += 2 + in_width; in_ptr2 += 2 + p.in_width;
in_ptr3 += 2 + in_width; in_ptr3 += 2 + p.in_width;
out_ptr0 += out_width; out_ptr0 += p.out_width;
out_ptr1 += out_width; out_ptr1 += p.out_width;
} // h } // h
#endif #endif
} // c } // c
} else { } else {
for (index_t mm = m; mm < out_channels; ++mm) { for (index_t mm = m; mm < p.out_channels; ++mm) {
float *out_ptr0_base = float *out_ptr0_base =
output_data + b * out_batch_size + mm * out_image_size; output_data + b * p.out_batch_size + mm * p.out_image_size;
for (index_t c = 0; c < in_channels; ++c) { for (index_t c = 0; c < p.in_channels; ++c) {
const float *in_ptr0 = const float *in_ptr0 =
input_data + b * in_batch_size + c * in_image_size; input_data + b * p.in_batch_size + c * p.in_image_size;
const float *in_ptr1 = const float *in_ptr1 =
input_data + b * in_batch_size + c * in_image_size input_data + b * p.in_batch_size + c * p.in_image_size
+ 1 * in_width; + 1 * p.in_width;
const float *in_ptr2 = const float *in_ptr2 =
input_data + b * in_batch_size + c * in_image_size input_data + b * p.in_batch_size + c * p.in_image_size
+ 2 * in_width; + 2 * p.in_width;
const float *in_ptr3 = const float *in_ptr3 =
input_data + b * in_batch_size + c * in_image_size input_data + b * p.in_batch_size + c * p.in_image_size
+ 3 * in_width; + 3 * p.in_width;
const float const float
*filter_ptr0 = filter_data + mm * in_channels * 9 + c * 9; *filter_ptr0 = filter_data + mm * p.in_channels * 9 + c * 9;
#if defined(__aarch64__) #if defined(__aarch64__)
float *out_ptr0 = out_ptr0_base; float *out_ptr0 = out_ptr0_base;
...@@ -372,8 +324,8 @@ MaceStatus Conv2dK3x3S1::Compute(const OpContext *context, ...@@ -372,8 +324,8 @@ MaceStatus Conv2dK3x3S1::Compute(const OpContext *context,
vf01 = vld1q_f32(filter_ptr0 + 3); vf01 = vld1q_f32(filter_ptr0 + 3);
vf02 = vld1q_f32(filter_ptr0 + 5); vf02 = vld1q_f32(filter_ptr0 + 5);
for (index_t h = 0; h + 1 < out_height; h += 2) { for (index_t h = 0; h + 1 < p.out_height; h += 2) {
for (index_t w = 0; w + 3 < out_width; w += 4) { for (index_t w = 0; w + 3 < p.out_width; w += 4) {
// input (4 height x 3 slide): vi_height_slide // input (4 height x 3 slide): vi_height_slide
float32x4_t vi00, vi01, vi02, vi0n; float32x4_t vi00, vi01, vi02, vi0n;
float32x4_t vi10, vi11, vi12, vi1n; float32x4_t vi10, vi11, vi12, vi1n;
...@@ -404,7 +356,7 @@ MaceStatus Conv2dK3x3S1::Compute(const OpContext *context, ...@@ -404,7 +356,7 @@ MaceStatus Conv2dK3x3S1::Compute(const OpContext *context,
// load output // load output
vo00 = vld1q_f32(out_ptr0); vo00 = vld1q_f32(out_ptr0);
vo01 = vld1q_f32(out_ptr0 + out_width); vo01 = vld1q_f32(out_ptr0 + p.out_width);
// outch 0, height 0 // outch 0, height 0
vo00 = vfmaq_laneq_f32(vo00, vi00, vf00, 0); vo00 = vfmaq_laneq_f32(vo00, vi00, vf00, 0);
...@@ -429,7 +381,7 @@ MaceStatus Conv2dK3x3S1::Compute(const OpContext *context, ...@@ -429,7 +381,7 @@ MaceStatus Conv2dK3x3S1::Compute(const OpContext *context,
vo01 = vfmaq_laneq_f32(vo01, vi32, vf02, 3); vo01 = vfmaq_laneq_f32(vo01, vi32, vf02, 3);
vst1q_f32(out_ptr0, vo00); vst1q_f32(out_ptr0, vo00);
vst1q_f32(out_ptr0 + out_width, vo01); vst1q_f32(out_ptr0 + p.out_width, vo01);
in_ptr0 += 4; in_ptr0 += 4;
in_ptr1 += 4; in_ptr1 += 4;
...@@ -439,12 +391,12 @@ MaceStatus Conv2dK3x3S1::Compute(const OpContext *context, ...@@ -439,12 +391,12 @@ MaceStatus Conv2dK3x3S1::Compute(const OpContext *context,
out_ptr0 += 4; out_ptr0 += 4;
} // w } // w
in_ptr0 += 2 + in_width; in_ptr0 += 2 + p.in_width;
in_ptr1 += 2 + in_width; in_ptr1 += 2 + p.in_width;
in_ptr2 += 2 + in_width; in_ptr2 += 2 + p.in_width;
in_ptr3 += 2 + in_width; in_ptr3 += 2 + p.in_width;
out_ptr0 += out_width; out_ptr0 += p.out_width;
} // h } // h
#else // arm v7 #else // arm v7
float *out_ptr0 = out_ptr0_base; float *out_ptr0 = out_ptr0_base;
...@@ -457,8 +409,8 @@ MaceStatus Conv2dK3x3S1::Compute(const OpContext *context, ...@@ -457,8 +409,8 @@ MaceStatus Conv2dK3x3S1::Compute(const OpContext *context,
vf67 = vld1_f32(filter_ptr0 + 6); vf67 = vld1_f32(filter_ptr0 + 6);
vf78 = vld1_f32(filter_ptr0 + 7); vf78 = vld1_f32(filter_ptr0 + 7);
for (index_t h = 0; h + 1 < out_height; h += 2) { for (index_t h = 0; h + 1 < p.out_height; h += 2) {
for (index_t w = 0; w + 3 < out_width; w += 4) { for (index_t w = 0; w + 3 < p.out_width; w += 4) {
// input (4 height x 3 slide): vi_height_slide // input (4 height x 3 slide): vi_height_slide
float32x4_t vi00, vi01, vi02, vi0n; float32x4_t vi00, vi01, vi02, vi0n;
float32x4_t vi10, vi11, vi12, vi1n; float32x4_t vi10, vi11, vi12, vi1n;
...@@ -489,7 +441,7 @@ MaceStatus Conv2dK3x3S1::Compute(const OpContext *context, ...@@ -489,7 +441,7 @@ MaceStatus Conv2dK3x3S1::Compute(const OpContext *context,
// load output // load output
vo00 = vld1q_f32(out_ptr0); vo00 = vld1q_f32(out_ptr0);
vo01 = vld1q_f32(out_ptr0 + out_width); vo01 = vld1q_f32(out_ptr0 + p.out_width);
// outch 0, height 0 // outch 0, height 0
vo00 = vmlaq_lane_f32(vo00, vi00, vf01, 0); vo00 = vmlaq_lane_f32(vo00, vi00, vf01, 0);
...@@ -514,7 +466,7 @@ MaceStatus Conv2dK3x3S1::Compute(const OpContext *context, ...@@ -514,7 +466,7 @@ MaceStatus Conv2dK3x3S1::Compute(const OpContext *context,
vo01 = vmlaq_lane_f32(vo01, vi32, vf78, 1); vo01 = vmlaq_lane_f32(vo01, vi32, vf78, 1);
vst1q_f32(out_ptr0, vo00); vst1q_f32(out_ptr0, vo00);
vst1q_f32(out_ptr0 + out_width, vo01); vst1q_f32(out_ptr0 + p.out_width, vo01);
in_ptr0 += 4; in_ptr0 += 4;
in_ptr1 += 4; in_ptr1 += 4;
...@@ -524,12 +476,12 @@ MaceStatus Conv2dK3x3S1::Compute(const OpContext *context, ...@@ -524,12 +476,12 @@ MaceStatus Conv2dK3x3S1::Compute(const OpContext *context,
out_ptr0 += 4; out_ptr0 += 4;
} // w } // w
in_ptr0 += 2 + in_width; in_ptr0 += 2 + p.in_width;
in_ptr1 += 2 + in_width; in_ptr1 += 2 + p.in_width;
in_ptr2 += 2 + in_width; in_ptr2 += 2 + p.in_width;
in_ptr3 += 2 + in_width; in_ptr3 += 2 + p.in_width;
out_ptr0 += out_width; out_ptr0 += p.out_width;
} // h } // h
#endif #endif
} // c } // c
...@@ -537,73 +489,25 @@ MaceStatus Conv2dK3x3S1::Compute(const OpContext *context, ...@@ -537,73 +489,25 @@ MaceStatus Conv2dK3x3S1::Compute(const OpContext *context,
} // if } // if
} // m } // m
} // b } // b
}, 0, batch, 1, 0, out_channels, 2); }, 0, p.batch, 1, 0, p.out_channels, 2);
UnPadOutput(*out_tensor, output);
return MaceStatus::MACE_SUCCESS; return MaceStatus::MACE_SUCCESS;
} }
MaceStatus Conv2dK3x3S2::Compute(const OpContext *context, template<>
const Tensor *input, MaceStatus Conv2dK3x3S2<float>::DoCompute(
const Tensor *filter, const ConvComputeParam &p, const float *filter_data,
Tensor *output) { const float *input_data, float *output_data) {
std::unique_ptr<const Tensor> padded_input; p.thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0,
std::unique_ptr<Tensor> padded_output;
ResizeOutAndPadInOut(context,
input,
filter,
output,
1,
4,
&padded_input,
&padded_output);
const Tensor *in_tensor = input;
if (padded_input != nullptr) {
in_tensor = padded_input.get();
}
Tensor *out_tensor = output;
if (padded_output != nullptr) {
out_tensor = padded_output.get();
}
out_tensor->Clear();
Tensor::MappingGuard in_guard(input);
Tensor::MappingGuard filter_guard(filter);
Tensor::MappingGuard out_guard(output);
auto filter_data = filter->data<float>();
auto input_data = in_tensor->data<float>();
auto output_data = out_tensor->mutable_data<float>();
auto &in_shape = in_tensor->shape();
auto &out_shape = out_tensor->shape();
const index_t batch = in_shape[0];
const index_t in_channels = in_shape[1];
const index_t in_height = in_shape[2];
const index_t in_width = in_shape[3];
const index_t out_channels = out_shape[1];
const index_t out_height = out_shape[2];
const index_t out_width = out_shape[3];
const index_t in_image_size = in_height * in_width;
const index_t out_image_size = out_height * out_width;
const index_t in_batch_size = in_channels * in_image_size;
const index_t out_batch_size = out_channels * out_image_size;
utils::ThreadPool
&thread_pool = context->device()->cpu_runtime()->thread_pool();
thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0,
index_t start1, index_t end1, index_t step1) { index_t start1, index_t end1, index_t step1) {
for (index_t b = start0; b < end0; b += step0) { for (index_t b = start0; b < end0; b += step0) {
for (index_t m = start1; m < end1; m += step1) { for (index_t m = start1; m < end1; m += step1) {
for (index_t c = 0; c < in_channels; ++c) { for (index_t c = 0; c < p.in_channels; ++c) {
const float const float
*in_base = input_data + b * in_batch_size + c * in_image_size; *in_base = input_data + b * p.in_batch_size + c * p.in_image_size;
const float *filter_ptr = filter_data + m * in_channels * 9 + c * 9; const float *filter_ptr = filter_data + m * p.in_channels * 9 + c * 9;
float float *out_base =
*out_base = output_data + b * out_batch_size + m * out_image_size; output_data + b * p.out_batch_size + m * p.out_image_size;
#if defined(__aarch64__) #if defined(__aarch64__)
// load filter (1 outch x 3 height x 3 width): vf_outch_height // load filter (1 outch x 3 height x 3 width): vf_outch_height
...@@ -612,8 +516,8 @@ MaceStatus Conv2dK3x3S2::Compute(const OpContext *context, ...@@ -612,8 +516,8 @@ MaceStatus Conv2dK3x3S2::Compute(const OpContext *context,
vf01 = vld1q_f32(filter_ptr + 3); vf01 = vld1q_f32(filter_ptr + 3);
vf02 = vld1q_f32(filter_ptr + 5); vf02 = vld1q_f32(filter_ptr + 5);
for (index_t h = 0; h < out_height; ++h) { for (index_t h = 0; h < p.out_height; ++h) {
for (index_t w = 0; w + 3 < out_width; w += 4) { for (index_t w = 0; w + 3 < p.out_width; w += 4) {
float32x4x2_t vi0, vi1, vi2; float32x4x2_t vi0, vi1, vi2;
float32x4_t vi0n, vi1n, vi2n; float32x4_t vi0n, vi1n, vi2n;
...@@ -628,17 +532,17 @@ MaceStatus Conv2dK3x3S2::Compute(const OpContext *context, ...@@ -628,17 +532,17 @@ MaceStatus Conv2dK3x3S2::Compute(const OpContext *context,
// load input // load input
index_t in_h = h * 2; index_t in_h = h * 2;
index_t in_w = w * 2; index_t in_w = w * 2;
index_t in_offset = in_h * in_width + in_w; index_t in_offset = in_h * p.in_width + in_w;
vi0 = vld2q_f32(in_base + in_offset); // [0.2.4.6, 1.3.5.7] vi0 = vld2q_f32(in_base + in_offset); // [0.2.4.6, 1.3.5.7]
vi1 = vld2q_f32(in_base + in_offset + in_width); vi1 = vld2q_f32(in_base + in_offset + p.in_width);
vi2 = vld2q_f32(in_base + in_offset + 2 * in_width); vi2 = vld2q_f32(in_base + in_offset + 2 * p.in_width);
vi0n = vld1q_f32(in_base + in_offset + 8); // [8.9.10.11] vi0n = vld1q_f32(in_base + in_offset + 8); // [8.9.10.11]
vi1n = vld1q_f32(in_base + in_offset + in_width + 8); vi1n = vld1q_f32(in_base + in_offset + p.in_width + 8);
vi2n = vld1q_f32(in_base + in_offset + 2 * in_width + 8); vi2n = vld1q_f32(in_base + in_offset + 2 * p.in_width + 8);
// load output // load output
index_t out_offset = h * out_width + w; index_t out_offset = h * p.out_width + w;
vo = vld1q_f32(out_base + out_offset); vo = vld1q_f32(out_base + out_offset);
vi00 = vi0.val[0]; // [0.2.4.6] vi00 = vi0.val[0]; // [0.2.4.6]
...@@ -674,8 +578,8 @@ MaceStatus Conv2dK3x3S2::Compute(const OpContext *context, ...@@ -674,8 +578,8 @@ MaceStatus Conv2dK3x3S2::Compute(const OpContext *context,
vf67 = vld1_f32(filter_ptr + 6); vf67 = vld1_f32(filter_ptr + 6);
vf78 = vld1_f32(filter_ptr + 7); vf78 = vld1_f32(filter_ptr + 7);
for (index_t h = 0; h < out_height; ++h) { for (index_t h = 0; h < p.out_height; ++h) {
for (index_t w = 0; w + 3 < out_width; w += 4) { for (index_t w = 0; w + 3 < p.out_width; w += 4) {
float32x4x2_t vi0, vi1, vi2; float32x4x2_t vi0, vi1, vi2;
float32x4_t vi0n, vi1n, vi2n; float32x4_t vi0n, vi1n, vi2n;
...@@ -690,17 +594,17 @@ MaceStatus Conv2dK3x3S2::Compute(const OpContext *context, ...@@ -690,17 +594,17 @@ MaceStatus Conv2dK3x3S2::Compute(const OpContext *context,
// load input // load input
index_t in_h = h * 2; index_t in_h = h * 2;
index_t in_w = w * 2; index_t in_w = w * 2;
index_t in_offset = in_h * in_width + in_w; index_t in_offset = in_h * p.in_width + in_w;
vi0 = vld2q_f32(in_base + in_offset); // [0.2.4.6, 1.3.5.7] vi0 = vld2q_f32(in_base + in_offset); // [0.2.4.6, 1.3.5.7]
vi1 = vld2q_f32(in_base + in_offset + in_width); vi1 = vld2q_f32(in_base + in_offset + p.in_width);
vi2 = vld2q_f32(in_base + in_offset + 2 * in_width); vi2 = vld2q_f32(in_base + in_offset + 2 * p.in_width);
vi0n = vld1q_f32(in_base + in_offset + 8); // [8.9.10.11] vi0n = vld1q_f32(in_base + in_offset + 8); // [8.9.10.11]
vi1n = vld1q_f32(in_base + in_offset + in_width + 8); vi1n = vld1q_f32(in_base + in_offset + p.in_width + 8);
vi2n = vld1q_f32(in_base + in_offset + 2 * in_width + 8); vi2n = vld1q_f32(in_base + in_offset + 2 * p.in_width + 8);
// load output // load output
index_t out_offset = h * out_width + w; index_t out_offset = h * p.out_width + w;
vo = vld1q_f32(out_base + out_offset); vo = vld1q_f32(out_base + out_offset);
vi00 = vi0.val[0]; // [0.2.4.6] vi00 = vi0.val[0]; // [0.2.4.6]
...@@ -731,24 +635,11 @@ MaceStatus Conv2dK3x3S2::Compute(const OpContext *context, ...@@ -731,24 +635,11 @@ MaceStatus Conv2dK3x3S2::Compute(const OpContext *context,
} // c } // c
} // m } // m
} // b } // b
}, 0, batch, 1, 0, out_channels, 1); }, 0, p.batch, 1, 0, p.out_channels, 1);
UnPadOutput(*out_tensor, output);
return MaceStatus::MACE_SUCCESS; return MaceStatus::MACE_SUCCESS;
} }
void RegisterConv2dK3x3Delegator(OpDelegatorRegistry *registry) {
MACE_REGISTER_DELEGATOR(
registry, Conv2dK3x3S1, delegator::Conv2dParam,
MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU,
float, ImplType::NEON, K3x3S1));
MACE_REGISTER_DELEGATOR(
registry, Conv2dK3x3S2, delegator::Conv2dParam,
MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU,
float, ImplType::NEON, K3x3S2));
}
} // namespace fp32
} // namespace arm } // namespace arm
} // namespace ops } // namespace ops
} // namespace mace } // namespace mace
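For the stride-2 path above, vld2q_f32 de-interleaves a row as it loads: val[0] receives elements 0,2,4,6 and val[1] receives 1,3,5,7, exactly as the inline comments note, so four stride-2 output positions come from two registers. A standalone illustration of that load (ARM-only, with a scalar fallback; an illustration, not MACE code):

#include <cstdio>
#if defined(__ARM_NEON) || defined(__ARM_NEON__)
#include <arm_neon.h>
#endif

int main() {
  float row[12];
  for (int i = 0; i < 12; ++i) row[i] = static_cast<float>(i);
#if defined(__ARM_NEON) || defined(__ARM_NEON__)
  // val[0] holds the stride-2 taps {0,2,4,6}, val[1] holds {1,3,5,7}.
  float32x4x2_t v = vld2q_f32(row);
  float even[4], odd[4];
  vst1q_f32(even, v.val[0]);
  vst1q_f32(odd, v.val[1]);
  std::printf("even: %.0f %.0f %.0f %.0f\n", even[0], even[1], even[2], even[3]);
  std::printf("odd:  %.0f %.0f %.0f %.0f\n", odd[0], odd[1], odd[2], odd[3]);
#else
  // Scalar equivalent of the de-interleave, for non-ARM builds.
  std::printf("even: %.0f %.0f %.0f %.0f\n", row[0], row[2], row[4], row[6]);
  std::printf("odd:  %.0f %.0f %.0f %.0f\n", row[1], row[3], row[5], row[7]);
#endif
  return 0;
}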
...@@ -18,8 +18,8 @@ ...@@ -18,8 +18,8 @@
#include "mace/ops/common/conv_pool_2d_util.h" #include "mace/ops/common/conv_pool_2d_util.h"
#include "mace/ops/delegator/conv_2d.h" #include "mace/ops/delegator/conv_2d.h"
#include "mace/utils/memory.h"
#include "mace/utils/math.h" #include "mace/utils/math.h"
#include "mace/utils/memory.h"
namespace mace { namespace mace {
namespace ops { namespace ops {
......
...@@ -20,8 +20,8 @@ ...@@ -20,8 +20,8 @@
#include "mace/core/ops/op_context.h" #include "mace/core/ops/op_context.h"
#include "mace/core/tensor.h" #include "mace/core/tensor.h"
#include "mace/ops/arm/fp32/conv_2d.h" #include "mace/ops/arm/base/conv_2d.h"
#include "mace/ops/arm/fp32/gemm.h" #include "mace/ops/arm/base/gemm.h"
#include "mace/public/mace.h" #include "mace/public/mace.h"
namespace mace { namespace mace {
...@@ -32,7 +32,7 @@ namespace fp32 { ...@@ -32,7 +32,7 @@ namespace fp32 {
class Conv2dK3x3Winograd : public Conv2dBase { class Conv2dK3x3Winograd : public Conv2dBase {
public: public:
explicit Conv2dK3x3Winograd(const delegator::Conv2dParam &param) explicit Conv2dK3x3Winograd(const delegator::Conv2dParam &param)
: Conv2dBase(param), : Conv2dBase(param, sizeof(float)),
gemm_(delegator::GemmParam()), gemm_(delegator::GemmParam()),
transformed_filter_(nullptr), transformed_filter_(nullptr),
out_tile_size_(0) {} out_tile_size_(0) {}
...@@ -94,7 +94,7 @@ class Conv2dK3x3Winograd : public Conv2dBase { ...@@ -94,7 +94,7 @@ class Conv2dK3x3Winograd : public Conv2dBase {
index_t tile_count, index_t tile_count,
float *output); float *output);
Gemm gemm_; Gemm<float> gemm_;
std::unique_ptr<Tensor> transformed_filter_; std::unique_ptr<Tensor> transformed_filter_;
index_t out_tile_size_; index_t out_tile_size_;
}; };
......
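A plausible reading of the Conv2dBase(param, sizeof(float)) change above is that the now type-agnostic base only needs the element size to size padding and scratch buffers, without knowing T. The sketch below captures just that idea and is an assumption about intent, not MACE's Conv2dBase:

#include <cstddef>
#include <cstdio>
#include <vector>

// Hypothetical type-erased base: the element size alone is enough to work out
// how many bytes a padded buffer requires, whatever T the kernel uses.
class Conv2dBase {
 public:
  explicit Conv2dBase(std::size_t type_size) : type_size_(type_size) {}

  std::size_t PaddedBufferBytes(std::size_t width, std::size_t height,
                                std::size_t pad) const {
    return (width + 2 * pad) * (height + 2 * pad) * type_size_;
  }

 private:
  std::size_t type_size_;
};

int main() {
  Conv2dBase base(sizeof(float));
  // 32x32 image with a 1-element border -> 34 * 34 * 4 bytes of scratch.
  std::vector<unsigned char> scratch(base.PaddedBufferBytes(32, 32, 1));
  std::printf("%zu\n", scratch.size());  // 4624
  return 0;
}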
...@@ -15,26 +15,12 @@ ...@@ -15,26 +15,12 @@
#include <arm_neon.h> #include <arm_neon.h>
#include <memory> #include <memory>
#include "mace/ops/arm/fp32/conv_2d.h" #include "mace/ops/arm/base/conv_2d_5x5.h"
#include "mace/ops/delegator/conv_2d.h" #include "mace/ops/delegator/conv_2d.h"
namespace mace { namespace mace {
namespace ops { namespace ops {
namespace arm { namespace arm {
namespace fp32 {
class Conv2dK5x5S1 : public Conv2dBase {
public:
explicit Conv2dK5x5S1(const delegator::Conv2dParam &param)
: Conv2dBase(param) {}
virtual ~Conv2dK5x5S1() {}
MaceStatus Compute(
const OpContext *context,
const Tensor *input,
const Tensor *filter,
Tensor *output) override;
};
#define MACE_Conv2dNeonK5x5SnLoadCalc4 \ #define MACE_Conv2dNeonK5x5SnLoadCalc4 \
/* load filter (4 outch x 1 height x 4 width) */ \ /* load filter (4 outch x 1 height x 4 width) */ \
...@@ -91,89 +77,43 @@ class Conv2dK5x5S1 : public Conv2dBase { ...@@ -91,89 +77,43 @@ class Conv2dK5x5S1 : public Conv2dBase {
vo0 = vmlaq_lane_f32(vo0, vi3, vget_high_f32(vf00), 1); \ vo0 = vmlaq_lane_f32(vo0, vi3, vget_high_f32(vf00), 1); \
vo0 = vmlaq_lane_f32(vo0, vi4, vf01, 1); vo0 = vmlaq_lane_f32(vo0, vi4, vf01, 1);
MaceStatus Conv2dK5x5S1::Compute(const OpContext *context, template<>
const Tensor *input, MaceStatus Conv2dK5x5S1<float>::DoCompute(
const Tensor *filter, const ConvComputeParam &p, const float *filter_data,
Tensor *output) { const float *input_data, float *output_data) {
std::unique_ptr<const Tensor> padded_input; p.thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0,
std::unique_ptr<Tensor> padded_output;
ResizeOutAndPadInOut(context,
input,
filter,
output,
1,
4,
&padded_input,
&padded_output);
const Tensor *in_tensor = input;
if (padded_input != nullptr) {
in_tensor = padded_input.get();
}
Tensor *out_tensor = output;
if (padded_output != nullptr) {
out_tensor = padded_output.get();
}
out_tensor->Clear();
Tensor::MappingGuard in_guard(input);
Tensor::MappingGuard filter_guard(filter);
Tensor::MappingGuard out_guard(output);
auto filter_data = filter->data<float>();
auto input_data = in_tensor->data<float>();
auto output_data = out_tensor->mutable_data<float>();
auto &in_shape = in_tensor->shape();
auto &out_shape = out_tensor->shape();
const index_t batch = in_shape[0];
const index_t in_channels = in_shape[1];
const index_t in_height = in_shape[2];
const index_t in_width = in_shape[3];
const index_t out_channels = out_shape[1];
const index_t out_height = out_shape[2];
const index_t out_width = out_shape[3];
const index_t in_image_size = in_height * in_width;
const index_t out_image_size = out_height * out_width;
const index_t in_batch_size = in_channels * in_image_size;
const index_t out_batch_size = out_channels * out_image_size;
utils::ThreadPool
&thread_pool = context->device()->cpu_runtime()->thread_pool();
thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0,
index_t start1, index_t end1, index_t step1) { index_t start1, index_t end1, index_t step1) {
for (index_t b = start0; b < end0; b += step0) { for (index_t b = start0; b < end0; b += step0) {
for (index_t m = start1; m < end1; m += step1) { for (index_t m = start1; m < end1; m += step1) {
if (m + 3 < out_channels) { if (m + 3 < p.out_channels) {
float *out_ptr0_base = float *out_ptr0_base =
output_data + b * out_batch_size + m * out_image_size; output_data + b * p.out_batch_size + m * p.out_image_size;
float *out_ptr1_base = float *out_ptr1_base =
output_data + b * out_batch_size + (m + 1) * out_image_size; output_data + b * p.out_batch_size + (m + 1) * p.out_image_size;
float *out_ptr2_base = float *out_ptr2_base =
output_data + b * out_batch_size + (m + 2) * out_image_size; output_data + b * p.out_batch_size + (m + 2) * p.out_image_size;
float *out_ptr3_base = float *out_ptr3_base =
output_data + b * out_batch_size + (m + 3) * out_image_size; output_data + b * p.out_batch_size + (m + 3) * p.out_image_size;
for (index_t c = 0; c < in_channels; ++c) { for (index_t c = 0; c < p.in_channels; ++c) {
const float *in_ptr_base = const float *in_ptr_base =
input_data + b * in_batch_size + c * in_image_size; input_data + b * p.in_batch_size + c * p.in_image_size;
const float const float
*filter_ptr0 = filter_data + m * in_channels * 25 + c * 25; *filter_ptr0 = filter_data + m * p.in_channels * 25 + c * 25;
const float *filter_ptr1 = const float *filter_ptr1 =
filter_data + (m + 1) * in_channels * 25 + c * 25; filter_data + (m + 1) * p.in_channels * 25 + c * 25;
const float *filter_ptr2 = const float *filter_ptr2 =
filter_data + (m + 2) * in_channels * 25 + c * 25; filter_data + (m + 2) * p.in_channels * 25 + c * 25;
const float *filter_ptr3 = const float *filter_ptr3 =
filter_data + (m + 3) * in_channels * 25 + c * 25; filter_data + (m + 3) * p.in_channels * 25 + c * 25;
for (index_t h = 0; h < out_height; ++h) { for (index_t h = 0; h < p.out_height; ++h) {
for (index_t w = 0; w + 3 < out_width; w += 4) { for (index_t w = 0; w + 3 < p.out_width; w += 4) {
// input offset // input offset
index_t in_offset = h * in_width + w; index_t in_offset = h * p.in_width + w;
// output (4 outch x 1 height x 4 width): vo_outch_height // output (4 outch x 1 height x 4 width): vo_outch_height
float32x4_t vo0, vo1, vo2, vo3; float32x4_t vo0, vo1, vo2, vo3;
// load output // load output
index_t out_offset = h * out_width + w; index_t out_offset = h * p.out_width + w;
vo0 = vld1q_f32(out_ptr0_base + out_offset); vo0 = vld1q_f32(out_ptr0_base + out_offset);
vo1 = vld1q_f32(out_ptr1_base + out_offset); vo1 = vld1q_f32(out_ptr1_base + out_offset);
vo2 = vld1q_f32(out_ptr2_base + out_offset); vo2 = vld1q_f32(out_ptr2_base + out_offset);
...@@ -190,7 +130,7 @@ MaceStatus Conv2dK5x5S1::Compute(const OpContext *context, ...@@ -190,7 +130,7 @@ MaceStatus Conv2dK5x5S1::Compute(const OpContext *context,
MACE_Conv2dNeonK5x5SnLoadCalc4; MACE_Conv2dNeonK5x5SnLoadCalc4;
in_offset += in_width; in_offset += p.in_width;
filter_ptr0 += 5; filter_ptr0 += 5;
filter_ptr1 += 5; filter_ptr1 += 5;
filter_ptr2 += 5; filter_ptr2 += 5;
...@@ -210,22 +150,22 @@ MaceStatus Conv2dK5x5S1::Compute(const OpContext *context, ...@@ -210,22 +150,22 @@ MaceStatus Conv2dK5x5S1::Compute(const OpContext *context,
} // h } // h
} // c } // c
} else { } else {
for (index_t mm = m; mm < out_channels; ++mm) { for (index_t mm = m; mm < p.out_channels; ++mm) {
float *out_ptr0_base = float *out_ptr0_base =
output_data + b * out_batch_size + mm * out_image_size; output_data + b * p.out_batch_size + mm * p.out_image_size;
for (index_t c = 0; c < in_channels; ++c) { for (index_t c = 0; c < p.in_channels; ++c) {
const float *in_ptr_base = const float *in_ptr_base =
input_data + b * in_batch_size + c * in_image_size; input_data + b * p.in_batch_size + c * p.in_image_size;
const float const float
*filter_ptr0 = filter_data + mm * in_channels * 25 + c * 25; *filter_ptr0 = filter_data + mm * p.in_channels * 25 + c * 25;
for (index_t h = 0; h < out_height; ++h) { for (index_t h = 0; h < p.out_height; ++h) {
for (index_t w = 0; w + 3 < out_width; w += 4) { for (index_t w = 0; w + 3 < p.out_width; w += 4) {
// input offset // input offset
index_t in_offset = h * in_width + w; index_t in_offset = h * p.in_width + w;
// output (1 outch x 1 height x 4 width): vo_outch_height // output (1 outch x 1 height x 4 width): vo_outch_height
float32x4_t vo0; float32x4_t vo0;
// load output // load output
index_t out_offset = h * out_width + w; index_t out_offset = h * p.out_width + w;
vo0 = vld1q_f32(out_ptr0_base + out_offset); vo0 = vld1q_f32(out_ptr0_base + out_offset);
for (index_t r = 0; r < 5; ++r) { for (index_t r = 0; r < 5; ++r) {
// input (3 slide) // input (3 slide)
...@@ -239,7 +179,7 @@ MaceStatus Conv2dK5x5S1::Compute(const OpContext *context, ...@@ -239,7 +179,7 @@ MaceStatus Conv2dK5x5S1::Compute(const OpContext *context,
MACE_Conv2dNeonK5x5SnLoadCalc1; MACE_Conv2dNeonK5x5SnLoadCalc1;
in_offset += in_width; in_offset += p.in_width;
filter_ptr0 += 5; filter_ptr0 += 5;
} // r } // r
...@@ -252,20 +192,11 @@ MaceStatus Conv2dK5x5S1::Compute(const OpContext *context, ...@@ -252,20 +192,11 @@ MaceStatus Conv2dK5x5S1::Compute(const OpContext *context,
} // if } // if
} // m } // m
} // b } // b
}, 0, batch, 1, 0, out_channels, 4); }, 0, p.batch, 1, 0, p.out_channels, 4);
UnPadOutput(*out_tensor, output);
return MaceStatus::MACE_SUCCESS; return MaceStatus::MACE_SUCCESS;
} }
void RegisterConv2dK5x5Delegator(OpDelegatorRegistry *registry) {
MACE_REGISTER_DELEGATOR(
registry, Conv2dK5x5S1, delegator::Conv2dParam,
MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU,
float, ImplType::NEON, K5x5S1));
}
} // namespace fp32
} // namespace arm } // namespace arm
} // namespace ops } // namespace ops
} // namespace mace } // namespace mace
...@@ -12,17 +12,15 @@ ...@@ -12,17 +12,15 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "mace/ops/arm/fp32/conv_2d_7x7.h"
#include <arm_neon.h> #include <arm_neon.h>
#include <memory> #include <memory>
#include "mace/ops/arm/base/conv_2d_7x7.h"
#include "mace/ops/delegator/conv_2d.h" #include "mace/ops/delegator/conv_2d.h"
namespace mace { namespace mace {
namespace ops { namespace ops {
namespace arm { namespace arm {
namespace fp32 {
#define MACE_Conv2dArmv8NeonK7x7SnLoadCalc4 \ #define MACE_Conv2dArmv8NeonK7x7SnLoadCalc4 \
/* load filter (4 outch x 1 height x 4 width) */ \ /* load filter (4 outch x 1 height x 4 width) */ \
...@@ -156,88 +154,43 @@ namespace fp32 { ...@@ -156,88 +154,43 @@ namespace fp32 {
vo0 = vmlaq_lane_f32(vo0, vi5, vget_high_f32(vf01), 0); \ vo0 = vmlaq_lane_f32(vo0, vi5, vget_high_f32(vf01), 0); \
vo0 = vmlaq_lane_f32(vo0, vi6, vget_high_f32(vf01), 1); vo0 = vmlaq_lane_f32(vo0, vi6, vget_high_f32(vf01), 1);
MaceStatus Conv2dK7x7S1::Compute(const OpContext *context, template<>
const Tensor *input, MaceStatus Conv2dK7x7S1<float>::DoCompute(
const Tensor *filter, const ConvComputeParam &p, const float *filter_data,
Tensor *output) { const float *input_data, float *output_data) {
std::unique_ptr<const Tensor> padded_input;
std::unique_ptr<Tensor> padded_output; p.thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0,
ResizeOutAndPadInOut(context,
input,
filter,
output,
1,
4,
&padded_input,
&padded_output);
const Tensor *in_tensor = input;
if (padded_input != nullptr) {
in_tensor = padded_input.get();
}
Tensor *out_tensor = output;
if (padded_output != nullptr) {
out_tensor = padded_output.get();
}
out_tensor->Clear();
Tensor::MappingGuard in_guard(input);
Tensor::MappingGuard filter_guard(filter);
Tensor::MappingGuard out_guard(output);
auto filter_data = filter->data<float>();
auto input_data = in_tensor->data<float>();
auto output_data = out_tensor->mutable_data<float>();
auto &in_shape = in_tensor->shape();
auto &out_shape = out_tensor->shape();
const index_t batch = in_shape[0];
const index_t in_channels = in_shape[1];
const index_t in_height = in_shape[2];
const index_t in_width = in_shape[3];
const index_t out_channels = out_shape[1];
const index_t out_height = out_shape[2];
const index_t out_width = out_shape[3];
const index_t in_image_size = in_height * in_width;
const index_t out_image_size = out_height * out_width;
const index_t in_batch_size = in_channels * in_image_size;
const index_t out_batch_size = out_channels * out_image_size;
utils::ThreadPool
&thread_pool = context->device()->cpu_runtime()->thread_pool();
thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0,
index_t start1, index_t end1, index_t step1) { index_t start1, index_t end1, index_t step1) {
for (index_t b = start0; b < end0; b += step0) { for (index_t b = start0; b < end0; b += step0) {
for (index_t m = start1; m < end1; m += step1) { for (index_t m = start1; m < end1; m += step1) {
if (m + 3 < out_channels) { if (m + 3 < p.out_channels) {
float *out_ptr0_base = float *out_ptr0_base =
output_data + b * out_batch_size + m * out_image_size; output_data + b * p.out_batch_size + m * p.out_image_size;
float *out_ptr1_base = float *out_ptr1_base =
output_data + b * out_batch_size + (m + 1) * out_image_size; output_data + b * p.out_batch_size + (m + 1) * p.out_image_size;
float *out_ptr2_base = float *out_ptr2_base =
output_data + b * out_batch_size + (m + 2) * out_image_size; output_data + b * p.out_batch_size + (m + 2) * p.out_image_size;
float *out_ptr3_base = float *out_ptr3_base =
output_data + b * out_batch_size + (m + 3) * out_image_size; output_data + b * p.out_batch_size + (m + 3) * p.out_image_size;
for (index_t c = 0; c < in_channels; ++c) { for (index_t c = 0; c < p.in_channels; ++c) {
const float *in_ptr_base = const float *in_ptr_base =
input_data + b * in_batch_size + c * in_image_size; input_data + b * p.in_batch_size + c * p.in_image_size;
const float const float
*filter_ptr0 = filter_data + m * in_channels * 49 + c * 49; *filter_ptr0 = filter_data + m * p.in_channels * 49 + c * 49;
const float *filter_ptr1 = const float *filter_ptr1 =
filter_data + (m + 1) * in_channels * 49 + c * 49; filter_data + (m + 1) * p.in_channels * 49 + c * 49;
const float *filter_ptr2 = const float *filter_ptr2 =
filter_data + (m + 2) * in_channels * 49 + c * 49; filter_data + (m + 2) * p.in_channels * 49 + c * 49;
const float *filter_ptr3 = const float *filter_ptr3 =
filter_data + (m + 3) * in_channels * 49 + c * 49; filter_data + (m + 3) * p.in_channels * 49 + c * 49;
for (index_t h = 0; h < out_height; ++h) { for (index_t h = 0; h < p.out_height; ++h) {
for (index_t w = 0; w + 3 < out_width; w += 4) { for (index_t w = 0; w + 3 < p.out_width; w += 4) {
// input offset // input offset
index_t in_offset = h * in_width + w; index_t in_offset = h * p.in_width + w;
// output (4 outch x 1 height x 4 width): vo_outch_height // output (4 outch x 1 height x 4 width): vo_outch_height
float32x4_t vo0, vo1, vo2, vo3; float32x4_t vo0, vo1, vo2, vo3;
// load output // load output
index_t out_offset = h * out_width + w; index_t out_offset = h * p.out_width + w;
vo0 = vld1q_f32(out_ptr0_base + out_offset); vo0 = vld1q_f32(out_ptr0_base + out_offset);
vo1 = vld1q_f32(out_ptr1_base + out_offset); vo1 = vld1q_f32(out_ptr1_base + out_offset);
vo2 = vld1q_f32(out_ptr2_base + out_offset); vo2 = vld1q_f32(out_ptr2_base + out_offset);
...@@ -262,7 +215,7 @@ MaceStatus Conv2dK7x7S1::Compute(const OpContext *context, ...@@ -262,7 +215,7 @@ MaceStatus Conv2dK7x7S1::Compute(const OpContext *context,
MACE_Conv2dArmv7NeonK7x7SnLoadCalc4; MACE_Conv2dArmv7NeonK7x7SnLoadCalc4;
#endif #endif
in_offset += in_width; in_offset += p.in_width;
filter_ptr0 += 7; filter_ptr0 += 7;
filter_ptr1 += 7; filter_ptr1 += 7;
filter_ptr2 += 7; filter_ptr2 += 7;
...@@ -282,22 +235,22 @@ MaceStatus Conv2dK7x7S1::Compute(const OpContext *context, ...@@ -282,22 +235,22 @@ MaceStatus Conv2dK7x7S1::Compute(const OpContext *context,
} // h } // h
} // c } // c
} else { } else {
for (index_t mm = m; mm < out_channels; ++mm) { for (index_t mm = m; mm < p.out_channels; ++mm) {
float *out_ptr0_base = float *out_ptr0_base =
output_data + b * out_batch_size + mm * out_image_size; output_data + b * p.out_batch_size + mm * p.out_image_size;
for (index_t c = 0; c < in_channels; ++c) { for (index_t c = 0; c < p.in_channels; ++c) {
const float *in_ptr_base = const float *in_ptr_base =
input_data + b * in_batch_size + c * in_image_size; input_data + b * p.in_batch_size + c * p.in_image_size;
const float const float
*filter_ptr0 = filter_data + mm * in_channels * 49 + c * 49; *filter_ptr0 = filter_data + mm * p.in_channels * 49 + c * 49;
for (index_t h = 0; h < out_height; ++h) { for (index_t h = 0; h < p.out_height; ++h) {
for (index_t w = 0; w + 3 < out_width; w += 4) { for (index_t w = 0; w + 3 < p.out_width; w += 4) {
// input offset // input offset
index_t in_offset = h * in_width + w; index_t in_offset = h * p.in_width + w;
// output (1 outch x 1 height x 4 width): vo_outch_height // output (1 outch x 1 height x 4 width): vo_outch_height
float32x4_t vo0; float32x4_t vo0;
// load output // load output
index_t out_offset = h * out_width + w; index_t out_offset = h * p.out_width + w;
vo0 = vld1q_f32(out_ptr0_base + out_offset); vo0 = vld1q_f32(out_ptr0_base + out_offset);
for (index_t r = 0; r < 7; ++r) { for (index_t r = 0; r < 7; ++r) {
// input (3 slide) // input (3 slide)
...@@ -319,7 +272,7 @@ MaceStatus Conv2dK7x7S1::Compute(const OpContext *context, ...@@ -319,7 +272,7 @@ MaceStatus Conv2dK7x7S1::Compute(const OpContext *context,
MACE_Conv2dArmv7NeonK7x7SnLoadCalc1; MACE_Conv2dArmv7NeonK7x7SnLoadCalc1;
#endif #endif
in_offset += in_width; in_offset += p.in_width;
filter_ptr0 += 7; filter_ptr0 += 7;
} // r } // r
...@@ -332,96 +285,49 @@ MaceStatus Conv2dK7x7S1::Compute(const OpContext *context, ...@@ -332,96 +285,49 @@ MaceStatus Conv2dK7x7S1::Compute(const OpContext *context,
} // if } // if
} // m } // m
} // b } // b
}, 0, batch, 1, 0, out_channels, 4); }, 0, p.batch, 1, 0, p.out_channels, 4);
UnPadOutput(*out_tensor, output);
return MaceStatus::MACE_SUCCESS; return MaceStatus::MACE_SUCCESS;
} }
MaceStatus Conv2dK7x7S2::Compute(const OpContext *context, template<>
const Tensor *input, MaceStatus Conv2dK7x7S2<float>::DoCompute(
const Tensor *filter, const ConvComputeParam &p, const float *filter_data,
Tensor *output) { const float *input_data, float *output_data) {
std::unique_ptr<const Tensor> padded_input; p.thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0,
std::unique_ptr<Tensor> padded_output;
ResizeOutAndPadInOut(context,
input,
filter,
output,
1,
4,
&padded_input,
&padded_output);
const Tensor *in_tensor = input;
if (padded_input != nullptr) {
in_tensor = padded_input.get();
}
Tensor *out_tensor = output;
if (padded_output != nullptr) {
out_tensor = padded_output.get();
}
out_tensor->Clear();
Tensor::MappingGuard in_guard(input);
Tensor::MappingGuard filter_guard(filter);
Tensor::MappingGuard out_guard(output);
auto filter_data = filter->data<float>();
auto input_data = in_tensor->data<float>();
auto output_data = out_tensor->mutable_data<float>();
auto &in_shape = in_tensor->shape();
auto &out_shape = out_tensor->shape();
const index_t batch = in_shape[0];
const index_t in_channels = in_shape[1];
const index_t in_height = in_shape[2];
const index_t in_width = in_shape[3];
const index_t out_channels = out_shape[1];
const index_t out_height = out_shape[2];
const index_t out_width = out_shape[3];
const index_t in_image_size = in_height * in_width;
const index_t out_image_size = out_height * out_width;
const index_t in_batch_size = in_channels * in_image_size;
const index_t out_batch_size = out_channels * out_image_size;
utils::ThreadPool
&thread_pool = context->device()->cpu_runtime()->thread_pool();
thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0,
index_t start1, index_t end1, index_t step1) { index_t start1, index_t end1, index_t step1) {
for (index_t b = start0; b < end0; b += step0) { for (index_t b = start0; b < end0; b += step0) {
for (index_t m = start1; m < end1; m += step1) { for (index_t m = start1; m < end1; m += step1) {
if (m + 3 < out_channels) { if (m + 3 < p.out_channels) {
float *out_ptr0_base = float *out_ptr0_base =
output_data + b * out_batch_size + m * out_image_size; output_data + b * p.out_batch_size + m * p.out_image_size;
float *out_ptr1_base = float *out_ptr1_base =
output_data + b * out_batch_size + (m + 1) * out_image_size; output_data + b * p.out_batch_size + (m + 1) * p.out_image_size;
float *out_ptr2_base = float *out_ptr2_base =
output_data + b * out_batch_size + (m + 2) * out_image_size; output_data + b * p.out_batch_size + (m + 2) * p.out_image_size;
float *out_ptr3_base = float *out_ptr3_base =
output_data + b * out_batch_size + (m + 3) * out_image_size; output_data + b * p.out_batch_size + (m + 3) * p.out_image_size;
for (index_t c = 0; c < in_channels; ++c) { for (index_t c = 0; c < p.in_channels; ++c) {
const float *in_ptr_base = const float *in_ptr_base =
input_data + b * in_batch_size + c * in_image_size; input_data + b * p.in_batch_size + c * p.in_image_size;
const float const float
*filter_ptr0 = filter_data + m * in_channels * 49 + c * 49; *filter_ptr0 = filter_data + m * p.in_channels * 49 + c * 49;
const float *filter_ptr1 = const float *filter_ptr1 =
filter_data + (m + 1) * in_channels * 49 + c * 49; filter_data + (m + 1) * p.in_channels * 49 + c * 49;
const float *filter_ptr2 = const float *filter_ptr2 =
filter_data + (m + 2) * in_channels * 49 + c * 49; filter_data + (m + 2) * p.in_channels * 49 + c * 49;
const float *filter_ptr3 = const float *filter_ptr3 =
filter_data + (m + 3) * in_channels * 49 + c * 49; filter_data + (m + 3) * p.in_channels * 49 + c * 49;
for (index_t h = 0; h < out_height; ++h) { for (index_t h = 0; h < p.out_height; ++h) {
for (index_t w = 0; w + 3 < out_width; w += 4) { for (index_t w = 0; w + 3 < p.out_width; w += 4) {
// input offset // input offset
index_t in_h = h * 2; index_t in_h = h * 2;
index_t in_w = w * 2; index_t in_w = w * 2;
index_t in_offset = in_h * in_width + in_w; index_t in_offset = in_h * p.in_width + in_w;
// output (4 outch x 1 height x 4 width): vo_outch_height // output (4 outch x 1 height x 4 width): vo_outch_height
float32x4_t vo0, vo1, vo2, vo3; float32x4_t vo0, vo1, vo2, vo3;
// load output // load output
index_t out_offset = h * out_width + w; index_t out_offset = h * p.out_width + w;
vo0 = vld1q_f32(out_ptr0_base + out_offset); vo0 = vld1q_f32(out_ptr0_base + out_offset);
vo1 = vld1q_f32(out_ptr1_base + out_offset); vo1 = vld1q_f32(out_ptr1_base + out_offset);
vo2 = vld1q_f32(out_ptr2_base + out_offset); vo2 = vld1q_f32(out_ptr2_base + out_offset);
...@@ -449,7 +355,7 @@ MaceStatus Conv2dK7x7S2::Compute(const OpContext *context, ...@@ -449,7 +355,7 @@ MaceStatus Conv2dK7x7S2::Compute(const OpContext *context,
MACE_Conv2dArmv7NeonK7x7SnLoadCalc4; MACE_Conv2dArmv7NeonK7x7SnLoadCalc4;
#endif #endif
in_offset += in_width; in_offset += p.in_width;
filter_ptr0 += 7; filter_ptr0 += 7;
filter_ptr1 += 7; filter_ptr1 += 7;
filter_ptr2 += 7; filter_ptr2 += 7;
...@@ -469,24 +375,24 @@ MaceStatus Conv2dK7x7S2::Compute(const OpContext *context, ...@@ -469,24 +375,24 @@ MaceStatus Conv2dK7x7S2::Compute(const OpContext *context,
} // h } // h
} // c } // c
} else { } else {
for (index_t mm = m; mm < out_channels; ++mm) { for (index_t mm = m; mm < p.out_channels; ++mm) {
float *out_ptr0_base = float *out_ptr0_base =
output_data + b * out_batch_size + mm * out_image_size; output_data + b * p.out_batch_size + mm * p.out_image_size;
for (index_t c = 0; c < in_channels; ++c) { for (index_t c = 0; c < p.in_channels; ++c) {
const float *in_ptr_base = const float *in_ptr_base =
input_data + b * in_batch_size + c * in_image_size; input_data + b * p.in_batch_size + c * p.in_image_size;
const float const float
*filter_ptr0 = filter_data + mm * in_channels * 49 + c * 49; *filter_ptr0 = filter_data + mm * p.in_channels * 49 + c * 49;
for (index_t h = 0; h < out_height; ++h) { for (index_t h = 0; h < p.out_height; ++h) {
for (index_t w = 0; w + 3 < out_width; w += 4) { for (index_t w = 0; w + 3 < p.out_width; w += 4) {
// input offset // input offset
index_t in_h = h * 2; index_t in_h = h * 2;
index_t in_w = w * 2; index_t in_w = w * 2;
index_t in_offset = in_h * in_width + in_w; index_t in_offset = in_h * p.in_width + in_w;
// output (1 outch x 1 height x 4 width): vo_outch_height // output (1 outch x 1 height x 4 width): vo_outch_height
float32x4_t vo0; float32x4_t vo0;
// load output // load output
index_t out_offset = h * out_width + w; index_t out_offset = h * p.out_width + w;
vo0 = vld1q_f32(out_ptr0_base + out_offset); vo0 = vld1q_f32(out_ptr0_base + out_offset);
for (index_t r = 0; r < 7; ++r) { for (index_t r = 0; r < 7; ++r) {
// input (3 slide) // input (3 slide)
...@@ -511,7 +417,7 @@ MaceStatus Conv2dK7x7S2::Compute(const OpContext *context, ...@@ -511,7 +417,7 @@ MaceStatus Conv2dK7x7S2::Compute(const OpContext *context,
MACE_Conv2dArmv7NeonK7x7SnLoadCalc1; MACE_Conv2dArmv7NeonK7x7SnLoadCalc1;
#endif #endif
in_offset += in_width; in_offset += p.in_width;
filter_ptr0 += 7; filter_ptr0 += 7;
} // r } // r
...@@ -524,96 +430,49 @@ MaceStatus Conv2dK7x7S2::Compute(const OpContext *context, ...@@ -524,96 +430,49 @@ MaceStatus Conv2dK7x7S2::Compute(const OpContext *context,
} // if } // if
} // m } // m
} // b } // b
}, 0, batch, 1, 0, out_channels, 4); }, 0, p.batch, 1, 0, p.out_channels, 4);
UnPadOutput(*out_tensor, output);
return MaceStatus::MACE_SUCCESS; return MaceStatus::MACE_SUCCESS;
} }
MaceStatus Conv2dK7x7S3::Compute(const OpContext *context, template<>
const Tensor *input, MaceStatus Conv2dK7x7S3<float>::DoCompute(
const Tensor *filter, const ConvComputeParam &p, const float *filter_data,
Tensor *output) { const float *input_data, float *output_data) {
std::unique_ptr<const Tensor> padded_input; p.thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0,
std::unique_ptr<Tensor> padded_output;
ResizeOutAndPadInOut(context,
input,
filter,
output,
1,
4,
&padded_input,
&padded_output);
const Tensor *in_tensor = input;
if (padded_input != nullptr) {
in_tensor = padded_input.get();
}
Tensor *out_tensor = output;
if (padded_output != nullptr) {
out_tensor = padded_output.get();
}
out_tensor->Clear();
Tensor::MappingGuard in_guard(input);
Tensor::MappingGuard filter_guard(filter);
Tensor::MappingGuard out_guard(output);
auto filter_data = filter->data<float>();
auto input_data = in_tensor->data<float>();
auto output_data = out_tensor->mutable_data<float>();
auto &in_shape = in_tensor->shape();
auto &out_shape = out_tensor->shape();
const index_t batch = in_shape[0];
const index_t in_channels = in_shape[1];
const index_t in_height = in_shape[2];
const index_t in_width = in_shape[3];
const index_t out_channels = out_shape[1];
const index_t out_height = out_shape[2];
const index_t out_width = out_shape[3];
const index_t in_image_size = in_height * in_width;
const index_t out_image_size = out_height * out_width;
const index_t in_batch_size = in_channels * in_image_size;
const index_t out_batch_size = out_channels * out_image_size;
utils::ThreadPool
&thread_pool = context->device()->cpu_runtime()->thread_pool();
thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0,
index_t start1, index_t end1, index_t step1) { index_t start1, index_t end1, index_t step1) {
for (index_t b = start0; b < end0; b += step0) { for (index_t b = start0; b < end0; b += step0) {
for (index_t m = start1; m < end1; m += step1) { for (index_t m = start1; m < end1; m += step1) {
if (m + 3 < out_channels) { if (m + 3 < p.out_channels) {
float *out_ptr0_base = float *out_ptr0_base =
output_data + b * out_batch_size + m * out_image_size; output_data + b * p.out_batch_size + m * p.out_image_size;
float *out_ptr1_base = float *out_ptr1_base =
output_data + b * out_batch_size + (m + 1) * out_image_size; output_data + b * p.out_batch_size + (m + 1) * p.out_image_size;
float *out_ptr2_base = float *out_ptr2_base =
output_data + b * out_batch_size + (m + 2) * out_image_size; output_data + b * p.out_batch_size + (m + 2) * p.out_image_size;
float *out_ptr3_base = float *out_ptr3_base =
output_data + b * out_batch_size + (m + 3) * out_image_size; output_data + b * p.out_batch_size + (m + 3) * p.out_image_size;
for (index_t c = 0; c < in_channels; ++c) { for (index_t c = 0; c < p.in_channels; ++c) {
const float *in_ptr_base = const float *in_ptr_base =
input_data + b * in_batch_size + c * in_image_size; input_data + b * p.in_batch_size + c * p.in_image_size;
const float const float
*filter_ptr0 = filter_data + m * in_channels * 49 + c * 49; *filter_ptr0 = filter_data + m * p.in_channels * 49 + c * 49;
const float *filter_ptr1 = const float *filter_ptr1 =
filter_data + (m + 1) * in_channels * 49 + c * 49; filter_data + (m + 1) * p.in_channels * 49 + c * 49;
const float *filter_ptr2 = const float *filter_ptr2 =
filter_data + (m + 2) * in_channels * 49 + c * 49; filter_data + (m + 2) * p.in_channels * 49 + c * 49;
const float *filter_ptr3 = const float *filter_ptr3 =
filter_data + (m + 3) * in_channels * 49 + c * 49; filter_data + (m + 3) * p.in_channels * 49 + c * 49;
for (index_t h = 0; h < out_height; ++h) { for (index_t h = 0; h < p.out_height; ++h) {
for (index_t w = 0; w + 3 < out_width; w += 4) { for (index_t w = 0; w + 3 < p.out_width; w += 4) {
// input offset // input offset
index_t in_h = h * 3; index_t in_h = h * 3;
index_t in_w = w * 3; index_t in_w = w * 3;
index_t in_offset = in_h * in_width + in_w; index_t in_offset = in_h * p.in_width + in_w;
// output (4 outch x 1 height x 4 width): vo_outch_height // output (4 outch x 1 height x 4 width): vo_outch_height
float32x4_t vo0, vo1, vo2, vo3; float32x4_t vo0, vo1, vo2, vo3;
// load output // load output
index_t out_offset = h * out_width + w; index_t out_offset = h * p.out_width + w;
vo0 = vld1q_f32(out_ptr0_base + out_offset); vo0 = vld1q_f32(out_ptr0_base + out_offset);
vo1 = vld1q_f32(out_ptr1_base + out_offset); vo1 = vld1q_f32(out_ptr1_base + out_offset);
vo2 = vld1q_f32(out_ptr2_base + out_offset); vo2 = vld1q_f32(out_ptr2_base + out_offset);
...@@ -641,7 +500,7 @@ MaceStatus Conv2dK7x7S3::Compute(const OpContext *context, ...@@ -641,7 +500,7 @@ MaceStatus Conv2dK7x7S3::Compute(const OpContext *context,
MACE_Conv2dArmv7NeonK7x7SnLoadCalc4; MACE_Conv2dArmv7NeonK7x7SnLoadCalc4;
#endif #endif
in_offset += in_width; in_offset += p.in_width;
filter_ptr0 += 7; filter_ptr0 += 7;
filter_ptr1 += 7; filter_ptr1 += 7;
filter_ptr2 += 7; filter_ptr2 += 7;
...@@ -661,24 +520,24 @@ MaceStatus Conv2dK7x7S3::Compute(const OpContext *context, ...@@ -661,24 +520,24 @@ MaceStatus Conv2dK7x7S3::Compute(const OpContext *context,
} // h } // h
} // c } // c
} else { } else {
for (index_t mm = m; mm < out_channels; ++mm) { for (index_t mm = m; mm < p.out_channels; ++mm) {
float *out_ptr0_base = float *out_ptr0_base =
output_data + b * out_batch_size + mm * out_image_size; output_data + b * p.out_batch_size + mm * p.out_image_size;
for (index_t c = 0; c < in_channels; ++c) { for (index_t c = 0; c < p.in_channels; ++c) {
const float *in_ptr_base = const float *in_ptr_base =
input_data + b * in_batch_size + c * in_image_size; input_data + b * p.in_batch_size + c * p.in_image_size;
const float const float
*filter_ptr0 = filter_data + mm * in_channels * 49 + c * 49; *filter_ptr0 = filter_data + mm * p.in_channels * 49 + c * 49;
for (index_t h = 0; h < out_height; ++h) { for (index_t h = 0; h < p.out_height; ++h) {
for (index_t w = 0; w + 3 < out_width; w += 4) { for (index_t w = 0; w + 3 < p.out_width; w += 4) {
// input offset // input offset
index_t in_h = h * 3; index_t in_h = h * 3;
index_t in_w = w * 3; index_t in_w = w * 3;
index_t in_offset = in_h * in_width + in_w; index_t in_offset = in_h * p.in_width + in_w;
// output (1 outch x 1 height x 4 width): vo_outch_height // output (1 outch x 1 height x 4 width): vo_outch_height
float32x4_t vo0; float32x4_t vo0;
// load output // load output
index_t out_offset = h * out_width + w; index_t out_offset = h * p.out_width + w;
vo0 = vld1q_f32(out_ptr0_base + out_offset); vo0 = vld1q_f32(out_ptr0_base + out_offset);
for (index_t r = 0; r < 7; ++r) { for (index_t r = 0; r < 7; ++r) {
// input (3 slide) // input (3 slide)
...@@ -703,7 +562,7 @@ MaceStatus Conv2dK7x7S3::Compute(const OpContext *context, ...@@ -703,7 +562,7 @@ MaceStatus Conv2dK7x7S3::Compute(const OpContext *context,
MACE_Conv2dArmv7NeonK7x7SnLoadCalc1; MACE_Conv2dArmv7NeonK7x7SnLoadCalc1;
#endif #endif
in_offset += in_width; in_offset += p.in_width;
filter_ptr0 += 7; filter_ptr0 += 7;
} // r } // r
...@@ -716,28 +575,11 @@ MaceStatus Conv2dK7x7S3::Compute(const OpContext *context, ...@@ -716,28 +575,11 @@ MaceStatus Conv2dK7x7S3::Compute(const OpContext *context,
} // if } // if
} // m } // m
} // b } // b
}, 0, batch, 1, 0, out_channels, 4); }, 0, p.batch, 1, 0, p.out_channels, 4);
UnPadOutput(*out_tensor, output);
return MaceStatus::MACE_SUCCESS; return MaceStatus::MACE_SUCCESS;
} }
void RegisterConv2dK7x7Delegator(OpDelegatorRegistry *registry) {
MACE_REGISTER_DELEGATOR(
registry, Conv2dK7x7S1, delegator::Conv2dParam,
MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU,
float, ImplType::NEON, K7x7S1));
MACE_REGISTER_DELEGATOR(
registry, Conv2dK7x7S2, delegator::Conv2dParam,
MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU,
float, ImplType::NEON, K7x7S2));
MACE_REGISTER_DELEGATOR(
registry, Conv2dK7x7S3, delegator::Conv2dParam,
MACE_DELEGATOR_KEY_EX(Conv2d, DeviceType::CPU,
float, ImplType::NEON, K7x7S3));
}
} // namespace fp32
} // namespace arm } // namespace arm
} // namespace ops } // namespace ops
} // namespace mace } // namespace mace
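For orientation, the 7x7 stride-3 NEON body above computes the same result as the following scalar reference. This is a sketch written for this note, not code from the tree; it assumes the padded input/output buffers that the delegator sets up, so no edge checks are needed.

#include <cstdint>
typedef int64_t index_t;  // assumption: matches mace::index_t

// Scalar reference for one (batch, output channel m) pair of Conv2dK7x7S3.
// `filter` points at output channel m's taps (in_channels x 7 x 7) and
// `out` at that channel's pre-cleared, padded output plane.
void Conv2dK7x7S3Reference(const float *in, const float *filter, float *out,
                           index_t in_channels, index_t in_height,
                           index_t in_width, index_t out_height,
                           index_t out_width) {
  for (index_t c = 0; c < in_channels; ++c) {
    const float *in_c = in + c * in_height * in_width;
    const float *filter_c = filter + c * 49;  // 49 = 7 * 7 taps per channel
    for (index_t h = 0; h < out_height; ++h) {
      for (index_t w = 0; w < out_width; ++w) {
        float acc = out[h * out_width + w];
        for (int kh = 0; kh < 7; ++kh) {
          for (int kw = 0; kw < 7; ++kw) {
            acc += in_c[(h * 3 + kh) * in_width + (w * 3 + kw)] *
                   filter_c[kh * 7 + kw];
          }
        }
        out[h * out_width + w] = acc;
      }
    }
  }
}

The vector path evaluates four consecutive w values of this accumulation at once, which is why the kernel advances in_offset by p.in_width and filter_ptr0..3 by 7 per tap row.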
...@@ -12,87 +12,25 @@ ...@@ -12,87 +12,25 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "mace/ops/arm/fp32/conv_2d.h"
#include <memory> #include <memory>
#include "mace/ops/arm/base/conv_2d_general.h"
#include "mace/ops/delegator/conv_2d.h" #include "mace/ops/delegator/conv_2d.h"
namespace mace { namespace mace {
namespace ops { namespace ops {
namespace arm { namespace arm {
namespace fp32 {
class Conv2dGeneral : public Conv2dBase {
public:
explicit Conv2dGeneral(const delegator::Conv2dParam &param)
: Conv2dBase(param) {}
virtual ~Conv2dGeneral() {}
MaceStatus Compute(
const OpContext *context,
const Tensor *input,
const Tensor *filter,
Tensor *output) override;
};
MaceStatus Conv2dGeneral::Compute(const OpContext *context,
const Tensor *input,
const Tensor *filter,
Tensor *output) {
std::unique_ptr<const Tensor> padded_input;
std::unique_ptr<Tensor> padded_output;
ResizeOutAndPadInOut(context,
input,
filter,
output,
1,
4,
&padded_input,
&padded_output);
const Tensor *in_tensor = input;
if (padded_input != nullptr) {
in_tensor = padded_input.get();
}
Tensor *out_tensor = output;
if (padded_output != nullptr) {
out_tensor = padded_output.get();
}
out_tensor->Clear();
Tensor::MappingGuard in_guard(input);
Tensor::MappingGuard filter_guard(filter);
Tensor::MappingGuard out_guard(output);
auto filter_data = filter->data<float>();
auto input_data = in_tensor->data<float>();
auto output_data = out_tensor->mutable_data<float>();
auto &in_shape = in_tensor->shape(); template<>
auto &out_shape = out_tensor->shape(); MaceStatus Conv2dGeneral<float>::DoCompute(
auto &filter_shape = filter->shape(); const ConvComputeParam &p, const float *filter_data,
const float *input_data, float *output_data,
const index_t batch = in_shape[0]; const std::vector<index_t> &filter_shape) {
const index_t in_channels = in_shape[1];
const index_t in_height = in_shape[2];
const index_t in_width = in_shape[3];
const index_t out_channels = out_shape[1];
const index_t out_height = out_shape[2];
const index_t out_width = out_shape[3];
const index_t filter_height = filter_shape[2]; const index_t filter_height = filter_shape[2];
const index_t filter_width = filter_shape[3]; const index_t filter_width = filter_shape[3];
const index_t in_image_size = in_height * in_width;
const index_t out_image_size = out_height * out_width;
const index_t in_batch_size = in_channels * in_image_size;
const index_t out_batch_size = out_channels * out_image_size;
const index_t filter_size = filter_height * filter_width; const index_t filter_size = filter_height * filter_width;
utils::ThreadPool p.thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0,
&thread_pool = context->device()->cpu_runtime()->thread_pool();
thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0,
index_t start1, index_t end1, index_t step1) { index_t start1, index_t end1, index_t step1) {
for (index_t b = start0; b < end0; b += step0) { for (index_t b = start0; b < end0; b += step0) {
for (index_t m = start1; m < end1; m += step1) { for (index_t m = start1; m < end1; m += step1) {
...@@ -100,30 +38,33 @@ MaceStatus Conv2dGeneral::Compute(const OpContext *context, ...@@ -100,30 +38,33 @@ MaceStatus Conv2dGeneral::Compute(const OpContext *context,
const int stride_w = strides_[1]; const int stride_w = strides_[1];
const int dilation_h = dilations_[0]; const int dilation_h = dilations_[0];
const int dilation_w = dilations_[1]; const int dilation_w = dilations_[1];
if (m + 3 < out_channels) { if (m + 3 < p.out_channels) {
float *out_ptr0_base = float *out_ptr0_base =
output_data + b * out_batch_size + m * out_image_size; output_data + b * p.out_batch_size + m * p.out_image_size;
float *out_ptr1_base = out_ptr0_base + out_image_size; float *out_ptr1_base = out_ptr0_base + p.out_image_size;
float *out_ptr2_base = out_ptr1_base + out_image_size; float *out_ptr2_base = out_ptr1_base + p.out_image_size;
float *out_ptr3_base = out_ptr2_base + out_image_size; float *out_ptr3_base = out_ptr2_base + p.out_image_size;
for (index_t c = 0; c < in_channels; ++c) { for (index_t c = 0; c < p.in_channels; ++c) {
const float *in_ptr_base = const float *in_ptr_base =
input_data + b * in_batch_size + c * in_image_size; input_data + b * p.in_batch_size + c * p.in_image_size;
const float *filter_ptr0 = const float *filter_ptr0 =
filter_data + m * in_channels * filter_size + c * filter_size; filter_data + m * p.in_channels * filter_size + c * filter_size;
const float *filter_ptr1 = filter_ptr0 + in_channels * filter_size; const float *filter_ptr1 =
const float *filter_ptr2 = filter_ptr1 + in_channels * filter_size; filter_ptr0 + p.in_channels * filter_size;
const float *filter_ptr3 = filter_ptr2 + in_channels * filter_size; const float *filter_ptr2 =
for (index_t h = 0; h < out_height; ++h) { filter_ptr1 + p.in_channels * filter_size;
for (index_t w = 0; w + 3 < out_width; w += 4) { const float *filter_ptr3 =
filter_ptr2 + p.in_channels * filter_size;
for (index_t h = 0; h < p.out_height; ++h) {
for (index_t w = 0; w + 3 < p.out_width; w += 4) {
// input offset // input offset
index_t ih = h * stride_h; index_t ih = h * stride_h;
index_t iw = w * stride_w; index_t iw = w * stride_w;
index_t in_offset = ih * in_width + iw; index_t in_offset = ih * p.in_width + iw;
// output (4 outch x 1 height x 4 width): vo_outch_height // output (4 outch x 1 height x 4 width): vo_outch_height
float vo0[4], vo1[4], vo2[4], vo3[4]; float vo0[4], vo1[4], vo2[4], vo3[4];
// load output // load output
index_t out_offset = h * out_width + w; index_t out_offset = h * p.out_width + w;
for (index_t ow = 0; ow < 4; ++ow) { for (index_t ow = 0; ow < 4; ++ow) {
vo0[ow] = out_ptr0_base[out_offset + ow]; vo0[ow] = out_ptr0_base[out_offset + ow];
vo1[ow] = out_ptr1_base[out_offset + ow]; vo1[ow] = out_ptr1_base[out_offset + ow];
...@@ -171,7 +112,7 @@ MaceStatus Conv2dGeneral::Compute(const OpContext *context, ...@@ -171,7 +112,7 @@ MaceStatus Conv2dGeneral::Compute(const OpContext *context,
+ kw * dilation_w] * filter_ptr3[kw]; + kw * dilation_w] * filter_ptr3[kw];
} // kw } // kw
in_offset += dilation_h * in_width; in_offset += dilation_h * p.in_width;
filter_ptr0 += filter_width; filter_ptr0 += filter_width;
filter_ptr1 += filter_width; filter_ptr1 += filter_width;
filter_ptr2 += filter_width; filter_ptr2 += filter_width;
...@@ -193,26 +134,26 @@ MaceStatus Conv2dGeneral::Compute(const OpContext *context, ...@@ -193,26 +134,26 @@ MaceStatus Conv2dGeneral::Compute(const OpContext *context,
} // h } // h
} // c } // c
} else { } else {
for (index_t mm = m; mm < out_channels; ++mm) { for (index_t mm = m; mm < p.out_channels; ++mm) {
float *out_ptr0_base = float *out_ptr0_base =
output_data + b * out_batch_size + mm * out_image_size; output_data + b * p.out_batch_size + mm * p.out_image_size;
for (index_t c = 0; c < in_channels; ++c) { for (index_t c = 0; c < p.in_channels; ++c) {
const float *in_ptr_base = const float *in_ptr_base =
input_data + b * in_batch_size + c * in_image_size; input_data + b * p.in_batch_size + c * p.in_image_size;
const float *filter_ptr0 = const float *filter_ptr0 =
filter_data + mm * in_channels * filter_size filter_data + mm * p.in_channels * filter_size
+ c * filter_size; + c * filter_size;
for (index_t h = 0; h < out_height; ++h) { for (index_t h = 0; h < p.out_height; ++h) {
for (index_t w = 0; w + 3 < out_width; w += 4) { for (index_t w = 0; w + 3 < p.out_width; w += 4) {
// input offset // input offset
index_t ih = h * stride_h; index_t ih = h * stride_h;
index_t iw = w * stride_w; index_t iw = w * stride_w;
index_t in_offset = ih * in_width + iw; index_t in_offset = ih * p.in_width + iw;
// output (1 outch x 1 height x 4 width): vo_outch_height // output (1 outch x 1 height x 4 width): vo_outch_height
float vo0[4]; float vo0[4];
// load output // load output
index_t out_offset = h * out_width + w; index_t out_offset = h * p.out_width + w;
for (index_t ow = 0; ow < 4; ++ow) { for (index_t ow = 0; ow < 4; ++ow) {
vo0[ow] = out_ptr0_base[out_offset + ow]; vo0[ow] = out_ptr0_base[out_offset + ow];
} }
...@@ -231,7 +172,7 @@ MaceStatus Conv2dGeneral::Compute(const OpContext *context, ...@@ -231,7 +172,7 @@ MaceStatus Conv2dGeneral::Compute(const OpContext *context,
+ kw * dilation_w] * filter_ptr0[kw]; + kw * dilation_w] * filter_ptr0[kw];
} // kw } // kw
in_offset += dilation_h * in_width; in_offset += dilation_h * p.in_width;
filter_ptr0 += filter_width; filter_ptr0 += filter_width;
} // kh } // kh
...@@ -246,19 +187,11 @@ MaceStatus Conv2dGeneral::Compute(const OpContext *context, ...@@ -246,19 +187,11 @@ MaceStatus Conv2dGeneral::Compute(const OpContext *context,
} // if } // if
} // m } // m
} // b } // b
}, 0, batch, 1, 0, out_channels, 4); }, 0, p.batch, 1, 0, p.out_channels, 4);
UnPadOutput(*out_tensor, output);
return MaceStatus::MACE_SUCCESS; return MaceStatus::MACE_SUCCESS;
} }
void RegisterConv2dGeneralDelegator(OpDelegatorRegistry *registry) {
MACE_REGISTER_DELEGATOR(
registry, Conv2dGeneral, delegator::Conv2dParam,
MACE_DELEGATOR_KEY(Conv2d, DeviceType::CPU, float, ImplType::NEON));
}
} // namespace fp32
} // namespace arm } // namespace arm
} // namespace ops } // namespace ops
} // namespace mace } // namespace mace
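The shape of the refactor is the same in every file of this commit: the type-independent shell of Compute() (output resize, padding, tensor mapping, shape bookkeeping) moves into a templated class under arm/base, and the fp32 file keeps only an explicit DoCompute specialization for float. A rough sketch of what that base declaration presumably looks like, reconstructed from the specialization signature above rather than copied from the new header:

// Hypothetical reconstruction; everything not visible in this diff
// (member names, exact split of responsibilities) is an assumption.
template<typename T>
class Conv2dGeneral : public Conv2dBase {
 public:
  explicit Conv2dGeneral(const delegator::Conv2dParam &param)
      : Conv2dBase(param) {}

  // Resizes/pads input and output, maps the tensors, fills a
  // ConvComputeParam p (batch, channel counts, image and batch strides,
  // thread pool), then dispatches to the typed inner loops below.
  MaceStatus Compute(const OpContext *context, const Tensor *input,
                     const Tensor *filter, Tensor *output) override;

 protected:
  MaceStatus DoCompute(const ConvComputeParam &p, const T *filter_data,
                       const T *input_data, T *output_data,
                       const std::vector<index_t> &filter_shape);
};

Registration of the concrete kernels likewise leaves these fp32 translation units; the removed Register*Delegator functions in this diff are the visible half of that move.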
...@@ -12,74 +12,33 @@ ...@@ -12,74 +12,33 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "mace/ops/arm/fp32/deconv_2d_2x2.h"
#include <arm_neon.h> #include <arm_neon.h>
#include "mace/ops/arm/base/deconv_2d_2x2.h"
#include "mace/ops/arm/fp32/common_neon.h" #include "mace/ops/arm/fp32/common_neon.h"
namespace mace { namespace mace {
namespace ops { namespace ops {
namespace arm { namespace arm {
namespace fp32 {
MaceStatus Deconv2dK2x2S1::Compute(const OpContext *context,
const Tensor *input,
const Tensor *filter,
const Tensor *output_shape,
Tensor *output) {
std::unique_ptr<Tensor> padded_out;
std::vector<int> out_pad_size;
ResizeOutAndPadOut(context,
input,
filter,
output_shape,
output,
&out_pad_size,
&padded_out);
Tensor *out_tensor = output;
if (padded_out != nullptr) {
out_tensor = padded_out.get();
}
out_tensor->Clear();
Tensor::MappingGuard input_mapper(input);
Tensor::MappingGuard filter_mapper(filter);
Tensor::MappingGuard output_mapper(output);
auto input_data = input->data<float>();
auto filter_data = filter->data<float>();
auto padded_out_data = out_tensor->mutable_data<float>();
auto &in_shape = input->shape(); template<>
auto &out_shape = out_tensor->shape(); MaceStatus Deconv2dK2x2S1<float>::DoCompute(
const DeconvComputeParam &p, const float *filter_data,
const index_t batch = in_shape[0]; const float *input_data, float *padded_out_data) {
const index_t inch = in_shape[1]; p.thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0,
const index_t h = in_shape[2];
const index_t w = in_shape[3];
const index_t outch = out_shape[1];
const index_t outh = out_shape[2];
const index_t outw = out_shape[3];
const index_t out_img_size = outh * outw;
utils::ThreadPool
&thread_pool = context->device()->cpu_runtime()->thread_pool();
thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0,
index_t start1, index_t end1, index_t step1) { index_t start1, index_t end1, index_t step1) {
for (index_t b = start0; b < end0; b += step0) { for (index_t b = start0; b < end0; b += step0) {
for (index_t oc = start1; oc < end1; oc += step1) { for (index_t oc = start1; oc < end1; oc += step1) {
if (oc + 1 < outch) { if (oc + 1 < p.out_channels) {
float *out_base0 = padded_out_data + (b * outch + oc) * out_img_size; float *out_base0 =
float *out_base1 = out_base0 + out_img_size; padded_out_data + (b * p.out_channels + oc) * p.out_img_size;
for (index_t ic = 0; ic < inch; ++ic) { float *out_base1 = out_base0 + p.out_img_size;
const float *input_base = input_data + (b * inch + ic) * h * w; for (index_t ic = 0; ic < p.in_channels; ++ic) {
const float *kernel_base0 = filter_data + (oc * inch + ic) * 4; const float *input_base = input_data +
const float *kernel_base1 = kernel_base0 + inch * 4; (b * p.in_channels + ic) * p.in_height * p.in_width;
const float *kernel_base0 =
filter_data + (oc * p.in_channels + ic) * 4;
const float *kernel_base1 = kernel_base0 + p.in_channels * 4;
const float *in = input_base; const float *in = input_base;
// output channel 0 // output channel 0
const float *k0 = kernel_base0; const float *k0 = kernel_base0;
...@@ -89,18 +48,18 @@ MaceStatus Deconv2dK2x2S1::Compute(const OpContext *context, ...@@ -89,18 +48,18 @@ MaceStatus Deconv2dK2x2S1::Compute(const OpContext *context,
float32x4_t k0_vec = vld1q_f32(k0); float32x4_t k0_vec = vld1q_f32(k0);
float32x4_t k1_vec = vld1q_f32(k1); float32x4_t k1_vec = vld1q_f32(k1);
for (index_t i = 0; i < h; ++i) { for (index_t i = 0; i < p.in_height; ++i) {
float *out_row_base0 = out_base0 + i * outw; float *out_row_base0 = out_base0 + i * p.out_width;
float *out_row0_0 = out_row_base0; float *out_row0_0 = out_row_base0;
float *out_row0_1 = out_row_base0 + outw; float *out_row0_1 = out_row_base0 + p.out_width;
float *out_row_base1 = out_base1 + i * outw; float *out_row_base1 = out_base1 + i * p.out_width;
float *out_row1_0 = out_row_base1; float *out_row1_0 = out_row_base1;
float *out_row1_1 = out_row_base1 + outw; float *out_row1_1 = out_row_base1 + p.out_width;
index_t j = 0; index_t j = 0;
for (; j + 3 < w; j += 4) { for (; j + 3 < p.in_width; j += 4) {
float32x4_t in_vec = vld1q_f32(in); float32x4_t in_vec = vld1q_f32(in);
float32x4_t out00, out01, out02, out03; float32x4_t out00, out01, out02, out03;
...@@ -145,7 +104,7 @@ MaceStatus Deconv2dK2x2S1::Compute(const OpContext *context, ...@@ -145,7 +104,7 @@ MaceStatus Deconv2dK2x2S1::Compute(const OpContext *context,
out_row1_1 += 4; out_row1_1 += 4;
} }
for (; j < w; ++j) { for (; j < p.in_width; ++j) {
float val = in[0]; float val = in[0];
for (int k = 0; k < 2; ++k) { for (int k = 0; k < 2; ++k) {
out_row0_0[k] += val * k0[k]; out_row0_0[k] += val * k0[k];
...@@ -162,23 +121,26 @@ MaceStatus Deconv2dK2x2S1::Compute(const OpContext *context, ...@@ -162,23 +121,26 @@ MaceStatus Deconv2dK2x2S1::Compute(const OpContext *context,
} }
} }
} else { } else {
float *out_base0 = padded_out_data + (b * outch + oc) * outh * outw; float *out_base0 = padded_out_data +
for (index_t ic = 0; ic < inch; ++ic) { (b * p.out_channels + oc) * p.out_height * p.out_width;
const float *input_base = input_data + (b * inch + ic) * h * w; for (index_t ic = 0; ic < p.in_channels; ++ic) {
const float *kernel_base0 = filter_data + (oc * inch + ic) * 4; const float *input_base = input_data +
(b * p.in_channels + ic) * p.in_height * p.in_width;
const float *kernel_base0 =
filter_data + (oc * p.in_channels + ic) * 4;
const float *in = input_base; const float *in = input_base;
const float *k0 = kernel_base0; const float *k0 = kernel_base0;
// load filter // load filter
float32x4_t k0_vec = vld1q_f32(k0); float32x4_t k0_vec = vld1q_f32(k0);
for (index_t i = 0; i < h; ++i) { for (index_t i = 0; i < p.in_height; ++i) {
float *out_row_base0 = out_base0 + i * outw; float *out_row_base0 = out_base0 + i * p.out_width;
float *out_row0_0 = out_row_base0; float *out_row0_0 = out_row_base0;
float *out_row0_1 = out_row_base0 + outw; float *out_row0_1 = out_row_base0 + p.out_width;
index_t j = 0; index_t j = 0;
for (; j + 3 < w; j += 4) { for (; j + 3 < p.in_width; j += 4) {
float32x4_t in_vec = vld1q_f32(in); float32x4_t in_vec = vld1q_f32(in);
float32x4_t out00, out01, out02, out03; float32x4_t out00, out01, out02, out03;
...@@ -203,7 +165,7 @@ MaceStatus Deconv2dK2x2S1::Compute(const OpContext *context, ...@@ -203,7 +165,7 @@ MaceStatus Deconv2dK2x2S1::Compute(const OpContext *context,
out_row0_1 += 4; out_row0_1 += 4;
} }
for (; j < w; ++j) { for (; j < p.in_width; ++j) {
float val = in[0]; float val = in[0];
for (int k = 0; k < 2; ++k) { for (int k = 0; k < 2; ++k) {
out_row0_0[k] += val * k0[k]; out_row0_0[k] += val * k0[k];
...@@ -218,79 +180,39 @@ MaceStatus Deconv2dK2x2S1::Compute(const OpContext *context, ...@@ -218,79 +180,39 @@ MaceStatus Deconv2dK2x2S1::Compute(const OpContext *context,
} }
} }
} }
}, 0, batch, 1, 0, outch, 2); }, 0, p.batch, 1, 0, p.out_channels, 2);
UnPadOutput(*out_tensor, out_pad_size, output);
return MaceStatus::MACE_SUCCESS; return MaceStatus::MACE_SUCCESS;
} }
MaceStatus Deconv2dK2x2S2::Compute(const OpContext *context, template<>
const Tensor *input, MaceStatus Deconv2dK2x2S2<float>::DoCompute(
const Tensor *filter, const DeconvComputeParam &p, const float *filter_data,
const Tensor *output_shape, const float *input_data, float *padded_out_data) {
Tensor *output) {
std::unique_ptr<Tensor> padded_out;
std::vector<int> out_pad_size;
ResizeOutAndPadOut(context,
input,
filter,
output_shape,
output,
&out_pad_size,
&padded_out);
Tensor *out_tensor = output;
if (padded_out != nullptr) {
out_tensor = padded_out.get();
}
out_tensor->Clear();
Tensor::MappingGuard input_mapper(input);
Tensor::MappingGuard filter_mapper(filter);
Tensor::MappingGuard output_mapper(output);
auto input_data = input->data<float>();
auto filter_data = filter->data<float>();
auto padded_out_data = out_tensor->mutable_data<float>();
auto &in_shape = input->shape();
auto &out_shape = out_tensor->shape();
const index_t batch = in_shape[0];
const index_t inch = in_shape[1];
const index_t h = in_shape[2];
const index_t w = in_shape[3];
const index_t outch = out_shape[1]; p.thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0,
const index_t outh = out_shape[2];
const index_t outw = out_shape[3];
const index_t out_img_size = outh * outw;
utils::ThreadPool
&thread_pool = context->device()->cpu_runtime()->thread_pool();
thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0,
index_t start1, index_t end1, index_t step1) { index_t start1, index_t end1, index_t step1) {
for (index_t b = start0; b < end0; b += step0) { for (index_t b = start0; b < end0; b += step0) {
for (index_t oc = start1; oc < end1; oc += step1) { for (index_t oc = start1; oc < end1; oc += step1) {
float *out_base = padded_out_data + (b * outch + oc) * out_img_size; float *out_base =
for (index_t ic = 0; ic < inch; ++ic) { padded_out_data + (b * p.out_channels + oc) * p.out_img_size;
const float *input_base = input_data + (b * inch + ic) * h * w; for (index_t ic = 0; ic < p.in_channels; ++ic) {
const float *kernel_base = filter_data + (oc * inch + ic) * 4; const float *input_base = input_data +
(b * p.in_channels + ic) * p.in_height * p.in_width;
const float *kernel_base =
filter_data + (oc * p.in_channels + ic) * 4;
const float *in = input_base; const float *in = input_base;
const float *k0 = kernel_base; const float *k0 = kernel_base;
float32x4_t k0_vec = vld1q_f32(k0); float32x4_t k0_vec = vld1q_f32(k0);
for (index_t i = 0; i < h; ++i) { for (index_t i = 0; i < p.in_height; ++i) {
float *out_row_base = out_base + i * 2 * outw; float *out_row_base = out_base + i * 2 * p.out_width;
float *out_row_0 = out_row_base; float *out_row_0 = out_row_base;
float *out_row_1 = out_row_0 + outw; float *out_row_1 = out_row_0 + p.out_width;
index_t j = 0; index_t j = 0;
for (; j + 3 < w; j += 4) { for (; j + 3 < p.in_width; j += 4) {
float32x4_t in_vec = vld1q_f32(in); float32x4_t in_vec = vld1q_f32(in);
// out row 0 // out row 0
...@@ -314,7 +236,7 @@ MaceStatus Deconv2dK2x2S2::Compute(const OpContext *context, ...@@ -314,7 +236,7 @@ MaceStatus Deconv2dK2x2S2::Compute(const OpContext *context,
out_row_1 += 8; out_row_1 += 8;
} }
for (; j < w; ++j) { for (; j < p.in_width; ++j) {
float val = in[0]; float val = in[0];
for (int k = 0; k < 2; ++k) { for (int k = 0; k < 2; ++k) {
out_row_0[k] += val * k0[k]; out_row_0[k] += val * k0[k];
...@@ -328,25 +250,11 @@ MaceStatus Deconv2dK2x2S2::Compute(const OpContext *context, ...@@ -328,25 +250,11 @@ MaceStatus Deconv2dK2x2S2::Compute(const OpContext *context,
} }
} }
} }
}, 0, batch, 1, 0, outch, 1); }, 0, p.batch, 1, 0, p.out_channels, 1);
UnPadOutput(*out_tensor, out_pad_size, output);
return MaceStatus::MACE_SUCCESS; return MaceStatus::MACE_SUCCESS;
} }
void RegisterDeconv2dK2x2Delegator(OpDelegatorRegistry *registry) {
MACE_REGISTER_DELEGATOR(
registry, Deconv2dK2x2S1, delegator::Deconv2dParam,
MACE_DELEGATOR_KEY_EX(Deconv2d, DeviceType::CPU,
float, ImplType::NEON, K2x2S1));
MACE_REGISTER_DELEGATOR(
registry, Deconv2dK2x2S2, delegator::Deconv2dParam,
MACE_DELEGATOR_KEY_EX(Deconv2d, DeviceType::CPU,
float, ImplType::NEON, K2x2S2));
}
} // namespace fp32
} // namespace arm } // namespace arm
} // namespace ops } // namespace ops
} // namespace mace } // namespace mace
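The p.* members these deconv kernels read suggest roughly the following parameter bundle. This is a guess at DeconvComputeParam based only on the fields referenced in this diff; the real struct in arm/base may carry more.

// Hypothetical reconstruction, fields limited to what the kernels above use.
struct DeconvComputeParam {
  index_t batch;
  index_t in_channels;
  index_t in_height;
  index_t in_width;
  index_t out_channels;
  index_t out_height;
  index_t out_width;
  index_t out_img_size;            // out_height * out_width
  utils::ThreadPool &thread_pool;  // from the device's CPU runtime
};

Bundling these into one struct is what lets each DoCompute drop its ten or so tensor-shape locals and take a single parameter instead.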
...@@ -12,73 +12,33 @@ ...@@ -12,73 +12,33 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "mace/ops/arm/fp32/deconv_2d_3x3.h"
#include <arm_neon.h> #include <arm_neon.h>
#include "mace/ops/arm/base/deconv_2d_3x3.h"
#include "mace/ops/arm/fp32/common_neon.h" #include "mace/ops/arm/fp32/common_neon.h"
namespace mace { namespace mace {
namespace ops { namespace ops {
namespace arm { namespace arm {
namespace fp32 {
MaceStatus Deconv2dK3x3S1::Compute(const OpContext *context,
const Tensor *input,
const Tensor *filter,
const Tensor *output_shape,
Tensor *output) {
std::unique_ptr<Tensor> padded_out;
std::vector<int> out_pad_size;
ResizeOutAndPadOut(context,
input,
filter,
output_shape,
output,
&out_pad_size,
&padded_out);
Tensor *out_tensor = output;
if (padded_out != nullptr) {
out_tensor = padded_out.get();
}
out_tensor->Clear();
Tensor::MappingGuard input_mapper(input);
Tensor::MappingGuard filter_mapper(filter);
Tensor::MappingGuard output_mapper(output);
auto input_data = input->data<float>();
auto filter_data = filter->data<float>();
auto padded_out_data = out_tensor->mutable_data<float>();
auto &in_shape = input->shape(); template<>
auto &out_shape = out_tensor->shape(); MaceStatus Deconv2dK3x3S1<float>::DoCompute(
const DeconvComputeParam &p, const float *filter_data,
const index_t batch = out_shape[0]; const float *input_data, float *padded_out_data) {
const index_t inch = in_shape[1]; p.thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0,
const index_t h = in_shape[2];
const index_t w = in_shape[3];
const index_t outch = out_shape[1];
const index_t outh = out_shape[2];
const index_t outw = out_shape[3];
const index_t out_img_size = outh * outw;
utils::ThreadPool
&thread_pool = context->device()->cpu_runtime()->thread_pool();
thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0,
index_t start1, index_t end1, index_t step1) { index_t start1, index_t end1, index_t step1) {
for (index_t b = start0; b < end0; b += step0) { for (index_t b = start0; b < end0; b += step0) {
for (index_t oc = start1; oc < end1; oc += step1) { for (index_t oc = start1; oc < end1; oc += step1) {
if (oc + 1 < outch) { if (oc + 1 < p.out_channels) {
float *out_base0 = padded_out_data + (b * outch + oc) * out_img_size; float *out_base0 =
float *out_base1 = out_base0 + out_img_size; padded_out_data + (b * p.out_channels + oc) * p.out_img_size;
for (index_t ic = 0; ic < inch; ++ic) { float *out_base1 = out_base0 + p.out_img_size;
const float *input_base = input_data + (b * inch + ic) * h * w; for (index_t ic = 0; ic < p.in_channels; ++ic) {
const float *kernel_base0 = filter_data + (oc * inch + ic) * 9; const float *input_base = input_data +
const float *kernel_base1 = kernel_base0 + inch * 9; (b * p.in_channels + ic) * p.in_height * p.in_width;
const float *kernel_base0 =
filter_data + (oc * p.in_channels + ic) * 9;
const float *kernel_base1 = kernel_base0 + p.in_channels * 9;
const float *in = input_base; const float *in = input_base;
// output channel 0 // output channel 0
...@@ -102,20 +62,20 @@ MaceStatus Deconv2dK3x3S1::Compute(const OpContext *context, ...@@ -102,20 +62,20 @@ MaceStatus Deconv2dK3x3S1::Compute(const OpContext *context,
k11_vec = vld1q_f32(k1_1); k11_vec = vld1q_f32(k1_1);
k12_vec = vld1q_f32(k1_2); k12_vec = vld1q_f32(k1_2);
for (index_t i = 0; i < h; ++i) { for (index_t i = 0; i < p.in_height; ++i) {
float *out_row_base0 = out_base0 + i * outw; float *out_row_base0 = out_base0 + i * p.out_width;
float *out_row0_0 = out_row_base0; float *out_row0_0 = out_row_base0;
float *out_row0_1 = out_row_base0 + outw; float *out_row0_1 = out_row_base0 + p.out_width;
float *out_row0_2 = out_row_base0 + 2 * outw; float *out_row0_2 = out_row_base0 + 2 * p.out_width;
float *out_row_base1 = out_base1 + i * outw; float *out_row_base1 = out_base1 + i * p.out_width;
float *out_row1_0 = out_row_base1; float *out_row1_0 = out_row_base1;
float *out_row1_1 = out_row_base1 + outw; float *out_row1_1 = out_row_base1 + p.out_width;
float *out_row1_2 = out_row_base1 + 2 * outw; float *out_row1_2 = out_row_base1 + 2 * p.out_width;
index_t j = 0; index_t j = 0;
for (; j + 3 < w; j += 4) { for (; j + 3 < p.in_width; j += 4) {
float32x4_t in_vec = vld1q_f32(in); float32x4_t in_vec = vld1q_f32(in);
float32x4_t out00, out01, out02; float32x4_t out00, out01, out02;
...@@ -203,7 +163,7 @@ MaceStatus Deconv2dK3x3S1::Compute(const OpContext *context, ...@@ -203,7 +163,7 @@ MaceStatus Deconv2dK3x3S1::Compute(const OpContext *context,
out_row1_2 += 4; out_row1_2 += 4;
} }
for (; j < w; ++j) { for (; j < p.in_width; ++j) {
float val = in[0]; float val = in[0];
for (int k = 0; k < 3; ++k) { for (int k = 0; k < 3; ++k) {
out_row0_0[k] += val * k0_0[k]; out_row0_0[k] += val * k0_0[k];
...@@ -224,10 +184,13 @@ MaceStatus Deconv2dK3x3S1::Compute(const OpContext *context, ...@@ -224,10 +184,13 @@ MaceStatus Deconv2dK3x3S1::Compute(const OpContext *context,
} }
} }
} else { } else {
float *out_base0 = padded_out_data + (b * outch + oc) * outh * outw; float *out_base0 = padded_out_data +
for (index_t ic = 0; ic < inch; ++ic) { (b * p.out_channels + oc) * p.out_height * p.out_width;
const float *input_base = input_data + (b * inch + ic) * h * w; for (index_t ic = 0; ic < p.in_channels; ++ic) {
const float *kernel_base0 = filter_data + (oc * inch + ic) * 9; const float *input_base = input_data +
(b * p.in_channels + ic) * p.in_height * p.in_width;
const float *kernel_base0 =
filter_data + (oc * p.in_channels + ic) * 9;
const float *in = input_base; const float *in = input_base;
const float *k0_0 = kernel_base0; const float *k0_0 = kernel_base0;
const float *k0_1 = kernel_base0 + 3; const float *k0_1 = kernel_base0 + 3;
...@@ -238,14 +201,14 @@ MaceStatus Deconv2dK3x3S1::Compute(const OpContext *context, ...@@ -238,14 +201,14 @@ MaceStatus Deconv2dK3x3S1::Compute(const OpContext *context,
float32x4_t k01_vec = vld1q_f32(k0_1); float32x4_t k01_vec = vld1q_f32(k0_1);
float32x4_t k02_vec = vld1q_f32(k0_2); float32x4_t k02_vec = vld1q_f32(k0_2);
for (index_t i = 0; i < h; ++i) { for (index_t i = 0; i < p.in_height; ++i) {
float *out_row_base0 = out_base0 + i * outw; float *out_row_base0 = out_base0 + i * p.out_width;
float *out_row0_0 = out_row_base0; float *out_row0_0 = out_row_base0;
float *out_row0_1 = out_row_base0 + outw; float *out_row0_1 = out_row_base0 + p.out_width;
float *out_row0_2 = out_row_base0 + 2 * outw; float *out_row0_2 = out_row_base0 + 2 * p.out_width;
index_t j = 0; index_t j = 0;
for (; j + 3 < w; j += 4) { for (; j + 3 < p.in_width; j += 4) {
float32x4_t in_vec = vld1q_f32(in); float32x4_t in_vec = vld1q_f32(in);
float32x4_t out00, out01, out02; float32x4_t out00, out01, out02;
...@@ -294,7 +257,7 @@ MaceStatus Deconv2dK3x3S1::Compute(const OpContext *context, ...@@ -294,7 +257,7 @@ MaceStatus Deconv2dK3x3S1::Compute(const OpContext *context,
out_row0_2 += 4; out_row0_2 += 4;
} }
for (; j < w; ++j) { for (; j < p.in_width; ++j) {
float val = in[0]; float val = in[0];
for (int k = 0; k < 3; ++k) { for (int k = 0; k < 3; ++k) {
out_row0_0[k] += val * k0_0[k]; out_row0_0[k] += val * k0_0[k];
...@@ -311,67 +274,26 @@ MaceStatus Deconv2dK3x3S1::Compute(const OpContext *context, ...@@ -311,67 +274,26 @@ MaceStatus Deconv2dK3x3S1::Compute(const OpContext *context,
} }
} }
} }
}, 0, batch, 1, 0, outch, 2); }, 0, p.batch, 1, 0, p.out_channels, 2);
UnPadOutput(*out_tensor, out_pad_size, output);
return MaceStatus::MACE_SUCCESS; return MaceStatus::MACE_SUCCESS;
} }
MaceStatus Deconv2dK3x3S2::Compute(const OpContext *context, template<>
const Tensor *input, MaceStatus Deconv2dK3x3S2<float>::DoCompute(
const Tensor *filter, const DeconvComputeParam &p, const float *filter_data,
const Tensor *output_shape, const float *input_data, float *padded_out_data) {
Tensor *output) { p.thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0,
std::unique_ptr<Tensor> padded_out;
std::vector<int> out_pad_size;
ResizeOutAndPadOut(context,
input,
filter,
output_shape,
output,
&out_pad_size,
&padded_out);
Tensor *out_tensor = output;
if (padded_out != nullptr) {
out_tensor = padded_out.get();
}
out_tensor->Clear();
Tensor::MappingGuard input_mapper(input);
Tensor::MappingGuard filter_mapper(filter);
Tensor::MappingGuard output_mapper(output);
auto input_data = input->data<float>();
auto filter_data = filter->data<float>();
auto padded_out_data = out_tensor->mutable_data<float>();
auto &in_shape = input->shape();
auto &out_shape = out_tensor->shape();
const index_t batch = in_shape[0];
const index_t inch = in_shape[1];
const index_t h = in_shape[2];
const index_t w = in_shape[3];
const index_t outch = out_shape[1];
const index_t outh = out_shape[2];
const index_t outw = out_shape[3];
const index_t out_img_size = outh * outw;
utils::ThreadPool
&thread_pool = context->device()->cpu_runtime()->thread_pool();
thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0,
index_t start1, index_t end1, index_t step1) { index_t start1, index_t end1, index_t step1) {
for (index_t b = start0; b < end0; b += step0) { for (index_t b = start0; b < end0; b += step0) {
for (index_t oc = start1; oc < end1; oc += step1) { for (index_t oc = start1; oc < end1; oc += step1) {
float *out_base = padded_out_data + (b * outch + oc) * out_img_size; float *out_base =
for (index_t ic = 0; ic < inch; ++ic) { padded_out_data + (b * p.out_channels + oc) * p.out_img_size;
const float *input_base = input_data + (b * inch + ic) * h * w; for (index_t ic = 0; ic < p.in_channels; ++ic) {
const float *kernel_base = filter_data + (oc * inch + ic) * 9; const float *input_base =
input_data + (b * p.in_channels + ic) * p.in_height * p.in_width;
const float *kernel_base =
filter_data + (oc * p.in_channels + ic) * 9;
const float *in = input_base; const float *in = input_base;
const float *k0 = kernel_base; const float *k0 = kernel_base;
...@@ -382,15 +304,15 @@ MaceStatus Deconv2dK3x3S2::Compute(const OpContext *context, ...@@ -382,15 +304,15 @@ MaceStatus Deconv2dK3x3S2::Compute(const OpContext *context,
float32x4_t k1_vec = vld1q_f32(k1); float32x4_t k1_vec = vld1q_f32(k1);
float32x4_t k2_vec = vld1q_f32(k2); float32x4_t k2_vec = vld1q_f32(k2);
for (index_t i = 0; i < h; ++i) { for (index_t i = 0; i < p.in_height; ++i) {
float *out_row_base = out_base + i * 2 * outw; float *out_row_base = out_base + i * 2 * p.out_width;
float *out_row_0 = out_row_base; float *out_row_0 = out_row_base;
float *out_row_1 = out_row_0 + outw; float *out_row_1 = out_row_0 + p.out_width;
float *out_row_2 = out_row_1 + outw; float *out_row_2 = out_row_1 + p.out_width;
index_t j = 0; index_t j = 0;
for (index_t n = 0; n + 9 < outw; n += 8) { for (index_t n = 0; n + 9 < p.out_width; n += 8) {
float32x4_t in_vec = vld1q_f32(in); float32x4_t in_vec = vld1q_f32(in);
// out row 0 // out row 0
...@@ -439,7 +361,7 @@ MaceStatus Deconv2dK3x3S2::Compute(const OpContext *context, ...@@ -439,7 +361,7 @@ MaceStatus Deconv2dK3x3S2::Compute(const OpContext *context,
j += 4; j += 4;
} }
for (; j < w; ++j) { for (; j < p.in_width; ++j) {
float val = in[0]; float val = in[0];
for (int k = 0; k < 3; ++k) { for (int k = 0; k < 3; ++k) {
...@@ -457,25 +379,11 @@ MaceStatus Deconv2dK3x3S2::Compute(const OpContext *context, ...@@ -457,25 +379,11 @@ MaceStatus Deconv2dK3x3S2::Compute(const OpContext *context,
} }
} }
} }
}, 0, batch, 1, 0, outch, 1); }, 0, p.batch, 1, 0, p.out_channels, 1);
UnPadOutput(*out_tensor, out_pad_size, output);
return MaceStatus::MACE_SUCCESS; return MaceStatus::MACE_SUCCESS;
} }
void RegisterDeconv2dK3x3Delegator(OpDelegatorRegistry *registry) {
MACE_REGISTER_DELEGATOR(
registry, Deconv2dK3x3S1, delegator::Deconv2dParam,
MACE_DELEGATOR_KEY_EX(Deconv2d, DeviceType::CPU,
float, ImplType::NEON, K3x3S1));
MACE_REGISTER_DELEGATOR(
registry, Deconv2dK3x3S2, delegator::Deconv2dParam,
MACE_DELEGATOR_KEY_EX(Deconv2d, DeviceType::CPU,
float, ImplType::NEON, K3x3S2));
}
} // namespace fp32
} // namespace arm } // namespace arm
} // namespace ops } // namespace ops
} // namespace mace } // namespace mace
...@@ -12,78 +12,39 @@ ...@@ -12,78 +12,39 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "mace/ops/arm/fp32/deconv_2d_4x4.h"
#include <arm_neon.h> #include <arm_neon.h>
#include "mace/ops/arm/base/deconv_2d_4x4.h"
#include "mace/ops/arm/fp32/common_neon.h" #include "mace/ops/arm/fp32/common_neon.h"
namespace mace { namespace mace {
namespace ops { namespace ops {
namespace arm { namespace arm {
namespace fp32 {
MaceStatus Deconv2dK4x4S1::Compute(const OpContext *context,
const Tensor *input,
const Tensor *filter,
const Tensor *output_shape,
Tensor *output) {
std::unique_ptr<Tensor> padded_out;
std::vector<int> out_pad_size;
ResizeOutAndPadOut(context,
input,
filter,
output_shape,
output,
&out_pad_size,
&padded_out);
Tensor *out_tensor = output;
if (padded_out != nullptr) {
out_tensor = padded_out.get();
}
out_tensor->Clear();
Tensor::MappingGuard input_mapper(input);
Tensor::MappingGuard filter_mapper(filter);
Tensor::MappingGuard output_mapper(output);
auto input_data = input->data<float>();
auto filter_data = filter->data<float>();
auto padded_out_data = out_tensor->mutable_data<float>();
auto &in_shape = input->shape();
auto &out_shape = out_tensor->shape();
const index_t batch = in_shape[0]; template<>
const index_t inch = in_shape[1]; MaceStatus Deconv2dK4x4S1<float>::DoCompute(
const index_t h = in_shape[2]; const DeconvComputeParam &p, const float *filter_data,
const index_t w = in_shape[3]; const float *input_data, float *padded_out_data) {
p.thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0,
const index_t outch = out_shape[1];
const index_t outh = out_shape[2];
const index_t outw = out_shape[3];
const index_t out_img_size = outh * outw;
utils::ThreadPool
&thread_pool = context->device()->cpu_runtime()->thread_pool();
thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0,
index_t start1, index_t end1, index_t step1) { index_t start1, index_t end1, index_t step1) {
for (index_t b = start0; b < end0; b += step0) { for (index_t b = start0; b < end0; b += step0) {
for (index_t oc = start1; oc < end1; oc += step1) { for (index_t oc = start1; oc < end1; oc += step1) {
if (oc + 1 < outch) { if (oc + 1 < p.out_channels) {
float *out_base = padded_out_data + (b * outch + oc) * out_img_size; float *out_base =
float *out_base1 = out_base + out_img_size; padded_out_data + (b * p.out_channels + oc) * p.out_img_size;
for (index_t q = 0; q < inch; q++) { float *out_base1 = out_base + p.out_img_size;
const float *input_base = input_data + (b * inch + q) * h * w; for (index_t q = 0; q < p.in_channels; q++) {
const float *input_base = input_data +
(b * p.in_channels + q) * p.in_height * p.in_width;
const float *in = input_base; const float *in = input_base;
const float *kernel_base = filter_data + (oc * inch + q) * 16; const float *kernel_base =
filter_data + (oc * p.in_channels + q) * 16;
const float *k0 = kernel_base; const float *k0 = kernel_base;
const float *k1 = kernel_base + 4; const float *k1 = kernel_base + 4;
const float *k2 = kernel_base + 8; const float *k2 = kernel_base + 8;
const float *k3 = kernel_base + 12; const float *k3 = kernel_base + 12;
const float *kernel_base1 = kernel_base + inch * 16; const float *kernel_base1 = kernel_base + p.in_channels * 16;
const float *k10 = kernel_base1; const float *k10 = kernel_base1;
const float *k11 = kernel_base1 + 4; const float *k11 = kernel_base1 + 4;
const float *k12 = kernel_base1 + 8; const float *k12 = kernel_base1 + 8;
...@@ -99,24 +60,24 @@ MaceStatus Deconv2dK4x4S1::Compute(const OpContext *context, ...@@ -99,24 +60,24 @@ MaceStatus Deconv2dK4x4S1::Compute(const OpContext *context,
float32x4_t k12_vec = vld1q_f32(k12); float32x4_t k12_vec = vld1q_f32(k12);
float32x4_t k13_vec = vld1q_f32(k13); float32x4_t k13_vec = vld1q_f32(k13);
for (index_t i = 0; i < h; i++) { for (index_t i = 0; i < p.in_height; i++) {
float *out_row = out_base + i * outw; float *out_row = out_base + i * p.out_width;
float *out_row_0 = out_row; float *out_row_0 = out_row;
float *out_row_1 = out_row_0 + outw; float *out_row_1 = out_row_0 + p.out_width;
float *out_row_2 = out_row_1 + outw; float *out_row_2 = out_row_1 + p.out_width;
float *out_row_3 = out_row_2 + outw; float *out_row_3 = out_row_2 + p.out_width;
float *out_row1 = out_base1 + i * outw; float *out_row1 = out_base1 + i * p.out_width;
float *out_row1_0 = out_row1; float *out_row1_0 = out_row1;
float *out_row1_1 = out_row1_0 + outw; float *out_row1_1 = out_row1_0 + p.out_width;
float *out_row1_2 = out_row1_1 + outw; float *out_row1_2 = out_row1_1 + p.out_width;
float *out_row1_3 = out_row1_2 + outw; float *out_row1_3 = out_row1_2 + p.out_width;
index_t j = 0; index_t j = 0;
for (; j + 3 < w; j += 4) { for (; j + 3 < p.in_width; j += 4) {
float32x4_t in_vec = vld1q_f32(in); float32x4_t in_vec = vld1q_f32(in);
float32x4_t out00, out01, out02, out03; float32x4_t out00, out01, out02, out03;
float32x4_t out10, out11, out12, out13; float32x4_t out10, out11, out12, out13;
...@@ -260,7 +221,7 @@ MaceStatus Deconv2dK4x4S1::Compute(const OpContext *context, ...@@ -260,7 +221,7 @@ MaceStatus Deconv2dK4x4S1::Compute(const OpContext *context,
out_row1_3 += 4; out_row1_3 += 4;
} }
for (; j < w; j++) { for (; j < p.in_width; j++) {
float val = in[0]; float val = in[0];
for (int k = 0; k < 4; ++k) { for (int k = 0; k < 4; ++k) {
out_row_0[k] += val * k0[k]; out_row_0[k] += val * k0[k];
...@@ -285,10 +246,13 @@ MaceStatus Deconv2dK4x4S1::Compute(const OpContext *context, ...@@ -285,10 +246,13 @@ MaceStatus Deconv2dK4x4S1::Compute(const OpContext *context,
} }
} }
} else { } else {
float *out_base = padded_out_data + (b * outch + oc) * out_img_size; float *out_base =
for (index_t q = 0; q < inch; q++) { padded_out_data + (b * p.out_channels + oc) * p.out_img_size;
const float *input_base = input_data + (b * inch + q) * h * w; for (index_t q = 0; q < p.in_channels; q++) {
const float *kernel_base = filter_data + (oc * inch + q) * 16; const float *input_base = input_data +
(b * p.in_channels + q) * p.in_height * p.in_width;
const float *kernel_base =
filter_data + (oc * p.in_channels + q) * 16;
const float *in = input_base; const float *in = input_base;
const float *k0 = kernel_base; const float *k0 = kernel_base;
const float *k1 = kernel_base + 4; const float *k1 = kernel_base + 4;
...@@ -300,15 +264,15 @@ MaceStatus Deconv2dK4x4S1::Compute(const OpContext *context, ...@@ -300,15 +264,15 @@ MaceStatus Deconv2dK4x4S1::Compute(const OpContext *context,
float32x4_t k2_vec = vld1q_f32(k2); float32x4_t k2_vec = vld1q_f32(k2);
float32x4_t k3_vec = vld1q_f32(k3); float32x4_t k3_vec = vld1q_f32(k3);
for (index_t i = 0; i < h; i++) { for (index_t i = 0; i < p.in_height; i++) {
float *out_row = out_base + i * outw; float *out_row = out_base + i * p.out_width;
float *out_row_0 = out_row; float *out_row_0 = out_row;
float *out_row_1 = out_row_0 + outw; float *out_row_1 = out_row_0 + p.out_width;
float *out_row_2 = out_row_1 + outw; float *out_row_2 = out_row_1 + p.out_width;
float *out_row_3 = out_row_2 + outw; float *out_row_3 = out_row_2 + p.out_width;
int j = 0; int j = 0;
for (; j + 3 < w; j += 4) { for (; j + 3 < p.in_width; j += 4) {
float32x4_t in_vec = vld1q_f32(in); float32x4_t in_vec = vld1q_f32(in);
float32x4_t out00 = vld1q_f32(out_row_0); float32x4_t out00 = vld1q_f32(out_row_0);
...@@ -382,7 +346,7 @@ MaceStatus Deconv2dK4x4S1::Compute(const OpContext *context, ...@@ -382,7 +346,7 @@ MaceStatus Deconv2dK4x4S1::Compute(const OpContext *context,
out_row_3 += 4; out_row_3 += 4;
} }
for (; j < w; j++) { for (; j < p.in_width; j++) {
float val = in[0]; float val = in[0];
for (int k = 0; k < 4; ++k) { for (int k = 0; k < 4; ++k) {
out_row_0[k] += val * k0[k]; out_row_0[k] += val * k0[k];
...@@ -401,65 +365,25 @@ MaceStatus Deconv2dK4x4S1::Compute(const OpContext *context, ...@@ -401,65 +365,25 @@ MaceStatus Deconv2dK4x4S1::Compute(const OpContext *context,
} }
} }
} }
}, 0, batch, 1, 0, outch, 2); }, 0, p.batch, 1, 0, p.out_channels, 2);
UnPadOutput(*out_tensor, out_pad_size, output);
return MaceStatus::MACE_SUCCESS; return MaceStatus::MACE_SUCCESS;
} }
MaceStatus Deconv2dK4x4S2::Compute(const OpContext *context, template<>
const Tensor *input, MaceStatus Deconv2dK4x4S2<float>::DoCompute(
const Tensor *filter, const DeconvComputeParam &p, const float *filter_data,
const Tensor *output_shape, const float *input_data, float *padded_out_data) {
Tensor *output) { p.thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0,
std::unique_ptr<Tensor> padded_out;
std::vector<int> out_pad_size;
ResizeOutAndPadOut(context,
input,
filter,
output_shape,
output,
&out_pad_size,
&padded_out);
Tensor *out_tensor = output;
if (padded_out != nullptr) {
out_tensor = padded_out.get();
}
out_tensor->Clear();
Tensor::MappingGuard input_mapper(input);
Tensor::MappingGuard filter_mapper(filter);
Tensor::MappingGuard output_mapper(output);
auto input_data = input->data<float>();
auto filter_data = filter->data<float>();
auto padded_out_data = out_tensor->mutable_data<float>();
auto &in_shape = input->shape();
auto &out_shape = out_tensor->shape();
const index_t batch = in_shape[0];
const index_t inch = in_shape[1];
const index_t h = in_shape[2];
const index_t w = in_shape[3];
const index_t outch = out_shape[1];
const index_t outh = out_shape[2];
const index_t outw = out_shape[3];
const index_t out_img_size = outh * outw;
utils::ThreadPool
&thread_pool = context->device()->cpu_runtime()->thread_pool();
thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0,
index_t start1, index_t end1, index_t step1) { index_t start1, index_t end1, index_t step1) {
for (index_t b = start0; b < end0; b += step0) { for (index_t b = start0; b < end0; b += step0) {
for (index_t p = start1; p < end1; p += step1) { for (index_t k = start1; k < end1; k += step1) {
float *out_base = padded_out_data + (b * outch + p) * out_img_size; float *out_base =
for (index_t q = 0; q < inch; q++) { padded_out_data + (b * p.out_channels + k) * p.out_img_size;
const float *input_base = input_data + (b * inch + q) * h * w; for (index_t q = 0; q < p.in_channels; q++) {
const float *kernel_base = filter_data + (p * inch + q) * 16; const float *input_base = input_data +
(b * p.in_channels + q) * p.in_height * p.in_width;
const float *kernel_base = filter_data + (k * p.in_channels + q) * 16;
const float *in = input_base; const float *in = input_base;
const float *k0 = kernel_base; const float *k0 = kernel_base;
...@@ -472,17 +396,17 @@ MaceStatus Deconv2dK4x4S2::Compute(const OpContext *context, ...@@ -472,17 +396,17 @@ MaceStatus Deconv2dK4x4S2::Compute(const OpContext *context,
float32x4_t k2_vec = vld1q_f32(k2); float32x4_t k2_vec = vld1q_f32(k2);
float32x4_t k3_vec = vld1q_f32(k3); float32x4_t k3_vec = vld1q_f32(k3);
for (index_t i = 0; i < h; i++) { for (index_t i = 0; i < p.in_height; i++) {
float *out_row = out_base + 2 * i * outw; float *out_row = out_base + 2 * i * p.out_width;
float *out_row_0 = out_row; float *out_row_0 = out_row;
float *out_row_1 = out_row_0 + outw; float *out_row_1 = out_row_0 + p.out_width;
float *out_row_2 = out_row_1 + outw; float *out_row_2 = out_row_1 + p.out_width;
float *out_row_3 = out_row_2 + outw; float *out_row_3 = out_row_2 + p.out_width;
index_t j = 0; index_t j = 0;
for (index_t n = 0; n + 9 < outw; n += 8) { for (index_t n = 0; n + 9 < p.out_width; n += 8) {
float32x4_t in_vec = vld1q_f32(in); float32x4_t in_vec = vld1q_f32(in);
// row 0 // row 0
...@@ -549,7 +473,7 @@ MaceStatus Deconv2dK4x4S2::Compute(const OpContext *context, ...@@ -549,7 +473,7 @@ MaceStatus Deconv2dK4x4S2::Compute(const OpContext *context,
j += 4; j += 4;
} }
for (; j < w; j++) { for (; j < p.in_width; j++) {
float val = in[0]; float val = in[0];
for (int k = 0; k < 4; ++k) { for (int k = 0; k < 4; ++k) {
out_row_0[k] += val * k0[k]; out_row_0[k] += val * k0[k];
...@@ -567,25 +491,11 @@ MaceStatus Deconv2dK4x4S2::Compute(const OpContext *context, ...@@ -567,25 +491,11 @@ MaceStatus Deconv2dK4x4S2::Compute(const OpContext *context,
} }
} }
} }
}, 0, batch, 1, 0, outch, 1); }, 0, p.batch, 1, 0, p.out_channels, 1);
UnPadOutput(*out_tensor, out_pad_size, output);
return MaceStatus::MACE_SUCCESS; return MaceStatus::MACE_SUCCESS;
} }
void RegisterDeconv2dK4x4Delegator(OpDelegatorRegistry *registry) {
MACE_REGISTER_DELEGATOR(
registry, Deconv2dK4x4S1, delegator::Deconv2dParam,
MACE_DELEGATOR_KEY_EX(Deconv2d, DeviceType::CPU,
float, ImplType::NEON, K4x4S1));
MACE_REGISTER_DELEGATOR(
registry, Deconv2dK4x4S2, delegator::Deconv2dParam,
MACE_DELEGATOR_KEY_EX(Deconv2d, DeviceType::CPU,
float, ImplType::NEON, K4x4S2));
}
} // namespace fp32
} // namespace arm } // namespace arm
} // namespace ops } // namespace ops
} // namespace mace } // namespace mace
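As with the direct convolutions, the stride-2 deconv bodies are easiest to audit against a scalar reference: each input pixel scatters a kernel-sized patch into the padded output at (2*i, 2*j). A sketch written for this note, not code from the tree:

#include <cstdint>
typedef int64_t index_t;  // assumption: matches mace::index_t

// Scalar reference for one (output channel, input channel) pair of the
// KxK stride-2 deconvolution kernels above, accumulating into the padded
// output plane (the padding removes the need for bounds checks).
void Deconv2dS2Reference(const float *in, const float *filter, float *out,
                         int kernel_size, index_t in_height, index_t in_width,
                         index_t out_width) {
  for (index_t i = 0; i < in_height; ++i) {
    for (index_t j = 0; j < in_width; ++j) {
      const float val = in[i * in_width + j];
      for (int kh = 0; kh < kernel_size; ++kh) {
        for (int kw = 0; kw < kernel_size; ++kw) {
          out[(2 * i + kh) * out_width + (2 * j + kw)] +=
              val * filter[kh * kernel_size + kw];
        }
      }
    }
  }
}

The K2x2S2, K3x3S2 and K4x4S2 kernels above are this loop nest with kernel_size fixed at 2, 3 and 4, and with the j loop processed four input columns (eight output columns) per NEON iteration.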
...@@ -12,14 +12,13 @@ ...@@ -12,14 +12,13 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "mace/ops/arm/fp32/depthwise_conv_2d_3x3.h"
#include <arm_neon.h> #include <arm_neon.h>
#include "mace/ops/arm/base/depthwise_conv_2d_3x3.h"
namespace mace { namespace mace {
namespace ops { namespace ops {
namespace arm { namespace arm {
namespace fp32 {
namespace { namespace {
void DepthwiseConv2dPixel(const float *in_base, void DepthwiseConv2dPixel(const float *in_base,
...@@ -48,79 +47,36 @@ void DepthwiseConv2dPixel(const float *in_base, ...@@ -48,79 +47,36 @@ void DepthwiseConv2dPixel(const float *in_base,
} }
} // namespace } // namespace
MaceStatus DepthwiseConv2dK3x3S1::Compute(const mace::OpContext *context, template<>
const mace::Tensor *input, MaceStatus DepthwiseConv2dK3x3S1<float>::DoCompute(
const mace::Tensor *filter, const DepthwiseConvComputeParam &p, const float *filter_data,
mace::Tensor *output) { const float *input_data, float *output_data) {
MACE_UNUSED(context); p.thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0,
std::vector<index_t> out_shape(4);
std::vector<int> paddings(2);
auto &in_shape = input->shape();
auto &filter_shape = filter->shape();
CalOutputShapeAndInputPadSize(in_shape, filter_shape, &out_shape, &paddings);
out_shape[1] *= filter_shape[1];
MACE_RETURN_IF_ERROR(output->Resize(out_shape));
output->Clear();
const int pad_top = paddings[0] / 2;
const int pad_left = paddings[1] / 2;
const index_t batch = in_shape[0];
const index_t in_channels = in_shape[1];
const index_t in_height = in_shape[2];
const index_t in_width = in_shape[3];
const index_t out_channels = out_shape[1];
const index_t out_height = out_shape[2];
const index_t out_width = out_shape[3];
const index_t in_image_size = in_height * in_width;
const index_t out_image_size = out_height * out_width;
const index_t in_batch_size = in_channels * in_image_size;
const index_t out_batch_size = out_channels * out_image_size;
const index_t multiplier = out_channels / in_channels;
std::vector<index_t> out_bounds;
CalOutputBoundaryWithoutUsingInputPad(out_shape, paddings, &out_bounds);
const index_t valid_h_start = out_bounds[0];
const index_t valid_h_stop = out_bounds[1];
const index_t valid_w_start = out_bounds[2];
const index_t valid_w_stop = out_bounds[3];
Tensor::MappingGuard in_guard(input);
Tensor::MappingGuard filter_guard(filter);
Tensor::MappingGuard out_guard(output);
auto filter_data = filter->data<float>();
auto input_data = input->data<float>();
auto output_data = output->mutable_data<float>();
utils::ThreadPool
&thread_pool = context->device()->cpu_runtime()->thread_pool();
thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0,
index_t start1, index_t end1, index_t step1) { index_t start1, index_t end1, index_t step1) {
for (index_t b = start0; b < end0; b += step0) { for (index_t b = start0; b < end0; b += step0) {
for (index_t m = start1; m < end1; m += step1) { for (index_t m = start1; m < end1; m += step1) {
const index_t c = m / multiplier; const index_t c = m / p.multiplier;
const index_t multi_index = m % multiplier; const index_t multi_index = m % p.multiplier;
const float const float
*in_base = input_data + b * in_batch_size + c * in_image_size; *in_base = input_data + b * p.in_batch_size + c * p.in_image_size;
const float const float
*filter_ptr = filter_data + multi_index * in_channels * 9 + c * 9; *filter_ptr = filter_data + multi_index * p.in_channels * 9 + c * 9;
float *out_base = output_data + b * out_batch_size + m * out_image_size; float *out_base =
output_data + b * p.out_batch_size + m * p.out_image_size;
index_t h, w; index_t h, w;
// top // top
for (h = 0; h < valid_h_start; ++h) { for (h = 0; h < p.valid_h_start; ++h) {
for (w = 0; w < out_width; ++w) { for (w = 0; w < p.out_width; ++w) {
DepthwiseConv2dPixel(in_base, DepthwiseConv2dPixel(in_base,
filter_ptr, filter_ptr,
h, h,
w, w,
h - pad_top, h - p.pad_top,
w - pad_left, w - p.pad_left,
out_width, p.out_width,
in_height, p.in_height,
in_width, p.in_width,
3, 3,
3, 3,
out_base); out_base);
...@@ -133,18 +89,18 @@ MaceStatus DepthwiseConv2dK3x3S1::Compute(const mace::OpContext *context, ...@@ -133,18 +89,18 @@ MaceStatus DepthwiseConv2dK3x3S1::Compute(const mace::OpContext *context,
vf01 = vld1q_f32(filter_ptr + 3); vf01 = vld1q_f32(filter_ptr + 3);
vf02 = vld1q_f32(filter_ptr + 5); vf02 = vld1q_f32(filter_ptr + 5);
for (h = valid_h_start; h + 1 < valid_h_stop; h += 2) { for (h = p.valid_h_start; h + 1 < p.valid_h_stop; h += 2) {
// left // left
for (w = 0; w < valid_w_start; ++w) { for (w = 0; w < p.valid_w_start; ++w) {
DepthwiseConv2dPixel(in_base, DepthwiseConv2dPixel(in_base,
filter_ptr, filter_ptr,
h, h,
w, w,
h - pad_top, h - p.pad_top,
w - pad_left, w - p.pad_left,
out_width, p.out_width,
in_height, p.in_height,
in_width, p.in_width,
3, 3,
3, 3,
out_base); out_base);
...@@ -152,17 +108,17 @@ MaceStatus DepthwiseConv2dK3x3S1::Compute(const mace::OpContext *context, ...@@ -152,17 +108,17 @@ MaceStatus DepthwiseConv2dK3x3S1::Compute(const mace::OpContext *context,
filter_ptr, filter_ptr,
h + 1, h + 1,
w, w,
h + 1 - pad_top, h + 1 - p.pad_top,
w - pad_left, w - p.pad_left,
out_width, p.out_width,
in_height, p.in_height,
in_width, p.in_width,
3, 3,
3, 3,
out_base); out_base);
} }
for (w = valid_w_start; w + 3 < valid_w_stop; w += 4) { for (w = p.valid_w_start; w + 3 < p.valid_w_stop; w += 4) {
// input (4 height x 3 slide): vi_height_slide // input (4 height x 3 slide): vi_height_slide
float32x4_t vi00, vi01, vi02, vi0n; float32x4_t vi00, vi01, vi02, vi0n;
float32x4_t vi10, vi11, vi12, vi1n; float32x4_t vi10, vi11, vi12, vi1n;
...@@ -173,17 +129,17 @@ MaceStatus DepthwiseConv2dK3x3S1::Compute(const mace::OpContext *context, ...@@ -173,17 +129,17 @@ MaceStatus DepthwiseConv2dK3x3S1::Compute(const mace::OpContext *context,
float32x4_t vo00, vo01; float32x4_t vo00, vo01;
// load input // load input
index_t in_h = h - pad_top; index_t in_h = h - p.pad_top;
index_t in_w = w - pad_left; index_t in_w = w - p.pad_left;
index_t in_offset = in_h * in_width + in_w; index_t in_offset = in_h * p.in_width + in_w;
vi00 = vld1q_f32(in_base + in_offset); vi00 = vld1q_f32(in_base + in_offset);
vi0n = vld1q_f32(in_base + in_offset + 4); vi0n = vld1q_f32(in_base + in_offset + 4);
vi10 = vld1q_f32(in_base + in_offset + in_width); vi10 = vld1q_f32(in_base + in_offset + p.in_width);
vi1n = vld1q_f32(in_base + in_offset + in_width + 4); vi1n = vld1q_f32(in_base + in_offset + p.in_width + 4);
vi20 = vld1q_f32(in_base + in_offset + 2 * in_width); vi20 = vld1q_f32(in_base + in_offset + 2 * p.in_width);
vi2n = vld1q_f32(in_base + in_offset + 2 * in_width + 4); vi2n = vld1q_f32(in_base + in_offset + 2 * p.in_width + 4);
vi30 = vld1q_f32(in_base + in_offset + 3 * in_width); vi30 = vld1q_f32(in_base + in_offset + 3 * p.in_width);
vi3n = vld1q_f32(in_base + in_offset + 3 * in_width + 4); vi3n = vld1q_f32(in_base + in_offset + 3 * p.in_width + 4);
vi01 = vextq_f32(vi00, vi0n, 1); vi01 = vextq_f32(vi00, vi0n, 1);
vi02 = vextq_f32(vi00, vi0n, 2); vi02 = vextq_f32(vi00, vi0n, 2);
...@@ -195,9 +151,9 @@ MaceStatus DepthwiseConv2dK3x3S1::Compute(const mace::OpContext *context, ...@@ -195,9 +151,9 @@ MaceStatus DepthwiseConv2dK3x3S1::Compute(const mace::OpContext *context,
vi32 = vextq_f32(vi30, vi3n, 2); vi32 = vextq_f32(vi30, vi3n, 2);
 // load output // load output
index_t out_offset = h * out_width + w; index_t out_offset = h * p.out_width + w;
vo00 = vld1q_f32(out_base + out_offset); vo00 = vld1q_f32(out_base + out_offset);
vo01 = vld1q_f32(out_base + out_offset + out_width); vo01 = vld1q_f32(out_base + out_offset + p.out_width);
#if defined(__aarch64__) #if defined(__aarch64__)
// outch 0, height 0 // outch 0, height 0
...@@ -245,20 +201,20 @@ MaceStatus DepthwiseConv2dK3x3S1::Compute(const mace::OpContext *context, ...@@ -245,20 +201,20 @@ MaceStatus DepthwiseConv2dK3x3S1::Compute(const mace::OpContext *context,
vo01 = vmlaq_lane_f32(vo01, vi32, vget_high_f32(vf02), 1); vo01 = vmlaq_lane_f32(vo01, vi32, vget_high_f32(vf02), 1);
#endif #endif
vst1q_f32(out_base + out_offset, vo00); vst1q_f32(out_base + out_offset, vo00);
vst1q_f32(out_base + out_offset + out_width, vo01); vst1q_f32(out_base + out_offset + p.out_width, vo01);
} // w } // w
// right // right
for (; w < out_width; ++w) { for (; w < p.out_width; ++w) {
DepthwiseConv2dPixel(in_base, DepthwiseConv2dPixel(in_base,
filter_ptr, filter_ptr,
h, h,
w, w,
h - pad_top, h - p.pad_top,
w - pad_left, w - p.pad_left,
out_width, p.out_width,
in_height, p.in_height,
in_width, p.in_width,
3, 3,
3, 3,
out_base); out_base);
...@@ -266,11 +222,11 @@ MaceStatus DepthwiseConv2dK3x3S1::Compute(const mace::OpContext *context, ...@@ -266,11 +222,11 @@ MaceStatus DepthwiseConv2dK3x3S1::Compute(const mace::OpContext *context,
filter_ptr, filter_ptr,
h + 1, h + 1,
w, w,
h + 1 - pad_top, h + 1 - p.pad_top,
w - pad_left, w - p.pad_left,
out_width, p.out_width,
in_height, p.in_height,
in_width, p.in_width,
3, 3,
3, 3,
out_base); out_base);
...@@ -279,17 +235,17 @@ MaceStatus DepthwiseConv2dK3x3S1::Compute(const mace::OpContext *context, ...@@ -279,17 +235,17 @@ MaceStatus DepthwiseConv2dK3x3S1::Compute(const mace::OpContext *context,
// bottom // bottom
for (; h < out_height; ++h) { for (; h < p.out_height; ++h) {
for (w = 0; w < out_width; ++w) { for (w = 0; w < p.out_width; ++w) {
DepthwiseConv2dPixel(in_base, DepthwiseConv2dPixel(in_base,
filter_ptr, filter_ptr,
h, h,
w, w,
h - pad_top, h - p.pad_top,
w - pad_left, w - p.pad_left,
out_width, p.out_width,
in_height, p.in_height,
in_width, p.in_width,
3, 3,
3, 3,
out_base); out_base);
...@@ -297,86 +253,41 @@ MaceStatus DepthwiseConv2dK3x3S1::Compute(const mace::OpContext *context, ...@@ -297,86 +253,41 @@ MaceStatus DepthwiseConv2dK3x3S1::Compute(const mace::OpContext *context,
} }
} // m } // m
} // b } // b
}, 0, batch, 1, 0, out_channels, 1); // threadpool }, 0, p.batch, 1, 0, p.out_channels, 1); // threadpool
return MaceStatus::MACE_SUCCESS; return MaceStatus::MACE_SUCCESS;
} }
MaceStatus DepthwiseConv2dK3x3S2::Compute(const mace::OpContext *context, template<>
const mace::Tensor *input, MaceStatus DepthwiseConv2dK3x3S2<float>::DoCompute(
const mace::Tensor *filter, const DepthwiseConvComputeParam &p, const float *filter_data,
mace::Tensor *output) { const float *input_data, float *output_data) {
MACE_UNUSED(context); p.thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0,
std::vector<index_t> out_shape(4);
std::vector<int> paddings(2);
auto &in_shape = input->shape();
auto &filter_shape = filter->shape();
CalOutputShapeAndInputPadSize(in_shape, filter_shape, &out_shape, &paddings);
out_shape[1] *= in_shape[1];
MACE_RETURN_IF_ERROR(output->Resize(out_shape));
output->Clear();
const int pad_top = paddings[0] / 2;
const int pad_left = paddings[1] / 2;
const index_t batch = in_shape[0];
const index_t in_channels = in_shape[1];
const index_t in_height = in_shape[2];
const index_t in_width = in_shape[3];
const index_t out_channels = out_shape[1];
const index_t out_height = out_shape[2];
const index_t out_width = out_shape[3];
const index_t in_image_size = in_height * in_width;
const index_t out_image_size = out_height * out_width;
const index_t in_batch_size = in_channels * in_image_size;
const index_t out_batch_size = out_channels * out_image_size;
const index_t multiplier = out_channels / in_channels;
std::vector<index_t> out_bounds;
CalOutputBoundaryWithoutUsingInputPad(out_shape, paddings, &out_bounds);
const index_t valid_h_start = out_bounds[0];
const index_t valid_h_stop = out_bounds[1];
const index_t valid_w_start = out_bounds[2];
const index_t valid_w_stop = out_bounds[3];
Tensor::MappingGuard in_guard(input);
Tensor::MappingGuard filter_guard(filter);
Tensor::MappingGuard out_guard(output);
auto filter_data = filter->data<float>();
auto input_data = input->data<float>();
auto output_data = output->mutable_data<float>();
utils::ThreadPool
&thread_pool = context->device()->cpu_runtime()->thread_pool();
thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0,
index_t start1, index_t end1, index_t step1) { index_t start1, index_t end1, index_t step1) {
for (index_t b = start0; b < end0; b += step0) { for (index_t b = start0; b < end0; b += step0) {
for (index_t m = start1; m < end1; m += step1) { for (index_t m = start1; m < end1; m += step1) {
index_t c = m / multiplier; index_t c = m / p.multiplier;
index_t multi_index = m % multiplier; index_t multi_index = m % p.multiplier;
const float const float
*in_base = input_data + b * in_batch_size + c * in_image_size; *in_base = input_data + b * p.in_batch_size + c * p.in_image_size;
const float const float
*filter_ptr = filter_data + multi_index * in_channels * 9 + c * 9; *filter_ptr = filter_data + multi_index * p.in_channels * 9 + c * 9;
float *out_base = output_data + b * out_batch_size + m * out_image_size; float *out_base =
output_data + b * p.out_batch_size + m * p.out_image_size;
index_t h, w; index_t h, w;
// top // top
for (h = 0; h < valid_h_start; ++h) { for (h = 0; h < p.valid_h_start; ++h) {
for (w = 0; w < out_width; ++w) { for (w = 0; w < p.out_width; ++w) {
DepthwiseConv2dPixel(in_base, DepthwiseConv2dPixel(in_base,
filter_ptr, filter_ptr,
h, h,
w, w,
h * 2 - pad_top, h * 2 - p.pad_top,
w * 2 - pad_left, w * 2 - p.pad_left,
out_width, p.out_width,
in_height, p.in_height,
in_width, p.in_width,
3, 3,
3, 3,
out_base); out_base);
...@@ -389,24 +300,24 @@ MaceStatus DepthwiseConv2dK3x3S2::Compute(const mace::OpContext *context, ...@@ -389,24 +300,24 @@ MaceStatus DepthwiseConv2dK3x3S2::Compute(const mace::OpContext *context,
vf01 = vld1q_f32(filter_ptr + 3); vf01 = vld1q_f32(filter_ptr + 3);
vf02 = vld1q_f32(filter_ptr + 5); vf02 = vld1q_f32(filter_ptr + 5);
for (h = valid_h_start; h < valid_h_stop; ++h) { for (h = p.valid_h_start; h < p.valid_h_stop; ++h) {
// left // left
for (w = 0; w < valid_w_start; ++w) { for (w = 0; w < p.valid_w_start; ++w) {
DepthwiseConv2dPixel(in_base, DepthwiseConv2dPixel(in_base,
filter_ptr, filter_ptr,
h, h,
w, w,
h * 2 - pad_top, h * 2 - p.pad_top,
w * 2 - pad_left, w * 2 - p.pad_left,
out_width, p.out_width,
in_height, p.in_height,
in_width, p.in_width,
3, 3,
3, 3,
out_base); out_base);
} }
for (w = valid_w_start; w + 3 < valid_w_stop; w += 4) { for (w = p.valid_w_start; w + 3 < p.valid_w_stop; w += 4) {
float32x4x2_t vi0, vi1, vi2; float32x4x2_t vi0, vi1, vi2;
float32x4_t vi0n, vi1n, vi2n; float32x4_t vi0n, vi1n, vi2n;
...@@ -419,19 +330,19 @@ MaceStatus DepthwiseConv2dK3x3S2::Compute(const mace::OpContext *context, ...@@ -419,19 +330,19 @@ MaceStatus DepthwiseConv2dK3x3S2::Compute(const mace::OpContext *context,
float32x4_t vo; float32x4_t vo;
// load input // load input
index_t in_h = h * 2 - pad_top; index_t in_h = h * 2 - p.pad_top;
index_t in_w = w * 2 - pad_left; index_t in_w = w * 2 - p.pad_left;
index_t in_offset = in_h * in_width + in_w; index_t in_offset = in_h * p.in_width + in_w;
vi0 = vld2q_f32(in_base + in_offset); // [0.2.4.6, 1.3.5.7] vi0 = vld2q_f32(in_base + in_offset); // [0.2.4.6, 1.3.5.7]
vi1 = vld2q_f32(in_base + in_offset + in_width); vi1 = vld2q_f32(in_base + in_offset + p.in_width);
vi2 = vld2q_f32(in_base + in_offset + 2 * in_width); vi2 = vld2q_f32(in_base + in_offset + 2 * p.in_width);
vi0n = vld1q_f32(in_base + in_offset + 8); // [8.9.10.11] vi0n = vld1q_f32(in_base + in_offset + 8); // [8.9.10.11]
vi1n = vld1q_f32(in_base + in_offset + in_width + 8); vi1n = vld1q_f32(in_base + in_offset + p.in_width + 8);
vi2n = vld1q_f32(in_base + in_offset + 2 * in_width + 8); vi2n = vld1q_f32(in_base + in_offset + 2 * p.in_width + 8);
                    // load output                                        // load output
index_t out_offset = h * out_width + w; index_t out_offset = h * p.out_width + w;
vo = vld1q_f32(out_base + out_offset); vo = vld1q_f32(out_base + out_offset);
vi00 = vi0.val[0]; // [0.2.4.6] vi00 = vi0.val[0]; // [0.2.4.6]
...@@ -471,16 +382,16 @@ MaceStatus DepthwiseConv2dK3x3S2::Compute(const mace::OpContext *context, ...@@ -471,16 +382,16 @@ MaceStatus DepthwiseConv2dK3x3S2::Compute(const mace::OpContext *context,
} // w } // w
// right // right
for (; w < out_width; ++w) { for (; w < p.out_width; ++w) {
DepthwiseConv2dPixel(in_base, DepthwiseConv2dPixel(in_base,
filter_ptr, filter_ptr,
h, h,
w, w,
h * 2 - pad_top, h * 2 - p.pad_top,
w * 2 - pad_left, w * 2 - p.pad_left,
out_width, p.out_width,
in_height, p.in_height,
in_width, p.in_width,
3, 3,
3, 3,
out_base); out_base);
...@@ -489,17 +400,17 @@ MaceStatus DepthwiseConv2dK3x3S2::Compute(const mace::OpContext *context, ...@@ -489,17 +400,17 @@ MaceStatus DepthwiseConv2dK3x3S2::Compute(const mace::OpContext *context,
// bottom // bottom
for (; h < out_height; ++h) { for (; h < p.out_height; ++h) {
for (w = 0; w < out_width; ++w) { for (w = 0; w < p.out_width; ++w) {
DepthwiseConv2dPixel(in_base, DepthwiseConv2dPixel(in_base,
filter_ptr, filter_ptr,
h, h,
w, w,
h * 2 - pad_top, h * 2 - p.pad_top,
w * 2 - pad_left, w * 2 - p.pad_left,
out_width, p.out_width,
in_height, p.in_height,
in_width, p.in_width,
3, 3,
3, 3,
out_base); out_base);
...@@ -507,23 +418,11 @@ MaceStatus DepthwiseConv2dK3x3S2::Compute(const mace::OpContext *context, ...@@ -507,23 +418,11 @@ MaceStatus DepthwiseConv2dK3x3S2::Compute(const mace::OpContext *context,
} }
} // m } // m
} // b } // b
}, 0, batch, 1, 0, out_channels, 1); }, 0, p.batch, 1, 0, p.out_channels, 1);
return MaceStatus::MACE_SUCCESS; return MaceStatus::MACE_SUCCESS;
} }
void RegisterDepthwiseConv2dK3x3Delegator(OpDelegatorRegistry *registry) {
MACE_REGISTER_DELEGATOR(
registry, DepthwiseConv2dK3x3S1, delegator::DepthwiseConv2dParam,
MACE_DELEGATOR_KEY_EX(DepthwiseConv2d, DeviceType::CPU,
float, ImplType::NEON, K3x3S1));
MACE_REGISTER_DELEGATOR(
registry, DepthwiseConv2dK3x3S2, delegator::DepthwiseConv2dParam,
MACE_DELEGATOR_KEY_EX(DepthwiseConv2d, DeviceType::CPU,
float, ImplType::NEON, K3x3S2));
}
} // namespace fp32
} // namespace arm } // namespace arm
} // namespace ops } // namespace ops
} // namespace mace } // namespace mace
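The pattern in this file repeats across the whole commit: the old fp32::DepthwiseConv2dK3x3S1/S2::Compute(context, input, filter, output) methods, which each re-derived the output shape, padding and valid-region bounds before launching the NEON loops, become float specializations of a templated DoCompute(p, filter_data, input_data, output_data) that contains only the compute loops; all bookkeeping now arrives pre-filled in the DepthwiseConvComputeParam p. The shared code lives in the new arm/base sources, which this hunk does not show, so the sketch below is only an illustration of that split (the field list is taken from the p.* accesses in the diff; DepthwiseConv2dK3x3Base and CalComputeParam are made-up names, not MACE API):

struct DepthwiseConvComputeParam {   // illustrative only
  index_t batch, in_channels, in_height, in_width;
  index_t out_channels, out_height, out_width;
  index_t in_image_size, out_image_size;
  index_t in_batch_size, out_batch_size;
  index_t multiplier;
  index_t pad_top, pad_left;
  index_t valid_h_start, valid_h_stop, valid_w_start, valid_w_stop;
  utils::ThreadPool &thread_pool;
};

template <typename T>
MaceStatus DepthwiseConv2dK3x3Base<T>::Compute(const OpContext *context,
                                               const Tensor *input,
                                               const Tensor *filter,
                                               Tensor *output) {
  // Resize the output, map the tensors and derive pad/valid ranges once,
  // the way the removed fp32 Compute() bodies used to do per kernel ...
  DepthwiseConvComputeParam p =
      CalComputeParam(context, input, filter, output);  // hypothetical helper
  // ... then hand everything to the stride-specific NEON kernel.
  return DoCompute(p, filter->data<T>(), input->data<T>(),
                   output->mutable_data<T>());
}

Keeping DoCompute() free of Tensor handling is what allows the same NEON tile code to be instantiated for other element types later without touching the loops.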
...@@ -12,69 +12,26 @@ ...@@ -12,69 +12,26 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "mace/ops/arm/fp32/depthwise_deconv_2d_3x3.h"
#include <arm_neon.h> #include <arm_neon.h>
#include "mace/ops/arm/base/depthwise_deconv_2d_3x3.h"
#include "mace/ops/arm/fp32/common_neon.h" #include "mace/ops/arm/fp32/common_neon.h"
namespace mace { namespace mace {
namespace ops { namespace ops {
namespace arm { namespace arm {
namespace fp32 {
template<>
MaceStatus DepthwiseDeconv2dK3x3S1::Compute(const OpContext *context, MaceStatus DepthwiseDeconv2dK3x3S1<float>::DoCompute(
const Tensor *input, const DepthwiseDeconvComputeParam &p, const float *filter_data,
const Tensor *filter, const float *input_data, float *padded_out_data) {
const Tensor *output_shape, p.thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0,
Tensor *output) {
std::unique_ptr<Tensor> padded_out;
std::vector<int> out_pad_size;
group_ = input->dim(1);
ResizeOutAndPadOut(context,
input,
filter,
output_shape,
output,
&out_pad_size,
&padded_out);
Tensor *out_tensor = output;
if (padded_out != nullptr) {
out_tensor = padded_out.get();
}
out_tensor->Clear();
Tensor::MappingGuard input_mapper(input);
Tensor::MappingGuard filter_mapper(filter);
Tensor::MappingGuard output_mapper(output);
auto input_data = input->data<float>();
auto filter_data = filter->data<float>();
auto padded_out_data = out_tensor->mutable_data<float>();
auto &in_shape = input->shape();
auto &out_shape = out_tensor->shape();
const index_t batch = in_shape[0];
const index_t channels = in_shape[1];
const index_t h = in_shape[2];
const index_t w = in_shape[3];
const index_t in_img_size = h * w;
const index_t outh = out_shape[2];
const index_t outw = out_shape[3];
const index_t out_img_size = outh * outw;
utils::ThreadPool
&thread_pool = context->device()->cpu_runtime()->thread_pool();
thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0,
index_t start1, index_t end1, index_t step1) { index_t start1, index_t end1, index_t step1) {
for (index_t b = start0; b < end0; b += step0) { for (index_t b = start0; b < end0; b += step0) {
for (index_t c = start1; c < end1; c += step1) { for (index_t c = start1; c < end1; c += step1) {
const index_t offset = b * channels + c; const index_t offset = b * p.in_channels + c;
float *out_base = padded_out_data + offset * out_img_size; float *out_base = padded_out_data + offset * p.out_img_size;
const float *input_base = input_data + offset * in_img_size; const float *input_base = input_data + offset * p.in_img_size;
const float *kernel_base = filter_data + c * 9; const float *kernel_base = filter_data + c * 9;
const float *in = input_base; const float *in = input_base;
const float *k0 = kernel_base; const float *k0 = kernel_base;
...@@ -86,14 +43,14 @@ MaceStatus DepthwiseDeconv2dK3x3S1::Compute(const OpContext *context, ...@@ -86,14 +43,14 @@ MaceStatus DepthwiseDeconv2dK3x3S1::Compute(const OpContext *context,
float32x4_t k1_vec = vld1q_f32(k1); float32x4_t k1_vec = vld1q_f32(k1);
float32x4_t k2_vec = vld1q_f32(k2); float32x4_t k2_vec = vld1q_f32(k2);
for (index_t i = 0; i < h; ++i) { for (index_t i = 0; i < p.in_height; ++i) {
float *out_row_base = out_base + i * outw; float *out_row_base = out_base + i * p.out_width;
float *out_row0 = out_row_base; float *out_row0 = out_row_base;
float *out_row1 = out_row_base + outw; float *out_row1 = out_row_base + p.out_width;
float *out_row2 = out_row_base + 2 * outw; float *out_row2 = out_row_base + 2 * p.out_width;
index_t j = 0; index_t j = 0;
for (; j + 3 < w; j += 4) { for (; j + 3 < p.in_width; j += 4) {
float32x4_t in_vec = vld1q_f32(in); float32x4_t in_vec = vld1q_f32(in);
float32x4_t out00, out01, out02; float32x4_t out00, out01, out02;
...@@ -142,7 +99,7 @@ MaceStatus DepthwiseDeconv2dK3x3S1::Compute(const OpContext *context, ...@@ -142,7 +99,7 @@ MaceStatus DepthwiseDeconv2dK3x3S1::Compute(const OpContext *context,
out_row2 += 4; out_row2 += 4;
} }
for (; j < w; ++j) { for (; j < p.in_width; ++j) {
float val = in[0]; float val = in[0];
for (int k = 0; k < 3; ++k) { for (int k = 0; k < 3; ++k) {
out_row0[k] += val * k0[k]; out_row0[k] += val * k0[k];
...@@ -157,66 +114,22 @@ MaceStatus DepthwiseDeconv2dK3x3S1::Compute(const OpContext *context, ...@@ -157,66 +114,22 @@ MaceStatus DepthwiseDeconv2dK3x3S1::Compute(const OpContext *context,
} }
} }
} }
}, 0, batch, 1, 0, channels, 1); }, 0, p.batch, 1, 0, p.in_channels, 1);
UnPadOutput(*out_tensor, out_pad_size, output);
return MaceStatus::MACE_SUCCESS; return MaceStatus::MACE_SUCCESS;
} }
MaceStatus DepthwiseDeconv2dK3x3S2::Compute(const OpContext *context, template<>
const Tensor *input, MaceStatus DepthwiseDeconv2dK3x3S2<float>::DoCompute(
const Tensor *filter, const DepthwiseDeconvComputeParam &p, const float *filter_data,
const Tensor *output_shape, const float *input_data, float *padded_out_data) {
Tensor *output) { p.thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0,
std::unique_ptr<Tensor> padded_out;
std::vector<int> out_pad_size;
group_ = input->dim(1);
ResizeOutAndPadOut(context,
input,
filter,
output_shape,
output,
&out_pad_size,
&padded_out);
Tensor *out_tensor = output;
if (padded_out != nullptr) {
out_tensor = padded_out.get();
}
out_tensor->Clear();
Tensor::MappingGuard input_mapper(input);
Tensor::MappingGuard filter_mapper(filter);
Tensor::MappingGuard output_mapper(output);
auto input_data = input->data<float>();
auto filter_data = filter->data<float>();
auto padded_out_data = out_tensor->mutable_data<float>();
auto &in_shape = input->shape();
auto &out_shape = out_tensor->shape();
const index_t batch = in_shape[0];
const index_t channels = in_shape[1];
const index_t h = in_shape[2];
const index_t w = in_shape[3];
const index_t in_img_size = h * w;
const index_t outh = out_shape[2];
const index_t outw = out_shape[3];
const index_t out_img_size = outh * outw;
utils::ThreadPool
&thread_pool = context->device()->cpu_runtime()->thread_pool();
thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0,
index_t start1, index_t end1, index_t step1) { index_t start1, index_t end1, index_t step1) {
for (index_t b = start0; b < end0; b += step0) { for (index_t b = start0; b < end0; b += step0) {
for (index_t c = start1; c < end1; c += step1) { for (index_t c = start1; c < end1; c += step1) {
const index_t offset = b * channels + c; const index_t offset = b * p.in_channels + c;
float *out_base = padded_out_data + offset * out_img_size; float *out_base = padded_out_data + offset * p.out_img_size;
const float *input_base = input_data + offset * in_img_size; const float *input_base = input_data + offset * p.in_img_size;
const float *kernel_base = filter_data + c * 9; const float *kernel_base = filter_data + c * 9;
const float *in = input_base; const float *in = input_base;
...@@ -228,15 +141,15 @@ MaceStatus DepthwiseDeconv2dK3x3S2::Compute(const OpContext *context, ...@@ -228,15 +141,15 @@ MaceStatus DepthwiseDeconv2dK3x3S2::Compute(const OpContext *context,
float32x4_t k1_vec = vld1q_f32(k1); float32x4_t k1_vec = vld1q_f32(k1);
float32x4_t k2_vec = vld1q_f32(k2); float32x4_t k2_vec = vld1q_f32(k2);
for (index_t i = 0; i < h; ++i) { for (index_t i = 0; i < p.in_height; ++i) {
float *out_row_base = out_base + i * 2 * outw; float *out_row_base = out_base + i * 2 * p.out_width;
float *out_row_0 = out_row_base; float *out_row_0 = out_row_base;
float *out_row_1 = out_row_0 + outw; float *out_row_1 = out_row_0 + p.out_width;
float *out_row_2 = out_row_1 + outw; float *out_row_2 = out_row_1 + p.out_width;
index_t j = 0; index_t j = 0;
for (index_t n = 0; n + 9 < outw; n += 8) { for (index_t n = 0; n + 9 < p.out_width; n += 8) {
float32x4_t in_vec = vld1q_f32(in); float32x4_t in_vec = vld1q_f32(in);
// out row 0 // out row 0
...@@ -285,7 +198,7 @@ MaceStatus DepthwiseDeconv2dK3x3S2::Compute(const OpContext *context, ...@@ -285,7 +198,7 @@ MaceStatus DepthwiseDeconv2dK3x3S2::Compute(const OpContext *context,
j += 4; j += 4;
} }
for (; j < w; ++j) { for (; j < p.in_width; ++j) {
float val = in[0]; float val = in[0];
for (int k = 0; k < 3; ++k) { for (int k = 0; k < 3; ++k) {
...@@ -302,80 +215,31 @@ MaceStatus DepthwiseDeconv2dK3x3S2::Compute(const OpContext *context, ...@@ -302,80 +215,31 @@ MaceStatus DepthwiseDeconv2dK3x3S2::Compute(const OpContext *context,
} }
} }
} }
}, 0, batch, 1, 0, channels, 1); }, 0, p.batch, 1, 0, p.in_channels, 1);
UnPadOutput(*out_tensor, out_pad_size, output);
return MaceStatus::MACE_SUCCESS; return MaceStatus::MACE_SUCCESS;
} }
MaceStatus GroupDeconv2dK3x3S1::Compute(const OpContext *context, template<>
const Tensor *input, MaceStatus GroupDeconv2dK3x3S1<float>::DoCompute(
const Tensor *filter, const GroupDeconvComputeParam &p, const float *filter_data,
const Tensor *output_shape, const float *input_data, float *padded_out_data) {
Tensor *output) { p.thread_pool.Compute3D([=](index_t start0, index_t end0, index_t step0,
std::unique_ptr<Tensor> padded_out;
std::vector<int> out_pad_size;
ResizeOutAndPadOut(context,
input,
filter,
output_shape,
output,
&out_pad_size,
&padded_out);
Tensor *out_tensor = output;
if (padded_out != nullptr) {
out_tensor = padded_out.get();
}
out_tensor->Clear();
Tensor::MappingGuard input_mapper(input);
Tensor::MappingGuard filter_mapper(filter);
Tensor::MappingGuard output_mapper(output);
auto input_data = input->data<float>();
auto filter_data = filter->data<float>();
auto padded_out_data = out_tensor->mutable_data<float>();
auto &in_shape = input->shape();
auto &out_shape = out_tensor->shape();
const index_t batch = in_shape[0];
const index_t inch = in_shape[1];
const index_t h = in_shape[2];
const index_t w = in_shape[3];
const index_t outch = out_shape[1];
const index_t outh = out_shape[2];
const index_t outw = out_shape[3];
const index_t in_img_size = h * w;
const index_t out_img_size = outh * outw;
const index_t inch_g = inch / group_;
const index_t outch_g = outch / group_;
utils::ThreadPool
&thread_pool = context->device()->cpu_runtime()->thread_pool();
thread_pool.Compute3D([=](index_t start0, index_t end0, index_t step0,
index_t start1, index_t end1, index_t step1, index_t start1, index_t end1, index_t step1,
index_t start2, index_t end2, index_t step2) { index_t start2, index_t end2, index_t step2) {
for (index_t b = start0; b < end0; b += step0) { for (index_t b = start0; b < end0; b += step0) {
for (index_t g = start1; g < end1; g += step1) { for (index_t g = start1; g < end1; g += step1) {
for (index_t oc = start2; oc < end2; oc += step2) { for (index_t oc = start2; oc < end2; oc += step2) {
if (oc + 1 < outch_g) { if (oc + 1 < p.outch_g) {
const index_t out_offset = b * outch + outch_g * g + oc; const index_t out_offset = b * p.out_channels + p.outch_g * g + oc;
float *out_base0 = padded_out_data + out_offset * out_img_size; float *out_base0 = padded_out_data + out_offset * p.out_img_size;
float *out_base1 = out_base0 + out_img_size; float *out_base1 = out_base0 + p.out_img_size;
for (index_t ic = 0; ic < inch_g; ++ic) { for (index_t ic = 0; ic < p.inch_g; ++ic) {
const index_t in_offset = b * inch + inch_g * g + ic; const index_t in_offset = b * p.in_channels + p.inch_g * g + ic;
const float *input_base = input_data + in_offset * in_img_size; const float *input_base = input_data + in_offset * p.in_img_size;
const index_t kernel_offset = (oc * group_ + g) * inch_g + ic; const index_t kernel_offset = (oc * group_ + g) * p.inch_g + ic;
const float *kernel_base0 = filter_data + kernel_offset * 9; const float *kernel_base0 = filter_data + kernel_offset * 9;
const float *kernel_base1 = kernel_base0 + inch * 9; const float *kernel_base1 = kernel_base0 + p.in_channels * 9;
const float *in = input_base; const float *in = input_base;
// output channel 0 // output channel 0
...@@ -399,20 +263,20 @@ MaceStatus GroupDeconv2dK3x3S1::Compute(const OpContext *context, ...@@ -399,20 +263,20 @@ MaceStatus GroupDeconv2dK3x3S1::Compute(const OpContext *context,
k11_vec = vld1q_f32(k1_1); k11_vec = vld1q_f32(k1_1);
k12_vec = vld1q_f32(k1_2); k12_vec = vld1q_f32(k1_2);
for (index_t i = 0; i < h; ++i) { for (index_t i = 0; i < p.in_height; ++i) {
float *out_row_base0 = out_base0 + i * outw; float *out_row_base0 = out_base0 + i * p.out_width;
float *out_row0_0 = out_row_base0; float *out_row0_0 = out_row_base0;
float *out_row0_1 = out_row_base0 + outw; float *out_row0_1 = out_row_base0 + p.out_width;
float *out_row0_2 = out_row_base0 + 2 * outw; float *out_row0_2 = out_row_base0 + 2 * p.out_width;
float *out_row_base1 = out_base1 + i * outw; float *out_row_base1 = out_base1 + i * p.out_width;
float *out_row1_0 = out_row_base1; float *out_row1_0 = out_row_base1;
float *out_row1_1 = out_row_base1 + outw; float *out_row1_1 = out_row_base1 + p.out_width;
float *out_row1_2 = out_row_base1 + 2 * outw; float *out_row1_2 = out_row_base1 + 2 * p.out_width;
index_t j = 0; index_t j = 0;
for (; j + 3 < w; j += 4) { for (; j + 3 < p.in_width; j += 4) {
float32x4_t in_vec = vld1q_f32(in); float32x4_t in_vec = vld1q_f32(in);
float32x4_t out00, out01, out02; float32x4_t out00, out01, out02;
...@@ -500,7 +364,7 @@ MaceStatus GroupDeconv2dK3x3S1::Compute(const OpContext *context, ...@@ -500,7 +364,7 @@ MaceStatus GroupDeconv2dK3x3S1::Compute(const OpContext *context,
out_row1_2 += 4; out_row1_2 += 4;
} }
for (; j < w; ++j) { for (; j < p.in_width; ++j) {
float val = in[0]; float val = in[0];
for (int k = 0; k < 3; ++k) { for (int k = 0; k < 3; ++k) {
out_row0_0[k] += val * k0_0[k]; out_row0_0[k] += val * k0_0[k];
...@@ -521,12 +385,12 @@ MaceStatus GroupDeconv2dK3x3S1::Compute(const OpContext *context, ...@@ -521,12 +385,12 @@ MaceStatus GroupDeconv2dK3x3S1::Compute(const OpContext *context,
} }
} }
} else { } else {
const index_t out_offset = b * outch + outch_g * g + oc; const index_t out_offset = b * p.out_channels + p.outch_g * g + oc;
float *out_base0 = padded_out_data + out_offset * out_img_size; float *out_base0 = padded_out_data + out_offset * p.out_img_size;
for (index_t ic = 0; ic < inch_g; ++ic) { for (index_t ic = 0; ic < p.inch_g; ++ic) {
const index_t in_offset = (b * group_ + g) * inch_g + ic; const index_t in_offset = (b * group_ + g) * p.inch_g + ic;
const float *input_base = input_data + in_offset * in_img_size; const float *input_base = input_data + in_offset * p.in_img_size;
const index_t kernel_offset = (oc * group_ + g) * inch_g + ic; const index_t kernel_offset = (oc * group_ + g) * p.inch_g + ic;
const float *kernel_base0 = filter_data + kernel_offset * 9; const float *kernel_base0 = filter_data + kernel_offset * 9;
const float *in = input_base; const float *in = input_base;
const float *k0_0 = kernel_base0; const float *k0_0 = kernel_base0;
...@@ -538,14 +402,14 @@ MaceStatus GroupDeconv2dK3x3S1::Compute(const OpContext *context, ...@@ -538,14 +402,14 @@ MaceStatus GroupDeconv2dK3x3S1::Compute(const OpContext *context,
float32x4_t k01_vec = vld1q_f32(k0_1); float32x4_t k01_vec = vld1q_f32(k0_1);
float32x4_t k02_vec = vld1q_f32(k0_2); float32x4_t k02_vec = vld1q_f32(k0_2);
for (index_t i = 0; i < h; ++i) { for (index_t i = 0; i < p.in_height; ++i) {
float *out_row_base0 = out_base0 + i * outw; float *out_row_base0 = out_base0 + i * p.out_width;
float *out_row0_0 = out_row_base0; float *out_row0_0 = out_row_base0;
float *out_row0_1 = out_row_base0 + outw; float *out_row0_1 = out_row_base0 + p.out_width;
float *out_row0_2 = out_row_base0 + 2 * outw; float *out_row0_2 = out_row_base0 + 2 * p.out_width;
index_t j = 0; index_t j = 0;
for (; j + 3 < w; j += 4) { for (; j + 3 < p.in_width; j += 4) {
float32x4_t in_vec = vld1q_f32(in); float32x4_t in_vec = vld1q_f32(in);
float32x4_t out00, out01, out02; float32x4_t out00, out01, out02;
...@@ -594,7 +458,7 @@ MaceStatus GroupDeconv2dK3x3S1::Compute(const OpContext *context, ...@@ -594,7 +458,7 @@ MaceStatus GroupDeconv2dK3x3S1::Compute(const OpContext *context,
out_row0_2 += 4; out_row0_2 += 4;
} }
for (; j < w; ++j) { for (; j < p.in_width; ++j) {
float val = in[0]; float val = in[0];
for (int k = 0; k < 3; ++k) { for (int k = 0; k < 3; ++k) {
out_row0_0[k] += val * k0_0[k]; out_row0_0[k] += val * k0_0[k];
...@@ -612,76 +476,27 @@ MaceStatus GroupDeconv2dK3x3S1::Compute(const OpContext *context, ...@@ -612,76 +476,27 @@ MaceStatus GroupDeconv2dK3x3S1::Compute(const OpContext *context,
} }
} }
} }
}, 0, batch, 1, 0, group_, 1, 0, outch_g, 2); }, 0, p.batch, 1, 0, group_, 1, 0, p.outch_g, 2);
UnPadOutput(*out_tensor, out_pad_size, output);
return MaceStatus::MACE_SUCCESS; return MaceStatus::MACE_SUCCESS;
} }
MaceStatus GroupDeconv2dK3x3S2::Compute(const OpContext *context, template<>
const Tensor *input, MaceStatus GroupDeconv2dK3x3S2<float>::DoCompute(
const Tensor *filter, const GroupDeconvComputeParam &p, const float *filter_data,
const Tensor *output_shape, const float *input_data, float *padded_out_data) {
Tensor *output) { p.thread_pool.Compute3D([=](index_t start0, index_t end0, index_t step0,
std::unique_ptr<Tensor> padded_out;
std::vector<int> out_pad_size;
ResizeOutAndPadOut(context,
input,
filter,
output_shape,
output,
&out_pad_size,
&padded_out);
Tensor *out_tensor = output;
if (padded_out != nullptr) {
out_tensor = padded_out.get();
}
out_tensor->Clear();
Tensor::MappingGuard input_mapper(input);
Tensor::MappingGuard filter_mapper(filter);
Tensor::MappingGuard output_mapper(output);
auto input_data = input->data<float>();
auto filter_data = filter->data<float>();
auto padded_out_data = out_tensor->mutable_data<float>();
auto &in_shape = input->shape();
auto &out_shape = out_tensor->shape();
const index_t batch = in_shape[0];
const index_t inch = in_shape[1];
const index_t h = in_shape[2];
const index_t w = in_shape[3];
const index_t outch = out_shape[1];
const index_t outh = out_shape[2];
const index_t outw = out_shape[3];
const index_t in_img_size = h * w;
const index_t out_img_size = outh * outw;
const index_t inch_g = inch / group_;
const index_t outch_g = outch / group_;
utils::ThreadPool
&thread_pool = context->device()->cpu_runtime()->thread_pool();
thread_pool.Compute3D([=](index_t start0, index_t end0, index_t step0,
index_t start1, index_t end1, index_t step1, index_t start1, index_t end1, index_t step1,
index_t start2, index_t end2, index_t step2) { index_t start2, index_t end2, index_t step2) {
for (index_t b = start0; b < end0; b += step0) { for (index_t b = start0; b < end0; b += step0) {
for (index_t g = start1; g < end1; g += step1) { for (index_t g = start1; g < end1; g += step1) {
for (index_t oc = start2; oc < end2; oc += step2) { for (index_t oc = start2; oc < end2; oc += step2) {
const index_t out_offset = b * outch + outch_g * g + oc; const index_t out_offset = b * p.out_channels + p.outch_g * g + oc;
float *out_base = padded_out_data + out_offset * out_img_size; float *out_base = padded_out_data + out_offset * p.out_img_size;
for (index_t ic = 0; ic < inch_g; ++ic) { for (index_t ic = 0; ic < p.inch_g; ++ic) {
const index_t in_offset = b * inch + inch_g * g + ic; const index_t in_offset = b * p.in_channels + p.inch_g * g + ic;
const float *input_base = input_data + in_offset * in_img_size; const float *input_base = input_data + in_offset * p.in_img_size;
const index_t kernel_offset = (oc * group_ + g) * inch_g + ic; const index_t kernel_offset = (oc * group_ + g) * p.inch_g + ic;
const float *kernel_base = filter_data + kernel_offset * 9; const float *kernel_base = filter_data + kernel_offset * 9;
const float *in = input_base; const float *in = input_base;
...@@ -693,15 +508,15 @@ MaceStatus GroupDeconv2dK3x3S2::Compute(const OpContext *context, ...@@ -693,15 +508,15 @@ MaceStatus GroupDeconv2dK3x3S2::Compute(const OpContext *context,
float32x4_t k1_vec = vld1q_f32(k1); float32x4_t k1_vec = vld1q_f32(k1);
float32x4_t k2_vec = vld1q_f32(k2); float32x4_t k2_vec = vld1q_f32(k2);
for (index_t i = 0; i < h; ++i) { for (index_t i = 0; i < p.in_height; ++i) {
float *out_row_base = out_base + i * 2 * outw; float *out_row_base = out_base + i * 2 * p.out_width;
float *out_row_0 = out_row_base; float *out_row_0 = out_row_base;
float *out_row_1 = out_row_0 + outw; float *out_row_1 = out_row_0 + p.out_width;
float *out_row_2 = out_row_1 + outw; float *out_row_2 = out_row_1 + p.out_width;
index_t j = 0; index_t j = 0;
for (index_t n = 0; n + 9 < outw; n += 8) { for (index_t n = 0; n + 9 < p.out_width; n += 8) {
float32x4_t in_vec = vld1q_f32(in); float32x4_t in_vec = vld1q_f32(in);
// out row 0 // out row 0
...@@ -750,7 +565,7 @@ MaceStatus GroupDeconv2dK3x3S2::Compute(const OpContext *context, ...@@ -750,7 +565,7 @@ MaceStatus GroupDeconv2dK3x3S2::Compute(const OpContext *context,
j += 4; j += 4;
} }
for (; j < w; ++j) { for (; j < p.in_width; ++j) {
float val = in[0]; float val = in[0];
for (int k = 0; k < 3; ++k) { for (int k = 0; k < 3; ++k) {
...@@ -769,36 +584,11 @@ MaceStatus GroupDeconv2dK3x3S2::Compute(const OpContext *context, ...@@ -769,36 +584,11 @@ MaceStatus GroupDeconv2dK3x3S2::Compute(const OpContext *context,
} }
} }
} }
}, 0, batch, 1, 0, group_, 1, 0, outch_g, 1); }, 0, p.batch, 1, 0, group_, 1, 0, p.outch_g, 1);
UnPadOutput(*out_tensor, out_pad_size, output);
return MaceStatus::MACE_SUCCESS; return MaceStatus::MACE_SUCCESS;
} }
void RegisterDepthwiseDeconv2dK3x3Delegator(OpDelegatorRegistry *registry) {
MACE_REGISTER_DELEGATOR(
registry, DepthwiseDeconv2dK3x3S1, delegator::DepthwiseDeconv2dParam,
MACE_DELEGATOR_KEY_EX(DepthwiseDeconv2d, DeviceType::CPU,
float, ImplType::NEON, K3x3S1));
MACE_REGISTER_DELEGATOR(
registry, DepthwiseDeconv2dK3x3S2, delegator::DepthwiseDeconv2dParam,
MACE_DELEGATOR_KEY_EX(DepthwiseDeconv2d, DeviceType::CPU,
float, ImplType::NEON, K3x3S2));
}
void RegisterGroupDeconv2dK3x3Delegator(OpDelegatorRegistry *registry) {
MACE_REGISTER_DELEGATOR(
registry, GroupDeconv2dK3x3S1, delegator::GroupDeconv2dParam,
MACE_DELEGATOR_KEY_EX(GroupDeconv2d, DeviceType::CPU,
float, ImplType::NEON, K3x3S1));
MACE_REGISTER_DELEGATOR(
registry, GroupDeconv2dK3x3S2, delegator::GroupDeconv2dParam,
MACE_DELEGATOR_KEY_EX(GroupDeconv2d, DeviceType::CPU,
float, ImplType::NEON, K3x3S2));
}
} // namespace fp32
} // namespace arm } // namespace arm
} // namespace ops } // namespace ops
} // namespace mace } // namespace mace
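The deconvolution kernels follow the same split, with one extra step: the removed Compute() bodies also created a padded output tensor (ResizeOutAndPadOut), cleared it, ran the loops into it and finally copied the valid region back with UnPadOutput. After the refactor, DoCompute() receives the already padded buffer, so the pad/unpad plumbing only has to exist once. A rough sketch of how that wrapper could look, assembled from the code deleted here (DepthwiseDeconv2dK3x3Base and CalComputeParam are illustrative names; the real base implementation in arm/base is not part of this diff):

template <typename T>
MaceStatus DepthwiseDeconv2dK3x3Base<T>::Compute(const OpContext *context,
                                                 const Tensor *input,
                                                 const Tensor *filter,
                                                 const Tensor *output_shape,
                                                 Tensor *output) {
  std::unique_ptr<Tensor> padded_out;
  std::vector<int> out_pad_size;
  group_ = input->dim(1);
  ResizeOutAndPadOut(context, input, filter, output_shape, output,
                     &out_pad_size, &padded_out);
  Tensor *out_tensor = (padded_out != nullptr) ? padded_out.get() : output;
  out_tensor->Clear();

  Tensor::MappingGuard input_mapper(input);
  Tensor::MappingGuard filter_mapper(filter);
  Tensor::MappingGuard output_mapper(output);

  DepthwiseDeconvComputeParam p =
      CalComputeParam(context, input, out_tensor);  // hypothetical helper
  MACE_RETURN_IF_ERROR(DoCompute(p, filter->data<T>(), input->data<T>(),
                                 out_tensor->mutable_data<T>()));
  UnPadOutput(*out_tensor, out_pad_size, output);
  return MaceStatus::MACE_SUCCESS;
}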
...@@ -12,69 +12,26 @@ ...@@ -12,69 +12,26 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "mace/ops/arm/fp32/depthwise_deconv_2d_4x4.h"
#include <arm_neon.h> #include <arm_neon.h>
#include "mace/ops/arm/base/depthwise_deconv_2d_4x4.h"
#include "mace/ops/arm/fp32/common_neon.h" #include "mace/ops/arm/fp32/common_neon.h"
namespace mace { namespace mace {
namespace ops { namespace ops {
namespace arm { namespace arm {
namespace fp32 {
template<>
MaceStatus DepthwiseDeconv2dK4x4S1::Compute(const OpContext *context, MaceStatus DepthwiseDeconv2dK4x4S1<float>::DoCompute(
const Tensor *input, const DepthwiseDeconvComputeParam &p, const float *filter_data,
const Tensor *filter, const float *input_data, float *padded_out_data) {
const Tensor *output_shape, p.thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0,
Tensor *output) {
std::unique_ptr<Tensor> padded_out;
std::vector<int> out_pad_size;
group_ = input->dim(1);
ResizeOutAndPadOut(context,
input,
filter,
output_shape,
output,
&out_pad_size,
&padded_out);
Tensor *out_tensor = output;
if (padded_out != nullptr) {
out_tensor = padded_out.get();
}
out_tensor->Clear();
Tensor::MappingGuard input_mapper(input);
Tensor::MappingGuard filter_mapper(filter);
Tensor::MappingGuard output_mapper(output);
auto input_data = input->data<float>();
auto filter_data = filter->data<float>();
auto padded_out_data = out_tensor->mutable_data<float>();
auto &in_shape = input->shape();
auto &out_shape = out_tensor->shape();
const index_t batch = in_shape[0];
const index_t channels = in_shape[1];
const index_t h = in_shape[2];
const index_t w = in_shape[3];
const index_t in_img_size = h * w;
const index_t outh = out_shape[2];
const index_t outw = out_shape[3];
const index_t out_img_size = outh * outw;
utils::ThreadPool
&thread_pool = context->device()->cpu_runtime()->thread_pool();
thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0,
index_t start1, index_t end1, index_t step1) { index_t start1, index_t end1, index_t step1) {
for (index_t b = start0; b < end0; b += step0) { for (index_t b = start0; b < end0; b += step0) {
for (index_t c = start1; c < end1; c += step1) { for (index_t c = start1; c < end1; c += step1) {
const index_t offset = b * channels + c; const index_t offset = b * p.in_channels + c;
float *out_base = padded_out_data + offset * out_img_size; float *out_base = padded_out_data + offset * p.out_img_size;
const float *input_base = input_data + offset * in_img_size; const float *input_base = input_data + offset * p.in_img_size;
const float *kernel_base = filter_data + c * 16; const float *kernel_base = filter_data + c * 16;
const float *in = input_base; const float *in = input_base;
const float *k0 = kernel_base; const float *k0 = kernel_base;
...@@ -87,15 +44,15 @@ MaceStatus DepthwiseDeconv2dK4x4S1::Compute(const OpContext *context, ...@@ -87,15 +44,15 @@ MaceStatus DepthwiseDeconv2dK4x4S1::Compute(const OpContext *context,
float32x4_t k2_vec = vld1q_f32(k2); float32x4_t k2_vec = vld1q_f32(k2);
float32x4_t k3_vec = vld1q_f32(k3); float32x4_t k3_vec = vld1q_f32(k3);
for (index_t i = 0; i < h; i++) { for (index_t i = 0; i < p.in_height; i++) {
float *out_row = out_base + i * outw; float *out_row = out_base + i * p.out_width;
float *out_row_0 = out_row; float *out_row_0 = out_row;
float *out_row_1 = out_row_0 + outw; float *out_row_1 = out_row_0 + p.out_width;
float *out_row_2 = out_row_1 + outw; float *out_row_2 = out_row_1 + p.out_width;
float *out_row_3 = out_row_2 + outw; float *out_row_3 = out_row_2 + p.out_width;
index_t j = 0; index_t j = 0;
for (; j + 3 < w; j += 4) { for (; j + 3 < p.in_width; j += 4) {
float32x4_t in_vec = vld1q_f32(in); float32x4_t in_vec = vld1q_f32(in);
float32x4_t out00 = vld1q_f32(out_row_0); float32x4_t out00 = vld1q_f32(out_row_0);
...@@ -172,7 +129,7 @@ MaceStatus DepthwiseDeconv2dK4x4S1::Compute(const OpContext *context, ...@@ -172,7 +129,7 @@ MaceStatus DepthwiseDeconv2dK4x4S1::Compute(const OpContext *context,
out_row_3 += 4; out_row_3 += 4;
} }
for (; j < w; j++) { for (; j < p.in_width; j++) {
float val = in[0]; float val = in[0];
for (int k = 0; k < 4; ++k) { for (int k = 0; k < 4; ++k) {
out_row_0[k] += val * k0[k]; out_row_0[k] += val * k0[k];
...@@ -189,66 +146,22 @@ MaceStatus DepthwiseDeconv2dK4x4S1::Compute(const OpContext *context, ...@@ -189,66 +146,22 @@ MaceStatus DepthwiseDeconv2dK4x4S1::Compute(const OpContext *context,
} }
} }
} }
}, 0, batch, 1, 0, channels, 1); }, 0, p.batch, 1, 0, p.in_channels, 1);
UnPadOutput(*out_tensor, out_pad_size, output);
return MaceStatus::MACE_SUCCESS; return MaceStatus::MACE_SUCCESS;
} }
MaceStatus DepthwiseDeconv2dK4x4S2::Compute(const OpContext *context, template<>
const Tensor *input, MaceStatus DepthwiseDeconv2dK4x4S2<float>::DoCompute(
const Tensor *filter, const DepthwiseDeconvComputeParam &p, const float *filter_data,
const Tensor *output_shape, const float *input_data, float *padded_out_data) {
Tensor *output) { p.thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0,
std::unique_ptr<Tensor> padded_out;
std::vector<int> out_pad_size;
group_ = input->dim(1);
ResizeOutAndPadOut(context,
input,
filter,
output_shape,
output,
&out_pad_size,
&padded_out);
Tensor *out_tensor = output;
if (padded_out != nullptr) {
out_tensor = padded_out.get();
}
out_tensor->Clear();
Tensor::MappingGuard input_mapper(input);
Tensor::MappingGuard filter_mapper(filter);
Tensor::MappingGuard output_mapper(output);
auto input_data = input->data<float>();
auto filter_data = filter->data<float>();
auto padded_out_data = out_tensor->mutable_data<float>();
auto &in_shape = input->shape();
auto &out_shape = out_tensor->shape();
const index_t batch = in_shape[0];
const index_t channels = in_shape[1];
const index_t h = in_shape[2];
const index_t w = in_shape[3];
const index_t in_img_size = h * w;
const index_t outh = out_shape[2];
const index_t outw = out_shape[3];
const index_t out_img_size = outh * outw;
utils::ThreadPool
&thread_pool = context->device()->cpu_runtime()->thread_pool();
thread_pool.Compute2D([=](index_t start0, index_t end0, index_t step0,
index_t start1, index_t end1, index_t step1) { index_t start1, index_t end1, index_t step1) {
for (index_t b = start0; b < end0; b += step0) { for (index_t b = start0; b < end0; b += step0) {
for (index_t c = start1; c < end1; c += step1) { for (index_t c = start1; c < end1; c += step1) {
const index_t offset = b * channels + c; const index_t offset = b * p.in_channels + c;
float *out_base = padded_out_data + offset * out_img_size; float *out_base = padded_out_data + offset * p.out_img_size;
const float *input_base = input_data + offset * in_img_size; const float *input_base = input_data + offset * p.in_img_size;
const float *kernel_base = filter_data + c * 16; const float *kernel_base = filter_data + c * 16;
const float *in = input_base; const float *in = input_base;
...@@ -262,17 +175,17 @@ MaceStatus DepthwiseDeconv2dK4x4S2::Compute(const OpContext *context, ...@@ -262,17 +175,17 @@ MaceStatus DepthwiseDeconv2dK4x4S2::Compute(const OpContext *context,
float32x4_t k2_vec = vld1q_f32(k2); float32x4_t k2_vec = vld1q_f32(k2);
float32x4_t k3_vec = vld1q_f32(k3); float32x4_t k3_vec = vld1q_f32(k3);
for (index_t i = 0; i < h; i++) { for (index_t i = 0; i < p.in_height; i++) {
float *out_row = out_base + 2 * i * outw; float *out_row = out_base + 2 * i * p.out_width;
float *out_row_0 = out_row; float *out_row_0 = out_row;
float *out_row_1 = out_row_0 + outw; float *out_row_1 = out_row_0 + p.out_width;
float *out_row_2 = out_row_1 + outw; float *out_row_2 = out_row_1 + p.out_width;
float *out_row_3 = out_row_2 + outw; float *out_row_3 = out_row_2 + p.out_width;
index_t j = 0; index_t j = 0;
for (index_t n = 0; n + 9 < outw; n += 8) { for (index_t n = 0; n + 9 < p.out_width; n += 8) {
float32x4_t in_vec = vld1q_f32(in); float32x4_t in_vec = vld1q_f32(in);
// row 0 // row 0
...@@ -339,7 +252,7 @@ MaceStatus DepthwiseDeconv2dK4x4S2::Compute(const OpContext *context, ...@@ -339,7 +252,7 @@ MaceStatus DepthwiseDeconv2dK4x4S2::Compute(const OpContext *context,
j += 4; j += 4;
} }
for (; j < w; j++) { for (; j < p.in_width; j++) {
float val = in[0]; float val = in[0];
for (int k = 0; k < 4; ++k) { for (int k = 0; k < 4; ++k) {
out_row_0[k] += val * k0[k]; out_row_0[k] += val * k0[k];
...@@ -356,89 +269,40 @@ MaceStatus DepthwiseDeconv2dK4x4S2::Compute(const OpContext *context, ...@@ -356,89 +269,40 @@ MaceStatus DepthwiseDeconv2dK4x4S2::Compute(const OpContext *context,
} }
} }
} }
}, 0, batch, 1, 0, channels, 1); }, 0, p.batch, 1, 0, p.in_channels, 1);
UnPadOutput(*out_tensor, out_pad_size, output);
return MaceStatus::MACE_SUCCESS; return MaceStatus::MACE_SUCCESS;
} }
MaceStatus GroupDeconv2dK4x4S1::Compute(const OpContext *context, template<>
const Tensor *input, MaceStatus GroupDeconv2dK4x4S1<float>::DoCompute(
const Tensor *filter, const GroupDeconvComputeParam &p, const float *filter_data,
const Tensor *output_shape, const float *input_data, float *padded_out_data) {
Tensor *output) { p.thread_pool.Compute3D([=](index_t start0, index_t end0, index_t step0,
std::unique_ptr<Tensor> padded_out;
std::vector<int> out_pad_size;
ResizeOutAndPadOut(context,
input,
filter,
output_shape,
output,
&out_pad_size,
&padded_out);
Tensor *out_tensor = output;
if (padded_out != nullptr) {
out_tensor = padded_out.get();
}
out_tensor->Clear();
Tensor::MappingGuard input_mapper(input);
Tensor::MappingGuard filter_mapper(filter);
Tensor::MappingGuard output_mapper(output);
auto input_data = input->data<float>();
auto filter_data = filter->data<float>();
auto padded_out_data = out_tensor->mutable_data<float>();
auto &in_shape = input->shape();
auto &out_shape = out_tensor->shape();
const index_t batch = in_shape[0];
const index_t inch = in_shape[1];
const index_t h = in_shape[2];
const index_t w = in_shape[3];
const index_t outch = out_shape[1];
const index_t outh = out_shape[2];
const index_t outw = out_shape[3];
const index_t in_img_size = h * w;
const index_t out_img_size = outh * outw;
const index_t inch_g = inch / group_;
const index_t outch_g = outch / group_;
utils::ThreadPool
&thread_pool = context->device()->cpu_runtime()->thread_pool();
thread_pool.Compute3D([=](index_t start0, index_t end0, index_t step0,
index_t start1, index_t end1, index_t step1, index_t start1, index_t end1, index_t step1,
index_t start2, index_t end2, index_t step2) { index_t start2, index_t end2, index_t step2) {
for (index_t b = start0; b < end0; b += step0) { for (index_t b = start0; b < end0; b += step0) {
for (index_t g = start1; g < end1; g += step1) { for (index_t g = start1; g < end1; g += step1) {
for (index_t oc = start2; oc < end2; oc += step2) { for (index_t oc = start2; oc < end2; oc += step2) {
if (oc + 1 < outch_g) { if (oc + 1 < p.outch_g) {
const index_t out_offset = const index_t out_offset =
(b * outch + outch_g * g + oc) * out_img_size; (b * p.out_channels + p.outch_g * g + oc) * p.out_img_size;
float *out_base = padded_out_data + out_offset; float *out_base = padded_out_data + out_offset;
float *out_base1 = out_base + out_img_size; float *out_base1 = out_base + p.out_img_size;
for (index_t ic = 0; ic < inch_g; ic++) { for (index_t ic = 0; ic < p.inch_g; ic++) {
const index_t in_offset = const index_t in_offset =
(b * inch + inch_g * g + ic) * in_img_size; (b * p.in_channels + p.inch_g * g + ic) * p.in_img_size;
const float *input_base = input_data + in_offset; const float *input_base = input_data + in_offset;
const float *in = input_base; const float *in = input_base;
const index_t kernel_offset = const index_t kernel_offset =
((oc * group_ + g) * inch_g + ic) * 16; ((oc * group_ + g) * p.inch_g + ic) * 16;
const float *kernel_base = filter_data + kernel_offset; const float *kernel_base = filter_data + kernel_offset;
const float *k0 = kernel_base; const float *k0 = kernel_base;
const float *k1 = kernel_base + 4; const float *k1 = kernel_base + 4;
const float *k2 = kernel_base + 8; const float *k2 = kernel_base + 8;
const float *k3 = kernel_base + 12; const float *k3 = kernel_base + 12;
const float *kernel_base1 = kernel_base + inch * 16; const float *kernel_base1 = kernel_base + p.in_channels * 16;
const float *k10 = kernel_base1; const float *k10 = kernel_base1;
const float *k11 = kernel_base1 + 4; const float *k11 = kernel_base1 + 4;
const float *k12 = kernel_base1 + 8; const float *k12 = kernel_base1 + 8;
...@@ -454,24 +318,24 @@ MaceStatus GroupDeconv2dK4x4S1::Compute(const OpContext *context, ...@@ -454,24 +318,24 @@ MaceStatus GroupDeconv2dK4x4S1::Compute(const OpContext *context,
float32x4_t k12_vec = vld1q_f32(k12); float32x4_t k12_vec = vld1q_f32(k12);
float32x4_t k13_vec = vld1q_f32(k13); float32x4_t k13_vec = vld1q_f32(k13);
for (index_t i = 0; i < h; i++) { for (index_t i = 0; i < p.in_height; i++) {
float *out_row = out_base + i * outw; float *out_row = out_base + i * p.out_width;
float *out_row_0 = out_row; float *out_row_0 = out_row;
float *out_row_1 = out_row_0 + outw; float *out_row_1 = out_row_0 + p.out_width;
float *out_row_2 = out_row_1 + outw; float *out_row_2 = out_row_1 + p.out_width;
float *out_row_3 = out_row_2 + outw; float *out_row_3 = out_row_2 + p.out_width;
float *out_row1 = out_base1 + i * outw; float *out_row1 = out_base1 + i * p.out_width;
float *out_row1_0 = out_row1; float *out_row1_0 = out_row1;
float *out_row1_1 = out_row1_0 + outw; float *out_row1_1 = out_row1_0 + p.out_width;
float *out_row1_2 = out_row1_1 + outw; float *out_row1_2 = out_row1_1 + p.out_width;
float *out_row1_3 = out_row1_2 + outw; float *out_row1_3 = out_row1_2 + p.out_width;
index_t j = 0; index_t j = 0;
for (; j + 3 < w; j += 4) { for (; j + 3 < p.in_width; j += 4) {
float32x4_t in_vec = vld1q_f32(in); float32x4_t in_vec = vld1q_f32(in);
float32x4_t out00, out01, out02, out03; float32x4_t out00, out01, out02, out03;
float32x4_t out10, out11, out12, out13; float32x4_t out10, out11, out12, out13;
...@@ -618,7 +482,7 @@ MaceStatus GroupDeconv2dK4x4S1::Compute(const OpContext *context, ...@@ -618,7 +482,7 @@ MaceStatus GroupDeconv2dK4x4S1::Compute(const OpContext *context,
out_row1_3 += 4; out_row1_3 += 4;
} }
for (; j < w; j++) { for (; j < p.in_width; j++) {
float val = in[0]; float val = in[0];
for (int k = 0; k < 4; ++k) { for (int k = 0; k < 4; ++k) {
out_row_0[k] += val * k0[k]; out_row_0[k] += val * k0[k];
...@@ -644,13 +508,13 @@ MaceStatus GroupDeconv2dK4x4S1::Compute(const OpContext *context, ...@@ -644,13 +508,13 @@ MaceStatus GroupDeconv2dK4x4S1::Compute(const OpContext *context,
} }
} else { } else {
const index_t out_offset = const index_t out_offset =
(b * outch + outch_g * g + oc) * out_img_size; (b * p.out_channels + p.outch_g * g + oc) * p.out_img_size;
float *out_base = padded_out_data + out_offset; float *out_base = padded_out_data + out_offset;
for (index_t ic = 0; ic < inch_g; ++ic) { for (index_t ic = 0; ic < p.inch_g; ++ic) {
const index_t in_offset = const index_t in_offset =
(b * inch + inch_g * g + ic) * in_img_size; (b * p.in_channels + p.inch_g * g + ic) * p.in_img_size;
const index_t kernel_offset = const index_t kernel_offset =
((oc * group_ + g) * inch_g + ic) * 16; ((oc * group_ + g) * p.inch_g + ic) * 16;
const float *input_base = input_data + in_offset; const float *input_base = input_data + in_offset;
const float *kernel_base = filter_data + kernel_offset; const float *kernel_base = filter_data + kernel_offset;
...@@ -665,15 +529,15 @@ MaceStatus GroupDeconv2dK4x4S1::Compute(const OpContext *context, ...@@ -665,15 +529,15 @@ MaceStatus GroupDeconv2dK4x4S1::Compute(const OpContext *context,
float32x4_t k2_vec = vld1q_f32(k2); float32x4_t k2_vec = vld1q_f32(k2);
float32x4_t k3_vec = vld1q_f32(k3); float32x4_t k3_vec = vld1q_f32(k3);
for (index_t i = 0; i < h; i++) { for (index_t i = 0; i < p.in_height; i++) {
float *out_row = out_base + i * outw; float *out_row = out_base + i * p.out_width;
float *out_row_0 = out_row; float *out_row_0 = out_row;
float *out_row_1 = out_row_0 + outw; float *out_row_1 = out_row_0 + p.out_width;
float *out_row_2 = out_row_1 + outw; float *out_row_2 = out_row_1 + p.out_width;
float *out_row_3 = out_row_2 + outw; float *out_row_3 = out_row_2 + p.out_width;
index_t j = 0; index_t j = 0;
for (; j + 3 < w; j += 4) { for (; j + 3 < p.in_width; j += 4) {
float32x4_t in_vec = vld1q_f32(in); float32x4_t in_vec = vld1q_f32(in);
float32x4_t out00 = vld1q_f32(out_row_0); float32x4_t out00 = vld1q_f32(out_row_0);
...@@ -750,7 +614,7 @@ MaceStatus GroupDeconv2dK4x4S1::Compute(const OpContext *context, ...@@ -750,7 +614,7 @@ MaceStatus GroupDeconv2dK4x4S1::Compute(const OpContext *context,
out_row_3 += 4; out_row_3 += 4;
} }
for (; j < w; j++) { for (; j < p.in_width; j++) {
float val = in[0]; float val = in[0];
for (int k = 0; k < 4; ++k) { for (int k = 0; k < 4; ++k) {
out_row_0[k] += val * k0[k]; out_row_0[k] += val * k0[k];
...@@ -770,78 +634,29 @@ MaceStatus GroupDeconv2dK4x4S1::Compute(const OpContext *context, ...@@ -770,78 +634,29 @@ MaceStatus GroupDeconv2dK4x4S1::Compute(const OpContext *context,
} }
} }
} }
}, 0, batch, 1, 0, group_, 1, 0, outch_g, 2); }, 0, p.batch, 1, 0, group_, 1, 0, p.outch_g, 2);
UnPadOutput(*out_tensor, out_pad_size, output);
return MaceStatus::MACE_SUCCESS; return MaceStatus::MACE_SUCCESS;
} }
MaceStatus GroupDeconv2dK4x4S2::Compute(const OpContext *context, template<>
const Tensor *input, MaceStatus GroupDeconv2dK4x4S2<float>::DoCompute(
const Tensor *filter, const GroupDeconvComputeParam &p, const float *filter_data,
const Tensor *output_shape, const float *input_data, float *padded_out_data) {
Tensor *output) { p.thread_pool.Compute3D([=](index_t start0, index_t end0, index_t step0,
std::unique_ptr<Tensor> padded_out;
std::vector<int> out_pad_size;
ResizeOutAndPadOut(context,
input,
filter,
output_shape,
output,
&out_pad_size,
&padded_out);
Tensor *out_tensor = output;
if (padded_out != nullptr) {
out_tensor = padded_out.get();
}
out_tensor->Clear();
Tensor::MappingGuard input_mapper(input);
Tensor::MappingGuard filter_mapper(filter);
Tensor::MappingGuard output_mapper(output);
auto input_data = input->data<float>();
auto filter_data = filter->data<float>();
auto padded_out_data = out_tensor->mutable_data<float>();
auto &in_shape = input->shape();
auto &out_shape = out_tensor->shape();
const index_t batch = in_shape[0];
const index_t inch = in_shape[1];
const index_t h = in_shape[2];
const index_t w = in_shape[3];
const index_t outch = out_shape[1];
const index_t outh = out_shape[2];
const index_t outw = out_shape[3];
const index_t in_img_size = h * w;
const index_t out_img_size = outh * outw;
const index_t inch_g = inch / group_;
const index_t outch_g = outch / group_;
utils::ThreadPool
&thread_pool = context->device()->cpu_runtime()->thread_pool();
thread_pool.Compute3D([=](index_t start0, index_t end0, index_t step0,
index_t start1, index_t end1, index_t step1, index_t start1, index_t end1, index_t step1,
index_t start2, index_t end2, index_t step2) { index_t start2, index_t end2, index_t step2) {
for (index_t b = start0; b < end0; b += step0) { for (index_t b = start0; b < end0; b += step0) {
for (index_t g = start1; g < end1; g += step1) { for (index_t g = start1; g < end1; g += step1) {
for (index_t oc = start2; oc < end2; oc += step2) { for (index_t oc = start2; oc < end2; oc += step2) {
const index_t out_offset = const index_t out_offset =
(b * outch + outch_g * g + oc) * out_img_size; (b * p.out_channels + p.outch_g * g + oc) * p.out_img_size;
float *out_base = padded_out_data + out_offset; float *out_base = padded_out_data + out_offset;
for (index_t ic = 0; ic < inch_g; ic++) { for (index_t ic = 0; ic < p.inch_g; ic++) {
const index_t in_offset = const index_t in_offset =
(b * inch + inch_g * g + ic) * in_img_size; (b * p.in_channels + p.inch_g * g + ic) * p.in_img_size;
const index_t kernel_offset = const index_t kernel_offset =
((oc * group_ + g) * inch_g + ic) * 16; ((oc * group_ + g) * p.inch_g + ic) * 16;
const float *input_base = input_data + in_offset; const float *input_base = input_data + in_offset;
const float *kernel_base = filter_data + kernel_offset; const float *kernel_base = filter_data + kernel_offset;
const float *in = input_base; const float *in = input_base;
...@@ -856,17 +671,17 @@ MaceStatus GroupDeconv2dK4x4S2::Compute(const OpContext *context, ...@@ -856,17 +671,17 @@ MaceStatus GroupDeconv2dK4x4S2::Compute(const OpContext *context,
float32x4_t k2_vec = vld1q_f32(k2); float32x4_t k2_vec = vld1q_f32(k2);
float32x4_t k3_vec = vld1q_f32(k3); float32x4_t k3_vec = vld1q_f32(k3);
for (index_t i = 0; i < h; i++) { for (index_t i = 0; i < p.in_height; i++) {
float *out_row = out_base + 2 * i * outw; float *out_row = out_base + 2 * i * p.out_width;
float *out_row_0 = out_row; float *out_row_0 = out_row;
float *out_row_1 = out_row_0 + outw; float *out_row_1 = out_row_0 + p.out_width;
float *out_row_2 = out_row_1 + outw; float *out_row_2 = out_row_1 + p.out_width;
float *out_row_3 = out_row_2 + outw; float *out_row_3 = out_row_2 + p.out_width;
index_t j = 0; index_t j = 0;
for (index_t n = 0; n + 9 < outw; n += 8) { for (index_t n = 0; n + 9 < p.out_width; n += 8) {
float32x4_t in_vec = vld1q_f32(in); float32x4_t in_vec = vld1q_f32(in);
// row 0 // row 0
...@@ -933,7 +748,7 @@ MaceStatus GroupDeconv2dK4x4S2::Compute(const OpContext *context, ...@@ -933,7 +748,7 @@ MaceStatus GroupDeconv2dK4x4S2::Compute(const OpContext *context,
j += 4; j += 4;
} }
for (; j < w; j++) { for (; j < p.in_width; j++) {
float val = in[0]; float val = in[0];
for (int k = 0; k < 4; ++k) { for (int k = 0; k < 4; ++k) {
out_row_0[k] += val * k0[k]; out_row_0[k] += val * k0[k];
...@@ -952,36 +767,11 @@ MaceStatus GroupDeconv2dK4x4S2::Compute(const OpContext *context, ...@@ -952,36 +767,11 @@ MaceStatus GroupDeconv2dK4x4S2::Compute(const OpContext *context,
} }
} }
} }
}, 0, batch, 1, 0, group_, 1, 0, outch_g, 1); }, 0, p.batch, 1, 0, group_, 1, 0, p.outch_g, 1);
UnPadOutput(*out_tensor, out_pad_size, output);
return MaceStatus::MACE_SUCCESS; return MaceStatus::MACE_SUCCESS;
} }
void RegisterDepthwiseDeconv2dK4x4Delegator(OpDelegatorRegistry *registry) {
MACE_REGISTER_DELEGATOR(
registry, DepthwiseDeconv2dK4x4S1, delegator::DepthwiseDeconv2dParam,
MACE_DELEGATOR_KEY_EX(DepthwiseDeconv2d, DeviceType::CPU,
float, ImplType::NEON, K4x4S1));
MACE_REGISTER_DELEGATOR(
registry, DepthwiseDeconv2dK4x4S2, delegator::DepthwiseDeconv2dParam,
MACE_DELEGATOR_KEY_EX(DepthwiseDeconv2d, DeviceType::CPU,
float, ImplType::NEON, K4x4S2));
}
void RegisterGroupDeconv2dK4x4Delegator(OpDelegatorRegistry *registry) {
MACE_REGISTER_DELEGATOR(
registry, GroupDeconv2dK4x4S1, delegator::GroupDeconv2dParam,
MACE_DELEGATOR_KEY_EX(GroupDeconv2d, DeviceType::CPU,
float, ImplType::NEON, K4x4S1));
MACE_REGISTER_DELEGATOR(
registry, GroupDeconv2dK4x4S2, delegator::GroupDeconv2dParam,
MACE_DELEGATOR_KEY_EX(GroupDeconv2d, DeviceType::CPU,
float, ImplType::NEON, K4x4S2));
}
} // namespace fp32
} // namespace arm } // namespace arm
} // namespace ops } // namespace ops
} // namespace mace } // namespace mace
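Finally, note that the per-file Register*Delegator functions disappear from the fp32 sources. The delegator keys themselves (MACE_DELEGATOR_KEY_EX with DeviceType::CPU, float, ImplType::NEON and the kernel/stride tag) are unchanged, so registration presumably moves next to the new templated classes under arm/base. As an illustration only -- the new registration site is not visible in this hunk and the <float> template argument is an assumption -- it would look roughly like the removed code with the typed class names:

void RegisterDepthwiseDeconv2dK4x4Delegator(OpDelegatorRegistry *registry) {
  MACE_REGISTER_DELEGATOR(
      registry, DepthwiseDeconv2dK4x4S1<float>,
      delegator::DepthwiseDeconv2dParam,
      MACE_DELEGATOR_KEY_EX(DepthwiseDeconv2d, DeviceType::CPU,
                            float, ImplType::NEON, K4x4S1));
  MACE_REGISTER_DELEGATOR(
      registry, DepthwiseDeconv2dK4x4S2<float>,
      delegator::DepthwiseDeconv2dParam,
      MACE_DELEGATOR_KEY_EX(DepthwiseDeconv2d, DeviceType::CPU,
                            float, ImplType::NEON, K4x4S2));
}

Because the keys are stable, operators that look delegators up by kernel size and stride are unaffected by where the registration code lives.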
......
...@@ -12,527 +12,570 @@ ...@@ -12,527 +12,570 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "mace/ops/arm/fp32/gemm.h"
#include <arm_neon.h> #include <arm_neon.h>
#include <algorithm> #include <algorithm>
#include <utility> #include <utility>
#include "mace/ops/arm/base/gemm.h"
#include "mace/port/env.h" #include "mace/port/env.h"
namespace mace { namespace mace {
namespace ops { namespace ops {
namespace arm { namespace arm {
namespace fp32 {
enum { kNoCache, kCacheLhs, kCacheRhs }; template<>
template<>
MaceStatus Gemm::Compute(const OpContext *context, void Gemm<float>::Pack<4, 4>(const MatrixMap<const float> &matrix,
const Tensor *lhs, MatrixMajor dst_major,
const Tensor *rhs, float *packed_matrix) {
const index_t batch, const index_t rows = matrix.rows();
const index_t rows, const index_t cols = matrix.cols();
const index_t cols,
const index_t depth,
const MatrixMajor lhs_major,
const MatrixMajor rhs_major,
const MatrixMajor output_major,
const bool lhs_batched,
const bool rhs_batched,
Tensor *output) {
MACE_CHECK(output->size() == batch * rows * cols,
"Need resize output tensor before call gemm.");
Tensor::MappingGuard lhs_guard(lhs);
Tensor::MappingGuard rhs_guard(rhs);
Tensor::MappingGuard output_guard(output);
const float *lhs_data = lhs->data<float>();
const float *rhs_data = rhs->data<float>();
float *output_data = output->mutable_data<float>();
#ifdef __aarch64__ // use the same terminology as GemmLowp:
const index_t row_block_size = 8; // depth is depth, width is the opposite dim other than depth
#else // lhs
const index_t row_block_size = 4; index_t width = rows;
#endif index_t depth = cols;
const index_t col_block_size = 8; index_t width_stride = matrix.rows_stride();
const index_t depth_block_size = 4; index_t depth_stride = matrix.cols_stride();
const index_t row_block_count = RoundUpDiv(rows, row_block_size); if (dst_major == RowMajor) {
const index_t col_block_count = RoundUpDiv(cols, col_block_size); // rhs
const index_t rows_padded = RoundUp(rows, row_block_size); std::swap(width, depth);
const index_t cols_padded = RoundUp(cols, col_block_size); std::swap(width_stride, depth_stride);
const index_t depth_padded = RoundUp(depth, depth_block_size); }
const float *data = matrix.data();
float *packed_ptr = packed_matrix;
ScratchBuffer *scratch = context->device()->scratch_buffer(); const index_t block_size = 4;
const index_t depth_padded = RoundUp(depth, static_cast<index_t>(4));
index_t packed_lhs_size = if (depth_padded > depth) {
PadAlignSize(sizeof(float) * rows_padded * depth_padded); memset(packed_ptr + depth * block_size,
index_t packed_rhs_size = 0,
PadAlignSize(sizeof(float) * depth_padded * cols_padded); sizeof(float) * (depth_padded - depth) * block_size);
index_t packed_output_size = }
PadAlignSize(sizeof(float) * rows_padded * cols_padded);
// resize to the total size of lhs & rhs & output anyway,
// in case we do not cache const tensor for saving memory
MACE_RETURN_IF_ERROR(scratch->GrowSize(
packed_lhs_size + packed_rhs_size + packed_output_size));
float *packed_lhs_data =
scratch->Scratch(packed_lhs_size).mutable_data<float>();
float *packed_rhs_data =
scratch->Scratch(packed_rhs_size).mutable_data<float>();
float *packed_output_data =
scratch->Scratch(packed_output_size).mutable_data<float>();
int cache_side = kNoCache; if (dst_major == matrix.matrix_major()) {
if (cached_ == kCacheLhs) { if (width < block_size) {
packed_lhs_data = pack_cache_.mutable_data<float>(); const index_t width_remain = block_size - width;
} else if (cached_ == kCacheRhs) { for (index_t d = 0; d < depth; ++d) {
packed_rhs_data = pack_cache_.mutable_data<float>(); memcpy(packed_ptr, data, sizeof(float) * width);
} else if (should_cache_pack_) { memset(packed_ptr + width, 0, sizeof(float) * width_remain);
if (lhs->is_weight() && (!lhs_batched || batch == 1)) { data += depth_stride;
cache_side = kCacheLhs; packed_ptr += block_size;
pack_cache_.Resize(packed_lhs_size); }
packed_lhs_data = pack_cache_.mutable_data<float>(); } else {
} else if (rhs->is_weight() && (!rhs_batched || batch == 1)) { for (index_t d = 0; d < depth; ++d) {
cache_side = kCacheRhs; float32x4_t vi = vld1q_f32(data);
pack_cache_.Resize(packed_rhs_size); vst1q_f32(packed_ptr, vi);
packed_rhs_data = pack_cache_.mutable_data<float>(); data += depth_stride;
packed_ptr += block_size;
} }
} }
} else {
if (width < block_size) {
const index_t width_remain = block_size - width;
for (index_t d = 0; d < depth; ++d) {
for (index_t w = 0; w < width; ++w) {
packed_ptr[w] = data[w * width_stride + d];
} // w
memset(packed_ptr + width, 0, sizeof(float) * width_remain);
packed_ptr += block_size;
} // d
} else {
const float *data0 = data;
const float *data1 = data + width_stride;
const float *data2 = data1 + width_stride;
const float *data3 = data2 + width_stride;
utils::ThreadPool const index_t depth_block = depth / 4;
&thread_pool = context->device()->cpu_runtime()->thread_pool(); const index_t depth_remain = depth - depth_block * 4;
for (index_t depth_block_idx = 0; depth_block_idx < depth_block;
++depth_block_idx) {
float32x4_t v0 = vld1q_f32(data0);
float32x4_t v1 = vld1q_f32(data1);
float32x4_t v2 = vld1q_f32(data2);
float32x4_t v3 = vld1q_f32(data3);
float32x4x2_t v02_intertwined = vzipq_f32(v0, v2);
float32x4x2_t v13_intertwined = vzipq_f32(v1, v3);
float32x4x2_t v0123_intertwined =
vzipq_f32(v02_intertwined.val[0], v13_intertwined.val[0]);
float32x4x2_t v0123n_intertwined =
vzipq_f32(v02_intertwined.val[1], v13_intertwined.val[1]);
for (index_t b = 0; b < batch; ++b) { vst1q_f32(packed_ptr, v0123_intertwined.val[0]);
MatrixMap<const float> packed_ptr += 4;
lhs_matrix
(lhs_data + static_cast<index_t>(lhs_batched) * b * rows * depth,
lhs_major,
rows,
depth);
MatrixMap<const float>
rhs_matrix
(rhs_data + static_cast<index_t>(rhs_batched) * b * depth * cols,
rhs_major,
depth,
cols);
MatrixMap<float> output_matrix
(output_data + b * rows * cols, output_major, rows, cols);
// pack lhs vst1q_f32(packed_ptr, v0123_intertwined.val[1]);
if (cached_ != kCacheLhs) { packed_ptr += 4;
thread_pool.Compute1D([=, &lhs_matrix](index_t start,
index_t end,
index_t step) {
for (index_t row_block_idx = start; row_block_idx < end;
row_block_idx += step) {
const index_t start_row = row_block_idx * row_block_size;
const index_t
row_block_len = std::min(row_block_size, rows - start_row);
float *packed_lhs_data_block =
packed_lhs_data + row_block_idx * row_block_size * depth_padded;
PackLhs(lhs_matrix.block(start_row, 0, row_block_len, depth),
packed_lhs_data_block);
}
}, 0, row_block_count, 1);
if (cache_side == kCacheLhs) { vst1q_f32(packed_ptr, v0123n_intertwined.val[0]);
cached_ = kCacheLhs; packed_ptr += 4;
if (lhs->UnderlyingBuffer()->OnHost()) {
AdviseFree(reinterpret_cast<void *>(const_cast<float *>(lhs->data< vst1q_f32(packed_ptr, v0123n_intertwined.val[1]);
float>())), packed_ptr += 4;
lhs->raw_size());
data0 += 4;
data1 += 4;
data2 += 4;
data3 += 4;
} }
for (index_t d = 0; d < depth_remain; ++d) {
float32x4_t vi = {*data0, *data1, *data2, *data3};
vst1q_f32(packed_ptr, vi);
packed_ptr += 4;
++data0;
++data1;
++data2;
++data3;
} // d
} }
} }
}
// pack rhs template<>
if (cached_ != kCacheRhs) { template<>
thread_pool.Compute1D([=, &rhs_matrix](index_t start, void Gemm<float>::Pack<8, 4>(const MatrixMap<const float> &matrix,
index_t end, MatrixMajor dst_major,
index_t step) { float *packed_matrix) {
for (index_t col_block_idx = start; col_block_idx < end; const index_t rows = matrix.rows();
col_block_idx += step) { const index_t cols = matrix.cols();
const index_t start_col = col_block_idx * col_block_size;
const index_t
col_block_len = std::min(col_block_size, cols - start_col);
float *packed_rhs_data_block =
packed_rhs_data + col_block_idx * col_block_size * depth_padded;
PackRhs(rhs_matrix.block(0, start_col, depth, col_block_len),
packed_rhs_data_block);
}
}, 0, col_block_count, 1);
if (cache_side == kCacheRhs) { // use the same terminology as GemmLowp:
cached_ = kCacheRhs; // depth is depth, width is the opposite dim other than depth
if (rhs->UnderlyingBuffer()->OnHost()) { // lhs
AdviseFree(reinterpret_cast<void *>(const_cast<float *>(rhs->data< index_t width = rows;
float>())), index_t depth = cols;
rhs->raw_size()); index_t width_stride = matrix.rows_stride();
} index_t depth_stride = matrix.cols_stride();
if (dst_major == RowMajor) {
// rhs
std::swap(width, depth);
std::swap(width_stride, depth_stride);
} }
const float *data = matrix.data();
float *packed_ptr = packed_matrix;
const index_t block_size = 8;
const index_t depth_padded = RoundUp(depth, static_cast<index_t>(4));
if (depth_padded > depth) {
memset(packed_ptr + depth * block_size,
0,
sizeof(float) * (depth_padded - depth) * block_size);
} }
// multiply lhs and rhs if (dst_major == matrix.matrix_major()) {
thread_pool.Compute1D([=, &output_matrix](index_t start, if (width < block_size) {
index_t end, const index_t width_remain = block_size - width;
index_t step) {
for (index_t row_block_idx = start; row_block_idx < end;
row_block_idx += step) {
const index_t start_row = row_block_idx * row_block_size;
const index_t
row_block_len = std::min(row_block_size, rows - start_row);
const float *packed_lhs_data_block =
packed_lhs_data + row_block_idx * row_block_size * depth_padded;
for (index_t col_block_idx = 0; col_block_idx < col_block_count;
++col_block_idx) {
const index_t start_col = col_block_idx * col_block_size;
const index_t
col_block_len = std::min(col_block_size, cols - start_col);
const float *packed_rhs_data_block =
packed_rhs_data + col_block_idx * col_block_size * depth_padded;
float *packed_output_data_block =
packed_output_data + row_block_idx * row_block_size * cols_padded
+ col_block_idx * col_block_size;
ComputeBlock(packed_lhs_data_block,
packed_rhs_data_block,
depth_padded,
packed_output_data_block);
MatrixMap<float> output_block = output_matrix.block(start_row,
start_col,
row_block_len,
col_block_len);
UnpackOutput(packed_output_data_block, &output_block);
} // col_block_idx
} // row_block_idx
}, 0, row_block_count, 1);
} // b
return MaceStatus::MACE_SUCCESS;
}
void Gemm::ComputeBlock(const float *packed_lhs_data,
const float *packed_rhs_data,
const index_t depth_padded,
float *packed_output_data) {
/* Ref:
for (index_t r = 0; r < block_size; ++r) {
for (index_t c = 0; c < block_size; ++c) {
float sum = 0;
for (index_t d = 0; d < depth; ++d) { for (index_t d = 0; d < depth; ++d) {
// (r, d) * (d, c) memcpy(packed_ptr, data, sizeof(float) * width);
sum += packed_lhs_data[d * r_block_size + r] memset(packed_ptr + width, 0, sizeof(float) * width_remain);
* packed_rhs_data[d * c_block_size + c]; data += depth_stride;
packed_ptr += block_size;
} }
packed_output_data[r * c_block_size + c] = sum; } else {
for (index_t d = 0; d < depth; ++d) {
float32x4_t vi = vld1q_f32(data);
vst1q_f32(packed_ptr, vi);
float32x4_t vin = vld1q_f32(data + 4);
vst1q_f32(packed_ptr + 4, vin);
data += depth_stride;
packed_ptr += block_size;
} }
} }
*/ } else {
const float *lhs_ptr = packed_lhs_data; if (width < block_size) {
const float *rhs_ptr = packed_rhs_data; const index_t width_remain = block_size - width;
for (index_t d = 0; d < depth; ++d) {
for (index_t w = 0; w < width; ++w) {
packed_ptr[w] = data[w * width_stride + d];
} // w
memset(packed_ptr + width, 0, sizeof(float) * width_remain);
packed_ptr += block_size;
} // d
} else {
const float *data0 = data;
const float *data1 = data + width_stride;
const float *data2 = data1 + width_stride;
const float *data3 = data2 + width_stride;
const float *data4 = data3 + width_stride;
const float *data5 = data4 + width_stride;
const float *data6 = data5 + width_stride;
const float *data7 = data6 + width_stride;
const index_t depth_block_count = depth_padded / 4; const index_t depth_block = depth / 4;
const index_t depth_remain = depth - depth_block * 4;
for (index_t depth_block_idx = 0; depth_block_idx < depth_block;
++depth_block_idx) {
float32x4_t v0 = vld1q_f32(data0);
float32x4_t v1 = vld1q_f32(data1);
float32x4_t v2 = vld1q_f32(data2);
float32x4_t v3 = vld1q_f32(data3);
float32x4x2_t v02_intertwined = vzipq_f32(v0, v2);
float32x4x2_t v13_intertwined = vzipq_f32(v1, v3);
float32x4x2_t v0123_intertwined =
vzipq_f32(v02_intertwined.val[0], v13_intertwined.val[0]);
float32x4x2_t v0123n_intertwined =
vzipq_f32(v02_intertwined.val[1], v13_intertwined.val[1]);
#ifdef __aarch64__ float32x4_t v4 = vld1q_f32(data4);
// Register layout: (8x4) x (4,8) float32x4_t v5 = vld1q_f32(data5);
// float32x4_t v6 = vld1q_f32(data6);
// +--------+--------+ float32x4_t v7 = vld1q_f32(data7);
// | v8 ... | v9 ... | float32x4x2_t v46_intertwined = vzipq_f32(v4, v6);
// Rhs +--------+--------+ float32x4x2_t v57_intertwined = vzipq_f32(v5, v7);
// | v10... | v11... | float32x4x2_t v4567_intertwined =
// +--------+--------+ vzipq_f32(v46_intertwined.val[0], v57_intertwined.val[0]);
// | v12... | v13... | float32x4x2_t v4567n_intertwined =
// +--------+--------+ vzipq_f32(v46_intertwined.val[1], v57_intertwined.val[1]);
// | v14... | v15... |
// +--------+--------+
//
// Lhs
//
// +----+----+----+----+ - - +--------+--------+
// | v0 | v2 | v4 | v6 | | v16... | v17... |
// | . | | | | | v18... | v19... |
// | . | | | | | v20... | v21... |
// | . | | | | | v22... | v23... |
// +----+----|----+----+ +--------+--------+
// | v1 | v3 | v5 | v7 | | v24... | v25... |
// | . | | | | | v26... | v27... |
// | . | | | | | v28... | v29... |
// | . | | | | | v30... | v31... |
// +----+----|----+----+ +--------+--------+
//
// Accumulator
//
if (depth_block_count > 0) { vst1q_f32(packed_ptr, v0123_intertwined.val[0]);
index_t r_depth_block_count = depth_block_count; packed_ptr += 4;
// just make compiler happy
MACE_UNUSED(r_depth_block_count);
asm volatile( vst1q_f32(packed_ptr, v4567_intertwined.val[0]);
"dup v16.4s, wzr \n" packed_ptr += 4;
"dup v17.4s, wzr \n"
"dup v18.4s, wzr \n"
"dup v19.4s, wzr \n"
"dup v20.4s, wzr \n"
"dup v21.4s, wzr \n"
"dup v22.4s, wzr \n"
"dup v23.4s, wzr \n"
"dup v24.4s, wzr \n"
"dup v25.4s, wzr \n"
"dup v26.4s, wzr \n"
"dup v27.4s, wzr \n"
"dup v28.4s, wzr \n"
"dup v29.4s, wzr \n"
"dup v30.4s, wzr \n"
"dup v31.4s, wzr \n"
// prelogue vst1q_f32(packed_ptr, v0123_intertwined.val[1]);
"ld1 {v0.4s}, [%[lhs_ptr]], #16 \n" packed_ptr += 4;
"ld1 {v1.4s}, [%[lhs_ptr]], #16 \n"
"ld1 {v2.4s}, [%[lhs_ptr]], #16 \n"
"ld1 {v3.4s}, [%[lhs_ptr]], #16 \n"
"ld1 {v4.4s}, [%[lhs_ptr]], #16 \n"
"ld1 {v5.4s}, [%[lhs_ptr]], #16 \n"
"ld1 {v6.4s}, [%[lhs_ptr]], #16 \n"
"ld1 {v7.4s}, [%[lhs_ptr]], #16 \n"
"ld1 {v8.4s}, [%[rhs_ptr]], #16 \n" vst1q_f32(packed_ptr, v4567_intertwined.val[1]);
"ld1 {v9.4s}, [%[rhs_ptr]], #16 \n" packed_ptr += 4;
"ld1 {v10.4s}, [%[rhs_ptr]], #16 \n"
"ld1 {v11.4s}, [%[rhs_ptr]], #16 \n"
"ld1 {v12.4s}, [%[rhs_ptr]], #16 \n"
"ld1 {v13.4s}, [%[rhs_ptr]], #16 \n"
"ld1 {v14.4s}, [%[rhs_ptr]], #16 \n"
"ld1 {v15.4s}, [%[rhs_ptr]], #16 \n"
"subs %[r_depth_block_count], %[r_depth_block_count], #1 \n" vst1q_f32(packed_ptr, v0123n_intertwined.val[0]);
"beq 1f\n" packed_ptr += 4;
"0: \n" vst1q_f32(packed_ptr, v4567n_intertwined.val[0]);
"fmla v16.4s, v8.4s, v0.s[0] \n" packed_ptr += 4;
"fmla v17.4s, v9.4s, v0.s[0] \n"
"fmla v18.4s, v8.4s, v0.s[1] \n"
"fmla v19.4s, v9.4s, v0.s[1] \n"
"fmla v20.4s, v8.4s, v0.s[2] \n"
"fmla v21.4s, v9.4s, v0.s[2] \n"
"fmla v22.4s, v8.4s, v0.s[3] \n"
"fmla v23.4s, v9.4s, v0.s[3] \n"
"ld1 {v0.4s}, [%[lhs_ptr]], #16 \n" vst1q_f32(packed_ptr, v0123n_intertwined.val[1]);
packed_ptr += 4;
"fmla v24.4s, v8.4s, v1.s[0] \n" vst1q_f32(packed_ptr, v4567n_intertwined.val[1]);
"fmla v25.4s, v9.4s, v1.s[0] \n" packed_ptr += 4;
"fmla v26.4s, v8.4s, v1.s[1] \n"
"fmla v27.4s, v9.4s, v1.s[1] \n"
"fmla v28.4s, v8.4s, v1.s[2] \n"
"fmla v29.4s, v9.4s, v1.s[2] \n"
"fmla v30.4s, v8.4s, v1.s[3] \n"
"fmla v31.4s, v9.4s, v1.s[3] \n"
"ld1 {v1.4s}, [%[lhs_ptr]], #16 \n" data0 += 4;
"ld1 {v8.4s}, [%[rhs_ptr]], #16 \n" data1 += 4;
"ld1 {v9.4s}, [%[rhs_ptr]], #16 \n" data2 += 4;
data3 += 4;
data4 += 4;
data5 += 4;
data6 += 4;
data7 += 4;
}
for (index_t d = 0; d < depth_remain; ++d) {
float32x4_t vi = {*data0, *data1, *data2, *data3};
vst1q_f32(packed_ptr, vi);
packed_ptr += 4;
"fmla v16.4s, v10.4s, v2.s[0] \n" float32x4_t vin = {*data4, *data5, *data6, *data7};
"fmla v17.4s, v11.4s, v2.s[0] \n" vst1q_f32(packed_ptr, vin);
"fmla v18.4s, v10.4s, v2.s[1] \n" packed_ptr += 4;
"fmla v19.4s, v11.4s, v2.s[1] \n"
"fmla v20.4s, v10.4s, v2.s[2] \n"
"fmla v21.4s, v11.4s, v2.s[2] \n"
"fmla v22.4s, v10.4s, v2.s[3] \n"
"fmla v23.4s, v11.4s, v2.s[3] \n"
"ld1 {v2.4s}, [%[lhs_ptr]], #16 \n" ++data0;
++data1;
++data2;
++data3;
++data4;
++data5;
++data6;
++data7;
} // d
}
}
}
"fmla v24.4s, v10.4s, v3.s[0] \n" template<>
"fmla v25.4s, v11.4s, v3.s[0] \n" template<>
"fmla v26.4s, v10.4s, v3.s[1] \n" void Gemm<float>::Unpack<4, 8>(const float *packed_output,
"fmla v27.4s, v11.4s, v3.s[1] \n" MatrixMap<float> *output) {
"fmla v28.4s, v10.4s, v3.s[2] \n" const index_t rows = output->rows();
"fmla v29.4s, v11.4s, v3.s[2] \n" const index_t cols = output->cols();
"fmla v30.4s, v10.4s, v3.s[3] \n" index_t row_stride = output->rows_stride();
"fmla v31.4s, v11.4s, v3.s[3] \n" index_t col_stride = output->cols_stride();
"ld1 {v3.4s}, [%[lhs_ptr]], #16 \n" float *output_ptr = output->data();
"ld1 {v10.4s}, [%[rhs_ptr]], #16 \n" const float *packed_ptr = packed_output;
"ld1 {v11.4s}, [%[rhs_ptr]], #16 \n"
"fmla v16.4s, v12.4s, v4.s[0] \n" const index_t block_size = 8;
"fmla v17.4s, v13.4s, v4.s[0] \n"
"fmla v18.4s, v12.4s, v4.s[1] \n"
"fmla v19.4s, v13.4s, v4.s[1] \n"
"fmla v20.4s, v12.4s, v4.s[2] \n"
"fmla v21.4s, v13.4s, v4.s[2] \n"
"fmla v22.4s, v12.4s, v4.s[3] \n"
"fmla v23.4s, v13.4s, v4.s[3] \n"
"ld1 {v4.4s}, [%[lhs_ptr]], #16 \n" // packed_output always has row-major
if (output->matrix_major() == RowMajor) {
if (cols < block_size) {
for (index_t r = 0; r < rows; ++r) {
memcpy(output_ptr, packed_ptr, sizeof(float) * cols);
output_ptr += row_stride;
packed_ptr += block_size;
}
} else {
for (index_t r = 0; r < rows; ++r) {
float32x4_t vi = vld1q_f32(packed_ptr);
vst1q_f32(output_ptr, vi);
float32x4_t vin = vld1q_f32(packed_ptr + 4);
vst1q_f32(output_ptr + 4, vin);
"fmla v24.4s, v12.4s, v5.s[0] \n" output_ptr += row_stride;
"fmla v25.4s, v13.4s, v5.s[0] \n" packed_ptr += block_size;
"fmla v26.4s, v12.4s, v5.s[1] \n" }
"fmla v27.4s, v13.4s, v5.s[1] \n" }
"fmla v28.4s, v12.4s, v5.s[2] \n" } else {
"fmla v29.4s, v13.4s, v5.s[2] \n" // ColMajor
"fmla v30.4s, v12.4s, v5.s[3] \n" if (rows < block_size) {
"fmla v31.4s, v13.4s, v5.s[3] \n" for (index_t c = 0; c < cols; ++c) {
for (index_t r = 0; r < rows; ++r) {
output_ptr[c * col_stride + r] = packed_ptr[r * block_size + c];
} // r
} // c
} else {
const float *data0 = packed_ptr;
const float *data1 = data0 + block_size;
const float *data2 = data1 + block_size;
const float *data3 = data2 + block_size;
"ld1 {v5.4s}, [%[lhs_ptr]], #16 \n" index_t col_block = cols / 4;
"ld1 {v12.4s}, [%[rhs_ptr]], #16 \n" index_t col_remain = cols - col_block * 4;
"ld1 {v13.4s}, [%[rhs_ptr]], #16 \n" for (index_t col_block_idx = 0; col_block_idx < col_block;
++col_block_idx) {
float32x4_t v0 = vld1q_f32(data0);
float32x4_t v1 = vld1q_f32(data1);
float32x4_t v2 = vld1q_f32(data2);
float32x4_t v3 = vld1q_f32(data3);
float32x4x2_t v02_intertwined = vzipq_f32(v0, v2);
float32x4x2_t v13_intertwined = vzipq_f32(v1, v3);
float32x4x2_t v0123_intertwined =
vzipq_f32(v02_intertwined.val[0], v13_intertwined.val[0]);
float32x4x2_t v0123n_intertwined =
vzipq_f32(v02_intertwined.val[1], v13_intertwined.val[1]);
"fmla v16.4s, v14.4s, v6.s[0] \n" vst1q_f32(output_ptr, v0123_intertwined.val[0]);
"fmla v17.4s, v15.4s, v6.s[0] \n" output_ptr += col_stride;
"fmla v18.4s, v14.4s, v6.s[1] \n"
"fmla v19.4s, v15.4s, v6.s[1] \n"
"fmla v20.4s, v14.4s, v6.s[2] \n"
"fmla v21.4s, v15.4s, v6.s[2] \n"
"fmla v22.4s, v14.4s, v6.s[3] \n"
"fmla v23.4s, v15.4s, v6.s[3] \n"
"ld1 {v6.4s}, [%[lhs_ptr]], #16 \n" vst1q_f32(output_ptr, v0123_intertwined.val[1]);
output_ptr += col_stride;
"subs %[r_depth_block_count], %[r_depth_block_count], #1 \n" vst1q_f32(output_ptr, v0123n_intertwined.val[0]);
output_ptr += col_stride;
"fmla v24.4s, v14.4s, v7.s[0] \n" vst1q_f32(output_ptr, v0123n_intertwined.val[1]);
"fmla v25.4s, v15.4s, v7.s[0] \n" output_ptr += col_stride;
"fmla v26.4s, v14.4s, v7.s[1] \n"
"fmla v27.4s, v15.4s, v7.s[1] \n"
"fmla v28.4s, v14.4s, v7.s[2] \n"
"fmla v29.4s, v15.4s, v7.s[2] \n"
"fmla v30.4s, v14.4s, v7.s[3] \n"
"fmla v31.4s, v15.4s, v7.s[3] \n"
"ld1 {v7.4s}, [%[lhs_ptr]], #16 \n" data0 += 4;
"ld1 {v14.4s}, [%[rhs_ptr]], #16 \n" data1 += 4;
"ld1 {v15.4s}, [%[rhs_ptr]], #16 \n" data2 += 4;
data3 += 4;
}
for (index_t c = 0; c < col_remain; ++c) {
float32x4_t vi = {*data0, *data1, *data2, *data3};
vst1q_f32(output_ptr, vi);
output_ptr += col_stride;
"bne 0b \n" ++data0;
++data1;
++data2;
++data3;
} // d
}
}
}
// prologue template<>
"1:\n" template<>
"fmla v16.4s, v8.4s, v0.s[0] \n" void Gemm<float>::Unpack<8, 8>(const float *packed_output,
"fmla v17.4s, v9.4s, v0.s[0] \n" MatrixMap<float> *output) {
"fmla v18.4s, v8.4s, v0.s[1] \n" const index_t rows = output->rows();
"fmla v19.4s, v9.4s, v0.s[1] \n" const index_t cols = output->cols();
"fmla v20.4s, v8.4s, v0.s[2] \n" index_t row_stride = output->rows_stride();
"fmla v21.4s, v9.4s, v0.s[2] \n" index_t col_stride = output->cols_stride();
"fmla v22.4s, v8.4s, v0.s[3] \n"
"fmla v23.4s, v9.4s, v0.s[3] \n"
"fmla v24.4s, v8.4s, v1.s[0] \n" float *output_ptr = output->data();
"fmla v25.4s, v9.4s, v1.s[0] \n" const float *packed_ptr = packed_output;
"fmla v26.4s, v8.4s, v1.s[1] \n"
"fmla v27.4s, v9.4s, v1.s[1] \n"
"fmla v28.4s, v8.4s, v1.s[2] \n"
"fmla v29.4s, v9.4s, v1.s[2] \n"
"fmla v30.4s, v8.4s, v1.s[3] \n"
"fmla v31.4s, v9.4s, v1.s[3] \n"
"fmla v16.4s, v10.4s, v2.s[0] \n" const index_t block_size = 8;
"fmla v17.4s, v11.4s, v2.s[0] \n"
"fmla v18.4s, v10.4s, v2.s[1] \n"
"fmla v19.4s, v11.4s, v2.s[1] \n"
"fmla v20.4s, v10.4s, v2.s[2] \n"
"fmla v21.4s, v11.4s, v2.s[2] \n"
"fmla v22.4s, v10.4s, v2.s[3] \n"
"fmla v23.4s, v11.4s, v2.s[3] \n"
"fmla v24.4s, v10.4s, v3.s[0] \n" // packed_output always has row-major
"fmla v25.4s, v11.4s, v3.s[0] \n" if (output->matrix_major() == RowMajor) {
"fmla v26.4s, v10.4s, v3.s[1] \n" if (cols < block_size) {
"fmla v27.4s, v11.4s, v3.s[1] \n" for (index_t r = 0; r < rows; ++r) {
"fmla v28.4s, v10.4s, v3.s[2] \n" memcpy(output_ptr, packed_ptr, sizeof(float) * cols);
"fmla v29.4s, v11.4s, v3.s[2] \n" output_ptr += row_stride;
"fmla v30.4s, v10.4s, v3.s[3] \n" packed_ptr += block_size;
"fmla v31.4s, v11.4s, v3.s[3] \n" }
} else {
for (index_t r = 0; r < rows; ++r) {
float32x4_t vi = vld1q_f32(packed_ptr);
vst1q_f32(output_ptr, vi);
float32x4_t vin = vld1q_f32(packed_ptr + 4);
vst1q_f32(output_ptr + 4, vin);
"fmla v16.4s, v12.4s, v4.s[0] \n" output_ptr += row_stride;
"fmla v17.4s, v13.4s, v4.s[0] \n" packed_ptr += block_size;
"fmla v18.4s, v12.4s, v4.s[1] \n" }
"fmla v19.4s, v13.4s, v4.s[1] \n" }
"fmla v20.4s, v12.4s, v4.s[2] \n" } else {
"fmla v21.4s, v13.4s, v4.s[2] \n" // ColMajor
"fmla v22.4s, v12.4s, v4.s[3] \n" if (rows < block_size) {
"fmla v23.4s, v13.4s, v4.s[3] \n" for (index_t c = 0; c < cols; ++c) {
for (index_t r = 0; r < rows; ++r) {
output_ptr[c * col_stride + r] = packed_ptr[r * block_size + c];
} // r
} // c
} else {
const float *data0 = packed_ptr;
const float *data1 = data0 + block_size;
const float *data2 = data1 + block_size;
const float *data3 = data2 + block_size;
const float *data4 = data3 + block_size;
const float *data5 = data4 + block_size;
const float *data6 = data5 + block_size;
const float *data7 = data6 + block_size;
"fmla v24.4s, v12.4s, v5.s[0] \n" index_t col_block = cols / 4;
"fmla v25.4s, v13.4s, v5.s[0] \n" index_t col_remain = cols - col_block * 4;
"fmla v26.4s, v12.4s, v5.s[1] \n" for (index_t col_block_idx = 0; col_block_idx < col_block;
"fmla v27.4s, v13.4s, v5.s[1] \n" ++col_block_idx) {
"fmla v28.4s, v12.4s, v5.s[2] \n" float32x4_t v0 = vld1q_f32(data0);
"fmla v29.4s, v13.4s, v5.s[2] \n" float32x4_t v1 = vld1q_f32(data1);
"fmla v30.4s, v12.4s, v5.s[3] \n" float32x4_t v2 = vld1q_f32(data2);
"fmla v31.4s, v13.4s, v5.s[3] \n" float32x4_t v3 = vld1q_f32(data3);
float32x4x2_t v02_intertwined = vzipq_f32(v0, v2);
float32x4x2_t v13_intertwined = vzipq_f32(v1, v3);
float32x4x2_t v0123_intertwined =
vzipq_f32(v02_intertwined.val[0], v13_intertwined.val[0]);
float32x4x2_t v0123n_intertwined =
vzipq_f32(v02_intertwined.val[1], v13_intertwined.val[1]);
"fmla v16.4s, v14.4s, v6.s[0] \n" float32x4_t v4 = vld1q_f32(data4);
"fmla v17.4s, v15.4s, v6.s[0] \n" float32x4_t v5 = vld1q_f32(data5);
"fmla v18.4s, v14.4s, v6.s[1] \n" float32x4_t v6 = vld1q_f32(data6);
"fmla v19.4s, v15.4s, v6.s[1] \n" float32x4_t v7 = vld1q_f32(data7);
"fmla v20.4s, v14.4s, v6.s[2] \n" float32x4x2_t v46_intertwined = vzipq_f32(v4, v6);
"fmla v21.4s, v15.4s, v6.s[2] \n" float32x4x2_t v57_intertwined = vzipq_f32(v5, v7);
"fmla v22.4s, v14.4s, v6.s[3] \n" float32x4x2_t v4567_intertwined =
"fmla v23.4s, v15.4s, v6.s[3] \n" vzipq_f32(v46_intertwined.val[0], v57_intertwined.val[0]);
float32x4x2_t v4567n_intertwined =
vzipq_f32(v46_intertwined.val[1], v57_intertwined.val[1]);
"fmla v24.4s, v14.4s, v7.s[0] \n" vst1q_f32(output_ptr, v0123_intertwined.val[0]);
"fmla v25.4s, v15.4s, v7.s[0] \n" vst1q_f32(output_ptr + 4, v4567_intertwined.val[0]);
"fmla v26.4s, v14.4s, v7.s[1] \n" output_ptr += col_stride;
"fmla v27.4s, v15.4s, v7.s[1] \n"
"fmla v28.4s, v14.4s, v7.s[2] \n"
"fmla v29.4s, v15.4s, v7.s[2] \n"
"fmla v30.4s, v14.4s, v7.s[3] \n"
"fmla v31.4s, v15.4s, v7.s[3] \n"
"st1 {v16.4s}, [%[packed_output_data]], #16 \n" vst1q_f32(output_ptr, v0123_intertwined.val[1]);
"st1 {v17.4s}, [%[packed_output_data]], #16 \n" vst1q_f32(output_ptr + 4, v4567_intertwined.val[1]);
"st1 {v18.4s}, [%[packed_output_data]], #16 \n" output_ptr += col_stride;
"st1 {v19.4s}, [%[packed_output_data]], #16 \n"
"st1 {v20.4s}, [%[packed_output_data]], #16 \n" vst1q_f32(output_ptr, v0123n_intertwined.val[0]);
"st1 {v21.4s}, [%[packed_output_data]], #16 \n" vst1q_f32(output_ptr + 4, v4567n_intertwined.val[0]);
"st1 {v22.4s}, [%[packed_output_data]], #16 \n" output_ptr += col_stride;
"st1 {v23.4s}, [%[packed_output_data]], #16 \n"
"st1 {v24.4s}, [%[packed_output_data]], #16 \n" vst1q_f32(output_ptr, v0123n_intertwined.val[1]);
"st1 {v25.4s}, [%[packed_output_data]], #16 \n" vst1q_f32(output_ptr + 4, v4567n_intertwined.val[1]);
"st1 {v26.4s}, [%[packed_output_data]], #16 \n" output_ptr += col_stride;
"st1 {v27.4s}, [%[packed_output_data]], #16 \n"
"st1 {v28.4s}, [%[packed_output_data]], #16 \n" data0 += 4;
"st1 {v29.4s}, [%[packed_output_data]], #16 \n" data1 += 4;
"st1 {v30.4s}, [%[packed_output_data]], #16 \n" data2 += 4;
"st1 {v31.4s}, [%[packed_output_data]], #16 \n" data3 += 4;
: // outputs data4 += 4;
[lhs_ptr] "+r"(lhs_ptr), data5 += 4;
[rhs_ptr] "+r"(rhs_ptr), data6 += 4;
[packed_output_data] "+r"(packed_output_data), data7 += 4;
[r_depth_block_count] "+r"(r_depth_block_count)
: // inputs
: // clabbers
"cc", "memory",
"v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
"v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15",
"v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
"v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
} }
#else // armeabi-v7a for (index_t c = 0; c < col_remain; ++c) {
// Register layout: (4x4) x (4,8) float32x4_t vi = {*data0, *data1, *data2, *data3};
vst1q_f32(output_ptr, vi);
float32x4_t vin = {*data4, *data5, *data6, *data7};
vst1q_f32(output_ptr + 4, vin);
output_ptr += col_stride;
++data0;
++data1;
++data2;
++data3;
++data4;
++data5;
++data6;
++data7;
} // d
}
}
}
template<>
void Gemm<float>::PackLhs(const MatrixMap<const float> &lhs,
float *packed_lhs) {
#ifdef __aarch64__
Pack<8, 4>(lhs, ColMajor, packed_lhs);
#else
Pack<4, 4>(lhs, ColMajor, packed_lhs);
#endif
}
template<>
void Gemm<float>::PackRhs(const MatrixMap<const float> &rhs,
float *packed_rhs) {
Pack<8, 4>(rhs, RowMajor, packed_rhs);
}
template<>
void Gemm<float>::UnpackOutput(const float *packed_output,
MatrixMap<float> *output) {
#ifdef __aarch64__
Unpack<8, 8>(packed_output, output);
#else
Unpack<4, 8>(packed_output, output);
#endif
}
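// The three dispatchers above fix the tile geometry consumed by
// ComputeBlock below. Spelled out as constants (an illustrative sketch
// only; these names are not defined by MACE): on aarch64 one micro-kernel
// call turns an 8-row lhs tile and an 8-column rhs tile into an 8x8
// output tile, while on armeabi-v7a the lhs tile is 4 rows, giving a 4x8
// output tile. Depth is always padded to a multiple of 4.
#ifdef __aarch64__
constexpr index_t kRowBlockSize = 8;    // lhs rows per packed tile
#else
constexpr index_t kRowBlockSize = 4;
#endif
constexpr index_t kColBlockSize = 8;    // rhs cols per packed tile
constexpr index_t kDepthBlockSize = 4;  // depth padding granularity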
template<>
void Gemm<float>::ComputeBlock(const float *packed_lhs_data,
const float *packed_rhs_data,
const index_t depth_padded,
float *packed_output_data) {
/* Ref:
for (index_t r = 0; r < block_size; ++r) {
for (index_t c = 0; c < block_size; ++c) {
float sum = 0;
for (index_t d = 0; d < depth; ++d) {
// (r, d) * (d, c)
sum += packed_lhs_data[d * r_block_size + r]
* packed_rhs_data[d * c_block_size + c];
}
packed_output_data[r * c_block_size + c] = sum;
}
}
*/
const float *lhs_ptr = packed_lhs_data;
const float *rhs_ptr = packed_rhs_data;
const index_t depth_block_count = depth_padded / 4;
#ifdef __aarch64__
  // Register layout: (8x4) x (4,8)
  //
  //                               +--------+--------+
  //                               | v8 ... | v9 ... |
  //                       Rhs     +--------+--------+
  //                               | v10... | v11... |
  //                               +--------+--------+
  //                               | v12... | v13... |
  //                               +--------+--------+
  //                               | v14... | v15... |
  //                               +--------+--------+
  //
  //      Lhs
  //
  //  +----+----+----+----+ - - +--------+--------+
  //  | v0 | v2 | v4 | v6 |     | v16... | v17... |
  //  | .  |    |    |    |     | v18... | v19... |
  //  | .  |    |    |    |     | v20... | v21... |
  //  | .  |    |    |    |     | v22... | v23... |
  //  +----+----|----+----+     +--------+--------+
  //  | v1 | v3 | v5 | v7 |     | v24... | v25... |
  //  | .  |    |    |    |     | v26... | v27... |
  //  | .  |    |    |    |     | v28... | v29... |
  //  | .  |    |    |    |     | v30... | v31... |
  //  +----+----|----+----+     +--------+--------+
  //
  //                             Accumulator
  //
...@@ -543,90 +586,306 @@ void Gemm::ComputeBlock(const float *packed_lhs_data,
    MACE_UNUSED(r_depth_block_count);
asm volatile( asm volatile(
"mov r0, #0\n" "dup v16.4s, wzr \n"
"vdup.f32 q8, r0 \n" "dup v17.4s, wzr \n"
"vdup.f32 q9, r0 \n" "dup v18.4s, wzr \n"
"vdup.f32 q10, r0 \n" "dup v19.4s, wzr \n"
"vdup.f32 q11, r0 \n" "dup v20.4s, wzr \n"
"vdup.f32 q12, r0 \n" "dup v21.4s, wzr \n"
"vdup.f32 q13, r0 \n" "dup v22.4s, wzr \n"
"vdup.f32 q14, r0 \n" "dup v23.4s, wzr \n"
"vdup.f32 q15, r0 \n" "dup v24.4s, wzr \n"
"dup v25.4s, wzr \n"
"dup v26.4s, wzr \n"
"dup v27.4s, wzr \n"
"dup v28.4s, wzr \n"
"dup v29.4s, wzr \n"
"dup v30.4s, wzr \n"
"dup v31.4s, wzr \n"
// prelogue // prelogue
"vld1.f32 {d0-d1}, [%[lhs_ptr]]! \n" "ld1 {v0.4s}, [%[lhs_ptr]], #16 \n"
"vld1.f32 {d2-d3}, [%[lhs_ptr]]! \n" "ld1 {v1.4s}, [%[lhs_ptr]], #16 \n"
"vld1.f32 {d4-d5}, [%[lhs_ptr]]! \n" "ld1 {v2.4s}, [%[lhs_ptr]], #16 \n"
"vld1.f32 {d6-d7}, [%[lhs_ptr]]! \n" "ld1 {v3.4s}, [%[lhs_ptr]], #16 \n"
"ld1 {v4.4s}, [%[lhs_ptr]], #16 \n"
"ld1 {v5.4s}, [%[lhs_ptr]], #16 \n"
"ld1 {v6.4s}, [%[lhs_ptr]], #16 \n"
"ld1 {v7.4s}, [%[lhs_ptr]], #16 \n"
"vld1.f32 {d8-d9}, [%[rhs_ptr]]! \n" "ld1 {v8.4s}, [%[rhs_ptr]], #16 \n"
"vld1.f32 {d10-d11}, [%[rhs_ptr]]! \n" "ld1 {v9.4s}, [%[rhs_ptr]], #16 \n"
"vld1.f32 {d12-d13}, [%[rhs_ptr]]! \n" "ld1 {v10.4s}, [%[rhs_ptr]], #16 \n"
"vld1.f32 {d14-d15}, [%[rhs_ptr]]! \n" "ld1 {v11.4s}, [%[rhs_ptr]], #16 \n"
"ld1 {v12.4s}, [%[rhs_ptr]], #16 \n"
"ld1 {v13.4s}, [%[rhs_ptr]], #16 \n"
"ld1 {v14.4s}, [%[rhs_ptr]], #16 \n"
"ld1 {v15.4s}, [%[rhs_ptr]], #16 \n"
"subs %[r_depth_block_count], %[r_depth_block_count], #1 \n" "subs %[r_depth_block_count], %[r_depth_block_count], #1 \n"
"beq 1f\n" "beq 1f\n"
"0: \n" "0: \n"
"fmla v16.4s, v8.4s, v0.s[0] \n"
"fmla v17.4s, v9.4s, v0.s[0] \n"
"fmla v18.4s, v8.4s, v0.s[1] \n"
"fmla v19.4s, v9.4s, v0.s[1] \n"
"fmla v20.4s, v8.4s, v0.s[2] \n"
"fmla v21.4s, v9.4s, v0.s[2] \n"
"fmla v22.4s, v8.4s, v0.s[3] \n"
"fmla v23.4s, v9.4s, v0.s[3] \n"
"vmla.f32 q8, q4, d0[0] \n" "ld1 {v0.4s}, [%[lhs_ptr]], #16 \n"
"vmla.f32 q9, q5, d0[0] \n"
"vmla.f32 q10, q4, d0[1] \n"
"vmla.f32 q11, q5, d0[1] \n"
"vmla.f32 q12, q4, d1[0] \n"
"vmla.f32 q13, q5, d1[0] \n"
"vmla.f32 q14, q4, d1[1] \n"
"vmla.f32 q15, q5, d1[1] \n"
"vld1.f32 {d0-d1}, [%[lhs_ptr]]! \n" "fmla v24.4s, v8.4s, v1.s[0] \n"
"vld1.f32 {d8-d9}, [%[rhs_ptr]]! \n" "fmla v25.4s, v9.4s, v1.s[0] \n"
"vld1.f32 {d10-d11}, [%[rhs_ptr]]! \n" "fmla v26.4s, v8.4s, v1.s[1] \n"
"fmla v27.4s, v9.4s, v1.s[1] \n"
"fmla v28.4s, v8.4s, v1.s[2] \n"
"fmla v29.4s, v9.4s, v1.s[2] \n"
"fmla v30.4s, v8.4s, v1.s[3] \n"
"fmla v31.4s, v9.4s, v1.s[3] \n"
"vmla.f32 q8, q6, d2[0] \n" "ld1 {v1.4s}, [%[lhs_ptr]], #16 \n"
"vmla.f32 q9, q7, d2[0] \n" "ld1 {v8.4s}, [%[rhs_ptr]], #16 \n"
"vmla.f32 q10, q6, d2[1] \n" "ld1 {v9.4s}, [%[rhs_ptr]], #16 \n"
"vmla.f32 q11, q7, d2[1] \n"
"vmla.f32 q12, q6, d3[0] \n"
"vmla.f32 q13, q7, d3[0] \n"
"vmla.f32 q14, q6, d3[1] \n"
"vmla.f32 q15, q7, d3[1] \n"
"vld1.f32 {d2-d3}, [%[lhs_ptr]]! \n" "fmla v16.4s, v10.4s, v2.s[0] \n"
"vld1.f32 {d12-d13}, [%[rhs_ptr]]! \n" "fmla v17.4s, v11.4s, v2.s[0] \n"
"vld1.f32 {d14-d15}, [%[rhs_ptr]]! \n" "fmla v18.4s, v10.4s, v2.s[1] \n"
"fmla v19.4s, v11.4s, v2.s[1] \n"
"fmla v20.4s, v10.4s, v2.s[2] \n"
"fmla v21.4s, v11.4s, v2.s[2] \n"
"fmla v22.4s, v10.4s, v2.s[3] \n"
"fmla v23.4s, v11.4s, v2.s[3] \n"
"vmla.f32 q8, q4, d4[0] \n" "ld1 {v2.4s}, [%[lhs_ptr]], #16 \n"
"vmla.f32 q9, q5, d4[0] \n"
"vmla.f32 q10, q4, d4[1] \n"
"vmla.f32 q11, q5, d4[1] \n"
"vmla.f32 q12, q4, d5[0] \n"
"vmla.f32 q13, q5, d5[0] \n"
"vmla.f32 q14, q4, d5[1] \n"
"vmla.f32 q15, q5, d5[1] \n"
"vld1.f32 {d4-d5}, [%[lhs_ptr]]! \n" "fmla v24.4s, v10.4s, v3.s[0] \n"
"vld1.f32 {d8-d9}, [%[rhs_ptr]]! \n" "fmla v25.4s, v11.4s, v3.s[0] \n"
"vld1.f32 {d10-d11}, [%[rhs_ptr]]! \n" "fmla v26.4s, v10.4s, v3.s[1] \n"
"fmla v27.4s, v11.4s, v3.s[1] \n"
"fmla v28.4s, v10.4s, v3.s[2] \n"
"fmla v29.4s, v11.4s, v3.s[2] \n"
"fmla v30.4s, v10.4s, v3.s[3] \n"
"fmla v31.4s, v11.4s, v3.s[3] \n"
"ld1 {v3.4s}, [%[lhs_ptr]], #16 \n"
"ld1 {v10.4s}, [%[rhs_ptr]], #16 \n"
"ld1 {v11.4s}, [%[rhs_ptr]], #16 \n"
"fmla v16.4s, v12.4s, v4.s[0] \n"
"fmla v17.4s, v13.4s, v4.s[0] \n"
"fmla v18.4s, v12.4s, v4.s[1] \n"
"fmla v19.4s, v13.4s, v4.s[1] \n"
"fmla v20.4s, v12.4s, v4.s[2] \n"
"fmla v21.4s, v13.4s, v4.s[2] \n"
"fmla v22.4s, v12.4s, v4.s[3] \n"
"fmla v23.4s, v13.4s, v4.s[3] \n"
"ld1 {v4.4s}, [%[lhs_ptr]], #16 \n"
"fmla v24.4s, v12.4s, v5.s[0] \n"
"fmla v25.4s, v13.4s, v5.s[0] \n"
"fmla v26.4s, v12.4s, v5.s[1] \n"
"fmla v27.4s, v13.4s, v5.s[1] \n"
"fmla v28.4s, v12.4s, v5.s[2] \n"
"fmla v29.4s, v13.4s, v5.s[2] \n"
"fmla v30.4s, v12.4s, v5.s[3] \n"
"fmla v31.4s, v13.4s, v5.s[3] \n"
"ld1 {v5.4s}, [%[lhs_ptr]], #16 \n"
"ld1 {v12.4s}, [%[rhs_ptr]], #16 \n"
"ld1 {v13.4s}, [%[rhs_ptr]], #16 \n"
"fmla v16.4s, v14.4s, v6.s[0] \n"
"fmla v17.4s, v15.4s, v6.s[0] \n"
"fmla v18.4s, v14.4s, v6.s[1] \n"
"fmla v19.4s, v15.4s, v6.s[1] \n"
"fmla v20.4s, v14.4s, v6.s[2] \n"
"fmla v21.4s, v15.4s, v6.s[2] \n"
"fmla v22.4s, v14.4s, v6.s[3] \n"
"fmla v23.4s, v15.4s, v6.s[3] \n"
"ld1 {v6.4s}, [%[lhs_ptr]], #16 \n"
"subs %[r_depth_block_count], %[r_depth_block_count], #1 \n" "subs %[r_depth_block_count], %[r_depth_block_count], #1 \n"
"vmla.f32 q8, q6, d6[0] \n" "fmla v24.4s, v14.4s, v7.s[0] \n"
"vmla.f32 q9, q7, d6[0] \n" "fmla v25.4s, v15.4s, v7.s[0] \n"
"vmla.f32 q10, q6, d6[1] \n" "fmla v26.4s, v14.4s, v7.s[1] \n"
"vmla.f32 q11, q7, d6[1] \n" "fmla v27.4s, v15.4s, v7.s[1] \n"
"vmla.f32 q12, q6, d7[0] \n" "fmla v28.4s, v14.4s, v7.s[2] \n"
"vmla.f32 q13, q7, d7[0] \n" "fmla v29.4s, v15.4s, v7.s[2] \n"
"vmla.f32 q14, q6, d7[1] \n" "fmla v30.4s, v14.4s, v7.s[3] \n"
"vmla.f32 q15, q7, d7[1] \n" "fmla v31.4s, v15.4s, v7.s[3] \n"
"vld1.f32 {d6-d7}, [%[lhs_ptr]]! \n" "ld1 {v7.4s}, [%[lhs_ptr]], #16 \n"
"vld1.f32 {d12-d13}, [%[rhs_ptr]]! \n" "ld1 {v14.4s}, [%[rhs_ptr]], #16 \n"
"vld1.f32 {d14-d15}, [%[rhs_ptr]]! \n" "ld1 {v15.4s}, [%[rhs_ptr]], #16 \n"
"bne 0b \n" "bne 0b \n"
// prologue // prologue
"1:\n" "1:\n"
"fmla v16.4s, v8.4s, v0.s[0] \n"
"fmla v17.4s, v9.4s, v0.s[0] \n"
"fmla v18.4s, v8.4s, v0.s[1] \n"
"fmla v19.4s, v9.4s, v0.s[1] \n"
"fmla v20.4s, v8.4s, v0.s[2] \n"
"fmla v21.4s, v9.4s, v0.s[2] \n"
"fmla v22.4s, v8.4s, v0.s[3] \n"
"fmla v23.4s, v9.4s, v0.s[3] \n"
"fmla v24.4s, v8.4s, v1.s[0] \n"
"fmla v25.4s, v9.4s, v1.s[0] \n"
"fmla v26.4s, v8.4s, v1.s[1] \n"
"fmla v27.4s, v9.4s, v1.s[1] \n"
"fmla v28.4s, v8.4s, v1.s[2] \n"
"fmla v29.4s, v9.4s, v1.s[2] \n"
"fmla v30.4s, v8.4s, v1.s[3] \n"
"fmla v31.4s, v9.4s, v1.s[3] \n"
"fmla v16.4s, v10.4s, v2.s[0] \n"
"fmla v17.4s, v11.4s, v2.s[0] \n"
"fmla v18.4s, v10.4s, v2.s[1] \n"
"fmla v19.4s, v11.4s, v2.s[1] \n"
"fmla v20.4s, v10.4s, v2.s[2] \n"
"fmla v21.4s, v11.4s, v2.s[2] \n"
"fmla v22.4s, v10.4s, v2.s[3] \n"
"fmla v23.4s, v11.4s, v2.s[3] \n"
"fmla v24.4s, v10.4s, v3.s[0] \n"
"fmla v25.4s, v11.4s, v3.s[0] \n"
"fmla v26.4s, v10.4s, v3.s[1] \n"
"fmla v27.4s, v11.4s, v3.s[1] \n"
"fmla v28.4s, v10.4s, v3.s[2] \n"
"fmla v29.4s, v11.4s, v3.s[2] \n"
"fmla v30.4s, v10.4s, v3.s[3] \n"
"fmla v31.4s, v11.4s, v3.s[3] \n"
"fmla v16.4s, v12.4s, v4.s[0] \n"
"fmla v17.4s, v13.4s, v4.s[0] \n"
"fmla v18.4s, v12.4s, v4.s[1] \n"
"fmla v19.4s, v13.4s, v4.s[1] \n"
"fmla v20.4s, v12.4s, v4.s[2] \n"
"fmla v21.4s, v13.4s, v4.s[2] \n"
"fmla v22.4s, v12.4s, v4.s[3] \n"
"fmla v23.4s, v13.4s, v4.s[3] \n"
"fmla v24.4s, v12.4s, v5.s[0] \n"
"fmla v25.4s, v13.4s, v5.s[0] \n"
"fmla v26.4s, v12.4s, v5.s[1] \n"
"fmla v27.4s, v13.4s, v5.s[1] \n"
"fmla v28.4s, v12.4s, v5.s[2] \n"
"fmla v29.4s, v13.4s, v5.s[2] \n"
"fmla v30.4s, v12.4s, v5.s[3] \n"
"fmla v31.4s, v13.4s, v5.s[3] \n"
"fmla v16.4s, v14.4s, v6.s[0] \n"
"fmla v17.4s, v15.4s, v6.s[0] \n"
"fmla v18.4s, v14.4s, v6.s[1] \n"
"fmla v19.4s, v15.4s, v6.s[1] \n"
"fmla v20.4s, v14.4s, v6.s[2] \n"
"fmla v21.4s, v15.4s, v6.s[2] \n"
"fmla v22.4s, v14.4s, v6.s[3] \n"
"fmla v23.4s, v15.4s, v6.s[3] \n"
"fmla v24.4s, v14.4s, v7.s[0] \n"
"fmla v25.4s, v15.4s, v7.s[0] \n"
"fmla v26.4s, v14.4s, v7.s[1] \n"
"fmla v27.4s, v15.4s, v7.s[1] \n"
"fmla v28.4s, v14.4s, v7.s[2] \n"
"fmla v29.4s, v15.4s, v7.s[2] \n"
"fmla v30.4s, v14.4s, v7.s[3] \n"
"fmla v31.4s, v15.4s, v7.s[3] \n"
"st1 {v16.4s}, [%[packed_output_data]], #16 \n"
"st1 {v17.4s}, [%[packed_output_data]], #16 \n"
"st1 {v18.4s}, [%[packed_output_data]], #16 \n"
"st1 {v19.4s}, [%[packed_output_data]], #16 \n"
"st1 {v20.4s}, [%[packed_output_data]], #16 \n"
"st1 {v21.4s}, [%[packed_output_data]], #16 \n"
"st1 {v22.4s}, [%[packed_output_data]], #16 \n"
"st1 {v23.4s}, [%[packed_output_data]], #16 \n"
"st1 {v24.4s}, [%[packed_output_data]], #16 \n"
"st1 {v25.4s}, [%[packed_output_data]], #16 \n"
"st1 {v26.4s}, [%[packed_output_data]], #16 \n"
"st1 {v27.4s}, [%[packed_output_data]], #16 \n"
"st1 {v28.4s}, [%[packed_output_data]], #16 \n"
"st1 {v29.4s}, [%[packed_output_data]], #16 \n"
"st1 {v30.4s}, [%[packed_output_data]], #16 \n"
"st1 {v31.4s}, [%[packed_output_data]], #16 \n"
: // outputs
[lhs_ptr] "+r"(lhs_ptr),
[rhs_ptr] "+r"(rhs_ptr),
[packed_output_data] "+r"(packed_output_data),
[r_depth_block_count] "+r"(r_depth_block_count)
: // inputs
: // clabbers
"cc", "memory",
"v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
"v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15",
"v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
"v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
}
#else // armeabi-v7a
// Register layout: (4x4) x (4,8)
//
// +--------+--------+
// | q4 ... | q5 ... |
// Rhs +--------+--------+
// | q6 ... | q7 ... |
// +--------+--------+
// | q4 ... | q5 ... |
// +--------+--------+
// | q6 ... | q7 ... |
// +--------+--------+
//
// Lhs
//
// +----+----+----+----+ - - +--------+--------+
// | q0 | q1 | q2 | q3 | | q8... | q9... |
// | . | | | | | q10... | q11... |
// | . | | | | | q12... | q13... |
// | . | | | | | q14... | q15... |
// +----+----+----+----+ +--------+--------+
//
// Accumulator
//
if (depth_block_count > 0) {
index_t r_depth_block_count = depth_block_count;
// just make compiler happy
MACE_UNUSED(r_depth_block_count);
asm volatile(
"mov r0, #0\n"
"vdup.f32 q8, r0 \n"
"vdup.f32 q9, r0 \n"
"vdup.f32 q10, r0 \n"
"vdup.f32 q11, r0 \n"
"vdup.f32 q12, r0 \n"
"vdup.f32 q13, r0 \n"
"vdup.f32 q14, r0 \n"
"vdup.f32 q15, r0 \n"
// prelogue
"vld1.f32 {d0-d1}, [%[lhs_ptr]]! \n"
"vld1.f32 {d2-d3}, [%[lhs_ptr]]! \n"
"vld1.f32 {d4-d5}, [%[lhs_ptr]]! \n"
"vld1.f32 {d6-d7}, [%[lhs_ptr]]! \n"
"vld1.f32 {d8-d9}, [%[rhs_ptr]]! \n"
"vld1.f32 {d10-d11}, [%[rhs_ptr]]! \n"
"vld1.f32 {d12-d13}, [%[rhs_ptr]]! \n"
"vld1.f32 {d14-d15}, [%[rhs_ptr]]! \n"
"subs %[r_depth_block_count], %[r_depth_block_count], #1 \n"
"beq 1f\n"
"0: \n"
"vmla.f32 q8, q4, d0[0] \n" "vmla.f32 q8, q4, d0[0] \n"
"vmla.f32 q9, q5, d0[0] \n" "vmla.f32 q9, q5, d0[0] \n"
"vmla.f32 q10, q4, d0[1] \n" "vmla.f32 q10, q4, d0[1] \n"
...@@ -636,6 +895,7 @@ void Gemm::ComputeBlock(const float *packed_lhs_data, ...@@ -636,6 +895,7 @@ void Gemm::ComputeBlock(const float *packed_lhs_data,
"vmla.f32 q14, q4, d1[1] \n" "vmla.f32 q14, q4, d1[1] \n"
"vmla.f32 q15, q5, d1[1] \n" "vmla.f32 q15, q5, d1[1] \n"
"vld1.f32 {d0-d1}, [%[lhs_ptr]]! \n"
"vld1.f32 {d8-d9}, [%[rhs_ptr]]! \n" "vld1.f32 {d8-d9}, [%[rhs_ptr]]! \n"
"vld1.f32 {d10-d11}, [%[rhs_ptr]]! \n" "vld1.f32 {d10-d11}, [%[rhs_ptr]]! \n"
...@@ -648,6 +908,7 @@ void Gemm::ComputeBlock(const float *packed_lhs_data, ...@@ -648,6 +908,7 @@ void Gemm::ComputeBlock(const float *packed_lhs_data,
"vmla.f32 q14, q6, d3[1] \n" "vmla.f32 q14, q6, d3[1] \n"
"vmla.f32 q15, q7, d3[1] \n" "vmla.f32 q15, q7, d3[1] \n"
"vld1.f32 {d2-d3}, [%[lhs_ptr]]! \n"
"vld1.f32 {d12-d13}, [%[rhs_ptr]]! \n" "vld1.f32 {d12-d13}, [%[rhs_ptr]]! \n"
"vld1.f32 {d14-d15}, [%[rhs_ptr]]! \n" "vld1.f32 {d14-d15}, [%[rhs_ptr]]! \n"
...@@ -660,6 +921,12 @@ void Gemm::ComputeBlock(const float *packed_lhs_data, ...@@ -660,6 +921,12 @@ void Gemm::ComputeBlock(const float *packed_lhs_data,
"vmla.f32 q14, q4, d5[1] \n" "vmla.f32 q14, q4, d5[1] \n"
"vmla.f32 q15, q5, d5[1] \n" "vmla.f32 q15, q5, d5[1] \n"
"vld1.f32 {d4-d5}, [%[lhs_ptr]]! \n"
"vld1.f32 {d8-d9}, [%[rhs_ptr]]! \n"
"vld1.f32 {d10-d11}, [%[rhs_ptr]]! \n"
"subs %[r_depth_block_count], %[r_depth_block_count], #1 \n"
"vmla.f32 q8, q6, d6[0] \n" "vmla.f32 q8, q6, d6[0] \n"
"vmla.f32 q9, q7, d6[0] \n" "vmla.f32 q9, q7, d6[0] \n"
"vmla.f32 q10, q6, d6[1] \n" "vmla.f32 q10, q6, d6[1] \n"
...@@ -669,568 +936,262 @@ void Gemm::ComputeBlock(const float *packed_lhs_data, ...@@ -669,568 +936,262 @@ void Gemm::ComputeBlock(const float *packed_lhs_data,
"vmla.f32 q14, q6, d7[1] \n" "vmla.f32 q14, q6, d7[1] \n"
"vmla.f32 q15, q7, d7[1] \n" "vmla.f32 q15, q7, d7[1] \n"
"vst1.f32 {d16-d17}, [%[packed_output_data]]! \n" "vld1.f32 {d6-d7}, [%[lhs_ptr]]! \n"
"vst1.f32 {d18-d19}, [%[packed_output_data]]! \n" "vld1.f32 {d12-d13}, [%[rhs_ptr]]! \n"
"vst1.f32 {d20-d21}, [%[packed_output_data]]! \n" "vld1.f32 {d14-d15}, [%[rhs_ptr]]! \n"
"vst1.f32 {d22-d23}, [%[packed_output_data]]! \n"
"vst1.f32 {d24-d25}, [%[packed_output_data]]! \n"
"vst1.f32 {d26-d27}, [%[packed_output_data]]! \n"
"vst1.f32 {d28-d29}, [%[packed_output_data]]! \n"
"vst1.f32 {d30-d31}, [%[packed_output_data]]! \n"
: // outputs
[lhs_ptr] "+r"(lhs_ptr),
[rhs_ptr] "+r"(rhs_ptr),
[packed_output_data] "+r"(packed_output_data),
[r_depth_block_count] "+r"(r_depth_block_count)
: // inputs
: // clabbers
"cc", "memory", "r0",
"q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
}
#endif
}
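// For reference, a standalone scalar version of the loop sketched in the
// "Ref:" comment of ComputeBlock above (an illustrative sketch only; the
// helper below is not part of MACE, and the block sizes shown are the
// armeabi-v7a configuration of 4 output rows by 8 output columns):
inline void ComputeBlockRef(const float *packed_lhs,
                            const float *packed_rhs,
                            const index_t depth_padded,
                            float *packed_output) {
  const index_t r_block_size = 4;  // 8 on aarch64
  const index_t c_block_size = 8;
  for (index_t r = 0; r < r_block_size; ++r) {
    for (index_t c = 0; c < c_block_size; ++c) {
      float sum = 0.f;
      for (index_t d = 0; d < depth_padded; ++d) {
        // lhs tile: depth-major, r_block_size lanes per depth step;
        // rhs tile: depth-major, c_block_size lanes per depth step.
        sum += packed_lhs[d * r_block_size + r] *
               packed_rhs[d * c_block_size + c];
      }
      packed_output[r * c_block_size + c] = sum;
    }
  }
}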
void Gemm::PackLhs(const MatrixMap<const float> &lhs,
float *packed_lhs) {
#ifdef __aarch64__
Pack<8, 4>(lhs, ColMajor, packed_lhs);
#else
Pack<4, 4>(lhs, ColMajor, packed_lhs);
#endif
}
void Gemm::PackRhs(const MatrixMap<const float> &rhs,
float *packed_rhs) {
Pack<8, 4>(rhs, RowMajor, packed_rhs);
}
void Gemm::UnpackOutput(const float *packed_output, MatrixMap<float> *output) {
#ifdef __aarch64__
Unpack<8, 8>(packed_output, output);
#else
Unpack<4, 8>(packed_output, output);
#endif
}
template<>
void Gemm::Pack<4, 4>(const MatrixMap<const float> &matrix,
MatrixMajor dst_major,
float *packed_matrix) {
const index_t rows = matrix.rows();
const index_t cols = matrix.cols();
// use the same terminology as GemmLowp:
// depth is depth, width is the opposite dim other than depth
// lhs
index_t width = rows;
index_t depth = cols;
index_t width_stride = matrix.rows_stride();
index_t depth_stride = matrix.cols_stride();
if (dst_major == RowMajor) {
// rhs
std::swap(width, depth);
std::swap(width_stride, depth_stride);
}
const float *data = matrix.data();
float *packed_ptr = packed_matrix;
const index_t block_size = 4;
const index_t depth_padded = RoundUp(depth, static_cast<index_t>(4));
if (depth_padded > depth) {
memset(packed_ptr + depth * block_size,
0,
sizeof(float) * (depth_padded - depth) * block_size);
}
if (dst_major == matrix.matrix_major()) {
if (width < block_size) {
const index_t width_remain = block_size - width;
for (index_t d = 0; d < depth; ++d) {
memcpy(packed_ptr, data, sizeof(float) * width);
memset(packed_ptr + width, 0, sizeof(float) * width_remain);
data += depth_stride;
packed_ptr += block_size;
}
} else {
for (index_t d = 0; d < depth; ++d) {
float32x4_t vi = vld1q_f32(data);
vst1q_f32(packed_ptr, vi);
data += depth_stride;
packed_ptr += block_size;
}
}
} else {
if (width < block_size) {
const index_t width_remain = block_size - width;
for (index_t d = 0; d < depth; ++d) {
for (index_t w = 0; w < width; ++w) {
packed_ptr[w] = data[w * width_stride + d];
} // w
memset(packed_ptr + width, 0, sizeof(float) * width_remain);
packed_ptr += block_size;
} // d
} else {
const float *data0 = data;
const float *data1 = data + width_stride;
const float *data2 = data1 + width_stride;
const float *data3 = data2 + width_stride;
const index_t depth_block = depth / 4;
const index_t depth_remain = depth - depth_block * 4;
for (index_t depth_block_idx = 0; depth_block_idx < depth_block;
++depth_block_idx) {
float32x4_t v0 = vld1q_f32(data0);
float32x4_t v1 = vld1q_f32(data1);
float32x4_t v2 = vld1q_f32(data2);
float32x4_t v3 = vld1q_f32(data3);
float32x4x2_t v02_intertwined = vzipq_f32(v0, v2);
float32x4x2_t v13_intertwined = vzipq_f32(v1, v3);
float32x4x2_t v0123_intertwined =
vzipq_f32(v02_intertwined.val[0], v13_intertwined.val[0]);
float32x4x2_t v0123n_intertwined =
vzipq_f32(v02_intertwined.val[1], v13_intertwined.val[1]);
vst1q_f32(packed_ptr, v0123_intertwined.val[0]);
packed_ptr += 4;
vst1q_f32(packed_ptr, v0123_intertwined.val[1]);
packed_ptr += 4;
vst1q_f32(packed_ptr, v0123n_intertwined.val[0]);
packed_ptr += 4;
vst1q_f32(packed_ptr, v0123n_intertwined.val[1]);
packed_ptr += 4;
data0 += 4;
data1 += 4;
data2 += 4;
data3 += 4;
}
for (index_t d = 0; d < depth_remain; ++d) {
float32x4_t vi = {*data0, *data1, *data2, *data3};
vst1q_f32(packed_ptr, vi);
packed_ptr += 4;
++data0;
++data1;
++data2;
++data3;
} // d
}
}
}
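// What Pack<4, 4> produces for a row-major lhs tile, written out as a
// scalar sketch (illustrative only; this helper is not part of MACE): for
// every depth index d the four width (row) elements are stored
// contiguously, and both width and depth are zero-padded to the block size.
inline void PackLhsTileRef(const float *lhs,          // row-major tile
                           const index_t rows,        // width, at most 4
                           const index_t cols,        // depth
                           const index_t row_stride,
                           float *packed) {
  const index_t block_size = 4;
  const index_t depth_padded = RoundUp(cols, static_cast<index_t>(4));
  for (index_t d = 0; d < depth_padded; ++d) {
    for (index_t w = 0; w < block_size; ++w) {
      packed[d * block_size + w] =
          (d < cols && w < rows) ? lhs[w * row_stride + d] : 0.f;
    }
  }
}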
template<>
void Gemm::Pack<8, 4>(const MatrixMap<const float> &matrix,
MatrixMajor dst_major,
float *packed_matrix) {
const index_t rows = matrix.rows();
const index_t cols = matrix.cols();
// use the same terminology as GemmLowp:
// depth is depth, width is the opposite dim other than depth
// lhs
index_t width = rows;
index_t depth = cols;
index_t width_stride = matrix.rows_stride();
index_t depth_stride = matrix.cols_stride();
if (dst_major == RowMajor) {
// rhs
std::swap(width, depth);
std::swap(width_stride, depth_stride);
}
const float *data = matrix.data();
float *packed_ptr = packed_matrix;
const index_t block_size = 8;
const index_t depth_padded = RoundUp(depth, static_cast<index_t>(4));
if (depth_padded > depth) {
memset(packed_ptr + depth * block_size,
0,
sizeof(float) * (depth_padded - depth) * block_size);
}
if (dst_major == matrix.matrix_major()) {
if (width < block_size) {
const index_t width_remain = block_size - width;
for (index_t d = 0; d < depth; ++d) {
memcpy(packed_ptr, data, sizeof(float) * width);
memset(packed_ptr + width, 0, sizeof(float) * width_remain);
data += depth_stride;
packed_ptr += block_size;
}
} else {
for (index_t d = 0; d < depth; ++d) {
float32x4_t vi = vld1q_f32(data);
vst1q_f32(packed_ptr, vi);
float32x4_t vin = vld1q_f32(data + 4);
vst1q_f32(packed_ptr + 4, vin);
data += depth_stride;
packed_ptr += block_size;
}
}
} else {
if (width < block_size) {
const index_t width_remain = block_size - width;
for (index_t d = 0; d < depth; ++d) {
for (index_t w = 0; w < width; ++w) {
packed_ptr[w] = data[w * width_stride + d];
} // w
memset(packed_ptr + width, 0, sizeof(float) * width_remain);
packed_ptr += block_size;
} // d
} else {
const float *data0 = data;
const float *data1 = data + width_stride;
const float *data2 = data1 + width_stride;
const float *data3 = data2 + width_stride;
const float *data4 = data3 + width_stride;
const float *data5 = data4 + width_stride;
const float *data6 = data5 + width_stride;
const float *data7 = data6 + width_stride;
const index_t depth_block = depth / 4;
const index_t depth_remain = depth - depth_block * 4;
for (index_t depth_block_idx = 0; depth_block_idx < depth_block;
++depth_block_idx) {
float32x4_t v0 = vld1q_f32(data0);
float32x4_t v1 = vld1q_f32(data1);
float32x4_t v2 = vld1q_f32(data2);
float32x4_t v3 = vld1q_f32(data3);
float32x4x2_t v02_intertwined = vzipq_f32(v0, v2);
float32x4x2_t v13_intertwined = vzipq_f32(v1, v3);
float32x4x2_t v0123_intertwined =
vzipq_f32(v02_intertwined.val[0], v13_intertwined.val[0]);
float32x4x2_t v0123n_intertwined =
vzipq_f32(v02_intertwined.val[1], v13_intertwined.val[1]);
float32x4_t v4 = vld1q_f32(data4);
float32x4_t v5 = vld1q_f32(data5);
float32x4_t v6 = vld1q_f32(data6);
float32x4_t v7 = vld1q_f32(data7);
float32x4x2_t v46_intertwined = vzipq_f32(v4, v6);
float32x4x2_t v57_intertwined = vzipq_f32(v5, v7);
float32x4x2_t v4567_intertwined =
vzipq_f32(v46_intertwined.val[0], v57_intertwined.val[0]);
float32x4x2_t v4567n_intertwined =
vzipq_f32(v46_intertwined.val[1], v57_intertwined.val[1]);
vst1q_f32(packed_ptr, v0123_intertwined.val[0]);
packed_ptr += 4;
vst1q_f32(packed_ptr, v4567_intertwined.val[0]);
packed_ptr += 4;
vst1q_f32(packed_ptr, v0123_intertwined.val[1]);
packed_ptr += 4;
vst1q_f32(packed_ptr, v4567_intertwined.val[1]);
packed_ptr += 4;
vst1q_f32(packed_ptr, v0123n_intertwined.val[0]);
packed_ptr += 4;
vst1q_f32(packed_ptr, v4567n_intertwined.val[0]);
packed_ptr += 4;
vst1q_f32(packed_ptr, v0123n_intertwined.val[1]);
packed_ptr += 4;
vst1q_f32(packed_ptr, v4567n_intertwined.val[1]);
packed_ptr += 4;
data0 += 4;
data1 += 4;
data2 += 4;
data3 += 4;
data4 += 4;
data5 += 4;
data6 += 4;
data7 += 4;
}
for (index_t d = 0; d < depth_remain; ++d) {
float32x4_t vi = {*data0, *data1, *data2, *data3};
vst1q_f32(packed_ptr, vi);
packed_ptr += 4;
float32x4_t vin = {*data4, *data5, *data6, *data7};
vst1q_f32(packed_ptr, vin);
packed_ptr += 4;
++data0;
++data1;
++data2;
++data3;
++data4;
++data5;
++data6;
++data7;
} // d
}
}
}
template<>
void Gemm::Unpack<4, 8>(const float *packed_output, MatrixMap<float> *output) {
const index_t rows = output->rows();
const index_t cols = output->cols();
index_t row_stride = output->rows_stride();
index_t col_stride = output->cols_stride();
float *output_ptr = output->data();
const float *packed_ptr = packed_output;
const index_t block_size = 8;
// packed_output always has row-major
if (output->matrix_major() == RowMajor) {
if (cols < block_size) {
for (index_t r = 0; r < rows; ++r) {
memcpy(output_ptr, packed_ptr, sizeof(float) * cols);
output_ptr += row_stride;
packed_ptr += block_size;
}
} else {
for (index_t r = 0; r < rows; ++r) {
float32x4_t vi = vld1q_f32(packed_ptr);
vst1q_f32(output_ptr, vi);
float32x4_t vin = vld1q_f32(packed_ptr + 4);
vst1q_f32(output_ptr + 4, vin);
output_ptr += row_stride;
packed_ptr += block_size;
}
}
} else {
// ColMajor
if (rows < block_size) {
for (index_t c = 0; c < cols; ++c) {
for (index_t r = 0; r < rows; ++r) {
output_ptr[c * col_stride + r] = packed_ptr[r * block_size + c];
} // r
} // c
} else {
const float *data0 = packed_ptr;
const float *data1 = data0 + block_size;
const float *data2 = data1 + block_size;
const float *data3 = data2 + block_size;
index_t col_block = cols / 4;
index_t col_remain = cols - col_block * 4;
for (index_t col_block_idx = 0; col_block_idx < col_block;
++col_block_idx) {
float32x4_t v0 = vld1q_f32(data0);
float32x4_t v1 = vld1q_f32(data1);
float32x4_t v2 = vld1q_f32(data2);
float32x4_t v3 = vld1q_f32(data3);
float32x4x2_t v02_intertwined = vzipq_f32(v0, v2);
float32x4x2_t v13_intertwined = vzipq_f32(v1, v3);
float32x4x2_t v0123_intertwined =
vzipq_f32(v02_intertwined.val[0], v13_intertwined.val[0]);
float32x4x2_t v0123n_intertwined =
vzipq_f32(v02_intertwined.val[1], v13_intertwined.val[1]);
vst1q_f32(output_ptr, v0123_intertwined.val[0]);
output_ptr += col_stride;
vst1q_f32(output_ptr, v0123_intertwined.val[1]);
output_ptr += col_stride;
vst1q_f32(output_ptr, v0123n_intertwined.val[0]);
output_ptr += col_stride;
vst1q_f32(output_ptr, v0123n_intertwined.val[1]);
output_ptr += col_stride;
data0 += 4;
data1 += 4;
data2 += 4;
data3 += 4;
}
for (index_t c = 0; c < col_remain; ++c) {
float32x4_t vi = {*data0, *data1, *data2, *data3};
vst1q_f32(output_ptr, vi);
output_ptr += col_stride;
++data0;
++data1;
++data2;
++data3;
} // d
}
}
}
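// The same unpacking in scalar form (a sketch, not MACE code): the packed
// output tile is always row-major with an 8-element row stride, so tile
// element (r, c) lives at packed[r * 8 + c] and only has to be scattered
// into the real output according to the output's own strides (here the
// strides are assumed to satisfy: element (r, c) sits at
// r * row_stride + c * col_stride, with stride 1 for the contiguous dim).
inline void UnpackTileRef(const float *packed,
                          const index_t rows, const index_t cols,
                          const index_t row_stride,
                          const index_t col_stride,
                          float *output) {
  const index_t block_size = 8;
  for (index_t r = 0; r < rows; ++r) {
    for (index_t c = 0; c < cols; ++c) {
      output[r * row_stride + c * col_stride] = packed[r * block_size + c];
    }
  }
}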
template<>
void Gemm::Unpack<8, 8>(const float *packed_output, MatrixMap<float> *output) {
const index_t rows = output->rows();
const index_t cols = output->cols();
index_t row_stride = output->rows_stride();
index_t col_stride = output->cols_stride();
float *output_ptr = output->data();
const float *packed_ptr = packed_output;
const index_t block_size = 8;
// packed_output always has row-major
if (output->matrix_major() == RowMajor) {
if (cols < block_size) {
for (index_t r = 0; r < rows; ++r) {
memcpy(output_ptr, packed_ptr, sizeof(float) * cols);
output_ptr += row_stride;
packed_ptr += block_size;
}
} else {
for (index_t r = 0; r < rows; ++r) {
float32x4_t vi = vld1q_f32(packed_ptr);
vst1q_f32(output_ptr, vi);
float32x4_t vin = vld1q_f32(packed_ptr + 4);
vst1q_f32(output_ptr + 4, vin);
output_ptr += row_stride;
packed_ptr += block_size;
}
}
} else {
// ColMajor
if (rows < block_size) {
for (index_t c = 0; c < cols; ++c) {
for (index_t r = 0; r < rows; ++r) {
output_ptr[c * col_stride + r] = packed_ptr[r * block_size + c];
} // r
} // c
} else {
const float *data0 = packed_ptr;
const float *data1 = data0 + block_size;
const float *data2 = data1 + block_size;
const float *data3 = data2 + block_size;
const float *data4 = data3 + block_size;
const float *data5 = data4 + block_size;
const float *data6 = data5 + block_size;
const float *data7 = data6 + block_size;
index_t col_block = cols / 4;
index_t col_remain = cols - col_block * 4;
for (index_t col_block_idx = 0; col_block_idx < col_block;
++col_block_idx) {
float32x4_t v0 = vld1q_f32(data0);
float32x4_t v1 = vld1q_f32(data1);
float32x4_t v2 = vld1q_f32(data2);
float32x4_t v3 = vld1q_f32(data3);
float32x4x2_t v02_intertwined = vzipq_f32(v0, v2);
float32x4x2_t v13_intertwined = vzipq_f32(v1, v3);
float32x4x2_t v0123_intertwined =
vzipq_f32(v02_intertwined.val[0], v13_intertwined.val[0]);
float32x4x2_t v0123n_intertwined =
vzipq_f32(v02_intertwined.val[1], v13_intertwined.val[1]);
float32x4_t v4 = vld1q_f32(data4); "bne 0b \n"
float32x4_t v5 = vld1q_f32(data5);
float32x4_t v6 = vld1q_f32(data6);
float32x4_t v7 = vld1q_f32(data7);
float32x4x2_t v46_intertwined = vzipq_f32(v4, v6);
float32x4x2_t v57_intertwined = vzipq_f32(v5, v7);
float32x4x2_t v4567_intertwined =
vzipq_f32(v46_intertwined.val[0], v57_intertwined.val[0]);
float32x4x2_t v4567n_intertwined =
vzipq_f32(v46_intertwined.val[1], v57_intertwined.val[1]);
vst1q_f32(output_ptr, v0123_intertwined.val[0]); // prologue
vst1q_f32(output_ptr + 4, v4567_intertwined.val[0]); "1:\n"
output_ptr += col_stride; "vmla.f32 q8, q4, d0[0] \n"
"vmla.f32 q9, q5, d0[0] \n"
"vmla.f32 q10, q4, d0[1] \n"
"vmla.f32 q11, q5, d0[1] \n"
"vmla.f32 q12, q4, d1[0] \n"
"vmla.f32 q13, q5, d1[0] \n"
"vmla.f32 q14, q4, d1[1] \n"
"vmla.f32 q15, q5, d1[1] \n"
vst1q_f32(output_ptr, v0123_intertwined.val[1]); "vld1.f32 {d8-d9}, [%[rhs_ptr]]! \n"
vst1q_f32(output_ptr + 4, v4567_intertwined.val[1]); "vld1.f32 {d10-d11}, [%[rhs_ptr]]! \n"
output_ptr += col_stride;
vst1q_f32(output_ptr, v0123n_intertwined.val[0]); "vmla.f32 q8, q6, d2[0] \n"
vst1q_f32(output_ptr + 4, v4567n_intertwined.val[0]); "vmla.f32 q9, q7, d2[0] \n"
output_ptr += col_stride; "vmla.f32 q10, q6, d2[1] \n"
"vmla.f32 q11, q7, d2[1] \n"
"vmla.f32 q12, q6, d3[0] \n"
"vmla.f32 q13, q7, d3[0] \n"
"vmla.f32 q14, q6, d3[1] \n"
"vmla.f32 q15, q7, d3[1] \n"
vst1q_f32(output_ptr, v0123n_intertwined.val[1]); "vld1.f32 {d12-d13}, [%[rhs_ptr]]! \n"
vst1q_f32(output_ptr + 4, v4567n_intertwined.val[1]); "vld1.f32 {d14-d15}, [%[rhs_ptr]]! \n"
output_ptr += col_stride;
data0 += 4; "vmla.f32 q8, q4, d4[0] \n"
data1 += 4; "vmla.f32 q9, q5, d4[0] \n"
data2 += 4; "vmla.f32 q10, q4, d4[1] \n"
data3 += 4; "vmla.f32 q11, q5, d4[1] \n"
data4 += 4; "vmla.f32 q12, q4, d5[0] \n"
data5 += 4; "vmla.f32 q13, q5, d5[0] \n"
data6 += 4; "vmla.f32 q14, q4, d5[1] \n"
data7 += 4; "vmla.f32 q15, q5, d5[1] \n"
}
for (index_t c = 0; c < col_remain; ++c) {
float32x4_t vi = {*data0, *data1, *data2, *data3};
vst1q_f32(output_ptr, vi);
float32x4_t vin = {*data4, *data5, *data6, *data7};
vst1q_f32(output_ptr + 4, vin);
output_ptr += col_stride;
++data0; "vmla.f32 q8, q6, d6[0] \n"
++data1; "vmla.f32 q9, q7, d6[0] \n"
++data2; "vmla.f32 q10, q6, d6[1] \n"
++data3; "vmla.f32 q11, q7, d6[1] \n"
++data4; "vmla.f32 q12, q6, d7[0] \n"
++data5; "vmla.f32 q13, q7, d7[0] \n"
++data6; "vmla.f32 q14, q6, d7[1] \n"
++data7; "vmla.f32 q15, q7, d7[1] \n"
} // d
} "vst1.f32 {d16-d17}, [%[packed_output_data]]! \n"
"vst1.f32 {d18-d19}, [%[packed_output_data]]! \n"
"vst1.f32 {d20-d21}, [%[packed_output_data]]! \n"
"vst1.f32 {d22-d23}, [%[packed_output_data]]! \n"
"vst1.f32 {d24-d25}, [%[packed_output_data]]! \n"
"vst1.f32 {d26-d27}, [%[packed_output_data]]! \n"
"vst1.f32 {d28-d29}, [%[packed_output_data]]! \n"
"vst1.f32 {d30-d31}, [%[packed_output_data]]! \n"
: // outputs
[lhs_ptr] "+r"(lhs_ptr),
[rhs_ptr] "+r"(rhs_ptr),
[packed_output_data] "+r"(packed_output_data),
[r_depth_block_count] "+r"(r_depth_block_count)
: // inputs
: // clabbers
"cc", "memory", "r0",
"q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
} }
#endif
} }
MaceStatus Gemm::Compute(const OpContext *context,
                         const Tensor *lhs,
                         const Tensor *rhs,
                         const index_t batch,
                         const index_t lhs_rows,
                         const index_t lhs_cols,
                         const index_t rhs_rows,
                         const index_t rhs_cols,
                         const bool transpose_lhs,
                         const bool transpose_rhs,
                         const bool transpose_out,
                         const bool lhs_batched,
                         const bool rhs_batched,
                         Tensor *output) {
  index_t rows = transpose_lhs ? lhs_cols : lhs_rows;
  index_t depth = transpose_lhs ? lhs_rows : lhs_cols;
  index_t cols = transpose_rhs ? rhs_rows : rhs_cols;
  index_t depth2 = transpose_rhs ? rhs_cols : rhs_rows;
  MACE_CHECK(depth == depth2,
             "Matrices that multiply have inconsistent depth dim: ",
             depth,
             " vs. ",
             depth2);

  return Compute(context,
                 lhs,
                 rhs,
                 batch,
                 rows,
                 cols,
                 depth,
                 transpose_lhs ? ColMajor : RowMajor,
                 transpose_rhs ? ColMajor : RowMajor,
                 transpose_out ? ColMajor : RowMajor,
                 lhs_batched,
                 rhs_batched,
                 output);
}

void RegisterGemmDelegator(OpDelegatorRegistry *registry) {
  MACE_REGISTER_DELEGATOR(
      registry, Gemm, delegator::GemmParam,
      MACE_DELEGATOR_KEY(Gemm, DeviceType::CPU, float, ImplType::NEON));
}

}  // namespace fp32

template<>
MaceStatus Gemm<float>::Compute(const OpContext *context,
                                const Tensor *lhs,
                                const Tensor *rhs,
                                const index_t batch,
                                const index_t rows,
                                const index_t cols,
                                const index_t depth,
                                const MatrixMajor lhs_major,
                                const MatrixMajor rhs_major,
                                const MatrixMajor output_major,
                                const bool lhs_batched,
                                const bool rhs_batched,
                                Tensor *output) {
  MACE_CHECK(output->size() == batch * rows * cols,
             "Need resize output tensor before call gemm.");
  Tensor::MappingGuard lhs_guard(lhs);
  Tensor::MappingGuard rhs_guard(rhs);
  Tensor::MappingGuard output_guard(output);
  const float *lhs_data = lhs->data<float>();
  const float *rhs_data = rhs->data<float>();
  float *output_data = output->mutable_data<float>();

#ifdef __aarch64__
  const index_t row_block_size = 8;
#else
  const index_t row_block_size = 4;
#endif
  const index_t col_block_size = 8;
  const index_t depth_block_size = 4;
  const index_t row_block_count = RoundUpDiv(rows, row_block_size);
  const index_t col_block_count = RoundUpDiv(cols, col_block_size);
  const index_t rows_padded = RoundUp(rows, row_block_size);
  const index_t cols_padded = RoundUp(cols, col_block_size);
  const index_t depth_padded = RoundUp(depth, depth_block_size);

  ScratchBuffer *scratch = context->device()->scratch_buffer();

  index_t packed_lhs_size =
      PadAlignSize(sizeof(float) * rows_padded * depth_padded);
  index_t packed_rhs_size =
      PadAlignSize(sizeof(float) * depth_padded * cols_padded);
  index_t packed_output_size =
      PadAlignSize(sizeof(float) * rows_padded * cols_padded);
  // resize to the total size of lhs & rhs & output anyway,
  // in case we do not cache const tensor for saving memory
  MACE_RETURN_IF_ERROR(scratch->GrowSize(
      packed_lhs_size + packed_rhs_size + packed_output_size));
  float *packed_lhs_data =
      scratch->Scratch(packed_lhs_size).mutable_data<float>();
  float *packed_rhs_data =
      scratch->Scratch(packed_rhs_size).mutable_data<float>();
  float *packed_output_data =
      scratch->Scratch(packed_output_size).mutable_data<float>();

  int cache_side = kNoCache;
  if (cached_ == kCacheLhs) {
    packed_lhs_data = pack_cache_.mutable_data<float>();
  } else if (cached_ == kCacheRhs) {
    packed_rhs_data = pack_cache_.mutable_data<float>();
  } else if (should_cache_pack_) {
    if (lhs->is_weight() && (!lhs_batched || batch == 1)) {
      cache_side = kCacheLhs;
      pack_cache_.Resize(packed_lhs_size);
      packed_lhs_data = pack_cache_.mutable_data<float>();
    } else if (rhs->is_weight() && (!rhs_batched || batch == 1)) {
      cache_side = kCacheRhs;
      pack_cache_.Resize(packed_rhs_size);
      packed_rhs_data = pack_cache_.mutable_data<float>();
    }
  }

  utils::ThreadPool
      &thread_pool = context->device()->cpu_runtime()->thread_pool();

  for (index_t b = 0; b < batch; ++b) {
    MatrixMap<const float>
        lhs_matrix
        (lhs_data + static_cast<index_t>(lhs_batched) * b * rows * depth,
         lhs_major,
         rows,
         depth);
    MatrixMap<const float>
        rhs_matrix
        (rhs_data + static_cast<index_t>(rhs_batched) * b * depth * cols,
         rhs_major,
         depth,
         cols);
    MatrixMap<float> output_matrix
        (output_data + b * rows * cols, output_major, rows, cols);

    // pack lhs
    if (cached_ != kCacheLhs) {
      thread_pool.Compute1D([=, &lhs_matrix](index_t start,
                                             index_t end,
                                             index_t step) {
        for (index_t row_block_idx = start; row_block_idx < end;
             row_block_idx += step) {
          const index_t start_row = row_block_idx * row_block_size;
          const index_t
              row_block_len = std::min(row_block_size, rows - start_row);
          float *packed_lhs_data_block =
              packed_lhs_data + row_block_idx * row_block_size * depth_padded;
          PackLhs(lhs_matrix.block(start_row, 0, row_block_len, depth),
                  packed_lhs_data_block);
        }
      }, 0, row_block_count, 1);

      if (cache_side == kCacheLhs) {
        cached_ = kCacheLhs;
        if (lhs->UnderlyingBuffer()->OnHost()) {
          AdviseFree(reinterpret_cast<void *>(const_cast<float *>(lhs->data<
              float>())),
                     lhs->raw_size());
        }
      }
    }

    // pack rhs
    if (cached_ != kCacheRhs) {
      thread_pool.Compute1D([=, &rhs_matrix](index_t start,
                                             index_t end,
                                             index_t step) {
        for (index_t col_block_idx = start; col_block_idx < end;
             col_block_idx += step) {
          const index_t start_col = col_block_idx * col_block_size;
          const index_t
              col_block_len = std::min(col_block_size, cols - start_col);
          float *packed_rhs_data_block =
              packed_rhs_data + col_block_idx * col_block_size * depth_padded;
          PackRhs(rhs_matrix.block(0, start_col, depth, col_block_len),
                  packed_rhs_data_block);
        }
      }, 0, col_block_count, 1);

      if (cache_side == kCacheRhs) {
        cached_ = kCacheRhs;
        if (rhs->UnderlyingBuffer()->OnHost()) {
          AdviseFree(reinterpret_cast<void *>(const_cast<float *>(rhs->data<
              float>())),
                     rhs->raw_size());
        }
      }
    }

    // multiply lhs and rhs
    thread_pool.Compute1D([=, &output_matrix](index_t start,
                                              index_t end,
                                              index_t step) {
      for (index_t row_block_idx = start; row_block_idx < end;
           row_block_idx += step) {
        const index_t start_row = row_block_idx * row_block_size;
        const index_t
            row_block_len = std::min(row_block_size, rows - start_row);
        const float *packed_lhs_data_block =
            packed_lhs_data + row_block_idx * row_block_size * depth_padded;
        for (index_t col_block_idx = 0; col_block_idx < col_block_count;
             ++col_block_idx) {
          const index_t start_col = col_block_idx * col_block_size;
          const index_t
              col_block_len = std::min(col_block_size, cols - start_col);
          const float *packed_rhs_data_block =
              packed_rhs_data + col_block_idx * col_block_size * depth_padded;
          float *packed_output_data_block =
              packed_output_data + row_block_idx * row_block_size * cols_padded
                  + col_block_idx * col_block_size;
          ComputeBlock(packed_lhs_data_block,
                       packed_rhs_data_block,
                       depth_padded,
                       packed_output_data_block);
          MatrixMap<float> output_block = output_matrix.block(start_row,
                                                              start_col,
                                                              row_block_len,
                                                              col_block_len);
          UnpackOutput(packed_output_data_block, &output_block);
        }  // col_block_idx
      }  // row_block_idx
    }, 0, row_block_count, 1);
  }  // b

  return MaceStatus::MACE_SUCCESS;
}

}  // namespace arm
}  // namespace ops
}  // namespace mace
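The hunk above is the heart of the refactor: the float-only fp32::Gemm (with its transpose-flag convenience overload and its own RegisterGemmDelegator) is replaced by an explicit specialization Gemm&lt;float&gt;::Compute of a class template shared across data types. A minimal sketch of that pattern follows; the names and bodies are hypothetical simplifications, not the actual MACE classes.

#include <iostream>

// Primary template: each supported element type provides its own
// explicit specialization of Compute().
template <typename T>
class Gemm {
 public:
  void Compute();
};

// Explicit specialization, playing the role of the NEON fp32 path above.
template <>
void Gemm<float>::Compute() {
  std::cout << "fp32 NEON gemm path\n";
}

int main() {
  Gemm<float> gemm;   // the delegator is instantiated per data type
  gemm.Compute();
  return 0;
}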
...@@ -12,12 +12,10 @@
// See the License for the specific language governing permissions and
// limitations under the License.

#include "mace/ops/arm/fp32/gemv.h"

#include <arm_neon.h>
#include <algorithm>

#include "mace/ops/arm/base/gemv.h"
#include "mace/utils/math.h"

#if !defined(__aarch64__)
...@@ -34,9 +32,9 @@ float vaddvq_f32(float32x4_t v) {
namespace mace {
namespace ops {
namespace arm {
namespace fp32 {

MaceStatus Gemv::Compute(const OpContext *context,
template<>
MaceStatus Gemv<float>::Compute(const OpContext *context,
                                const Tensor *lhs,
                                const Tensor *rhs,
                                const Tensor *bias,
...@@ -378,13 +376,6 @@ MaceStatus Gemv::Compute(const OpContext *context,
#undef vaddvq_f32
#endif

void RegisterGemvDelegator(OpDelegatorRegistry *registry) {
  MACE_REGISTER_DELEGATOR(
      registry, Gemv, DelegatorParam,
      MACE_DELEGATOR_KEY(Gemv, DeviceType::CPU, float, ImplType::NEON));
}

}  // namespace fp32
}  // namespace arm
}  // namespace ops
}  // namespace mace
...@@ -38,13 +38,15 @@ extern void RegisterGemvDelegator(OpDelegatorRegistry *registry);
#ifdef MACE_ENABLE_NEON
namespace arm {
namespace fp32 {
extern void RegisterConv2dK3x3WinogradDelegator(OpDelegatorRegistry *registry);
}  // namespace fp32

extern void RegisterActivationDelegator(OpDelegatorRegistry *registry);
extern void RegisterBiasAddDelegator(OpDelegatorRegistry *registry);
extern void RegisterConv2dK1x1Delegator(OpDelegatorRegistry *registry);
extern void RegisterConv2dK1xNDelegator(OpDelegatorRegistry *registry);
extern void RegisterConv2dK3x3Delegator(OpDelegatorRegistry *registry);
extern void RegisterConv2dK3x3WinogradDelegator(OpDelegatorRegistry *registry);
extern void RegisterConv2dK5x5Delegator(OpDelegatorRegistry *registry);
extern void RegisterConv2dK7x7Delegator(OpDelegatorRegistry *registry);
extern void RegisterConv2dGeneralDelegator(OpDelegatorRegistry *registry);
...@@ -69,7 +71,6 @@ extern void RegisterGroupDeconv2dGeneralDelegator(
extern void RegisterGemmDelegator(OpDelegatorRegistry *registry);
extern void RegisterGemvDelegator(OpDelegatorRegistry *registry);

}  // namespace fp32

#ifdef MACE_ENABLE_QUANTIZE
namespace q8 {
...@@ -97,32 +98,33 @@ void RegisterAllOpDelegators(OpDelegatorRegistry *registry) {
#endif  // MACE_ENABLE_QUANTIZE
#ifdef MACE_ENABLE_NEON
  arm::fp32::RegisterActivationDelegator(registry);
  arm::fp32::RegisterBiasAddDelegator(registry);
  arm::fp32::RegisterConv2dK1x1Delegator(registry);
  arm::fp32::RegisterConv2dK1xNDelegator(registry);
  arm::fp32::RegisterConv2dK3x3Delegator(registry);
  arm::fp32::RegisterConv2dK3x3WinogradDelegator(registry);
  arm::fp32::RegisterConv2dK5x5Delegator(registry);
  arm::fp32::RegisterConv2dK7x7Delegator(registry);
  arm::fp32::RegisterConv2dGeneralDelegator(registry);
  arm::fp32::RegisterDeconv2dK2x2Delegator(registry);
  arm::fp32::RegisterDeconv2dK3x3Delegator(registry);
  arm::fp32::RegisterDeconv2dK4x4Delegator(registry);
  arm::fp32::RegisterDeconv2dGeneralDelegator(registry);
  arm::fp32::RegisterDepthwiseConv2dK3x3Delegator(registry);
  arm::fp32::RegisterDepthwiseDeconv2dK3x3Delegator(registry);
  arm::fp32::RegisterGroupDeconv2dK3x3Delegator(registry);
  arm::fp32::RegisterDepthwiseDeconv2dK4x4Delegator(registry);
  arm::fp32::RegisterGroupDeconv2dK4x4Delegator(registry);
  arm::fp32::RegisterDepthwiseDeconv2dGeneralDelegator(registry);
  arm::fp32::RegisterGroupDeconv2dGeneralDelegator(registry);
  arm::fp32::RegisterGemmDelegator(registry);
  arm::fp32::RegisterGemvDelegator(registry);

  arm::fp32::RegisterConv2dK3x3WinogradDelegator(registry);

  arm::RegisterActivationDelegator(registry);
  arm::RegisterBiasAddDelegator(registry);
  arm::RegisterConv2dK1x1Delegator(registry);
  arm::RegisterConv2dK1xNDelegator(registry);
  arm::RegisterConv2dK3x3Delegator(registry);
  arm::RegisterConv2dK5x5Delegator(registry);
  arm::RegisterConv2dK7x7Delegator(registry);
  arm::RegisterConv2dGeneralDelegator(registry);

  arm::RegisterDeconv2dK2x2Delegator(registry);
  arm::RegisterDeconv2dK3x3Delegator(registry);
  arm::RegisterDeconv2dK4x4Delegator(registry);
  arm::RegisterDeconv2dGeneralDelegator(registry);

  arm::RegisterDepthwiseConv2dK3x3Delegator(registry);
  arm::RegisterDepthwiseDeconv2dK3x3Delegator(registry);
  arm::RegisterGroupDeconv2dK3x3Delegator(registry);
  arm::RegisterDepthwiseDeconv2dK4x4Delegator(registry);
  arm::RegisterGroupDeconv2dK4x4Delegator(registry);
  arm::RegisterDepthwiseDeconv2dGeneralDelegator(registry);
  arm::RegisterGroupDeconv2dGeneralDelegator(registry);

  arm::RegisterGemmDelegator(registry);
  arm::RegisterGemvDelegator(registry);

#ifdef MACE_ENABLE_QUANTIZE
  arm::q8::RegisterEltwiseDelegator(registry);
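The registration calls above now live directly under arm:: instead of arm::fp32::, with only the Winograd conv delegator left in the fp32 namespace. As a rough illustration of the registry pattern these Register* functions feed, here is a simplified, hypothetical sketch (not the actual OpDelegatorRegistry API): creator functions keyed by op/device/dtype/impl, looked up when an operator needs a CPU kernel.

#include <functional>
#include <map>
#include <memory>
#include <string>

struct Delegator {
  virtual ~Delegator() = default;
};
using DelegatorCreator = std::function<std::unique_ptr<Delegator>()>;

// Simplified registry: a map from a composite key to a creator.
class Registry {
 public:
  void Register(const std::string &key, DelegatorCreator creator) {
    creators_[key] = std::move(creator);
  }
  std::unique_ptr<Delegator> Create(const std::string &key) const {
    auto it = creators_.find(key);
    return it == creators_.end() ? nullptr : it->second();
  }

 private:
  std::map<std::string, DelegatorCreator> creators_;
};

// A RegisterGemmDelegator-style function would then roughly do (hypothetical):
//   registry->Register("Gemm/CPU/float/NEON",
//                      [] { return std::unique_ptr<Delegator>(new Delegator); });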
......