Commit 7c9ca067 authored by 李寅

Refactor cpu

Parent 7c1711d8
......@@ -91,7 +91,6 @@ extern void Register_Eltwise(OperatorRegistry *op_registry);
extern void Register_FoldedBatchNorm(OperatorRegistry *op_registry);
extern void Register_FullyConnected(OperatorRegistry *op_registry);
extern void Register_FusedConv2D(OperatorRegistry *op_registry);
extern void Register_GlobalAvgPooling(OperatorRegistry *op_registry);
extern void Register_ImageToBuffer(OperatorRegistry *op_registry);
extern void Register_LocalResponseNorm(OperatorRegistry *op_registry);
extern void Register_MatMul(OperatorRegistry *op_registry);
......@@ -132,7 +131,6 @@ OperatorRegistry::OperatorRegistry() {
ops::Register_FoldedBatchNorm(this);
ops::Register_FullyConnected(this);
ops::Register_FusedConv2D(this);
ops::Register_GlobalAvgPooling(this);
ops::Register_ImageToBuffer(this);
ops::Register_LocalResponseNorm(this);
ops::Register_MatMul(this);
......
......@@ -318,6 +318,7 @@ class Tensor {
public:
explicit MappingGuard(const Tensor *tensor) : tensor_(tensor) {
if (tensor_ != nullptr) {
MACE_CHECK_NOTNULL(tensor_->buffer_);
tensor_->buffer_->Map(&mapped_image_pitch_);
}
}
......
......@@ -121,23 +121,30 @@ void PReLUActivation(const T *input_ptr,
}
template <DeviceType D, typename T>
class ActivationFunctor {
class ActivationFunctor;
template <>
class ActivationFunctor<DeviceType::CPU, float> {
public:
ActivationFunctor(ActivationType type, T relux_max_limit)
ActivationFunctor(ActivationType type, float relux_max_limit)
: activation_(type), relux_max_limit_(relux_max_limit) {}
void operator()(const Tensor *input,
const Tensor *alpha,
Tensor *output,
StatsFuture *future) {
const T *input_ptr = input->data<T>();
T *output_ptr = output->mutable_data<T>();
const float *input_ptr = input->data<float>();
float *output_ptr = output->mutable_data<float>();
if (activation_ == PRELU) {
MACE_CHECK_NOTNULL(alpha);
const T *alpha_ptr = alpha->data<T>();
const index_t outer_size = output->dim(0) * output->dim(1)
* output->dim(2);
PReLUActivation(input_ptr, outer_size, input->dim(3), 1, alpha_ptr,
const float *alpha_ptr = alpha->data<float>();
const index_t outer_size = output->dim(0);
const index_t inner_size = output->dim(2) * output->dim(3);
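// For example, with an NCHW output of shape [2, 8, 32, 32]:
// outer_size = 2, input->dim(1) = 8 channels, inner_size = 1024, and
// alpha holds one slope per channel.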
PReLUActivation(input_ptr,
outer_size,
input->dim(1),
inner_size,
alpha_ptr,
output_ptr);
} else {
DoActivation(input_ptr, output_ptr, output->size(), activation_,
......@@ -145,22 +152,6 @@ class ActivationFunctor {
}
}
private:
ActivationType activation_;
T relux_max_limit_;
};
template <>
class ActivationFunctor<DeviceType::NEON, float> {
public:
ActivationFunctor(ActivationType type, float relux_max_limit)
: activation_(type), relux_max_limit_(relux_max_limit) {}
void operator()(const Tensor *input,
const Tensor *alpha,
Tensor *output,
StatsFuture *future);
private:
ActivationType activation_;
float relux_max_limit_;
......
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/kernels/activation.h"
namespace mace {
namespace kernels {
void ActivationFunctor<DeviceType::NEON, float>::operator()(
const Tensor *input,
const Tensor *alpha,
Tensor *output,
StatsFuture *future) {
const float *input_ptr = input->data<float>();
float *output_ptr = output->mutable_data<float>();
if (activation_ == PRELU) {
MACE_CHECK_NOTNULL(alpha);
const float *alpha_ptr = alpha->data<float>();
const index_t outer_size = output->dim(0);
const index_t inner_size = output->dim(2) * output->dim(3);
PReLUActivation(input_ptr, outer_size, input->dim(1), inner_size, alpha_ptr,
output_ptr);
} else {
DoActivation(input_ptr, output_ptr, output->size(), activation_,
relux_max_limit_);
}
}
} // namespace kernels
} // namespace mace
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/kernels/batch_norm.h"
namespace mace {
namespace kernels {
void BatchNormFunctor<DeviceType::NEON, float>::operator()(
const Tensor *input,
const Tensor *scale,
const Tensor *offset,
const Tensor *mean,
const Tensor *var,
const float epsilon,
Tensor *output,
StatsFuture *future) {
// Batch normalization in the paper https://arxiv.org/abs/1502.03167 .
// The calculation formula for inference is
// Y = \frac{scale}{\sqrt{var + epsilon}} * X
//     + (offset - \frac{scale * mean}{\sqrt{var + epsilon}})
// which simplifies to
// new_scale = \frac{scale}{\sqrt{var + epsilon}}
// new_offset = offset - mean * new_scale
// Y = new_scale * X + new_offset;
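// A quick numeric check (hypothetical values): scale = 2, var = 3,
// epsilon = 1, mean = 5, offset = 1 gives new_scale = 2 / sqrt(4) = 1
// and new_offset = 1 - 5 * 1 = -4, i.e. Y = X - 4.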
const index_t batch = input->dim(0);
const index_t channels = input->dim(1);
const index_t height = input->dim(2);
const index_t width = input->dim(3);
const float *input_ptr = input->data<float>();
const float *scale_ptr = scale->data<float>();
const float *offset_ptr = offset->data<float>();
float *output_ptr = output->mutable_data<float>();
std::vector<float> new_scale;
std::vector<float> new_offset;
if (!folded_constant_) {
new_scale.resize(channels);
new_offset.resize(channels);
const float *mean_ptr = mean->data<float>();
const float *var_ptr = var->data<float>();
#pragma omp parallel for
for (index_t c = 0; c < channels; ++c) {
new_scale[c] = scale_ptr[c] / std::sqrt(var_ptr[c] + epsilon);
new_offset[c] = offset_ptr[c] - mean_ptr[c] * new_scale[c];
}
}
const float *scale_data = folded_constant_ ? scale_ptr : new_scale.data();
const float *offset_data = folded_constant_ ? offset_ptr : new_offset.data();
index_t channel_size = height * width;
index_t batch_size = channels * channel_size;
// NEON is slower, so stick to the trivial implementation
#pragma omp parallel for collapse(2)
for (index_t b = 0; b < batch; ++b) {
for (index_t c = 0; c < channels; ++c) {
index_t offset = b * batch_size + c * channel_size;
for (index_t hw = 0; hw < height * width; ++hw) {
output_ptr[offset + hw] =
scale_data[c] * input_ptr[offset + hw] + offset_data[c];
}
}
}
DoActivation(output_ptr, output_ptr, output->size(), activation_,
relux_max_limit_);
}
} // namespace kernels
} // namespace mace
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/kernels/conv_2d.h"
#include "mace/kernels/arm/conv_winograd.h"
namespace mace {
namespace kernels {
namespace {
void Conv2dNCHW(const float *input,
const float *filter,
const index_t batch,
const index_t in_height,
const index_t in_width,
const index_t in_channels,
const index_t out_height,
const index_t out_width,
const index_t out_channels,
const int filter_height,
const int filter_width,
const int stride_h,
const int stride_w,
const int dilation_h,
const int dilation_w,
float *output) {
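// Direct NCHW convolution: each output element accumulates
// input[b, c, h * stride_h + kh * dilation_h, w * stride_w + kw * dilation_w]
// * filter[m, c, kh, kw] over c, kh and kw. The caller pre-pads the input
// and zero-fills the output, so no bounds checks or bias handling are
// needed here.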
#pragma omp parallel for collapse(2)
for (index_t b = 0; b < batch; ++b) {
for (index_t m = 0; m < out_channels; ++m) {
for (index_t h = 0; h < out_height; ++h) {
for (index_t w = 0; w < out_width; ++w) {
index_t out_offset =
((b * out_channels + m) * out_height + h) * out_width + w;
for (index_t c = 0; c < in_channels; ++c) {
for (index_t kh = 0; kh < filter_height; ++kh) {
for (index_t kw = 0; kw < filter_width; ++kw) {
index_t ih = h * stride_h + kh * dilation_h;
index_t iw = w * stride_w + kw * dilation_w;
index_t in_offset =
((b * in_channels + c) * in_height + ih) * in_width + iw;
index_t filter_offset =
(((m * in_channels) + c) * filter_height + kh) * filter_width
+ kw;
output[out_offset] += input[in_offset] * filter[filter_offset];
}
}
}
}
}
}
}
}
} // namespace
extern void Conv2dNeonK1x1S1(const float *input,
const float *filter,
const index_t batch,
const index_t height,
const index_t width,
const index_t in_channels,
const index_t out_channels,
float *output);
extern void Conv2dNeonK3x3S1(const float *input,
const float *filter,
const index_t batch,
const index_t in_height,
const index_t in_width,
const index_t in_channels,
const index_t out_height,
const index_t out_width,
const index_t out_channels,
float *output);
extern void Conv2dNeonK3x3S2(const float *input,
const float *filter,
const index_t batch,
const index_t in_height,
const index_t in_width,
const index_t in_channels,
const index_t out_height,
const index_t out_width,
const index_t out_channels,
float *output);
void Conv2dFunctor<DeviceType::NEON, float>::operator()(const Tensor *input,
const Tensor *filter,
const Tensor *bias,
Tensor *output,
StatsFuture *future) {
MACE_CHECK_NOTNULL(input);
MACE_CHECK_NOTNULL(filter);
MACE_CHECK_NOTNULL(output);
std::vector<index_t> filter_shape(4);
if (is_filter_transformed_) {
// TOC -> OIHW
filter_shape[0] = filter->dim(1);
filter_shape[1] = filter->dim(2);
filter_shape[2] = filter_shape[3] = 3;
} else {
filter_shape = filter->shape();
}
std::vector<index_t> output_shape(4);
std::vector<int> paddings(2);
if (paddings_.empty()) {
CalcNCHWPaddingAndOutputSize(input->shape().data(),
filter_shape.data(),
dilations_,
strides_,
padding_type_,
output_shape.data(),
paddings.data());
} else {
paddings = paddings_;
CalcNCHWOutputSize(input->shape().data(), filter_shape.data(),
paddings_.data(), dilations_, strides_, RoundType::FLOOR,
output_shape.data());
}
output->Resize(output_shape);
output->Clear();
index_t batch = output->dim(0);
index_t channels = output->dim(1);
index_t height = output->dim(2);
index_t width = output->dim(3);
index_t input_batch = input->dim(0);
index_t input_channels = input->dim(1);
index_t input_height = input->dim(2);
index_t input_width = input->dim(3);
index_t filter_h = filter_shape[2];
index_t filter_w = filter_shape[3];
MACE_CHECK(filter_shape[0] == channels, filter_shape[0], " != ", channels);
MACE_CHECK(filter_shape[1] == input_channels, filter_shape[1], " != ",
input_channels);
index_t stride_h = strides_[0];
index_t stride_w = strides_[1];
index_t dilation_h = dilations_[0];
index_t dilation_w = dilations_[1];
MACE_CHECK(batch == input_batch, "Input/Output batch size mismatch");
index_t padded_input_height = input_height + paddings[0];
index_t padded_input_width = input_width + paddings[1];
index_t extra_input_height = padded_input_height;
index_t extra_input_width = padded_input_width;
index_t extra_output_height = height;
index_t extra_output_width = width;
int pad_top = paddings[0] >> 1;
int pad_bottom = paddings[0] - pad_top;
int pad_left = paddings[1] >> 1;
int pad_right = paddings[1] - pad_left;
auto input_data = input->data<float>();
auto filter_data = filter->data<float>();
auto bias_data = bias == nullptr ? nullptr : bias->data<float>();
auto output_data = output->mutable_data<float>();
std::function<void(const float *input, float *output)> conv_func;
bool use_winograd = is_filter_transformed_ || (filter_h == 3 && filter_w == 3
&& stride_h == 1 && stride_w == 1 && dilation_h == 1 && dilation_w == 1
&& input_channels >= 8 && channels >= 8);
bool use_neon_3x3_s1 = filter_h == 3 && filter_w == 3
&& stride_h == 1 && stride_w == 1 && dilation_h == 1 && dilation_w == 1;
bool use_neon_3x3_s2 = filter_h == 3 && filter_w == 3
&& stride_h == 2 && stride_w == 2 && dilation_h == 1 && dilation_w == 1;
bool use_neon_1x1_s1 = filter_h == 1 && filter_w == 1
&& stride_h == 1 && stride_w == 1 && dilation_h == 1 && dilation_w == 1;
std::vector<index_t> transformed_input_shape;
std::vector<index_t> transformed_output_shape;
std::vector<index_t> transformed_filter_shape;
// When size of input feature map is bigger than 16x16,
// set winograd out tile size to 6 to get higher performance.
index_t winograd_out_tile_size = 2;
if (input_height > 16 && input_width > 16) {
winograd_out_tile_size = 6;
}
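// For example, with winograd_out_tile_size = 6 each tile maps an 8x8
// ((6 + 2) x (6 + 2)) input patch to a 6x6 output patch, so a 30x40
// output is covered by 5 x 7 = 35 tiles after the width is rounded up to 42.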
if (use_winograd) {
extra_output_height = RoundUp<index_t>(height, winograd_out_tile_size);
extra_input_height = std::max(padded_input_height, extra_output_height + 2);
extra_output_width = RoundUp<index_t>(width, winograd_out_tile_size);
extra_input_width = std::max(padded_input_width, extra_output_width + 2);
if (extra_input_height != padded_input_height) {
pad_bottom += (extra_input_height - padded_input_height);
}
if (extra_input_width != padded_input_width) {
pad_right += (extra_input_width - padded_input_width);
}
index_t tile_height_count = extra_output_height / winograd_out_tile_size;
index_t tile_width_count = extra_output_width / winograd_out_tile_size;
index_t tile_count = tile_height_count * tile_width_count;
index_t in_tile_area =
(winograd_out_tile_size + 2) * (winograd_out_tile_size + 2);
transformed_input_shape.insert(transformed_input_shape.end(),
{in_tile_area, batch, input_channels,
tile_count});
transformed_output_shape.insert(transformed_output_shape.end(),
{in_tile_area, batch, channels,
tile_count});
transformed_filter_shape.insert(transformed_filter_shape.end(),
{in_tile_area, channels, input_channels});
} else if (use_neon_3x3_s1) {
extra_output_height = RoundUp<index_t>(height, 2);
extra_input_height = std::max(padded_input_height, extra_output_height + 2);
extra_output_width = RoundUp<index_t>(width, 4);
extra_input_width = std::max(padded_input_width, extra_output_width + 2);
if (extra_input_height != padded_input_height) {
pad_bottom += (extra_input_height - padded_input_height);
}
if (extra_input_width != padded_input_width) {
pad_right += (extra_input_width - padded_input_width);
}
} else if (use_neon_3x3_s2) {
extra_output_height = height;
extra_input_height =
std::max(padded_input_height, (extra_output_height - 1) * 2 + 3);
extra_output_width = RoundUp<index_t>(width, 4);
extra_input_width =
std::max(padded_input_width, (extra_output_width - 1) * 2 + 3);
if (extra_input_height != padded_input_height) {
pad_bottom += (extra_input_height - padded_input_height);
}
if (extra_input_width != padded_input_width) {
pad_right += (extra_input_width - padded_input_width);
}
}
// decide scratch size before allocating it
index_t total_scratch_size = 0;
index_t transformed_input_size = 0;
index_t transformed_output_size = 0;
index_t padded_input_size = 0;
index_t padded_output_size = 0;
if (use_winograd) {
transformed_input_size =
std::accumulate(transformed_input_shape.begin(),
transformed_input_shape.end(),
1,
std::multiplies<index_t>()) * sizeof(float);
transformed_output_size =
std::accumulate(transformed_output_shape.begin(),
transformed_output_shape.end(),
1,
std::multiplies<index_t>()) * sizeof(float);
total_scratch_size += transformed_input_size + transformed_output_size;
}
if (extra_input_height != input_height || extra_input_width != input_width) {
padded_input_size =
batch * input_channels * (input_height + pad_top + pad_bottom)
* (input_width + pad_left + pad_right) * sizeof(float);
total_scratch_size += padded_input_size;
}
if (extra_output_height != height || extra_output_width != width) {
padded_output_size =
batch * channels * extra_output_height * extra_output_width
* sizeof(float);
total_scratch_size += padded_output_size;
}
// Init scratch buffer
scratch_->Rewind();
scratch_->GrowSize(total_scratch_size);
Tensor transformed_input(scratch_->Scratch(transformed_input_size), DT_FLOAT);
Tensor transformed_output(scratch_->Scratch(transformed_output_size), DT_FLOAT);
Tensor padded_input(scratch_->Scratch(padded_input_size), DT_FLOAT);
Tensor padded_output(scratch_->Scratch(padded_output_size), DT_FLOAT);
// decide which convolution function to call
if (use_winograd) {
transformed_input.Resize(transformed_input_shape);
transformed_output.Resize(transformed_output_shape);
const float *transformed_filter_ptr;
if (transformed_filter_.dim_size() == 0) {
transformed_filter_.Resize(transformed_filter_shape);
if (is_filter_transformed_) {
transformed_filter_ptr = filter_data;
} else {
switch (winograd_out_tile_size) {
case 2:
TransformFilter4x4(filter_data,
filter_shape[1],
filter_shape[0],
transformed_filter_.mutable_data<float>());
break;
case 6:
TransformFilter8x8(filter_data,
filter_shape[1],
filter_shape[0],
transformed_filter_.mutable_data<float>());
break;
default: MACE_NOT_IMPLEMENTED;
}
transformed_filter_ptr = transformed_filter_.data<float>();
}
} else {
transformed_filter_ptr = transformed_filter_.data<float>();
}
conv_func = [&](const float *pad_input, float *pad_output) {
WinoGradConv3x3s1(pad_input,
transformed_filter_ptr,
batch,
extra_input_height,
extra_input_width,
input_channels,
channels,
winograd_out_tile_size,
transformed_input.mutable_data<float>(),
transformed_output.mutable_data<float>(),
pad_output);
};
} else if (use_neon_3x3_s1) {
conv_func = [=](const float *pad_input, float *pad_output) {
Conv2dNeonK3x3S1(pad_input,
filter_data,
batch,
extra_input_height,
extra_input_width,
input_channels,
extra_output_height,
extra_output_width,
channels,
pad_output);
};
} else if (use_neon_3x3_s2) {
conv_func = [=](const float *pad_input, float *pad_output) {
Conv2dNeonK3x3S2(pad_input,
filter_data,
batch,
extra_input_height,
extra_input_width,
input_channels,
extra_output_height,
extra_output_width,
channels,
pad_output);
};
} else if (use_neon_1x1_s1) {
conv_func = [=](const float *pad_input, float *pad_output) {
Conv2dNeonK1x1S1(pad_input,
filter_data,
batch,
extra_input_height,
extra_input_width,
input_channels,
channels,
pad_output);
};
} else {
conv_func = [=](const float *pad_input, float *pad_output) {
Conv2dNCHW(pad_input,
filter_data,
batch,
extra_input_height,
extra_input_width,
input_channels,
extra_output_height,
extra_output_width,
channels,
filter_h,
filter_w,
stride_h,
stride_w,
dilation_h,
dilation_w,
pad_output);
};
}
// pad input and output
const Tensor *pad_input_ptr = input;
if (extra_input_height != input_height || extra_input_width != input_width) {
padded_input.Clear();
ConstructNCHWInputWithSpecificPadding(input,
pad_top,
pad_bottom,
pad_left,
pad_right,
&padded_input);
pad_input_ptr = &padded_input;
}
Tensor *pad_output_ptr = output;
if (extra_output_height != height || extra_output_width != width) {
padded_output.Resize({batch, channels, extra_output_height,
extra_output_width});
padded_output.Clear();
pad_output_ptr = &padded_output;
}
const float *pad_input_data = pad_input_ptr->data<float>();
float *pad_output_data = pad_output_ptr->mutable_data<float>();
conv_func(pad_input_data, pad_output_data);
// unpack output
if (extra_output_height != height || extra_output_width != width) {
#pragma omp parallel for collapse(2)
for (index_t b = 0; b < batch; ++b) {
for (index_t c = 0; c < channels; ++c) {
for (index_t h = 0; h < height; ++h) {
memcpy(
output_data + b * channels * height * width + c * height * width
+ h * width,
pad_output_data
+ b * channels * extra_output_height * extra_output_width
+ c * extra_output_height * extra_output_width
+ h * extra_output_width,
sizeof(float) * width);
}
}
}
}
if (bias_data != nullptr) {
#pragma omp parallel for collapse(2)
for (index_t b = 0; b < batch; ++b) {
for (index_t c = 0; c < channels; ++c) {
for (index_t i = 0; i < height * width; ++i) {
output_data[(b * channels + c) * height * width + i] += bias_data[c];
}
}
}
}
DoActivation(output_data, output_data, output->size(), activation_,
relux_max_limit_);
}
} // namespace kernels
} // namespace mace
......@@ -12,49 +12,46 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_KERNELS_GLOBAL_AVG_POOLING_H_
#define MACE_KERNELS_GLOBAL_AVG_POOLING_H_
#ifndef MACE_KERNELS_ARM_CONV_2D_NEON_H_
#define MACE_KERNELS_ARM_CONV_2D_NEON_H_
#include "mace/core/future.h"
#include "mace/core/tensor.h"
#include "mace/core/types.h"
namespace mace {
namespace kernels {
template <DeviceType D, typename T>
struct GlobalAvgPoolingFunctor {
void operator()(const T *input,
const index_t *input_shape,
T *output,
StatsFuture *future) {
index_t batch = input_shape[0];
index_t channels = input_shape[1];
index_t height = input_shape[2];
index_t width = input_shape[3];
index_t image_size = height * width;
index_t input_offset = 0;
index_t total_channels = batch * channels;
for (int c = 0; c < total_channels; ++c) {
T sum = 0;
for (int i = 0; i < image_size; ++i) {
sum += input[input_offset + i];
}
output[c] = sum / image_size;
input_offset += image_size;
}
}
};
template <>
void GlobalAvgPoolingFunctor<DeviceType::NEON, float>::operator()(
const float *input,
const index_t *input_shape,
float *output,
StatsFuture *future);
extern void Conv2dNeonK1x1S1(const float *input,
const float *filter,
const index_t batch,
const index_t height,
const index_t width,
const index_t in_channels,
const index_t out_channels,
float *output);
extern void Conv2dNeonK3x3S1(const float *input,
const float *filter,
const index_t batch,
const index_t in_height,
const index_t in_width,
const index_t in_channels,
const index_t out_height,
const index_t out_width,
const index_t out_channels,
float *output);
extern void Conv2dNeonK3x3S2(const float *input,
const float *filter,
const index_t batch,
const index_t in_height,
const index_t in_width,
const index_t in_channels,
const index_t out_height,
const index_t out_width,
const index_t out_channels,
float *output);
} // namespace kernels
} // namespace mace
#endif // MACE_KERNELS_GLOBAL_AVG_POOLING_H_
#endif // MACE_KERNELS_ARM_CONV_2D_NEON_H_
......@@ -16,7 +16,7 @@
#include <arm_neon.h>
#endif
#include "mace/core/types.h"
#include "mace/kernels/arm/conv_2d_neon.h"
#include "mace/kernels/gemm.h"
namespace mace {
......
......@@ -16,7 +16,7 @@
#include <arm_neon.h>
#endif
#include "mace/core/types.h"
#include "mace/kernels/arm/conv_2d_neon.h"
namespace mace {
namespace kernels {
......
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/kernels/depthwise_conv2d.h"
#include "mace/kernels/activation.h"
namespace mace {
namespace kernels {
namespace {
void DepthwiseConv2dNCHW(const float *input,
const float *filter,
const index_t batch,
const index_t in_height,
const index_t in_width,
const index_t in_channels,
const index_t out_height,
const index_t out_width,
const index_t out_channels,
const int filter_height,
const int filter_width,
const int stride_h,
const int stride_w,
const int dilation_h,
const int dilation_w,
const int pad_top,
const int pad_left,
float *output) {
const index_t multiplier = out_channels / in_channels;
#pragma omp parallel for collapse(2)
for (index_t b = 0; b < batch; ++b) {
for (index_t m = 0; m < out_channels; ++m) {
for (index_t h = 0; h < out_height; ++h) {
for (index_t w = 0; w < out_width; ++w) {
index_t out_offset =
((b * out_channels + m) * out_height + h) * out_width + w;
index_t c = m / multiplier;
index_t o = m % multiplier;
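// For example, in_channels = 3 and multiplier = 2 give out_channels = 6;
// output channel m = 5 reads input channel c = 5 / 2 = 2 with filter
// copy o = 5 % 2 = 1.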
float sum = 0;
for (index_t kh = 0; kh < filter_height; ++kh) {
for (index_t kw = 0; kw < filter_width; ++kw) {
index_t ih = h * stride_h + kh * dilation_h - pad_top;
index_t iw = w * stride_w + kw * dilation_w - pad_left;
if (ih >= 0 && ih < in_height && iw >= 0 && iw < in_width) {
index_t in_offset =
((b * in_channels + c) * in_height + ih) * in_width + iw;
index_t filter_offset =
(((o * in_channels) + c) * filter_height + kh) * filter_width
+ kw;
sum += input[in_offset] * filter[filter_offset];
}
}
}
output[out_offset] = sum;
}
}
}
}
}
} // namespace
extern void DepthwiseConv2dNeonK3x3S1(const float *input,
const float *filter,
const index_t batch,
const index_t in_height,
const index_t in_width,
const index_t in_channels,
const index_t out_height,
const index_t out_width,
const index_t out_channels,
const int pad_top,
const int pad_left,
const index_t valid_h_start,
const index_t valid_h_stop,
const index_t valid_w_start,
const index_t valid_w_stop,
float *output);
extern void DepthwiseConv2dNeonK3x3S2(const float *input,
const float *filter,
const index_t batch,
const index_t in_height,
const index_t in_width,
const index_t in_channels,
const index_t out_height,
const index_t out_width,
const index_t out_channels,
const int pad_top,
const int pad_left,
const index_t valid_h_start,
const index_t valid_h_stop,
const index_t valid_w_start,
const index_t valid_w_stop,
float *output);
void DepthwiseConv2dFunctor<DeviceType::NEON,
float>::operator()(const Tensor *input,
const Tensor *filter,
const Tensor *bias,
Tensor *output,
StatsFuture *future) {
MACE_CHECK_NOTNULL(input);
MACE_CHECK_NOTNULL(filter);
MACE_CHECK_NOTNULL(output);
std::vector<index_t> output_shape(4);
std::vector<int> paddings(2);
std::vector<index_t> filter_shape
{filter->dim(0) * filter->dim(1), filter->dim(1), filter->dim(2),
filter->dim(3)};
if (paddings_.empty()) {
CalcNCHWPaddingAndOutputSize(input->shape().data(),
filter_shape.data(),
dilations_,
strides_,
padding_type_,
output_shape.data(),
paddings.data());
} else {
paddings = paddings_;
CalcNCHWOutputSize(input->shape().data(), filter_shape.data(),
paddings_.data(), dilations_, strides_, RoundType::FLOOR,
output_shape.data());
}
output->Resize(output_shape);
output->Clear();
index_t batch = output->dim(0);
index_t channels = output->dim(1);
index_t height = output->dim(2);
index_t width = output->dim(3);
index_t input_batch = input->dim(0);
index_t input_channels = input->dim(1);
index_t input_height = input->dim(2);
index_t input_width = input->dim(3);
index_t filter_h = filter_shape[2];
index_t filter_w = filter_shape[3];
MACE_CHECK(filter_shape[0] == channels, filter_shape[0], " != ", channels);
MACE_CHECK(filter_shape[1] == input_channels, filter_shape[1], " != ",
input_channels);
index_t stride_h = strides_[0];
index_t stride_w = strides_[1];
index_t dilation_h = dilations_[0];
index_t dilation_w = dilations_[1];
MACE_CHECK(batch == input_batch, "Input/Output batch size mismatch");
int pad_top = paddings[0] >> 1;
int pad_bottom = paddings[0] - pad_top;
int pad_left = paddings[1] >> 1;
int pad_right = paddings[1] - pad_left;
int valid_h_start = pad_top == 0 ? 0 : (pad_top - 1) / stride_h + 1;
int valid_h_stop = pad_bottom == 0
? height
: height - ((pad_bottom - 1) / stride_h + 1);
int valid_w_start = pad_left == 0 ? 0 : (pad_left - 1) / stride_w + 1;
int valid_w_stop = pad_right == 0
? width
: width - ((pad_right - 1) / stride_w + 1);
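// [valid_h_start, valid_h_stop) x [valid_w_start, valid_w_stop) is the
// output region whose receptive field stays inside the unpadded input,
// so the NEON kernels can skip bounds checks there. For example,
// pad_top = 1 and stride_h = 2 give valid_h_start = (1 - 1) / 2 + 1 = 1,
// leaving output row 0 to the checked border path.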
std::function<void(const float *input, float *output)> conv_func;
auto input_data = input->data<float>();
auto filter_data = filter->data<float>();
auto bias_data = bias == nullptr ? nullptr : bias->data<float>();
auto output_data = output->mutable_data<float>();
if (filter_h == 3 && filter_w == 3 && stride_h == 1 && stride_w == 1
&& dilation_h == 1 && dilation_w == 1) {
conv_func = [=](const float *input, float *output) {
DepthwiseConv2dNeonK3x3S1(input,
filter_data,
batch,
input_height,
input_width,
input_channels,
height,
width,
channels,
pad_top,
pad_left,
valid_h_start,
valid_h_stop,
valid_w_start,
valid_w_stop,
output);
};
} else if (filter_h == 3 && filter_w == 3 && stride_h == 2 && stride_w == 2
&& dilation_h == 1 && dilation_w == 1) {
conv_func = [=](const float *input, float *output) {
DepthwiseConv2dNeonK3x3S2(input,
filter_data,
batch,
input_height,
input_width,
input_channels,
height,
width,
channels,
pad_top,
pad_left,
valid_h_start,
valid_h_stop,
valid_w_start,
valid_w_stop,
output);
};
} else {
conv_func = [=](const float *input, float *output) {
DepthwiseConv2dNCHW(input,
filter_data,
batch,
input_height,
input_width,
input_channels,
height,
width,
channels,
filter_h,
filter_w,
stride_h,
stride_w,
dilation_h,
dilation_w,
pad_top,
pad_left,
output);
};
}
conv_func(input_data, output_data);
if (bias_data != nullptr) {
#pragma omp parallel for collapse(2)
for (index_t b = 0; b < batch; ++b) {
for (index_t c = 0; c < channels; ++c) {
for (index_t i = 0; i < height * width; ++i) {
output_data[(b * channels + c) * height * width + i] += bias_data[c];
}
}
}
}
DoActivation(output_data, output_data, output->size(), activation_,
relux_max_limit_);
}
} // namespace kernels
} // namespace mace
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_KERNELS_ARM_DEPTHWISE_CONV2D_NEON_H_
#define MACE_KERNELS_ARM_DEPTHWISE_CONV2D_NEON_H_
#include "mace/core/types.h"
namespace mace {
namespace kernels {
void DepthwiseConv2dNeonK3x3S1(const float *input,
const float *filter,
const index_t batch,
const index_t in_height,
const index_t in_width,
const index_t in_channels,
const index_t out_height,
const index_t out_width,
const index_t out_channels,
const int pad_top,
const int pad_left,
const index_t valid_h_start,
const index_t valid_h_stop,
const index_t valid_w_start,
const index_t valid_w_stop,
float *output);
void DepthwiseConv2dNeonK3x3S2(const float *input,
const float *filter,
const index_t batch,
const index_t in_height,
const index_t in_width,
const index_t in_channels,
const index_t out_height,
const index_t out_width,
const index_t out_channels,
const int pad_top,
const int pad_left,
const index_t valid_h_start,
const index_t valid_h_stop,
const index_t valid_w_start,
const index_t valid_w_stop,
float *output);
} // namespace kernels
} // namespace mace
#endif // MACE_KERNELS_ARM_DEPTHWISE_CONV2D_NEON_H_
......@@ -16,7 +16,7 @@
#include <arm_neon.h>
#endif
#include "mace/core/types.h"
#include "mace/kernels/arm/depthwise_conv2d_neon.h"
namespace mace {
namespace kernels {
......@@ -60,10 +60,10 @@ void DepthwiseConv2dNeonK3x3S1(const float *input,
const index_t out_channels,
const int pad_top,
const int pad_left,
const int valid_h_start,
const int valid_h_stop,
const int valid_w_start,
const int valid_w_stop,
const index_t valid_h_start,
const index_t valid_h_stop,
const index_t valid_w_start,
const index_t valid_w_stop,
float *output) {
const index_t multiplier = out_channels / in_channels;
const index_t in_image_size = in_height * in_width;
......@@ -277,10 +277,10 @@ void DepthwiseConv2dNeonK3x3S2(const float *input,
const index_t out_channels,
const int pad_top,
const int pad_left,
const int valid_h_start,
const int valid_h_stop,
const int valid_w_start,
const int valid_w_stop,
const index_t valid_h_start,
const index_t valid_h_stop,
const index_t valid_w_start,
const index_t valid_w_stop,
float *output) {
const index_t multiplier = out_channels / in_channels;
const index_t in_image_size = in_height * in_width;
......
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/kernels/fully_connected.h"
#include "mace/kernels/gemm.h"
namespace mace {
namespace kernels {
void FullyConnectedFunctor<DeviceType::NEON,
float>::operator()(const Tensor *input,
const Tensor *weight,
const Tensor *bias,
Tensor *output,
StatsFuture *future) {
std::vector<index_t> output_shape = {input->dim(0), weight->dim(0), 1, 1};
output->Resize(output_shape);
const index_t N = output->dim(0);
const index_t input_size = weight->dim(1);
const index_t output_size = weight->dim(0);
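// Gemv computes output = weight * input for each of the N batch items;
// e.g. a [1000, 2048] weight maps a 2048-d input vector to 1000 logits,
// with the output resized to [N, output_size, 1, 1] above.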
const float *input_ptr = input->data<float>();
const float *weight_ptr = weight->data<float>();
const float *bias_ptr = bias == nullptr ? nullptr : bias->data<float>();
float *output_ptr = output->mutable_data<float>();
Gemv(weight_ptr, input_ptr, N, input_size, output_size, output_ptr);
if (bias_ptr != nullptr) {
for (index_t i = 0; i < N; ++i) {
for (index_t j = 0; j < output_size; ++j) {
output_ptr[j + i * output_size] += bias_ptr[j];
}
}
}
DoActivation(output_ptr, output_ptr, output->size(), activation_,
relux_max_limit_);
}
} // namespace kernels
} // namespace mace
//
// Copyright (c) 2018 XiaoMi All rights reserved.
//
#include "mace/kernels/local_response_norm.h"
namespace mace {
namespace kernels {
void LocalResponseNormFunctor<DeviceType::NEON, float>::operator()(
const Tensor *input,
int depth_radius,
float bias,
float alpha,
float beta,
Tensor *output,
StatsFuture *future) {
const index_t batch = input->dim(0);
const index_t channels = input->dim(1);
const index_t height = input->dim(2);
const index_t width = input->dim(3);
const float *input_ptr = input->data<float>();
float *output_ptr = output->mutable_data<float>();
index_t image_size = height * width;
index_t batch_size = channels * image_size;
#pragma omp parallel for collapse(2)
for (index_t b = 0; b < batch; ++b) {
for (index_t c = 0; c < channels; ++c) {
const index_t begin_input_c = std::max(static_cast<index_t>(0),
c - depth_radius);
const index_t end_input_c = std::min(channels, c + depth_radius + 1);
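// The accumulation window spans channels [c - depth_radius,
// c + depth_radius], clamped to the tensor, e.g. depth_radius = 2 sums
// the squares of up to 5 neighboring channels.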
index_t pos = b * batch_size;
for (index_t hw = 0; hw < height * width; ++hw, ++pos) {
float accum = 0.f;
for (index_t input_c = begin_input_c; input_c < end_input_c; ++input_c) {
const float input_val = input_ptr[pos + input_c * image_size];
accum += input_val * input_val;
}
const float multiplier = std::pow(bias + alpha * accum, -beta);
output_ptr[pos + c * image_size] =
input_ptr[pos + c * image_size] * multiplier;
}
}
}
}
} // namespace kernels
} // namespace mace
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/kernels/pooling.h"
namespace mace {
namespace kernels {
namespace {
void MaxPooling(const float *input,
const index_t batch,
const index_t in_height,
const index_t in_width,
const index_t channels,
const index_t out_height,
const index_t out_width,
const int filter_height,
const int filter_width,
const int stride_h,
const int stride_w,
const int dilation_h,
const int dilation_w,
const int pad_top,
const int pad_left,
float *output) {
const index_t in_image_size = in_height * in_width;
const index_t out_image_size = out_height * out_width;
const index_t in_batch_size = channels * in_image_size;
const index_t out_batch_size = channels * out_image_size;
#pragma omp parallel for collapse(2)
for (index_t b = 0; b < batch; ++b) {
for (index_t c = 0; c < channels; ++c) {
const index_t out_base = b * out_batch_size + c * out_image_size;
const index_t in_base = b * in_batch_size + c * in_image_size;
for (index_t h = 0; h < out_height; ++h) {
for (index_t w = 0; w < out_width; ++w) {
const index_t out_offset = out_base + h * out_width + w;
float res = std::numeric_limits<float>::lowest();
for (int fh = 0; fh < filter_height; ++fh) {
for (int fw = 0; fw < filter_width; ++fw) {
int inh = h * stride_h + dilation_h * fh - pad_top;
int inw = w * stride_w + dilation_w * fw - pad_left;
if (inh >= 0 && inh < in_height && inw >= 0 && inw < in_width) {
index_t input_offset = in_base + inh * in_width + inw;
res = std::max(res, input[input_offset]);
}
}
}
output[out_offset] = res;
}
}
}
}
}
void AvgPooling(const float *input,
const index_t batch,
const index_t in_height,
const index_t in_width,
const index_t channels,
const index_t out_height,
const index_t out_width,
const int filter_height,
const int filter_width,
const int stride_h,
const int stride_w,
const int dilation_h,
const int dilation_w,
const int pad_top,
const int pad_left,
float *output) {
const index_t in_image_size = in_height * in_width;
const index_t out_image_size = out_height * out_width;
const index_t in_batch_size = channels * in_image_size;
const index_t out_batch_size = channels * out_image_size;
#pragma omp parallel for collapse(2)
for (index_t b = 0; b < batch; ++b) {
for (index_t c = 0; c < channels; ++c) {
const index_t out_base = b * out_batch_size + c * out_image_size;
const index_t in_base = b * in_batch_size + c * in_image_size;
for (index_t h = 0; h < out_height; ++h) {
for (index_t w = 0; w < out_width; ++w) {
const index_t out_offset = out_base + h * out_width + w;
float res = 0;
int block_size = 0;
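// block_size counts only in-bounds taps, so windows overlapping the
// padding average over fewer elements (count_include_pad = false
// semantics).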
for (int fh = 0; fh < filter_height; ++fh) {
for (int fw = 0; fw < filter_width; ++fw) {
int inh = h * stride_h + dilation_h * fh - pad_top;
int inw = w * stride_w + dilation_w * fw - pad_left;
if (inh >= 0 && inh < in_height && inw >= 0 && inw < in_width) {
index_t input_offset = in_base + inh * in_width + inw;
res += input[input_offset];
++block_size;
}
}
}
output[out_offset] = res / block_size;
}
}
}
}
}
} // namespace
void PoolingFunctor<DeviceType::NEON,
float>::operator()(const Tensor *input_tensor,
Tensor *output_tensor,
StatsFuture *future) {
std::vector<index_t> output_shape(4);
std::vector<index_t> filter_shape = {
input_tensor->dim(1), input_tensor->dim(1), kernels_[0], kernels_[1]};
std::vector<int> paddings(2);
if (paddings_.empty()) {
kernels::CalcNCHWPaddingAndOutputSize(
input_tensor->shape().data(), filter_shape.data(), dilations_,
strides_, padding_type_, output_shape.data(), paddings.data());
} else {
paddings = paddings_;
CalcNCHWOutputSize(input_tensor->shape().data(), filter_shape.data(),
paddings_.data(), dilations_, strides_, RoundType::CEIL,
output_shape.data());
}
output_tensor->Resize(output_shape);
const float *input = input_tensor->data<float>();
float *output = output_tensor->mutable_data<float>();
const index_t *input_shape = input_tensor->shape().data();
index_t batch = output_shape[0];
index_t channels = output_shape[1];
index_t height = output_shape[2];
index_t width = output_shape[3];
index_t input_channels = input_shape[1];
index_t input_height = input_shape[2];
index_t input_width = input_shape[3];
index_t in_image_size = input_height * input_width;
int filter_h = kernels_[0];
int filter_w = kernels_[1];
int stride_h = strides_[0];
int stride_w = strides_[1];
int dilation_h = dilations_[0];
int dilation_w = dilations_[1];
int pad_top = paddings[0] / 2;
int pad_left = paddings[1] / 2;
if (pooling_type_ == PoolingType::MAX) {
MaxPooling(input,
batch,
input_height,
input_width,
channels,
height,
width,
filter_h,
filter_w,
stride_h,
stride_w,
dilation_h,
dilation_w,
pad_top,
pad_left,
output);
} else if (pooling_type_ == PoolingType::AVG) {
AvgPooling(input,
batch,
input_height,
input_width,
channels,
height,
width,
filter_h,
filter_w,
stride_h,
stride_w,
dilation_h,
dilation_w,
pad_top,
pad_left,
output);
} else {
MACE_NOT_IMPLEMENTED;
}
}
} // namespace kernels
} // namespace mace
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/kernels/softmax.h"
namespace mace {
namespace kernels {
void SoftmaxFunctor<DeviceType::NEON, float>::operator()(const Tensor *input,
Tensor *output,
StatsFuture *future) {
const index_t batch = input->dim(0);
const index_t class_count = input->dim(1);
const index_t class_size = input->dim(2) * input->dim(3);
const float *input_data = input->data<float>();
float *output_data = output->mutable_data<float>();
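// Softmax is taken over the class (channel) dimension of the NCHW input,
// independently at each of the class_size spatial positions. For logits
// [1, 2, 3] at one position: exp([-2, -1, 0]) = [0.135, 0.368, 1],
// sum = 1.503, giving probabilities [0.09, 0.245, 0.665].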
for (index_t b = 0; b < batch; ++b) {
std::vector<float> max_val(class_size,
std::numeric_limits<float>::lowest());
std::vector<float> sum_val(class_size, 0.f);
// calculate max for each class
for (index_t c = 0; c < class_count; ++c) {
const float *input_ptr = input_data + (b * class_count + c) * class_size;
for (index_t k = 0; k < class_size; ++k) {
max_val[k] = std::max(max_val[k], input_ptr[k]);
}
}
// calculate data - max for each class
#pragma omp parallel for
for (index_t c = 0; c < class_count; ++c) {
const float *input_ptr = input_data + (b * class_count + c) * class_size;
float *output_ptr = output_data + (b * class_count + c) * class_size;
for (index_t k = 0; k < class_size; ++k) {
output_ptr[k] = ::exp(input_ptr[k] - max_val[k]);
}
}
// calculate sum for each class
for (index_t c = 0; c < class_count; ++c) {
float *output_ptr = output_data + (b * class_count + c) * class_size;
for (index_t k = 0; k < class_size; ++k) {
sum_val[k] += output_ptr[k];
}
}
// calculate (data - max) / sum for each class
for (index_t c = 0; c < class_count; ++c) {
float *output_ptr = output_data + (b * class_count + c) * class_size;
for (index_t k = 0; k < class_size; ++k) {
output_ptr[k] /= sum_val[k];
}
}
}
}
} // namespace kernels
} // namespace mace
......@@ -34,21 +34,24 @@ struct BatchNormFunctorBase {
BatchNormFunctorBase(bool folded_constant,
const ActivationType activation,
const float relux_max_limit)
: folded_constant_(folded_constant),
activation_(activation),
relux_max_limit_(relux_max_limit) {}
: folded_constant_(folded_constant),
activation_(activation),
relux_max_limit_(relux_max_limit) {}
const bool folded_constant_;
const ActivationType activation_;
const float relux_max_limit_;
};
template <DeviceType D, typename T>
struct BatchNormFunctor : BatchNormFunctorBase {
template<DeviceType D, typename T>
struct BatchNormFunctor;
template<>
struct BatchNormFunctor<DeviceType::CPU, float> : BatchNormFunctorBase {
BatchNormFunctor(const bool folded_constant,
const ActivationType activation,
const float relux_max_limit)
: BatchNormFunctorBase(folded_constant, activation, relux_max_limit) {}
: BatchNormFunctorBase(folded_constant, activation, relux_max_limit) {}
void operator()(const Tensor *input,
const Tensor *scale,
......@@ -67,29 +70,29 @@ struct BatchNormFunctor : BatchNormFunctorBase {
// new_offset = offset - mean * new_scale;
// Y = new_scale * X + new_offset;
const index_t batch = input->dim(0);
const index_t height = input->dim(1);
const index_t width = input->dim(2);
const index_t channels = input->dim(3);
const index_t channels = input->dim(1);
const index_t height = input->dim(2);
const index_t width = input->dim(3);
Tensor::MappingGuard input_mapper(input);
Tensor::MappingGuard scale_mapper(scale);
Tensor::MappingGuard offset_mapper(offset);
Tensor::MappingGuard output_mapper(output);
const T *input_ptr = input->data<T>();
const T *scale_ptr = scale->data<T>();
const T *offset_ptr = offset->data<T>();
T *output_ptr = output->mutable_data<T>();
const float *input_ptr = input->data<float>();
const float *scale_ptr = scale->data<float>();
const float *offset_ptr = offset->data<float>();
float *output_ptr = output->mutable_data<float>();
std::vector<T> new_scale;
std::vector<T> new_offset;
std::vector<float> new_scale;
std::vector<float> new_offset;
if (!folded_constant_) {
new_scale.resize(channels);
new_offset.resize(channels);
Tensor::MappingGuard mean_mapper(mean);
Tensor::MappingGuard var_mapper(var);
const T *mean_ptr = mean->data<T>();
const T *var_ptr = var->data<T>();
const float *mean_ptr = mean->data<float>();
const float *var_ptr = var->data<float>();
#pragma omp parallel for
for (index_t c = 0; c < channels; ++c) {
new_scale[c] = scale_ptr[c] / std::sqrt(var_ptr[c] + epsilon);
......@@ -97,44 +100,21 @@ struct BatchNormFunctor : BatchNormFunctorBase {
}
}
const T *scale_data = folded_constant_ ? scale_ptr : new_scale.data();
const T *offset_data = folded_constant_ ? offset_ptr : new_offset.data();
const float *scale_data = folded_constant_ ? scale_ptr : new_scale.data();
const float *offset_data =
folded_constant_ ? offset_ptr : new_offset.data();
const int elements = batch * height * width;
constexpr int c_tile_size = 4;
const int c_tiles = channels / c_tile_size;
const index_t remains_start = c_tiles * c_tile_size;
index_t channel_size = height * width;
index_t batch_size = channels * channel_size;
if (c_tiles > 0) {
// NEON is slower, so stick to the trivial implementation
#pragma omp parallel for collapse(2)
for (index_t i = 0; i < elements; ++i) {
for (int cb = 0; cb < c_tiles; ++cb) {
#if defined(MACE_ENABLE_NEON) && defined(__aarch64__)
static_assert(c_tile_size == 4, "channels tile size must be 4");
int c = cb * c_tile_size;
int pos = i * channels + c;
float32x4_t scales = vld1q_f32(scale_data + c);
float32x4_t offsets = vld1q_f32(offset_data + c);
float32x4_t in = vld1q_f32(input_ptr + pos);
float32x4_t out = vfmaq_f32(offsets, scales, in);
vst1q_f32(output_ptr + pos, out);
#else
for (int ci = 0; ci < c_tile_size; ++ci) {
int c = cb * c_tile_size + ci;
index_t pos = i * channels + c;
output_ptr[pos] = scale_data[c] * input_ptr[pos] + offset_data[c];
}
#endif
}
}
}
if (remains_start < channels) {
#pragma omp parallel for collapse(2)
for (index_t i = 0; i < elements; ++i) {
for (index_t c = remains_start; c < channels; ++c) {
index_t pos = i * channels + c;
output_ptr[pos] = scale_data[c] * input_ptr[pos] + offset_data[c];
for (index_t b = 0; b < batch; ++b) {
for (index_t c = 0; c < channels; ++c) {
index_t offset = b * batch_size + c * channel_size;
for (index_t hw = 0; hw < height * width; ++hw) {
output_ptr[offset + hw] =
scale_data[c] * input_ptr[offset + hw] + offset_data[c];
}
}
}
......@@ -143,29 +123,12 @@ struct BatchNormFunctor : BatchNormFunctorBase {
}
};
template <>
struct BatchNormFunctor<DeviceType::NEON, float> : BatchNormFunctorBase {
BatchNormFunctor(const bool folded_constant,
const ActivationType activation,
const float relux_max_limit)
: BatchNormFunctorBase(folded_constant, activation, relux_max_limit) {}
void operator()(const Tensor *input,
const Tensor *scale,
const Tensor *offset,
const Tensor *mean,
const Tensor *var,
const float epsilon,
Tensor *output,
StatsFuture *future);
};
template <typename T>
template<typename T>
struct BatchNormFunctor<DeviceType::OPENCL, T> : BatchNormFunctorBase {
BatchNormFunctor(const bool folded_constant,
const ActivationType activation,
const float relux_max_limit)
: BatchNormFunctorBase(folded_constant, activation, relux_max_limit) {}
: BatchNormFunctorBase(folded_constant, activation, relux_max_limit) {}
void operator()(const Tensor *input,
const Tensor *scale,
const Tensor *offset,
......
......@@ -26,49 +26,41 @@
namespace mace {
namespace kernels {
template <DeviceType D, typename T>
struct BiasAddFunctor {
template<DeviceType D, typename T>
struct BiasAddFunctor;
template<>
struct BiasAddFunctor<DeviceType::CPU, float> {
void operator()(const Tensor *input,
const Tensor *bias,
Tensor *output,
StatsFuture *future) {
const index_t batch = input->dim(0);
const index_t height = input->dim(1);
const index_t width = input->dim(2);
const index_t channels = input->dim(3);
const index_t channels = input->dim(1);
const index_t height = input->dim(2);
const index_t width = input->dim(3);
Tensor::MappingGuard input_mapper(input);
Tensor::MappingGuard bias_mapper(bias);
Tensor::MappingGuard output_mapper(output);
const T *input_ptr = input->data<T>();
const T *bias_ptr = bias->data<T>();
T *output_ptr = output->mutable_data<T>();
const float *input_ptr = input->data<float>();
const float *bias_ptr = bias->data<float>();
float *output_ptr = output->mutable_data<float>();
#pragma omp parallel for collapse(4)
#pragma omp parallel for collapse(2)
for (index_t n = 0; n < batch; ++n) {
for (index_t h = 0; h < height; ++h) {
for (index_t w = 0; w < width; ++w) {
for (index_t c = 0; c < channels; ++c) {
index_t pos = (((n * height) + h) * width + w) * channels + c;
output_ptr[pos] = input_ptr[pos] + bias_ptr[c];
}
for (index_t c = 0; c < channels; ++c) {
for (index_t hw = 0; hw < height * width; ++hw) {
index_t pos = (n * channels + c) * height * width + hw;
output_ptr[pos] = input_ptr[pos] + bias_ptr[c];
}
}
}
}
};
/*
template <>
void BiasAddFunctor<DeviceType::NEON, float>::operator()(
const Tensor *input,
const Tensor *bias,
Tensor *output,
StatsFuture *future);
*/
template <typename T>
template<typename T>
struct BiasAddFunctor<DeviceType::OPENCL, T> {
void operator()(const Tensor *input,
const Tensor *bias,
......
......@@ -24,7 +24,7 @@
namespace mace {
namespace kernels {
template <DeviceType D, typename T>
template<DeviceType D, typename T>
struct ChannelShuffleFunctor {
explicit ChannelShuffleFunctor(const int groups) : groups_(groups) {}
......@@ -39,20 +39,25 @@ struct ChannelShuffleFunctor {
T *output_ptr = output->mutable_data<T>();
index_t batch = input->dim(0);
index_t height = input->dim(1);
index_t width = input->dim(2);
index_t channels = input->dim(3);
index_t bhw_fuse = batch * height * width;
int channels_per_group = channels / groups_;
#pragma omp parallel for
for (int bhw = 0; bhw < bhw_fuse; ++bhw) {
for (int c = 0; c < channels; ++c) {
index_t channel_base = bhw * channels;
output_ptr[channel_base + c] =
input_ptr[channel_base + c % groups_ * channels_per_group
+ c / groups_];
index_t channels = input->dim(1);
index_t height = input->dim(2);
index_t width = input->dim(3);
index_t image_size = height * width;
index_t batch_size = channels * image_size;
index_t channels_per_group = channels / groups_;
#pragma omp parallel for collapse(2)
for (index_t b = 0; b < batch; ++b) {
for (index_t c = 0; c < channels; ++c) {
const T *input_base = input_ptr + b * batch_size;
T *output_base = output_ptr + b * batch_size;
index_t g = c % groups_;
index_t idx = c / groups_;
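// Output channel c reads input channel g * channels_per_group + idx,
// e.g. channels = 6, groups = 2: output channels [0..5] read input
// channels [0, 3, 1, 4, 2, 5].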
for (index_t hw = 0; hw < height * width; ++hw) {
output_base[c * image_size + hw] =
input_base[(g * channels_per_group + idx) * image_size + hw];
}
}
}
}
......@@ -60,7 +65,7 @@ struct ChannelShuffleFunctor {
const int groups_;
};
template <typename T>
template<typename T>
struct ChannelShuffleFunctor<DeviceType::OPENCL, T> {
explicit ChannelShuffleFunctor(const int groups) : groups_(groups) {}
......
......@@ -19,6 +19,7 @@
#include <arm_neon.h>
#endif
#include <algorithm>
#include <functional>
#include <memory>
#include <vector>
......@@ -27,165 +28,13 @@
#include "mace/core/tensor.h"
#include "mace/kernels/activation.h"
#include "mace/kernels/conv_pool_2d_util.h"
#include "mace/kernels/arm/conv_2d_neon.h"
#include "mace/kernels/arm/conv_winograd.h"
#include "mace/utils/utils.h"
namespace mace {
namespace kernels {
template <typename T,
int inc_tile_size,
int c_count,
int h_count,
int w_count>
void Conv2dKernelFunc(const T *input_ptr, // batch start
const T *filter_ptr,
const T *bias_ptr,
T *output_ptr, // batch start
const int h_offset,
const int w_offset,
const int c_offset,
const int kernel_h,
const int kernel_w,
const int stride_h,
const int stride_w,
const int dilation_h,
const int dilation_w,
const int channels,
const int input_channels,
const int width,
const int padded_width) {
T sum[h_count * w_count * c_count] = {0.0f};
if (bias_ptr != nullptr) {
for (int hi = 0; hi < h_count; ++hi) {
for (int wi = 0; wi < w_count; ++wi) {
for (int ci = 0; ci < c_count; ++ci) {
const int sum_idx = (hi * w_count + wi) * c_count + ci;
sum[sum_idx] = bias_ptr[c_offset + ci];
}
}
}
}
for (int kh = 0; kh < kernel_h; ++kh) {
for (int kw = 0; kw < kernel_w; ++kw) {
int inc = 0;
for (; inc + inc_tile_size <= input_channels; inc += inc_tile_size) {
#if defined(MACE_ENABLE_NEON) && defined(__aarch64__)
// AArch64 NEON has 32 128-bit general purpose registers
static_assert(inc_tile_size == 4, "input channels tile size must be 4");
float32x4_t in[h_count * w_count]; // NOLINT(runtime/arrays)
#else
T in[h_count * w_count * inc_tile_size]; // NOLINT(runtime/arrays)
#endif
for (int hi = 0; hi < h_count; ++hi) {
for (int wi = 0; wi < w_count; ++wi) {
const int in_idx = hi * w_count + wi;
const int inh = (h_offset + hi) * stride_h + kh * dilation_h;
const int inw = (w_offset + wi) * stride_w + kw * dilation_w;
const int in_offset =
(inh * padded_width + inw) * input_channels + inc;
#if defined(MACE_ENABLE_NEON) && defined(__aarch64__)
static_assert(inc_tile_size == 4,
"input channels tile size must be 4");
in[in_idx] = vld1q_f32(input_ptr + in_offset);
#else
for (int inci = 0; inci < inc_tile_size; ++inci) {
in[in_idx * inc_tile_size + inci] = input_ptr[in_offset + inci];
}
#endif
}
}
#if defined(MACE_ENABLE_NEON) && defined(__aarch64__)
static_assert(inc_tile_size == 4, "input channels tile size must be 4");
float32x4_t weights[c_count]; // NOLINT(runtime/arrays)
#else
T weights[c_count * inc_tile_size]; // NOLINT(runtime/arrays)
#endif
for (int ci = 0; ci < c_count; ++ci) {
const int weights_idx = ci;
const int filter_offset =
((kh * kernel_w + kw) * channels + c_offset + ci) *
input_channels +
inc;
#if defined(MACE_ENABLE_NEON) && defined(__aarch64__)
weights[weights_idx] = vld1q_f32(filter_ptr + filter_offset);
#else
for (int inci = 0; inci < inc_tile_size; ++inci) {
weights[weights_idx * inc_tile_size + inci] =
filter_ptr[filter_offset + inci];
}
#endif
}
for (int hi = 0; hi < h_count; ++hi) {
for (int wi = 0; wi < w_count; ++wi) {
for (int ci = 0; ci < c_count; ++ci) {
const int weights_idx = ci;
const int in_idx = hi * w_count + wi;
const int sum_idx = (hi * w_count + wi) * c_count + ci;
#if defined(MACE_ENABLE_NEON) && defined(__aarch64__)
float32x4_t tmp = vmulq_f32(in[in_idx], weights[weights_idx]);
sum[sum_idx] += vaddvq_f32(tmp);
#else
for (int inci = 0; inci < inc_tile_size; ++inci) {
sum[sum_idx] += in[in_idx * inc_tile_size + inci] *
weights[weights_idx * inc_tile_size + inci];
}
#endif
}
}
}
}
// handling the remaining input channels
for (; inc < input_channels; ++inc) {
T in[h_count * w_count]; // NOLINT(runtime/arrays)
for (int hi = 0; hi < h_count; ++hi) {
for (int wi = 0; wi < w_count; ++wi) {
const int in_idx = hi * w_count + wi;
const int inh = (h_offset + hi) * stride_h + kh * dilation_h;
const int inw = (w_offset + wi) * stride_w + kw * dilation_w;
const int in_offset =
(inh * padded_width + inw) * input_channels + inc;
in[in_idx] = input_ptr[in_offset];
}
}
T weights[c_count]; // NOLINT(runtime/arrays)
for (int ci = 0; ci < c_count; ++ci) {
const int weights_idx = ci;
const int filter_offset =
((kh * kernel_w + kw) * channels + c_offset + ci) *
input_channels +
inc;
weights[weights_idx] = filter_ptr[filter_offset];
}
for (int hi = 0; hi < h_count; ++hi) {
for (int wi = 0; wi < w_count; ++wi) {
for (int ci = 0; ci < c_count; ++ci) {
const int weights_idx = ci;
const int in_idx = hi * w_count + wi;
const int sum_idx = (hi * w_count + wi) * c_count + ci;
sum[sum_idx] += in[in_idx] * weights[weights_idx];
}
}
}
}
}
}
// save output
for (int hi = 0; hi < h_count; ++hi) {
for (int wi = 0; wi < w_count; ++wi) {
for (int ci = 0; ci < c_count; ++ci) {
const int out_offset =
((h_offset + hi) * width + w_offset + wi) * channels + c_offset +
ci;
const int sum_idx = (hi * w_count + wi) * c_count + ci;
output_ptr[out_offset] = sum[sum_idx];
}
}
}
}
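// Illustration (not from the MACE sources): the deleted kernel above
// multiplies two 4-lane vectors with vmulq_f32 and reduces the product with
// vaddvq_f32, the AArch64-only horizontal add. A minimal standalone sketch
// of that dot-product step:
#if defined(__aarch64__)
#include <arm_neon.h>
static inline float DotProduct4(const float *a, const float *b) {
  float32x4_t va = vld1q_f32(a);         // load 4 floats from a
  float32x4_t vb = vld1q_f32(b);         // load 4 floats from b
  float32x4_t prod = vmulq_f32(va, vb);  // lane-wise products
  return vaddvq_f32(prod);               // sum of the 4 lanes
}
#endif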
struct Conv2dFunctorBase {
Conv2dFunctorBase(const int *strides,
const Padding &padding_type,
......@@ -193,12 +42,12 @@ struct Conv2dFunctorBase {
const int *dilations,
const ActivationType activation,
const float relux_max_limit)
: strides_(strides),
padding_type_(padding_type),
paddings_(paddings),
dilations_(dilations),
activation_(activation),
relux_max_limit_(relux_max_limit) {}
: strides_(strides),
padding_type_(padding_type),
paddings_(paddings),
dilations_(dilations),
activation_(activation),
relux_max_limit_(relux_max_limit) {}
const int *strides_; // [stride_h, stride_w]
const Padding padding_type_;
......@@ -208,100 +57,11 @@ struct Conv2dFunctorBase {
const float relux_max_limit_;
};
#define MACE_DO_CONV2D(CC, CH, CW) \
Conv2dKernelFunc<T, inc_tile_size, CC, CH, CW>( \
input_ptr, filter_data, bias_data, output_ptr, \
h_offset, w_offset, c_offset, kernel_h, kernel_w, \
stride_h, stride_w, dilation_h, dilation_w, \
channels, input_channels, width, padded_width);
#define MACE_CASE_W_CONV2D(CC, CH) \
switch (w_count) { \
case 1: \
MACE_DO_CONV2D(CC, CH, 1); \
break; \
case 2: \
MACE_DO_CONV2D(CC, CH, 2); \
break; \
case 3: \
MACE_DO_CONV2D(CC, CH, 3); \
break; \
case 4: \
MACE_DO_CONV2D(CC, CH, 4); \
break; \
default: \
LOG(FATAL) << "Unsupported w tile: " << w_count; \
}
#define MACE_CASE_H_CONV2D(CC) \
switch (h_count) { \
case 1: \
MACE_CASE_W_CONV2D(CC, 1); \
break; \
case 2: \
MACE_CASE_W_CONV2D(CC, 2); \
break; \
default: \
LOG(FATAL) << "Unsupported h tile: " << h_count; \
}
#define MACE_CASE_C_CONV2D \
switch (c_count) { \
case 1: \
MACE_CASE_H_CONV2D(1); \
break; \
case 2: \
MACE_CASE_H_CONV2D(2); \
break; \
case 3: \
MACE_CASE_H_CONV2D(3); \
break; \
case 4: \
MACE_CASE_H_CONV2D(4); \
break; \
case 5: \
MACE_CASE_H_CONV2D(5); \
break; \
case 6: \
MACE_CASE_H_CONV2D(6); \
break; \
case 7: \
MACE_CASE_H_CONV2D(7); \
break; \
case 8: \
MACE_CASE_H_CONV2D(8); \
break; \
case 9: \
MACE_CASE_H_CONV2D(9); \
break; \
case 10: \
MACE_CASE_H_CONV2D(10); \
break; \
case 11: \
MACE_CASE_H_CONV2D(11); \
break; \
case 12: \
MACE_CASE_H_CONV2D(12); \
break; \
case 13: \
MACE_CASE_H_CONV2D(13); \
break; \
case 14: \
MACE_CASE_H_CONV2D(14); \
break; \
case 15: \
MACE_CASE_H_CONV2D(15); \
break; \
case 16: \
MACE_CASE_H_CONV2D(16); \
break; \
default: \
LOG(FATAL) << "Unsupported c tile: " << c_count; \
}
template <DeviceType D, typename T>
struct Conv2dFunctor : Conv2dFunctorBase {
template<DeviceType D, typename T>
struct Conv2dFunctor;
template<>
struct Conv2dFunctor<DeviceType::CPU, float> : Conv2dFunctorBase {
Conv2dFunctor(const int *strides,
const Padding &padding_type,
const std::vector<int> &paddings,
......@@ -310,15 +70,62 @@ struct Conv2dFunctor : Conv2dFunctorBase {
const float relux_max_limit,
const bool is_filter_transformed,
ScratchBuffer *scratch)
: Conv2dFunctorBase(strides,
padding_type,
paddings,
dilations,
activation,
relux_max_limit) {}
void operator()(const Tensor *input, // NHWC
const Tensor *filter, // HWOI or TOI
: Conv2dFunctorBase(strides,
padding_type,
paddings,
dilations,
activation,
relux_max_limit),
is_filter_transformed_(is_filter_transformed),
scratch_(scratch) {}
void Conv2dGeneral(const float *input,
const float *filter,
const index_t batch,
const index_t in_height,
const index_t in_width,
const index_t in_channels,
const index_t out_height,
const index_t out_width,
const index_t out_channels,
const int filter_height,
const int filter_width,
const int stride_h,
const int stride_w,
const int dilation_h,
const int dilation_w,
float *output) {
#pragma omp parallel for collapse(2)
for (index_t b = 0; b < batch; ++b) {
for (index_t m = 0; m < out_channels; ++m) {
for (index_t h = 0; h < out_height; ++h) {
for (index_t w = 0; w < out_width; ++w) {
index_t out_offset =
((b * out_channels + m) * out_height + h) * out_width + w;
for (index_t c = 0; c < in_channels; ++c) {
for (index_t kh = 0; kh < filter_height; ++kh) {
for (index_t kw = 0; kw < filter_width; ++kw) {
index_t ih = h * stride_h + kh * dilation_h;
index_t iw = w * stride_w + kw * dilation_w;
index_t in_offset =
((b * in_channels + c) * in_height + ih) * in_width + iw;
index_t filter_offset =
(((m * in_channels) + c) * filter_height + kh)
* filter_width
+ kw;
output[out_offset] +=
input[in_offset] * filter[filter_offset];
}
}
}
}
}
}
}
}
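// Illustration (not from the MACE sources): Conv2dGeneral above walks NCHW
// tensors through flattened offsets. The mapping it relies on, written as a
// standalone helper (the name is ours, for illustration only):
#include <cstdint>
inline int64_t NCHWOffset(int64_t b, int64_t c, int64_t h, int64_t w,
                          int64_t channels, int64_t height, int64_t width) {
  // ((b * C + c) * H + h) * W + w, the same formula used for in_offset,
  // filter_offset and out_offset in Conv2dGeneral.
  return ((b * channels + c) * height + h) * width + w;
}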
void operator()(const Tensor *input,
const Tensor *filter,
const Tensor *bias,
Tensor *output,
StatsFuture *future) {
......@@ -326,138 +133,374 @@ struct Conv2dFunctor : Conv2dFunctorBase {
MACE_CHECK_NOTNULL(filter);
MACE_CHECK_NOTNULL(output);
std::vector<index_t> filter_shape(4);
if (is_filter_transformed_) {
// TOC -> OIHW
filter_shape[0] = filter->dim(1);
filter_shape[1] = filter->dim(2);
filter_shape[2] = filter_shape[3] = 3;
} else {
filter_shape = filter->shape();
}
std::vector<index_t> output_shape(4);
std::vector<int> paddings(2);
if (paddings_.empty()) {
kernels::CalcNHWCPaddingAndOutputSize(
input->shape().data(), filter->shape().data(), dilations_, strides_,
padding_type_, output_shape.data(), paddings.data());
CalcNCHWPaddingAndOutputSize(input->shape().data(),
filter_shape.data(),
dilations_,
strides_,
padding_type_,
output_shape.data(),
paddings.data());
} else {
paddings = paddings_;
CalcOutputSize(input->shape().data(), filter->shape().data(),
paddings_.data(), dilations_, strides_, RoundType::FLOOR,
output_shape.data());
CalcNCHWOutputSize(input->shape().data(),
filter_shape.data(),
paddings_.data(),
dilations_,
strides_,
RoundType::FLOOR,
output_shape.data());
}
output->Resize(output_shape);
int batch = output->dim(0);
int height = output->dim(1);
int width = output->dim(2);
int channels = output->dim(3);
int input_batch = input->dim(0);
int input_height = input->dim(1);
int input_width = input->dim(2);
int input_channels = input->dim(3);
int kernel_h = filter->dim(0);
int kernel_w = filter->dim(1);
MACE_CHECK(filter->dim(2) == channels, filter->dim(2), " != ", channels);
MACE_CHECK(filter->dim(3) == input_channels, filter->dim(3), " != ",
output->Clear();
index_t batch = output->dim(0);
index_t channels = output->dim(1);
index_t height = output->dim(2);
index_t width = output->dim(3);
index_t input_batch = input->dim(0);
index_t input_channels = input->dim(1);
index_t input_height = input->dim(2);
index_t input_width = input->dim(3);
index_t filter_h = filter_shape[2];
index_t filter_w = filter_shape[3];
MACE_CHECK(filter_shape[0] == channels, filter_shape[0], " != ", channels);
MACE_CHECK(filter_shape[1] == input_channels, filter_shape[1], " != ",
input_channels);
int stride_h = strides_[0];
int stride_w = strides_[1];
index_t stride_h = strides_[0];
index_t stride_w = strides_[1];
int dilation_h = dilations_[0];
int dilation_w = dilations_[1];
index_t dilation_h = dilations_[0];
index_t dilation_w = dilations_[1];
MACE_CHECK(batch == input_batch, "Input/Output batch size mismatch");
int padded_height = input_height + paddings[0];
int padded_width = input_width + paddings[1];
index_t padded_input_height = input_height + paddings[0];
index_t padded_input_width = input_width + paddings[1];
index_t extra_input_height = padded_input_height;
index_t extra_input_width = padded_input_width;
index_t extra_output_height = height;
index_t extra_output_width = width;
int pad_top = paddings[0] >> 1;
int pad_bottom = paddings[0] - pad_top;
int pad_left = paddings[1] >> 1;
int pad_right = paddings[1] - pad_left;
Tensor::MappingGuard input_guard(input);
Tensor::MappingGuard filter_guard(filter);
Tensor::MappingGuard bias_guard(bias);
Tensor::MappingGuard output_guard(output);
auto input_data = input->data<float>();
auto filter_data = filter->data<float>();
auto bias_data = bias == nullptr ? nullptr : bias->data<float>();
auto output_data = output->mutable_data<float>();
std::function<void(const float *input, float *output)> conv_func;
bool
use_winograd = is_filter_transformed_ || (filter_h == 3 && filter_w == 3
&& stride_h == 1 && stride_w == 1 && dilation_h == 1 && dilation_w == 1
&& input_channels >= 8 && channels >= 8);
bool use_neon_3x3_s1 = filter_h == 3 && filter_w == 3
&& stride_h == 1 && stride_w == 1 && dilation_h == 1 && dilation_w == 1;
bool use_neon_3x3_s2 = filter_h == 3 && filter_w == 3
&& stride_h == 2 && stride_w == 2 && dilation_h == 1 && dilation_w == 1;
bool use_neon_1x1_s1 = filter_h == 1 && filter_w == 1
&& stride_h == 1 && stride_w == 1 && dilation_h == 1 && dilation_w == 1;
std::vector<index_t> transformed_input_shape;
std::vector<index_t> transformed_output_shape;
std::vector<index_t> transformed_filter_shape;
// When the input feature map is larger than 16x16, set the Winograd
// output tile size to 6 to get higher performance.
index_t winograd_out_tile_size = 2;
if (input_height > 16 && input_width > 16) {
winograd_out_tile_size = 6;
}
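// Illustration (not from the MACE sources): Winograd F(m x m, 3 x 3) consumes
// an (m + 2) x (m + 2) input tile per m x m output tile, which is where the
// (winograd_out_tile_size + 2) terms below come from:
constexpr int WinogradInTileSize(int out_tile_size) {
  return out_tile_size + 2;  // for r = 3 filters: input tile = m + r - 1
}
static_assert(WinogradInTileSize(2) == 4, "F(2x2,3x3) reads 4x4 input tiles");
static_assert(WinogradInTileSize(6) == 8, "F(6x6,3x3) reads 8x8 input tiles");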
if (use_winograd) {
extra_output_height = RoundUp<index_t>(height, winograd_out_tile_size);
extra_input_height =
std::max(padded_input_height, extra_output_height + 2);
extra_output_width = RoundUp<index_t>(width, winograd_out_tile_size);
extra_input_width = std::max(padded_input_width, extra_output_width + 2);
if (extra_input_height != padded_input_height) {
pad_bottom += (extra_input_height - padded_input_height);
}
if (extra_input_width != padded_input_width) {
pad_right += (extra_input_width - padded_input_width);
}
index_t tile_height_count = extra_output_height / winograd_out_tile_size;
index_t tile_width_count = extra_output_width / winograd_out_tile_size;
index_t tile_count = tile_height_count * tile_width_count;
index_t in_tile_area =
(winograd_out_tile_size + 2) * (winograd_out_tile_size + 2);
transformed_input_shape.insert(transformed_input_shape.end(),
{in_tile_area, batch, input_channels,
tile_count});
transformed_output_shape.insert(transformed_output_shape.end(),
{in_tile_area, batch, channels,
tile_count});
transformed_filter_shape.insert(transformed_filter_shape.end(),
{in_tile_area, channels, input_channels});
} else if (use_neon_3x3_s1) {
extra_output_height = RoundUp<index_t>(height, 2);
extra_input_height =
std::max(padded_input_height, extra_output_height + 2);
extra_output_width = RoundUp<index_t>(width, 4);
extra_input_width = std::max(padded_input_width, extra_output_width + 2);
if (extra_input_height != padded_input_height) {
pad_bottom += (extra_input_height - padded_input_height);
}
if (extra_input_width != padded_input_width) {
pad_right += (extra_input_width - padded_input_width);
}
} else if (use_neon_3x3_s2) {
extra_output_height = height;
extra_input_height =
std::max(padded_input_height, (extra_output_height - 1) * 2 + 3);
extra_output_width = RoundUp<index_t>(width, 4);
extra_input_width =
std::max(padded_input_width, (extra_output_width - 1) * 2 + 3);
if (extra_input_height != padded_input_height) {
pad_bottom += (extra_input_height - padded_input_height);
}
if (extra_input_width != padded_input_width) {
pad_right += (extra_input_width - padded_input_width);
}
}
Tensor padded_input;
// Keep this alive during kernel execution
if (paddings[0] > 0 || paddings[1] > 0) {
ConstructNHWCInputWithPadding(input, paddings.data(), &padded_input);
input = &padded_input;
// decide the scratch size before allocating it
index_t total_scratch_size = 0;
index_t transformed_input_size = 0;
index_t transformed_output_size = 0;
index_t padded_input_size = 0;
index_t padded_output_size = 0;
if (use_winograd) {
transformed_input_size =
std::accumulate(transformed_input_shape.begin(),
transformed_input_shape.end(),
1,
std::multiplies<index_t>()) * sizeof(float);
transformed_output_size =
std::accumulate(transformed_output_shape.begin(),
transformed_output_shape.end(),
1,
std::multiplies<index_t>()) * sizeof(float);
total_scratch_size += transformed_input_size + transformed_output_size;
}
if (extra_input_height != input_height
|| extra_input_width != input_width) {
padded_input_size =
batch * input_channels * (input_height + pad_top + pad_bottom)
* (input_width + pad_left + pad_right) * sizeof(float);
total_scratch_size += padded_input_size;
}
if (extra_output_height != height || extra_output_width != width) {
padded_output_size =
batch * channels * extra_output_height * extra_output_width
* sizeof(float);
total_scratch_size += padded_output_size;
}
// Init scratch buffer
scratch_->Rewind();
scratch_->GrowSize(total_scratch_size);
Tensor
transformed_input(scratch_->Scratch(transformed_input_size), DT_FLOAT);
Tensor
transformed_output(scratch_->Scratch(transformed_output_size), DT_FLOAT);
Tensor padded_input(scratch_->Scratch(padded_input_size), DT_FLOAT);
Tensor padded_output(scratch_->Scratch(padded_output_size), DT_FLOAT);
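// Illustration (not from the MACE sources): the Rewind/GrowSize/Scratch calls
// above follow a bump-allocator pattern, carving one reusable buffer into
// per-run slices. A minimal standalone sketch with a hypothetical class:
#include <cstddef>
#include <vector>
class SimpleScratch {
 public:
  void Rewind() { offset_ = 0; }  // reuse the buffer from the start
  void GrowSize(size_t total) {
    if (buffer_.size() < total) buffer_.resize(total);
  }
  void *Scratch(size_t size) {    // hand out the next slice
    void *ptr = buffer_.data() + offset_;
    offset_ += size;
    return ptr;
  }
 private:
  std::vector<char> buffer_;
  size_t offset_ = 0;
};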
// decide which convolution function to call
if (use_winograd) {
transformed_input.Resize(transformed_input_shape);
transformed_output.Resize(transformed_output_shape);
const float *transformed_filter_ptr;
if (transformed_filter_.dim_size() == 0) {
transformed_filter_.Resize(transformed_filter_shape);
if (is_filter_transformed_) {
transformed_filter_ptr = filter_data;
} else {
switch (winograd_out_tile_size) {
case 2:
TransformFilter4x4(filter_data,
filter_shape[1],
filter_shape[0],
transformed_filter_.mutable_data<float>());
break;
case 6:
TransformFilter8x8(filter_data,
filter_shape[1],
filter_shape[0],
transformed_filter_.mutable_data<float>());
break;
default:
  MACE_NOT_IMPLEMENTED;
}
transformed_filter_ptr = transformed_filter_.data<float>();
}
} else {
transformed_filter_ptr = transformed_filter_.data<float>();
}
conv_func = [&](const float *pad_input, float *pad_output) {
WinoGradConv3x3s1(pad_input,
transformed_filter_ptr,
batch,
extra_input_height,
extra_input_width,
input_channels,
channels,
winograd_out_tile_size,
transformed_input.mutable_data<float>(),
transformed_output.mutable_data<float>(),
pad_output);
};
} else if (use_neon_3x3_s1) {
conv_func = [=](const float *pad_input, float *pad_output) {
Conv2dNeonK3x3S1(pad_input,
filter_data,
batch,
extra_input_height,
extra_input_width,
input_channels,
extra_output_height,
extra_output_width,
channels,
pad_output);
};
} else if (use_neon_3x3_s2) {
conv_func = [=](const float *pad_input, float *pad_output) {
Conv2dNeonK3x3S2(pad_input,
filter_data,
batch,
extra_input_height,
extra_input_width,
input_channels,
extra_output_height,
extra_output_width,
channels,
pad_output);
};
} else if (use_neon_1x1_s1) {
conv_func = [=](const float *pad_input, float *pad_output) {
Conv2dNeonK1x1S1(pad_input,
filter_data,
batch,
extra_input_height,
extra_input_width,
input_channels,
channels,
pad_output);
};
} else {
conv_func = [=](const float *pad_input, float *pad_output) {
Conv2dGeneral(pad_input,
filter_data,
batch,
extra_input_height,
extra_input_width,
input_channels,
extra_output_height,
extra_output_width,
channels,
filter_h,
filter_w,
stride_h,
stride_w,
dilation_h,
dilation_w,
pad_output);
};
}
Tensor::MappingGuard input_mapper(input);
Tensor::MappingGuard filter_mapper(filter);
Tensor::MappingGuard bias_mapper(bias);
Tensor::MappingGuard output_mapper(output);
auto input_data = input->data<T>();
auto filter_data = filter->data<T>();
auto bias_data = bias == nullptr ? nullptr : bias->data<T>();
auto output_data = output->mutable_data<T>();
// pad input and output
const Tensor *pad_input_ptr = input;
if (extra_input_height != input_height
|| extra_input_width != input_width) {
padded_input.Clear();
ConstructNCHWInputWithSpecificPadding(input,
pad_top,
pad_bottom,
pad_left,
pad_right,
&padded_input);
pad_input_ptr = &padded_input;
}
constexpr int inc_tile_size = 4;
// TODO(heliangliang): Auto-tune these parameters
#if defined(MACE_ENABLE_NEON) && defined(__aarch64__)
const int c_tile_size = 4;
const int h_tile_size = 2;
const int w_tile_size = 2;
#else
const int c_tile_size = 4;
const int h_tile_size = 1;
const int w_tile_size = 2;
#endif
Tensor *pad_output_ptr = output;
if (extra_output_height != height || extra_output_width != width) {
padded_output.Resize({batch, channels, extra_output_height,
extra_output_width});
padded_output.Clear();
pad_output_ptr = &padded_output;
}
const float *pad_input_data = pad_input_ptr->data<float>();
float *pad_output_data = pad_output_ptr->mutable_data<float>();
conv_func(pad_input_data, pad_output_data);
// unpack output
if (extra_output_height != height || extra_output_width != width) {
#pragma omp parallel for collapse(2)
for (index_t b = 0; b < batch; ++b) {
for (index_t c = 0; c < channels; ++c) {
for (index_t h = 0; h < height; ++h) {
memcpy(
output_data + b * channels * height * width + c * height * width
+ h * width,
pad_output_data
+ b * channels * extra_output_height * extra_output_width
+ c * extra_output_height * extra_output_width
+ h * extra_output_width,
sizeof(float) * width);
}
}
}
}
const int c_tiles = RoundUpDiv(channels, c_tile_size);
const int h_tiles = RoundUpDiv(height, h_tile_size);
const int w_tiles = RoundUpDiv(width, w_tile_size);
#pragma omp parallel for collapse(4)
for (int n = 0; n < batch; ++n) {
for (int cb = 0; cb < c_tiles; ++cb) {
for (int hb = 0; hb < h_tiles; ++hb) {
for (int wb = 0; wb < w_tiles; ++wb) {
const T *input_ptr =
input_data + n * padded_height * padded_width * input_channels;
T *output_ptr = output_data + n * height * width * channels;
const int h_offset = hb * h_tile_size;
const int w_offset = wb * w_tile_size;
const int c_offset = cb * c_tile_size;
const int h_count = std::min(h_tile_size, height - h_offset);
const int w_count = std::min(w_tile_size, width - w_offset);
const int c_count = std::min(c_tile_size, channels - c_offset);
MACE_CASE_C_CONV2D;
if (bias_data != nullptr) {
#pragma omp parallel for collapse(2)
for (index_t b = 0; b < batch; ++b) {
for (index_t c = 0; c < channels; ++c) {
for (index_t i = 0; i < height * width; ++i) {
output_data[(b * channels + c) * height * width + i] +=
bias_data[c];
}
}
}
}
DoActivation(output_data, output_data, output->size(), activation_,
relux_max_limit_);
}
};
template <>
struct Conv2dFunctor<DeviceType::NEON, float> : Conv2dFunctorBase {
Conv2dFunctor(const int *strides,
const Padding &padding_type,
const std::vector<int> &paddings,
const int *dilations,
const ActivationType activation,
const float relux_max_limit,
const bool is_filter_transformed,
ScratchBuffer *scratch)
: Conv2dFunctorBase(strides,
padding_type,
paddings,
dilations,
activation,
relux_max_limit),
is_filter_transformed_(is_filter_transformed),
scratch_(scratch) {}
void operator()(const Tensor *input,
const Tensor *filter,
const Tensor *bias,
Tensor *output,
StatsFuture *future);
Tensor transformed_filter_;
bool is_filter_transformed_;
ScratchBuffer *scratch_;
};
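// Usage sketch (illustration only; the tensor setup and the enum/variable
// names here are assumptions, not taken from this commit):
//   int strides[2] = {1, 1};
//   int dilations[2] = {1, 1};
//   Conv2dFunctor<DeviceType::CPU, float> conv2d(
//       strides, Padding::SAME, {}, dilations, ActivationType::RELU,
//       0.f, /*is_filter_transformed=*/false, &scratch);
//   conv2d(input, filter, bias, output, &future);  // NCHW float tensors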
template <typename T>
template<typename T>
struct Conv2dFunctor<DeviceType::OPENCL, T> : Conv2dFunctorBase {
Conv2dFunctor(const int *strides,
const Padding &padding_type,
......@@ -467,12 +510,12 @@ struct Conv2dFunctor<DeviceType::OPENCL, T> : Conv2dFunctorBase {
const float relux_max_limit,
const bool is_filter_transformed,
ScratchBuffer *scratch)
: Conv2dFunctorBase(strides,
padding_type,
paddings,
dilations,
activation,
relux_max_limit) {}
: Conv2dFunctorBase(strides,
padding_type,
paddings,
dilations,
activation,
relux_max_limit) {}
void operator()(const Tensor *input,
const Tensor *filter,
......
......@@ -368,7 +368,6 @@ void ConstructNCHWInputWithSpecificPadding(const Tensor *input_tensor,
const int pad_left,
const int pad_right,
Tensor *output_tensor) {
Tensor::MappingGuard input_mapper(input_tensor);
const float *input = input_tensor->data<float>();
const index_t *input_shape = input_tensor->shape().data();
......
......@@ -25,15 +25,15 @@
namespace mace {
namespace kernels {
template <DeviceType D, typename T>
template<DeviceType D, typename T>
struct DepthToSpaceOpFunctor {
explicit DepthToSpaceOpFunctor(const int block_size, bool d2s)
: block_size_(block_size), d2s_(d2s) {}
: block_size_(block_size), d2s_(d2s) {}
void operator()(const Tensor *input, Tensor *output, StatsFuture *future) {
const int batch_size = input->dim(0);
const int input_height = input->dim(1);
const int input_width = input->dim(2);
const int input_depth = input->dim(3);
const int input_depth = input->dim(1);
const int input_height = input->dim(2);
const int input_width = input->dim(3);
index_t output_depth, output_width, output_height;
......@@ -46,8 +46,8 @@ struct DepthToSpaceOpFunctor {
output_width = input_width / block_size_;
output_height = input_height / block_size_;
}
std::vector<index_t> output_shape = {batch_size, output_height,
output_width, output_depth};
std::vector<index_t> output_shape = {batch_size, output_depth,
output_height, output_width};
output->Resize(output_shape);
......@@ -59,23 +59,22 @@ struct DepthToSpaceOpFunctor {
if (d2s_) {
#pragma omp parallel for
for (int b = 0; b < batch_size; ++b) {
for (int h = 0; h < output_height; ++h) {
const int in_h = h / block_size_;
const int offset_h = (h % block_size_);
for (int w = 0; w < output_width; ++w) {
const int in_w = w / block_size_;
const int offset_w = w % block_size_;
const int offset_d =
for (int d = 0; d < output_depth; ++d) {
for (int h = 0; h < output_height; ++h) {
const int in_h = h / block_size_;
const int offset_h = (h % block_size_);
for (int w = 0; w < output_width; ++w) {
const index_t in_w = w / block_size_;
const index_t offset_w = w % block_size_;
const index_t offset_d =
(offset_h * block_size_ + offset_w) * output_depth;
for (int d = 0; d < output_depth; ++d) {
const int in_d = d + offset_d;
const int o_index =
((b * output_height + h) * output_width + w) * output_depth +
d;
const int i_index =
((b * input_height + in_h) * input_width + in_w) *
input_depth +
in_d;
const index_t in_d = d + offset_d;
const index_t o_index =
((b * output_depth + d) * output_height + h) * output_width + w;
const index_t i_index =
((b * input_depth + in_d) * input_height + in_h) * input_width
+ in_w;
output_ptr[o_index] = input_ptr[i_index];
}
}
......@@ -84,22 +83,23 @@ struct DepthToSpaceOpFunctor {
} else {
#pragma omp parallel for
for (int b = 0; b < batch_size; ++b) {
for (int h = 0; h < input_height; ++h) {
const int out_h = h / block_size_;
const int offset_h = (h % block_size_);
for (int w = 0; w < input_width; ++w) {
const int out_w = w / block_size_;
const int offset_w = (w % block_size_);
const int offset_d =
for (int d = 0; d < input_depth; ++d) {
for (int h = 0; h < input_height; ++h) {
const int out_h = h / block_size_;
const int offset_h = (h % block_size_);
for (int w = 0; w < input_width; ++w) {
const int out_w = w / block_size_;
const int offset_w = (w % block_size_);
const int offset_d =
(offset_h * block_size_ + offset_w) * input_depth;
for (int d = 0; d < input_depth; ++d) {
const int out_d = d + offset_d;
const int o_index =
((b * output_height + out_h) * output_width + out_w) *
output_depth +
out_d;
const int i_index =
((b * input_height + h) * input_width + w) * input_depth + d;
const index_t o_index =
((b * output_depth + out_d) * output_height + out_h)
* output_width + out_w;
const index_t i_index =
((b * input_depth + d) * input_height + h) * input_width
+ w;
output_ptr[o_index] = input_ptr[i_index];
}
}
......@@ -112,10 +112,10 @@ struct DepthToSpaceOpFunctor {
bool d2s_;
};
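// Worked example (illustration only): with NCHW layout and block_size = 2,
// the depth_to_space branch above reads output position (d, h, w) from input
// channel d + ((h % 2) * 2 + (w % 2)) * output_depth. For a 1x4x1x1 input
// producing a 1x1x2x2 output:
constexpr int D2SInputChannel(int d, int h, int w, int block, int out_depth) {
  return d + ((h % block) * block + (w % block)) * out_depth;
}
static_assert(D2SInputChannel(0, 0, 0, 2, 1) == 0, "out (0,0) <- channel 0");
static_assert(D2SInputChannel(0, 0, 1, 2, 1) == 1, "out (0,1) <- channel 1");
static_assert(D2SInputChannel(0, 1, 0, 2, 1) == 2, "out (1,0) <- channel 2");
static_assert(D2SInputChannel(0, 1, 1, 2, 1) == 3, "out (1,1) <- channel 3");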
template <typename T>
template<typename T>
struct DepthToSpaceOpFunctor<DeviceType::OPENCL, T> {
DepthToSpaceOpFunctor(const int block_size, bool d2s)
: block_size_(block_size), d2s_(d2s) {}
: block_size_(block_size), d2s_(d2s) {}
void operator()(const Tensor *input, Tensor *output, StatsFuture *future);
const int block_size_;
......
......@@ -26,225 +26,12 @@
#include "mace/core/runtime/opencl/cl2_header.h"
#include "mace/kernels/conv_pool_2d_util.h"
#include "mace/kernels/activation.h"
#include "mace/kernels/arm/depthwise_conv2d_neon.h"
#include "mace/public/mace.h"
namespace mace {
namespace kernels {
template <typename T>
void DepthwiseConv2dKernel(const T *input_ptr,
const T *filter_ptr,
const T *bias_ptr,
T *output_ptr,
int batch,
int height,
int width,
int channels,
int input_height,
int input_width,
int input_channels,
int multiplier,
int padded_h_start,
int padded_h_stop,
int padded_w_start,
int padded_w_stop,
int kernel_h,
int kernel_w,
int stride_h,
int stride_w,
int dilation_h,
int dilation_w,
int h_start,
int h_stop,
int w_start,
int w_stop) {
#pragma omp parallel for collapse(4)
for (int n = 0; n < batch; ++n) {
for (int h = h_start; h < h_stop; ++h) {
for (int w = w_start; w < w_stop; ++w) {
for (int c = 0; c < channels; ++c) {
const index_t inc = c / multiplier;
const index_t m = c % multiplier;
T bias_channel = bias_ptr ? bias_ptr[c] : 0;
index_t offset = n * height * width * channels +
h * width * channels + w * channels + c;
output_ptr[offset] = bias_channel;
T sum = 0;
const T *filter_base = filter_ptr + inc * multiplier + m;
for (int kh = 0; kh < kernel_h; ++kh) {
for (int kw = 0; kw < kernel_w; ++kw) {
int inh = padded_h_start + h * stride_h + dilation_h * kh;
int inw = padded_w_start + w * stride_w + dilation_w * kw;
if (inh < 0 || inh >= input_height || inw < 0 ||
inw >= input_width) {
MACE_CHECK(inh >= padded_h_start && inh < padded_h_stop &&
inw >= padded_w_start && inw < padded_w_stop,
"Out of range read from input: ", padded_h_start,
" <= ", inh, " < ", padded_h_stop, ", ",
padded_w_start, " <= ", inw, " < ", padded_w_stop);
} else {
index_t input_offset =
n * input_height * input_width * input_channels +
inh * input_width * input_channels + inw * input_channels +
inc;
sum += input_ptr[input_offset] * filter_base[0]; // HWIM
}
filter_base += input_channels * multiplier;
}
}
output_ptr[offset] += sum;
}
}
}
}
}
template <typename T>
void DepthwiseConv2dNoOOBCheckKernel(const T *input_ptr,
const T *filter_ptr,
const T *bias_ptr,
T *output_ptr,
int batch,
int height,
int width,
int channels,
int input_height,
int input_width,
int input_channels,
int multiplier,
int padded_h_start,
int padded_h_stop,
int padded_w_start,
int padded_w_stop,
int kernel_h,
int kernel_w,
int stride_h,
int stride_w,
int dilation_h,
int dilation_w,
int h_start,
int h_stop,
int w_start,
int w_stop) {
if (multiplier == 1) {
constexpr int c_tile_size = 4;
#pragma omp parallel for collapse(3)
for (int n = 0; n < batch; ++n) {
for (int h = h_start; h < h_stop; ++h) {
for (int w = w_start; w < w_stop; ++w) {
int c;
for (c = 0; c + c_tile_size <= channels; c += c_tile_size) {
#if defined(MACE_ENABLE_NEON) && defined(__aarch64__)
static_assert(c_tile_size == 4, "channels tile size must be 4");
float32x4_t sum = vdupq_n_f32(0);
if (bias_ptr != nullptr) {
sum = vld1q_f32(bias_ptr + c);
}
#else
T sum[c_tile_size] = {0};
if (bias_ptr != nullptr) {
for (int ci = 0; ci < c_tile_size; ++ci) {
sum[ci] = bias_ptr[c + ci];
}
}
#endif
const T *filter_base = filter_ptr + c;
for (int kh = 0; kh < kernel_h; ++kh) {
for (int kw = 0; kw < kernel_w; ++kw) {
int inh = padded_h_start + h * stride_h + dilation_h * kh;
int inw = padded_w_start + w * stride_w + dilation_w * kw;
MACE_ASSERT(inh >= 0 && inh < input_height && inw >= 0 &&
inw < input_width);
index_t input_offset =
n * input_height * input_width * input_channels +
inh * input_width * input_channels + inw * input_channels +
c;
#if defined(MACE_ENABLE_NEON) && defined(__aarch64__)
float32x4_t in = vld1q_f32(input_ptr + input_offset);
float32x4_t weights = vld1q_f32(filter_base);
sum = vfmaq_f32(sum, in, weights);
#else
for (int ci = 0; ci < c_tile_size; ++ci) {
sum[ci] +=
input_ptr[input_offset + ci] * filter_base[ci]; // HWIM
}
#endif
filter_base += input_channels;
}
}
index_t offset = n * height * width * channels +
h * width * channels + w * channels + c;
#if defined(MACE_ENABLE_NEON) && defined(__aarch64__)
vst1q_f32(output_ptr + offset, sum);
#else
for (int ci = 0; ci < c_tile_size; ++ci) {
output_ptr[offset + ci] = sum[ci];
}
#endif
}
for (; c < channels; ++c) {
T bias_channel = bias_ptr ? bias_ptr[c] : 0;
index_t offset = n * height * width * channels +
h * width * channels + w * channels + c;
output_ptr[offset] = bias_channel;
T sum = 0;
const T *filter_base = filter_ptr + c;
for (int kh = 0; kh < kernel_h; ++kh) {
for (int kw = 0; kw < kernel_w; ++kw) {
int inh = padded_h_start + h * stride_h + dilation_h * kh;
int inw = padded_w_start + w * stride_w + dilation_w * kw;
MACE_ASSERT(inh >= 0 && inh < input_height && inw >= 0 &&
inw < input_width);
index_t input_offset =
n * input_height * input_width * input_channels +
inh * input_width * input_channels + inw * input_channels +
c;
sum += input_ptr[input_offset] * filter_base[0]; // HWIM
filter_base += input_channels * multiplier;
}
}
output_ptr[offset] += sum;
}
}
}
}
} else {
#pragma omp parallel for collapse(4)
for (int n = 0; n < batch; ++n) {
for (int h = h_start; h < h_stop; ++h) {
for (int w = w_start; w < w_stop; ++w) {
for (int c = 0; c < channels; ++c) {
const index_t inc = c / multiplier;
const index_t m = c % multiplier;
T bias_channel = bias_ptr ? bias_ptr[c] : 0;
index_t offset = n * height * width * channels +
h * width * channels + w * channels + c;
output_ptr[offset] = bias_channel;
T sum = 0;
const T *filter_base = filter_ptr + inc * multiplier + m;
for (int kh = 0; kh < kernel_h; ++kh) {
for (int kw = 0; kw < kernel_w; ++kw) {
int inh = padded_h_start + h * stride_h + dilation_h * kh;
int inw = padded_w_start + w * stride_w + dilation_w * kw;
MACE_ASSERT(inh >= 0 && inh < input_height && inw >= 0 &&
inw < input_width);
index_t input_offset =
n * input_height * input_width * input_channels +
inh * input_width * input_channels + inw * input_channels +
inc;
sum += input_ptr[input_offset] * filter_base[0]; // HWIM
filter_base += input_channels * multiplier;
}
}
output_ptr[offset] += sum;
}
}
}
}
}
}
struct DepthwiseConv2dFunctorBase {
DepthwiseConv2dFunctorBase(const int *strides,
const Padding padding_type,
......@@ -252,12 +39,12 @@ struct DepthwiseConv2dFunctorBase {
const int *dilations,
const ActivationType activation,
const float relux_max_limit)
: strides_(strides),
padding_type_(padding_type),
paddings_(paddings),
dilations_(dilations),
activation_(activation),
relux_max_limit_(relux_max_limit) {}
: strides_(strides),
padding_type_(padding_type),
paddings_(paddings),
dilations_(dilations),
activation_(activation),
relux_max_limit_(relux_max_limit) {}
const int *strides_; // [stride_h, stride_w]
const Padding padding_type_;
......@@ -267,159 +54,246 @@ struct DepthwiseConv2dFunctorBase {
const float relux_max_limit_;
};
template <DeviceType D, typename T>
struct DepthwiseConv2dFunctor : public DepthwiseConv2dFunctorBase {
template<DeviceType D, typename T>
struct DepthwiseConv2dFunctor;
template<>
struct DepthwiseConv2dFunctor<DeviceType::CPU, float>
: public DepthwiseConv2dFunctorBase {
DepthwiseConv2dFunctor(const int *strides,
const Padding padding_type,
const std::vector<int> &paddings,
const int *dilations,
const ActivationType activation,
const float relux_max_limit)
: DepthwiseConv2dFunctorBase(strides,
padding_type,
paddings,
dilations,
activation,
relux_max_limit) {}
void operator()(const Tensor *input, // NHWC
const Tensor *filter, // HWIM
const Tensor *bias, // O
: DepthwiseConv2dFunctorBase(strides,
padding_type,
paddings,
dilations,
activation,
relux_max_limit) {}
void DepthwiseConv2dGeneral(const float *input,
const float *filter,
const index_t batch,
const index_t in_height,
const index_t in_width,
const index_t in_channels,
const index_t out_height,
const index_t out_width,
const index_t out_channels,
const int filter_height,
const int filter_width,
const int stride_h,
const int stride_w,
const int dilation_h,
const int dilation_w,
const int pad_top,
const int pad_left,
float *output) {
const index_t multiplier = out_channels / in_channels;
#pragma omp parallel for collapse(2)
for (index_t b = 0; b < batch; ++b) {
for (index_t m = 0; m < out_channels; ++m) {
for (index_t h = 0; h < out_height; ++h) {
for (index_t w = 0; w < out_width; ++w) {
index_t out_offset =
((b * out_channels + m) * out_height + h) * out_width + w;
index_t c = m / multiplier;
index_t o = m % multiplier;
float sum = 0;
for (index_t kh = 0; kh < filter_height; ++kh) {
for (index_t kw = 0; kw < filter_width; ++kw) {
index_t ih = h * stride_h + kh * dilation_h - pad_top;
index_t iw = w * stride_w + kw * dilation_w - pad_left;
if (ih >= 0 && ih < in_height && iw >= 0 && iw < in_width) {
index_t in_offset =
((b * in_channels + c) * in_height + ih) * in_width + iw;
index_t filter_offset =
(((o * in_channels) + c) * filter_height + kh)
* filter_width
+ kw;
sum += input[in_offset] * filter[filter_offset];
}
}
}
output[out_offset] = sum;
}
}
}
}
}
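// Illustration (not from the MACE sources): the channel mapping used above.
// Each output channel m reads input channel m / multiplier and filter slice
// m % multiplier, with multiplier = out_channels / in_channels:
constexpr int DwInputChannel(int m, int multiplier) { return m / multiplier; }
constexpr int DwFilterSlice(int m, int multiplier) { return m % multiplier; }
static_assert(DwInputChannel(5, 2) == 2 && DwFilterSlice(5, 2) == 1,
              "with multiplier 2, output channel 5 reads input channel 2");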
void operator()(const Tensor *input,
const Tensor *filter,
const Tensor *bias,
Tensor *output,
StatsFuture *future) {
MACE_CHECK_NOTNULL(input);
MACE_CHECK_NOTNULL(filter);
MACE_CHECK_NOTNULL(output);
// Create a fake conv_2d filter to calculate the paddings and output size
std::vector<index_t> fake_filter_shape(4);
fake_filter_shape[0] = filter->shape()[0];
fake_filter_shape[1] = filter->shape()[1];
fake_filter_shape[2] = filter->shape()[2] * filter->shape()[3];
fake_filter_shape[3] = 1;
std::vector<index_t> output_shape(4);
std::vector<int> paddings(2);
std::vector<index_t> filter_shape
{filter->dim(0) * filter->dim(1), filter->dim(1), filter->dim(2),
filter->dim(3)};
if (paddings_.empty()) {
kernels::CalcNHWCPaddingAndOutputSize(
input->shape().data(), fake_filter_shape.data(), dilations_, strides_,
padding_type_, output_shape.data(), paddings.data());
CalcNCHWPaddingAndOutputSize(input->shape().data(),
filter_shape.data(),
dilations_,
strides_,
padding_type_,
output_shape.data(),
paddings.data());
} else {
paddings = paddings_;
CalcOutputSize(input->shape().data(), fake_filter_shape.data(),
paddings_.data(), dilations_, strides_, RoundType::FLOOR,
output_shape.data());
CalcNCHWOutputSize(input->shape().data(),
filter_shape.data(),
paddings_.data(),
dilations_,
strides_,
RoundType::FLOOR,
output_shape.data());
}
auto input_shape = fake_filter_shape;
output->Resize(output_shape);
output->Clear();
index_t batch = output->dim(0);
index_t height = output->dim(1);
index_t width = output->dim(2);
index_t channels = output->dim(3);
index_t channels = output->dim(1);
index_t height = output->dim(2);
index_t width = output->dim(3);
index_t input_batch = input->dim(0);
index_t input_height = input->dim(1);
index_t input_width = input->dim(2);
index_t input_channels = input->dim(3);
index_t kernel_h = filter->dim(0);
index_t kernel_w = filter->dim(1);
index_t multiplier = filter->dim(3);
MACE_CHECK(filter->dim(2) == input_channels, filter->dim(2), "!=",
index_t input_channels = input->dim(1);
index_t input_height = input->dim(2);
index_t input_width = input->dim(3);
index_t filter_h = filter_shape[2];
index_t filter_w = filter_shape[3];
MACE_CHECK(filter_shape[0] == channels, filter_shape[0], " != ", channels);
MACE_CHECK(filter_shape[1] == input_channels, filter_shape[1], " != ",
input_channels);
MACE_CHECK(channels == input_channels * multiplier);
int stride_h = strides_[0];
int stride_w = strides_[1];
index_t stride_h = strides_[0];
index_t stride_w = strides_[1];
int dilation_h = dilations_[0];
int dilation_w = dilations_[1];
index_t dilation_h = dilations_[0];
index_t dilation_w = dilations_[1];
MACE_CHECK(batch == input_batch, "Input/Output batch size mismatch");
// The top-left offset of the padded input
int paddings_top = paddings[0] / 2;
int paddings_bottom = paddings[0] - paddings_top;
int paddings_left = paddings[1] / 2;
int paddings_right = paddings[1] - paddings_left;
int padded_h_start = 0 - paddings_top;
int padded_w_start = 0 - paddings_left;
index_t padded_h_stop = input_height + paddings_bottom;
index_t padded_w_stop = input_width + paddings_right;
Tensor::MappingGuard input_mapper(input);
Tensor::MappingGuard filter_mapper(filter);
Tensor::MappingGuard bias_mapper(bias);
Tensor::MappingGuard output_mapper(output);
const T *input_ptr = input->data<T>();
const T *filter_ptr = filter->data<T>();
const T *bias_ptr = bias == nullptr ? nullptr : bias->data<T>();
T *output_ptr = output->mutable_data<T>();
int valid_h_start =
paddings_top == 0 ? 0 : (paddings_top - 1) / stride_h + 1;
int valid_h_stop = paddings_bottom == 0
int pad_top = paddings[0] >> 1;
int pad_bottom = paddings[0] - pad_top;
int pad_left = paddings[1] >> 1;
int pad_right = paddings[1] - pad_left;
index_t valid_h_start = pad_top == 0 ? 0 : (pad_top - 1) / stride_h + 1;
index_t valid_h_stop = pad_bottom == 0
? height
: height - ((paddings_bottom - 1) / stride_h + 1);
int valid_w_start =
paddings_left == 0 ? 0 : (paddings_left - 1) / stride_w + 1;
int valid_w_stop = paddings_right == 0
: height - ((pad_bottom - 1) / stride_h + 1);
index_t valid_w_start = pad_left == 0 ? 0 : (pad_left - 1) / stride_w + 1;
index_t valid_w_stop = pad_right == 0
? width
: width - ((paddings_right - 1) / stride_w + 1);
// Calculate border elements with out-of-bounds checking
if (valid_h_start > 0) {
DepthwiseConv2dKernel<T>(
input_ptr, filter_ptr, bias_ptr, output_ptr, batch, height, width,
channels, input_height, input_width, input_channels, multiplier,
padded_h_start, padded_h_stop, padded_w_start, padded_w_stop,
kernel_h, kernel_w, stride_h, stride_w, dilation_h, dilation_w, 0,
valid_h_start, 0, width);
}
if (valid_h_stop < height) {
DepthwiseConv2dKernel<T>(
input_ptr, filter_ptr, bias_ptr, output_ptr, batch, height, width,
channels, input_height, input_width, input_channels, multiplier,
padded_h_start, padded_h_stop, padded_w_start, padded_w_stop,
kernel_h, kernel_w, stride_h, stride_w, dilation_h, dilation_w,
std::max(valid_h_start, valid_h_stop), height, 0, width);
}
if (valid_w_start > 0) {
DepthwiseConv2dKernel<T>(
input_ptr, filter_ptr, bias_ptr, output_ptr, batch, height, width,
channels, input_height, input_width, input_channels, multiplier,
padded_h_start, padded_h_stop, padded_w_start, padded_w_stop,
kernel_h, kernel_w, stride_h, stride_w, dilation_h, dilation_w,
valid_h_start, valid_h_stop, 0, valid_w_start);
}
if (valid_w_stop < width) {
DepthwiseConv2dKernel<T>(
input_ptr, filter_ptr, bias_ptr, output_ptr, batch, height, width,
channels, input_height, input_width, input_channels, multiplier,
padded_h_start, padded_h_stop, padded_w_start, padded_w_stop,
kernel_h, kernel_w, stride_h, stride_w, dilation_h, dilation_w,
valid_h_start, valid_h_stop, std::max(valid_w_start, valid_w_stop),
width);
: width - ((pad_right - 1) / stride_w + 1);
std::function<void(const float *input, float *output)> conv_func;
Tensor::MappingGuard input_guard(input);
Tensor::MappingGuard filter_guard(filter);
Tensor::MappingGuard bias_guard(bias);
Tensor::MappingGuard output_guard(output);
auto input_data = input->data<float>();
auto filter_data = filter->data<float>();
auto bias_data = bias == nullptr ? nullptr : bias->data<float>();
auto output_data = output->mutable_data<float>();
if (filter_h == 3 && filter_w == 3 && stride_h == 1 && stride_w == 1
&& dilation_h == 1 && dilation_w == 1) {
conv_func = [=](const float *input, float *output) {
DepthwiseConv2dNeonK3x3S1(input,
filter_data,
batch,
input_height,
input_width,
input_channels,
height,
width,
channels,
pad_top,
pad_left,
valid_h_start,
valid_h_stop,
valid_w_start,
valid_w_stop,
output);
};
} else if (filter_h == 3 && filter_w == 3 && stride_h == 2 && stride_w == 2
&& dilation_h == 1 && dilation_w == 1) {
conv_func = [=](const float *input, float *output) {
DepthwiseConv2dNeonK3x3S2(input,
filter_data,
batch,
input_height,
input_width,
input_channels,
height,
width,
channels,
pad_top,
pad_left,
valid_h_start,
valid_h_stop,
valid_w_start,
valid_w_stop,
output);
};
} else {
conv_func = [=](const float *input, float *output) {
DepthwiseConv2dGeneral(input,
filter_data,
batch,
input_height,
input_width,
input_channels,
height,
width,
channels,
filter_h,
filter_w,
stride_h,
stride_w,
dilation_h,
dilation_w,
pad_top,
pad_left,
output);
};
}
// Calculate border elements without out-of-bounds checking
DepthwiseConv2dNoOOBCheckKernel<T>(
input_ptr, filter_ptr, bias_ptr, output_ptr, batch, height, width,
channels, input_height, input_width, input_channels, multiplier,
padded_h_start, padded_h_stop, padded_w_start, padded_w_stop, kernel_h,
kernel_w, stride_h, stride_w, dilation_h, dilation_w, valid_h_start,
valid_h_stop, valid_w_start, valid_w_stop);
conv_func(input_data, output_data);
output_ptr = output->mutable_data<T>();
DoActivation(output_ptr, output_ptr, output->size(), activation_,
if (bias_data != nullptr) {
#pragma omp parallel for collapse(2)
for (index_t b = 0; b < batch; ++b) {
for (index_t c = 0; c < channels; ++c) {
for (index_t i = 0; i < height * width; ++i) {
output_data[(b * channels + c) * height * width + i] +=
bias_data[c];
}
}
}
}
DoActivation(output_data, output_data, output->size(), activation_,
relux_max_limit_);
}
};
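// Illustration (not from the MACE sources): valid_h_start/valid_w_start above
// mark the first output row/column whose receptive field lies entirely inside
// the unpadded input, letting the NEON kernels skip bounds checks in between:
constexpr int ValidStart(int pad, int stride) {
  return pad == 0 ? 0 : (pad - 1) / stride + 1;
}
static_assert(ValidStart(0, 1) == 0, "no padding: every output row is valid");
static_assert(ValidStart(1, 1) == 1, "pad 1, stride 1: skip output row 0");
static_assert(ValidStart(2, 2) == 1, "pad 2, stride 2: skip output row 0");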
template <>
struct DepthwiseConv2dFunctor<DeviceType::NEON, float>
template<typename T>
struct DepthwiseConv2dFunctor<DeviceType::OPENCL, T>
: DepthwiseConv2dFunctorBase {
DepthwiseConv2dFunctor(const int *strides,
const Padding padding_type,
......@@ -439,29 +313,6 @@ struct DepthwiseConv2dFunctor<DeviceType::NEON, float>
const Tensor *bias,
Tensor *output,
StatsFuture *future);
};
template <typename T>
struct DepthwiseConv2dFunctor<DeviceType::OPENCL, T>
: DepthwiseConv2dFunctorBase {
DepthwiseConv2dFunctor(const int *strides,
const Padding padding_type,
const std::vector<int> &paddings,
const int *dilations,
const ActivationType activation,
const float relux_max_limit)
: DepthwiseConv2dFunctorBase(strides,
padding_type,
paddings,
dilations,
activation,
relux_max_limit) {}
void operator()(const Tensor *input,
const Tensor *filter,
const Tensor *bias,
Tensor *output,
StatsFuture *future);
cl::Kernel kernel_;
uint32_t kwg_size_;
......
......@@ -23,6 +23,7 @@
#include "mace/core/tensor.h"
#include "mace/kernels/activation.h"
#include "mace/kernels/opencl/helper.h"
#include "mace/kernels/gemm.h"
namespace mace {
namespace kernels {
......@@ -41,7 +42,10 @@ struct FullyConnectedBase {
};
template <DeviceType D, typename T>
struct FullyConnectedFunctor : FullyConnectedBase {
struct FullyConnectedFunctor;
template <>
struct FullyConnectedFunctor<DeviceType::CPU, float>: FullyConnectedBase {
FullyConnectedFunctor(const BufferType weight_type,
const ActivationType activation,
const float relux_max_limit)
......@@ -52,33 +56,25 @@ struct FullyConnectedFunctor : FullyConnectedBase {
const Tensor *bias,
Tensor *output,
StatsFuture *future) {
std::vector<index_t> output_shape = {input->dim(0), 1, 1, weight->dim(0)};
std::vector<index_t> output_shape = {input->dim(0), weight->dim(0), 1, 1};
output->Resize(output_shape);
const index_t N = output->dim(0);
const index_t input_size = weight->dim(1);
const index_t output_size = weight->dim(0);
Tensor::MappingGuard guard_input(input);
Tensor::MappingGuard guard_weight(weight);
Tensor::MappingGuard guard_bias(bias);
Tensor::MappingGuard guard_output(output);
const T *input_ptr = input->data<T>();
const T *weight_ptr = weight->data<T>();
const T *bias_ptr = bias == nullptr ? nullptr : bias->data<T>();
T *output_ptr = output->mutable_data<T>();
const float *input_ptr = input->data<float>();
const float *weight_ptr = weight->data<float>();
const float *bias_ptr = bias == nullptr ? nullptr : bias->data<float>();
float *output_ptr = output->mutable_data<float>();
#pragma omp parallel for collapse(2)
Gemv(weight_ptr, input_ptr, N, input_size, output_size, output_ptr);
for (int i = 0; i < N; ++i) {
for (int out_idx = 0; out_idx < output_size; ++out_idx) {
T sum = 0;
if (bias_ptr != nullptr) sum = bias_ptr[out_idx];
index_t input_offset = i * input_size;
index_t weight_offset = out_idx * input_size;
for (int in_idx = 0; in_idx < input_size; ++in_idx) {
sum += input_ptr[input_offset] * weight_ptr[weight_offset];
input_offset++;
weight_offset++;
}
output_ptr[i * output_size + out_idx] = sum;
if (bias_ptr != nullptr) {
  for (int j = 0; j < output_size; ++j) {
    output_ptr[j + i * output_size] += bias_ptr[j];
  }
}
}
......@@ -87,20 +83,6 @@ struct FullyConnectedFunctor : FullyConnectedBase {
}
};
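// Reference sketch (illustration only; the name is ours): the fully connected
// layer above is a matrix-vector product y = W * x per batch element, with
// the bias added afterwards. A naive scalar equivalent of one Gemv call:
#include <cstdint>
inline void NaiveGemv(const float *w, const float *x,
                      int64_t out_size, int64_t in_size, float *y) {
  for (int64_t o = 0; o < out_size; ++o) {
    float sum = 0.f;
    for (int64_t i = 0; i < in_size; ++i) {
      sum += w[o * in_size + i] * x[i];  // row-major weight layout
    }
    y[o] = sum;                          // bias is added by the caller
  }
}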
template <>
struct FullyConnectedFunctor<DeviceType::NEON, float> : FullyConnectedBase {
FullyConnectedFunctor(const BufferType weight_type,
const ActivationType activation,
const float relux_max_limit)
: FullyConnectedBase(weight_type, activation, relux_max_limit) {}
void operator()(const Tensor *input,
const Tensor *weight,
const Tensor *bias,
Tensor *output,
StatsFuture *future);
};
template <typename T>
struct FullyConnectedFunctor<DeviceType::OPENCL, T> : FullyConnectedBase {
FullyConnectedFunctor(const BufferType weight_type,
......
......@@ -747,7 +747,7 @@ void Gemv(const float *m_ptr,
for (index_t h = 0; h < remain_h; ++h) {
float32x4_t vsum0 = vdupq_n_f32(0.f);
const float *m_ptr0 = m_ptr + (h + remain_start_height) * width;
const float *v_ptr0 = v_ptr;
const float *v_ptr0 = v_ptr + b * width;
for (index_t w = 0; w < width_d4; ++w) {
float32x4_t vm = vld1q_f32(m_ptr0);
float32x4_t vv = vld1q_f32(v_ptr0);
......@@ -761,7 +761,7 @@ void Gemv(const float *m_ptr,
m_ptr0++;
v_ptr0++;
}
out_ptr[remain_start_height + h] = sum;
out_ptr[remain_start_height + h + b * height] = sum;
}
}
#else
......
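// Illustration (not from the MACE sources): the two-line Gemv fix above makes
// the leftover-row path batch-aware; each batch element b now addresses its
// own input and output slices instead of always reading batch 0:
#include <cstdint>
constexpr int64_t BatchVecOffset(int64_t b, int64_t width) { return b * width; }
constexpr int64_t BatchOutOffset(int64_t b, int64_t height) { return b * height; }
static_assert(BatchVecOffset(2, 128) == 256, "batch 2 reads its own slice");
static_assert(BatchOutOffset(2, 64) == 128, "batch 2 writes its own slice");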
......@@ -17,8 +17,11 @@
namespace mace {
namespace kernels {
template <DeviceType D, typename T>
struct LocalResponseNormFunctor {
template<DeviceType D, typename T>
struct LocalResponseNormFunctor;
template<>
struct LocalResponseNormFunctor<DeviceType::CPU, float> {
void operator()(const Tensor *input,
int depth_radius,
float bias,
......@@ -27,48 +30,39 @@ struct LocalResponseNormFunctor {
Tensor *output,
StatsFuture *future) {
const index_t batch = input->dim(0);
const index_t height = input->dim(1);
const index_t width = input->dim(2);
const index_t channels = input->dim(3);
Tensor::MappingGuard input_mapper(input);
Tensor::MappingGuard output_mapper(output);
const index_t channels = input->dim(1);
const index_t height = input->dim(2);
const index_t width = input->dim(3);
const T *input_ptr = input->data<T>();
T *output_ptr = output->mutable_data<T>();
const float *input_ptr = input->data<float>();
float *output_ptr = output->mutable_data<float>();
const int elements = batch * height * width;
index_t image_size = height * width;
index_t batch_size = channels * image_size;
#pragma omp parallel for collapse(2)
for (index_t i = 0; i < elements; ++i) {
for (index_t b = 0; b < batch; ++b) {
for (index_t c = 0; c < channels; ++c) {
const int begin_input_c = std::max(static_cast<index_t>(0),
c - depth_radius);
const int end_input_c = std::min(channels, c + depth_radius + 1);
index_t pos = i * channels;
float accum = 0.f;
for (int input_c = begin_input_c; input_c < end_input_c; ++input_c) {
const float input_val = input_ptr[pos + input_c];
accum += input_val * input_val;
index_t pos = b * batch_size;
for (index_t hw = 0; hw < height * width; ++hw, ++pos) {
float accum = 0.f;
for (int input_c = begin_input_c; input_c < end_input_c; ++input_c) {
const float input_val = input_ptr[pos + input_c * image_size];
accum += input_val * input_val;
}
const float multiplier = std::pow(bias + alpha * accum, -beta);
output_ptr[pos + c * image_size] =
input_ptr[pos + c * image_size] * multiplier;
}
const float multiplier = std::pow(bias + alpha * accum, -beta);
output_ptr[pos + c] = input_ptr[pos + c] * multiplier;
}
}
}
};
template <>
struct LocalResponseNormFunctor<DeviceType::NEON, float> {
void operator()(const Tensor *input,
int depth_radius,
float bias,
float alpha,
float beta,
Tensor *output,
StatsFuture *future);
};
} // namespace kernels
} // namespace mace
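// Formula sketch (illustration only; the helper name is ours): for each
// (b, h, w) position, the loop above computes, over a channel window of
// radius depth_radius,
//   out[c] = in[c] * (bias + alpha * sum over window of in[k]^2)^(-beta)
// A scalar reference for one pixel across `channels` values:
#include <algorithm>
#include <cmath>
inline void LRN1D(const float *in, int channels, int radius,
                  float bias, float alpha, float beta, float *out) {
  for (int c = 0; c < channels; ++c) {
    const int begin = std::max(0, c - radius);
    const int end = std::min(channels, c + radius + 1);
    float accum = 0.f;
    for (int k = begin; k < end; ++k) accum += in[k] * in[k];
    out[c] = in[c] * std::pow(bias + alpha * accum, -beta);
  }
}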
......
......@@ -57,7 +57,10 @@ struct PoolingFunctorBase {
};
template <DeviceType D, typename T>
struct PoolingFunctor : PoolingFunctorBase {
struct PoolingFunctor;
template <>
struct PoolingFunctor<DeviceType::CPU, float>: PoolingFunctorBase {
PoolingFunctor(const PoolingType pooling_type,
const int *kernels,
const int *strides,
......@@ -68,43 +71,141 @@ struct PoolingFunctor : PoolingFunctorBase {
pooling_type, kernels, strides, padding_type, paddings, dilations) {
}
void MaxPooling(const float *input,
const index_t batch,
const index_t in_height,
const index_t in_width,
const index_t channels,
const index_t out_height,
const index_t out_width,
const int filter_height,
const int filter_width,
const int stride_h,
const int stride_w,
const int dilation_h,
const int dilation_w,
const int pad_top,
const int pad_left,
float *output) {
const index_t in_image_size = in_height * in_width;
const index_t out_image_size = out_height * out_width;
const index_t in_batch_size = channels * in_image_size;
const index_t out_batch_size = channels * out_image_size;
#pragma omp parallel for collapse(2)
for (index_t b = 0; b < batch; ++b) {
for (index_t c = 0; c < channels; ++c) {
const index_t out_base = b * out_batch_size + c * out_image_size;
const index_t in_base = b * in_batch_size + c * in_image_size;
for (index_t h = 0; h < out_height; ++h) {
for (index_t w = 0; w < out_width; ++w) {
const index_t out_offset = out_base + h * out_width + w;
float res = std::numeric_limits<float>::lowest();
for (int fh = 0; fh < filter_height; ++fh) {
for (int fw = 0; fw < filter_width; ++fw) {
int inh = h * stride_h + dilation_h * fh - pad_top;
int inw = w * stride_w + dilation_w * fw - pad_left;
if (inh >= 0 && inh < in_height && inw >= 0 && inw < in_width) {
index_t input_offset = in_base + inh * in_width + inw;
res = std::max(res, input[input_offset]);
}
}
}
output[out_offset] = res;
}
}
}
}
}
void AvgPooling(const float *input,
const index_t batch,
const index_t in_height,
const index_t in_width,
const index_t channels,
const index_t out_height,
const index_t out_width,
const int filter_height,
const int filter_width,
const int stride_h,
const int stride_w,
const int dilation_h,
const int dilation_w,
const int pad_top,
const int pad_left,
float *output) {
const index_t in_image_size = in_height * in_width;
const index_t out_image_size = out_height * out_width;
const index_t in_batch_size = channels * in_image_size;
const index_t out_batch_size = channels * out_image_size;
#pragma omp parallel for collapse(2)
for (index_t b = 0; b < batch; ++b) {
for (index_t c = 0; c < channels; ++c) {
const index_t out_base = b * out_batch_size + c * out_image_size;
const index_t in_base = b * in_batch_size + c * in_image_size;
for (index_t h = 0; h < out_height; ++h) {
for (index_t w = 0; w < out_width; ++w) {
const index_t out_offset = out_base + h * out_width + w;
float res = 0;
int block_size = 0;
for (int fh = 0; fh < filter_height; ++fh) {
for (int fw = 0; fw < filter_width; ++fw) {
int inh = h * stride_h + dilation_h * fh - pad_top;
int inw = w * stride_w + dilation_w * fw - pad_left;
if (inh >= 0 && inh < in_height && inw >= 0 && inw < in_width) {
index_t input_offset = in_base + inh * in_width + inw;
res += input[input_offset];
++block_size;
}
}
}
output[out_offset] = res / block_size;
}
}
}
}
}
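// Illustration (not from the MACE sources): AvgPooling above divides by
// block_size, the number of in-bounds taps, so border windows average over
// fewer elements rather than treating padding as zeros ("count_include_pad
// = false" in other frameworks' terms):
constexpr int InBoundsTaps(int start, int filter, int size) {
  // taps of the window [start, start + filter) that fall inside [0, size)
  return (start + filter < size ? start + filter : size)
       - (start > 0 ? start : 0);
}
static_assert(InBoundsTaps(-1, 2, 4) == 1, "corner window averages 1 tap");
static_assert(InBoundsTaps(1, 2, 4) == 2, "interior window averages 2 taps");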
void operator()(const Tensor *input_tensor,
Tensor *output_tensor,
StatsFuture *future) {
std::vector<index_t> output_shape(4);
std::vector<index_t> filter_shape = {
kernels_[0], kernels_[1], input_tensor->dim(3), input_tensor->dim(3)};
input_tensor->dim(1), input_tensor->dim(1), kernels_[0], kernels_[1]};
std::vector<int> paddings(2);
if (paddings_.empty()) {
kernels::CalcNHWCPaddingAndOutputSize(
input_tensor->shape().data(), filter_shape.data(), dilations_,
strides_, padding_type_, output_shape.data(), paddings.data());
kernels::CalcNCHWPaddingAndOutputSize(
input_tensor->shape().data(), filter_shape.data(), dilations_,
strides_, padding_type_, output_shape.data(), paddings.data());
} else {
paddings = paddings_;
CalcOutputSize(input_tensor->shape().data(), filter_shape.data(),
paddings_.data(), dilations_, strides_, RoundType::CEIL,
output_shape.data());
CalcNCHWOutputSize(input_tensor->shape().data(),
filter_shape.data(),
paddings_.data(),
dilations_,
strides_,
RoundType::CEIL,
output_shape.data());
}
output_tensor->Resize(output_shape);
Tensor::MappingGuard in_guard(input_tensor);
Tensor::MappingGuard out_guard(output_tensor);
const T *input = input_tensor->data<T>();
T *output = output_tensor->mutable_data<T>();
Tensor::MappingGuard input_guard(input_tensor);
Tensor::MappingGuard output_guard(output_tensor);
const float *input = input_tensor->data<float>();
float *output = output_tensor->mutable_data<float>();
const index_t *input_shape = input_tensor->shape().data();
index_t batch = output_shape[0];
index_t height = output_shape[1];
index_t width = output_shape[2];
index_t channels = output_shape[3];
index_t channels = output_shape[1];
index_t height = output_shape[2];
index_t width = output_shape[3];
index_t input_height = input_shape[1];
index_t input_width = input_shape[2];
index_t input_channels = input_shape[3];
index_t in_image_size = input_height * input_width;
index_t input_height = input_shape[2];
index_t input_width = input_shape[3];
int kernel_h = kernels_[0];
int kernel_w = kernels_[1];
int filter_h = kernels_[0];
int filter_w = kernels_[1];
int stride_h = strides_[0];
int stride_w = strides_[1];
......@@ -112,84 +213,47 @@ struct PoolingFunctor : PoolingFunctorBase {
int dilation_h = dilations_[0];
int dilation_w = dilations_[1];
// The top-left offset of the padded input
int padded_h_start = 0 - paddings[0] / 2;
int padded_w_start = 0 - paddings[1] / 2;
if (pooling_type_ == MAX) {
#pragma omp parallel for collapse(4)
for (int b = 0; b < batch; ++b) {
for (int h = 0; h < height; ++h) {
for (int w = 0; w < width; ++w) {
for (int c = 0; c < channels; ++c) {
index_t out_offset =
(((b * height) + h) * width + w) * channels + c;
index_t in_offset = b * in_image_size * input_channels + c;
T res = std::numeric_limits<T>::lowest();
for (int kh = 0; kh < kernel_h; ++kh) {
for (int kw = 0; kw < kernel_w; ++kw) {
int inh = padded_h_start + h * stride_h + dilation_h * kh;
int inw = padded_w_start + w * stride_w + dilation_w * kw;
if (inh >= 0 && inh < input_height && inw >= 0 &&
inw < input_width) {
index_t input_offset =
in_offset + (inh * input_width + inw) * input_channels;
res = std::max(res, input[input_offset]);
}
}
}
output[out_offset] = res;
}
}
}
}
} else if (pooling_type_ == AVG) {
#pragma omp parallel for collapse(4)
for (int b = 0; b < batch; ++b) {
for (int h = 0; h < height; ++h) {
for (int w = 0; w < width; ++w) {
for (int c = 0; c < channels; ++c) {
index_t out_offset =
(((b * height) + h) * width + w) * channels + c;
index_t in_offset = b * in_image_size * input_channels + c;
T sum = static_cast<T>(0);
int block_size = 0;
for (int kh = 0; kh < kernel_h; ++kh) {
for (int kw = 0; kw < kernel_w; ++kw) {
int inh = padded_h_start + h * stride_h + dilation_h * kh;
int inw = padded_w_start + w * stride_w + dilation_w * kw;
if (inh >= 0 && inh < input_height && inw >= 0 &&
inw < input_width) {
index_t input_offset =
in_offset + (inh * input_width + inw) * input_channels;
sum += input[input_offset];
block_size += 1;
}
}
}
output[out_offset] = sum / block_size;
}
}
}
}
}
}
};
int pad_top = paddings[0] / 2;
int pad_left = paddings[1] / 2;
template <>
struct PoolingFunctor<DeviceType::NEON, float> : PoolingFunctorBase {
PoolingFunctor(const PoolingType pooling_type,
const int *kernels,
const int *strides,
const Padding padding_type,
const std::vector<int> &paddings,
const int *dilations)
: PoolingFunctorBase(
pooling_type, kernels, strides, padding_type, paddings, dilations) {
if (pooling_type_ == PoolingType::MAX) {
MaxPooling(input,
batch,
input_height,
input_width,
channels,
height,
width,
filter_h,
filter_w,
stride_h,
stride_w,
dilation_h,
dilation_w,
pad_top,
pad_left,
output);
} else if (pooling_type_ == PoolingType::AVG) {
AvgPooling(input,
batch,
input_height,
input_width,
channels,
height,
width,
filter_h,
filter_w,
stride_h,
stride_w,
dilation_h,
dilation_w,
pad_top,
pad_left,
output);
} else {
MACE_NOT_IMPLEMENTED;
}
}
void operator()(const Tensor *input_tensor,
Tensor *output_tensor,
StatsFuture *future);
};
template <typename T>
......
......@@ -38,15 +38,15 @@ inline float CalculateResizeScale(index_t in_size,
index_t out_size,
bool align_corners) {
return (align_corners && out_size > 1)
? (in_size - 1) / static_cast<float>(out_size - 1)
: in_size / static_cast<float>(out_size);
? (in_size - 1) / static_cast<float>(out_size - 1)
: in_size / static_cast<float>(out_size);
}
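// Illustration (not from the MACE sources): with align_corners the first and
// last input pixels map exactly onto the first and last output pixels, which
// is why the scale uses (in - 1) / (out - 1):
constexpr float ResizeScale(int in, int out, bool align) {
  return (align && out > 1) ? (in - 1) / static_cast<float>(out - 1)
                            : in / static_cast<float>(out);
}
static_assert(ResizeScale(4, 2, true) == 3.0f,
              "in=4, out=2, align_corners: scale (4-1)/(2-1) = 3");
static_assert(ResizeScale(4, 2, false) == 2.0f,
              "in=4, out=2: scale 4/2 = 2");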
inline void ComputeInterpolationWeights(
const index_t out_size,
const index_t in_size,
const float scale,
CachedInterpolation *interpolation) {
const index_t out_size,
const index_t in_size,
const float scale,
CachedInterpolation *interpolation) {
interpolation[out_size].lower = 0;
interpolation[out_size].upper = 0;
for (index_t i = out_size - 1; i >= 0; --i) {
......@@ -68,8 +68,7 @@ inline float ComputeLerp(const float top_left,
return top + (bottom - top) * y_lerp;
}
template <typename T>
void ResizeImage(const T *images,
inline void ResizeImage(const float *images,
const index_t batch_size,
const index_t in_height,
const index_t in_width,
......@@ -78,39 +77,32 @@ void ResizeImage(const T *images,
const index_t channels,
const std::vector<CachedInterpolation> &xs_vec,
const std::vector<CachedInterpolation> &ys,
T *output) {
const index_t in_batch_num_values = channels * in_height * in_width;
const index_t out_batch_num_values = channels * out_height * out_width;
float *output) {
const CachedInterpolation *xs = xs_vec.data();
#pragma omp parallel for collapse(2)
for (index_t b = 0; b < batch_size; ++b) {
for (index_t y = 0; y < out_height; ++y) {
const T *batch_input_ptr = images + in_batch_num_values * b;
T *batch_output_ptr = output + out_batch_num_values * b;
const T *y_lower_input_ptr =
batch_input_ptr + ys[y].lower * in_width * channels;
const T *y_upper_input_ptr =
batch_input_ptr + ys[y].upper * in_width * channels;
T *y_output_ptr = batch_output_ptr + y * out_width * channels;
const float ys_lerp = ys[y].lerp;
for (index_t x = 0; x < out_width; ++x) {
const float xs_lerp = xs[x].lerp;
const T *top_left_ptr = y_lower_input_ptr + xs[x].lower * channels;
const T *top_right_ptr = y_lower_input_ptr + xs[x].upper * channels;
const T *bottom_left_ptr = y_upper_input_ptr + xs[x].lower * channels;
const T *bottom_right_ptr = y_upper_input_ptr + xs[x].upper * channels;
T *output_ptr = y_output_ptr + x * channels;
for (index_t c = 0; c < channels; ++c) {
const T top_left = top_left_ptr[c];
const T top_right = top_right_ptr[c];
const T bottom_left = bottom_left_ptr[c];
const T bottom_right = bottom_right_ptr[c];
output_ptr[c] = ComputeLerp(top_left, top_right, bottom_left,
bottom_right, xs_lerp, ys_lerp);
for (index_t c = 0; c < channels; ++c) {
const float
*channel_input_ptr = images + (b * channels + c) * in_height * in_width;
float *channel_output_ptr =
output + (b * channels + c) * out_height * out_width;
for (index_t y = 0; y < out_height; ++y) {
const float *y_lower_input_ptr =
channel_input_ptr + ys[y].lower * in_width;
const float *y_upper_input_ptr =
channel_input_ptr + ys[y].upper * in_width;
const float ys_lerp = ys[y].lerp;
for (index_t x = 0; x < out_width; ++x) {
const float xs_lerp = xs[x].lerp;
const float top_left = y_lower_input_ptr[xs[x].lower];
const float top_right = y_lower_input_ptr[xs[x].upper];
const float bottom_left = y_upper_input_ptr[xs[x].lower];
const float bottom_right = y_upper_input_ptr[xs[x].upper];
channel_output_ptr[y * out_width + x] =
ComputeLerp(top_left, top_right, bottom_left,
bottom_right, xs_lerp, ys_lerp);
}
}
}
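// [Editor's sketch, not part of this diff] The xs/ys tables consumed above
// are filled by ComputeInterpolationWeights, whose loop body is elided; a
// simplified upward-iterating equivalent (assumes <algorithm> is available):
inline void ComputeInterpolationWeightsSketch(const index_t out_size,
                                              const index_t in_size,
                                              const float scale,
                                              CachedInterpolation *interp) {
  for (index_t i = 0; i < out_size; ++i) {
    const float in = i * scale;
    interp[i].lower = static_cast<index_t>(in);
    interp[i].upper = std::min<index_t>(interp[i].lower + 1, in_size - 1);
    interp[i].lerp = in - interp[i].lower;
  }
}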
......@@ -120,7 +112,7 @@ void ResizeImage(const T *images,
struct ResizeBilinearFunctorBase {
ResizeBilinearFunctorBase(const std::vector<index_t> &size,
bool align_corners)
      : align_corners_(align_corners) {
MACE_CHECK(size.size() == 2);
out_height_ = size[0];
out_width_ = size[1];
......@@ -132,38 +124,43 @@ struct ResizeBilinearFunctorBase {
index_t out_width_;
};
template <DeviceType D, typename T>
struct ResizeBilinearFunctor : ResizeBilinearFunctorBase {
template<DeviceType D, typename T>
struct ResizeBilinearFunctor;
template<>
struct ResizeBilinearFunctor<DeviceType::CPU, float>
: ResizeBilinearFunctorBase {
ResizeBilinearFunctor(const std::vector<index_t> &size, bool align_corners)
      : ResizeBilinearFunctorBase(size, align_corners) {}
void operator()(const Tensor *input, Tensor *output, StatsFuture *future) {
const index_t batch = input->dim(0);
const index_t in_height = input->dim(1);
const index_t in_width = input->dim(2);
const index_t channels = input->dim(3);
const index_t channels = input->dim(1);
const index_t in_height = input->dim(2);
const index_t in_width = input->dim(3);
index_t out_height = out_height_;
index_t out_width = out_width_;
MACE_CHECK(out_height > 0 && out_width > 0);
std::vector<index_t> out_shape{batch, out_height, out_width, channels};
std::vector<index_t> out_shape{batch, channels, out_height, out_width};
output->Resize(out_shape);
Tensor::MappingGuard input_mapper(input);
Tensor::MappingGuard output_mapper(output);
const T *input_data = input->data<T>();
T *output_data = output->mutable_data<T>();
const float *input_data = input->data<float>();
float *output_data = output->mutable_data<float>();
if (out_height == in_height && out_width == in_width) {
std::copy(input_data, input_data + channels * in_height * in_width,
std::copy(input_data,
input_data + batch * channels * in_height * in_width,
output_data);
return;
}
float height_scale =
        CalculateResizeScale(in_height, out_height, align_corners_);
float width_scale =
        CalculateResizeScale(in_width, out_width, align_corners_);
std::vector<CachedInterpolation> ys(out_height + 1);
std::vector<CachedInterpolation> xs(out_width + 1);
......@@ -177,11 +174,11 @@ struct ResizeBilinearFunctor : ResizeBilinearFunctorBase {
}
};
template<typename T>
struct ResizeBilinearFunctor<DeviceType::OPENCL, T>
    : ResizeBilinearFunctorBase {
ResizeBilinearFunctor(const std::vector<index_t> &size, bool align_corners)
      : ResizeBilinearFunctorBase(size, align_corners) {}
void operator()(const Tensor *input, Tensor *output, StatsFuture *future);
......
......@@ -19,6 +19,7 @@
#include <functional>
#include <memory>
#include <vector>
#include <limits>
#include "mace/core/future.h"
#include "mace/core/runtime/opencl/cl2_header.h"
......@@ -29,50 +30,66 @@
namespace mace {
namespace kernels {
template <DeviceType D, typename T>
struct SoftmaxFunctor {
void operator()(const Tensor *logits, Tensor *output, StatsFuture *future) {
Tensor::MappingGuard logits_guard(logits);
template<DeviceType D, typename T>
struct SoftmaxFunctor;
template<>
struct SoftmaxFunctor<DeviceType::CPU, float> {
void operator()(const Tensor *input, Tensor *output, StatsFuture *future) {
const index_t batch = input->dim(0);
const index_t class_count = input->dim(1);
const index_t class_size = input->dim(2) * input->dim(3);
Tensor::MappingGuard input_guard(input);
Tensor::MappingGuard output_guard(output);
const T *logits_ptr = logits->data<T>();
T *output_ptr = output->mutable_data<T>();
auto &logits_shape = logits->shape();
const index_t batch_size =
std::accumulate(logits_shape.begin(), logits_shape.end() - 1, 1,
std::multiplies<index_t>());
const index_t num_classes = logits_shape.back();
#pragma omp parallel
{
// Allocate per thread buffer
std::vector<T> exp_data(num_classes);
#pragma omp for
for (index_t i = 0; i < batch_size; ++i) {
const index_t pos = i * num_classes;
T max_value = logits_ptr[pos];
for (index_t c = 1; c < num_classes; ++c) {
max_value = std::max(max_value, logits_ptr[pos + c]);
const float *input_data = input->data<float>();
float *output_data = output->mutable_data<float>();
for (index_t b = 0; b < batch; ++b) {
std::vector<float>
max_val(class_size, std::numeric_limits<float>::lowest());
std::vector<float> sum_val(class_size, 0.f);
      // calculate the max over classes for each spatial position
for (index_t c = 0; c < class_count; ++c) {
const float
*input_ptr = input_data + (b * class_count + c) * class_size;
for (index_t k = 0; k < class_size; ++k) {
max_val[k] = std::max(max_val[k], input_ptr[k]);
}
}
      // calculate exp(data - max) for each class
#pragma omp parallel for
for (index_t c = 0; c < class_count; ++c) {
const float
*input_ptr = input_data + (b * class_count + c) * class_size;
float *output_ptr = output_data + (b * class_count + c) * class_size;
for (index_t k = 0; k < class_size; ++k) {
output_ptr[k] = ::exp(input_ptr[k] - max_val[k]);
}
// TODO(liuqi): check overflow?
T sum = 0;
for (index_t c = 0; c < num_classes; ++c) {
exp_data[c] = ::exp((logits_ptr[pos + c] - max_value));
sum += exp_data[c];
}
      // accumulate the sum of exponentials over classes for each position
for (index_t c = 0; c < class_count; ++c) {
float *output_ptr = output_data + (b * class_count + c) * class_size;
for (index_t k = 0; k < class_size; ++k) {
sum_val[k] += output_ptr[k];
}
for (index_t c = 0; c < num_classes; ++c) {
output_ptr[pos + c] = exp_data[c] / sum;
}
      // normalize: divide each exponential by the per-position sum
for (index_t c = 0; c < class_count; ++c) {
float *output_ptr = output_data + (b * class_count + c) * class_size;
for (index_t k = 0; k < class_size; ++k) {
output_ptr[k] /= sum_val[k];
}
}
}
}
};
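// [Editor's sketch, not part of this diff] Why the max is subtracted before
// exp(): large logits overflow float. A single-vector version of the same
// numerically stable softmax (illustrative only; assumes <algorithm> and
// <cmath> are available):
inline void StableSoftmaxSketch(const float *in, float *out, index_t n) {
  float max_val = in[0];
  for (index_t i = 1; i < n; ++i) max_val = std::max(max_val, in[i]);
  float sum = 0.f;
  for (index_t i = 0; i < n; ++i) {
    out[i] = std::exp(in[i] - max_val);  // shifted exponent is <= 0
    sum += out[i];
  }
  for (index_t i = 0; i < n; ++i) out[i] /= sum;
}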
template <>
struct SoftmaxFunctor<DeviceType::NEON, float> {
void operator()(const Tensor *logits, Tensor *output, StatsFuture *future);
};
template <typename T>
template<typename T>
struct SoftmaxFunctor<DeviceType::OPENCL, T> {
void operator()(const Tensor *logits, Tensor *output, StatsFuture *future);
......
......@@ -35,11 +35,6 @@ void Register_Activation(OperatorRegistry *op_registry) {
.TypeConstraint<half>("T")
.Build(),
ActivationOp<DeviceType::OPENCL, half>);
REGISTER_OPERATOR(op_registry, OpKeyBuilder("Activation")
.Device(DeviceType::NEON)
.TypeConstraint<float>("T")
.Build(),
ActivationOp<DeviceType::NEON, float>);
}
} // namespace ops
......
......@@ -31,9 +31,21 @@ void ReluBenchmark(
OpsTestNet net;
// Add input data
net.AddRandomInput<D, float>("Input", {batch, height, width, channels});
if (D == DeviceType::CPU) {
net.AddRandomInput<D, float>("Input", {batch, channels, height, width});
} else if (D == DeviceType::OPENCL) {
net.AddRandomInput<D, float>("Input", {batch, height, width, channels});
} else {
MACE_NOT_IMPLEMENTED;
}
if (D == DeviceType::OPENCL) {
if (D == DeviceType::CPU) {
OpDefBuilder("Activation", "ReluBM")
.Input("Input")
.Output("Output")
.AddStringArg("activation", "RELU")
.Finalize(net.NewOperatorDef());
} else if (D == DeviceType::OPENCL) {
BufferToImage<D, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
......@@ -43,11 +55,7 @@ void ReluBenchmark(
.AddStringArg("activation", "RELU")
.Finalize(net.NewOperatorDef());
} else {
OpDefBuilder("Activation", "ReluBM")
.Input("Input")
.Output("Output")
.AddStringArg("activation", "RELU")
.Finalize(net.NewOperatorDef());
MACE_NOT_IMPLEMENTED;
}
// Warm-up
......@@ -93,7 +101,11 @@ void ReluxBenchmark(
OpsTestNet net;
// Add input data
net.AddRandomInput<D, float>("Input", {batch, height, width, channels});
if (D == DeviceType::CPU) {
net.AddRandomInput<D, float>("Input", {batch, channels, height, width});
} else {
net.AddRandomInput<D, float>("Input", {batch, height, width, channels});
}
if (D == DeviceType::OPENCL) {
BufferToImage<D, float>(&net, "Input", "InputImage",
......@@ -157,10 +169,23 @@ void PreluBenchmark(
OpsTestNet net;
// Add input data
net.AddRandomInput<D, float>("Input", {batch, height, width, channels});
if (D == DeviceType::CPU) {
net.AddRandomInput<D, float>("Input", {batch, channels, height, width});
} else if (D == DeviceType::OPENCL) {
net.AddRandomInput<D, float>("Input", {batch, height, width, channels});
} else {
MACE_NOT_IMPLEMENTED;
}
net.AddRandomInput<D, float>("Alpha", {channels});
if (D == DeviceType::OPENCL) {
if (D == DeviceType::CPU) {
OpDefBuilder("Activation", "PreluBM")
.Input("Input")
.Input("Alpha")
.Output("Output")
.AddStringArg("activation", "PRELU")
.Finalize(net.NewOperatorDef());
} else if (D == DeviceType::OPENCL) {
BufferToImage<D, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, float>(&net, "Alpha", "AlphaImage",
......@@ -173,12 +198,7 @@ void PreluBenchmark(
.AddStringArg("activation", "PRELU")
.Finalize(net.NewOperatorDef());
} else {
OpDefBuilder("Activation", "PreluBM")
.Input("Input")
.Input("Alpha")
.Output("Output")
.AddStringArg("activation", "PRELU")
.Finalize(net.NewOperatorDef());
MACE_NOT_IMPLEMENTED;
}
// Warm-up
......@@ -224,7 +244,11 @@ void TanhBenchmark(
OpsTestNet net;
// Add input data
net.AddRandomInput<D, float>("Input", {batch, height, width, channels});
if (D == DeviceType::CPU) {
net.AddRandomInput<D, float>("Input", {batch, channels, height, width});
} else {
net.AddRandomInput<D, float>("Input", {batch, height, width, channels});
}
if (D == DeviceType::OPENCL) {
BufferToImage<D, float>(&net, "Input", "InputImage",
......@@ -286,7 +310,11 @@ void SigmoidBenchmark(
OpsTestNet net;
// Add input data
net.AddRandomInput<D, float>("Input", {batch, height, width, channels});
if (D == DeviceType::CPU) {
net.AddRandomInput<D, float>("Input", {batch, channels, height, width});
} else {
net.AddRandomInput<D, float>("Input", {batch, height, width, channels});
}
if (D == DeviceType::OPENCL) {
BufferToImage<D, float>(&net, "Input", "InputImage",
......
......@@ -269,16 +269,11 @@ void TestSimplePrelu() {
net.RunOp(D);
}
if (D == DeviceType::NEON) {
if (D == DeviceType::CPU) {
auto expected = CreateTensor<float>(
{2, 2, 2, 2},
{-14, 7, -12, 6, -15, -15, -12, -12, -6, 3, -4, 2, -3, -3, 0, 0});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
} else {
auto expected = CreateTensor<float>(
{2, 2, 2, 2},
{-14, 7, -12, 6, -10, -15, -8, -12, -6, 3, -4, 2, -2, -3, 0, 0});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
}
}
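// [Editor's sketch, not part of this diff] The expected tensors above follow
// the per-channel PReLU rule (alpha indexed by dim 1 in NCHW, dim 3 in NHWC);
// a scalar reference for one element:
inline float PreluRefSketch(float x, float alpha) {
  return x >= 0.f ? x : alpha * x;
}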
} // namespace
......@@ -287,10 +282,6 @@ TEST_F(ActivationOpTest, CPUSimplePrelu) {
TestSimplePrelu<DeviceType::CPU>();
}
TEST_F(ActivationOpTest, NEONSimplePrelu) {
TestSimplePrelu<DeviceType::NEON>();
}
TEST_F(ActivationOpTest, OPENCLSimplePrelu) {
TestSimplePrelu<DeviceType::OPENCL>();
}
......
......@@ -35,12 +35,6 @@ void Register_AddN(OperatorRegistry *op_registry) {
.TypeConstraint<half>("T")
.Build(),
AddNOp<DeviceType::OPENCL, half>);
REGISTER_OPERATOR(op_registry, OpKeyBuilder("AddN")
.Device(DeviceType::NEON)
.TypeConstraint<float>("T")
.Build(),
AddNOp<DeviceType::NEON, float>);
}
} // namespace ops
......
......@@ -35,11 +35,6 @@ void Register_BatchNorm(OperatorRegistry *op_registry) {
.TypeConstraint<half>("T")
.Build(),
BatchNormOp<DeviceType::OPENCL, half>);
REGISTER_OPERATOR(op_registry, OpKeyBuilder("BatchNorm")
.Device(DeviceType::NEON)
.TypeConstraint<float>("T")
.Build(),
BatchNormOp<DeviceType::NEON, float>);
}
} // namespace ops
......
......@@ -30,13 +30,29 @@ void BatchNorm(
OpsTestNet net;
// Add input data
net.AddRandomInput<D, T>("Input", {batch, height, width, channels});
if (D == DeviceType::CPU) {
net.AddRandomInput<D, T>("Input", {batch, channels, height, width});
} else if (D == DeviceType::OPENCL) {
net.AddRandomInput<D, T>("Input", {batch, height, width, channels});
} else {
MACE_NOT_IMPLEMENTED;
}
net.AddRandomInput<D, T>("Scale", {channels});
net.AddRandomInput<D, T>("Offset", {channels});
net.AddRandomInput<D, T>("Mean", {channels});
net.AddRandomInput<D, T>("Var", {channels}, true);
if (D == DeviceType::OPENCL) {
if (D == DeviceType::CPU) {
OpDefBuilder("BatchNorm", "BatchNormBM")
.Input("Input")
.Input("Scale")
.Input("Offset")
.Input("Mean")
.Input("Var")
.AddFloatArg("epsilon", 1e-3)
.Output("Output")
.Finalize(net.NewOperatorDef());
} else if (D == DeviceType::OPENCL) {
BufferToImage<D, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, float>(&net, "Scale", "ScaleImage",
......@@ -57,15 +73,7 @@ void BatchNorm(
.Output("Output")
.Finalize(net.NewOperatorDef());
} else {
OpDefBuilder("BatchNorm", "BatchNormBM")
.Input("Input")
.Input("Scale")
.Input("Offset")
.Input("Mean")
.Input("Var")
.AddFloatArg("epsilon", 1e-3)
.Output("Output")
.Finalize(net.NewOperatorDef());
MACE_NOT_IMPLEMENTED;
}
// tuning
......
......@@ -34,7 +34,22 @@ void Simple() {
net.AddInputFromArray<D, float>("Mean", {1}, {10});
net.AddInputFromArray<D, float>("Var", {1}, {11.67f});
if (D == DeviceType::OPENCL) {
if (D == DeviceType::CPU) {
net.TransformDataFormat<D, float>("Input", NHWC, "InputNCHW", NCHW);
OpDefBuilder("BatchNorm", "BatchNormTest")
.Input("InputNCHW")
.Input("Scale")
.Input("Offset")
.Input("Mean")
.Input("Var")
.AddFloatArg("epsilon", 1e-3)
.Output("OutputNCHW")
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
net.TransformDataFormat<D, float>("OutputNCHW", NCHW, "Output", NHWC);
} else if (D == DeviceType::OPENCL) {
BufferToImage<D, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, float>(&net, "Scale", "ScaleImage",
......@@ -61,18 +76,6 @@ void Simple() {
// Transfer output
ImageToBuffer<D, float>(&net, "OutputImage", "Output",
kernels::BufferType::IN_OUT_CHANNEL);
} else {
OpDefBuilder("BatchNorm", "BatchNormTest")
.Input("Input")
.Input("Scale")
.Input("Offset")
.Input("Mean")
.Input("Var")
.AddFloatArg("epsilon", 1e-3)
.Output("Output")
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
}
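// [Editor's sketch, not part of this diff] The checks below validate the
// standard batch-norm transform, per element (epsilon guards a zero
// variance; assumes <cmath> is available):
//   out = (x - mean) / sqrt(var + epsilon) * scale + offset
inline float BatchNormRefSketch(float x, float scale, float offset,
                                float mean, float var, float epsilon) {
  return (x - mean) / std::sqrt(var + epsilon) * scale + offset;
}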
// Check
......@@ -97,17 +100,7 @@ TEST_F(BatchNormOpTest, SimpleRandomOPENCL) {
index_t height = 64;
index_t width = 64;
// Construct graph
OpsTestNet net;
OpDefBuilder("BatchNorm", "BatchNormTest")
.Input("Input")
.Input("Scale")
.Input("Offset")
.Input("Mean")
.Input("Var")
.AddFloatArg("epsilon", 1e-3)
.Output("Output")
.Finalize(net.NewOperatorDef());
// Add input data
net.AddRandomInput<DeviceType::OPENCL, float>(
......@@ -117,9 +110,30 @@ TEST_F(BatchNormOpTest, SimpleRandomOPENCL) {
net.AddRandomInput<DeviceType::OPENCL, float>("Mean", {channels});
net.AddRandomInput<DeviceType::OPENCL, float>("Var", {channels});
net.TransformDataFormat<DeviceType::CPU, float>("Input",
NHWC,
"InputNCHW",
NCHW);
// Construct graph
OpDefBuilder("BatchNorm", "BatchNormTest")
.Input("InputNCHW")
.Input("Scale")
.Input("Offset")
.Input("Mean")
.Input("Var")
.AddFloatArg("epsilon", 1e-3)
.Output("OutputNCHW")
.Finalize(net.NewOperatorDef());
// run cpu
net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW",
NCHW,
"Output",
NHWC);
// Check
Tensor expected;
expected.Copy(*net.GetOutput("Output"));
......@@ -170,15 +184,6 @@ TEST_F(BatchNormOpTest, SimpleRandomHalfOPENCL) {
// Construct graph
OpsTestNet net;
OpDefBuilder("BatchNorm", "BatchNormTest")
.Input("Input")
.Input("Scale")
.Input("Offset")
.Input("Mean")
.Input("Var")
.AddFloatArg("epsilon", 1e-1)
.Output("Output")
.Finalize(net.NewOperatorDef());
// Add input data
net.AddRandomInput<DeviceType::OPENCL, float>(
......@@ -188,9 +193,29 @@ TEST_F(BatchNormOpTest, SimpleRandomHalfOPENCL) {
net.AddRandomInput<DeviceType::OPENCL, float>("Mean", {channels});
net.AddRandomInput<DeviceType::OPENCL, float>("Var", {channels});
net.TransformDataFormat<DeviceType::CPU, float>("Input",
NHWC,
"InputNCHW",
NCHW);
OpDefBuilder("BatchNorm", "BatchNormTest")
.Input("InputNCHW")
.Input("Scale")
.Input("Offset")
.Input("Mean")
.Input("Var")
.AddFloatArg("epsilon", 1e-1)
.Output("OutputNCHW")
.Finalize(net.NewOperatorDef());
// run cpu
net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW",
NCHW,
"Output",
NHWC);
// Check
Tensor expected;
expected.Copy(*net.GetOutput("Output"));
......@@ -242,15 +267,6 @@ TEST_F(BatchNormOpTest, ComplexRandomOPENCL) {
// Construct graph
OpsTestNet net;
OpDefBuilder("BatchNorm", "BatchNormTest")
.Input("Input")
.Input("Scale")
.Input("Offset")
.Input("Mean")
.Input("Var")
.AddFloatArg("epsilon", 1e-3)
.Output("Output")
.Finalize(net.NewOperatorDef());
// Add input data
net.AddRandomInput<DeviceType::OPENCL, float>(
......@@ -260,9 +276,29 @@ TEST_F(BatchNormOpTest, ComplexRandomOPENCL) {
net.AddRandomInput<DeviceType::OPENCL, float>("Mean", {channels});
net.AddRandomInput<DeviceType::OPENCL, float>("Var", {channels});
net.TransformDataFormat<DeviceType::CPU, float>("Input",
NHWC,
"InputNCHW",
NCHW);
OpDefBuilder("BatchNorm", "BatchNormTest")
.Input("InputNCHW")
.Input("Scale")
.Input("Offset")
.Input("Mean")
.Input("Var")
.AddFloatArg("epsilon", 1e-3)
.Output("OutputNCHW")
.Finalize(net.NewOperatorDef());
// run cpu
net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW",
NCHW,
"Output",
NHWC);
// Check
Tensor expected;
expected.Copy(*net.GetOutput("Output"));
......@@ -313,15 +349,6 @@ TEST_F(BatchNormOpTest, ComplexRandomHalfOPENCL) {
// Construct graph
OpsTestNet net;
OpDefBuilder("BatchNorm", "BatchNormTest")
.Input("Input")
.Input("Scale")
.Input("Offset")
.Input("Mean")
.Input("Var")
.AddFloatArg("epsilon", 1e-1)
.Output("Output")
.Finalize(net.NewOperatorDef());
// Add input data
net.AddRandomInput<DeviceType::OPENCL, float>(
......@@ -331,9 +358,29 @@ TEST_F(BatchNormOpTest, ComplexRandomHalfOPENCL) {
net.AddRandomInput<DeviceType::OPENCL, float>("Mean", {channels});
net.AddRandomInput<DeviceType::OPENCL, float>("Var", {channels});
net.TransformDataFormat<DeviceType::CPU, float>("Input",
NHWC,
"InputNCHW",
NCHW);
OpDefBuilder("BatchNorm", "BatchNormTest")
.Input("InputNCHW")
.Input("Scale")
.Input("Offset")
.Input("Mean")
.Input("Var")
.AddFloatArg("epsilon", 1e-1)
.Output("OutputNCHW")
.Finalize(net.NewOperatorDef());
// run cpu
net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW",
NCHW,
"Output",
NHWC);
// Check
Tensor expected;
expected.Copy(*net.GetOutput("Output"));
......@@ -375,63 +422,6 @@ TEST_F(BatchNormOpTest, ComplexRandomHalfOPENCL) {
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"), 1e-1, 1e-2);
}
TEST_F(BatchNormOpTest, NEONTest) {
srand(time(NULL));
unsigned int seed;
// generate random input
index_t batch = 1 + rand_r(&seed) % 10;
index_t channels = 3 + rand_r(&seed) % 50;
index_t height = 64;
index_t width = 64;
// Construct graph
OpsTestNet net;
OpDefBuilder("BatchNorm", "BatchNormTest")
.Input("Input")
.Input("Scale")
.Input("Offset")
.Input("Mean")
.Input("Var")
.AddFloatArg("epsilon", 1e-3)
.Output("Output")
.Finalize(net.NewOperatorDef());
// Add input data
net.AddRandomInput<DeviceType::CPU, float>(
"Input", {batch, height, width, channels});
net.AddRandomInput<DeviceType::CPU, float>("Scale", {channels});
net.AddRandomInput<DeviceType::CPU, float>("Offset", {channels});
net.AddRandomInput<DeviceType::CPU, float>("Mean", {channels});
net.AddRandomInput<DeviceType::CPU, float>("Var", {channels});
// run cpu
net.RunOp();
OpDefBuilder("BatchNorm", "BatchNormTest")
.Input("InputNeon")
.Input("Scale")
.Input("Offset")
.Input("Mean")
.Input("Var")
.AddFloatArg("epsilon", 1e-3)
.Output("OutputNeon")
.Finalize(net.NewOperatorDef());
net.FillNHWCInputToNCHWInput<DeviceType::CPU, float>("InputNeon", "Input");
// Run on neon
net.RunOp(DeviceType::NEON);
net.Sync();
net.FillNHWCInputToNCHWInput<DeviceType::CPU, float>("OutputExptected",
"Output");
ExpectTensorNear<float>(*net.GetOutput("OutputExptected"),
*net.GetOutput("OutputNeon"),
1e-5, 1e-4);
}
} // namespace test
} // namespace ops
} // namespace mace
......@@ -29,10 +29,22 @@ void BiasAdd(int iters, int batch, int channels, int height, int width) {
OpsTestNet net;
// Add input data
net.AddRandomInput<D, T>("Input", {batch, height, width, channels});
if (D == DeviceType::CPU) {
net.AddRandomInput<D, T>("Input", {batch, channels, height, width});
} else if (D == DeviceType::OPENCL) {
net.AddRandomInput<D, T>("Input", {batch, height, width, channels});
} else {
MACE_NOT_IMPLEMENTED;
}
net.AddRandomInput<D, T>("Bias", {channels}, true);
if (D == DeviceType::OPENCL) {
if (D == DeviceType::CPU) {
OpDefBuilder("BiasAdd", "BiasAddBM")
.Input("Input")
.Input("Bias")
.Output("Output")
.Finalize(net.NewOperatorDef());
} else if (D == DeviceType::OPENCL) {
BufferToImage<D, T>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, T>(&net, "Bias", "BiasImage",
......@@ -43,11 +55,7 @@ void BiasAdd(int iters, int batch, int channels, int height, int width) {
.Output("Output")
.Finalize(net.NewOperatorDef());
} else {
OpDefBuilder("BiasAdd", "BiasAddBM")
.Input("Input")
.Input("Bias")
.Output("Output")
.Finalize(net.NewOperatorDef());
MACE_NOT_IMPLEMENTED;
}
// Warm-up
......
......@@ -31,7 +31,23 @@ void BiasAddSimple() {
{5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15});
net.AddInputFromArray<D, float>("Bias", {1}, {0.5f});
if (D == DeviceType::OPENCL) {
if (D == DeviceType::CPU) {
net.TransformDataFormat<DeviceType::CPU, float>("Input",
NHWC,
"InputNCHW",
NCHW);
OpDefBuilder("BiasAdd", "BiasAddTest")
.Input("InputNCHW")
.Input("Bias")
.Output("OutputNCHW")
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW",
NCHW,
"Output",
NHWC);
} else if (D == DeviceType::OPENCL) {
BufferToImage<D, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, float>(&net, "Bias", "BiasImage",
......@@ -49,13 +65,7 @@ void BiasAddSimple() {
ImageToBuffer<D, float>(&net, "OutputImage", "Output",
kernels::BufferType::IN_OUT_CHANNEL);
} else {
OpDefBuilder("BiasAdd", "BiasAddTest")
.Input("Input")
.Input("Bias")
.Output("Output")
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
MACE_NOT_IMPLEMENTED;
}
// Check
......@@ -81,22 +91,33 @@ TEST_F(BiasAddOpTest, SimpleRandomOPENCL) {
index_t height = 64 + rand_r(&seed) % 50;
index_t width = 64 + rand_r(&seed) % 50;
// Construct graph
OpsTestNet net;
OpDefBuilder("BiasAdd", "BiasAddTest")
.Input("Input")
.Input("Bias")
.Output("Output")
.Finalize(net.NewOperatorDef());
// Add input data
net.AddRandomInput<DeviceType::OPENCL, float>(
"Input", {batch, height, width, channels});
"Input", {batch, height, width, channels});
net.AddRandomInput<DeviceType::OPENCL, float>("Bias", {channels}, true);
net.TransformDataFormat<DeviceType::CPU, float>("Input",
NHWC,
"InputNCHW",
NCHW);
// Construct graph
OpDefBuilder("BiasAdd", "BiasAddTest")
.Input("InputNCHW")
.Input("Bias")
.Output("OutputNCHW")
.Finalize(net.NewOperatorDef());
// run cpu
net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW",
NCHW,
"Output",
NHWC);
// Check
Tensor expected;
expected.Copy(*net.GetOutput("Output"));
......@@ -130,22 +151,32 @@ TEST_F(BiasAddOpTest, ComplexRandomOPENCL) {
index_t height = 103 + rand_r(&seed) % 100;
index_t width = 113 + rand_r(&seed) % 100;
// Construct graph
OpsTestNet net;
OpDefBuilder("BiasAdd", "BiasAddTest")
.Input("Input")
.Input("Bias")
.Output("Output")
.Finalize(net.NewOperatorDef());
// Add input data
net.AddRandomInput<DeviceType::OPENCL, float>(
"Input", {batch, height, width, channels});
"Input", {batch, height, width, channels});
net.AddRandomInput<DeviceType::OPENCL, float>("Bias", {channels}, true);
net.TransformDataFormat<DeviceType::CPU, float>("Input",
NHWC,
"InputNCHW",
NCHW);
// Construct graph
OpDefBuilder("BiasAdd", "BiasAddTest")
.Input("InputNCHW")
.Input("Bias")
.Output("OutputNCHW")
.Finalize(net.NewOperatorDef());
// run cpu
net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW",
NCHW,
"Output",
NHWC);
// Check
Tensor expected;
expected.Copy(*net.GetOutput("Output"));
......
......@@ -34,7 +34,14 @@ class ChannelShuffleOp : public Operator<D, T> {
bool Run(StatsFuture *future) override {
const Tensor *input = this->Input(INPUT);
Tensor *output = this->Output(OUTPUT);
int channels = input->dim(3);
int channels;
if (D == OPENCL) {
channels = input->dim(3);
} else if (D == CPU) {
channels = input->dim(1);
} else {
MACE_NOT_IMPLEMENTED;
}
    MACE_CHECK(channels % group_ == 0,
               "input channels must be an integral multiple of group. ",
               channels);
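// [Editor's sketch, not part of this diff] One common formulation of the
// shuffle mapping this check guards (the actual kernel may invert the
// direction): with channels_per_group = channels / group_, output channel c
// reads input channel
inline int ShuffledInputChannelSketch(int c, int group, int channels_per_group) {
  return (c % group) * channels_per_group + c / group;
}
// e.g. channels = 8, group = 4 permutes channels 0..7 into 0,2,4,6,1,3,5,7.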
......
......@@ -29,9 +29,20 @@ void ChannelShuffle(
OpsTestNet net;
// Add input data
net.AddRandomInput<D, float>("Input", {batch, height, width, channels});
if (D == DeviceType::CPU) {
net.AddRandomInput<D, float>("Input", {batch, height, channels, width});
} else if (D == DeviceType::OPENCL) {
net.AddRandomInput<D, float>("Input", {batch, height, width, channels});
} else {
MACE_NOT_IMPLEMENTED;
}
if (D == DeviceType::OPENCL) {
if (D == DeviceType::CPU) {
OpDefBuilder("Softmax", "SoftmaxBM")
.Input("Input")
.Output("Output")
.Finalize(net.NewOperatorDef());
} else if (D == DeviceType::OPENCL) {
BufferToImage<D, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
......@@ -41,10 +52,7 @@ void ChannelShuffle(
.AddIntArg("group", group)
.Finalize(net.NewOperatorDef());
} else {
OpDefBuilder("Softmax", "SoftmaxBM")
.Input("Input")
.Output("Output")
.Finalize(net.NewOperatorDef());
MACE_NOT_IMPLEMENTED;
}
// Warm-up
......
......@@ -22,21 +22,31 @@ namespace test {
class ChannelShuffleOpTest : public OpsTestBase {};
TEST_F(ChannelShuffleOpTest, C8G4_CPU) {
// Construct graph
OpsTestNet net;
OpDefBuilder("ChannelShuffle", "ChannelShuffleTest")
.Input("Input")
.Output("Output")
.AddIntArg("group", 4)
.Finalize(net.NewOperatorDef());
// Add input data
net.AddInputFromArray<DeviceType::CPU, float>(
"Input", {1, 1, 2, 8},
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15});
net.TransformDataFormat<DeviceType::CPU, float>("Input",
NHWC,
"InputNCHW",
NCHW);
// Construct graph
OpDefBuilder("ChannelShuffle", "ChannelShuffleTest")
.Input("InputNCHW")
.Output("OutputNCHW")
.AddIntArg("group", 4)
.Finalize(net.NewOperatorDef());
// Run
net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW",
NCHW,
"Output",
NHWC);
// Check
auto expected = CreateTensor<float>(
......
......@@ -33,11 +33,6 @@ void Register_Concat(OperatorRegistry *op_registry) {
.TypeConstraint<half>("T")
.Build(),
ConcatOp<DeviceType::OPENCL, half>);
REGISTER_OPERATOR(op_registry, OpKeyBuilder("Concat")
.Device(DeviceType::NEON)
.TypeConstraint<float>("T")
.Build(),
ConcatOp<DeviceType::NEON, float>);
}
} // namespace ops
......
......@@ -35,12 +35,6 @@ void Register_Conv2D(OperatorRegistry *op_registry) {
.TypeConstraint<half>("T")
.Build(),
Conv2dOp<DeviceType::OPENCL, half>);
REGISTER_OPERATOR(op_registry, OpKeyBuilder("Conv2D")
.Device(DeviceType::NEON)
.TypeConstraint<float>("T")
.Build(),
Conv2dOp<DeviceType::NEON, float>);
}
} // namespace ops
......
......@@ -41,21 +41,34 @@ void Conv2d(int iters,
OpsTestNet net;
// Add input data
if (D == DeviceType::NEON) {
if (D == DeviceType::CPU) {
net.AddRandomInput<D, float>("Input", {batch, channels, height, width});
net.AddRandomInput<D, float>("Filter",
{output_channels, channels, kernel_h,
kernel_w});
net.AddRandomInput<D, float>("Bias", {output_channels});
} else {
} else if (D == DeviceType::OPENCL) {
net.AddRandomInput<D, float>("Input", {batch, height, width, channels});
net.AddRandomInput<D, float>("Filter",
{kernel_h, kernel_w, output_channels,
channels});
net.AddRandomInput<D, float>("Bias", {output_channels});
} else {
MACE_NOT_IMPLEMENTED;
}
if (D == DeviceType::OPENCL) {
if (D == DeviceType::CPU) {
OpDefBuilder("Conv2D", "Conv2dTest")
.Input("Input")
.Input("Filter")
.Input("Bias")
.Output("Output")
.AddIntsArg("strides", {stride, stride})
.AddIntArg("padding", padding)
.AddIntsArg("dilations", {dilation, dilation})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
} else if (D == DeviceType::OPENCL) {
BufferToImage<D, T>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, T>(&net, "Filter", "FilterImage",
......@@ -73,16 +86,7 @@ void Conv2d(int iters,
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
} else {
OpDefBuilder("Conv2D", "Conv2dTest")
.Input("Input")
.Input("Filter")
.Input("Bias")
.Output("Output")
.AddIntsArg("strides", {stride, stride})
.AddIntArg("padding", padding)
.AddIntsArg("dilations", {dilation, dilation})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
MACE_NOT_IMPLEMENTED;
}
net.Setup(D);
......@@ -134,7 +138,6 @@ void Conv2d(int iters,
#define BM_CONV_2D(N, C, H, W, KH, KW, S, D, P, OC) \
BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, D, P, OC, float, CPU); \
BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, D, P, OC, float, NEON); \
BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, D, P, OC, float, OPENCL); \
BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, D, P, OC, half, OPENCL);
......
......@@ -38,7 +38,32 @@ void TestNHWCSimple3x3VALID() {
1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f});
net.AddInputFromArray<D, T>("Bias", {1}, {0.1f});
if (D == DeviceType::OPENCL) {
if (D == DeviceType::CPU) {
net.TransformDataFormat<DeviceType::CPU, float>("Input",
NHWC,
"InputNCHW",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>("Filter",
HWOI,
"FilterOIHW",
OIHW);
OpDefBuilder("Conv2D", "Conv2dTest")
.Input("InputNCHW")
.Input("FilterOIHW")
.Input("Bias")
.Output("OutputNCHW")
.AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW",
NCHW,
"Output",
NHWC);
} else if (D == DeviceType::OPENCL) {
BufferToImage<D, T>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, T>(&net, "Filter", "FilterImage",
......@@ -63,18 +88,7 @@ void TestNHWCSimple3x3VALID() {
kernels::BufferType::IN_OUT_CHANNEL);
} else {
OpDefBuilder("Conv2D", "Conv2dTest")
.Input("Input")
.Input("Filter")
.Input("Bias")
.Output("Output")
.AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
MACE_NOT_IMPLEMENTED;
}
auto expected = CreateTensor<float>({1, 1, 1, 1}, {18.1f});
......@@ -95,7 +109,32 @@ void TestNHWCSimple3x3SAME() {
1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f});
net.AddInputFromArray<D, T>("Bias", {1}, {0.1f});
if (D == DeviceType::OPENCL) {
if (D == DeviceType::CPU) {
net.TransformDataFormat<DeviceType::CPU, float>("Input",
NHWC,
"InputNCHW",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>("Filter",
HWOI,
"FilterOIHW",
OIHW);
OpDefBuilder("Conv2D", "Conv2dTest")
.Input("InputNCHW")
.Input("FilterOIHW")
.Input("Bias")
.Output("OutputNCHW")
.AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::SAME)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW",
NCHW,
"Output",
NHWC);
} else if (D == DeviceType::OPENCL) {
BufferToImage<D, T>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, T>(&net, "Filter", "FilterImage",
......@@ -120,18 +159,7 @@ void TestNHWCSimple3x3SAME() {
kernels::BufferType::IN_OUT_CHANNEL);
} else {
OpDefBuilder("Conv2D", "Conv2dTest")
.Input("Input")
.Input("Filter")
.Input("Bias")
.Output("Output")
.AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::SAME)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
MACE_NOT_IMPLEMENTED;
}
auto expected = CreateTensor<float>(
......@@ -166,7 +194,32 @@ void TestNHWCSimple3x3WithoutBias() {
{1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f});
if (D == DeviceType::OPENCL) {
if (D == DeviceType::CPU) {
net.TransformDataFormat<DeviceType::CPU, float>("Input",
NHWC,
"InputNCHW",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>("Filter",
HWOI,
"FilterOIHW",
OIHW);
OpDefBuilder("Conv2D", "Conv2dTest")
.Input("InputNCHW")
.Input("FilterOIHW")
.Output("OutputNCHW")
.AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW",
NCHW,
"Output",
NHWC);
} else if (D == DeviceType::OPENCL) {
BufferToImage<D, T>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, T>(&net, "Filter", "FilterImage",
......@@ -187,18 +240,7 @@ void TestNHWCSimple3x3WithoutBias() {
ImageToBuffer<D, T>(&net, "OutputImage", "Output",
kernels::BufferType::IN_OUT_CHANNEL);
} else {
OpDefBuilder("Conv2D", "Conv2dTest")
.Input("Input")
.Input("Filter")
.Output("Output")
.AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
MACE_NOT_IMPLEMENTED;
}
// Check
......@@ -234,7 +276,32 @@ void TestNHWCCombined3x3() {
1.0f, 1.0f, 0.5f, 0.5f, 1.0f, 1.0f, 0.5f, 0.5f, 1.0f, 1.0f, 0.5f, 0.5f});
net.AddInputFromArray<D, T>("Bias", {2}, {0.1f, 0.2f});
if (D == DeviceType::OPENCL) {
if (D == DeviceType::CPU) {
net.TransformDataFormat<DeviceType::CPU, float>("Input",
NHWC,
"InputNCHW",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>("Filter",
HWOI,
"FilterOIHW",
OIHW);
OpDefBuilder("Conv2D", "Conv2DTest")
.Input("InputNCHW")
.Input("FilterOIHW")
.Input("Bias")
.Output("OutputNCHW")
.AddIntsArg("strides", {2, 2})
.AddIntArg("padding", Padding::SAME)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW",
NCHW,
"Output",
NHWC);
} else if (D == DeviceType::OPENCL) {
BufferToImage<D, T>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, T>(&net, "Filter", "FilterImage",
......@@ -258,18 +325,7 @@ void TestNHWCCombined3x3() {
ImageToBuffer<D, T>(&net, "OutputImage", "Output",
kernels::BufferType::IN_OUT_CHANNEL);
} else {
OpDefBuilder("Conv2D", "Conv2DTest")
.Input("Input")
.Input("Filter")
.Input("Bias")
.Output("Output")
.AddIntsArg("strides", {2, 2})
.AddIntArg("padding", Padding::SAME)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
MACE_NOT_IMPLEMENTED;
}
// Check
......@@ -309,7 +365,31 @@ void TestConv1x1() {
{1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f});
net.AddInputFromArray<D, float>("Bias", {2}, {0.1f, 0.2f});
if (D == DeviceType::OPENCL) {
if (D == DeviceType::CPU) {
net.TransformDataFormat<DeviceType::CPU, float>("Input",
NHWC,
"InputNCHW",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>("Filter",
HWOI,
"FilterOIHW",
OIHW);
OpDefBuilder("Conv2D", "Conv2DTest")
.Input("InputNCHW")
.Input("FilterOIHW")
.Input("Bias")
.Output("OutputNCHW")
.AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1})
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW",
NCHW,
"Output",
NHWC);
} else if (D == DeviceType::OPENCL) {
BufferToImage<D, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, float>(&net, "Filter", "FilterImage",
......@@ -332,17 +412,7 @@ void TestConv1x1() {
ImageToBuffer<D, float>(&net, "OutputImage", "Output",
kernels::BufferType::IN_OUT_CHANNEL);
} else {
OpDefBuilder("Conv2D", "Conv2DTest")
.Input("Input")
.Input("Filter")
.Input("Bias")
.Output("Output")
.AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1})
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
MACE_NOT_IMPLEMENTED;
}
// Check
......@@ -377,27 +447,44 @@ void TestComplexConvNxNS12(const std::vector<index_t> &shape,
index_t width = shape[1];
index_t input_channels = shape[2] + (rand_r(&seed) % 10);
index_t output_channels = shape[3] + (rand_r(&seed) % 10);
// Construct graph
OpsTestNet net;
// Add input data
net.AddRandomInput<D, T>("Input", {batch, height, width, input_channels});
net.AddRandomInput<D, T>(
"Filter", {kernel_h, kernel_w, output_channels, input_channels});
net.AddRandomInput<D, T>("Bias", {output_channels});
net.TransformDataFormat<DeviceType::CPU, float>("Input",
NHWC,
"InputNCHW",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>("Filter",
HWOI,
"FilterOIHW",
OIHW);
// Construct graph
OpDefBuilder("Conv2D", "Conv2dTest")
.Input("Input")
.Input("Filter")
.Input("InputNCHW")
.Input("FilterOIHW")
.Input("Bias")
.Output("Output")
.Output("OutputNCHW")
.AddIntsArg("strides", {stride_h, stride_w})
.AddIntArg("padding", type)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
// run on cpu
net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW",
NCHW,
"Output",
NHWC);
// Check
Tensor expected;
expected.Copy(*net.GetOutput("Output"));
......@@ -470,15 +557,6 @@ void TestHalfComplexConvNxNS12(const std::vector<index_t> &input_shape,
index_t output_channels = filter_shape[3];
// Construct graph
OpsTestNet net;
OpDefBuilder("Conv2D", "Conv2dTest")
.Input("Input")
.Input("Filter")
.Input("Bias")
.Output("Output")
.AddIntsArg("strides", {stride_h, stride_w})
.AddIntArg("padding", padding)
.AddIntsArg("dilations", {dilations[0], dilations[1]})
.Finalize(net.NewOperatorDef());
std::vector<float> float_input_data;
GenerateRandomRealTypeData({batch, height, width, input_channels},
......@@ -497,8 +575,33 @@ void TestHalfComplexConvNxNS12(const std::vector<index_t> &input_shape,
float_filter_data);
net.AddInputFromArray<D, float>("Bias", {output_channels}, float_bias_data);
net.TransformDataFormat<DeviceType::CPU, float>("Input",
NHWC,
"InputNCHW",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>("Filter",
HWOI,
"FilterOIHW",
OIHW);
OpDefBuilder("Conv2D", "Conv2dTest")
.Input("InputNCHW")
.Input("FilterOIHW")
.Input("Bias")
.Output("OutputNCHW")
.AddIntsArg("strides", {stride_h, stride_w})
.AddIntArg("padding", padding)
.AddIntsArg("dilations", {dilations[0], dilations[1]})
.Finalize(net.NewOperatorDef());
// run on cpu
net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW",
NCHW,
"Output",
NHWC);
// Check
Tensor expected;
expected.Copy(*net.GetOutput("Output"));
......@@ -605,27 +708,44 @@ void TestDilationConvNxN(const std::vector<index_t> &shape,
index_t width = shape[1];
index_t input_channels = shape[2];
index_t output_channels = shape[3];
// Construct graph
OpsTestNet net;
// Add input data
net.AddRandomInput<D, T>("Input", {batch, height, width, input_channels});
net.AddRandomInput<D, T>(
"Filter", {kernel_h, kernel_w, output_channels, input_channels});
net.AddRandomInput<D, T>("Bias", {output_channels});
net.TransformDataFormat<DeviceType::CPU, float>("Input",
NHWC,
"InputNCHW",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>("Filter",
HWOI,
"FilterOIHW",
OIHW);
// Construct graph
OpDefBuilder("Conv2D", "Conv2dTest")
.Input("Input")
.Input("Filter")
.Input("InputNCHW")
.Input("FilterOIHW")
.Input("Bias")
.Output("Output")
.Output("OutputNCHW")
.AddIntsArg("strides", {stride_h, stride_w})
.AddIntArg("padding", type)
.AddIntsArg("dilations", {dilation_rate, dilation_rate})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
// run on cpu
net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW",
NCHW,
"Output",
NHWC);
// Check
Tensor expected;
expected.Copy(*net.GetOutput("Output"));
......@@ -692,17 +812,8 @@ void TestArbitraryPadConvNxN(const std::vector<index_t> &shape,
index_t width = shape[1];
index_t input_channels = shape[2];
index_t output_channels = shape[3];
// Construct graph
OpsTestNet net;
OpDefBuilder("Conv2D", "Conv2dTest")
.Input("Input")
.Input("Filter")
.Input("Bias")
.Output("Output")
.AddIntsArg("strides", {stride_h, stride_w})
.AddIntsArg("padding_values", paddings)
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
// Add input data
net.AddRandomInput<D, T>("Input", {batch, height, width, input_channels});
......@@ -710,8 +821,33 @@ void TestArbitraryPadConvNxN(const std::vector<index_t> &shape,
"Filter", {kernel_h, kernel_w, output_channels, input_channels});
net.AddRandomInput<D, T>("Bias", {output_channels});
net.TransformDataFormat<DeviceType::CPU, float>("Input",
NHWC,
"InputNCHW",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>("Filter",
HWOI,
"FilterOIHW",
OIHW);
// Construct graph
OpDefBuilder("Conv2D", "Conv2dTest")
.Input("InputNCHW")
.Input("FilterOIHW")
.Input("Bias")
.Output("OutputNCHW")
.AddIntsArg("strides", {stride_h, stride_w})
.AddIntsArg("padding_values", paddings)
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
// run on cpu
net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW",
NCHW,
"Output",
NHWC);
// Check
Tensor expected;
expected.Copy(*net.GetOutput("Output"));
......@@ -763,83 +899,6 @@ TEST_F(Conv2dOpTest, OPENCLUnalignedPad4) {
TestArbitraryPadConvNxN<DeviceType::OPENCL, float>({107, 113, 5, 7}, {4, 4});
}
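// [Editor's sketch, not part of this diff] The HWOI -> OIHW filter relayout
// these tests request via TransformDataFormat, spelled out as a plain loop:
// element (h, w, o, i) moves to (o, i, h, w).
inline void HWOIToOIHWSketch(const float *src, float *dst,
                             int H, int W, int O, int I) {
  for (int o = 0; o < O; ++o)
    for (int i = 0; i < I; ++i)
      for (int h = 0; h < H; ++h)
        for (int w = 0; w < W; ++w)
          dst[((o * I + i) * H + h) * W + w] =
              src[((h * W + w) * O + o) * I + i];
}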
static void TestNeonArbitraryPadConvNxN(const std::vector<index_t> &shape,
const std::vector<int> &paddings) {
testing::internal::LogToStderr();
auto func = [&](int kernel_h, int kernel_w, int stride_h, int stride_w) {
srand(time(NULL));
// generate random input
index_t batch = 1;
index_t height = shape[0];
index_t width = shape[1];
index_t input_channels = shape[2];
index_t output_channels = shape[3];
// Construct graph
OpsTestNet net;
OpDefBuilder("Conv2D", "Conv2dTestCPU")
.Input("Input")
.Input("Filter")
.Input("Bias")
.Output("Output")
.AddIntsArg("strides", {stride_h, stride_w})
.AddIntsArg("padding_values", paddings)
.AddIntArg("T", static_cast<int>(DataTypeToEnum<float>::value))
.Finalize(net.NewOperatorDef());
// Add input data
net.AddRandomInput<DeviceType::CPU, float>("Input",
{batch, height, width,
input_channels});
net.AddRandomInput<DeviceType::CPU, float>(
"Filter", {kernel_h, kernel_w, output_channels, input_channels});
net.AddRandomInput<DeviceType::CPU, float>("Bias", {output_channels});
// run cpu
net.RunOp();
// run neon
OpDefBuilder("Conv2D", "Conv2dTestNEON")
.Input("InputNeon")
.Input("FilterNeon")
.Input("Bias")
.Output("OutputNeon")
.AddIntsArg("strides", {stride_h, stride_w})
.AddIntsArg("padding_values", paddings)
.AddIntArg("T", static_cast<int>(DataTypeToEnum<float>::value))
.Finalize(net.NewOperatorDef());
net.FillNHWCInputToNCHWInput<DeviceType::CPU, float>("InputNeon", "Input");
net.FillHWOIInputToOIHWInput<DeviceType::CPU, float>("FilterNeon",
"Filter");
// Run on device
net.RunOp(DeviceType::NEON);
net.FillNHWCInputToNCHWInput<DeviceType::CPU, float>("OutputExptected",
"Output");
ExpectTensorNear<float>(*net.GetOutput("OutputExptected"),
*net.GetOutput("OutputNeon"),
1e-5, 1e-3);
};
for (int kernel_size : {1, 3, 5}) {
for (int stride : {1, 2}) {
if (stride <= kernel_size) {
func(kernel_size, kernel_size, stride, stride);
}
}
}
}
TEST_F(Conv2dOpTest, NEONTest) {
TestNeonArbitraryPadConvNxN({32, 34, 32, 64}, {0, 0});
TestNeonArbitraryPadConvNxN({32, 32, 32, 64}, {1, 1});
TestNeonArbitraryPadConvNxN({128, 128, 16, 16}, {2, 2});
TestNeonArbitraryPadConvNxN({107, 113, 5, 7}, {4, 4});
}
} // namespace test
} // namespace ops
} // namespace mace
......@@ -37,10 +37,17 @@ class DepthToSpaceOp : public Operator<D, T> {
Tensor *output = this->Output(OUTPUT);
MACE_CHECK(input->dim_size() == 4, "input dim should be 4");
int input_depth = input->dim(3);
int input_depth;
if (D == CPU) {
input_depth = input->dim(1);
} else if (D == OPENCL) {
input_depth = input->dim(3);
} else {
MACE_NOT_IMPLEMENTED;
}
MACE_CHECK(input_depth % (block_size_ * block_size_) == 0,
"input depth should be dividable by block_size * block_size",
input->dim(3));
input_depth);
MACE_CHECK((input_depth % 4) == 0,
"input channel should be dividable by 4");
functor_(input, output, future);
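    // [Editor's sketch, not part of this diff] One common NCHW depth-to-space
    // mapping (DCR ordering) that the divisibility checks above guard, with
    // block size b: output (n, c, h, w) reads the input channel below at
    // spatial position (h / b, w / b). The actual kernel's ordering may vary.
    //   inline int D2SInputChannelSketch(int c, int h, int w, int b) {
    //     return c * b * b + (h % b) * b + (w % b);
    //   }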
......
......@@ -29,9 +29,20 @@ void DepthToSpace(
OpsTestNet net;
// Add input data
net.AddRandomInput<D, float>("Input", {batch, height, width, channels});
if (D == DeviceType::CPU) {
net.AddRandomInput<D, float>("Input", {batch, channels, height, width});
} else if (D == DeviceType::OPENCL) {
net.AddRandomInput<D, float>("Input", {batch, height, width, channels});
} else {
MACE_NOT_IMPLEMENTED;
}
if (D == DeviceType::OPENCL) {
if (D == DeviceType::CPU) {
OpDefBuilder("DepthToSpace", "DepthToSpaceBM")
.Input("Input")
.Output("Output")
.Finalize(net.NewOperatorDef());
} else if (D == DeviceType::OPENCL) {
BufferToImage<D, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
......@@ -41,10 +52,7 @@ void DepthToSpace(
.AddIntArg("block_size", block_size)
.Finalize(net.NewOperatorDef());
} else {
OpDefBuilder("DepthToSpace", "DepthToSpaceBM")
.Input("Input")
.Output("Output")
.Finalize(net.NewOperatorDef());
MACE_NOT_IMPLEMENTED;
}
// Warm-up
......
......@@ -36,11 +36,21 @@ void RunDepthToSpace(const bool d2s,
const char *ops_test_name = (d2s) ? "DepthToSpaceTest" : "SpaceToDepthTest";
// Construct graph
if (D == DeviceType::CPU) {
net.TransformDataFormat<DeviceType::CPU, float>("Input",
NHWC,
"InputNCHW",
NCHW);
OpDefBuilder(ops_name, ops_test_name)
.Input("Input")
.Output("Output")
.Input("InputNCHW")
.Output("OutputNCHW")
.AddIntArg("block_size", block_size)
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW",
NCHW,
"Output",
NHWC);
} else {
BufferToImage<D, float>(&net, "Input", "InputImage",
......@@ -50,9 +60,10 @@ void RunDepthToSpace(const bool d2s,
.Output("OutputImage")
.AddIntArg("block_size", block_size)
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
}
// Run
net.RunOp(D);
if (D == DeviceType::OPENCL) {
ImageToBuffer<DeviceType::OPENCL, float>(&net, "OutputImage", "Output",
......@@ -176,22 +187,31 @@ void RandomTest(const bool d2s, const int block_size,
const char *ops_test_name = (d2s) ? "DepthToSpaceTest" : "SpaceToDepthTest";
// Add input data
net.AddRandomInput<D, float>("Input1", shape);
net.AddRandomInput<D, float>("Input", shape);
net.TransformDataFormat<DeviceType::CPU, float>("Input",
NHWC,
"InputNCHW",
NCHW);
OpDefBuilder(ops_name, ops_test_name)
.Input("Input1")
.Input("InputNCHW")
.AddIntArg("block_size", block_size)
.Output("Output")
.Output("OutputNCHW")
.Finalize(net.NewOperatorDef());
// Run
net.RunOp();
BufferToImage<D, T>(&net, "Input1", "InputImg1",
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW",
NCHW,
"Output",
NHWC);
BufferToImage<D, T>(&net, "Input", "InputImg",
kernels::BufferType::IN_OUT_CHANNEL);
OpDefBuilder(ops_name, ops_test_name)
.Input("InputImg1")
.Input("InputImg")
.AddIntArg("block_size", block_size)
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Output("OutputImg")
......
......@@ -35,12 +35,6 @@ void Register_DepthwiseConv2d(OperatorRegistry *op_registry) {
.TypeConstraint<half>("T")
.Build(),
DepthwiseConv2dOp<DeviceType::OPENCL, half>);
REGISTER_OPERATOR(op_registry, OpKeyBuilder("DepthwiseConv2d")
.Device(DeviceType::NEON)
.TypeConstraint<float>("T")
.Build(),
DepthwiseConv2dOp<DeviceType::NEON, float>);
}
} // namespace ops
......
......@@ -40,21 +40,34 @@ void DepthwiseConv2d(int iters,
OpsTestNet net;
// Add input data
if (D == DeviceType::NEON) {
if (D == DeviceType::CPU) {
net.AddRandomInput<D, float>("Input",
{batch, input_channels, height, width});
net.AddRandomInput<D, float>(
"Filter", {multiplier, input_channels, kernel_h, kernel_w});
net.AddRandomInput<D, float>("Bias", {input_channels * multiplier});
} else {
} else if (D == DeviceType::OPENCL) {
net.AddRandomInput<D, float>("Input",
{batch, height, width, input_channels});
net.AddRandomInput<D, float>(
"Filter", {kernel_h, kernel_w, input_channels, multiplier});
net.AddRandomInput<D, float>("Bias", {input_channels * multiplier});
} else {
MACE_NOT_IMPLEMENTED;
}
if (D == DeviceType::OPENCL) {
if (D == DeviceType::CPU) {
OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2dTest")
.Input("Input")
.Input("Filter")
.Input("Bias")
.Output("Output")
.AddIntsArg("strides", {stride, stride})
.AddIntArg("padding", padding)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
} else if (D == DeviceType::OPENCL) {
BufferToImage<D, T>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, T>(&net, "Filter", "FilterImage",
......@@ -72,16 +85,7 @@ void DepthwiseConv2d(int iters,
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
} else {
OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2dTest")
.Input("Input")
.Input("Filter")
.Input("Bias")
.Output("Output")
.AddIntsArg("strides", {stride, stride})
.AddIntArg("padding", padding)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
MACE_NOT_IMPLEMENTED;
}
net.Setup(D);
......@@ -131,8 +135,7 @@ void DepthwiseConv2d(int iters,
#define BM_DEPTHWISE_CONV_2D(N, C, H, W, KH, KW, S, P, M) \
BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, M, float, CPU); \
BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, M, float, OPENCL); \
BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, M, half, OPENCL); \
BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, M, float, NEON);
BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, M, half, OPENCL);
BM_DEPTHWISE_CONV_2D(1, 32, 112, 112, 3, 3, 1, SAME, 1);
BM_DEPTHWISE_CONV_2D(1, 32, 56, 56, 3, 3, 2, VALID, 1);
......
......@@ -35,7 +35,31 @@ void SimpleValidTest() {
net.AddInputFromArray<D, float>(
"Filter", {2, 2, 2, 1}, {1.0f, 2.0f, 2.0f, 4.0f, 3.0f, 6.0f, 4.0f, 8.0f});
net.AddInputFromArray<D, float>("Bias", {2}, {.1f, .2f});
if (D == DeviceType::OPENCL) {
if (D == DeviceType::CPU) {
net.TransformDataFormat<DeviceType::CPU, float>("Input",
NHWC,
"InputNCHW",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>("Filter",
HWIO,
"FilterOIHW",
OIHW);
OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2DTest")
.Input("InputNCHW")
.Input("FilterOIHW")
.Input("Bias")
.Output("OutputNCHW")
.AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1})
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW",
NCHW,
"Output",
NHWC);
} else if (D == DeviceType::OPENCL) {
BufferToImage<D, T>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, T>(&net, "Filter", "FilterImage",
......@@ -60,17 +84,7 @@ void SimpleValidTest() {
kernels::BufferType::IN_OUT_CHANNEL);
} else {
OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2DTest")
.Input("Input")
.Input("Filter")
.Input("Bias")
.Output("Output")
.AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1})
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
MACE_NOT_IMPLEMENTED;
}
// Check
......@@ -144,7 +158,33 @@ void ComplexValidTest() {
0.66, 0.67, 0.68, 0.69, 0.7, 0.71, 0.72, 0.73, 0.74});
net.AddInputFromArray<D, float>("Bias", {6},
{0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f});
if (D == DeviceType::OPENCL) {
if (D == DeviceType::CPU) {
net.TransformDataFormat<DeviceType::CPU, float>("Input",
NHWC,
"InputNCHW",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>("Filter",
HWIO,
"FilterOIHW",
OIHW);
OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2DTest")
.Input("InputNCHW")
.Input("FilterOIHW")
.Input("Bias")
.Output("OutputNCHW")
.AddIntsArg("strides", {2, 2})
.AddIntArg("padding", Padding::SAME)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW",
NCHW,
"Output",
NHWC);
} else if (D == DeviceType::OPENCL) {
BufferToImage<D, T>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, T>(&net, "Filter", "FilterImage",
......@@ -169,18 +209,7 @@ void ComplexValidTest() {
kernels::BufferType::IN_OUT_CHANNEL);
} else {
OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2DTest")
.Input("Input")
.Input("Filter")
.Input("Bias")
.Output("Output")
.AddIntsArg("strides", {2, 2})
.AddIntArg("padding", Padding::SAME)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
MACE_NOT_IMPLEMENTED;
}
// Check
......@@ -224,7 +253,7 @@ TEST_F(DepthwiseConv2dOpTest, ComplexOpenCLHalf) {
}
namespace {
template<DeviceType D, typename T>
template<typename T>
void TestNxNS12(const index_t height, const index_t width) {
testing::internal::LogToStderr();
auto func = [&](int kernel_h, int kernel_w, int stride_h, int stride_w,
......@@ -238,16 +267,28 @@ void TestNxNS12(const index_t height, const index_t width) {
OpsTestNet net;
// Add input data
net.AddRandomInput<D, float>("Input",
{batch, height, width, input_channels});
net.AddRandomInput<D, float>(
net.AddRandomInput<DeviceType::OPENCL, float>("Input",
{batch, height, width,
input_channels});
net.AddRandomInput<DeviceType::OPENCL, float>(
"Filter", {kernel_h, kernel_w, input_channels, multiplier});
net.AddRandomInput<D, float>("Bias", {multiplier * input_channels});
net.AddRandomInput<DeviceType::OPENCL, float>("Bias",
{multiplier
* input_channels});
net.TransformDataFormat<DeviceType::CPU, float>("Input",
NHWC,
"InputNCHW",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>("Filter",
HWIO,
"FilterOIHW",
OIHW);
OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2DTest")
.Input("Input")
.Input("Filter")
.Input("InputNCHW")
.Input("FilterOIHW")
.Input("Bias")
.Output("Output")
.Output("OutputNCHW")
.AddIntsArg("strides", {stride_h, stride_w})
.AddIntArg("padding", type)
.AddIntsArg("dilations", {1, 1})
......@@ -256,48 +297,41 @@ void TestNxNS12(const index_t height, const index_t width) {
// Run on cpu
net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW",
NCHW,
"Output",
NHWC);
// Check
Tensor expected;
expected.Copy(*net.GetOutput("Output"));
if (D == DeviceType::OPENCL) {
BufferToImage<D, T>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, T>(&net, "Filter", "FilterImage",
kernels::BufferType::DW_CONV2D_FILTER);
BufferToImage<D, T>(&net, "Bias", "BiasImage",
kernels::BufferType::ARGUMENT);
OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2DTest")
.Input("InputImage")
.Input("FilterImage")
.Input("BiasImage")
.Output("OutputImage")
.AddIntsArg("strides", {stride_h, stride_w})
.AddIntArg("padding", type)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
net.RunOp(D);
// Transfer output
ImageToBuffer<D, float>(&net, "OutputImage", "DeviceOutput",
kernels::BufferType::IN_OUT_CHANNEL);
} else {
OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2DTest")
.Input("Input")
.Input("Filter")
.Input("Bias")
.Output("DeviceOutput")
.AddIntsArg("strides", {stride_h, stride_w})
.AddIntArg("padding", type)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
}
BufferToImage<DeviceType::OPENCL, T>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<DeviceType::OPENCL, T>(&net, "Filter", "FilterImage",
kernels::BufferType::DW_CONV2D_FILTER);
BufferToImage<DeviceType::OPENCL, T>(&net, "Bias", "BiasImage",
kernels::BufferType::ARGUMENT);
OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2DTest")
.Input("InputImage")
.Input("FilterImage")
.Input("BiasImage")
.Output("OutputImage")
.AddIntsArg("strides", {stride_h, stride_w})
.AddIntArg("padding", type)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
net.RunOp(DeviceType::OPENCL);
// Transfer output
ImageToBuffer<DeviceType::OPENCL, float>(&net,
"OutputImage",
"DeviceOutput",
kernels::BufferType::IN_OUT_CHANNEL);
// Check
if (DataTypeToEnum<T>::value == DT_FLOAT) {
......@@ -319,109 +353,27 @@ void TestNxNS12(const index_t height, const index_t width) {
} // namespace
TEST_F(DepthwiseConv2dOpTest, OpenCLSimpleNxNS12) {
TestNxNS12<DeviceType::OPENCL, float>(4, 4);
TestNxNS12<float>(4, 4);
}
TEST_F(DepthwiseConv2dOpTest, OpenCLSimpleNxNS12Half) {
TestNxNS12<DeviceType::OPENCL, half>(4, 4);
TestNxNS12<half>(4, 4);
}
TEST_F(DepthwiseConv2dOpTest, OpenCLAlignedNxNS12) {
TestNxNS12<DeviceType::OPENCL, float>(128, 128);
TestNxNS12<float>(128, 128);
}
TEST_F(DepthwiseConv2dOpTest, OpenCLAlignedNxNS12Half) {
TestNxNS12<DeviceType::OPENCL, half>(128, 128);
TestNxNS12<half>(128, 128);
}
TEST_F(DepthwiseConv2dOpTest, OpenCLUnalignedNxNS12) {
TestNxNS12<DeviceType::OPENCL, float>(107, 113);
TestNxNS12<float>(107, 113);
}
TEST_F(DepthwiseConv2dOpTest, OpenCLUnalignedNxNS12Half) {
TestNxNS12<DeviceType::OPENCL, half>(107, 113);
}
namespace {
void TestNEONNxNS12(const index_t height,
const index_t width,
const index_t input_channels,
const index_t multiplier) {
testing::internal::LogToStderr();
auto func = [&](int kernel_h, int kernel_w, int stride_h, int stride_w,
Padding type) {
// generate random input
index_t batch = 1;
// Construct graph
OpsTestNet net;
// Add input data
net.AddRandomInput<CPU, float>("Input",
{batch, height, width, input_channels});
net.AddRandomInput<CPU, float>(
"Filter", {kernel_h, kernel_w, input_channels, multiplier});
net.AddRandomInput<CPU, float>("Bias", {multiplier * input_channels});
OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2DTest")
.Input("Input")
.Input("Filter")
.Input("Bias")
.Output("Output")
.AddIntsArg("strides", {stride_h, stride_w})
.AddIntArg("padding", type)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<float>::value))
.Finalize(net.NewOperatorDef());
// Run on cpu
net.RunOp();
// Check
Tensor expected;
expected.Copy(*net.GetOutput("Output"));
OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2DTest")
.Input("InputNeon")
.Input("FilterNeon")
.Input("Bias")
.Output("OutputNeon")
.AddIntsArg("strides", {stride_h, stride_w})
.AddIntArg("padding", type)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<float>::value))
.Finalize(net.NewOperatorDef());
net.FillNHWCInputToNCHWInput<DeviceType::CPU, float>("InputNeon", "Input");
net.FillHWIOInputToOIHWInput<DeviceType::CPU, float>("FilterNeon",
"Filter");
// Run
net.RunOp(NEON);
net.FillNHWCInputToNCHWInput<DeviceType::CPU, float>("OutputExpected",
"Output");
// Check
ExpectTensorNear<float>(*net.GetOutput("OutputExpected"),
*net.GetOutput("OutputNeon"),
1e-5, 1e-3);
};
for (int kernel_size : {1, 3, 5}) {
for (int stride : {1, 2}) {
if (kernel_size > stride) {
func(kernel_size, kernel_size, stride, stride, VALID);
func(kernel_size, kernel_size, stride, stride, SAME);
}
}
}
}
} // namespace
TEST_F(DepthwiseConv2dOpTest, NEONTest) {
TestNEONNxNS12(4, 4, 32, 1);
TestNEONNxNS12(64, 64, 32, 1);
TestNEONNxNS12(112, 112, 32, 1);
TestNEONNxNS12(128, 128, 15, 1);
TestNEONNxNS12(107, 113, 15, 1);
TestNxNS12<half>(107, 113);
}
} // namespace test
......
......@@ -35,11 +35,6 @@ void Register_Eltwise(OperatorRegistry *op_registry) {
.TypeConstraint<half>("T")
.Build(),
EltwiseOp<DeviceType::OPENCL, half>);
REGISTER_OPERATOR(op_registry, OpKeyBuilder("Eltwise")
.Device(DeviceType::NEON)
.TypeConstraint<float>("T")
.Build(),
EltwiseOp<DeviceType::NEON, float>);
}
} // namespace ops
......
......@@ -35,11 +35,6 @@ void Register_FoldedBatchNorm(OperatorRegistry *op_registry) {
.TypeConstraint<half>("T")
.Build(),
FoldedBatchNormOp<DeviceType::OPENCL, half>);
REGISTER_OPERATOR(op_registry, OpKeyBuilder("FoldedBatchNorm")
.Device(DeviceType::NEON)
.TypeConstraint<float>("T")
.Build(),
FoldedBatchNormOp<DeviceType::NEON, float>);
}
} // namespace ops
......
......@@ -36,7 +36,7 @@ void CalculateScaleOffset(const std::vector<float> &gamma,
}
}
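// CalculateScaleOffset folds the batch-norm statistics into per-channel
// scale/offset (the usual scale = gamma / sqrt(var + epsilon),
// offset = beta - mean * scale), so FoldedBatchNorm at run time reduces to
// x * scale + offset.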
template <DeviceType D>
template<DeviceType D>
void Simple() {
OpsTestNet net;
......@@ -49,7 +49,18 @@ void Simple() {
net.AddInputFromArray<D, float>("Scale", {1}, scale);
net.AddInputFromArray<D, float>("Offset", {1}, offset);
if (D == DeviceType::OPENCL) {
if (D == DeviceType::CPU) {
net.TransformDataFormat<D, float>("Input", NHWC, "InputNCHW", NCHW);
OpDefBuilder("FoldedBatchNorm", "FoldedBatchNormTest")
.Input("InputNCHW")
.Input("Scale")
.Input("Offset")
.Output("OutputNCHW")
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
net.TransformDataFormat<D, float>("OutputNCHW", NCHW, "Output", NHWC);
} else if (D == DeviceType::OPENCL) {
BufferToImage<D, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, float>(&net, "Scale", "ScaleImage",
......@@ -58,33 +69,24 @@ void Simple() {
kernels::BufferType::ARGUMENT);
OpDefBuilder("FoldedBatchNorm", "FoldedBatchNormTest")
.Input("InputImage")
.Input("ScaleImage")
.Input("OffsetImage")
.Output("OutputImage")
.Finalize(net.NewOperatorDef());
.Input("InputImage")
.Input("ScaleImage")
.Input("OffsetImage")
.Output("OutputImage")
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
// Transfer output
ImageToBuffer<D, float>(&net, "OutputImage", "Output",
kernels::BufferType::IN_OUT_CHANNEL);
} else {
OpDefBuilder("FoldedBatchNorm", "FoldedBatchNormTest")
.Input("Input")
.Input("Scale")
.Input("Offset")
.Output("Output")
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
}
// Check
auto expected =
CreateTensor<float>({1, 6, 2, 1}, {-3.8543, -3.8543, -1.5125, -1.5125,
0.8291, 0.8291, 3.1708, 3.1708,
5.5125, 5.5125, 7.8543, 7.8543});
CreateTensor<float>({1, 6, 2, 1}, {-3.8543, -3.8543, -1.5125, -1.5125,
0.8291, 0.8291, 3.1708, 3.1708,
5.5125, 5.5125, 7.8543, 7.8543});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-4);
}
......@@ -92,100 +94,8 @@ void Simple() {
TEST_F(FoldedBatchNormOpTest, SimpleCPU) { Simple<DeviceType::CPU>(); }
/*
TEST_F(FoldedBatchNormOpTest, SimpleNEON) {
Simple<DeviceType::NEON>();
}
*/
TEST_F(FoldedBatchNormOpTest, SimpleOPENCL) { Simple<DeviceType::OPENCL>(); }
/*
TEST_F(FoldedBatchNormOpTest, SimpleRandomNeon) {
srand(time(NULL));
// generate random input
index_t batch = 1 + rand() % 10;
index_t channels = 3 + rand() % 50;
index_t height = 64;
index_t width = 64;
// Construct graph
OpsTestNet net;
OpDefBuilder("FoldedBatchNorm", "FoldedBatchNormTest")
.Input("Input")
.Input("Scale")
.Input("Offset")
.Input("Mean")
.Input("Var")
.Input("Epsilon")
.Output("Output")
.Finalize(net.NewOperatorDef());
// Add input data
net.AddRandomInput<DeviceType::CPU, float>("Input", {batch, channels, height,
width});
net.AddRandomInput<DeviceType::CPU, float>("Scale", {channels});
net.AddRandomInput<DeviceType::CPU, float>("Offset", {channels});
net.AddRandomInput<DeviceType::CPU, float>("Mean", {channels});
net.AddRandomInput<DeviceType::CPU, float>("Var", {channels}, true);
net.AddInputFromArray<DeviceType::CPU, float>("Epsilon", {}, {1e-3});
// run cpu
net.RunOp();
// Check
Tensor expected;
expected.Copy(*net.GetOutput("Output"));
// Run NEON
net.RunOp(DeviceType::NEON);
ExpectTensorNear<float>(expected, *net.GetOutput("Output"), 1e-2);
}
TEST_F(FoldedBatchNormOpTest, ComplexRandomNeon) {
srand(time(NULL));
// generate random input
index_t batch = 1 + rand() % 10;
index_t channels = 3 + rand() % 50;
index_t height = 103;
index_t width = 113;
// Construct graph
OpsTestNet net;
OpDefBuilder("FoldedBatchNorm", "FoldedBatchNormTest")
.Input("Input")
.Input("Scale")
.Input("Offset")
.Input("Mean")
.Input("Var")
.Input("Epsilon")
.Output("Output")
.Finalize(net.NewOperatorDef());
// Add input data
net.AddRandomInput<DeviceType::CPU, float>("Input", {batch, channels, height,
width});
net.AddRandomInput<DeviceType::CPU, float>("Scale", {channels});
net.AddRandomInput<DeviceType::CPU, float>("Offset", {channels});
net.AddRandomInput<DeviceType::CPU, float>("Mean", {channels});
net.AddRandomInput<DeviceType::CPU, float>("Var", {channels}, true);
net.AddInputFromArray<DeviceType::CPU, float>("Epsilon", {}, {1e-3});
// run cpu
net.RunOp();
// Check
Tensor expected;
expected.Copy(*net.GetOutput("Output"));
// Run NEON
net.RunOp(DeviceType::NEON);
ExpectTensorNear<float>(expected, *net.GetOutput("Output"), 1e-2);
}
*/
TEST_F(FoldedBatchNormOpTest, SimpleRandomOPENCL) {
// generate random input
static unsigned int seed = time(NULL);
......@@ -196,22 +106,33 @@ TEST_F(FoldedBatchNormOpTest, SimpleRandomOPENCL) {
// Construct graph
OpsTestNet net;
OpDefBuilder("FoldedBatchNorm", "FoldedBatchNormTest")
.Input("Input")
.Input("Scale")
.Input("Offset")
.Output("Output")
.Finalize(net.NewOperatorDef());
// Add input data
net.AddRandomInput<DeviceType::OPENCL, float>(
"Input", {batch, height, width, channels});
"Input", {batch, height, width, channels});
net.AddRandomInput<DeviceType::OPENCL, float>("Scale", {channels});
net.AddRandomInput<DeviceType::OPENCL, float>("Offset", {channels});
net.TransformDataFormat<DeviceType::CPU, float>("Input",
NHWC,
"InputNCHW",
NCHW);
OpDefBuilder("FoldedBatchNorm", "FoldedBatchNormTest")
.Input("InputNCHW")
.Input("Scale")
.Input("Offset")
.Output("OutputNCHW")
.Finalize(net.NewOperatorDef());
// run cpu
net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW",
NCHW,
"Output",
NHWC);
// Check
Tensor expected;
expected.Copy(*net.GetOutput("Output"));
......@@ -225,11 +146,11 @@ TEST_F(FoldedBatchNormOpTest, SimpleRandomOPENCL) {
kernels::BufferType::ARGUMENT);
OpDefBuilder("FoldedBatchNorm", "FoldedBatchNormTest")
.Input("InputImage")
.Input("ScaleImage")
.Input("OffsetImage")
.Output("OutputImage")
.Finalize(net.NewOperatorDef());
.Input("InputImage")
.Input("ScaleImage")
.Input("OffsetImage")
.Output("OutputImage")
.Finalize(net.NewOperatorDef());
// Run on opencl
net.RunOp(DeviceType::OPENCL);
......@@ -250,22 +171,33 @@ TEST_F(FoldedBatchNormOpTest, SimpleRandomHalfOPENCL) {
// Construct graph
OpsTestNet net;
OpDefBuilder("FoldedBatchNorm", "FoldedBatchNormTest")
.Input("Input")
.Input("Scale")
.Input("Offset")
.Output("Output")
.Finalize(net.NewOperatorDef());
// Add input data
net.AddRandomInput<DeviceType::OPENCL, float>(
"Input", {batch, height, width, channels});
"Input", {batch, height, width, channels});
net.AddRandomInput<DeviceType::OPENCL, float>("Scale", {channels});
net.AddRandomInput<DeviceType::OPENCL, float>("Offset", {channels});
net.TransformDataFormat<DeviceType::CPU, float>("Input",
NHWC,
"InputNCHW",
NCHW);
OpDefBuilder("FoldedBatchNorm", "FoldedBatchNormTest")
.Input("InputNCHW")
.Input("Scale")
.Input("Offset")
.Output("OutputNCHW")
.Finalize(net.NewOperatorDef());
// run cpu
net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW",
NCHW,
"Output",
NHWC);
// Check
Tensor expected;
expected.Copy(*net.GetOutput("Output"));
......@@ -279,12 +211,12 @@ TEST_F(FoldedBatchNormOpTest, SimpleRandomHalfOPENCL) {
kernels::BufferType::ARGUMENT);
OpDefBuilder("FoldedBatchNorm", "FoldedBatchNormTest")
.Input("InputImage")
.Input("ScaleImage")
.Input("OffsetImage")
.Output("OutputImage")
.AddIntArg("T", static_cast<int>(DataType::DT_HALF))
.Finalize(net.NewOperatorDef());
.Input("InputImage")
.Input("ScaleImage")
.Input("OffsetImage")
.Output("OutputImage")
.AddIntArg("T", static_cast<int>(DataType::DT_HALF))
.Finalize(net.NewOperatorDef());
// Run on opencl
net.RunOp(DeviceType::OPENCL);
......@@ -305,22 +237,33 @@ TEST_F(FoldedBatchNormOpTest, ComplexRandomOPENCL) {
// Construct graph
OpsTestNet net;
OpDefBuilder("FoldedBatchNorm", "FoldedBatchNormTest")
.Input("Input")
.Input("Scale")
.Input("Offset")
.Output("Output")
.Finalize(net.NewOperatorDef());
// Add input data
net.AddRandomInput<DeviceType::OPENCL, float>(
"Input", {batch, height, width, channels});
"Input", {batch, height, width, channels});
net.AddRandomInput<DeviceType::OPENCL, float>("Scale", {channels});
net.AddRandomInput<DeviceType::OPENCL, float>("Offset", {channels});
net.TransformDataFormat<DeviceType::CPU, float>("Input",
NHWC,
"InputNCHW",
NCHW);
OpDefBuilder("FoldedBatchNorm", "FoldedBatchNormTest")
.Input("InputNCHW")
.Input("Scale")
.Input("Offset")
.Output("OutputNCHW")
.Finalize(net.NewOperatorDef());
// run cpu
net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW",
NCHW,
"Output",
NHWC);
// Check
Tensor expected;
expected.Copy(*net.GetOutput("Output"));
......@@ -334,11 +277,11 @@ TEST_F(FoldedBatchNormOpTest, ComplexRandomOPENCL) {
kernels::BufferType::ARGUMENT);
OpDefBuilder("FoldedBatchNorm", "FoldedBatchNormTest")
.Input("InputImage")
.Input("ScaleImage")
.Input("OffsetImage")
.Output("OutputImage")
.Finalize(net.NewOperatorDef());
.Input("InputImage")
.Input("ScaleImage")
.Input("OffsetImage")
.Output("OutputImage")
.Finalize(net.NewOperatorDef());
// Run on opencl
net.RunOp(DeviceType::OPENCL);
......@@ -358,22 +301,33 @@ TEST_F(FoldedBatchNormOpTest, ComplexRandomHalfOPENCL) {
// Construct graph
OpsTestNet net;
OpDefBuilder("FoldedBatchNorm", "FoldedBatchNormTest")
.Input("Input")
.Input("Scale")
.Input("Offset")
.Output("Output")
.Finalize(net.NewOperatorDef());
// Add input data
net.AddRandomInput<DeviceType::OPENCL, float>(
"Input", {batch, height, width, channels});
"Input", {batch, height, width, channels});
net.AddRandomInput<DeviceType::OPENCL, float>("Scale", {channels});
net.AddRandomInput<DeviceType::OPENCL, float>("Offset", {channels});
net.TransformDataFormat<DeviceType::CPU, float>("Input",
NHWC,
"InputNCHW",
NCHW);
OpDefBuilder("FoldedBatchNorm", "FoldedBatchNormTest")
.Input("InputNCHW")
.Input("Scale")
.Input("Offset")
.Output("OutputNCHW")
.Finalize(net.NewOperatorDef());
// run cpu
net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW",
NCHW,
"Output",
NHWC);
// Check
Tensor expected;
expected.Copy(*net.GetOutput("Output"));
......@@ -387,12 +341,12 @@ TEST_F(FoldedBatchNormOpTest, ComplexRandomHalfOPENCL) {
kernels::BufferType::ARGUMENT);
OpDefBuilder("FoldedBatchNorm", "FoldedBatchNormTest")
.Input("InputImage")
.Input("ScaleImage")
.Input("OffsetImage")
.Output("OutputImage")
.AddIntArg("T", static_cast<int>(DataType::DT_HALF))
.Finalize(net.NewOperatorDef());
.Input("InputImage")
.Input("ScaleImage")
.Input("OffsetImage")
.Output("OutputImage")
.AddIntArg("T", static_cast<int>(DataType::DT_HALF))
.Finalize(net.NewOperatorDef());
// Run on opencl
net.RunOp(DeviceType::OPENCL);
......
......@@ -35,12 +35,6 @@ void Register_FullyConnected(OperatorRegistry *op_registry) {
.TypeConstraint<half>("T")
.Build(),
FullyConnectedOp<DeviceType::OPENCL, half>);
REGISTER_OPERATOR(op_registry, OpKeyBuilder("FC")
.Device(DeviceType::NEON)
.TypeConstraint<float>("T")
.Build(),
FullyConnectedOp<DeviceType::NEON, float>);
}
} // namespace ops
......
......@@ -23,19 +23,19 @@
namespace mace {
namespace ops {
template <DeviceType D, class T>
template<DeviceType D, class T>
class FullyConnectedOp : public Operator<D, T> {
public:
FullyConnectedOp(const OperatorDef &operator_def, Workspace *ws)
: Operator<D, T>(operator_def, ws),
functor_(static_cast<kernels::BufferType>(
OperatorBase::GetSingleArgument<int>(
"weight_type", static_cast<int>(
kernels::WEIGHT_WIDTH))),
kernels::StringToActivationType(
OperatorBase::GetSingleArgument<std::string>("activation",
"NOOP")),
OperatorBase::GetSingleArgument<float>("max_limit", 0.0f)) {}
: Operator<D, T>(operator_def, ws),
functor_(static_cast<kernels::BufferType>(
OperatorBase::GetSingleArgument<int>(
"weight_type", static_cast<int>(
kernels::WEIGHT_WIDTH))),
kernels::StringToActivationType(
OperatorBase::GetSingleArgument<std::string>("activation",
"NOOP")),
OperatorBase::GetSingleArgument<float>("max_limit", 0.0f)) {}
bool Run(StatsFuture *future) override {
const Tensor *input = this->Input(INPUT);
......@@ -44,8 +44,17 @@ class FullyConnectedOp : public Operator<D, T> {
Tensor *output = this->Output(OUTPUT);
const index_t input_size = input->dim(1) * input->dim(2) * input->dim(3);
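// Weight is expected to be {out_channels, input_size} and Bias
// {out_channels}; report the offending dimensions if they disagree.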
MACE_CHECK(input_size == weight->dim(1) && weight->dim(0) == bias->dim(0))
<< "The size of Input, Weight and Bias don't match.";
MACE_CHECK(input_size == weight->dim(1) && weight->dim(0) == bias->dim(0),
           "The size of Input: ", input_size,
           ", Weight: ", weight->dim(1), ", ", weight->dim(0),
           " and Bias: ", bias->dim(0), " don't match.");
functor_(input, weight, bias, output, future);
return true;
......
......@@ -36,8 +36,14 @@ void FCBenchmark(
{out_channel, height * width * channel});
net.AddRandomInput<D, float>("Bias", {out_channel});
if (D == DeviceType::OPENCL) {
const int width_size = height * width * channel;
if (D == DeviceType::CPU) {
OpDefBuilder("FC", "FullyConnectedTest")
.Input("Input")
.Input("Weight")
.Input("Bias")
.Output("Output")
.Finalize(net.NewOperatorDef());
} else if (D == DeviceType::OPENCL) {
kernels::BufferType weight_type = kernels::BufferType::WEIGHT_WIDTH;
BufferToImage<D, T>(&net, "Weight", "WeightImage",
weight_type);
......@@ -55,12 +61,7 @@ void FCBenchmark(
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
} else {
OpDefBuilder("FC", "FullyConnectedTest")
.Input("Input")
.Input("Weight")
.Input("Bias")
.Output("Output")
.Finalize(net.NewOperatorDef());
MACE_NOT_IMPLEMENTED;
}
// Warm-up
......
......@@ -40,7 +40,18 @@ void Simple(const std::vector<index_t> &input_shape,
net.AddInputFromArray<D, float>("Weight", weight_shape, weight_value);
net.AddInputFromArray<D, float>("Bias", bias_shape, bias_value);
if (D == DeviceType::OPENCL) {
if (D == DeviceType::CPU) {
net.Transpose2D<D, float>("Weight", "WeightTranspose");
OpDefBuilder("FC", "FullyConnectedTest")
.Input("Input")
.Input("Weight")
.Input("Bias")
.Output("OutputNCHW")
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
net.TransformDataFormat<D, float>("OutputNCHW", NCHW, "Output", NHWC);
} else if (D == DeviceType::OPENCL) {
BufferToImage<D, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, float>(&net, "Weight", "WeightImage",
......@@ -62,14 +73,7 @@ void Simple(const std::vector<index_t> &input_shape,
ImageToBuffer<D, float>(&net, "OutputImage", "Output",
kernels::BufferType::IN_OUT_CHANNEL);
} else {
OpDefBuilder("FC", "FullyConnectedTest")
.Input("Input")
.Input("Weight")
.Input("Bias")
.Output("Output")
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
MACE_NOT_IMPLEMENTED;
}
// Check
......@@ -130,12 +134,6 @@ void Complex(const index_t batch,
// Construct graph
OpsTestNet net;
OpDefBuilder("FC", "FullyConnectedTest")
.Input("Input")
.Input("Weight")
.Input("Bias")
.Output("Output")
.Finalize(net.NewOperatorDef());
// Add input data
net.AddRandomInput<DeviceType::OPENCL, float>(
......@@ -144,9 +142,18 @@ void Complex(const index_t batch,
"Weight", {out_channel, height * width * channels});
net.AddRandomInput<DeviceType::OPENCL, float>("Bias", {out_channel});
OpDefBuilder("FC", "FullyConnectedTest")
.Input("Input")
.Input("Weight")
.Input("Bias")
.Output("OutputNCHW")
.Finalize(net.NewOperatorDef());
// run cpu
net.RunOp();
net.TransformDataFormat<CPU, float>("OutputNCHW", NCHW, "Output", NHWC);
// Check
Tensor expected;
expected.Copy(*net.GetOutput("Output"));
......@@ -216,12 +223,6 @@ void TestWXFormat(const index_t batch,
// Construct graph
OpsTestNet net;
OpDefBuilder("FC", "FullyConnectedTest")
.Input("Input")
.Input("Weight")
.Input("Bias")
.Output("Output")
.Finalize(net.NewOperatorDef());
// Add input data
net.AddRandomInput<DeviceType::OPENCL, float>(
......@@ -230,9 +231,18 @@ void TestWXFormat(const index_t batch,
"Weight", {out_channel, height * width * channels});
net.AddRandomInput<DeviceType::OPENCL, float>("Bias", {out_channel});
OpDefBuilder("FC", "FullyConnectedTest")
.Input("Input")
.Input("Weight")
.Input("Bias")
.Output("OutputNCHW")
.Finalize(net.NewOperatorDef());
// run cpu
net.RunOp();
net.TransformDataFormat<CPU, float>("OutputNCHW", NCHW, "Output", NHWC);
// Check
Tensor expected;
expected.Copy(*net.GetOutput("Output"));
......@@ -286,61 +296,6 @@ TEST_F(FullyConnectedOpTest, OPENCLHalfWidthFormatAligned) {
TestWXFormat<half>(1, 16, 32, 32, 32);
}
namespace {
void FullyConnectedTestNEON(const index_t batch,
const index_t height,
const index_t width,
const index_t channels,
const index_t out_channel) {
srand(time(NULL));
// Construct graph
OpsTestNet net;
OpDefBuilder("FC", "FullyConnectedTest")
.Input("Input")
.Input("Weight")
.Input("Bias")
.Output("Output")
.Finalize(net.NewOperatorDef());
// Add input data
net.AddRandomInput<DeviceType::CPU, float>(
"Input", {batch, height, width, channels});
net.AddRandomInput<DeviceType::CPU, float>(
"Weight", {out_channel, height * width * channels});
net.AddRandomInput<DeviceType::CPU, float>("Bias", {out_channel});
// run cpu
net.RunOp();
// Run on neon
OpDefBuilder("FC", "FullyConnectedTest")
.Input("Input")
.Input("Weight")
.Input("Bias")
.Output("OutputNeon")
.Finalize(net.NewOperatorDef());
// Run on device
net.RunOp(DeviceType::NEON);
net.FillNHWCInputToNCHWInput<DeviceType::CPU, float>("OutputExpected",
"Output");
ExpectTensorNear<float>(*net.GetOutput("OutputExpected"),
*net.GetOutput("OutputNeon"),
1e-3, 1e-3);
}
} // namespace
TEST_F(FullyConnectedOpTest, TestNEON) {
FullyConnectedTestNEON(1, 7, 7, 32, 16);
FullyConnectedTestNEON(1, 7, 7, 512, 128);
FullyConnectedTestNEON(1, 1, 1, 2048, 1024);
FullyConnectedTestNEON(3, 1, 1, 16, 8);
FullyConnectedTestNEON(3, 7, 7, 32, 16);
}
} // namespace test
} // namespace ops
} // namespace mace
......@@ -35,11 +35,6 @@ void Register_FusedConv2D(OperatorRegistry *op_registry) {
.TypeConstraint<half>("T")
.Build(),
FusedConv2dOp<DeviceType::OPENCL, half>);
REGISTER_OPERATOR(op_registry, OpKeyBuilder("FusedConv2D")
.Device(DeviceType::NEON)
.TypeConstraint<float>("T")
.Build(),
FusedConv2dOp<DeviceType::NEON, float>);
}
} // namespace ops
......
......@@ -37,7 +37,33 @@ void TestNHWCSimple3x3VALID() {
1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f});
net.AddInputFromArray<D, float>("Bias", {1}, {-0.1f});
if (D == DeviceType::OPENCL) {
if (D == DeviceType::CPU) {
net.TransformDataFormat<DeviceType::CPU, float>("Input",
NHWC,
"InputNCHW",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>("Filter",
HWOI,
"FilterOIHW",
OIHW);
OpDefBuilder("FusedConv2D", "FusedConv2dTest")
.Input("InputNCHW")
.Input("FilterOIHW")
.Input("Bias")
.Output("OutputNCHW")
.AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.AddStringArg("activation", "RELU")
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW",
NCHW,
"Output",
NHWC);
} else if (D == DeviceType::OPENCL) {
BufferToImage<D, T>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, T>(&net, "Filter", "FilterImage",
......@@ -63,19 +89,7 @@ void TestNHWCSimple3x3VALID() {
kernels::BufferType::IN_OUT_CHANNEL);
} else {
OpDefBuilder("FusedConv2D", "FusedConv2dTest")
.Input("Input")
.Input("Filter")
.Input("Bias")
.Output("Output")
.AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.AddStringArg("activation", "RELU")
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
MACE_NOT_IMPLEMENTED;
}
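// With VALID padding the 3x3 kernel produces a single output; the fused
// RELU clamps the non-positive pre-activation value to the expected 0.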
auto expected = CreateTensor<float>({1, 1, 1, 1}, {0.0f});
......@@ -96,7 +110,33 @@ void TestNHWCSimple3x3SAME() {
1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f});
net.AddInputFromArray<D, float>("Bias", {1}, {-0.1f});
if (D == DeviceType::OPENCL) {
if (D == DeviceType::CPU) {
net.TransformDataFormat<DeviceType::CPU, float>("Input",
NHWC,
"InputNCHW",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>("Filter",
HWOI,
"FilterOIHW",
OIHW);
OpDefBuilder("FusedConv2D", "FusedConv2dTest")
.Input("InputNCHW")
.Input("FilterOIHW")
.Input("Bias")
.Output("OutputNCHW")
.AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::SAME)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.AddStringArg("activation", "RELU")
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW",
NCHW,
"Output",
NHWC);
} else if (D == DeviceType::OPENCL) {
BufferToImage<D, T>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, T>(&net, "Filter", "FilterImage",
......@@ -122,19 +162,7 @@ void TestNHWCSimple3x3SAME() {
kernels::BufferType::IN_OUT_CHANNEL);
} else {
OpDefBuilder("FusedConv2D", "FusedConv2dTest")
.Input("Input")
.Input("Filter")
.Input("Bias")
.Output("Output")
.AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::SAME)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.AddStringArg("activation", "RELU")
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
MACE_NOT_IMPLEMENTED;
}
auto expected = CreateTensor<float>(
......@@ -168,7 +196,33 @@ void TestNHWCSimple3x3WithoutBias() {
{1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f});
if (D == DeviceType::OPENCL) {
if (D == DeviceType::CPU) {
net.TransformDataFormat<DeviceType::CPU, float>("Input",
NHWC,
"InputNCHW",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>("Filter",
HWOI,
"FilterOIHW",
OIHW);
OpDefBuilder("FusedConv2D", "FusedConv2dTest")
.Input("InputNCHW")
.Input("FilterOIHW")
.Output("OutputNCHW")
.AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.AddStringArg("activation", "RELU")
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW",
NCHW,
"Output",
NHWC);
} else if (D == DeviceType::OPENCL) {
BufferToImage<D, T>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, T>(&net, "Filter", "FilterImage",
......@@ -190,19 +244,7 @@ void TestNHWCSimple3x3WithoutBias() {
ImageToBuffer<D, float>(&net, "OutputImage", "Output",
kernels::BufferType::IN_OUT_CHANNEL);
} else {
OpDefBuilder("FusedConv2D", "FusedConv2dTest")
.Input("Input")
.Input("Filter")
.Output("Output")
.AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.AddStringArg("activation", "RELU")
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
MACE_NOT_IMPLEMENTED;
}
// Check
......@@ -241,7 +283,31 @@ void TestConv1x1() {
{1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f});
net.AddInputFromArray<D, float>("Bias", {2}, {0.1f, 0.2f});
if (D == DeviceType::OPENCL) {
if (D == DeviceType::CPU) {
net.TransformDataFormat<DeviceType::CPU, float>("Input",
NHWC,
"InputNCHW",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>("Filter",
HWOI,
"FilterOIHW",
OIHW);
OpDefBuilder("FusedConv2D", "FusedConv2dTest")
.Input("InputNCHW")
.Input("FilterOIHW")
.Input("Bias")
.Output("OutputNCHW")
.AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1})
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW",
NCHW,
"Output",
NHWC);
} else if (D == DeviceType::OPENCL) {
BufferToImage<D, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
BufferToImage<D, float>(&net, "Filter", "FilterImage",
......@@ -264,17 +330,7 @@ void TestConv1x1() {
ImageToBuffer<D, float>(&net, "OutputImage", "Output",
kernels::BufferType::IN_OUT_CHANNEL);
} else {
OpDefBuilder("FusedConv2D", "FusedConv2dTest")
.Input("Input")
.Input("Filter")
.Input("Bias")
.Output("Output")
.AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::VALID)
.AddIntsArg("dilations", {1, 1})
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
MACE_NOT_IMPLEMENTED;
}
// Check
......@@ -308,27 +364,43 @@ void TestComplexConvNxNS12(const std::vector<index_t> &shape) {
index_t width = shape[1];
index_t input_channels = shape[2] + (rand_r(&seed) % 10);
index_t output_channels = shape[3] + (rand_r(&seed) % 10);
// Construct graph
OpsTestNet net;
// Add input data
net.AddRandomInput<D, T>("Input", {batch, height, width, input_channels});
net.AddRandomInput<D, T>(
"Filter", {kernel_h, kernel_w, output_channels, input_channels});
net.AddRandomInput<D, T>("Bias", {output_channels});
net.TransformDataFormat<DeviceType::CPU, float>("Input",
NHWC,
"InputNCHW",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>("Filter",
HWOI,
"FilterOIHW",
OIHW);
// Construct graph
OpDefBuilder("FusedConv2D", "FusedConv2dTest")
.Input("Input")
.Input("Filter")
.Input("InputNCHW")
.Input("FilterOIHW")
.Input("Bias")
.Output("Output")
.Output("OutputNCHW")
.AddIntsArg("strides", {stride_h, stride_w})
.AddIntArg("padding", type)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
// Add input data
net.AddRandomInput<D, T>("Input", {batch, height, width, input_channels});
net.AddRandomInput<D, T>(
"Filter", {kernel_h, kernel_w, output_channels, input_channels});
net.AddRandomInput<D, T>("Bias", {output_channels});
// run on cpu
net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW",
NCHW,
"Output",
NHWC);
// Check
Tensor expected;
expected.Copy(*net.GetOutput("Output"));
......@@ -386,17 +458,8 @@ void TestHalfComplexConvNxNS12(const std::vector<index_t> &shape,
index_t width = shape[1];
index_t input_channels = shape[2];
index_t output_channels = shape[3];
// Construct graph
OpsTestNet net;
OpDefBuilder("FusedConv2D", "FusedConv2dTest")
.Input("Input")
.Input("Filter")
.Input("Bias")
.Output("Output")
.AddIntsArg("strides", {stride, stride})
.AddIntArg("padding", type)
.AddIntsArg("dilations", {1, 1})
.Finalize(net.NewOperatorDef());
std::vector<float> float_input_data;
GenerateRandomRealTypeData({batch, height, width, input_channels},
......@@ -415,8 +478,33 @@ void TestHalfComplexConvNxNS12(const std::vector<index_t> &shape,
float_filter_data);
net.AddInputFromArray<D, float>("Bias", {output_channels}, float_bias_data);
net.TransformDataFormat<DeviceType::CPU, float>("Input",
NHWC,
"InputNCHW",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>("Filter",
HWOI,
"FilterOIHW",
OIHW);
// Construct graph
OpDefBuilder("FusedConv2D", "FusedConv2dTest")
.Input("InputNCHW")
.Input("FilterOIHW")
.Input("Bias")
.Output("OutputNCHW")
.AddIntsArg("strides", {stride, stride})
.AddIntArg("padding", type)
.AddIntsArg("dilations", {1, 1})
.Finalize(net.NewOperatorDef());
// run on cpu
net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW",
NCHW,
"Output",
NHWC);
// Check
Tensor expected;
expected.Copy(*net.GetOutput("Output"));
......@@ -479,27 +567,42 @@ void TestGeneralConvNxNS12(const std::vector<index_t> &image_shape,
index_t kernel_w = filter_shape[1];
index_t output_channels = filter_shape[2];
index_t input_channels = filter_shape[3];
// Construct graph
OpsTestNet net;
// Add input data
net.AddRandomInput<D, T>("Input", {batch, height, width, input_channels});
net.AddRandomInput<D, T>(
"Filter", {kernel_h, kernel_w, output_channels, input_channels});
net.AddRandomInput<D, T>("Bias", {output_channels});
net.TransformDataFormat<DeviceType::CPU, float>("Input",
NHWC,
"InputNCHW",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>("Filter",
HWOI,
"FilterOIHW",
OIHW);
// Construct graph
OpDefBuilder("FusedConv2D", "FusedConv2dTest")
.Input("Input")
.Input("Filter")
.Input("InputNCHW")
.Input("FilterOIHW")
.Input("Bias")
.Output("Output")
.Output("OutputNCHW")
.AddIntsArg("strides", {stride_h, stride_w})
.AddIntArg("padding", type)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
// Add input data
net.AddRandomInput<D, T>("Input", {batch, height, width, input_channels});
net.AddRandomInput<D, T>(
"Filter", {kernel_h, kernel_w, output_channels, input_channels});
net.AddRandomInput<D, T>("Bias", {output_channels});
// run on cpu
net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW",
NCHW,
"Output",
NHWC);
// Check
Tensor expected;
expected.Copy(*net.GetOutput("Output"));
......@@ -561,27 +664,44 @@ void TestAtrousConvNxN(const std::vector<index_t> &shape,
index_t width = shape[1];
index_t output_channels = shape[2];
index_t input_channels = shape[3];
// Construct graph
OpsTestNet net;
// Add input data
net.AddRandomInput<D, T>("Input", {batch, height, width, input_channels});
net.AddRandomInput<D, T>(
"Filter", {kernel_h, kernel_w, output_channels, input_channels});
net.AddRandomInput<D, T>("Bias", {output_channels});
net.TransformDataFormat<DeviceType::CPU, float>("Input",
NHWC,
"InputNCHW",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>("Filter",
HWOI,
"FilterOIHW",
OIHW);
// Construct graph
OpDefBuilder("FusedConv2D", "FusedConv2dTest")
.Input("Input")
.Input("Filter")
.Input("InputNCHW")
.Input("FilterOIHW")
.Input("Bias")
.Output("Output")
.Output("OutputNCHW")
.AddIntsArg("strides", {stride_h, stride_w})
.AddIntArg("padding", type)
.AddIntsArg("dilations", {dilation, dilation})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
// Add input data
net.AddRandomInput<D, T>("Input", {batch, height, width, input_channels});
net.AddRandomInput<D, T>(
"Filter", {kernel_h, kernel_w, output_channels, input_channels});
net.AddRandomInput<D, T>("Bias", {output_channels});
// run on cpu
net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW",
NCHW,
"Output",
NHWC);
// Check
Tensor expected;
expected.Copy(*net.GetOutput("Output"));
......@@ -651,17 +771,8 @@ void TestGeneralHalfAtrousConv(const std::vector<index_t> &image_shape,
index_t kernel_w = filter_shape[1];
index_t output_channels = filter_shape[2];
index_t input_channels = filter_shape[3];
// Construct graph
OpsTestNet net;
OpDefBuilder("FusedConv2D", "FusedConv2dTest")
.Input("Input")
.Input("Filter")
.Input("Bias")
.Output("Output")
.AddIntsArg("strides", {stride_h, stride_w})
.AddIntArg("padding", type)
.AddIntsArg("dilations", {1, 1})
.Finalize(net.NewOperatorDef());
// Add input data
net.AddRandomInput<D, float>("Input",
......@@ -670,8 +781,33 @@ void TestGeneralHalfAtrousConv(const std::vector<index_t> &image_shape,
"Filter", {kernel_h, kernel_w, output_channels, input_channels});
net.AddRandomInput<D, float>("Bias", {output_channels});
net.TransformDataFormat<DeviceType::CPU, float>("Input",
NHWC,
"InputNCHW",
NCHW);
net.TransformDataFormat<DeviceType::CPU, float>("Filter",
HWOI,
"FilterOIHW",
OIHW);
// Construct graph
OpDefBuilder("FusedConv2D", "FusedConv2dTest")
.Input("InputNCHW")
.Input("FilterOIHW")
.Input("Bias")
.Output("OutputNCHW")
.AddIntsArg("strides", {stride_h, stride_w})
.AddIntArg("padding", type)
.AddIntsArg("dilations", {1, 1})
.Finalize(net.NewOperatorDef());
// run on cpu
net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW",
NCHW,
"Output",
NHWC);
// Check
Tensor expected;
expected.Copy(*net.GetOutput("Output"));
......@@ -718,81 +854,6 @@ TEST_F(FusedConv2dOpTest, OPENCL15X15AtrousConvD4) {
{2, 2});
}
namespace {
void TestNEONGeneralConvNxNS12(
const std::vector<index_t> &image_shape,
const std::vector<index_t> &filter_shape) {
testing::internal::LogToStderr();
auto func = [&](int stride_h, int stride_w, Padding type) {
srand(time(NULL));
// generate random input
index_t batch = 1;
index_t height = image_shape[0];
index_t width = image_shape[1];
index_t kernel_h = filter_shape[0];
index_t kernel_w = filter_shape[1];
index_t output_channels = filter_shape[2];
index_t input_channels = filter_shape[3];
// Construct graph
OpsTestNet net;
OpDefBuilder("FusedConv2D", "FusedConv2dTest")
.Input("Input")
.Input("Filter")
.Input("Bias")
.Output("Output")
.AddIntsArg("strides", {stride_h, stride_w})
.AddIntArg("padding", type)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<float>::value))
.Finalize(net.NewOperatorDef());
// Add input data
net.AddRandomInput<CPU, float>("Input",
{batch, height, width, input_channels});
net.AddRandomInput<CPU, float>(
"Filter", {kernel_h, kernel_w, output_channels, input_channels});
net.AddRandomInput<CPU, float>("Bias", {output_channels});
// run on cpu
net.RunOp();
OpDefBuilder("FusedConv2D", "FusedConv2dTest")
.Input("InputNeon")
.Input("FilterNeon")
.Input("Bias")
.Output("OutputNeon")
.AddIntsArg("strides", {stride_h, stride_w})
.AddIntArg("padding", type)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<float>::value))
.Finalize(net.NewOperatorDef());
net.FillNHWCInputToNCHWInput<DeviceType::CPU, float>("InputNeon", "Input");
net.FillHWOIInputToOIHWInput<DeviceType::CPU, float>("FilterNeon",
"Filter");
// Run on device
net.RunOp(DeviceType::NEON);
net.FillNHWCInputToNCHWInput<DeviceType::CPU, float>("OutputExpected",
"Output");
ExpectTensorNear<float>(*net.GetOutput("OutputExpected"),
*net.GetOutput("OutputNeon"),
1e-5, 1e-4);
};
for (int stride : {1, 2}) {
func(stride, stride, VALID);
func(stride, stride, SAME);
}
}
} // namespace
TEST_F(FusedConv2dOpTest, NEONTest) {
TestNEONGeneralConvNxNS12({32, 32}, {7, 7, 64, 3});
}
} // namespace test
} // namespace ops
} // namespace mace
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/ops/global_avg_pooling.h"
namespace mace {
namespace ops {
void Register_GlobalAvgPooling(OperatorRegistry *op_registry) {
REGISTER_OPERATOR(op_registry, OpKeyBuilder("GlobalAvgPooling")
.Device(DeviceType::CPU)
.TypeConstraint<float>("T")
.Build(),
GlobalAvgPoolingOp<DeviceType::CPU, float>);
}
} // namespace ops
} // namespace mace
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_OPS_GLOBAL_AVG_POOLING_H_
#define MACE_OPS_GLOBAL_AVG_POOLING_H_
#include <vector>
#include "mace/core/operator.h"
#include "mace/kernels/global_avg_pooling.h"
namespace mace {
namespace ops {
template <DeviceType D, class T>
class GlobalAvgPoolingOp : public Operator<D, T> {
public:
GlobalAvgPoolingOp(const OperatorDef &operator_def, Workspace *ws)
: Operator<D, T>(operator_def, ws) {}
bool Run(StatsFuture *future) override {
const Tensor *input = this->Input(INPUT);
Tensor *output = this->Output(OUTPUT);
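// The input is laid out NCHW; global average pooling collapses each H x W
// feature map to one value, so the output shape is {N, C, 1, 1}.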
std::vector<index_t> output_shape(4);
output_shape[0] = input->shape()[0];
output_shape[1] = input->shape()[1];
output_shape[2] = output_shape[3] = 1;
output->Resize(output_shape);
auto pooling_func = kernels::GlobalAvgPoolingFunctor<D, T>();
pooling_func(input->data<float>(), input->shape().data(),
output->mutable_data<float>(), future);
return true;
}
protected:
OP_INPUT_TAGS(INPUT);
OP_OUTPUT_TAGS(OUTPUT);
};
} // namespace ops
} // namespace mace
#endif // MACE_OPS_GLOBAL_AVG_POOLING_H_
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/kernels/global_avg_pooling.h"
#include "mace/core/operator.h"
#include "mace/core/testing/test_benchmark.h"
#include "mace/ops/ops_test_util.h"
namespace mace {
namespace ops {
namespace test {
namespace {
template <DeviceType D>
void GlobalAvgPooling(
int iters, int batch, int channels, int height, int width) {
mace::testing::StopTiming();
OpsTestNet net;
OpDefBuilder("GlobalAvgPooling", "GlobalAvgPoolingTest")
.Input("Input")
.Output("Output")
.Finalize(net.NewOperatorDef());
// Add input data
net.AddRandomInput<DeviceType::CPU, float>("Input",
{batch, channels, height, width});
// Warm-up
for (int i = 0; i < 5; ++i) {
net.RunOp(D);
}
mace::testing::StartTiming();
while (iters--) {
net.RunOp(D);
}
}
} // namespace
#define BM_GLOBAL_AVG_POOLING_MACRO(N, C, H, W, DEVICE) \
static void BM_GLOBAL_AVG_POOLING_##N##_##C##_##H##_##W##_##DEVICE( \
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
mace::testing::MaccProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(float))); \
GlobalAvgPooling<DEVICE>(iters, N, C, H, W); \
} \
BENCHMARK(BM_GLOBAL_AVG_POOLING_##N##_##C##_##H##_##W##_##DEVICE)
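// tot counts input elements per run (iters * N * C * H * W); the testing
// helpers report it as MACs processed and as bytes (sizeof(float) each).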
#define BM_GLOBAL_AVG_POOLING(N, C, H, W) \
BM_GLOBAL_AVG_POOLING_MACRO(N, C, H, W, CPU);
// BM_GLOBAL_AVG_POOLING_MACRO(N, C, H, W, NEON);
BM_GLOBAL_AVG_POOLING(1, 3, 7, 7);
BM_GLOBAL_AVG_POOLING(1, 3, 64, 64);
BM_GLOBAL_AVG_POOLING(1, 3, 256, 256);
} // namespace test
} // namespace ops
} // namespace mace
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mace/core/operator.h"
#include "mace/ops/ops_test_util.h"
namespace mace {
namespace ops {
namespace test {
class GlobalAvgPoolingOpTest : public OpsTestBase {};
TEST_F(GlobalAvgPoolingOpTest, 3x7x7_CPU) {
// Construct graph
OpsTestNet net;
OpDefBuilder("GlobalAvgPooling", "GlobalAvgPoolingTest")
.Input("Input")
.Output("Output")
.Finalize(net.NewOperatorDef());
// Add input data
std::vector<float> input(147);
for (int i = 0; i < 147; ++i) {
input[i] = i / 49 + 1;
}
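// Each 7x7 channel (49 elements) is filled with the constant 1, 2 or 3, so
// the per-channel global average below is exactly {1, 2, 3}.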
net.AddInputFromArray<DeviceType::CPU, float>("Input", {1, 3, 7, 7}, input);
// Run
net.RunOp();
// Check
auto expected = CreateTensor<float>({1, 3, 1, 1}, {1, 2, 3});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
}
} // namespace test
} // namespace ops
} // namespace mace
......@@ -13,12 +13,6 @@ void Register_LocalResponseNorm(OperatorRegistry *op_registry) {
.TypeConstraint<float>("T")
.Build(),
LocalResponseNormOp<DeviceType::CPU, float>);
REGISTER_OPERATOR(op_registry, OpKeyBuilder("LocalResponseNorm")
.Device(DeviceType::NEON)
.TypeConstraint<float>("T")
.Build(),
LocalResponseNormOp<DeviceType::NEON, float>);
}
} // namespace ops
......
......@@ -18,7 +18,9 @@ static void LocalResponseNorm(
OpsTestNet net;
// Add input data
net.AddRandomInput<D, T>("Input", {batch, height, width, channels});
if (D == DeviceType::CPU) {
net.AddRandomInput<D, T>("Input", {batch, channels, height, width});
}
OpDefBuilder("LocalResponseNorm", "LocalResponseNormBM")
.Input("Input")
......@@ -54,8 +56,7 @@ static void LocalResponseNorm(
BENCHMARK(BM_LOCAL_RESPONSE_NORM_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE)
#define BM_LOCAL_RESPONSE_NORM(N, C, H, W) \
BM_LOCAL_RESPONSE_NORM_MACRO(N, C, H, W, float, CPU); \
BM_LOCAL_RESPONSE_NORM_MACRO(N, C, H, W, float, NEON);
BM_LOCAL_RESPONSE_NORM_MACRO(N, C, H, W, float, CPU);
BM_LOCAL_RESPONSE_NORM(1, 1, 512, 512);
BM_LOCAL_RESPONSE_NORM(1, 3, 128, 128);
......
......@@ -19,16 +19,21 @@ void Simple() {
net.AddInputFromArray<D, float>("Input", {1, 1, 2, 6},
{5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15});
OpDefBuilder("LocalResponseNorm", "LocalResponseNormTest")
.Input("Input")
if (D == DeviceType::CPU) {
net.TransformDataFormat<D, float>("Input", NHWC, "InputNCHW", NCHW);
OpDefBuilder("LocalResponseNorm", "LocalResponseNormTest")
.Input("InputNCHW")
.AddIntArg("depth_radius", 5)
.AddFloatArg("bias", 1.0f)
.AddFloatArg("alpha", 1.0f)
.AddFloatArg("beta", 0.5f)
.Output("Output")
.Output("OutputNCHW")
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
// Run
net.RunOp(D);
net.TransformDataFormat<D, float>("OutputNCHW", NCHW, "Output", NHWC);
}
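// Assuming the TensorFlow LRN definition, with depth_radius 5 the window
// covers all six channels, and with bias/alpha 1 and beta 0.5 each output
// is input / sqrt(1 + sum of squared inputs across channels).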
// Check
auto expected =
......@@ -40,57 +45,6 @@ void Simple() {
TEST_F(LocalResponseNormOpTest, SimpleCPU) { Simple<DeviceType::CPU>(); }
TEST_F(LocalResponseNormOpTest, NEONTest) {
srand(time(NULL));
unsigned int seed;
// generate random input
index_t batch = 1 + rand_r(&seed) % 10;
index_t channels = 3 + rand_r(&seed) % 50;
index_t height = 64;
index_t width = 64;
// Construct graph
OpsTestNet net;
OpDefBuilder("LocalResponseNorm", "LocalResponseNormTest")
.Input("Input")
.AddIntArg("depth_radius", 5)
.AddFloatArg("bias", 1.0f)
.AddFloatArg("alpha", 1.0f)
.AddFloatArg("beta", 0.5f)
.Output("Output")
.Finalize(net.NewOperatorDef());
// Add input data
net.AddRandomInput<DeviceType::CPU, float>(
"Input", {batch, height, width, channels});
// run cpu
net.RunOp();
OpDefBuilder("LocalResponseNorm", "LocalResponseNormTest")
.Input("InputNeon")
.AddIntArg("depth_radius", 5)
.AddFloatArg("bias", 1.0f)
.AddFloatArg("alpha", 1.0f)
.AddFloatArg("beta", 0.5f)
.Output("OutputNeon")
.Finalize(net.NewOperatorDef());
net.FillNHWCInputToNCHWInput<DeviceType::CPU, float>("InputNeon", "Input");
// Run on neon
net.RunOp(DeviceType::NEON);
net.Sync();
net.FillNHWCInputToNCHWInput<DeviceType::CPU, float>("OutputExpected",
"Output");
ExpectTensorNear<float>(*net.GetOutput("OutputExpected"),
*net.GetOutput("OutputNeon"),
0, 0.001);
}
} // namespace test
} // namespace ops
} // namespace mace
......@@ -35,6 +35,8 @@ namespace mace {
namespace ops {
namespace test {
enum DataFormat { NHWC = 0, NCHW = 1, HWOI = 2, OIHW = 3, HWIO = 4 };
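// N = batch, H = height, W = width, C = channels; for filter layouts, O and
// I are the output and input channel counts (HWIO is the TensorFlow filter
// layout, OIHW the layout the CPU kernels consume).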
class OpDefBuilder {
public:
OpDefBuilder(const char *type, const std::string &name) {
......@@ -173,78 +175,161 @@ class OpsTestNet {
}
template<DeviceType D, typename T>
void FillNHWCInputToNCHWInput(const std::string &name_nchw,
const std::string &name_nhwc) {
Tensor *input = ws_.GetTensor(name_nhwc);
Tensor *output = ws_.CreateTensor(name_nchw,
void Transpose2D(const std::string &src_name,
const std::string &dst_name) {
Tensor *input = ws_.GetTensor(src_name);
Tensor *output = ws_.CreateTensor(dst_name,
GetDeviceAllocator(D),
DataTypeToEnum<T>::v());
const std::vector<index_t> input_shape = input->shape();
index_t batch = input_shape[0];
index_t height = input_shape[1];
index_t width = input_shape[2];
index_t channels = input_shape[3];
output->Resize({batch, channels, height, width});
MACE_CHECK(input_shape.size() == 2, "Transpose2D expects a 2-D tensor");
output->Resize({input_shape[1], input_shape[0]});
Tensor::MappingGuard input_guard(input);
Tensor::MappingGuard output_guard(output);
const T *input_data = input->data<T>();
T *output_data = output->mutable_data<T>();
for (index_t b = 0; b < batch; ++b) {
for (index_t c = 0; c < channels; ++c) {
for (index_t h = 0; h < height; ++h) {
for (index_t w = 0; w < width; ++w) {
output_data[((b * channels + c) * height + h) * width + w] =
input_data[((b * height + h) * width + w) * channels + c];
}
}
for (index_t i = 0; i < input_shape[0]; ++i) {
for (index_t j = 0; j < input_shape[1]; ++j) {
output_data[j * input_shape[0] + i] =
input_data[i * input_shape[1] + j];
}
}
}
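// Example: the {2, 3} tensor {1, 2, 3, 4, 5, 6} comes out as the {3, 2}
// tensor {1, 4, 2, 5, 3, 6}; the FC test uses this to transpose the weight
// matrix for the CPU path.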
template<DeviceType D, typename T>
void FillHWOIInputToOIHWInput(const std::string &name_oihw,
const std::string &name_hwoi) {
Tensor *input = ws_.GetTensor(name_hwoi);
Tensor *output = ws_.CreateTensor(name_oihw,
void TransformDataFormat(const std::string &src_name,
const DataFormat src_format,
const std::string &dst_name,
const DataFormat dst_format) {
Tensor *input = ws_.GetTensor(src_name);
Tensor *output = ws_.CreateTensor(dst_name,
GetDeviceAllocator(D),
DataTypeToEnum<T>::v());
const std::vector<index_t> input_shape = input->shape();
index_t height = input_shape[0];
index_t width = input_shape[1];
index_t out_channels = input_shape[2];
index_t in_channels = input_shape[3];
index_t hw = height * width;
index_t oi = out_channels * in_channels;
output->Resize({out_channels, in_channels, height, width});
const T *input_data = input->data<T>();
T *output_data = output->mutable_data<T>();
for (index_t i = 0; i < oi; ++i) {
for (index_t j = 0; j < hw; ++j) {
output_data[i * height * width + j] =
input_data[j * out_channels * in_channels + i];
MACE_CHECK(input_shape.size() == 4, "TransformDataFormat expects a 4-D tensor");
if (src_format == NHWC && dst_format == NCHW) {
index_t batch = input_shape[0];
index_t height = input_shape[1];
index_t width = input_shape[2];
index_t channels = input_shape[3];
output->Resize({batch, channels, height, width});
Tensor::MappingGuard input_guard(input);
Tensor::MappingGuard output_guard(output);
const T *input_data = input->data<T>();
T *output_data = output->mutable_data<T>();
for (index_t b = 0; b < batch; ++b) {
for (index_t c = 0; c < channels; ++c) {
for (index_t h = 0; h < height; ++h) {
for (index_t w = 0; w < width; ++w) {
output_data[((b * channels + c) * height + h) * width + w] =
input_data[((b * height + h) * width + w) * channels + c];
}
}
}
}
} else if (src_format == NCHW && dst_format == NHWC) {
index_t batch = input_shape[0];
index_t channels = input_shape[1];
index_t height = input_shape[2];
index_t width = input_shape[3];
output->Resize({batch, height, width, channels});
Tensor::MappingGuard input_guard(input);
Tensor::MappingGuard output_guard(output);
const T *input_data = input->data<T>();
T *output_data = output->mutable_data<T>();
for (index_t b = 0; b < batch; ++b) {
for (index_t h = 0; h < height; ++h) {
for (index_t w = 0; w < width; ++w) {
for (index_t c = 0; c < channels; ++c) {
output_data[((b * height + h) * width + w) * channels + c] =
input_data[((b * channels + c) * height + h) * width + w];
}
}
}
}
} else if (src_format == HWOI && dst_format == OIHW) {
index_t height = input_shape[0];
index_t width = input_shape[1];
index_t out_channels = input_shape[2];
index_t in_channels = input_shape[3];
index_t hw = height * width;
index_t oi = out_channels * in_channels;
output->Resize({out_channels, in_channels, height, width});
Tensor::MappingGuard input_guard(input);
Tensor::MappingGuard output_guard(output);
const T *input_data = input->data<T>();
T *output_data = output->mutable_data<T>();
for (index_t i = 0; i < oi; ++i) {
for (index_t j = 0; j < hw; ++j) {
output_data[i * height * width + j] =
input_data[j * out_channels * in_channels + i];
}
}
} else if (src_format == OIHW && dst_format == HWOI) {
index_t out_channels = input_shape[0];
index_t in_channels = input_shape[1];
index_t height = input_shape[2];
index_t width = input_shape[3];
index_t hw = height * width;
index_t oi = out_channels * in_channels;
output->Resize({height, width, out_channels, in_channels});
Tensor::MappingGuard input_guard(input);
Tensor::MappingGuard output_guard(output);
const T *input_data = input->data<T>();
T *output_data = output->mutable_data<T>();
for (index_t i = 0; i < hw; ++i) {
for (index_t j = 0; j < oi; ++j) {
output_data[i * out_channels * in_channels + j] =
input_data[j * height * width + i];
}
}
} else if (src_format == HWIO && dst_format == OIHW) {
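// HWIO -> OIHW: with k = h * width + w, src index ((k * I) + i) * O + o
// maps to dst index ((o * I + i) * height * width) + k.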
index_t height = input_shape[0];
index_t width = input_shape[1];
index_t in_channels = input_shape[2];
index_t out_channels = input_shape[3];
index_t hw = height * width;
output->Resize({out_channels, in_channels, height, width});
Tensor::MappingGuard input_guard(input);
Tensor::MappingGuard output_guard(output);
const T *input_data = input->data<T>();
T *output_data = output->mutable_data<T>();
for (index_t m = 0; m < out_channels; ++m) {
for (index_t c = 0; c < in_channels; ++c) {
for (index_t k = 0; k < hw; ++k) {
output_data[((m * in_channels) + c) * height * width + k] =
input_data[k * out_channels * in_channels + c * out_channels + m];
}
}
}
} else {
MACE_NOT_IMPLEMENTED;
}
}
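// Usage sketch (the pattern used throughout these tests): convert an NHWC
// test input to the NCHW layout the CPU kernels expect, run, then convert
// the output back before comparing:
//   net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC,
//                                                   "InputNCHW", NCHW);
//   net.RunOp();
//   net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW", NCHW,
//                                                   "Output", NHWC);
// The NHWC -> NCHW element mapping is
//   dst[((b * C + c) * H + h) * W + w] = src[((b * H + h) * W + w) * C + c].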
template<DeviceType D, typename T>
void FillHWIOInputToOIHWInput(const std::string &name_oihw,
const std::string &name_hwio) {
Tensor *input = ws_.GetTensor(name_hwio);
Tensor *output = ws_.CreateTensor(name_oihw,
void FillNHWCInputToNCHWInput(const std::string &name_nchw,
const std::string &name_nhwc) {
Tensor *input = ws_.GetTensor(name_nhwc);
Tensor *output = ws_.CreateTensor(name_nchw,
GetDeviceAllocator(D),
DataTypeToEnum<T>::v());
const std::vector<index_t> input_shape = input->shape();
index_t height = input_shape[0];
index_t width = input_shape[1];
index_t in_channels = input_shape[2];
index_t out_channels = input_shape[3];
index_t hw = height * width;
output->Resize({out_channels, in_channels, height, width});
index_t batch = input_shape[0];
index_t height = input_shape[1];
index_t width = input_shape[2];
index_t channels = input_shape[3];
output->Resize({batch, channels, height, width});
const T *input_data = input->data<T>();
T *output_data = output->mutable_data<T>();
for (index_t m = 0; m < out_channels; ++m) {
for (index_t c = 0; c < in_channels; ++c) {
for (index_t k = 0; k < hw; ++k) {
output_data[((m * in_channels) + c) * height * width + k] =
input_data[k * out_channels * in_channels + c * out_channels + m];
for (index_t b = 0; b < batch; ++b) {
for (index_t c = 0; c < channels; ++c) {
for (index_t h = 0; h < height; ++h) {
for (index_t w = 0; w < width; ++w) {
output_data[((b * channels + c) * height + h) * width + w] =
input_data[((b * height + h) * width + w) * channels + c];
}
}
}
}
......@@ -349,7 +434,7 @@ void GenerateRandomRealTypeData(const std::vector<index_t> &shape,
std::generate(res->begin(), res->end(),
[&gen, &nd, positive] {
return half_float::half_cast<half>(
positive ? std::abs(nd(gen)) : nd(gen));
positive ? std::abs(nd(gen)) : nd(gen));
});
} else {
std::generate(res->begin(), res->end(), [&gen, &nd, positive] {
......@@ -528,7 +613,6 @@ struct Expector<EXP_TYPE, RES_TYPE, false> {
}
};
template<typename T>
void ExpectTensorNear(const Tensor &x, const Tensor &y,
const double rel_err = 1e-5,
......
......@@ -23,12 +23,6 @@ void Register_Pooling(OperatorRegistry *op_registry) {
.TypeConstraint<float>("T")
.Build(),
PoolingOp<DeviceType::CPU, float>);
REGISTER_OPERATOR(op_registry, OpKeyBuilder("Pooling")
.Device(DeviceType::CPU)
.TypeConstraint<half>("T")
.Build(),
PoolingOp<DeviceType::CPU, half>);
REGISTER_OPERATOR(op_registry, OpKeyBuilder("Pooling")
.Device(DeviceType::OPENCL)
.TypeConstraint<float>("T")
......@@ -39,11 +33,6 @@ void Register_Pooling(OperatorRegistry *op_registry) {
.TypeConstraint<half>("T")
.Build(),
PoolingOp<DeviceType::OPENCL, half>);
REGISTER_OPERATOR(op_registry, OpKeyBuilder("Pooling")
.Device(DeviceType::NEON)
.TypeConstraint<float>("T")
.Build(),
PoolingOp<DeviceType::NEON, float>);
}
} // namespace ops
......
......@@ -23,7 +23,7 @@ namespace ops {
namespace test {
namespace {
template <DeviceType D>
template<DeviceType D>
void Pooling(int iters,
int batch,
int channels,
......@@ -36,7 +36,20 @@ void Pooling(int iters,
mace::testing::StopTiming();
OpsTestNet net;
OpDefBuilder("Pooling", "PoolingTest")
// Add input data
if (D == DeviceType::CPU) {
net.AddRandomInput<D, float>("Input",
{batch, channels, height, width});
} else if (D == DeviceType::OPENCL) {
net.AddRandomInput<D, float>("Input",
{batch, height, width, channels});
} else {
MACE_NOT_IMPLEMENTED;
}
if (D == DeviceType::CPU) {
OpDefBuilder("Pooling", "PoolingTest")
.Input("Input")
.Output("Output")
.AddIntArg("pooling_type", pooling_type)
......@@ -45,10 +58,22 @@ void Pooling(int iters,
.AddIntArg("padding", padding)
.AddIntsArg("dilations", {1, 1})
.Finalize(net.NewOperatorDef());
} else if (D == DeviceType::OPENCL) {
BufferToImage<D, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
// Add input data
net.AddRandomInput<DeviceType::CPU, float>("Input",
{batch, channels, height, width});
OpDefBuilder("Pooling", "PoolingTest")
.Input("InputImage")
.Output("OutputImage")
.AddIntArg("pooling_type", pooling_type)
.AddIntsArg("kernels", {kernel, kernel})
.AddIntsArg("strides", {stride, stride})
.AddIntArg("padding", padding)
.AddIntsArg("dilations", {1, 1})
.Finalize(net.NewOperatorDef());
} else {
MACE_NOT_IMPLEMENTED;
}
// Warm-up
for (int i = 0; i < 5; ++i) {
......@@ -78,8 +103,8 @@ void Pooling(int iters,
##DEVICE)
#define BM_POOLING(N, C, H, W, K, S, PA, PO) \
BM_POOLING_MACRO(N, C, H, W, K, S, PA, PO, CPU);
// BM_POOLING_MACRO(N, C, H, W, K, S, PA, PO, NEON);
BM_POOLING_MACRO(N, C, H, W, K, S, PA, PO, CPU); \
BM_POOLING_MACRO(N, C, H, W, K, S, PA, PO, OPENCL);
BM_POOLING(1, 3, 129, 129, 2, 2, SAME, MAX);
BM_POOLING(1, 3, 257, 257, 2, 2, SAME, MAX);
......
......@@ -28,9 +28,21 @@ class PoolingOpTest : public OpsTestBase {};
TEST_F(PoolingOpTest, MAX_VALID) {
// Construct graph
OpsTestNet net;
// Add input data
net.AddInputFromArray<DeviceType::CPU, float>(
"Input", {1, 4, 4, 2},
{0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23,
8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31});
net.TransformDataFormat<DeviceType::CPU, float>("Input",
NHWC,
"InputNCHW",
NCHW);
OpDefBuilder("Pooling", "PoolingTest")
.Input("Input")
.Output("Output")
.Input("InputNCHW")
.Output("OutputNCHW")
.AddIntsArg("kernels", {2, 2})
.AddIntsArg("strides", {2, 2})
.AddIntArg("padding", Padding::VALID)
......@@ -38,15 +50,14 @@ TEST_F(PoolingOpTest, MAX_VALID) {
.AddIntArg("pooling_type", PoolingType::MAX)
.Finalize(net.NewOperatorDef());
// Add input data
net.AddInputFromArray<DeviceType::CPU, float>(
"Input", {1, 4, 4, 2},
{0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23,
8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31});
// Run
net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW",
NCHW,
"Output",
NHWC);
// Check
auto expected =
CreateTensor<float>({1, 2, 2, 2}, {5, 21, 7, 23, 13, 29, 15, 31});
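The expected values follow from 2x2, stride-2 max pooling of the 4x4x2 NHWC input; a quick NumPy cross-check (illustrative only):

import numpy as np

x = np.arange(16, dtype=np.float32)
inp = np.stack([x, x + 16], axis=-1).reshape(1, 4, 4, 2)  # the test input above
out = inp.reshape(1, 2, 2, 2, 2, 2).max(axis=(2, 4))      # 2x2 windows, stride 2
# out.flatten() -> [ 5. 21.  7. 23. 13. 29. 15. 31.]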
......@@ -57,9 +68,19 @@ TEST_F(PoolingOpTest, MAX_VALID) {
TEST_F(PoolingOpTest, MAX_SAME) {
// Construct graph
OpsTestNet net;
// Add input data
net.AddInputFromArray<DeviceType::CPU, float>("Input", {1, 3, 3, 1},
{0, 1, 2, 3, 4, 5, 6, 7, 8});
net.TransformDataFormat<DeviceType::CPU, float>("Input",
NHWC,
"InputNCHW",
NCHW);
OpDefBuilder("Pooling", "PoolingTest")
.Input("Input")
.Output("Output")
.Input("InputNCHW")
.Output("OutputNCHW")
.AddIntsArg("kernels", {2, 2})
.AddIntsArg("strides", {2, 2})
.AddIntArg("padding", Padding::SAME)
......@@ -67,13 +88,14 @@ TEST_F(PoolingOpTest, MAX_SAME) {
.AddIntArg("pooling_type", PoolingType::MAX)
.Finalize(net.NewOperatorDef());
// Add input data
net.AddInputFromArray<DeviceType::CPU, float>("Input", {1, 3, 3, 1},
{0, 1, 2, 3, 4, 5, 6, 7, 8});
// Run
net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW",
NCHW,
"Output",
NHWC);
// Check
auto expected = CreateTensor<float>({1, 2, 2, 1}, {4, 5, 7, 8});
......@@ -83,9 +105,20 @@ TEST_F(PoolingOpTest, MAX_SAME) {
TEST_F(PoolingOpTest, MAX_VALID_DILATION) {
// Construct graph
OpsTestNet net;
// Add input data
net.AddInputFromArray<DeviceType::CPU, float>(
"Input", {1, 4, 4, 1},
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15});
net.TransformDataFormat<DeviceType::CPU, float>("Input",
NHWC,
"InputNCHW",
NCHW);
OpDefBuilder("Pooling", "PoolingTest")
.Input("Input")
.Output("Output")
.Input("InputNCHW")
.Output("OutputNCHW")
.AddIntsArg("kernels", {2, 2})
.AddIntsArg("strides", {1, 1})
.AddIntArg("padding", Padding::VALID)
......@@ -93,14 +126,14 @@ TEST_F(PoolingOpTest, MAX_VALID_DILATION) {
.AddIntArg("pooling_type", PoolingType::MAX)
.Finalize(net.NewOperatorDef());
// Add input data
net.AddInputFromArray<DeviceType::CPU, float>(
"Input", {1, 4, 4, 1},
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15});
// Run
net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW",
NCHW,
"Output",
NHWC);
// Check
auto expected = CreateTensor<float>({1, 2, 2, 1}, {10, 11, 14, 15});
......@@ -110,9 +143,20 @@ TEST_F(PoolingOpTest, MAX_VALID_DILATION) {
TEST_F(PoolingOpTest, MAX_k2x2s2x2) {
// Construct graph
OpsTestNet net;
// Add input data
net.AddInputFromArray<DeviceType::CPU, float>(
"Input", {1, 2, 9, 1},
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17});
net.TransformDataFormat<DeviceType::CPU, float>("Input",
NHWC,
"InputNCHW",
NCHW);
OpDefBuilder("Pooling", "PoolingTest")
.Input("Input")
.Output("Output")
.Input("InputNCHW")
.Output("OutputNCHW")
.AddIntArg("pooling_type", PoolingType::MAX)
.AddIntsArg("kernels", {2, 2})
.AddIntsArg("strides", {2, 2})
......@@ -120,13 +164,15 @@ TEST_F(PoolingOpTest, MAX_k2x2s2x2) {
.AddIntsArg("dilations", {1, 1})
.Finalize(net.NewOperatorDef());
// Add input data
net.AddInputFromArray<DeviceType::CPU, float>(
"Input", {1, 2, 9, 1},
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17});
// Run
net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW",
NCHW,
"Output",
NHWC);
// Check
auto expected = CreateTensor<float>({1, 1, 5, 1}, {10, 12, 14, 16, 17});
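The elided padding argument can only be SAME here, since VALID on a width-9 input would give four output columns, not five; a NumPy sketch of the expectation:

import numpy as np

x = np.arange(18, dtype=np.float32).reshape(2, 9)
x = np.pad(x, ((0, 0), (0, 1)), constant_values=-np.inf)  # SAME: pad width to 10
out = x.reshape(1, 2, 5, 2).max(axis=(1, 3))              # 2x2 windows, stride 2
# out.flatten() -> [10. 12. 14. 16. 17.]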
......@@ -145,12 +191,15 @@ void SimpleMaxPooling3S2() {
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26});
if (D == DeviceType::OPENCL) {
BufferToImage<D, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
if (D == DeviceType::CPU) {
net.TransformDataFormat<DeviceType::CPU, float>("Input",
NHWC,
"InputNCHW",
NCHW);
// Run
OpDefBuilder("Pooling", "PoolingTest")
.Input("InputImage")
.Output("OutputImage")
.Input("InputNCHW")
.Output("OutputNCHW")
.AddIntArg("pooling_type", PoolingType::MAX)
.AddIntsArg("kernels", {3, 3})
.AddIntsArg("strides", {2, 2})
......@@ -158,13 +207,16 @@ void SimpleMaxPooling3S2() {
.AddIntsArg("dilations", {1, 1})
.Finalize(net.NewOperatorDef());
net.RunOp(D);
ImageToBuffer<D, float>(&net, "OutputImage", "Output",
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW",
NCHW,
"Output",
NHWC);
} else if (D == DeviceType::OPENCL) {
BufferToImage<D, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
} else {
// Run
OpDefBuilder("Pooling", "PoolingTest")
.Input("Input")
.Output("Output")
.Input("InputImage")
.Output("OutputImage")
.AddIntArg("pooling_type", PoolingType::MAX)
.AddIntsArg("kernels", {3, 3})
.AddIntsArg("strides", {2, 2})
......@@ -172,6 +224,8 @@ void SimpleMaxPooling3S2() {
.AddIntsArg("dilations", {1, 1})
.Finalize(net.NewOperatorDef());
net.RunOp(D);
ImageToBuffer<D, float>(&net, "OutputImage", "Output",
kernels::BufferType::IN_OUT_CHANNEL);
}
// Check
......@@ -194,9 +248,18 @@ void MaxPooling3S2(const std::vector<index_t> &input_shape,
Padding padding) {
// Construct graph
OpsTestNet net;
// Add input data
net.AddRandomInput<D, float>("Input", input_shape);
net.TransformDataFormat<DeviceType::CPU, float>("Input",
NHWC,
"InputNCHW",
NCHW);
OpDefBuilder("Pooling", "PoolingTest")
.Input("Input")
.Output("Output")
.Input("InputNCHW")
.Output("OutputNCHW")
.AddIntArg("pooling_type", PoolingType::MAX)
.AddIntsArg("kernels", {3, 3})
.AddIntsArg("strides", strides)
......@@ -204,11 +267,14 @@ void MaxPooling3S2(const std::vector<index_t> &input_shape,
.AddIntsArg("dilations", {1, 1})
.Finalize(net.NewOperatorDef());
// Add input data
net.AddRandomInput<D, float>("Input", input_shape);
// run on cpu
net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW",
NCHW,
"Output",
NHWC);
Tensor expected;
expected.Copy(*net.GetOutput("Output"));
......@@ -226,7 +292,7 @@ void MaxPooling3S2(const std::vector<index_t> &input_shape,
.Finalize(net.NewOperatorDef());
net.RunOp(D);
ImageToBuffer<D, float>(&net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT_CHANNEL);
kernels::BufferType::IN_OUT_CHANNEL);
if (DataTypeToEnum<T>::value == DT_HALF) {
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"),
......@@ -237,12 +303,6 @@ void MaxPooling3S2(const std::vector<index_t> &input_shape,
}
} // namespace
// TODO(chenghui) : there is a bug.
// TEST_F(PoolingOpTest, NEONAlignedMaxPooling3S2) {
// AlignedMaxPooling3S2<NEON>(Padding::VALID);
// AlignedMaxPooling3S2<NEON>(Padding::SAME);
//}
TEST_F(PoolingOpTest, OPENCLAlignedMaxPooling3S2) {
MaxPooling3S2<OPENCL, float>({3, 64, 32, 32}, {1, 1}, Padding::VALID);
MaxPooling3S2<OPENCL, float>({3, 64, 32, 32}, {2, 2}, Padding::VALID);
......@@ -267,9 +327,21 @@ TEST_F(PoolingOpTest, OPENCLUnalignedMaxPooling3S2) {
TEST_F(PoolingOpTest, AVG_VALID) {
// Construct graph
OpsTestNet net;
// Add input data
net.AddInputFromArray<DeviceType::CPU, float>(
"Input", {1, 4, 4, 2},
{0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23,
8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31});
net.TransformDataFormat<DeviceType::CPU, float>("Input",
NHWC,
"InputNCHW",
NCHW);
OpDefBuilder("Pooling", "PoolingTest")
.Input("Input")
.Output("Output")
.Input("InputNCHW")
.Output("OutputNCHW")
.AddIntsArg("kernels", {2, 2})
.AddIntsArg("strides", {2, 2})
.AddIntArg("padding", Padding::VALID)
......@@ -277,15 +349,15 @@ TEST_F(PoolingOpTest, AVG_VALID) {
.AddIntArg("pooling_type", PoolingType::AVG)
.Finalize(net.NewOperatorDef());
// Add input data
net.AddInputFromArray<DeviceType::CPU, float>(
"Input", {1, 4, 4, 2},
{0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23,
8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31});
// Run
net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW",
NCHW,
"Output",
NHWC);
// Check
auto expected = CreateTensor<float>(
{1, 2, 2, 2}, {2.5, 18.5, 4.5, 20.5, 10.5, 26.5, 12.5, 28.5});
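The AVG case is the same cross-check with mean in place of max (sketch, same hypothetical setup as the MAX_VALID note above):

import numpy as np

x = np.arange(16, dtype=np.float32)
inp = np.stack([x, x + 16], axis=-1).reshape(1, 4, 4, 2)
out = inp.reshape(1, 2, 2, 2, 2, 2).mean(axis=(2, 4))
# out.flatten() -> [ 2.5 18.5  4.5 20.5 10.5 26.5 12.5 28.5]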
......@@ -339,9 +411,18 @@ void AvgPoolingTest(const std::vector<index_t> &shape,
Padding padding) {
// Construct graph
OpsTestNet net;
// Add input data
net.AddRandomInput<D, float>("Input", shape);
net.TransformDataFormat<DeviceType::CPU, float>("Input",
NHWC,
"InputNCHW",
NCHW);
OpDefBuilder("Pooling", "PoolingTest")
.Input("Input")
.Output("Output")
.Input("InputNCHW")
.Output("OutputNCHW")
.AddIntArg("pooling_type", PoolingType::AVG)
.AddIntsArg("kernels", kernels)
.AddIntsArg("strides", strides)
......@@ -349,11 +430,14 @@ void AvgPoolingTest(const std::vector<index_t> &shape,
.AddIntsArg("dilations", {1, 1})
.Finalize(net.NewOperatorDef());
// Add input data
net.AddRandomInput<D, float>("Input", shape);
// run on cpu
net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW",
NCHW,
"Output",
NHWC);
Tensor expected;
expected.Copy(*net.GetOutput("Output"));
......@@ -371,8 +455,7 @@ void AvgPoolingTest(const std::vector<index_t> &shape,
.Finalize(net.NewOperatorDef());
net.RunOp(D);
ImageToBuffer<D, float>(&net, "OutputImage", "OPENCLOutput",
kernels::BufferType::IN_OUT_CHANNEL);
kernels::BufferType::IN_OUT_CHANNEL);
if (DataTypeToEnum<T>::value == DT_HALF) {
ExpectTensorNear<float>(expected, *net.GetOutput("OPENCLOutput"),
......@@ -425,64 +508,6 @@ TEST_F(PoolingOpTest, OPENCLUnAlignedLargeKernelAvgPooling) {
Padding::SAME);
}
namespace {
void AvgPoolingNEONTest(const std::vector<index_t> &shape,
const std::vector<int> &kernels,
const std::vector<int> &strides,
Padding padding,
PoolingType pooling_type) {
// Construct graph
OpsTestNet net;
OpDefBuilder("Pooling", "PoolingTest")
.Input("Input")
.Output("Output")
.AddIntArg("pooling_type", pooling_type)
.AddIntsArg("kernels", kernels)
.AddIntsArg("strides", strides)
.AddIntArg("padding", padding)
.AddIntsArg("dilations", {1, 1})
.Finalize(net.NewOperatorDef());
// Add input data
net.AddRandomInput<CPU, float>("Input", shape);
// run on cpu
net.RunOp();
OpDefBuilder("Pooling", "PoolingTest")
.Input("InputNeon")
.Output("OutputNeon")
.AddIntArg("pooling_type", pooling_type)
.AddIntsArg("kernels", kernels)
.AddIntsArg("strides", strides)
.AddIntArg("padding", padding)
.AddIntsArg("dilations", {1, 1})
.Finalize(net.NewOperatorDef());
net.FillNHWCInputToNCHWInput<DeviceType::CPU, float>("InputNeon", "Input");
// run on neon
net.RunOp(DeviceType::NEON);
net.FillNHWCInputToNCHWInput<DeviceType::CPU, float>("OutputExptected",
"Output");
ExpectTensorNear<float>(*net.GetOutput("OutputExptected"),
*net.GetOutput("OutputNeon"),
1e-5, 1e-4);
}
} // namespace
TEST_F(PoolingOpTest, NEONTest) {
AvgPoolingNEONTest({3, 31, 37, 128}, {8, 8}, {8, 8},
Padding::VALID, PoolingType::MAX);
AvgPoolingNEONTest({3, 31, 37, 128}, {8, 8}, {8, 8},
Padding::SAME, PoolingType::MAX);
AvgPoolingNEONTest({3, 31, 37, 128}, {8, 8}, {8, 8},
Padding::VALID, PoolingType::AVG);
AvgPoolingNEONTest({3, 31, 37, 128}, {8, 8}, {8, 8},
Padding::SAME, PoolingType::AVG);
}
} // namespace test
} // namespace ops
} // namespace mace
......@@ -23,11 +23,6 @@ void Register_Quantize(OperatorRegistry *op_registry) {
.TypeConstraint<uint8_t>("T")
.Build(),
QuantizeOp<DeviceType::CPU, uint8_t>);
REGISTER_OPERATOR(op_registry, OpKeyBuilder("Quantize")
.Device(DeviceType::NEON)
.TypeConstraint<uint8_t>("T")
.Build(),
QuantizeOp<DeviceType::CPU, uint8_t>);
}
void Register_Dequantize(OperatorRegistry *op_registry) {
......@@ -36,11 +31,6 @@ void Register_Dequantize(OperatorRegistry *op_registry) {
.TypeConstraint<uint8_t>("T")
.Build(),
DequantizeOp<DeviceType::CPU, uint8_t>);
REGISTER_OPERATOR(op_registry, OpKeyBuilder("Dequantize")
.Device(DeviceType::NEON)
.TypeConstraint<uint8_t>("T")
.Build(),
DequantizeOp<DeviceType::CPU, uint8_t>);
}
void Register_Requantize(OperatorRegistry *op_registry) {
......@@ -49,11 +39,6 @@ void Register_Requantize(OperatorRegistry *op_registry) {
.TypeConstraint<uint8_t>("T")
.Build(),
RequantizeOp<DeviceType::CPU, uint8_t>);
REGISTER_OPERATOR(op_registry, OpKeyBuilder("Requantize")
.Device(DeviceType::NEON)
.TypeConstraint<uint8_t>("T")
.Build(),
RequantizeOp<DeviceType::CPU, uint8_t>);
}
} // namespace ops
......
......@@ -35,11 +35,27 @@ void ResizeBilinearBenchmark(int iters,
OpsTestNet net;
// Add input data
net.AddRandomInput<D, float>("Input",
{batch, input_height, input_width, channels});
if (D == DeviceType::CPU) {
net.AddRandomInput<D, float>("Input",
{batch, channels, input_height, input_width});
} else if (D == DeviceType::OPENCL) {
net.AddRandomInput<D, float>("Input",
{batch, input_height, input_width, channels});
} else {
MACE_NOT_IMPLEMENTED;
}
net.AddInputFromArray<D, index_t>("OutSize", {2},
{output_height, output_width});
if (D == DeviceType::OPENCL) {
if (D == DeviceType::CPU) {
OpDefBuilder("ResizeBilinear", "ResizeBilinearBenchmark")
.Input("Input")
.Input("OutSize")
.Output("Output")
.AddIntsArg("size", {output_height, output_width})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
} else if (D == DeviceType::OPENCL) {
BufferToImage<D, T>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
OpDefBuilder("ResizeBilinear", "ResizeBilinearBenchmark")
......@@ -50,13 +66,7 @@ void ResizeBilinearBenchmark(int iters,
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
} else {
OpDefBuilder("ResizeBilinear", "ResizeBilinearBenchmark")
.Input("Input")
.Input("OutSize")
.Output("Output")
.AddIntsArg("size", {output_height, output_width})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
MACE_NOT_IMPLEMENTED;
}
// Warm-up
......
......@@ -28,19 +28,28 @@ TEST_F(ResizeBilinearTest, CPUResizeBilinearWOAlignCorners) {
testing::internal::LogToStderr();
// Construct graph
OpsTestNet net;
OpDefBuilder("ResizeBilinear", "ResizeBilinearTest")
.Input("Input")
.Output("Output")
.AddIntsArg("size", {1, 2})
.Finalize(net.NewOperatorDef());
// Add input data
std::vector<float> input(24);
std::iota(begin(input), end(input), 0);
net.AddInputFromArray<DeviceType::CPU, float>("Input", {1, 2, 4, 3}, input);
net.TransformDataFormat<DeviceType::CPU, float>("Input",
NHWC,
"InputNCHW",
NCHW);
OpDefBuilder("ResizeBilinear", "ResizeBilinearTest")
.Input("InputNCHW")
.Output("OutputNCHW")
.AddIntsArg("size", {1, 2})
.Finalize(net.NewOperatorDef());
// Run
net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW",
NCHW,
"Output",
NHWC);
// Check
auto expected = CreateTensor<float>({1, 1, 2, 3}, {0, 1, 2, 6, 7, 8});
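Without align_corners the source coordinate is simply dst * (in / out), so both output pixels land exactly on input grid points; a short sketch:

in_h, in_w, out_h, out_w = 2, 4, 1, 2
scale_h, scale_w = in_h / out_h, in_w / out_w        # align_corners off
src = [(y * scale_h, x * scale_w) for y in range(out_h) for x in range(out_w)]
# -> [(0.0, 0.0), (0.0, 2.0)]: input pixels (0,0) and (0,2),
#    i.e. channel triples {0, 1, 2} and {6, 7, 8}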
......@@ -52,20 +61,30 @@ TEST_F(ResizeBilinearTest, ResizeBilinearWAlignCorners) {
testing::internal::LogToStderr();
// Construct graph
OpsTestNet net;
OpDefBuilder("ResizeBilinear", "ResizeBilinearTest")
.Input("Input")
.Output("Output")
.AddIntArg("align_corners", 1)
.AddIntsArg("size", {1, 2})
.Finalize(net.NewOperatorDef());
// Add input data
std::vector<float> input(24);
std::iota(begin(input), end(input), 0);
net.AddInputFromArray<DeviceType::CPU, float>("Input", {1, 2, 4, 3}, input);
net.TransformDataFormat<DeviceType::CPU, float>("Input",
NHWC,
"InputNCHW",
NCHW);
OpDefBuilder("ResizeBilinear", "ResizeBilinearTest")
.Input("InputNCHW")
.Output("OutputNCHW")
.AddIntArg("align_corners", 1)
.AddIntsArg("size", {1, 2})
.Finalize(net.NewOperatorDef());
// Run
net.RunOp();
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW",
NCHW,
"Output",
NHWC);
// Check
auto expected = CreateTensor<float>({1, 1, 2, 3}, {0, 1, 2, 9, 10, 11});
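With align_corners the scale becomes (in - 1) / (out - 1), stretching the last sample onto the last input pixel:

in_w, out_w = 4, 2
scale_w = (in_w - 1) / (out_w - 1)                   # 3.0
src_x = [x * scale_w for x in range(out_w)]          # [0.0, 3.0]
# input pixels (0,0) and (0,3): channel triples {0, 1, 2} and {9, 10, 11}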
......@@ -92,15 +111,24 @@ void TestRandomResizeBilinear() {
// Add input data
net.AddRandomInput<D, float>("Input",
{batch, in_height, in_width, channels});
net.TransformDataFormat<DeviceType::CPU, float>("Input",
NHWC,
"InputNCHW",
NCHW);
OpDefBuilder("ResizeBilinear", "ResizeBilinearTest")
.Input("Input")
.Output("Output")
.Input("InputNCHW")
.Output("OutputNCHW")
.AddIntArg("align_corners", align_corners)
.AddIntsArg("size", {height, width})
.Finalize(net.NewOperatorDef());
// Run on CPU
net.RunOp(DeviceType::CPU);
net.TransformDataFormat<DeviceType::CPU, float>("OutputNCHW",
NCHW,
"Output",
NHWC);
Tensor expected;
expected.Copy(*net.GetOutput("Output"));
......@@ -129,12 +157,6 @@ void TestRandomResizeBilinear() {
}
} // namespace
/*
TEST_F(ResizeBilinearTest, NEONRandomResizeBilinear) {
TestRandomResizeBilinear<DeviceType::NEON>();
}
*/
TEST_F(ResizeBilinearTest, OPENCLRandomResizeBilinear) {
TestRandomResizeBilinear<DeviceType::OPENCL>();
}
......
......@@ -34,11 +34,6 @@ void Register_Slice(OperatorRegistry *op_registry) {
.TypeConstraint<half>("T")
.Build(),
SliceOp<DeviceType::OPENCL, half>);
REGISTER_OPERATOR(op_registry, OpKeyBuilder("Slice")
.Device(DeviceType::NEON)
.TypeConstraint<float>("T")
.Build(),
SliceOp<DeviceType::NEON, float>);
}
} // namespace ops
......
......@@ -35,11 +35,6 @@ void Register_Softmax(OperatorRegistry *op_registry) {
.TypeConstraint<half>("T")
.Build(),
SoftmaxOp<DeviceType::OPENCL, half>);
REGISTER_OPERATOR(op_registry, OpKeyBuilder("Softmax")
.Device(DeviceType::NEON)
.TypeConstraint<float>("T")
.Build(),
SoftmaxOp<DeviceType::NEON, float>);
}
} // namespace ops
......
......@@ -31,9 +31,20 @@ void SoftmaxBenchmark(
OpsTestNet net;
// Add input data
net.AddRandomInput<D, float>("Input", {batch, height, width, channels});
if (D == DeviceType::CPU) {
net.AddRandomInput<D, float>("Input", {batch, channels, height, width});
} else if (D == DeviceType::OPENCL) {
net.AddRandomInput<D, float>("Input", {batch, height, width, channels});
} else {
MACE_NOT_IMPLEMENTED;
}
if (D == DeviceType::OPENCL) {
if (D == DeviceType::CPU) {
OpDefBuilder("Softmax", "SoftmaxBM")
.Input("Input")
.Output("Output")
.Finalize(net.NewOperatorDef());
} else if (D == DeviceType::OPENCL) {
BufferToImage<D, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
......@@ -42,10 +53,7 @@ void SoftmaxBenchmark(
.Output("Output")
.Finalize(net.NewOperatorDef());
} else {
OpDefBuilder("Softmax", "SoftmaxBM")
.Input("Input")
.Output("Output")
.Finalize(net.NewOperatorDef());
MACE_NOT_IMPLEMENTED;
}
// Warm-up
......
......@@ -30,7 +30,17 @@ void Simple() {
net.AddInputFromArray<D, float>("Input", {1, 1, 2, 4},
{1, 1, 1, 1, 1, 2, 3, 4});
if (D == DeviceType::OPENCL) {
if (D == DeviceType::CPU) {
net.TransformDataFormat<CPU, float>("Input", NHWC, "InputNCHW", NCHW);
OpDefBuilder("Softmax", "SoftmaxTest")
.Input("InputNCHW")
.Output("OutputNCHW")
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
net.TransformDataFormat<CPU, float>("OutputNCHW", NCHW, "Output", NHWC);
} else if (D == DeviceType::OPENCL) {
BufferToImage<D, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
......@@ -46,13 +56,7 @@ void Simple() {
ImageToBuffer<D, float>(&net, "OutputImage", "Output",
kernels::BufferType::IN_OUT_CHANNEL);
} else {
OpDefBuilder("Softmax", "SoftmaxTest")
.Input("Input")
.Output("Output")
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
MACE_NOT_IMPLEMENTED;
}
auto expected = CreateTensor<float>(
......@@ -74,13 +78,18 @@ void Complex(const std::vector<index_t> &logits_shape) {
// Add input data
net.AddRandomInput<D, float>("Input", logits_shape);
net.TransformDataFormat<CPU, float>("Input", NHWC, "InputNCHW", NCHW);
OpDefBuilder("Softmax", "SoftmaxTest")
.Input("Input")
.Output("Output")
.Input("InputNCHW")
.Output("OutputNCHW")
.Finalize(net.NewOperatorDef());
// Run on cpu
net.RunOp();
net.TransformDataFormat<CPU, float>("OutputNCHW", NCHW, "Output", NHWC);
Tensor expected;
expected.Copy(*net.GetOutput("Output"));
......@@ -119,47 +128,6 @@ TEST_F(SoftmaxOpTest, OPENCLUnAligned) {
Complex<DeviceType::OPENCL>({5, 211, 107, 1});
}
namespace {
void SoftMaxNEONTest(const std::vector<index_t> &logits_shape) {
// Construct graph
OpsTestNet net;
// Add input data
net.AddRandomInput<CPU, float>("Input", logits_shape);
OpDefBuilder("Softmax", "SoftmaxTest")
.Input("Input")
.Output("Output")
.Finalize(net.NewOperatorDef());
// Run on cpu
net.RunOp();
OpDefBuilder("Softmax", "SoftmaxTest")
.Input("InputNeon")
.Output("OutputNeon")
.Finalize(net.NewOperatorDef());
net.FillNHWCInputToNCHWInput<DeviceType::CPU, float>("InputNeon", "Input");
// run on neon
net.RunOp(DeviceType::NEON);
net.FillNHWCInputToNCHWInput<DeviceType::CPU, float>("OutputExptected",
"Output");
ExpectTensorNear<float>(*net.GetOutput("OutputExptected"),
*net.GetOutput("OutputNeon"),
1e-5, 1e-5);
}
} // namespace
TEST_F(SoftmaxOpTest, NEONTest) {
SoftMaxNEONTest({5, 64, 64, 3});
SoftMaxNEONTest({8, 128, 128, 8});
SoftMaxNEONTest({1, 113, 107, 13});
SoftMaxNEONTest({5, 211, 107, 1});
}
} // namespace test
} // namespace ops
} // namespace mace
......@@ -169,69 +169,6 @@ TEST(SpaceToBatchTest, MultiBatchAndChannelData) {
17, 18, 21, 22, 19, 20, 23, 24, 25, 26, 29, 30, 27, 28, 31, 32});
}
// TEST(SpaceTobatchTest, CompareTF) {
//
// const std::string space_file = "/data/local/tmp/test/input";
// const std::string batch_file = "/data/local/tmp/test/output";
// const std::vector<index_t> space_shape = {1, 256, 256, 32};
// const int space_size = std::accumulate(space_shape.begin(),
// space_shape.end(), 1, std::multiplies<int>());
// const std::vector<index_t> batch_shape = {4, 130, 130, 32};
// const int batch_size = std::accumulate(batch_shape.begin(),
// batch_shape.end(), 1, std::multiplies<int>());
//
// auto space_tensor = std::unique_ptr<Tensor>(new
// Tensor(GetDeviceAllocator(DeviceType::OPENCL),
// DataTypeToEnum<float>::v()));
// space_tensor->Resize(space_shape);
// std::vector<float> space_data(space_size, 0.0);
// std::ifstream in_file(space_file, std::ios::in | std::ios::binary);
// if (in_file.is_open()) {
// in_file.read(reinterpret_cast<char *>(space_data.data()),
// space_size * sizeof(float));
// in_file.close();
// Tensor::MappingGuard space_mapper(space_tensor.get());
// float *space_ptr = space_tensor->mutable_data<float>();
// MACE_CHECK(static_cast<size_t>(space_tensor->size()) == space_data.size())
// << "Space tensor size:" << space_tensor->size()
// << ", space data size:" << space_data.size();
// memcpy(space_ptr, space_data.data(), space_data.size() * sizeof(float));
// } else {
// VLOG(0) << "open space file failed";
// }
//
// auto batch_tensor = std::unique_ptr<Tensor>(new
// Tensor(GetDeviceAllocator(DeviceType::OPENCL),
// DataTypeToEnum<float>::v()));
// std::vector<float> batch_data(batch_size, 0.0);
// batch_tensor->Resize(batch_shape);
// {
// std::ifstream in_file(batch_file, std::ios::in | std::ios::binary);
// if (in_file.is_open()) {
// in_file.read(reinterpret_cast<char *>(batch_data.data()),
// batch_size * sizeof(float));
// in_file.close();
// } else {
// VLOG(0) << "open batch file failed";
// }
// Tensor::MappingGuard batch_mapper(batch_tensor.get());
// float *batch_ptr = batch_tensor->mutable_data<float>();
// MACE_CHECK(static_cast<size_t>(batch_tensor->size()) ==
// batch_data.size());
// memcpy(batch_ptr, batch_data.data(), batch_data.size() * sizeof(float));
// }
//
// RunSpaceToBatch<DeviceType::OPENCL>(space_shape, space_data,
// {2, 2},
// {2, 2, 2, 2},
// batch_tensor.get());
//
// RunBatchToSpace<DeviceType::OPENCL>(batch_shape, batch_data,
// {2, 2},
// {2, 2, 2, 2},
// space_tensor.get());
//}
} // namespace test
} // namespace ops
} // namespace mace
......@@ -24,12 +24,12 @@
namespace mace {
namespace ops {
template <DeviceType D, typename T>
template<DeviceType D, typename T>
class SpaceToDepthOp : public Operator<D, T> {
public:
SpaceToDepthOp(const OperatorDef &op_def, Workspace *ws)
: Operator<D, T>(op_def, ws),
functor_(OperatorBase::GetSingleArgument<int>("block_size", 1), false) {
: Operator<D, T>(op_def, ws),
functor_(OperatorBase::GetSingleArgument<int>("block_size", 1), false) {
}
bool Run(StatsFuture *future) override {
......@@ -37,16 +37,27 @@ class SpaceToDepthOp : public Operator<D, T> {
Tensor *output = this->Output(OUTPUT);
MACE_CHECK(input->dim_size() == 4, "input dim should be 4");
const int block_size =
OperatorBase::GetSingleArgument<int>("block_size", 1);
const int input_height = input->dim(1);
const int input_width = input->dim(2);
const int input_depth = input->dim(3);
OperatorBase::GetSingleArgument<int>("block_size", 1);
index_t input_height;
index_t input_width;
index_t input_depth;
if (D == CPU) {
input_height = input->dim(2);
input_width = input->dim(3);
input_depth = input->dim(1);
} else if (D == OPENCL) {
input_height = input->dim(1);
input_width = input->dim(2);
input_depth = input->dim(3);
} else {
MACE_NOT_IMPLEMENTED;
}
MACE_CHECK((input_depth % 4) == 0,
"input channel should be divisible by 4");
MACE_CHECK(
(input_width%block_size == 0)&&(input_height%block_size == 0),
"input width and height should be divisible by block_size",
input->dim(3));
(input_width % block_size == 0) && (input_height % block_size == 0),
"input width and height should be divisible by block_size",
input->dim(3));
functor_(input, output, future);
return true;
}
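The dim() juggling above reflects where each logical dimension now lives per device; a hypothetical lookup (names are illustrative, not from the source) that captures it:

# (depth, height, width) axis positions within a 4-D shape, per layout
DIM_INDEX = {
    'NCHW': (1, 2, 3),  # CPU
    'NHWC': (3, 1, 2),  # OpenCL
}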
......
......@@ -29,9 +29,20 @@ void SpaceToDepth(
OpsTestNet net;
// Add input data
net.AddRandomInput<D, float>("Input", {batch, height, width, channels});
if (D == DeviceType::CPU) {
net.AddRandomInput<D, float>("Input", {batch, height, channels, width});
} else if (D == DeviceType::OPENCL) {
net.AddRandomInput<D, float>("Input", {batch, height, width, channels});
} else {
MACE_NOT_IMPLEMENTED;
}
if (D == DeviceType::OPENCL) {
if (D == DeviceType::CPU) {
OpDefBuilder("SpaceToDepth", "SpaceToDepthBM")
.Input("Input")
.Output("Output")
.Finalize(net.NewOperatorDef());
} else if (D == DeviceType::OPENCL) {
BufferToImage<D, float>(&net, "Input", "InputImage",
kernels::BufferType::IN_OUT_CHANNEL);
......@@ -41,10 +52,7 @@ void SpaceToDepth(
.AddIntArg("block_size", block_size)
.Finalize(net.NewOperatorDef());
} else {
OpDefBuilder("SpaceToDepth", "SpaceToDepthBM")
.Input("Input")
.Output("Output")
.Finalize(net.NewOperatorDef());
MACE_NOT_IMPLEMENTED;
}
// Warm-up
......
......@@ -19,16 +19,10 @@ namespace ops {
void Register_Transpose(OperatorRegistry *op_registry) {
REGISTER_OPERATOR(op_registry, OpKeyBuilder("Transpose")
.Device(DeviceType::CPU)
.TypeConstraint<float>("T")
.Build(),
.Device(DeviceType::CPU)
.TypeConstraint<float>("T")
.Build(),
TransposeOp<DeviceType::CPU, float>);
REGISTER_OPERATOR(op_registry, OpKeyBuilder("Transpose")
.Device(DeviceType::NEON)
.TypeConstraint<float>("T")
.Build(),
TransposeOp<DeviceType::NEON, float>);
}
} // namespace ops
......
......@@ -221,7 +221,7 @@ class CaffeConverter(object):
arg.i = self.dt
data_format_arg = op_def.arg.add()
data_format_arg.name = 'data_format'
if self.device == 'neon':
if self.device == 'cpu':
data_format_arg.s = 'NCHW'
else:
data_format_arg.s = 'NHWC'
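The same runtime-to-layout rule recurs throughout both converters; a hypothetical one-line helper (not in the source) stating it once:

def data_format_for(device):
    # CPU kernels now consume NCHW; the OpenCL (GPU) path keeps NHWC
    return 'NCHW' if device == 'cpu' else 'NHWC'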
......@@ -396,7 +396,7 @@ class CaffeConverter(object):
def convert_conv2d(self, op):
use_winograd = False
if self.device == 'neon':
if self.device == 'cpu':
use_winograd = self.check_winograd_conv(op)
param = op.layer.convolution_param
......@@ -414,14 +414,14 @@ class CaffeConverter(object):
# Add filter
weight_tensor_name = op.name + '_weight:0'
if self.device == 'neon':
if self.device == 'cpu':
weight_data = op.data[0]
else:
# OIHW -> HWOI
weight_data = op.data[0].transpose((2, 3, 0, 1))
if self.device == 'neon' and use_winograd:
self.convert_winograd_conv_filter_neon(op, op_def)
if self.device == 'cpu' and use_winograd:
self.convert_winograd_conv_filter_cpu(op, op_def)
else:
self.add_tensor(weight_tensor_name, weight_data)
......@@ -459,7 +459,7 @@ class CaffeConverter(object):
final_op = op
self.resolved_ops.add(op.name)
input_format = 'NCHW' if self.device == 'neon' else 'NHWC'
input_format = 'NCHW' if self.device == 'cpu' else 'NHWC'
output_shape = Shapes.conv_pool_shape(
op.get_single_parent().output_shape_map[op.layer.bottom[0]],
weight_data.shape, paddings, strides, dilations, math.floor,
......@@ -486,7 +486,7 @@ class CaffeConverter(object):
def check_winograd_conv(self, op):
param = op.layer.convolution_param
filter_shape = np.asarray(op.data[0].shape)
if self.device != 'neon':
if self.device != 'cpu':
filter_shape = filter_shape[[2, 3, 0, 1]] # OIHW -> HWOI
paddings, strides, _ = self.add_stride_pad_kernel_arg(param, None)
......@@ -503,7 +503,7 @@ class CaffeConverter(object):
elif len(param.dilation) == 2:
dilations = [param.dilation[0], param.dilation[1]]
input_format = 'NCHW' if self.device == 'neon' else 'NHWC'
input_format = 'NCHW' if self.device == 'cpu' else 'NHWC'
output_shape = Shapes.conv_pool_shape(
op.get_single_parent().output_shape_map[op.layer.bottom[0]],
filter_shape, paddings, strides, dilations, math.floor,
......@@ -519,13 +519,13 @@ class CaffeConverter(object):
(16 * filter_shape[2] < OPENCL_IMAGE_MAX_SIZE) and \
(16 * filter_shape[3] < OPENCL_IMAGE_MAX_SIZE) and \
(width < OPENCL_IMAGE_MAX_SIZE)
elif self.device == 'neon':
elif self.device == 'cpu':
return filter_shape[2] == 3 and \
filter_shape[2] == filter_shape[3] and \
filter_shape[0] >= 8 and filter_shape[1] >= 8
return False
def convert_winograd_conv_filter_neon(self, op, op_def):
def convert_winograd_conv_filter_cpu(self, op, op_def):
# Add filter
weight_tensor_name = op.name + '_weight:0'
weight_data = op.data[0] # OIHW
......@@ -739,7 +739,7 @@ class CaffeConverter(object):
weight_data = op.data[0].reshape(-1, op.data[0].shape[-1])
assert weight_data.shape[1] == (
input_shape[1] * input_shape[2] * input_shape[3])
if self.device != 'neon':
if self.device != 'cpu':
weight_data = weight_data.reshape(-1, input_shape[3],
input_shape[1], input_shape[2])
weight_data = weight_data.transpose((0, 2, 3, 1)).reshape(
......@@ -783,7 +783,7 @@ class CaffeConverter(object):
op_def.input.extend([bias_tensor_name])
self.resolved_ops.add(op.name)
input_format = 'NCHW' if self.device == 'neon' else 'NHWC'
input_format = 'NCHW' if self.device == 'cpu' else 'NHWC'
output_shape = Shapes.fully_connected_shape(input_shape,
weight_data.shape,
input_format)
......@@ -823,14 +823,14 @@ class CaffeConverter(object):
0]]
if param.HasField('global_pooling') and param.global_pooling:
kernels = [input_shape[2], input_shape[3]] \
if self.device == 'neon' else \
if self.device == 'cpu' else \
[input_shape[1], input_shape[2]]
kernel_arg = op_def.arg.add()
kernel_arg.name = 'kernels'
kernel_arg.ints.extend(kernels)
if self.device != 'neon':
if self.device != 'cpu':
filter_shape = [
kernels[0], kernels[1], input_shape[3], input_shape[3]
]
......@@ -838,7 +838,7 @@ class CaffeConverter(object):
filter_shape = [
input_shape[1], input_shape[1], kernels[0], kernels[1]
]
input_format = 'NCHW' if self.device == 'neon' else 'NHWC'
input_format = 'NCHW' if self.device == 'cpu' else 'NHWC'
output_shape = Shapes.conv_pool_shape(input_shape, filter_shape,
paddings, strides, [1, 1],
math.ceil, input_format)
......@@ -897,7 +897,7 @@ class CaffeConverter(object):
op_def = self.CommonConvert(op, 'Concat')
axis_arg = op_def.arg.add()
axis_arg.name = 'axis'
axis_arg.i = 3 if self.device != 'neon' else 1
axis_arg.i = 3 if self.device != 'cpu' else 1
try:
if op.layer.concat_param.HasField('axis'):
axis_arg.i = op.concat_param.axis
......@@ -947,7 +947,7 @@ class CaffeConverter(object):
axis_arg = op_def.arg.add()
axis_arg.name = 'axis'
axis_arg.i = 3 if self.device != 'neon' else 1
axis_arg.i = 3 if self.device != 'cpu' else 1
input_shape = op.parents[0].output_shape_map[op.layer.bottom[0]]
num_outputs = len(op.layer.top)
......@@ -958,7 +958,7 @@ class CaffeConverter(object):
raise Exception(
'Mace do not support slice with input shape ' +
str(input_shape) + ' and number of output ' + str(num_outputs))
input_format = 'NCHW' if self.device == 'neon' else 'NHWC'
input_format = 'NCHW' if self.device == 'cpu' else 'NHWC'
output_shape = Shapes.slice_shape(input_shape, num_outputs,
input_format)
for i in range(len(op.layer.top)):
......@@ -978,14 +978,14 @@ class CaffeConverter(object):
self.resolved_ops.add(op.name)
def convert_reshape(self, op):
if self.device == 'neon':
if self.device == 'cpu':
op_def = self.CommonConvert(op, 'Reshape')
else:
op_def = self.CommonConvert(op, 'ReOrganize')
input_shape = op.parents[0].output_shape_map[op.layer.bottom[0]]
output_shape = input_shape
shape_param = np.asarray(op.layer.reshape_param.shape.dim)
if self.device != 'neon':
if self.device != 'cpu':
shape_param = shape_param[[0, 3, 1, 2]]
for i in range(len(shape_param)):
if shape_param[i] != 0:
......@@ -1060,7 +1060,7 @@ class CaffeConverter(object):
assert len(input_nodes) == len(input_shapes)
for i in range(len(input_nodes)):
input_op = self.ops_map[input_nodes[i]]
input_shape = input_shapes[i] if self.device != 'neon' else \
input_shape = input_shapes[i] if self.device != 'cpu' else \
[input_shapes[i][0], input_shapes[i][3],
input_shapes[i][1], input_shapes[i][2]]
if input_op.layer is not None:
......@@ -1068,7 +1068,7 @@ class CaffeConverter(object):
else:
input_op.output_shape_map[input_op.name] = input_shape
def add_neon_input_transform(self, names):
def add_cpu_input_transform(self, names):
for name in names:
new_input_name = MACE_INPUT_NODE_NAME + '_' + name + ":0"
op_def = self.net_def.op.add()
......@@ -1085,7 +1085,7 @@ class CaffeConverter(object):
arg.name = 'T'
arg.i = self.dt
def add_neon_output_transform(self, names):
def add_cpu_output_transform(self, names):
for name in names:
output_name = MACE_OUTPUT_NODE_NAME + '_' + name + ":0"
op_def = self.net_def.op.add()
......@@ -1105,8 +1105,8 @@ class CaffeConverter(object):
if self.device == 'gpu':
self.add_input_transform(input_nodes)
if self.device == 'neon':
self.add_neon_input_transform(input_nodes)
if self.device == 'cpu':
self.add_cpu_input_transform(input_nodes)
for op in self.ops:
if op.name in self.resolved_ops:
......@@ -1152,10 +1152,7 @@ class CaffeConverter(object):
self.add_output_transform(output_nodes)
if self.device == 'cpu':
self.replace_in_out_name(input_nodes, output_nodes)
if self.device == 'neon':
self.add_neon_output_transform(output_nodes)
self.add_cpu_output_transform(output_nodes)
for op in self.ops:
if op.name not in self.resolved_ops:
......
......@@ -157,7 +157,7 @@ class TFConverter(object):
self.add_output_shape(self.ops[name].outputs, op_def)
def add_neon_input_transform(self, names):
def add_cpu_input_transform(self, names):
for name in names:
new_input_name = MACE_INPUT_NODE_NAME + '_' + name + ":0"
op_def = self.net_def.op.add()
......@@ -187,7 +187,7 @@ class TFConverter(object):
epsilon_arg.name = 'buffer_type'
epsilon_arg.i = buffer_type_map['IN_OUT_CHANNEL']
def add_neon_output_transform(self, names):
def add_cpu_output_transform(self, names):
for name in names:
output_name = MACE_OUTPUT_NODE_NAME + '_' + name + ":0"
op_def = self.net_def.op.add()
......@@ -281,7 +281,7 @@ class TFConverter(object):
return (16 * filter_shape[2] < OPENCL_IMAGE_MAX_SIZE) and \
(16 * filter_shape[3] < OPENCL_IMAGE_MAX_SIZE) and \
(width < OPENCL_IMAGE_MAX_SIZE)
elif self.device == 'neon':
elif self.device == 'cpu':
return filter_shape[2] >= 8 and filter_shape[3] >= 8
return False
......@@ -375,7 +375,7 @@ class TFConverter(object):
self.add_output_shape(final_op.outputs, iwt_op)
self.net_def.op.extend([wt_op, matmul_op, iwt_op])
def convert_conv_winograd_filter_neon(self, op, op_def):
def convert_conv_winograd_filter_cpu(self, op, op_def):
weight_tensor = get_input_tensor(op, 1)
weight_tensor_value = weight_tensor.eval().astype(np.float32)
input_shape = get_input_tensor(op, 0).shape.as_list()
......@@ -421,7 +421,7 @@ class TFConverter(object):
def convert_conv2d(self, op):
use_winograd = False
if self.device == 'neon':
if self.device == 'cpu':
use_winograd = self.check_winograd_conv(op)
op_def = mace_pb2.OperatorDef()
......@@ -434,7 +434,7 @@ class TFConverter(object):
else:
op_def.type = op.type
if self.device == 'neon' and not use_winograd:
if self.device == 'cpu' and not use_winograd:
self.transpose_filter_tensor[get_input_tensor(
op, 1).name] = (3, 2, 0, 1)
elif op.type == 'Conv2D':
......@@ -449,8 +449,8 @@ class TFConverter(object):
output_name = self.add_buffer_to_image(
get_input_tensor(op, 1).name, buffer_type)
op_def.input.extend([output_name])
elif self.device == 'neon' and use_winograd:
self.convert_conv_winograd_filter_neon(op, op_def)
elif self.device == 'cpu' and use_winograd:
self.convert_conv_winograd_filter_cpu(op, op_def)
else:
op_def.input.extend(
[get_input_tensor(op, i).name for i in range(len(op.inputs))])
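TensorFlow stores convolution filters as HWIO, so the (3, 2, 0, 1) permutation registered above yields the OIHW layout the CPU kernels expect; in NumPy terms (illustrative shapes):

import numpy as np

hwio = np.zeros((3, 3, 16, 32), dtype=np.float32)  # kH, kW, in, out
oihw = hwio.transpose(3, 2, 0, 1)
print(oihw.shape)  # (32, 16, 3, 3) -> out, in, kH, kW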
......@@ -463,7 +463,7 @@ class TFConverter(object):
strides_arg.ints.extend(op.get_attr('strides')[1:3])
data_format_arg = op_def.arg.add()
data_format_arg.name = 'data_format'
if self.device == 'neon':
if self.device == 'cpu':
data_format_arg.s = 'NCHW'
else:
data_format_arg.s = 'NHWC'
......@@ -502,7 +502,7 @@ class TFConverter(object):
self.net_def.op.extend([op_def])
def check_conv_to_fc(self, op):
if self.device != 'neon' or op.type != "Conv2D":
if self.device != 'cpu' or op.type != "Conv2D":
return False
filter_shape = get_input_tensor(op, 1).shape.as_list()
input_shape = get_input_tensor(op, 0).shape.as_list()
......@@ -569,7 +569,7 @@ class TFConverter(object):
arg.i = self.dt
data_format_arg = op_def.arg.add()
data_format_arg.name = 'data_format'
if self.device == 'neon':
if self.device == 'cpu':
data_format_arg.s = 'NCHW'
else:
data_format_arg.s = 'NHWC'
......@@ -675,7 +675,7 @@ class TFConverter(object):
epsilon_arg.f = get_input_tensor(op, 1).eval().astype(np.float)
data_format_arg = op_def.arg.add()
data_format_arg.name = 'data_format'
if self.device == 'neon':
if self.device == 'cpu':
data_format_arg.s = 'NCHW'
else:
data_format_arg.s = 'NHWC'
......@@ -709,7 +709,7 @@ class TFConverter(object):
kernels_arg.ints.extend(op.get_attr('ksize')[1:3])
data_format_arg = op_def.arg.add()
data_format_arg.name = 'data_format'
if self.device == 'neon':
if self.device == 'cpu':
data_format_arg.s = 'NCHW'
else:
data_format_arg.s = 'NHWC'
......@@ -739,7 +739,7 @@ class TFConverter(object):
kernels_arg.ints.extend(op.inputs[0].shape.as_list()[1:3])
data_format_arg = op_def.arg.add()
data_format_arg.name = 'data_format'
if self.device == 'neon':
if self.device == 'cpu':
data_format_arg.s = 'NCHW'
else:
data_format_arg.s = 'NHWC'
......@@ -802,7 +802,7 @@ class TFConverter(object):
axis_arg = op_def.arg.add()
axis_arg.name = 'axis'
axis = get_input_tensor(op, len(op.inputs) - 1).eval().astype(np.int32)
if self.device == 'neon' and axis == 3:
if self.device == 'cpu' and axis == 3:
axis = 1
axis_arg.i = axis
self.add_output_shape(op.outputs, op_def)
......@@ -969,7 +969,7 @@ class TFConverter(object):
strides_arg.ints.extend([1, 1])
data_format_arg = op_def.arg.add()
data_format_arg.name = 'data_format'
if self.device == 'neon':
if self.device == 'cpu':
data_format_arg.s = 'NCHW'
else:
data_format_arg.s = 'NHWC'
......@@ -1109,8 +1109,8 @@ class TFConverter(object):
def convert(self, input_nodes, output_nodes):
if self.device == 'gpu':
self.add_gpu_input_transform(input_nodes)
if self.device == 'neon':
self.add_neon_input_transform(input_nodes)
if self.device == 'cpu':
self.add_cpu_input_transform(input_nodes)
for op in self.tf_ops:
if self.resolved_ops[op.name] == 1:
......@@ -1197,11 +1197,8 @@ class TFConverter(object):
if self.device == 'gpu':
self.add_gpu_output_transform(output_nodes)
if self.device == 'neon':
self.add_neon_output_transform(output_nodes)
if self.device == 'cpu':
self.replace_in_out_name(input_nodes, output_nodes)
self.add_cpu_output_transform(output_nodes)
for key in self.resolved_ops:
if self.resolved_ops[key] != 1:
......@@ -1252,7 +1249,7 @@ class Optimizer:
scale_tensor = self.tensor_map[scale_buffer_name]
weight_shape = weight_tensor.dims
idx = 0
if self.device == 'neon': # OIHW
if self.device == 'cpu': # OIHW
for oc in range(weight_shape[0]):
for ic in range(weight_shape[1]):
for i in range(weight_shape[2]):
......
......@@ -69,9 +69,6 @@ def get_data_and_device_type(runtime):
elif runtime == "cpu":
data_type = "DT_FLOAT"
device_type = "CPU"
elif runtime == "neon":
data_type = "DT_FLOAT"
device_type = "NEON"
return data_type, device_type
......
......@@ -56,7 +56,6 @@ def compare_output(platform, mace_runtime, output_name, mace_out_value,
print output_name, 'MACE VS', platform.upper(
), 'similarity: ', similarity
if (mace_runtime == "cpu" and similarity > 0.999) or \
(mace_runtime == "neon" and similarity > 0.999) or \
(mace_runtime == "gpu" and similarity > 0.995) or \
(mace_runtime == "dsp" and similarity > 0.930):
print '===================Similarity Test Passed=================='
......