From 345f7980513940a67514458607b96e198ffe4890 Mon Sep 17 00:00:00 2001
From: wuchenghui
Date: Wed, 4 Apr 2018 15:38:01 +0800
Subject: [PATCH] fix caffe converter for neon

---
 mace/kernels/activation.h                |  26 ++--
 mace/kernels/arm/activation.cc           |   4 +-
 mace/kernels/slice.h                     |  41 ++++--
 mace/ops/activation_test.cc              |  22 ++-
 mace/ops/slice.cc                        |   5 +
 mace/ops/slice.h                         |   6 +-
 mace/ops/slice_test.cc                   |  51 ++++---
 mace/python/tools/caffe_converter_lib.py | 173 ++++++++++++++++++-----
 8 files changed, 243 insertions(+), 85 deletions(-)

diff --git a/mace/kernels/activation.h b/mace/kernels/activation.h
index 5130ccdc..c60f4a3b 100644
--- a/mace/kernels/activation.h
+++ b/mace/kernels/activation.h
@@ -89,18 +89,22 @@ void DoActivation(const T *input_ptr,
 
 template <typename T>
 void PReLUActivation(const T *input_ptr,
-                     const index_t size,
+                     const index_t outer_size,
                      const index_t input_chan,
+                     const index_t inner_size,
                      const T *alpha_ptr,
                      T *output_ptr) {
-#pragma omp parallel for
-  for (index_t i = 0; i < size; ++i) {
-    const index_t chan_idx = i % input_chan;
-    T in = input_ptr[i];
-    if (in < 0) {
-      output_ptr[i] = in * alpha_ptr[chan_idx];
-    } else {
-      output_ptr[i] = in;
+#pragma omp parallel for collapse(3)
+  for (index_t i = 0; i < outer_size; ++i) {
+    for (index_t chan_idx = 0; chan_idx < input_chan; ++chan_idx) {
+      for (index_t j = 0; j < inner_size; ++j) {
+        index_t idx = i * input_chan * inner_size + chan_idx * inner_size + j;
+        if (input_ptr[idx] < 0) {
+          output_ptr[idx] = input_ptr[idx] * alpha_ptr[chan_idx];
+        } else {
+          output_ptr[idx] = input_ptr[idx];
+        }
+      }
     }
   }
 }
@@ -120,7 +124,9 @@ class ActivationFunctor {
     if (activation_ == PRELU) {
       MACE_CHECK_NOTNULL(alpha);
       const T *alpha_ptr = alpha->data<T>();
-      PReLUActivation(input_ptr, output->size(), input->dim(3), alpha_ptr,
+      const index_t outer_size = output->dim(0) * output->dim(1)
+          * output->dim(2);
+      PReLUActivation(input_ptr, outer_size, input->dim(3), 1, alpha_ptr,
                       output_ptr);
     } else {
       DoActivation(input_ptr, output_ptr, output->size(), activation_,
diff --git a/mace/kernels/arm/activation.cc b/mace/kernels/arm/activation.cc
index 30f21e03..59b94ffa 100644
--- a/mace/kernels/arm/activation.cc
+++ b/mace/kernels/arm/activation.cc
@@ -17,7 +17,9 @@ void ActivationFunctor<DeviceType::NEON, float>::operator()(
   if (activation_ == PRELU) {
     MACE_CHECK_NOTNULL(alpha);
     const float *alpha_ptr = alpha->data<float>();
-    PReLUActivation(input_ptr, output->size(), input->dim(1), alpha_ptr,
+    const index_t outer_size = output->dim(0);
+    const index_t inner_size = output->dim(2) * output->dim(3);
+    PReLUActivation(input_ptr, outer_size, input->dim(1), inner_size, alpha_ptr,
                     output_ptr);
   } else {
     DoActivation(input_ptr, output_ptr, output->size(), activation_,
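A note on the PReLU hunks above: the kernel now walks an explicit (outer, channel, inner) index space instead of assuming the channel is the fastest-varying dimension, so the same routine covers both the NHWC CPU path (outer = N*H*W, inner = 1) and the NCHW NEON path (outer = N, inner = H*W). A minimal NumPy sketch of that indexing; the values, shapes and the helper name prelu are illustrative only, not taken from MACE:

    import numpy as np

    def prelu(x, alpha, outer, channels, inner):
        # Mirrors the kernel's flat index: idx = (i * channels + c) * inner + j
        v = x.reshape(outer, channels, inner)
        a = alpha.reshape(1, channels, 1)   # one slope per channel
        return np.where(v < 0, v * a, v).reshape(-1)

    x = np.array([-7., 7., -6., 6., -5., -5., -4., -4.])
    alpha = np.array([2., 3.])
    print(prelu(x, alpha, outer=4, channels=2, inner=1))  # NHWC view: [-14.  7. -12.  6. -10. -15.  -8. -12.]
    print(prelu(x, alpha, outer=1, channels=2, inner=4))  # NCHW view: [-14.  7. -12.  6. -15. -15. -12. -12.]

The same flat buffer produces different results under the two layouts, which is also why the NEON expectation added in mace/ops/activation_test.cc below differs from the CPU one.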
diff --git a/mace/kernels/slice.h b/mace/kernels/slice.h
index 1bde41e5..58c21e7a 100644
--- a/mace/kernels/slice.h
+++ b/mace/kernels/slice.h
@@ -5,6 +5,7 @@
 #ifndef MACE_KERNELS_SLICE_H_
 #define MACE_KERNELS_SLICE_H_
 
+#include <functional>
 #include <vector>
 
 #include "mace/core/future.h"
@@ -16,20 +17,34 @@
 namespace mace {
 namespace kernels {
 
+struct SliceFunctorBase {
+  explicit SliceFunctorBase(const int32_t axis) : axis_(axis) {}
+
+  int32_t axis_;
+};
+
 template <DeviceType D, typename T>
-struct SliceFunctor {
+struct SliceFunctor : SliceFunctorBase {
+  explicit SliceFunctor(const int32_t axis) : SliceFunctorBase(axis) {}
+
   void operator()(const Tensor *input,
                   const std::vector<Tensor *> &output_list,
                   StatsFuture *future) {
-    const index_t outer_size = input->dim(0) * input->dim(1) * input->dim(2);
-    const index_t input_channels = input->dim(3);
+    const index_t input_channels = input->dim(axis_);
     const size_t outputs_count = output_list.size();
     const index_t output_channels = input_channels / outputs_count;
    std::vector<T *> output_ptrs(output_list.size(), nullptr);
+    std::vector<index_t> output_shape(input->shape());
+    output_shape[axis_] = output_channels;
 
-    std::vector<index_t> output_shape({input->dim(0), input->dim(1),
-                                       input->dim(2), output_channels});
-
+    const index_t outer_size = std::accumulate(output_shape.begin(),
+                                               output_shape.begin() + axis_,
+                                               1,
+                                               std::multiplies<index_t>());
+    const index_t inner_size = std::accumulate(output_shape.begin() + axis_ + 1,
+                                               output_shape.end(),
+                                               1,
+                                               std::multiplies<index_t>());
     for (size_t i= 0; i < outputs_count; ++i) {
      output_list[i]->Resize(output_shape);
      output_ptrs[i] = output_list[i]->mutable_data<T>();
@@ -38,25 +53,27 @@ struct SliceFunctor {
 
 #pragma omp parallel for
    for (int outer_idx = 0; outer_idx < outer_size; ++outer_idx) {
-      int input_idx = outer_idx * input_channels;
-      int output_idx = outer_idx * output_channels;
+      int input_idx = outer_idx * input_channels * inner_size;
+      int output_idx = outer_idx * output_channels * inner_size;
      for (size_t i = 0; i < outputs_count; ++i) {
        if (DataTypeCanUseMemcpy(DataTypeToEnum<T>::v())) {
          memcpy(output_ptrs[i]+output_idx, input_ptr+input_idx,
-                 output_channels * sizeof(T));
+                 output_channels * inner_size * sizeof(T));
        } else {
-          for (index_t k = 0; k < output_channels; ++k) {
+          for (index_t k = 0; k < output_channels * inner_size; ++k) {
            *(output_ptrs[i] + output_idx + k) = *(input_ptr + input_idx + k);
          }
        }
-        input_idx += output_channels;
+        input_idx += output_channels * inner_size;
      }
    }
  }
};

 template <typename T>
-struct SliceFunctor<DeviceType::OPENCL, T> {
+struct SliceFunctor<DeviceType::OPENCL, T> : SliceFunctorBase {
+  explicit SliceFunctor(const int32_t axis) : SliceFunctorBase(axis) {}
+
  void operator()(const Tensor *input,
                  const std::vector<Tensor *> &output_list,
                  StatsFuture *future);
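For reference, the generalized SliceFunctor above splits dimension axis_ into outputs_count equal pieces; per outer index it copies output_channels * inner_size contiguous elements into each output. NumPy's split does the equivalent bookkeeping, which makes for a compact cross-check (the shapes, axis and helper name below are arbitrary examples, not from the tests):

    import numpy as np

    def slice_like_mace(x, num_outputs, axis):
        # Equal split along `axis`, same element order as the kernel's memcpy loop.
        assert x.shape[axis] % num_outputs == 0
        return np.split(x, num_outputs, axis=axis)

    x = np.arange(2 * 4 * 3 * 3, dtype=np.float32).reshape(2, 4, 3, 3)  # an NCHW-shaped example
    parts = slice_like_mace(x, num_outputs=2, axis=1)
    print([p.shape for p in parts])  # [(2, 2, 3, 3), (2, 2, 3, 3)]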
diff --git a/mace/ops/activation_test.cc b/mace/ops/activation_test.cc
index 18bddcc7..034a591f 100644
--- a/mace/ops/activation_test.cc
+++ b/mace/ops/activation_test.cc
@@ -249,14 +249,26 @@ void TestSimplePrelu() {
     net.RunOp(D);
   }
 
-  auto expected = CreateTensor<float>(
-      {2, 2, 2, 2},
-      {-14, 7, -12, 6, -10, -15, -8, -12, -6, 3, -4, 2, -2, -3, 0, 0});
+  if (D == DeviceType::NEON) {
+    auto expected = CreateTensor<float>(
+        {2, 2, 2, 2},
+        {-14, 7, -12, 6, -15, -15, -12, -12, -6, 3, -4, 2, -3, -3, 0, 0});
+    ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
+  } else {
+    auto expected = CreateTensor<float>(
+        {2, 2, 2, 2},
+        {-14, 7, -12, 6, -10, -15, -8, -12, -6, 3, -4, 2, -2, -3, 0, 0});
+    ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
+  }
+}
 
-  ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
+TEST_F(ActivationOpTest, CPUSimplePrelu) {
+  TestSimplePrelu<DeviceType::CPU>();
 }
 
-TEST_F(ActivationOpTest, CPUSimplePrelu) { TestSimplePrelu<DeviceType::CPU>(); }
+TEST_F(ActivationOpTest, NEONSimplePrelu) {
+  TestSimplePrelu<DeviceType::NEON>();
+}
 
 TEST_F(ActivationOpTest, OPENCLSimplePrelu) {
   TestSimplePrelu<DeviceType::OPENCL>();
diff --git a/mace/ops/slice.cc b/mace/ops/slice.cc
index d482b028..e8fe0acc 100644
--- a/mace/ops/slice.cc
+++ b/mace/ops/slice.cc
@@ -24,6 +24,11 @@ void Register_Slice(OperatorRegistry *op_registry) {
                                      .TypeConstraint<half>("T")
                                      .Build(),
                     SliceOp<DeviceType::OPENCL, half>);
+  REGISTER_OPERATOR(op_registry, OpKeyBuilder("Slice")
+                                     .Device(DeviceType::NEON)
+                                     .TypeConstraint<float>("T")
+                                     .Build(),
+                    SliceOp<DeviceType::NEON, float>);
 }
 
 }  // namespace ops
diff --git a/mace/ops/slice.h b/mace/ops/slice.h
index a1a6ad11..22869345 100644
--- a/mace/ops/slice.h
+++ b/mace/ops/slice.h
@@ -17,14 +17,16 @@ template <DeviceType D, typename T>
 class SliceOp : public Operator<D, T> {
  public:
   SliceOp(const OperatorDef &op_def, Workspace *ws)
-      : Operator<D, T>(op_def, ws) {}
+      : Operator<D, T>(op_def, ws),
+        functor_(OperatorBase::GetSingleArgument<int>("axis", 3)) {}
 
   bool Run(StatsFuture *future) override {
     MACE_CHECK(this->OutputSize() >= 2)
         << "There must be at least two outputs for slicing";
     const Tensor *input = this->Input(INPUT);
     const std::vector<Tensor *> output_list = this->Outputs();
-    MACE_CHECK((input->dim(3) % this->OutputSize()) == 0)
+    const int32_t slice_axis = OperatorBase::GetSingleArgument<int>("axis", 3);
+    MACE_CHECK((input->dim(slice_axis) % this->OutputSize()) == 0)
         << "Outputs do not split input equally.";
 
     functor_(input, output_list, future);
diff --git a/mace/ops/slice_test.cc b/mace/ops/slice_test.cc
index bd0244d3..ad507af3 100644
--- a/mace/ops/slice_test.cc
+++ b/mace/ops/slice_test.cc
@@ -16,7 +16,7 @@ namespace test {
 class SliceOpTest : public OpsTestBase {};
 
 template <DeviceType D, typename T>
-void RandomTest(const int num_outputs) {
+void RandomTest(const int num_outputs, const int axis) {
   static unsigned int seed = time(NULL);
   const index_t output_channels = 4 * (1 + rand_r(&seed) % 10);
   const index_t input_channels = num_outputs * output_channels;
@@ -27,7 +27,11 @@
 
   // Construct graph
   OpsTestNet net;
-  std::vector<index_t> input_shape({batch, height, width, input_channels});
+  std::vector<index_t> input_shape;
+  if (axis == 1)
+    input_shape = {batch, input_channels, height, width};
+  else if (axis == 3)
+    input_shape = {batch, height, width, input_channels};
   const index_t input_size = std::accumulate(input_shape.begin(),
                                              input_shape.end(),
                                              1,
@@ -49,7 +53,7 @@
         .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
         .Finalize(net.NewOperatorDef());
   } else {
-    auto builder = OpDefBuilder("Slice", "SliceTest");
+    auto builder = OpDefBuilder("Slice", "SliceTest").AddIntArg("axis", axis);
     builder.Input("Input");
     for (int i = 0; i < num_outputs; ++i) {
       builder = builder.Output(MakeString("Output", i));
@@ -70,9 +74,17 @@
   }
 
   // Check
-  std::vector<index_t> expected_shape({batch, height, width, output_channels});
+  std::vector<index_t> expected_shape;
+  if (axis == 1)
+    expected_shape = {batch, output_channels, height, width};
+  else if (axis == 3)
+    expected_shape = {batch, height, width, output_channels};
   const index_t outer_size = std::accumulate(expected_shape.begin(),
-                                             expected_shape.end() - 1,
+                                             expected_shape.begin() + axis,
+                                             1,
+                                             std::multiplies<index_t>());
+  const index_t inner_size = std::accumulate(expected_shape.begin() + axis + 1,
+                                             expected_shape.end(),
                                              1,
                                              std::multiplies<index_t>());
   const float *input_ptr = input_data.data();
@@ -83,8 +95,9 @@
     Tensor::MappingGuard output_mapper(output);
     output_ptr = output->data<float>();
     for (int outer_idx = 0; outer_idx < outer_size; ++outer_idx) {
-      const int idx = outer_idx * input_channels + i * output_channels;
-      for (int j = 0; j < output_channels; ++j) {
+      const int idx = (outer_idx * input_channels + i * output_channels)
+          * inner_size;
+      for (int j = 0; j < output_channels * inner_size; ++j) {
        ASSERT_NEAR(*output_ptr++, input_ptr[idx + j], 1e-2)
            << "with output " << i << " index " << idx + j;
      }
@@ -93,21 +106,27 @@
 }
 
 TEST_F(SliceOpTest, CPU) {
-  RandomTest<DeviceType::CPU, float>(2);
-  RandomTest<DeviceType::CPU, float>(4);
-  RandomTest<DeviceType::CPU, float>(11);
+  RandomTest<DeviceType::CPU, float>(2, 3);
+  RandomTest<DeviceType::CPU, float>(4, 3);
+  RandomTest<DeviceType::CPU, float>(11, 3);
+}
+
+TEST_F(SliceOpTest, CPUAxis1) {
+  RandomTest<DeviceType::CPU, float>(2, 1);
+  RandomTest<DeviceType::CPU, float>(4, 1);
+  RandomTest<DeviceType::CPU, float>(11, 1);
 }
 
 TEST_F(SliceOpTest, OPENCLFloat) {
-  RandomTest<DeviceType::OPENCL, float>(2);
-  RandomTest<DeviceType::OPENCL, float>(4);
-  RandomTest<DeviceType::OPENCL, float>(11);
+  RandomTest<DeviceType::OPENCL, float>(2, 3);
+  RandomTest<DeviceType::OPENCL, float>(4, 3);
+  RandomTest<DeviceType::OPENCL, float>(11, 3);
 }
 
 TEST_F(SliceOpTest, OPENCLHalf) {
-  RandomTest<DeviceType::OPENCL, half>(2);
-  RandomTest<DeviceType::OPENCL, half>(4);
-  RandomTest<DeviceType::OPENCL, half>(11);
+  RandomTest<DeviceType::OPENCL, half>(2, 3);
+  RandomTest<DeviceType::OPENCL, half>(4, 3);
+  RandomTest<DeviceType::OPENCL, half>(11, 3);
 }
 
 }  // namespace test
diff --git a/mace/python/tools/caffe_converter_lib.py b/mace/python/tools/caffe_converter_lib.py
index 166bb6ec..7a94d280 100644
--- a/mace/python/tools/caffe_converter_lib.py
+++ b/mace/python/tools/caffe_converter_lib.py
@@ -68,14 +68,26 @@ def BlobToNPArray(blob):
 
 class Shapes(object):
   @staticmethod
-  def conv_pool_shape(input_shape, filter_shape, paddings, strides, dilations, round_func):
+  def conv_pool_shape(input_shape, filter_shape, paddings, strides, dilations, round_func, input_format='NHWC'):
    output_shape = np.zeros_like(input_shape)
    output_shape[0] = input_shape[0]
-    output_shape[1] = int(round_func((input_shape[1] + paddings[0] - filter_shape[0]
-                                      - (filter_shape[0] - 1) * (dilations[0] - 1)) / float(strides[0]))) + 1
-    output_shape[2] = int(round_func((input_shape[2] + paddings[1] - filter_shape[1]
-                                      - (filter_shape[1] - 1) * (dilations[1] - 1)) / float(strides[1]))) + 1
-    output_shape[3] = filter_shape[2]
+    if input_format == 'NHWC':
+      # input format: NHWC, filter format: HWOI
+      output_shape[1] = int(round_func((input_shape[1] + paddings[0] - filter_shape[0]
+                                        - (filter_shape[0] - 1) * (dilations[0] - 1)) / float(strides[0]))) + 1
+      output_shape[2] = int(round_func((input_shape[2] + paddings[1] - filter_shape[1]
+                                        - (filter_shape[1] - 1) * (dilations[1] - 1)) / float(strides[1]))) + 1
+      output_shape[3] = filter_shape[2]
+    elif input_format == 'NCHW':
+      # input format: NCHW, filter format: OIHW
+      output_shape[1] = filter_shape[0]
+      output_shape[2] = int(round_func((input_shape[2] + paddings[0] - filter_shape[2]
+                                        - (filter_shape[2] - 1) * (dilations[0] - 1)) / float(strides[0]))) + 1
+      output_shape[3] = int(round_func((input_shape[3] + paddings[1] - filter_shape[3]
+                                        - (filter_shape[3] - 1) * (dilations[1] - 1)) / float(strides[1]))) + 1
+    else:
+      raise Exception("format %s is not supported" % input_format)
+
    return output_shape

  @staticmethod
@@ -93,8 +105,13 @@ class Shapes(object):
    return output_shape

  @staticmethod
-  def slice_shape(input_shape, num_output):
-    return [input_shape[0], input_shape[1], input_shape[2], input_shape[3]/num_output]
+  def slice_shape(input_shape, num_output, input_format='NHWC'):
+    if input_format == 'NHWC':
+      return [input_shape[0], input_shape[1], input_shape[2], input_shape[3]/num_output]
+    elif input_format == 'NCHW':
+      return [input_shape[0], input_shape[1]/num_output, input_shape[2], input_shape[3]]
+    else:
+      raise Exception("format %s is not supported" % input_format)

 # outputs' name is [op.name + '_' + #]
 class CaffeConverter(object):
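The conv_pool_shape change above keeps the usual convolution output-size arithmetic, out = round((in + pad_total - effective_kernel) / stride) + 1 with effective_kernel = k + (k - 1) * (dilation - 1), and only moves which output slots hold H, W and the channel count for NCHW/OIHW versus NHWC/HWOI. A stand-alone check of that formula, with made-up sizes and assuming paddings[] holds the total padding per dimension as the expression implies:

    import math

    def conv_out_dim(in_dim, pad_total, kernel, stride, dilation, round_func=math.floor):
        effective_k = kernel + (kernel - 1) * (dilation - 1)  # dilation widens the kernel extent
        return int(round_func((in_dim + pad_total - effective_k) / float(stride))) + 1

    # 224 input, 3x3 kernel, stride 2, one pixel of padding on each side:
    print(conv_out_dim(224, 2, 3, 2, 1))  # 112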
@@ -168,7 +185,10 @@
      arg.i = self.dt
    data_format_arg = op_def.arg.add()
    data_format_arg.name = 'data_format'
-    data_format_arg.s = 'NHWC'
+    if self.device == 'neon':
+      data_format_arg.s = 'NCHW'
+    else:
+      data_format_arg.s = 'NHWC'
    op_def.name = op.name
    op_def.type = mace_type
    op_def.input.extend([name+':0' for name in self.inputs_map[op.name]])
@@ -342,7 +362,11 @@

    # Add filter
    weight_tensor_name = op.name + '_weight:0'
-    weight_data = op.data[0].transpose((2, 3, 0, 1))
+    if self.device == 'neon':
+      weight_data = op.data[0]
+    else:
+      # OIHW -> HWOI
+      weight_data = op.data[0].transpose((2, 3, 0, 1))
    self.add_tensor(weight_tensor_name, weight_data)

    if self.device == 'gpu':
@@ -376,10 +400,11 @@
      final_op = op
      self.resolved_ops.add(op.name)

+    input_format = 'NCHW' if self.device == 'neon' else 'NHWC'
    output_shape = Shapes.conv_pool_shape(op.get_single_parent().output_shape_map[op.layer.bottom[0]],
                                          weight_data.shape, paddings, strides, dilations,
-                                          math.floor)
+                                          math.floor, input_format)
    op.output_shape_map[op.layer.top[0]] = output_shape

    if len(self.ops_map[final_op.name].children) == 1 \
@@ -399,9 +424,13 @@
    self.net_def.op.extend([op_def])

  def check_winograd_conv(self, op):
+    # TODO: support winograd conv on neon
+    if self.device == 'neon':
+      return False
    param = op.layer.convolution_param
    filter_shape = np.asarray(op.data[0].shape)
-    filter_shape = filter_shape[[2, 3, 0, 1]]
+    if self.device != 'neon':
+      filter_shape = filter_shape[[2, 3, 0, 1]]  # OIHW -> HWOI
    paddings, strides, _ = self.add_stride_pad_kernel_arg(param, None)
    dilations = [1, 1]
    if len(param.dilation) == 1:
@@ -411,17 +440,21 @@
      dilations = [param.dilation[0], param.dilation[0]]
    elif len(param.dilation) == 2:
      dilations = [param.dilation[0], param.dilation[1]]
+    input_format = 'NCHW' if self.device == 'neon' else 'NHWC'
    output_shape = Shapes.conv_pool_shape(
      op.get_single_parent().output_shape_map[op.layer.bottom[0]],
-      filter_shape, paddings, strides, dilations, math.floor)
+      filter_shape, paddings, strides, dilations, math.floor, input_format)
    width = output_shape[0] * ((output_shape[1] + 1)/2) * ((output_shape[2]+1)/2)
-    return self.winograd and self.device == 'gpu' and \
-      filter_shape[0] == 3 and (filter_shape[0] == filter_shape[1]) and \
-      dilations[0] == 1 and (dilations[0] == dilations[1]) and \
-      (strides[0] == 1) and (strides[0] == strides[1]) and \
-      (16 * filter_shape[2] < OPENCL_IMAGE_MAX_SIZE) and \
-      (16 * filter_shape[3] < OPENCL_IMAGE_MAX_SIZE) and \
-      (width < OPENCL_IMAGE_MAX_SIZE)
+    if self.winograd and dilations[0] == 1 and (dilations[0] == dilations[1]) and \
+        (strides[0] == 1) and (strides[0] == strides[1]):
+      if self.device == 'gpu':
+        return filter_shape[0] == 3 and (filter_shape[0] == filter_shape[1]) and \
+          (16 * filter_shape[2] < OPENCL_IMAGE_MAX_SIZE) and \
+          (16 * filter_shape[3] < OPENCL_IMAGE_MAX_SIZE) and \
+          (width < OPENCL_IMAGE_MAX_SIZE)
+      elif self.device == 'neon':
+        return filter_shape[2] == 3 and (filter_shape[2] == filter_shape[3])
+    return False

  def convert_winograd_conv(self, op):
    # Add filter
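As the convert_conv2d and check_winograd_conv hunks above show, the NEON path keeps Caffe's native OIHW filter layout and only the GPU/NHWC path applies the (2, 3, 0, 1) permutation to get HWOI. A quick NumPy sanity check of that permutation; the shape below is arbitrary:

    import numpy as np

    oihw = np.random.rand(32, 16, 3, 3)          # (out, in, kernel_h, kernel_w), Caffe's layout
    hwoi = oihw.transpose((2, 3, 0, 1))          # the converter's OIHW -> HWOI permutation
    print(hwoi.shape)                            # (3, 3, 32, 16)
    assert hwoi[1, 2, 5, 7] == oihw[5, 7, 1, 2]  # same element, reindexed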
@@ -435,11 +468,13 @@
    paddings, strides, _ = self.add_stride_pad_kernel_arg(param, None)

    filter_shape = np.asarray(op.data[0].shape)
-    filter_shape = filter_shape[[2, 3, 0, 1]]
+    if self.device != 'neon':
+      filter_shape = filter_shape[[2, 3, 0, 1]]  # OIHW -> HWOI

+    input_format = 'NCHW' if self.device == 'neon' else 'NHWC'
    output_shape = Shapes.conv_pool_shape(
      op.get_single_parent().output_shape_map[op.layer.bottom[0]],
-      filter_shape, paddings, strides, [1, 1], math.floor)
+      filter_shape, paddings, strides, [1, 1], math.floor, input_format)

    # Input transform
    wt_op = mace_pb2.OperatorDef()
@@ -455,8 +490,12 @@
    wt_output_name = wt_op.name + ":0"
    wt_op.output.extend([wt_output_name])
    wt_output_shape = mace_pb2.OutputShape()
-    wt_output_width = output_shape[0] * ((output_shape[1] + 1)/2) * ((output_shape[2]+1)/2)
-    wt_output_shape.dims.extend([16, filter_shape[3], wt_output_width, 1])
+    if self.device != 'neon':
+      wt_output_width = output_shape[0] * ((output_shape[1] + 1)/2) * ((output_shape[2]+1)/2)
+      wt_output_shape.dims.extend([16, filter_shape[3], wt_output_width, 1])
+    else:
+      wt_output_width = output_shape[0] * ((output_shape[2] + 1)/2) * ((output_shape[3]+1)/2)
+      wt_output_shape.dims.extend([16, filter_shape[1], wt_output_width, 1])
    wt_op.output_shape.extend([wt_output_shape])

    # MatMul
@@ -470,7 +509,10 @@
    matmul_output_name = matmul_op.name + ":0"
    matmul_op.output.extend([matmul_output_name])
    matmul_output_shape = mace_pb2.OutputShape()
-    matmul_output_shape.dims.extend([16, filter_shape[2], wt_output_width, 1])
+    if self.device != 'neon':
+      matmul_output_shape.dims.extend([16, filter_shape[2], wt_output_width, 1])
+    else:
+      matmul_output_shape.dims.extend([16, filter_shape[0], wt_output_width, 1])
    matmul_op.output_shape.extend([matmul_output_shape])

    # Inverse transform
@@ -483,10 +525,10 @@
    batch_arg.i = output_shape[0]
    height_arg = iwt_op.arg.add()
    height_arg.name = 'height'
-    height_arg.i = output_shape[1]
+    height_arg.i = output_shape[1] if self.device != 'neon' else output_shape[2]
    width_arg = iwt_op.arg.add()
    width_arg.name = 'width'
-    width_arg.i = output_shape[2]
+    width_arg.i = output_shape[2] if self.device != 'neon' else output_shape[3]
    iwt_op.name = op.name + '_inverse_transform'
    iwt_op.type = 'WinogradInverseTransform'
    iwt_op.input.extend([matmul_output_name])
@@ -589,8 +631,9 @@
    weight_data = op.data[0].reshape(-1, op.data[0].shape[-1])
    assert weight_data.shape[1] == (input_shape[1] * input_shape[2] * input_shape[3])
-    weight_data = weight_data.reshape(-1, input_shape[3], input_shape[1], input_shape[2])
-    weight_data = weight_data.transpose((0, 2, 3, 1)).reshape(weight_data.shape[0], -1)
+    if self.device != 'neon':
+      weight_data = weight_data.reshape(-1, input_shape[3], input_shape[1], input_shape[2])
+      weight_data = weight_data.transpose((0, 2, 3, 1)).reshape(weight_data.shape[0], -1)
    self.add_tensor(weight_tensor_name, weight_data)
    if self.device == 'gpu':
      if (weight_data.shape[0] + 3) / 4 > OPENCL_IMAGE_MAX_SIZE \
@@ -665,9 +708,12 @@
      kernel_arg.name = 'kernels'
      kernel_arg.ints.extend(kernels)

-    filter_shape = [kernels[0], kernels[1], input_shape[3], input_shape[3]]
+    filter_shape = [kernels[0], kernels[1], input_shape[3], input_shape[3]] \
+      if self.device != 'neon' else \
+      [input_shape[1], input_shape[1], kernels[0], kernels[1]]
+    input_format = 'NCHW' if self.device == 'neon' else 'NHWC'
    output_shape = Shapes.conv_pool_shape(input_shape, filter_shape,
-                                          paddings, strides, [1, 1], math.ceil)
+                                          paddings, strides, [1, 1], math.ceil, input_format)
    op.output_shape_map[op.layer.top[0]] = output_shape

    op_def.output.extend([op.name + ':0'])
@@ -720,7 +766,7 @@
    op_def = self.CommonConvert(op, 'Concat')
    axis_arg = op_def.arg.add()
    axis_arg.name = 'axis'
-    axis_arg.i = 3
+    axis_arg.i = 3 if self.device != 'neon' else 1
    try:
      if op.layer.concat_param.HasFeild('axis'):
        axis_arg.i = op.concat_param.axis
@@ -766,13 +812,19 @@
    if len(param.slice_point) > 0:
      raise Exception('Mace do not support slice with slice_point')

+    axis_arg = op_def.arg.add()
+    axis_arg.name = 'axis'
+    axis_arg.i = 3 if self.device != 'neon' else 1
+
    input_shape = op.parents[0].output_shape_map[op.layer.bottom[0]]
    num_outputs = len(op.layer.top)
-    if (input_shape[3] % num_outputs) != 0 or \
-        (self.device == 'gpu' and ((input_shape[3] / num_outputs) % 4 != 0)) :
+    input_channels = input_shape[axis_arg.i]
+    if (input_channels % num_outputs) != 0 or \
+        (self.device == 'gpu' and ((input_channels / num_outputs) % 4 != 0)):
      raise Exception('Mace do not support slice with input shape '
                      + str(input_shape) + ' and number of output ' + str(num_outputs))
-    output_shape = Shapes.slice_shape(input_shape, num_outputs)
+    input_format = 'NCHW' if self.device == 'neon' else 'NHWC'
+    output_shape = Shapes.slice_shape(input_shape, num_outputs, input_format)
    for i in range(len(op.layer.top)):
      op.output_shape_map[op.layer.top[i]] = output_shape
    self.add_output_shape(op_def, output_shape)
@@ -790,10 +842,15 @@
    self.resolved_ops.add(op.name)

  def convert_reshape(self, op):
-    op_def = self.CommonConvert(op, 'ReOrganize')
+    if self.device == 'neon':
+      op_def = self.CommonConvert(op, 'Reshape')
+    else:
+      op_def = self.CommonConvert(op, 'ReOrganize')
    input_shape = op.parents[0].output_shape_map[op.layer.bottom[0]]
    output_shape = input_shape
-    shape_param = np.asarray(op.layer.reshape_param.shape.dim)[[0, 3, 2, 1]]
+    shape_param = np.asarray(op.layer.reshape_param.shape.dim)
+    if self.device != 'neon':
+      shape_param = shape_param[[0, 3, 1, 2]]
    for i in range(len(shape_param)):
      if shape_param[i] != 0:
        output_shape[i] = shape_param[i]
@@ -867,15 +924,50 @@
    assert len(input_nodes) == len(input_shapes)
    for i in range(len(input_nodes)):
      input_op = self.ops_map[input_nodes[i]]
+      input_shape = input_shapes[i] if self.device != 'neon' else \
+        [input_shapes[i][0], input_shapes[i][3], input_shapes[i][1], input_shapes[i][2]]
      if input_op.layer is not None:
-        input_op.output_shape_map[input_op.layer.top[0]] = input_shapes[i]
+        input_op.output_shape_map[input_op.layer.top[0]] = input_shape
      else:
-        input_op.output_shape_map[input_op.name] = input_shapes[i]
+        input_op.output_shape_map[input_op.name] = input_shape
+
+  def add_neon_input_transform(self, names):
+    for name in names:
+      new_input_name = MACE_INPUT_NODE_NAME + '_' + name + ":0"
+      op_def = self.net_def.op.add()
+      op_def.name = name
+      op_def.type = 'Transpose'
+      op_def.input.extend([new_input_name])
+      op_def.output.extend([name+':0'])
+
+      dims_arg = op_def.arg.add()
+      dims_arg.name = 'dims'
+      dims_arg.ints.extend([0, 3, 1, 2])  # NHWC -> NCHW
+
+      arg = op_def.arg.add()
+      arg.name = 'T'
+      arg.i = self.dt
+
+  def add_neon_output_transform(self, names):
+    for name in names:
+      output_name = MACE_OUTPUT_NODE_NAME + '_' + name + ":0"
+      op_def = self.net_def.op.add()
+      op_def.name = output_name[:-2]
+      op_def.type = 'Transpose'
+      op_def.input.extend([name+':0'])
+      op_def.output.extend([output_name])
+
+      dims_arg = op_def.arg.add()
+      dims_arg.name = 'dims'
+      dims_arg.ints.extend([0, 2, 3, 1])  # NCHW -> NHWC

  def convert(self, input_nodes, input_shapes, output_nodes):
    if self.device == 'gpu':
      self.add_input_transform(input_nodes)

+    if self.device == 'neon':
+      self.add_neon_input_transform(input_nodes)
+
    assert self.ops[0].type == 'Input'
    self.add_input_op_shape(input_nodes, input_shapes)
@@ -924,6 +1016,9 @@
    if self.device == 'cpu':
      self.replace_in_out_name(input_nodes, output_nodes)

+    if self.device == 'neon':
+      self.add_neon_output_transform(output_nodes)
+
    for op in self.ops:
      if op.name not in self.resolved_ops:
        print 'Unresolve Op: %s with type %s' % (op.name, op.type)
--
GitLab
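End-to-end, the NEON path above brackets the converted graph with two Transpose ops: NHWC -> NCHW on input (dims [0, 3, 1, 2]) and NCHW -> NHWC on output (dims [0, 2, 3, 1]). A small NumPy check that those two permutations really are inverses of each other; the input shape is illustrative:

    import numpy as np

    x = np.random.rand(1, 224, 224, 3)     # NHWC input
    nchw = x.transpose((0, 3, 1, 2))       # what the inserted input Transpose does
    back = nchw.transpose((0, 2, 3, 1))    # what the inserted output Transpose does
    print(nchw.shape, back.shape)          # (1, 3, 224, 224) (1, 224, 224, 3)
    assert np.array_equal(x, back)         # the permutations round-trip exactly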