Commit dd4b992a authored by Liangliang He

Merge branch 'master' into 'master'

Implement fc and eltwise for neon

See merge request !347
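For orientation, the fully-connected layer added in this change computes, for each batch item, output = weight * input + bias, with weight of shape (output_size x input_size) and the input flattened to input_size. Below is a minimal standalone sketch of that reference semantics in plain C++ (names and layout are illustrative only, independent of MACE's Tensor and Gemm types):

#include <cstddef>

// Naive reference of the fully-connected computation that the NEON
// functor below reproduces via Gemm: for every batch item,
//   output[o] = sum_i weight[o][i] * input[i] + bias[o].
// Names and memory layout here are illustrative, not MACE APIs.
void FullyConnectedRef(const float *input,   // [batch, input_size]
                       const float *weight,  // [output_size, input_size]
                       const float *bias,    // [output_size] or nullptr
                       float *output,        // [batch, output_size]
                       std::size_t batch,
                       std::size_t input_size,
                       std::size_t output_size) {
  for (std::size_t b = 0; b < batch; ++b) {
    const float *in = input + b * input_size;
    float *out = output + b * output_size;
    for (std::size_t o = 0; o < output_size; ++o) {
      float acc = bias != nullptr ? bias[o] : 0.0f;
      for (std::size_t i = 0; i < input_size; ++i) {
        acc += weight[o * input_size + i] * in[i];
      }
      out[o] = acc;
    }
  }
}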
//
// Copyright (c) 2018 XiaoMi All rights reserved.
//
#include "mace/kernels/fully_connected.h"
#include "mace/kernels/gemm.h"

namespace mace {
namespace kernels {

void FullyConnectedFunctor<DeviceType::NEON,
                           float>::operator()(const Tensor *input,
                                              const Tensor *weight,
                                              const Tensor *bias,
                                              Tensor *output,
                                              StatsFuture *future) {
  std::vector<index_t> output_shape = {input->dim(0), weight->dim(0), 1, 1};
  output->Resize(output_shape);
  const index_t N = output->dim(0);
  const index_t input_size = weight->dim(1);
  const index_t output_size = weight->dim(0);
  const float *input_ptr = input->data<float>();
  const float *weight_ptr = weight->data<float>();
  const float *bias_ptr = bias == nullptr ? nullptr : bias->data<float>();
  float *output_ptr = output->mutable_data<float>();

  for (int i = 0; i < N; ++i) {
    // Each batch item is an (output_size x input_size) * (input_size x 1)
    // matrix-vector product, so advance the input/output pointers per item.
    Gemm(weight_ptr, input_ptr + i * input_size, 1, output_size, input_size, 1,
         output_ptr + i * output_size);
    if (bias_ptr != nullptr) {
      for (int j = 0; j < output_size; ++j) {
        output_ptr[i * output_size + j] += bias_ptr[j];
      }
    }
  }

  DoActivation(output_ptr, output_ptr, output->size(), activation_,
               relux_max_limit_);
}

}  // namespace kernels
}  // namespace mace
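The functor above delegates the per-item matrix-vector product to the Gemm helper from mace/kernels/gemm.h. Inferring only from the call site, Gemm(A, B, batch, height, K, width, C) appears to compute a batched C = A * B with A of shape (height x K) and B of shape (K x width); the sketch below spells out that assumed contract (parameter names are hypothetical, not taken from the header, and this is not the NEON-optimized implementation):

#include <cstddef>

// Assumed contract of Gemm(A, B, batch, height, K, width, C), inferred from
// the call Gemm(weight, input, 1, output_size, input_size, 1, output):
// for each b in [0, batch), C_b (height x width) = A_b (height x K) * B_b (K x width).
void GemmRef(const float *A, const float *B, std::size_t batch,
             std::size_t height, std::size_t K, std::size_t width, float *C) {
  for (std::size_t b = 0; b < batch; ++b) {
    const float *a = A + b * height * K;
    const float *x = B + b * K * width;
    float *c = C + b * height * width;
    for (std::size_t h = 0; h < height; ++h) {
      for (std::size_t w = 0; w < width; ++w) {
        float acc = 0.0f;
        for (std::size_t k = 0; k < K; ++k) {
          acc += a[h * K + k] * x[k * width + w];
        }
        c[h * width + w] = acc;
      }
    }
  }
}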
...@@ -76,6 +76,20 @@ struct FullyConnectedFunctor : FullyConnectedBase {
  }
};
template <>
struct FullyConnectedFunctor<DeviceType::NEON, float> : FullyConnectedBase {
  FullyConnectedFunctor(const BufferType weight_type,
                        const ActivationType activation,
                        const float relux_max_limit)
      : FullyConnectedBase(weight_type, activation, relux_max_limit) {}

  void operator()(const Tensor *input,
                  const Tensor *weight,
                  const Tensor *bias,
                  Tensor *output,
                  StatsFuture *future);
};
template <typename T>
struct FullyConnectedFunctor<DeviceType::OPENCL, T> : FullyConnectedBase {
  FullyConnectedFunctor(const BufferType weight_type,
...
...@@ -25,6 +25,11 @@ void Register_Eltwise(OperatorRegistry *op_registry) {
                        .TypeConstraint<half>("T")
                        .Build(),
                    EltwiseOp<DeviceType::OPENCL, half>);
  REGISTER_OPERATOR(op_registry, OpKeyBuilder("Eltwise")
                        .Device(DeviceType::NEON)
                        .TypeConstraint<float>("T")
                        .Build(),
                    EltwiseOp<DeviceType::NEON, float>);
}

}  // namespace ops
...
...@@ -25,6 +25,12 @@ void Register_FullyConnected(OperatorRegistry *op_registry) {
                        .TypeConstraint<half>("T")
                        .Build(),
                    FullyConnectedOp<DeviceType::OPENCL, half>);
  REGISTER_OPERATOR(op_registry, OpKeyBuilder("FC")
                        .Device(DeviceType::NEON)
                        .TypeConstraint<float>("T")
                        .Build(),
                    FullyConnectedOp<DeviceType::NEON, float>);
}

}  // namespace ops
...
...@@ -13,7 +13,7 @@ namespace test {
class FullyConnectedOpTest : public OpsTestBase {};

template<DeviceType D>
void Simple(const std::vector<index_t> &input_shape,
            const std::vector<float> &input_value,
            const std::vector<index_t> &weight_shape,
...@@ -107,7 +107,7 @@ TEST_F(FullyConnectedOpTest, SimpleGPUWithBatch) {
         {1, 2, 3, 4}, {1}, {2}, {2, 1, 1, 1}, {32, 72});
}

template<typename T>
void Complex(const index_t batch,
             const index_t height,
             const index_t width,
...@@ -189,7 +189,7 @@ TEST_F(FullyConnectedOpTest, OPENCLHalfUnAlignedWithBatch) {
  Complex<half>(31, 21, 11, 23, 103);
}

template<typename T>
void TestWXFormat(const index_t batch,
                  const index_t height,
                  const index_t width,
...@@ -266,6 +266,57 @@ TEST_F(FullyConnectedOpTest, OPENCLHalfWidthFormatAligned) {
  TestWXFormat<half>(1, 16, 32, 32, 32);
}
void FullyConnectedTestNEON(const index_t batch,
                            const index_t height,
                            const index_t width,
                            const index_t channels,
                            const index_t out_channel) {
  srand(time(NULL));

  // Construct graph
  OpsTestNet net;
  OpDefBuilder("FC", "FullyConnectedTest")
      .Input("Input")
      .Input("Weight")
      .Input("Bias")
      .Output("Output")
      .Finalize(net.NewOperatorDef());

  // Add input data
  net.AddRandomInput<DeviceType::CPU, float>(
      "Input", {batch, height, width, channels});
  net.AddRandomInput<DeviceType::CPU, float>(
      "Weight", {out_channel, height * width * channels});
  net.AddRandomInput<DeviceType::CPU, float>("Bias", {out_channel});

  // Run on CPU as the reference
  net.RunOp();

  // Run the same graph on NEON
  OpDefBuilder("FC", "FullyConnectedTest")
      .Input("Input")
      .Input("Weight")
      .Input("Bias")
      .Output("OutputNeon")
      .Finalize(net.NewOperatorDef());

  net.RunOp(DeviceType::NEON);

  // Convert the NHWC CPU output to NCHW before comparing with the NEON result
  net.FillNHWCInputToNCHWInput<DeviceType::CPU, float>("OutputExpected",
                                                       "Output");

  ExpectTensorNear<float>(*net.GetOutput("OutputExpected"),
                          *net.GetOutput("OutputNeon"),
                          0.001);
}

TEST_F(FullyConnectedOpTest, TestNEON) {
  FullyConnectedTestNEON(1, 7, 7, 32, 16);
  FullyConnectedTestNEON(1, 7, 7, 512, 128);
  FullyConnectedTestNEON(1, 1, 1, 2048, 1024);
}
}  // namespace test
}  // namespace ops
}  // namespace mace