From 94dbdeaf929350c65bc66dc382aed74fb4f3b18f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=9D=8E=E5=AF=85?= Date: Wed, 6 Sep 2017 20:48:37 +0800 Subject: [PATCH] Refactor op --- mace/kernels/addn.h | 27 +++++++----------- mace/kernels/benchmark/addn_benchmark.cc | 27 ++++++------------ mace/kernels/benchmark/relu_benchmark.cc | 16 +++++------ mace/kernels/neon/addn_neon.cc | 23 +++++---------- mace/kernels/neon/addn_neon.h | 19 ------------- mace/kernels/neon/relu_neon.cc | 16 +++++------ mace/kernels/neon/relu_neon.h | 19 ------------- mace/kernels/relu.h | 17 +++++------ mace/kernels/test/addn_neon_test.cc | 36 ++++++++---------------- mace/kernels/test/relu_neon_test.cc | 23 ++++++--------- mace/ops/addn.cc | 10 ------- mace/ops/addn.h | 18 ++++++++++-- mace/ops/relu.cc | 11 -------- mace/ops/relu.h | 14 +++++++-- 14 files changed, 93 insertions(+), 183 deletions(-) delete mode 100644 mace/kernels/neon/addn_neon.h delete mode 100644 mace/kernels/neon/relu_neon.h diff --git a/mace/kernels/addn.h b/mace/kernels/addn.h index 30648eb8..af2e3542 100644 --- a/mace/kernels/addn.h +++ b/mace/kernels/addn.h @@ -10,25 +10,18 @@ namespace mace { namespace kernels { -template -void AddNFuntion(const vector& input_tensor, Tensor *output_tensor) { - int n = input_tensor.size(); - MACE_CHECK(n > 1); - MACE_CHECK_NOTNULL(input_tensor[0]); - int64_t size = input_tensor[0]->size(); - vector inputs(n); - for (int i = 0; i < n; ++i) { - inputs[i] = input_tensor[i]->data(); - } - output_tensor->ResizeLike(input_tensor[0]); - T* output = output_tensor->mutable_data(); - - for (int i = 0; i < n; ++i) { - for (int64_t j = 0; j < size; ++j) { - output[j] += inputs[i][j]; +template +struct AddNFunctor { + void operator()(const vector& inputs, + T *output, index_t size) { + int n = inputs.size(); + for (int i = 0; i < n; ++i) { + for (index_t j = 0; j < size; ++j) { + output[j] += inputs[i][j]; + } } } -} +}; } // namespace kernels } // namespace mace diff --git a/mace/kernels/benchmark/addn_benchmark.cc b/mace/kernels/benchmark/addn_benchmark.cc index 4cec0270..e735965a 100644 --- a/mace/kernels/benchmark/addn_benchmark.cc +++ b/mace/kernels/benchmark/addn_benchmark.cc @@ -5,7 +5,6 @@ #include "mace/core/testing/test_benchmark.h" #include "mace/core/tensor.h" #include "mace/kernels/addn.h" -#include "mace/kernels/neon/addn_neon.h" using namespace mace; using namespace mace::kernels; @@ -19,32 +18,24 @@ static void AddNBenchmark(int iters, int n, int type) { std::mt19937 gen(rd()); std::normal_distribution nd(0, 1); - Tensor input_tensor1(cpu_allocator(), DataType::DT_FLOAT); - input_tensor1.Resize({n}); - Tensor input_tensor2(cpu_allocator(), DataType::DT_FLOAT); - input_tensor2.Resize({n}); - Tensor input_tensor3(cpu_allocator(), DataType::DT_FLOAT); - input_tensor3.Resize({n}); - vector input_tensors {&input_tensor1, - &input_tensor2, - &input_tensor3}; - Tensor output_tensor(cpu_allocator(), DataType::DT_FLOAT); - output_tensor.ResizeLike(input_tensor1); - float *input1 = input_tensor1.mutable_data(); - float *input2 = input_tensor2.mutable_data(); - float *input3 = input_tensor3.mutable_data(); - float *output = output_tensor.mutable_data(); + vector input1(n); + vector input2(n); + vector input3(n); + vector output(n); for (int64_t i = 0; i < n; ++i) { input1[i] = nd(gen); input2[i] = nd(gen); input3[i] = nd(gen); } + vector inputs { input1.data(), input2.data(), input3.data() }; if (type == DeviceType::CPU) { - AddNFuntion(input_tensors, &output_tensor); + AddNFunctor addn_functor; + addn_functor(inputs, &output[0], n); } else if (type == DeviceType::NEON) { - NeonAddNFuntion_float(input_tensors, &output_tensor); + AddNFunctor neon_addn_functor; + neon_addn_functor(inputs, &output[0], n); } } diff --git a/mace/kernels/benchmark/relu_benchmark.cc b/mace/kernels/benchmark/relu_benchmark.cc index 86858681..c384e882 100644 --- a/mace/kernels/benchmark/relu_benchmark.cc +++ b/mace/kernels/benchmark/relu_benchmark.cc @@ -5,7 +5,6 @@ #include "mace/core/testing/test_benchmark.h" #include "mace/core/tensor.h" #include "mace/kernels/relu.h" -#include "mace/kernels/neon/relu_neon.h" using namespace mace; using namespace mace::kernels; @@ -19,20 +18,19 @@ static void ReluBenchmark(int iters, int n, int type) { std::mt19937 gen(rd()); std::normal_distribution nd(0, 1); - Tensor input_tensor(cpu_allocator(), DataType::DT_FLOAT); - input_tensor.Resize({n}); - Tensor output_tensor(cpu_allocator(), DataType::DT_FLOAT); - output_tensor.ResizeLike(input_tensor); - float *input = input_tensor.mutable_data(); - float *output = output_tensor.mutable_data(); + vector input(n); + vector output(n); + for (int64_t i = 0; i < n; ++i) { input[i] = nd(gen); } if (type == DeviceType::CPU) { - ReluFuntion(&input_tensor, &output_tensor); + ReluFunctor relu_functor; + relu_functor(&input[0], &output[0], n); } else if (type == DeviceType::NEON) { - NeonReluFuntion_float(&input_tensor, &output_tensor); + ReluFunctor neon_relu_functor; + neon_relu_functor(&input[0], &output[0], n); } } diff --git a/mace/kernels/neon/addn_neon.cc b/mace/kernels/neon/addn_neon.cc index ad6f06e8..77a7c5c0 100644 --- a/mace/kernels/neon/addn_neon.cc +++ b/mace/kernels/neon/addn_neon.cc @@ -3,25 +3,16 @@ // #include -#include "mace/kernels/neon/addn_neon.h" -#include "mace/core/common.h" +#include "mace/kernels/addn.h" namespace mace { namespace kernels { -void NeonAddNFuntion_float(const vector &input_tensor, - Tensor *output_tensor) { - int n = input_tensor.size(); - MACE_CHECK(n > 1); - MACE_CHECK_NOTNULL(input_tensor[0]); - int64_t size = input_tensor[0]->size(); - output_tensor->ResizeLike(input_tensor[0]); - float *output = output_tensor->mutable_data(); - vector inputs(n); - for (int i = 0; i < n; ++i) { - inputs[i] = input_tensor[i]->data(); - } - +template <> +void AddNFunctor::operator()(const vector& inputs, + float *output, + index_t size) { + int n = inputs.size(); int64_t cost = size * n; int64_t groups = 1; if (cost > kCostPerGroup) { @@ -53,7 +44,7 @@ void NeonAddNFuntion_float(const vector &input_tensor, } } } -} +}; } // namespace kernels } // namespace mace \ No newline at end of file diff --git a/mace/kernels/neon/addn_neon.h b/mace/kernels/neon/addn_neon.h deleted file mode 100644 index aa987d63..00000000 --- a/mace/kernels/neon/addn_neon.h +++ /dev/null @@ -1,19 +0,0 @@ -// -// Copyright (c) 2017 XiaoMi All rights reserved. -// - -#ifndef MACE_KERNELS_ADDN_NEON_H_ -#define MACE_KERNELS_ADDN_NEON_H_ - -#include "mace/core/tensor.h" - -namespace mace { -namespace kernels { - -void NeonAddNFuntion_float(const vector &input_tensor, - Tensor *output_tensor); - -} // namespace kernels -} // namespace mace - -#endif // MACE_KERNELS_ADDN_NEON_H_ diff --git a/mace/kernels/neon/relu_neon.cc b/mace/kernels/neon/relu_neon.cc index e4870818..19ae6332 100644 --- a/mace/kernels/neon/relu_neon.cc +++ b/mace/kernels/neon/relu_neon.cc @@ -3,18 +3,15 @@ // #include -#include "mace/kernels/neon/relu_neon.h" +#include "mace/kernels/relu.h" namespace mace { namespace kernels { -void NeonReluFuntion_float(const Tensor *input_tensor, - Tensor *output_tensor) { - int64_t size = input_tensor->size(); - output_tensor->ResizeLike(input_tensor); - const float *input = input_tensor->data(); - float *output = output_tensor->mutable_data(); - +template <> +void ReluFunctor::operator()(const float *input, + float *output, + index_t size) { #pragma omp parallel for num_threads(1) // no significant performance improve for (int64_t i = 0; i < size; i += kCostPerGroup) { int64_t count = std::min(static_cast(kCostPerGroup), size - i); @@ -37,7 +34,8 @@ void NeonReluFuntion_float(const Tensor *input_tensor, ++outptr; } } -} +}; + } // namespace kernels } // namespace mace \ No newline at end of file diff --git a/mace/kernels/neon/relu_neon.h b/mace/kernels/neon/relu_neon.h deleted file mode 100644 index 0be3be6f..00000000 --- a/mace/kernels/neon/relu_neon.h +++ /dev/null @@ -1,19 +0,0 @@ -// -// Copyright (c) 2017 XiaoMi All rights reserved. -// - -#ifndef MACE_KERNELS_RELU_NEON_H_ -#define MACE_KERNELS_RELU_NEON_H_ - -#include "mace/core/tensor.h" - -namespace mace { -namespace kernels { - -void NeonReluFuntion_float(const Tensor *input_tensor, - Tensor *output_tensor); - -} // namespace kernels -} // namespace mace - -#endif // MACE_KERNELS_RELU_NEON_H_ diff --git a/mace/kernels/relu.h b/mace/kernels/relu.h index d0de2f0b..fd845c1f 100644 --- a/mace/kernels/relu.h +++ b/mace/kernels/relu.h @@ -10,17 +10,14 @@ namespace mace { namespace kernels { -template -void ReluFuntion(const Tensor *input_tensor, Tensor *output_tensor) { - int64_t size = input_tensor->size(); - output_tensor->ResizeLike(input_tensor); - const T *input = input_tensor->data(); - T *output = output_tensor->mutable_data(); - - for (int64_t i = 0; i < size; ++i) { - output[i] = std::max(input[i], static_cast(0)); +template +struct ReluFunctor { + void operator()(const T *input, T *output, index_t size) { + for (index_t i = 0; i < size; ++i) { + output[i] = std::max(input[i], static_cast(0)); + } } -} +}; } // namespace kernels } // namespace mace diff --git a/mace/kernels/test/addn_neon_test.cc b/mace/kernels/test/addn_neon_test.cc index 521fe912..c6425595 100644 --- a/mace/kernels/test/addn_neon_test.cc +++ b/mace/kernels/test/addn_neon_test.cc @@ -4,7 +4,6 @@ #include #include #include "gtest/gtest.h" -#include "mace/kernels/neon/addn_neon.h" #include "mace/kernels/addn.h" using namespace mace; @@ -16,26 +15,11 @@ TEST(NeonTest, AddN) { std::normal_distribution nd(0, 1); int64_t count = 100000; - Tensor input_tensor1(cpu_allocator(), DataType::DT_FLOAT); - input_tensor1.Resize({100, 1000}); - Tensor input_tensor2(cpu_allocator(), DataType::DT_FLOAT); - input_tensor2.ResizeLike(input_tensor1); - Tensor input_tensor3(cpu_allocator(), DataType::DT_FLOAT); - input_tensor3.ResizeLike(input_tensor1); - vector input_tensors {&input_tensor1, - &input_tensor2, - &input_tensor3}; - - Tensor output_tensor(cpu_allocator(), DataType::DT_FLOAT); - output_tensor.ResizeLike(input_tensors[0]); - Tensor output_tensor_neon(cpu_allocator(), DataType::DT_FLOAT); - output_tensor_neon.ResizeLike(input_tensors[0]); - - float *input1 = input_tensor1.mutable_data(); - float *input2 = input_tensor2.mutable_data(); - float *input3 = input_tensor3.mutable_data(); - float *output = output_tensor.mutable_data(); - float *output_neon = output_tensor_neon.mutable_data(); + vector input1(count); + vector input2(count); + vector input3(count); + vector output(count); + vector output_neon(count); for (int64_t i = 0; i < count; ++i) { input1[i] = nd(gen); @@ -43,11 +27,13 @@ TEST(NeonTest, AddN) { input3[i] = nd(gen); } - AddNFuntion(input_tensors, &output_tensor); - NeonAddNFuntion_float(input_tensors, &output_tensor_neon); + vector inputs { input1.data(), input2.data(), input3.data() }; + + AddNFunctor addn_functor; + AddNFunctor neon_addn_functor; + addn_functor(inputs, &output[0], count); + neon_addn_functor(inputs, &output_neon[0], count); - ASSERT_EQ(count, output_tensor.size()); - ASSERT_EQ(count, output_tensor_neon.size()); for (int64_t i = 0; i < count; ++i) { ASSERT_FLOAT_EQ(output[i], output_neon[i]); } diff --git a/mace/kernels/test/relu_neon_test.cc b/mace/kernels/test/relu_neon_test.cc index a16dc269..2e98b62a 100644 --- a/mace/kernels/test/relu_neon_test.cc +++ b/mace/kernels/test/relu_neon_test.cc @@ -4,7 +4,6 @@ #include #include #include "gtest/gtest.h" -#include "mace/kernels/neon/relu_neon.h" #include "mace/kernels/relu.h" using namespace mace; @@ -16,26 +15,20 @@ TEST(NeonTest, Relu) { std::normal_distribution nd(0, 1); int64_t count = 100000; - Tensor input_tensor(cpu_allocator(), DataType::DT_FLOAT); - input_tensor.Resize({100, 1000}); - Tensor output_tensor(cpu_allocator(), DataType::DT_FLOAT); - output_tensor.ResizeLike(input_tensor); - Tensor output_tensor_neon(cpu_allocator(), DataType::DT_FLOAT); - output_tensor_neon.ResizeLike(input_tensor); - - float *input = input_tensor.mutable_data(); - float *output = output_tensor.mutable_data(); - float *output_neon = output_tensor_neon.mutable_data(); + vector input(count); + vector output(count); + vector output_neon(count); for (int64_t i = 0; i < count; ++i) { input[i] = nd(gen); } - ReluFuntion(&input_tensor, &output_tensor); - NeonReluFuntion_float(&input_tensor, &output_tensor_neon); + ReluFunctor relu_functor; + ReluFunctor neon_relu_functor; + + relu_functor(&input[0], &output[0], count); + neon_relu_functor(&input[0], &output_neon[0], count); - ASSERT_EQ(count, output_tensor.size()); - ASSERT_EQ(count, output_tensor_neon.size()); for (int64_t i = 0; i < count; ++i) { ASSERT_FLOAT_EQ(output[i], output_neon[i]); } diff --git a/mace/ops/addn.cc b/mace/ops/addn.cc index 94f506f7..766a223e 100644 --- a/mace/ops/addn.cc +++ b/mace/ops/addn.cc @@ -3,22 +3,12 @@ // #include "mace/ops/addn.h" -#include "mace/proto/mace.pb.h" -#if __ARM_NEON -#include "mace/kernels/neon/addn_neon.h" -#endif // __ARM_NEON namespace mace { REGISTER_CPU_OPERATOR(AddN, AddNOp); #if __ARM_NEON -template <> -bool AddNOp::Run() { - Tensor* output_tensor = Output(0); - kernels::NeonAddNFuntion_float(Inputs(), output_tensor); - return true; -} REGISTER_NEON_OPERATOR(AddN, AddNOp); #endif // __ARM_NEON diff --git a/mace/ops/addn.h b/mace/ops/addn.h index 66e1dba0..c25db759 100644 --- a/mace/ops/addn.h +++ b/mace/ops/addn.h @@ -15,11 +15,25 @@ class AddNOp : public Operator { public: AddNOp(const OperatorDef &operator_def, Workspace *ws) : Operator(operator_def, ws) {} + bool Run() override { - Tensor* output_tensor = this->Output(0); - kernels::AddNFuntion(this->Inputs(), output_tensor); + Tensor* output_tensor = this->outputs_[0]; + output_tensor->ResizeLike(this->inputs_[0]); + T* output = output_tensor->mutable_data(); + index_t size = this->inputs_[0]->size(); + int n = this->inputs_.size(); + vector inputs(n); + for (int i = 0; i < n; ++i) { + const Tensor* input_tensor = this->inputs_[i]; + inputs[i] = input_tensor->data(); + } + + functor_(inputs, output, size); return true; } + + private: + kernels::AddNFunctor functor_; }; } // namespace mace diff --git a/mace/ops/relu.cc b/mace/ops/relu.cc index c2dab6e5..c2193080 100644 --- a/mace/ops/relu.cc +++ b/mace/ops/relu.cc @@ -3,23 +3,12 @@ // #include "mace/ops/relu.h" -#include "mace/proto/mace.pb.h" -#if __ARM_NEON -#include "mace/kernels/neon/relu_neon.h" -#endif // __ARM_NEON namespace mace { REGISTER_CPU_OPERATOR(Relu, ReluOp); #if __ARM_NEON -template <> -bool ReluOp::Run() { - const Tensor* input_tensor = Input(0); - Tensor* output_tensor = Output(0); - kernels::NeonReluFuntion_float(input_tensor, output_tensor); - return true; -} REGISTER_NEON_OPERATOR(Relu, ReluOp); #endif // __ARM_NEON diff --git a/mace/ops/relu.h b/mace/ops/relu.h index 2965dc55..166c7733 100644 --- a/mace/ops/relu.h +++ b/mace/ops/relu.h @@ -16,11 +16,19 @@ class ReluOp : public Operator { ReluOp(const OperatorDef &operator_def, Workspace *ws) : Operator(operator_def, ws) {} bool Run() override { - const Tensor* input_tensor = this->Input(0); - Tensor* output_tensor = this->Output(0); - kernels::ReluFuntion(input_tensor, output_tensor); + const Tensor* input_tensor = this->inputs_[0]; + Tensor* output_tensor = this->outputs_[0]; + output_tensor->ResizeLike(input_tensor); + const T* input = input_tensor->data(); + T* output = output_tensor->mutable_data(); + index_t size = input_tensor->size(); + + functor_(input, output, size); return true; } + + private: + kernels::ReluFunctor functor_; }; } // namespace mace -- GitLab