Commit c2ca7961 authored by 李寅

Merge branch 'master' into 'master'

Refactor op

See merge request !19
mace/kernels/addn.h:

@@ -10,25 +10,18 @@
 namespace mace {
 namespace kernels {
 
-template<typename T>
-void AddNFuntion(const vector<const Tensor*>& input_tensor, Tensor *output_tensor) {
-  int n = input_tensor.size();
-  MACE_CHECK(n > 1);
-  MACE_CHECK_NOTNULL(input_tensor[0]);
-  int64_t size = input_tensor[0]->size();
-  vector<const T*> inputs(n);
-  for (int i = 0; i < n; ++i) {
-    inputs[i] = input_tensor[i]->data<T>();
-  }
-  output_tensor->ResizeLike(input_tensor[0]);
-  T* output = output_tensor->mutable_data<T>();
-  for (int i = 0; i < n; ++i) {
-    for (int64_t j = 0; j < size; ++j) {
-      output[j] += inputs[i][j];
-    }
-  }
-}
+template<DeviceType D, typename T>
+struct AddNFunctor {
+  void operator()(const vector<const T*>& inputs,
+                  T *output, index_t size) {
+    int n = inputs.size();
+    for (int i = 0; i < n; ++i) {
+      for (index_t j = 0; j < size; ++j) {
+        output[j] += inputs[i][j];
+      }
+    }
+  }
+};
 
 } // namespace kernels
 } // namespace mace
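This functor pattern is the core of the refactor: free functions that took Tensor pointers become device-templated functors over raw buffers, so each backend is a template specialization rather than a separately named function. A minimal self-contained sketch of the mechanism follows; DeviceType and index_t are stand-ins for the real mace/core definitions, and the NEON body is a scalar placeholder, not actual MACE code:

#include <cstdint>
#include <cstdio>
#include <vector>

enum DeviceType { CPU, NEON };  // stand-in for mace's DeviceType
typedef int64_t index_t;        // stand-in for mace's index_t

// Primary template: generic scalar fallback, one instantiation per device.
template <DeviceType D, typename T>
struct AddNFunctor {
  void operator()(const std::vector<const T *> &inputs,
                  T *output, index_t size) {
    for (size_t i = 0; i < inputs.size(); ++i) {
      for (index_t j = 0; j < size; ++j) {
        output[j] += inputs[i][j];
      }
    }
  }
};

// A backend overrides behavior by fully specializing the member function,
// exactly as the NEON .cc file below does for <DeviceType::NEON, float>.
template <>
void AddNFunctor<NEON, float>::operator()(
    const std::vector<const float *> &inputs, float *output, index_t size) {
  for (size_t i = 0; i < inputs.size(); ++i)
    for (index_t j = 0; j < size; ++j) output[j] += inputs[i][j];
}

int main() {
  std::vector<float> a{1, 2}, b{3, 4}, out(2, 0.0f);
  std::vector<const float *> inputs{a.data(), b.data()};
  AddNFunctor<CPU, float>()(inputs, out.data(), 2);   // out = {4, 6}
  AddNFunctor<NEON, float>()(inputs, out.data(), 2);  // accumulates again
  std::printf("%.0f %.0f\n", out[0], out[1]);         // prints "8 12"
}

The device tag is resolved at compile time, so the dispatch costs nothing at runtime and callers (ops, tests, benchmarks) name the backend once in the template argument.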
AddN benchmark:

@@ -5,7 +5,6 @@
 #include "mace/core/testing/test_benchmark.h"
 #include "mace/core/tensor.h"
 #include "mace/kernels/addn.h"
-#include "mace/kernels/neon/addn_neon.h"
 
 using namespace mace;
 using namespace mace::kernels;
@@ -19,32 +18,24 @@ static void AddNBenchmark(int iters, int n, int type) {
   std::mt19937 gen(rd());
   std::normal_distribution<float> nd(0, 1);
 
-  Tensor input_tensor1(cpu_allocator(), DataType::DT_FLOAT);
-  input_tensor1.Resize({n});
-  Tensor input_tensor2(cpu_allocator(), DataType::DT_FLOAT);
-  input_tensor2.Resize({n});
-  Tensor input_tensor3(cpu_allocator(), DataType::DT_FLOAT);
-  input_tensor3.Resize({n});
-  vector<const Tensor*> input_tensors {&input_tensor1,
-                                       &input_tensor2,
-                                       &input_tensor3};
-  Tensor output_tensor(cpu_allocator(), DataType::DT_FLOAT);
-  output_tensor.ResizeLike(input_tensor1);
-  float *input1 = input_tensor1.mutable_data<float>();
-  float *input2 = input_tensor2.mutable_data<float>();
-  float *input3 = input_tensor3.mutable_data<float>();
-  float *output = output_tensor.mutable_data<float>();
+  vector<float> input1(n);
+  vector<float> input2(n);
+  vector<float> input3(n);
+  vector<float> output(n);
 
   for (int64_t i = 0; i < n; ++i) {
     input1[i] = nd(gen);
     input2[i] = nd(gen);
     input3[i] = nd(gen);
   }
 
+  vector<const float*> inputs { input1.data(), input2.data(), input3.data() };
   if (type == DeviceType::CPU) {
-    AddNFuntion<float>(input_tensors, &output_tensor);
+    AddNFunctor<DeviceType::CPU, float> addn_functor;
+    addn_functor(inputs, &output[0], n);
   } else if (type == DeviceType::NEON) {
-    NeonAddNFuntion_float(input_tensors, &output_tensor);
+    AddNFunctor<DeviceType::NEON, float> neon_addn_functor;
+    neon_addn_functor(inputs, &output[0], n);
   }
 }
Relu benchmark:

@@ -5,7 +5,6 @@
 #include "mace/core/testing/test_benchmark.h"
 #include "mace/core/tensor.h"
 #include "mace/kernels/relu.h"
-#include "mace/kernels/neon/relu_neon.h"
 
 using namespace mace;
 using namespace mace::kernels;
@@ -19,20 +18,19 @@ static void ReluBenchmark(int iters, int n, int type) {
   std::mt19937 gen(rd());
   std::normal_distribution<float> nd(0, 1);
 
-  Tensor input_tensor(cpu_allocator(), DataType::DT_FLOAT);
-  input_tensor.Resize({n});
-  Tensor output_tensor(cpu_allocator(), DataType::DT_FLOAT);
-  output_tensor.ResizeLike(input_tensor);
-  float *input = input_tensor.mutable_data<float>();
-  float *output = output_tensor.mutable_data<float>();
+  vector<float> input(n);
+  vector<float> output(n);
 
   for (int64_t i = 0; i < n; ++i) {
     input[i] = nd(gen);
   }
 
   if (type == DeviceType::CPU) {
-    ReluFuntion<float>(&input_tensor, &output_tensor);
+    ReluFunctor<DeviceType::CPU, float> relu_functor;
+    relu_functor(&input[0], &output[0], n);
   } else if (type == DeviceType::NEON) {
-    NeonReluFuntion_float(&input_tensor, &output_tensor);
+    ReluFunctor<DeviceType::NEON, float> neon_relu_functor;
+    neon_relu_functor(&input[0], &output_neon[0], n);
   }
 }
NEON AddN kernel:

@@ -3,25 +3,16 @@
 //
 
 #include <arm_neon.h>
-#include "mace/kernels/neon/addn_neon.h"
-#include "mace/core/common.h"
+#include "mace/kernels/addn.h"
 
 namespace mace {
 namespace kernels {
 
-void NeonAddNFuntion_float(const vector<const Tensor *> &input_tensor,
-                           Tensor *output_tensor) {
-  int n = input_tensor.size();
-  MACE_CHECK(n > 1);
-  MACE_CHECK_NOTNULL(input_tensor[0]);
-  int64_t size = input_tensor[0]->size();
-  output_tensor->ResizeLike(input_tensor[0]);
-  float *output = output_tensor->mutable_data<float>();
-  vector<const float *> inputs(n);
-  for (int i = 0; i < n; ++i) {
-    inputs[i] = input_tensor[i]->data<float>();
-  }
+template <>
+void AddNFunctor<DeviceType::NEON, float>::operator()(const vector<const float*>& inputs,
+                                                      float *output,
+                                                      index_t size) {
+  int n = inputs.size();
   int64_t cost = size * n;
   int64_t groups = 1;
   if (cost > kCostPerGroup) {
@@ -53,7 +44,7 @@ void NeonAddNFuntion_float(const vector<const Tensor *> &input_tensor,
     }
   }
 }
-}
+};
 
 } // namespace kernels
 } // namespace mace
\ No newline at end of file
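The collapsed middle of this function (between the two hunks) is where the NEON path earns its keep: once `size * n` exceeds kCostPerGroup, the element range is sliced into groups that can be processed in parallel. A rough, self-contained sketch of that grouping pattern follows; kCostPerGroup's value and the OpenMP usage here are assumptions, not the actual MACE body:

#include <algorithm>
#include <cstdint>
#include <vector>

static const int64_t kCostPerGroup = 1024;  // assumed; the real constant lives in mace/core

// Carve the output range into chunks whose cost (elements * input count)
// is near kCostPerGroup, then let OpenMP hand one chunk to each thread.
void AddNGrouped(const std::vector<const float *> &inputs,
                 float *output, int64_t size) {
  int64_t n = inputs.size();
  int64_t cost = size * n;
  int64_t groups =
      std::max<int64_t>(1, std::min(size, cost / kCostPerGroup));
  int64_t per_group = size / groups;
#pragma omp parallel for
  for (int64_t g = 0; g < groups; ++g) {
    int64_t begin = g * per_group;
    int64_t end = (g == groups - 1) ? size : begin + per_group;  // last chunk takes the remainder
    for (int64_t i = 0; i < n; ++i) {
      for (int64_t j = begin; j < end; ++j) {
        output[j] += inputs[i][j];
      }
    }
  }
}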
mace/kernels/neon/addn_neon.h (file deleted):

-//
-// Copyright (c) 2017 XiaoMi All rights reserved.
-//
-
-#ifndef MACE_KERNELS_ADDN_NEON_H_
-#define MACE_KERNELS_ADDN_NEON_H_
-
-#include "mace/core/tensor.h"
-
-namespace mace {
-namespace kernels {
-
-void NeonAddNFuntion_float(const vector<const Tensor *> &input_tensor,
-                           Tensor *output_tensor);
-
-} // namespace kernels
-} // namespace mace
-
-#endif // MACE_KERNELS_ADDN_NEON_H_
NEON Relu kernel:

@@ -3,18 +3,15 @@
 //
 
 #include <arm_neon.h>
-#include "mace/kernels/neon/relu_neon.h"
+#include "mace/kernels/relu.h"
 
 namespace mace {
 namespace kernels {
 
-void NeonReluFuntion_float(const Tensor *input_tensor,
-                           Tensor *output_tensor) {
-  int64_t size = input_tensor->size();
-  output_tensor->ResizeLike(input_tensor);
-  const float *input = input_tensor->data<float>();
-  float *output = output_tensor->mutable_data<float>();
+template <>
+void ReluFunctor<DeviceType::NEON, float>::operator()(const float *input,
+                                                      float *output,
+                                                      index_t size) {
 #pragma omp parallel for num_threads(1) // no significant performance improve
   for (int64_t i = 0; i < size; i += kCostPerGroup) {
     int64_t count = std::min(static_cast<int64_t>(kCostPerGroup), size - i);
@@ -37,7 +34,8 @@ void NeonReluFuntion_float(const Tensor *input_tensor,
       ++outptr;
     }
   }
-}
+};
 
 } // namespace kernels
 } // namespace mace
\ No newline at end of file
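The collapsed body between these two hunks is the NEON inner loop over each chunk of `count` elements; `++outptr` above is its scalar tail. A hypothetical reconstruction of that loop shape follows, using the standard vmaxq_f32-against-zero NEON ReLU idiom; the names ReluChunk, inptr, and outptr follow the visible context, but the exact MACE code is not shown in this view:

#include <arm_neon.h>
#include <algorithm>
#include <cstdint>

// ReLU over one chunk: vectorized groups of 4 floats, scalar remainder.
void ReluChunk(const float *inptr, float *outptr, int64_t count) {
  float32x4_t zero = vdupq_n_f32(0.f);
  int64_t nn = count >> 2;            // number of 4-float groups
  int64_t remain = count - (nn << 2); // leftover elements
  for (; nn > 0; --nn) {
    float32x4_t v = vld1q_f32(inptr);
    vst1q_f32(outptr, vmaxq_f32(v, zero));  // elementwise max(v, 0)
    inptr += 4;
    outptr += 4;
  }
  for (; remain > 0; --remain) {
    *outptr = std::max(*inptr, 0.f);
    ++inptr;
    ++outptr;
  }
}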
mace/kernels/neon/relu_neon.h (file deleted):

-//
-// Copyright (c) 2017 XiaoMi All rights reserved.
-//
-
-#ifndef MACE_KERNELS_RELU_NEON_H_
-#define MACE_KERNELS_RELU_NEON_H_
-
-#include "mace/core/tensor.h"
-
-namespace mace {
-namespace kernels {
-
-void NeonReluFuntion_float(const Tensor *input_tensor,
-                           Tensor *output_tensor);
-
-} // namespace kernels
-} // namespace mace
-
-#endif // MACE_KERNELS_RELU_NEON_H_
mace/kernels/relu.h:

@@ -10,17 +10,14 @@
 namespace mace {
 namespace kernels {
 
-template<typename T>
-void ReluFuntion(const Tensor *input_tensor, Tensor *output_tensor) {
-  int64_t size = input_tensor->size();
-  output_tensor->ResizeLike(input_tensor);
-  const T *input = input_tensor->data<T>();
-  T *output = output_tensor->mutable_data<T>();
-  for (int64_t i = 0; i < size; ++i) {
-    output[i] = std::max(input[i], static_cast<T>(0));
-  }
-}
+template<DeviceType D, typename T>
+struct ReluFunctor {
+  void operator()(const T *input, T *output, index_t size) {
+    for (index_t i = 0; i < size; ++i) {
+      output[i] = std::max(input[i], static_cast<T>(0));
+    }
+  }
+};
 
 } // namespace kernels
 } // namespace mace
AddN NEON test:

@@ -4,7 +4,6 @@
 #include <random>
 #include <cmath>
 #include "gtest/gtest.h"
-#include "mace/kernels/neon/addn_neon.h"
 #include "mace/kernels/addn.h"
 
 using namespace mace;
@@ -16,26 +15,11 @@ TEST(NeonTest, AddN) {
   std::normal_distribution<float> nd(0, 1);
   int64_t count = 100000;
 
-  Tensor input_tensor1(cpu_allocator(), DataType::DT_FLOAT);
-  input_tensor1.Resize({100, 1000});
-  Tensor input_tensor2(cpu_allocator(), DataType::DT_FLOAT);
-  input_tensor2.ResizeLike(input_tensor1);
-  Tensor input_tensor3(cpu_allocator(), DataType::DT_FLOAT);
-  input_tensor3.ResizeLike(input_tensor1);
-  vector<const Tensor*> input_tensors {&input_tensor1,
-                                       &input_tensor2,
-                                       &input_tensor3};
-  Tensor output_tensor(cpu_allocator(), DataType::DT_FLOAT);
-  output_tensor.ResizeLike(input_tensors[0]);
-  Tensor output_tensor_neon(cpu_allocator(), DataType::DT_FLOAT);
-  output_tensor_neon.ResizeLike(input_tensors[0]);
-  float *input1 = input_tensor1.mutable_data<float>();
-  float *input2 = input_tensor2.mutable_data<float>();
-  float *input3 = input_tensor3.mutable_data<float>();
-  float *output = output_tensor.mutable_data<float>();
-  float *output_neon = output_tensor_neon.mutable_data<float>();
+  vector<float> input1(count);
+  vector<float> input2(count);
+  vector<float> input3(count);
+  vector<float> output(count);
+  vector<float> output_neon(count);
 
   for (int64_t i = 0; i < count; ++i) {
     input1[i] = nd(gen);
@@ -43,11 +27,13 @@ TEST(NeonTest, AddN) {
     input3[i] = nd(gen);
   }
 
-  AddNFuntion<float>(input_tensors, &output_tensor);
-  NeonAddNFuntion_float(input_tensors, &output_tensor_neon);
+  vector<const float*> inputs { input1.data(), input2.data(), input3.data() };
+  AddNFunctor<DeviceType::CPU, float> addn_functor;
+  AddNFunctor<DeviceType::NEON, float> neon_addn_functor;
+  addn_functor(inputs, &output[0], count);
+  neon_addn_functor(inputs, &output_neon[0], count);
 
-  ASSERT_EQ(count, output_tensor.size());
-  ASSERT_EQ(count, output_tensor_neon.size());
-
   for (int64_t i = 0; i < count; ++i) {
     ASSERT_FLOAT_EQ(output[i], output_neon[i]);
   }
Relu NEON test:

@@ -4,7 +4,6 @@
 #include <random>
 #include <cmath>
 #include "gtest/gtest.h"
-#include "mace/kernels/neon/relu_neon.h"
 #include "mace/kernels/relu.h"
 
 using namespace mace;
@@ -16,26 +15,20 @@ TEST(NeonTest, Relu) {
   std::normal_distribution<float> nd(0, 1);
   int64_t count = 100000;
 
-  Tensor input_tensor(cpu_allocator(), DataType::DT_FLOAT);
-  input_tensor.Resize({100, 1000});
-  Tensor output_tensor(cpu_allocator(), DataType::DT_FLOAT);
-  output_tensor.ResizeLike(input_tensor);
-  Tensor output_tensor_neon(cpu_allocator(), DataType::DT_FLOAT);
-  output_tensor_neon.ResizeLike(input_tensor);
-  float *input = input_tensor.mutable_data<float>();
-  float *output = output_tensor.mutable_data<float>();
-  float *output_neon = output_tensor_neon.mutable_data<float>();
+  vector<float> input(count);
+  vector<float> output(count);
+  vector<float> output_neon(count);
 
   for (int64_t i = 0; i < count; ++i) {
     input[i] = nd(gen);
   }
 
-  ReluFuntion<float>(&input_tensor, &output_tensor);
-  NeonReluFuntion_float(&input_tensor, &output_tensor_neon);
+  ReluFunctor<DeviceType::CPU, float> relu_functor;
+  ReluFunctor<DeviceType::NEON, float> neon_relu_functor;
+  relu_functor(&input[0], &output[0], count);
+  neon_relu_functor(&input[0], &output_neon[0], count);
 
-  ASSERT_EQ(count, output_tensor.size());
-  ASSERT_EQ(count, output_tensor_neon.size());
-
   for (int64_t i = 0; i < count; ++i) {
     ASSERT_FLOAT_EQ(output[i], output_neon[i]);
   }
mace/ops/addn.cc:

@@ -3,22 +3,12 @@
 //
 
 #include "mace/ops/addn.h"
-#include "mace/proto/mace.pb.h"
-#if __ARM_NEON
-#include "mace/kernels/neon/addn_neon.h"
-#endif // __ARM_NEON
 
 namespace mace {
 
 REGISTER_CPU_OPERATOR(AddN, AddNOp<DeviceType::CPU, float>);
 
 #if __ARM_NEON
-template <>
-bool AddNOp<DeviceType::NEON, float>::Run() {
-  Tensor* output_tensor = Output(0);
-  kernels::NeonAddNFuntion_float(Inputs(), output_tensor);
-  return true;
-}
 REGISTER_NEON_OPERATOR(AddN, AddNOp<DeviceType::NEON, float>);
 #endif // __ARM_NEON
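With the functor carrying the device dimension, the hand-written Run() specialization above becomes dead weight: one AddNOp template now serves both registrations. For readers unfamiliar with this style, here is a generic sketch of the self-registration pattern that macros like REGISTER_CPU_OPERATOR typically expand to; it is illustrative only, not MACE's actual macro:

#include <cstdio>
#include <functional>
#include <map>
#include <memory>
#include <string>

struct OperatorBase {
  virtual ~OperatorBase() {}
  virtual bool Run() = 0;
};

using OpFactory = std::function<std::unique_ptr<OperatorBase>()>;

// Function-local static avoids the static initialization order fiasco.
std::map<std::string, OpFactory> &CpuRegistry() {
  static std::map<std::string, OpFactory> registry;
  return registry;
}

// A file-scope registrar whose constructor runs before main(),
// recording the factory under the op's name.
struct Registrar {
  Registrar(const std::string &name, OpFactory factory) {
    CpuRegistry()[name] = factory;
  }
};

// __VA_ARGS__ lets templated types with commas pass through the macro.
#define REGISTER_CPU_OPERATOR_SKETCH(name, ...)            \
  static Registrar registrar_##name(#name, [] {            \
    return std::unique_ptr<OperatorBase>(new __VA_ARGS__); \
  })

struct DemoAddN : OperatorBase {
  bool Run() override { std::puts("AddN runs"); return true; }
};

REGISTER_CPU_OPERATOR_SKETCH(AddN, DemoAddN);

int main() {
  auto op = CpuRegistry()["AddN"]();  // look up by name, build, run
  return op->Run() ? 0 : 1;
}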
mace/ops/addn.h:

@@ -15,11 +15,25 @@ class AddNOp : public Operator<D, T> {
  public:
   AddNOp(const OperatorDef &operator_def, Workspace *ws)
       : Operator<D, T>(operator_def, ws) {}
 
   bool Run() override {
-    Tensor* output_tensor = this->Output(0);
-    kernels::AddNFuntion<T>(this->Inputs(), output_tensor);
+    Tensor* output_tensor = this->outputs_[0];
+    output_tensor->ResizeLike(this->inputs_[0]);
+    T* output = output_tensor->mutable_data<T>();
+    index_t size = this->inputs_[0]->size();
+    int n = this->inputs_.size();
+    vector<const T*> inputs(n);
+    for (int i = 0; i < n; ++i) {
+      const Tensor* input_tensor = this->inputs_[i];
+      inputs[i] = input_tensor->data<T>();
+    }
+    functor_(inputs, output, size);
     return true;
   }
+
+ private:
+  kernels::AddNFunctor<D, T> functor_;
 };
 
 } // namespace mace
mace/ops/relu.cc:

@@ -3,23 +3,12 @@
 //
 
 #include "mace/ops/relu.h"
-#include "mace/proto/mace.pb.h"
-#if __ARM_NEON
-#include "mace/kernels/neon/relu_neon.h"
-#endif // __ARM_NEON
 
 namespace mace {
 
 REGISTER_CPU_OPERATOR(Relu, ReluOp<DeviceType::CPU, float>);
 
 #if __ARM_NEON
-template <>
-bool ReluOp<DeviceType::NEON, float>::Run() {
-  const Tensor* input_tensor = Input(0);
-  Tensor* output_tensor = Output(0);
-  kernels::NeonReluFuntion_float(input_tensor, output_tensor);
-  return true;
-}
 REGISTER_NEON_OPERATOR(Relu, ReluOp<DeviceType::NEON, float>);
 #endif // __ARM_NEON
mace/ops/relu.h:

@@ -16,11 +16,19 @@ class ReluOp : public Operator<D, T> {
   ReluOp(const OperatorDef &operator_def, Workspace *ws)
       : Operator<D, T>(operator_def, ws) {}
 
   bool Run() override {
-    const Tensor* input_tensor = this->Input(0);
-    Tensor* output_tensor = this->Output(0);
-    kernels::ReluFuntion<T>(input_tensor, output_tensor);
+    const Tensor* input_tensor = this->inputs_[0];
+    Tensor* output_tensor = this->outputs_[0];
+    output_tensor->ResizeLike(input_tensor);
+    const T* input = input_tensor->data<T>();
+    T* output = output_tensor->mutable_data<T>();
+    index_t size = input_tensor->size();
+    functor_(input, output, size);
     return true;
   }
+
+ private:
+  kernels::ReluFunctor<D, T> functor_;
 };
 
 } // namespace mace