Commit 94dbdeaf authored by 李寅, committed by wuchenghui

Refactor op

Parent c9856392
@@ -10,25 +10,18 @@
namespace mace {
namespace kernels {
template<typename T>
void AddNFuntion(const vector<const Tensor*>& input_tensor, Tensor *output_tensor) {
int n = input_tensor.size();
MACE_CHECK(n > 1);
MACE_CHECK_NOTNULL(input_tensor[0]);
int64_t size = input_tensor[0]->size();
vector<const T*> inputs(n);
for (int i = 0; i < n; ++i) {
inputs[i] = input_tensor[i]->data<T>();
}
output_tensor->ResizeLike(input_tensor[0]);
T* output = output_tensor->mutable_data<T>();
for (int i = 0; i < n; ++i) {
for (int64_t j = 0; j < size; ++j) {
output[j] += inputs[i][j];
template<DeviceType D, typename T>
struct AddNFunctor {
void operator()(const vector<const T*>& inputs,
T *output, index_t size) {
int n = inputs.size();
for (int i = 0; i < n; ++i) {
for (index_t j = 0; j < size; ++j) {
output[j] += inputs[i][j];
}
}
}
}
};
} // namespace kernels
} // namespace mace
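
The hunk above replaces the free function AddNFuntion with a functor template parameterized on device type, so the device is chosen at compile time and NEON can override the body through an explicit member specialization. A minimal self-contained sketch of that pattern (the enum values and index_t below are stand-ins for the MACE definitions, and the NEON body is a placeholder):

// Sketch, not part of the commit: the functor-specialization pattern.
#include <cstdint>
#include <vector>

enum DeviceType { CPU, NEON };
typedef int64_t index_t;

template <DeviceType D, typename T>
struct AddNFunctor {
  // Primary template: generic scalar accumulation (the CPU path).
  void operator()(const std::vector<const T*>& inputs, T* output,
                  index_t size) {
    for (size_t i = 0; i < inputs.size(); ++i) {
      for (index_t j = 0; j < size; ++j) {
        output[j] += inputs[i][j];
      }
    }
  }
};

// Explicit specialization of the member for one (device, type) pair;
// call sites select it purely through template arguments.
template <>
void AddNFunctor<NEON, float>::operator()(
    const std::vector<const float*>& inputs, float* output, index_t size) {
  // A real build would use arm_neon.h intrinsics here (see addn_neon.cc);
  // the scalar loop stands in so the sketch runs anywhere.
  for (size_t i = 0; i < inputs.size(); ++i)
    for (index_t j = 0; j < size; ++j) output[j] += inputs[i][j];
}

int main() {
  std::vector<float> a{1, 2}, b{3, 4}, out{0, 0};
  std::vector<const float*> inputs{a.data(), b.data()};
  AddNFunctor<CPU, float>()(inputs, out.data(), 2);   // out = {4, 6}
  AddNFunctor<NEON, float>()(inputs, out.data(), 2);  // accumulates again
  return 0;
}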
@@ -5,7 +5,6 @@
#include "mace/core/testing/test_benchmark.h"
#include "mace/core/tensor.h"
#include "mace/kernels/addn.h"
#include "mace/kernels/neon/addn_neon.h"
using namespace mace;
using namespace mace::kernels;
@@ -19,32 +18,24 @@ static void AddNBenchmark(int iters, int n, int type) {
std::mt19937 gen(rd());
std::normal_distribution<float> nd(0, 1);
Tensor input_tensor1(cpu_allocator(), DataType::DT_FLOAT);
input_tensor1.Resize({n});
Tensor input_tensor2(cpu_allocator(), DataType::DT_FLOAT);
input_tensor2.Resize({n});
Tensor input_tensor3(cpu_allocator(), DataType::DT_FLOAT);
input_tensor3.Resize({n});
vector<const Tensor*> input_tensors {&input_tensor1,
&input_tensor2,
&input_tensor3};
Tensor output_tensor(cpu_allocator(), DataType::DT_FLOAT);
output_tensor.ResizeLike(input_tensor1);
float *input1 = input_tensor1.mutable_data<float>();
float *input2 = input_tensor2.mutable_data<float>();
float *input3 = input_tensor3.mutable_data<float>();
float *output = output_tensor.mutable_data<float>();
vector<float> input1(n);
vector<float> input2(n);
vector<float> input3(n);
vector<float> output(n);
for (int64_t i = 0; i < n; ++i) {
input1[i] = nd(gen);
input2[i] = nd(gen);
input3[i] = nd(gen);
}
vector<const float*> inputs { input1.data(), input2.data(), input3.data() };
if (type == DeviceType::CPU) {
AddNFuntion<float>(input_tensors, &output_tensor);
AddNFunctor<DeviceType::CPU, float> addn_functor;
addn_functor(inputs, &output[0], n);
} else if (type == DeviceType::NEON) {
NeonAddNFuntion_float(input_tensors, &output_tensor);
AddNFunctor<DeviceType::NEON, float> neon_addn_functor;
neon_addn_functor(inputs, &output[0], n);
}
}
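
The benchmark body above drives the functor over raw buffers. For reference, a self-contained equivalent using std::chrono in place of the mace/core/testing harness (which the hunk elides); the inner loop mirrors the CPU AddNFunctor body:

// Sketch, not part of the commit: a standalone timing loop.
#include <chrono>
#include <cstdint>
#include <cstdio>
#include <random>
#include <vector>

int main() {
  const int64_t n = 1 << 20;
  const int iters = 100;
  std::mt19937 gen(std::random_device{}());
  std::normal_distribution<float> nd(0, 1);
  std::vector<float> in1(n), in2(n), in3(n), out(n, 0.f);
  for (int64_t i = 0; i < n; ++i) {
    in1[i] = nd(gen);
    in2[i] = nd(gen);
    in3[i] = nd(gen);
  }
  const float* inputs[] = {in1.data(), in2.data(), in3.data()};

  auto start = std::chrono::steady_clock::now();
  for (int it = 0; it < iters; ++it) {
    for (const float* input : inputs) {
      for (int64_t j = 0; j < n; ++j) out[j] += input[j];  // AddN body
    }
  }
  double elapsed = std::chrono::duration<double>(
      std::chrono::steady_clock::now() - start).count();
  std::printf("%.2f ns per element-add\n",
              elapsed * 1e9 / (static_cast<double>(iters) * 3 * n));
  return 0;
}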
@@ -5,7 +5,6 @@
#include "mace/core/testing/test_benchmark.h"
#include "mace/core/tensor.h"
#include "mace/kernels/relu.h"
#include "mace/kernels/neon/relu_neon.h"
using namespace mace;
using namespace mace::kernels;
@@ -19,20 +18,19 @@ static void ReluBenchmark(int iters, int n, int type) {
std::mt19937 gen(rd());
std::normal_distribution<float> nd(0, 1);
Tensor input_tensor(cpu_allocator(), DataType::DT_FLOAT);
input_tensor.Resize({n});
Tensor output_tensor(cpu_allocator(), DataType::DT_FLOAT);
output_tensor.ResizeLike(input_tensor);
float *input = input_tensor.mutable_data<float>();
float *output = output_tensor.mutable_data<float>();
vector<float> input(n);
vector<float> output(n);
for (int64_t i = 0; i < n; ++i) {
input[i] = nd(gen);
}
if (type == DeviceType::CPU) {
ReluFuntion<float>(&input_tensor, &output_tensor);
ReluFunctor<DeviceType::CPU, float> relu_functor;
relu_functor(&input[0], &output[0], n);
} else if (type == DeviceType::NEON) {
NeonReluFuntion_float(&input_tensor, &output_tensor);
ReluFunctor<DeviceType::NEON, float> neon_relu_functor;
neon_relu_functor(&input[0], &output[0], n);
}
}
@@ -3,25 +3,16 @@
//
#include <arm_neon.h>
#include "mace/kernels/neon/addn_neon.h"
#include "mace/core/common.h"
#include "mace/kernels/addn.h"
namespace mace {
namespace kernels {
void NeonAddNFuntion_float(const vector<const Tensor *> &input_tensor,
Tensor *output_tensor) {
int n = input_tensor.size();
MACE_CHECK(n > 1);
MACE_CHECK_NOTNULL(input_tensor[0]);
int64_t size = input_tensor[0]->size();
output_tensor->ResizeLike(input_tensor[0]);
float *output = output_tensor->mutable_data<float>();
vector<const float *> inputs(n);
for (int i = 0; i < n; ++i) {
inputs[i] = input_tensor[i]->data<float>();
}
template <>
void AddNFunctor<DeviceType::NEON, float>::operator()(const vector<const float*>& inputs,
float *output,
index_t size) {
int n = inputs.size();
int64_t cost = size * n;
int64_t groups = 1;
if (cost > kCostPerGroup) {
@@ -53,7 +44,7 @@ void NeonAddNFuntion_float(const vector<const Tensor *> &input_tensor,
}
}
}
}
};
} // namespace kernels
} // namespace mace
\ No newline at end of file
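
The NEON AddN specialization above keeps a cost-based grouping (cost = size * n), but the hunk elides the body that computes and uses groups. A sketch of how such grouping plausibly proceeds; the kCostPerGroup value, the clamping, and the per-group loop are assumptions for illustration, not the commit's elided code:

// Sketch, not part of the commit: splitting work into cost-sized groups.
#include <algorithm>
#include <cstdint>
#include <vector>

typedef int64_t index_t;
static const int64_t kCostPerGroup = 1024;  // assumed tuning constant

void AddNGrouped(const std::vector<const float*>& inputs,
                 float* output, index_t size) {
  const int n = static_cast<int>(inputs.size());
  const int64_t cost = static_cast<int64_t>(size) * n;
  int64_t groups = 1;
  if (cost > kCostPerGroup) {
    groups = std::max<int64_t>(
        1, std::min<int64_t>(cost / kCostPerGroup, size));
  }
  const index_t per_group = size / groups;
  // Each group is an independent [start, start + count) slice, so the
  // outer loop is a natural candidate for an OpenMP parallel for.
  for (int64_t g = 0; g < groups; ++g) {
    const index_t start = g * per_group;
    const index_t count = (g == groups - 1) ? size - start : per_group;
    for (int i = 0; i < n; ++i) {
      const float* in = inputs[i] + start;
      float* out = output + start;
      for (index_t j = 0; j < count; ++j) out[j] += in[j];
    }
  }
}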
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#ifndef MACE_KERNELS_ADDN_NEON_H_
#define MACE_KERNELS_ADDN_NEON_H_
#include "mace/core/tensor.h"
namespace mace {
namespace kernels {
void NeonAddNFuntion_float(const vector<const Tensor *> &input_tensor,
Tensor *output_tensor);
} // namespace kernels
} // namespace mace
#endif // MACE_KERNELS_ADDN_NEON_H_
@@ -3,18 +3,15 @@
//
#include <arm_neon.h>
#include "mace/kernels/neon/relu_neon.h"
#include "mace/kernels/relu.h"
namespace mace {
namespace kernels {
void NeonReluFuntion_float(const Tensor *input_tensor,
Tensor *output_tensor) {
int64_t size = input_tensor->size();
output_tensor->ResizeLike(input_tensor);
const float *input = input_tensor->data<float>();
float *output = output_tensor->mutable_data<float>();
template <>
void ReluFunctor<DeviceType::NEON, float>::operator()(const float *input,
float *output,
index_t size) {
#pragma omp parallel for num_threads(1) // no significant performance improvement
for (int64_t i = 0; i < size; i += kCostPerGroup) {
int64_t count = std::min(static_cast<int64_t>(kCostPerGroup), size - i);
@@ -37,7 +34,8 @@ void NeonReluFuntion_float(const Tensor *input_tensor,
++outptr;
}
}
}
};
} // namespace kernels
} // namespace mace
\ No newline at end of file
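
The ReLU specialization's vectorized body is elided by the hunk above (only the chunked omp loop and the scalar tail's ++outptr survive). A minimal illustration of the kind of NEON inner loop it implies: four lanes of max-against-zero per iteration plus a scalar tail. This is a sketch, not the commit's exact code:

// Sketch, not part of the commit: a NEON ReLU inner loop.
#include <arm_neon.h>
#include <algorithm>
#include <cstdint>

void NeonReluSketch(const float* input, float* output, int64_t size) {
  const float32x4_t vzero = vdupq_n_f32(0.f);
  int64_t i = 0;
  for (; i + 4 <= size; i += 4) {
    float32x4_t v = vld1q_f32(input + i);        // load 4 floats
    vst1q_f32(output + i, vmaxq_f32(v, vzero));  // lane-wise max(x, 0)
  }
  for (; i < size; ++i) {                        // scalar tail
    output[i] = std::max(input[i], 0.f);
  }
}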
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#ifndef MACE_KERNELS_RELU_NEON_H_
#define MACE_KERNELS_RELU_NEON_H_
#include "mace/core/tensor.h"
namespace mace {
namespace kernels {
void NeonReluFuntion_float(const Tensor *input_tensor,
Tensor *output_tensor);
} // namespace kernels
} // namespace mace
#endif // MACE_KERNELS_RELU_NEON_H_
@@ -10,17 +10,14 @@
namespace mace {
namespace kernels {
template<typename T>
void ReluFuntion(const Tensor *input_tensor, Tensor *output_tensor) {
int64_t size = input_tensor->size();
output_tensor->ResizeLike(input_tensor);
const T *input = input_tensor->data<T>();
T *output = output_tensor->mutable_data<T>();
for (int64_t i = 0; i < size; ++i) {
output[i] = std::max(input[i], static_cast<T>(0));
template<DeviceType D, typename T>
struct ReluFunctor {
void operator()(const T *input, T *output, index_t size) {
for (index_t i = 0; i < size; ++i) {
output[i] = std::max(input[i], static_cast<T>(0));
}
}
}
};
} // namespace kernels
} // namespace mace
@@ -4,7 +4,6 @@
#include <random>
#include <cmath>
#include "gtest/gtest.h"
#include "mace/kernels/neon/addn_neon.h"
#include "mace/kernels/addn.h"
using namespace mace;
@@ -16,26 +15,11 @@ TEST(NeonTest, AddN) {
std::normal_distribution<float> nd(0, 1);
int64_t count = 100000;
Tensor input_tensor1(cpu_allocator(), DataType::DT_FLOAT);
input_tensor1.Resize({100, 1000});
Tensor input_tensor2(cpu_allocator(), DataType::DT_FLOAT);
input_tensor2.ResizeLike(input_tensor1);
Tensor input_tensor3(cpu_allocator(), DataType::DT_FLOAT);
input_tensor3.ResizeLike(input_tensor1);
vector<const Tensor*> input_tensors {&input_tensor1,
&input_tensor2,
&input_tensor3};
Tensor output_tensor(cpu_allocator(), DataType::DT_FLOAT);
output_tensor.ResizeLike(input_tensors[0]);
Tensor output_tensor_neon(cpu_allocator(), DataType::DT_FLOAT);
output_tensor_neon.ResizeLike(input_tensors[0]);
float *input1 = input_tensor1.mutable_data<float>();
float *input2 = input_tensor2.mutable_data<float>();
float *input3 = input_tensor3.mutable_data<float>();
float *output = output_tensor.mutable_data<float>();
float *output_neon = output_tensor_neon.mutable_data<float>();
vector<float> input1(count);
vector<float> input2(count);
vector<float> input3(count);
vector<float> output(count);
vector<float> output_neon(count);
for (int64_t i = 0; i < count; ++i) {
input1[i] = nd(gen);
@@ -43,11 +27,13 @@
input3[i] = nd(gen);
}
AddNFuntion<float>(input_tensors, &output_tensor);
NeonAddNFuntion_float(input_tensors, &output_tensor_neon);
vector<const float*> inputs { input1.data(), input2.data(), input3.data() };
AddNFunctor<DeviceType::CPU, float> addn_functor;
AddNFunctor<DeviceType::NEON, float> neon_addn_functor;
addn_functor(inputs, &output[0], count);
neon_addn_functor(inputs, &output_neon[0], count);
ASSERT_EQ(count, output_tensor.size());
ASSERT_EQ(count, output_tensor_neon.size());
for (int64_t i = 0; i < count; ++i) {
ASSERT_FLOAT_EQ(output[i], output_neon[i]);
}
@@ -4,7 +4,6 @@
#include <random>
#include <cmath>
#include "gtest/gtest.h"
#include "mace/kernels/neon/relu_neon.h"
#include "mace/kernels/relu.h"
using namespace mace;
@@ -16,26 +15,20 @@ TEST(NeonTest, Relu) {
std::normal_distribution<float> nd(0, 1);
int64_t count = 100000;
Tensor input_tensor(cpu_allocator(), DataType::DT_FLOAT);
input_tensor.Resize({100, 1000});
Tensor output_tensor(cpu_allocator(), DataType::DT_FLOAT);
output_tensor.ResizeLike(input_tensor);
Tensor output_tensor_neon(cpu_allocator(), DataType::DT_FLOAT);
output_tensor_neon.ResizeLike(input_tensor);
float *input = input_tensor.mutable_data<float>();
float *output = output_tensor.mutable_data<float>();
float *output_neon = output_tensor_neon.mutable_data<float>();
vector<float> input(count);
vector<float> output(count);
vector<float> output_neon(count);
for (int64_t i = 0; i < count; ++i) {
input[i] = nd(gen);
}
ReluFuntion<float>(&input_tensor, &output_tensor);
NeonReluFuntion_float(&input_tensor, &output_tensor_neon);
ReluFunctor<DeviceType::CPU, float> relu_functor;
ReluFunctor<DeviceType::NEON, float> neon_relu_functor;
relu_functor(&input[0], &output[0], count);
neon_relu_functor(&input[0], &output_neon[0], count);
ASSERT_EQ(count, output_tensor.size());
ASSERT_EQ(count, output_tensor_neon.size());
for (int64_t i = 0; i < count; ++i) {
ASSERT_FLOAT_EQ(output[i], output_neon[i]);
}
@@ -3,22 +3,12 @@
//
#include "mace/ops/addn.h"
#include "mace/proto/mace.pb.h"
#if __ARM_NEON
#include "mace/kernels/neon/addn_neon.h"
#endif // __ARM_NEON
namespace mace {
REGISTER_CPU_OPERATOR(AddN, AddNOp<DeviceType::CPU, float>);
#if __ARM_NEON
template <>
bool AddNOp<DeviceType::NEON, float>::Run() {
Tensor* output_tensor = Output(0);
kernels::NeonAddNFuntion_float(Inputs(), output_tensor);
return true;
}
REGISTER_NEON_OPERATOR(AddN, AddNOp<DeviceType::NEON, float>);
#endif // __ARM_NEON
@@ -15,11 +15,25 @@ class AddNOp : public Operator<D, T> {
public:
AddNOp(const OperatorDef &operator_def, Workspace *ws)
: Operator<D, T>(operator_def, ws) {}
bool Run() override {
Tensor* output_tensor = this->Output(0);
kernels::AddNFuntion<T>(this->Inputs(), output_tensor);
Tensor* output_tensor = this->outputs_[0];
output_tensor->ResizeLike(this->inputs_[0]);
T* output = output_tensor->mutable_data<T>();
index_t size = this->inputs_[0]->size();
int n = this->inputs_.size();
vector<const T*> inputs(n);
for (int i = 0; i < n; ++i) {
const Tensor* input_tensor = this->inputs_[i];
inputs[i] = input_tensor->data<T>();
}
functor_(inputs, output, size);
return true;
}
private:
kernels::AddNFunctor<D, T> functor_;
};
} // namespace mace
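
With the kernel logic in the functor, Run() only marshals tensor memory and forwards, so one body serves every device D. A stripped-down stand-in of that delegation (the Tensor/Operator machinery is stubbed with std::vector; the sketch zero-fills the output first, since operator() accumulates with +=):

// Sketch, not part of the commit: the op-to-functor delegation shape.
#include <cstdint>
#include <vector>

typedef int64_t index_t;
enum DeviceType { CPU, NEON };

template <DeviceType D, typename T>
struct AddNFunctor {  // scalar body, as in mace/kernels/addn.h
  void operator()(const std::vector<const T*>& inputs, T* output,
                  index_t size) {
    for (const T* in : inputs)
      for (index_t j = 0; j < size; ++j) output[j] += in[j];
  }
};

template <DeviceType D, typename T>
struct AddNOpSketch {
  bool Run(const std::vector<std::vector<T>>& input_tensors,
           std::vector<T>* output_tensor) {
    output_tensor->assign(input_tensors[0].size(), T(0));  // ~ResizeLike
    std::vector<const T*> inputs;
    for (const std::vector<T>& t : input_tensors) inputs.push_back(t.data());
    functor_(inputs, output_tensor->data(),
             static_cast<index_t>(output_tensor->size()));
    return true;
  }
  AddNFunctor<D, T> functor_;  // device fixed at instantiation
};

int main() {
  std::vector<std::vector<float>> ins{{1, 2}, {3, 4}};
  std::vector<float> out;
  AddNOpSketch<CPU, float>().Run(ins, &out);  // out == {4, 6}
  return 0;
}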
@@ -3,23 +3,12 @@
//
#include "mace/ops/relu.h"
#include "mace/proto/mace.pb.h"
#if __ARM_NEON
#include "mace/kernels/neon/relu_neon.h"
#endif // __ARM_NEON
namespace mace {
REGISTER_CPU_OPERATOR(Relu, ReluOp<DeviceType::CPU, float>);
#if __ARM_NEON
template <>
bool ReluOp<DeviceType::NEON, float>::Run() {
const Tensor* input_tensor = Input(0);
Tensor* output_tensor = Output(0);
kernels::NeonReluFuntion_float(input_tensor, output_tensor);
return true;
}
REGISTER_NEON_OPERATOR(Relu, ReluOp<DeviceType::NEON, float>);
#endif // __ARM_NEON
@@ -16,11 +16,19 @@ class ReluOp : public Operator<D, T> {
ReluOp(const OperatorDef &operator_def, Workspace *ws)
: Operator<D, T>(operator_def, ws) {}
bool Run() override {
const Tensor* input_tensor = this->Input(0);
Tensor* output_tensor = this->Output(0);
kernels::ReluFuntion<T>(input_tensor, output_tensor);
const Tensor* input_tensor = this->inputs_[0];
Tensor* output_tensor = this->outputs_[0];
output_tensor->ResizeLike(input_tensor);
const T* input = input_tensor->data<T>();
T* output = output_tensor->mutable_data<T>();
index_t size = input_tensor->size();
functor_(input, output, size);
return true;
}
private:
kernels::ReluFunctor<D, T> functor_;
};
} // namespace mace