Commit 0c54e020 authored by 李寅

Implement ReluN

Parent 291a5ee6
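Here "ReluN" means a ReLU with an upper clamp (ReLU6 is the common special case): each element becomes output[i] = min(max(input[i], 0), max_limit). The diff below threads a max_limit_ field through ReluFunctor and the ReluOp constructor; a negative max_limit_ (the default of -1) selects the original unbounded ReLU path. For example, with max_limit = 0.5 the inputs {-1, 0.25, 2} map to {0, 0.25, 0.5}.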
@@ -12,18 +12,19 @@ template <>
 void ReluFunctor<DeviceType::NEON, float>::operator()(const float *input,
                                                       float *output,
                                                       index_t size) {
-#pragma omp parallel for num_threads(1)  // no significant performance improve
+  if (max_limit_ < 0) {
+#pragma omp parallel for num_threads(1)  // no significant perf improve
   for (int64_t i = 0; i < size; i += kCostPerGroup) {
     int64_t count = std::min(static_cast<int64_t>(kCostPerGroup), size - i);
-    int nn = count >> 2;
-    int remain = count - (nn << 2);
+    int block = count >> 2;
+    int remain = count - (block << 2);
     const float *inptr = input + i;
     float *outptr = output + i;
-    float32x4_t _zero = vdupq_n_f32(0.f);
-    for (; nn > 0; --nn) {
-      float32x4_t _inptr = vld1q_f32(inptr);
-      float32x4_t _outptr = vmaxq_f32(_inptr, _zero);
-      vst1q_f32(outptr, _outptr);
+    float32x4_t zero = vdupq_n_f32(0.f);
+    for (; block > 0; --block) {
+      float32x4_t in = vld1q_f32(inptr);
+      float32x4_t out = vmaxq_f32(in, zero);
+      vst1q_f32(outptr, out);
       inptr += 4;
       outptr += 4;
@@ -34,6 +35,32 @@ void ReluFunctor<DeviceType::NEON, float>::operator()(const float *input,
       ++outptr;
     }
   }
+  } else {
+#pragma omp parallel for num_threads(1)  // no significant perf improve
+    for (int64_t i = 0; i < size; i += kCostPerGroup) {
+      int64_t count = std::min(static_cast<int64_t>(kCostPerGroup), size - i);
+      int block = count >> 2;
+      int remain = count - (block << 2);
+      const float *inptr = input + i;
+      float *outptr = output + i;
+      float32x4_t zero = vdupq_n_f32(0.f);
+      float32x4_t vmax = vdupq_n_f32(max_limit_);
+      for (; block > 0; --block) {
+        float32x4_t in = vld1q_f32(inptr);
+        float32x4_t out = vmaxq_f32(in, zero);
+        out = vminq_f32(out, vmax);
+        vst1q_f32(outptr, out);
+        inptr += 4;
+        outptr += 4;
+      }
+      for (; remain > 0; --remain) {
+        *outptr = std::min(std::max(*inptr, 0.f), max_limit_);
+        ++inptr;
+        ++outptr;
+      }
+    }
+  }
 };
 }  // namespace kernels
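The NEON kernel above clamps four floats per iteration: vmaxq_f32 against a zero vector, then vminq_f32 against the broadcast limit, with a scalar loop for the leftover elements. A minimal standalone sketch of that clamp idiom (assuming a compiler targeting ARM with NEON; the data and limit here are invented for illustration):

    #include <arm_neon.h>
    #include <cstdio>

    int main() {
      const float input[8] = {-2.f, -0.5f, 0.f, 0.3f, 0.5f, 1.f, 2.f, 10.f};
      float output[8];
      float32x4_t zero = vdupq_n_f32(0.f);   // four lanes of 0
      float32x4_t vmax = vdupq_n_f32(0.5f);  // four lanes of the limit
      // Size is divisible by 4 here; the kernel above handles the
      // remainder with a scalar loop.
      for (int i = 0; i < 8; i += 4) {
        float32x4_t in = vld1q_f32(input + i);
        vst1q_f32(output + i, vminq_f32(vmaxq_f32(in, zero), vmax));
      }
      for (int i = 0; i < 8; ++i) printf("%g ", output[i]);
      printf("\n");  // prints: 0 0 0 0.3 0.5 0.5 0.5 0.5
      return 0;
    }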
...
@@ -12,10 +12,18 @@ namespace kernels {
 template <DeviceType D, typename T>
 struct ReluFunctor {
+  T max_limit_;
+
   void operator()(const T *input, T *output, index_t size) {
+    if (max_limit_ < 0) {
     for (index_t i = 0; i < size; ++i) {
       output[i] = std::max(input[i], static_cast<T>(0));
     }
+    } else {
+      for (index_t i = 0; i < size; ++i) {
+        output[i] = std::min(std::max(input[i], static_cast<T>(0)), max_limit_);
+      }
+    }
   }
 };
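The portable fallback above keeps two separate loops so the common unbounded case never pays an extra min per element. A self-contained sketch of the same sentinel convention (names mirror the diff, but this compiles and runs outside the MACE tree):

    #include <algorithm>
    #include <cstdint>
    #include <cstdio>

    struct ReluFunctorSketch {
      float max_limit_ = -1.f;  // negative sentinel: plain, unbounded ReLU
      void operator()(const float *input, float *output, int64_t size) const {
        if (max_limit_ < 0) {
          for (int64_t i = 0; i < size; ++i)
            output[i] = std::max(input[i], 0.f);
        } else {
          for (int64_t i = 0; i < size; ++i)
            output[i] = std::min(std::max(input[i], 0.f), max_limit_);
        }
      }
    };

    int main() {
      const float in[4] = {-1.f, 0.25f, 0.75f, 2.f};
      float out[4];
      ReluFunctorSketch relu;
      relu.max_limit_ = 0.5f;  // as the op would set it from "max_limit"
      relu(in, out, 4);
      for (float v : out) printf("%g ", v);  // prints: 0 0.25 0.5 0.5
      printf("\n");
      return 0;
    }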
...
@@ -155,9 +155,9 @@ TEST_F(PoolingOpTest, MAX_k2x2s2x2) {
   net.RunOp(DeviceType::NEON);

   // Check
-  Tensor expected = CreateTensor<float>({1, 1, 2, 3}, {6, 8, 9, 16, 18, 19});
+  auto expected = CreateTensor<float>({1, 1, 2, 3}, {6, 8, 9, 16, 18, 19});

-  ExpectTensorNear<float>(expected, *net.GetOutput("Output"), 0.001);
+  ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001);
 }

 TEST_F(PoolingOpTest, MAX_k3x3s2x2) {
@@ -183,7 +183,7 @@ TEST_F(PoolingOpTest, MAX_k3x3s2x2) {
   net.RunOp(DeviceType::NEON);

   // Check
-  Tensor expected = CreateTensor<float>({1, 1, 2, 3}, {11, 13, 14, 16, 18, 19});
+  auto expected = CreateTensor<float>({1, 1, 2, 3}, {11, 13, 14, 16, 18, 19});

-  ExpectTensorNear<float>(expected, *net.GetOutput("Output"), 0.001);
+  ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 0.001);
 }
@@ -14,7 +14,10 @@ template <DeviceType D, class T>
 class ReluOp : public Operator<D, T> {
  public:
   ReluOp(const OperatorDef& operator_def, Workspace* ws)
-      : Operator<D, T>(operator_def, ws) {}
+      : Operator<D, T>(operator_def, ws) {
+    functor_.max_limit_ =
+        OperatorBase::GetSingleArgument<T>("max_limit", static_cast<T>(-1));
+  }

   bool Run() override {
     const Tensor* input_tensor = this->inputs_[0];
     Tensor* output_tensor = this->outputs_[0];
...
@@ -18,7 +18,7 @@ TEST_F(ReluOpTest, ReluOp) {
       .Finalize(net.operator_def());

   // Add input data
-  net.AddRandomInput<float>("Input", {1, 2, 3, 4});
+  net.AddRandomInput<float>("Input", {1, 2, 3, 5});

   // Run
   net.RunOp();
@@ -32,4 +32,29 @@ TEST_F(ReluOpTest, ReluOp) {
   ExpectTensorNear<float>(expected, *net.GetOutput("Output"), 0.01);
 }

+TEST_F(ReluOpTest, ReluOpWithMax) {
+  // Construct graph
+  auto& net = test_net();
+  OpDefBuilder("Relu", "ReluTestWithMax")
+      .Input("Input")
+      .Output("Output")
+      .Finalize(net.operator_def());
+
+  // Add input data
+  net.AddRandomInput<float>("Input", {1, 2, 3, 5});
+  net.AddFloatArg("max_limit", 0.5);
+
+  // Run
+  net.RunOp();
+
+  Tensor expected;
+  expected.Copy(*net.GetOutput("Output"));
+
+  // Check
+  net.RunOp(DeviceType::NEON);
+
+  ExpectTensorNear<float>(expected, *net.GetOutput("Output"), 0.01);
+}
+
 }  // namespace mace