Unverified commit e28a71fb, authored by BUG1989, committed by GitHub

fix relu reference (#611)

* Fix relu bugs
Parent 129c5eeb
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
/*
* Copyright (c) 2020, OPEN AI LAB
* Author: haitao@openailab.com
*/
#include <math.h>
#include <arm_neon.h>
#include "relu_kernel_arm.h"
#define MIN(a, b) ((a) < (b) ? (a) : (b))
#define MAX(a, b) ((a) > (b) ? (a) : (b))
static inline int relu_kernel(const int i, const int id, const void* data, const float* input, float* output,
const float slope)
{
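    /* "data" carries the element count ("step") for this call; "id" selects which
     * block of step elements to process. The parameter "i" is unused here. */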
float32x4_t _zero = vdupq_n_f32(0.f);
int step = (( int* )data)[0];
const float* cur_input = input + id * step;
float* cur_output = output + id * step;
if (slope == 0)
{
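        /* plain ReLU path: NEON body processes 4 floats per iteration, out = max(x, 0) */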
for (int l = 0; l < (step & -4); l += 4)
{
float32x4_t _p = vld1q_f32(cur_input);
_p = vmaxq_f32(_p, _zero);
vst1q_f32(cur_output, _p);
cur_input += 4;
cur_output += 4;
}
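        /* scalar tail handles the remaining step % 4 elements */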
for (int i = step & ~3; i < step; i++)
{
*cur_output++ = MAX(*cur_input++, 0.f);
}
}
else
{
float32x4_t _slope = vdupq_n_f32(slope);
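        /* leaky ReLU path: where x <= 0, select slope * x; otherwise keep x */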
for (int l = 0; l < (step & -4); l += 4)
{
float32x4_t _p = vld1q_f32(cur_input);
// ri = ai <= bi ? 1...1:0...0
uint32x4_t _lemask = vcleq_f32(_p, _zero);
float32x4_t _ps = vmulq_f32(_p, _slope);
// bitwise select
_p = vbslq_f32(_lemask, _ps, _p);
vst1q_f32(cur_output, _p);
cur_input += 4;
cur_output += 4;
}
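        /* scalar tail: max(x, 0) + slope * min(x, 0) gives the same leaky rule */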
for (int i = step & ~3; i < step; i++)
{
*cur_output++ = MAX(cur_input[0], 0.f) + slope * MIN(cur_input[0], 0.f);
cur_input++;
}
}
return 0;
}
int relu_arm_run(struct ir_tensor* output_tensor, struct ir_tensor* input_tensor, struct relu_param* relu_param,
int num_thread)
{
float* data = ( float* )input_tensor->data;
float* out_data = ( float* )output_tensor->data;
float negativeslope = relu_param->negative_slope;
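    /* the kernel treats the input as 4-D: dims[0] * dims[1] blocks, each of dims[2] * dims[3] floats */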
int chan_num = input_tensor->dims[0] * input_tensor->dims[1];
int chan_size = input_tensor->dims[2] * input_tensor->dims[3];
// #pragma omp parallel for num_threads(num_thread)
for (int i = 0; i < chan_num; i++)
{
int offset = i * chan_size;
relu_kernel(0, 0, &chan_size, data + offset, out_data + offset, negativeslope);
}
return 0;
}
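For orientation only, a hypothetical driver showing how relu_arm_run might be called. This is a sketch, not part of the commit: it assumes ir_tensor exposes the data and dims fields used above and that a caller may fill them in by hand, whereas real Tengine code obtains tensors from the graph runtime.

#include <string.h>
#include "relu_kernel_arm.h"

static void run_leaky_relu_example(float* in, float* out, int n, int c, int h, int w, float slope)
{
    /* hypothetical setup: zero the tensors, then fill only the fields relu_arm_run reads */
    struct ir_tensor input, output;
    memset(&input, 0, sizeof(input));
    memset(&output, 0, sizeof(output));

    input.data = in;                 /* relu_arm_run reads input->data and dims[0..3] */
    input.dims[0] = n;  input.dims[1] = c;
    input.dims[2] = h;  input.dims[3] = w;
    output.data = out;               /* only output->data is written */

    struct relu_param param;
    param.negative_slope = slope;    /* 0.f gives plain ReLU */

    relu_arm_run(&output, &input, &param, 1 /* num_thread */);
}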
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
/*
* Copyright (c) 2020, OPEN AI LAB
* Author: haitao@openailab.com
*/
#ifndef _RELU_KERNEL_ARM_H_
#define _RELU_KERNEL_ARM_H_
#include "tengine_ir.h"
#include "relu_param.h"
int relu_arm_run(struct ir_tensor* output_tensor, struct ir_tensor* input_tensor, struct relu_param* relu_param,
int num_thread);
#endif
@@ -36,48 +36,26 @@ static int ref_relu_fp32(struct ir_tensor* input_tensor, struct ir_tensor* outpu
 {
     float* input_data = input_tensor->data;
     float* output_data = output_tensor->data;
-    int batch = input_tensor->dims[0];
-    int channel = input_tensor->dims[1];
-    int cstep = input_tensor->dims[2] * input_tensor->dims[3];
+    int total_size = input_tensor->elem_num;
 
     if (negative_slope == 0)
     {
-        for (int n = 0; n < batch; n++)
+        for (int i = 0; i < total_size; i++)
         {
-#pragma omp parallel for num_threads(num_thread)
-            for (int c = 0; c < channel; c++)
-            {
-                float* in_data = input_data + n * channel * cstep + c * cstep;
-                float* out_data = output_data + n * channel * cstep + c * cstep;
-                for (int i = 0; i < cstep; i++)
-                {
-                    if (in_data[i] < 0.f)
-                        out_data[i] = 0.f;
-                    else
-                        out_data[i] = in_data[i];
-                }
-            }
+            if (input_data[i] < 0)
+                output_data[i] = 0;
+            else
+                output_data[i] = input_data[i];
         }
     }
     else
     {
-        for (int n = 0; n < batch; n++)
+        for (int i = 0; i < total_size; i++)
         {
-#pragma omp parallel for num_threads(num_thread)
-            for (int c = 0; c < channel; c++)
-            {
-                float* in_data = input_data + n * channel * cstep + c * cstep;
-                float* out_data = output_data + n * channel * cstep + c * cstep;
-                for (int i = 0; i < cstep; i++)
-                {
-                    if (in_data[i] < 0)
-                        out_data[i] = in_data[i] * negative_slope;
-                    else
-                        out_data[i] = in_data[i];
-                }
-            }
+            if (input_data[i] < 0)
+                output_data[i] = input_data[i] * negative_slope;
+            else
+                output_data[i] = input_data[i];
         }
     }
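The net effect of the hunk above is that the reference implementation now walks the tensor as a flat array of elem_num floats instead of assuming a 4-D layout. As a standalone illustration (names here are made up, not Tengine API), the per-element rule is:

/* Illustrative only: the same element-wise rule as the rewritten reference,
 * applied to plain arrays. negative_slope == 0 gives standard ReLU. */
static void relu_reference_sketch(const float* in, float* out, int total_size, float negative_slope)
{
    for (int i = 0; i < total_size; i++)
    {
        if (in[i] < 0.f)
            out[i] = in[i] * negative_slope;
        else
            out[i] = in[i];
    }
}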