Repository: 机器未来 / Paddle (forked from PaddlePaddle / Paddle)

Commit 0fbcd4ea
Authored Jun 15, 2019 by hong19860320

add arm kernel and unit test for relu op
test=develop

Parent: 7c02e682
Showing 7 changed files with 708 additions and 22 deletions (+708 −22)
paddle/fluid/lite/arm/math/CMakeLists.txt                 +1   −0
paddle/fluid/lite/arm/math/activation.cc                  +520 −0
paddle/fluid/lite/arm/math/activation.h                   +50  −0
paddle/fluid/lite/kernels/arm/CMakeLists.txt              +3   −2
paddle/fluid/lite/kernels/arm/activation_compute.cc       +11  −19
paddle/fluid/lite/kernels/arm/activation_compute.h        +23  −1
paddle/fluid/lite/kernels/arm/activation_compute_test.cc  +100 −0
paddle/fluid/lite/arm/math/CMakeLists.txt
@@ -32,6 +32,7 @@ cc_library(math_arm SRCS
             conv_winograd_3x3.cc
             conv_winograd.cc
             split.cc
+            activation.cc
             DEPS ${lite_kernel_deps} eigen3 framework_proto_lite)
             # TODO(TJ): fix me do not deps proto
paddle/fluid/lite/arm/math/activation.cc
0 → 100644
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/lite/arm/math/activation.h"
#include "paddle/fluid/lite/arm/math/funcs.h"
namespace paddle {
namespace lite {
namespace arm {
namespace math {

template <>
void act_relu<float>(const float* din, float* dout, int size, int threads) {
  int nums_per_thread = size / threads;
  int remain = size - threads * nums_per_thread;
  int neon_loop_cnt = nums_per_thread >> 4;
  int neon_loop_remain = nums_per_thread - (neon_loop_cnt << 4);
  float32x4_t vzero = vdupq_n_f32(0.f);
#pragma omp parallel for
  for (int i = 0; i < threads; ++i) {
    const float* ptr_in_thread = din + i * nums_per_thread;
    float* ptr_out_thread = dout + i * nums_per_thread;
    int cnt = neon_loop_cnt;
#ifdef __aarch64__
    for (int num = 0; num < neon_loop_cnt; ++num) {
      float32x4_t vr0 = vld1q_f32(ptr_in_thread);
      ptr_in_thread += 4;
      float32x4_t vr1 = vld1q_f32(ptr_in_thread);
      ptr_in_thread += 4;
      float32x4_t vr2 = vld1q_f32(ptr_in_thread);
      ptr_in_thread += 4;
      float32x4_t vr3 = vld1q_f32(ptr_in_thread);
      ptr_in_thread += 4;
      vr0 = vmaxq_f32(vr0, vzero);
      vr1 = vmaxq_f32(vr1, vzero);
      vr2 = vmaxq_f32(vr2, vzero);
      vr3 = vmaxq_f32(vr3, vzero);
      vst1q_f32(ptr_out_thread, vr0);
      ptr_out_thread += 4;
      vst1q_f32(ptr_out_thread, vr1);
      ptr_out_thread += 4;
      vst1q_f32(ptr_out_thread, vr2);
      ptr_out_thread += 4;
      vst1q_f32(ptr_out_thread, vr3);
      ptr_out_thread += 4;
    }
#else
    if (cnt > 0) {
      asm volatile(
          "1:                              @ loop header\n"
          "vld1.32  {d0-d3}, [%[din]]!     @ load din 0\n"
          "vld1.32  {d4-d7}, [%[din]]!     @ load din 0\n"
          "vmax.f32 q8, q0, %q[vzero]      @ relu\n"
          "vmax.f32 q9, q1, %q[vzero]      @ relu\n"
          "vmax.f32 q10, q2, %q[vzero]     @ relu\n"
          "vmax.f32 q11, q3, %q[vzero]     @ relu\n"
          "vst1.32  {d16-d19}, [%[dout]]!  @ store result, add pointer\n"
          "vst1.32  {d20-d23}, [%[dout]]!  @ store result, add pointer\n"
          "subs %[cnt], #1                 @ loop count minus 1\n"
          "bne 1b                          @ jump to main loop start point\n"
          : [dout] "+r"(ptr_out_thread), [din] "+r"(ptr_in_thread),
            [cnt] "+r"(cnt)
          : [vzero] "w"(vzero)
          : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11");
    }
#endif
    for (int j = 0; j < neon_loop_remain; ++j) {
      ptr_out_thread[0] = ptr_in_thread[0] > 0.f ? ptr_in_thread[0] : 0.f;
      ptr_in_thread++;
      ptr_out_thread++;
    }
  }
  float* out_ptr_remain = dout + threads * nums_per_thread;
  const float* in_ptr_remain = din + threads * nums_per_thread;
  for (int j = 0; j < remain; ++j) {
    out_ptr_remain[0] = in_ptr_remain[0] > 0.f ? in_ptr_remain[0] : 0.f;
    in_ptr_remain++;
    out_ptr_remain++;
  }
}
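// Editor's note (added, not part of the commit): a worked example of the
// partitioning above, assuming size = 1000 and threads = 4:
//   nums_per_thread = 1000 / 4 = 250, remain = 1000 - 4 * 250 = 0;
//   per thread, neon_loop_cnt = 250 >> 4 = 15 NEON iterations covering
//   15 * 16 = 240 floats, plus neon_loop_remain = 10 scalar tail elements.
// When size is not divisible by threads, the leftover `remain` elements are
// processed serially after the OpenMP region.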
template <>
void act_relu_neg<float>(const float* din, float* dout, int size,
                         const float negative_slope, int threads) {
  int nums_per_thread = size / threads;
  int remain = size - threads * nums_per_thread;
  int neon_loop_cnt = nums_per_thread >> 4;
  int neon_loop_remain = nums_per_thread - (neon_loop_cnt << 4);
  float32x4_t vzero = vdupq_n_f32(0.f);
  float32x4_t valpha = vdupq_n_f32(negative_slope);
#pragma omp parallel for
  for (int i = 0; i < threads; ++i) {
    const float* ptr_in_thread = din + i * nums_per_thread;
    float* ptr_out_thread = dout + i * nums_per_thread;
    int cnt = neon_loop_cnt;
#ifdef __aarch64__
    for (int num = 0; num < neon_loop_cnt; ++num) {
      float32x4_t vr0 = vld1q_f32(ptr_in_thread);
      ptr_in_thread += 4;
      float32x4_t vr1 = vld1q_f32(ptr_in_thread);
      ptr_in_thread += 4;
      float32x4_t vr2 = vld1q_f32(ptr_in_thread);
      ptr_in_thread += 4;
      float32x4_t vr3 = vld1q_f32(ptr_in_thread);
      ptr_in_thread += 4;
      uint32x4_t vm0 = vcgeq_f32(vr0, vzero);
      uint32x4_t vm1 = vcgeq_f32(vr1, vzero);
      uint32x4_t vm2 = vcgeq_f32(vr2, vzero);
      uint32x4_t vm3 = vcgeq_f32(vr3, vzero);
      float32x4_t vn0 = vmulq_f32(vr0, valpha);
      float32x4_t vn1 = vmulq_f32(vr1, valpha);
      float32x4_t vn2 = vmulq_f32(vr2, valpha);
      float32x4_t vn3 = vmulq_f32(vr3, valpha);
      float32x4_t vo0 = vbslq_f32(vm0, vr0, vn0);
      float32x4_t vo1 = vbslq_f32(vm1, vr1, vn1);
      float32x4_t vo2 = vbslq_f32(vm2, vr2, vn2);
      float32x4_t vo3 = vbslq_f32(vm3, vr3, vn3);
      vst1q_f32(ptr_out_thread, vo0);
      ptr_out_thread += 4;
      vst1q_f32(ptr_out_thread, vo1);
      ptr_out_thread += 4;
      vst1q_f32(ptr_out_thread, vo2);
      ptr_out_thread += 4;
      vst1q_f32(ptr_out_thread, vo3);
      ptr_out_thread += 4;
    }
#else
    if (cnt > 0) {
      asm volatile(
          "1:                              @ loop header\n"
          "vld1.32  {d0-d3}, [%[din]]!     @ load din 0\n"
          "vld1.32  {d4-d7}, [%[din]]!     @ load din 0\n"
          "vcge.f32 q8, q0, %q[vzero]      @ get mask\n"
          "vcge.f32 q9, q1, %q[vzero]      @ get mask\n"
          "vcge.f32 q10, q2, %q[vzero]     @ get mask\n"
          "vcge.f32 q11, q3, %q[vzero]     @ get mask\n"
          "vmul.f32 q4, q0, %q[valpha]     @ get neg data\n"
          "vmul.f32 q5, q1, %q[valpha]     @ get neg data\n"
          "vmul.f32 q6, q2, %q[valpha]     @ get neg data\n"
          "vmul.f32 q7, q3, %q[valpha]     @ get neg data\n"
          "vbit q4, q0, q8                 @ bitsel, insert q0 to q4, if q8 is 1\n"
          "vbit q5, q1, q9                 @ bitsel, insert q1 to q5, if q9 is 1\n"
          "vbit q6, q2, q10                @ bitsel, insert q2 to q6, if q10 is 1\n"
          "vbit q7, q3, q11                @ bitsel, insert q3 to q7, if q11 is 1\n"
          "vst1.32  {d8-d11}, [%[dout]]!   @ store result, add pointer\n"
          "vst1.32  {d12-d15}, [%[dout]]!  @ store result, add pointer\n"
          "subs %[cnt], #1                 @ loop count minus 1\n"
          "bne 1b                          @ jump to main loop start point\n"
          : [dout] "+r"(ptr_out_thread), [din] "+r"(ptr_in_thread),
            [cnt] "+r"(cnt)
          : [vzero] "w"(vzero), [valpha] "w"(valpha)
          : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
            "q8", "q9", "q10", "q11");
    }
#endif
    for (int j = 0; j < neon_loop_remain; ++j) {
      ptr_out_thread[0] = ptr_in_thread[0] > 0.f
                              ? ptr_in_thread[0]
                              : ptr_in_thread[0] * negative_slope;
      ptr_in_thread++;
      ptr_out_thread++;
    }
  }
  float* out_ptr_remain = dout + threads * nums_per_thread;
  const float* in_ptr_remain = din + threads * nums_per_thread;
  for (int j = 0; j < remain; ++j) {
    out_ptr_remain[0] = in_ptr_remain[0] > 0.f
                            ? in_ptr_remain[0]
                            : in_ptr_remain[0] * negative_slope;
    in_ptr_remain++;
    out_ptr_remain++;
  }
}
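// Editor's note (added, not part of the commit): vcgeq_f32(vr, vzero) sets a
// lane to all-ones where vr >= 0, and vbslq_f32(mask, a, b) takes bits from a
// where the mask bit is 1 and from b where it is 0. The intrinsic path above
// therefore computes dout = din >= 0 ? din : din * negative_slope without
// branching; the armv7 asm path expresses the same selection with vcge.f32
// followed by vbit.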
template <>
void act_clipped_relu<float>(const float* din, float* dout, int size,
                             const float coef, int threads) {
  int nums_per_thread = size / threads;
  int remain = size - threads * nums_per_thread;
  int neon_loop_cnt = nums_per_thread >> 4;
  int neon_loop_remain = nums_per_thread - (neon_loop_cnt << 4);
  float32x4_t vzero = vdupq_n_f32(0.f);
  float32x4_t vclip = vdupq_n_f32(coef);
#pragma omp parallel for
  for (int i = 0; i < threads; ++i) {
    const float* ptr_in_thread = din + i * nums_per_thread;
    float* ptr_out_thread = dout + i * nums_per_thread;
    int cnt = neon_loop_cnt;
#ifdef __aarch64__
    for (int num = 0; num < neon_loop_cnt; ++num) {
      float32x4_t vr0 = vld1q_f32(ptr_in_thread);
      ptr_in_thread += 4;
      float32x4_t vr1 = vld1q_f32(ptr_in_thread);
      ptr_in_thread += 4;
      float32x4_t vr2 = vld1q_f32(ptr_in_thread);
      ptr_in_thread += 4;
      float32x4_t vr3 = vld1q_f32(ptr_in_thread);
      ptr_in_thread += 4;
      float32x4_t vt0 = vmaxq_f32(vr0, vzero);
      float32x4_t vt1 = vmaxq_f32(vr1, vzero);
      float32x4_t vt2 = vmaxq_f32(vr2, vzero);
      float32x4_t vt3 = vmaxq_f32(vr3, vzero);
      float32x4_t vo0 = vminq_f32(vt0, vclip);
      float32x4_t vo1 = vminq_f32(vt1, vclip);
      float32x4_t vo2 = vminq_f32(vt2, vclip);
      float32x4_t vo3 = vminq_f32(vt3, vclip);
      vst1q_f32(ptr_out_thread, vo0);
      ptr_out_thread += 4;
      vst1q_f32(ptr_out_thread, vo1);
      ptr_out_thread += 4;
      vst1q_f32(ptr_out_thread, vo2);
      ptr_out_thread += 4;
      vst1q_f32(ptr_out_thread, vo3);
      ptr_out_thread += 4;
    }
#else
    if (cnt > 0) {
      asm volatile(
          "1:                              @ loop header\n"
          "vld1.32  {d0-d3}, [%[din]]!     @ load din 0\n"
          "vld1.32  {d4-d7}, [%[din]]!     @ load din 0\n"
          "vmax.f32 q8, q0, %q[vzero]      @ relu\n"
          "vmax.f32 q9, q1, %q[vzero]      @ relu\n"
          "vmax.f32 q10, q2, %q[vzero]     @ relu\n"
          "vmax.f32 q11, q3, %q[vzero]     @ relu\n"
          "vmin.f32 q4, q8, %q[vclip]      @ clip relu\n"
          "vmin.f32 q5, q9, %q[vclip]      @ clip relu\n"
          "vmin.f32 q6, q10, %q[vclip]     @ clip relu\n"
          "vmin.f32 q7, q11, %q[vclip]     @ clip relu\n"
          "vst1.32  {d8-d11}, [%[dout]]!   @ store result, add pointer\n"
          "vst1.32  {d12-d15}, [%[dout]]!  @ store result, add pointer\n"
          "subs %[cnt], #1                 @ loop count minus 1\n"
          "bne 1b                          @ jump to main loop start point\n"
          : [dout] "+r"(ptr_out_thread), [din] "+r"(ptr_in_thread),
            [cnt] "+r"(cnt)
          : [vzero] "w"(vzero), [vclip] "w"(vclip)
          : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
            "q8", "q9", "q10", "q11");
    }
#endif
    for (int j = 0; j < neon_loop_remain; ++j) {
      ptr_out_thread[0] = ptr_in_thread[0] > 0.f ? ptr_in_thread[0] : 0.f;
      ptr_out_thread[0] = ptr_out_thread[0] < coef ? ptr_out_thread[0] : coef;
      ptr_in_thread++;
      ptr_out_thread++;
    }
  }
  float* out_ptr_remain = dout + threads * nums_per_thread;
  const float* in_ptr_remain = din + threads * nums_per_thread;
  for (int j = 0; j < remain; ++j) {
    out_ptr_remain[0] = in_ptr_remain[0] > 0.f ? in_ptr_remain[0] : 0.f;
    out_ptr_remain[0] = out_ptr_remain[0] < coef ? out_ptr_remain[0] : coef;
    in_ptr_remain++;
    out_ptr_remain++;
  }
}
template <>
void act_prelu<float>(const float* din, float* dout, int outer_size,
                      int channel_size, int inner_size, bool channel_shared,
                      float* channel_slope, int threads) {
  int stride_size = inner_size * channel_size;
  int cnt = inner_size >> 4;
  int remain = inner_size & 15;
  float32x4_t vzero = vdupq_n_f32(0.f);
  for (int n = 0; n < outer_size; n++) {
    const float* data_in_batch = din + n * stride_size;
    float* data_out_batch = dout + n * stride_size;
#pragma omp parallel for
    for (int c = 0; c < channel_size; c++) {
      const float* data_in_c = data_in_batch + c * inner_size;
      float* data_out_c = data_out_batch + c * inner_size;
      float slope = channel_shared ? channel_slope[0] : channel_slope[c];
      float32x4_t vslope = vdupq_n_f32(slope);
#ifdef __aarch64__
      for (int i = 0; i < cnt; ++i) {
        float32x4_t vr0 = vld1q_f32(data_in_c);
        float32x4_t vr1 = vld1q_f32(data_in_c + 4);
        float32x4_t vr2 = vld1q_f32(data_in_c + 8);
        float32x4_t vr3 = vld1q_f32(data_in_c + 12);
        uint32x4_t vm0 = vcltq_f32(vr0, vzero);    // vr0 < vzero
        uint32x4_t vm1 = vcltq_f32(vr1, vzero);    // vr1 < vzero
        uint32x4_t vm2 = vcltq_f32(vr2, vzero);    // vr2 < vzero
        uint32x4_t vm3 = vcltq_f32(vr3, vzero);    // vr3 < vzero
        float32x4_t vo0 = vmulq_f32(vr0, vslope);  // vr0 * vslope
        float32x4_t vo1 = vmulq_f32(vr1, vslope);  // vr1 * vslope
        float32x4_t vo2 = vmulq_f32(vr2, vslope);  // vr2 * vslope
        float32x4_t vo3 = vmulq_f32(vr3, vslope);  // vr3 * vslope
        float32x4_t vos0 = vbslq_f32(vm0, vo0, vr0);
        float32x4_t vos1 = vbslq_f32(vm1, vo1, vr1);
        float32x4_t vos2 = vbslq_f32(vm2, vo2, vr2);
        float32x4_t vos3 = vbslq_f32(vm3, vo3, vr3);
        vst1q_f32(data_out_c, vos0);
        vst1q_f32(data_out_c + 4, vos1);
        vst1q_f32(data_out_c + 8, vos2);
        vst1q_f32(data_out_c + 12, vos3);
        data_in_c += 16;
        data_out_c += 16;
      }
#else
      int cnt_loop = cnt;
      if (cnt_loop > 0) {
        asm volatile(
            "vld1.32  {d0-d3}, [%[ptr_in]]!  @ load input to q0, q1\n"
            "pld [%[ptr_in]]                 @ preload\n"
            "pld [%[ptr_in], #64]            @ preload\n"
            "pld [%[ptr_in], #128]           @ preload\n"
            "pld [%[ptr_in], #192]           @ preload\n"
            "1:                              @ main loop\n"
            "vld1.32  {d4-d7}, [%[ptr_in]]!  @ load input to q2, q3\n"
            "vclt.f32 q8, q0, %q[vzero]      @ vclt q0 < vzero\n"
            "vclt.f32 q9, q1, %q[vzero]      @ vclt q1 < vzero\n"
            "vmul.f32 q10, q0, %q[vslope]    @ vmul q0 * vslope\n"
            "vmul.f32 q11, q1, %q[vslope]    @ vmul q1 * vslope\n"
            "vclt.f32 q12, q2, %q[vzero]     @ vclt q2 < vzero\n"
            "vclt.f32 q13, q3, %q[vzero]     @ vclt q3 < vzero\n"
            "vmul.f32 q14, q2, %q[vslope]    @ vmul q2 * vslope\n"
            "vmul.f32 q15, q3, %q[vslope]    @ vmul q3 * vslope\n"
            "vbif.32 q10, q0, q8             @ vbif q10, q0, q8\n"
            "vbif.32 q11, q1, q9             @ vbif q11, q1, q9\n"
            "vbif.32 q14, q2, q12            @ vbif q14, q2, q12\n"
            "vbif.32 q15, q3, q13            @ vbif q15, q3, q13\n"
            "subs %[cnt], #1                 @ subs nn, 1\n"
            "vld1.32  {d0-d3}, [%[ptr_in]]!  @ load input to q0, q1\n"
            "vst1.f32 {d20-d23}, [%[dout]]!  @ store data\n"
            "vst1.f32 {d28-d31}, [%[dout]]!  @ store data\n"
            "bne 1b                          @ bne nn\n"
            "sub %[ptr_in], #32              @ ptr - 32\n"
            : [ptr_in] "+r"(data_in_c), [cnt] "+r"(cnt_loop),
              [dout] "+r"(data_out_c)
            : [vzero] "w"(vzero), [vslope] "w"(vslope)
            : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10",
              "q11", "q12", "q13", "q14", "q15");
      }
#endif  // __aarch64__
      for (int i = remain; i > 0; i--) {
        *(data_out_c++) =
            data_in_c[0] > 0.f ? data_in_c[0] : data_in_c[0] * slope;
        data_in_c++;
      }
    }
  }
}
template <>
void act_sigmoid(const float* din, float* dout, int size, int threads) {
  int nums_per_thread = size / threads;
  int remain = size - threads * nums_per_thread;
  int neon_loop_cnt_dim4 = nums_per_thread >> 2;
  int neon_loop_remain_dim4 = nums_per_thread - (neon_loop_cnt_dim4 << 2);
  float32x4_t vzero = vdupq_n_f32(0.f);
#pragma omp parallel for
  for (int i = 0; i < threads; ++i) {
    float32x4_t exp_vec = vdupq_n_f32(0.0f);
    float32x4_t recip = vdupq_n_f32(0.0f);
    const float* ptr_in_thread = din + i * nums_per_thread;
    float* ptr_out_thread = dout + i * nums_per_thread;
    for (int k = 0; k < neon_loop_cnt_dim4; ++k) {
      exp_vec = exp_ps(vnegq_f32(vld1q_f32(ptr_in_thread)));
      exp_vec = vaddq_f32(exp_vec, vdupq_n_f32(1.0f));
      recip = vrecpeq_f32(exp_vec);
      recip = vmulq_f32(vrecpsq_f32(exp_vec, recip), recip);
      recip = vmulq_f32(vrecpsq_f32(exp_vec, recip), recip);
      vst1q_f32(ptr_out_thread, recip);
      ptr_out_thread += 4;
      ptr_in_thread += 4;
    }
    for (int j = 0; j < neon_loop_remain_dim4; ++j) {
      ptr_out_thread[0] = 1.f / (1 + expf(-ptr_in_thread[0]));
      ptr_in_thread++;
      ptr_out_thread++;
    }
  }
  float* ptr_out = dout + threads * nums_per_thread;
  const float* ptr_in = din + threads * nums_per_thread;
  for (int j = 0; j < remain; ++j) {
    ptr_out[0] = 1.f / (1 + expf(-ptr_in[0]));
    ptr_in++;
    ptr_out++;
  }
}
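// Editor's note (added, not part of the commit): vrecpeq_f32 yields a coarse
// (~8-bit) estimate r of 1 / x, and vrecpsq_f32(x, r) computes 2 - x * r, so
// each recip = vmulq_f32(vrecpsq_f32(exp_vec, recip), recip) line above is one
// Newton-Raphson step that roughly doubles the accurate bits; the two steps
// bring 1 / (1 + e^-x) close to full float32 precision.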
// tanh : (exp(x) - exp(-x)) / (exp(x) + exp(-x))
template <>
void act_tanh<float>(const float* din, float* dout, int size, int threads) {
  int nums_per_thread = size / threads;
  int remain = size - threads * nums_per_thread;
  int neon_loop_cnt_dim4 = nums_per_thread >> 2;
  int neon_loop_remain_dim4 = nums_per_thread - (neon_loop_cnt_dim4 << 2);
#pragma omp parallel for
  for (int i = 0; i < threads; ++i) {
    float32x4_t exp_plus_vec = vdupq_n_f32(0.0f);
    float32x4_t exp_minus_vec = vdupq_n_f32(0.0f);
    float32x4_t exp_sum_vec = vdupq_n_f32(0.0f);
    float32x4_t exp_diff_vec = vdupq_n_f32(0.0f);
    float32x4_t recip = vdupq_n_f32(0.0f);
    const float* ptr_in_thread = din + i * nums_per_thread;
    float* ptr_out_thread = dout + i * nums_per_thread;
    for (int k = 0; k < neon_loop_cnt_dim4; ++k) {
      exp_plus_vec = exp_ps(vld1q_f32(ptr_in_thread));
      exp_minus_vec = exp_ps(vnegq_f32(vld1q_f32(ptr_in_thread)));
      exp_sum_vec = vaddq_f32(exp_plus_vec, exp_minus_vec);
      exp_diff_vec = vsubq_f32(exp_plus_vec, exp_minus_vec);
      recip = div_ps(exp_diff_vec, exp_sum_vec);
      vst1q_f32(ptr_out_thread, recip);
      ptr_out_thread += 4;
      ptr_in_thread += 4;
    }
    for (int j = 0; j < neon_loop_remain_dim4; ++j) {
      ptr_out_thread[0] = (expf(ptr_in_thread[0]) - expf(-ptr_in_thread[0])) /
                          (expf(ptr_in_thread[0]) + expf(-ptr_in_thread[0]));
      ptr_in_thread++;
      ptr_out_thread++;
    }
  }
  float* ptr_out = dout + threads * nums_per_thread;
  const float* ptr_in = din + threads * nums_per_thread;
  for (int j = 0; j < remain; ++j) {
    ptr_out[0] = (expf(ptr_in[0]) - expf(-ptr_in[0])) /
                 (expf(ptr_in[0]) + expf(-ptr_in[0]));
    ptr_in++;
    ptr_out++;
  }
}
// swish: x / (1 + exp(-(b * x)))
template <>
void act_swish<float>(const float* din, float* dout, int size,
                      const float coef, int threads) {
  int nums_per_thread = size / threads;
  int remain = size - threads * nums_per_thread;
  int neon_loop_cnt_dim4 = nums_per_thread >> 2;
  int neon_loop_remain_dim4 = nums_per_thread - (neon_loop_cnt_dim4 << 2);
  const float beta = coef;
  float32x4_t vbeta = vdupq_n_f32(beta);
  float32x4_t vone = vdupq_n_f32(1.f);
#pragma omp parallel for
  for (int i = 0; i < threads; ++i) {
    const float* ptr_in_thread = din + i * nums_per_thread;
    float* ptr_out_thread = dout + i * nums_per_thread;
    for (int k = 0; k < neon_loop_cnt_dim4; ++k) {
      float32x4_t va = vld1q_f32(ptr_in_thread);             // x
      float32x4_t vb = vnegq_f32(vld1q_f32(ptr_in_thread));  // -x
      float32x4_t vsum = vmulq_f32(vb, vbeta);
      vsum = exp_ps(vsum);
      float32x4_t vc = vaddq_f32(vone, vsum);
      float32x4_t vrst = div_ps(va, vc);
      vst1q_f32(ptr_out_thread, vrst);
      ptr_out_thread += 4;
      ptr_in_thread += 4;
    }
    for (int j = 0; j < neon_loop_remain_dim4; ++j) {
      ptr_out_thread[0] =
          ptr_in_thread[0] / (1.0 + expf(-ptr_in_thread[0] * beta));
      ptr_in_thread++;
      ptr_out_thread++;
    }
  }
  float* ptr_out = dout + threads * nums_per_thread;
  const float* ptr_in = din + threads * nums_per_thread;
  for (int j = 0; j < remain; ++j) {
    ptr_out[0] = ptr_in[0] / (1.0 + expf(-ptr_in[0] * beta));
    ptr_in++;
    ptr_out++;
  }
}

}  // namespace math
}  // namespace arm
}  // namespace lite
}  // namespace paddle
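Editor's note: for orientation, the scalar semantics that the NEON kernels in this file vectorize can be summarized in a few lines. This is a minimal reference sketch added for this write-up, not part of the commit; the ref_* names are hypothetical.

#include <cmath>

// Scalar reference for the activations implemented above (sketch only).
inline float ref_relu(float x) { return x > 0.f ? x : 0.f; }
inline float ref_relu_neg(float x, float slope) {
  return x > 0.f ? x : x * slope;
}
inline float ref_clipped_relu(float x, float coef) {
  float y = x > 0.f ? x : 0.f;
  return y < coef ? y : coef;
}
inline float ref_sigmoid(float x) { return 1.f / (1.f + std::exp(-x)); }
inline float ref_tanh(float x) {
  return (std::exp(x) - std::exp(-x)) / (std::exp(x) + std::exp(-x));
}
inline float ref_swish(float x, float beta) {
  return x / (1.f + std::exp(-beta * x));
}
// act_prelu applies ref_relu_neg per channel, taking the slope from
// channel_slope[0] when channel_shared is true, else channel_slope[c].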
paddle/fluid/lite/arm/math/activation.h
0 → 100644
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once

namespace paddle {
namespace lite {
namespace arm {
namespace math {

template <typename T>
void act_relu(const T* din, T* dout, int size, int threads);

template <typename T>
void act_relu_neg(const T* din, T* dout, int size, const float negative_slope,
                  int threads);

template <typename T>
void act_clipped_relu(const T* din, T* dout, int size, const float coef,
                      int threads);

template <typename T>
void act_prelu(const T* din, T* dout, int outer_size, int channel_size,
               int inner_size, bool channel_shared, float* channel_slope,
               int threads);

template <typename T>
void act_sigmoid(const T* din, T* dout, int size, int threads);

template <typename T>
void act_tanh(const T* din, T* dout, int size, int threads);

template <typename T>
void act_swish(const T* din, T* dout, int size, const float coef, int threads);

}  // namespace math
}  // namespace arm
}  // namespace lite
}  // namespace paddle
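Editor's note: a minimal call-site sketch for this interface, added for illustration only; the buffer size and thread count are arbitrary assumptions, not part of the commit.

#include <vector>
#include "paddle/fluid/lite/arm/math/activation.h"

void relu_example() {
  std::vector<float> in(1024, -0.5f);  // hypothetical input buffer
  std::vector<float> out(1024);
  // One NEON-accelerated ReLU pass over 1024 floats split across 4 threads.
  paddle::lite::arm::math::act_relu<float>(in.data(), out.data(),
                                           static_cast<int>(in.size()), 4);
}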
paddle/fluid/lite/kernels/arm/CMakeLists.txt
@@ -5,7 +5,7 @@ endif()
message(STATUS "compile with lite ARM kernels")
cc_library(fc_compute_arm SRCS fc_compute.cc DEPS ${lite_kernel_deps} math_arm)
-cc_library(relu_compute_arm SRCS relu_compute.cc DEPS ${lite_kernel_deps})
+cc_library(activation_compute_arm SRCS activation_compute.cc DEPS ${lite_kernel_deps} math_arm)
cc_library(mul_compute_arm SRCS mul_compute.cc DEPS ${lite_kernel_deps} math_arm)
cc_library(scale_compute_arm SRCS scale_compute.cc DEPS ${lite_kernel_deps} math_arm)
cc_library(softmax_compute_arm SRCS softmax_compute.cc DEPS ${lite_kernel_deps} math_arm)
@@ -16,6 +16,7 @@ cc_library(pool_compute_arm SRCS pool_compute.cc DEPS ${lite_kernel_deps} math_arm)
cc_library(split_compute_arm SRCS split_compute.cc DEPS ${lite_kernel_deps} math_arm)
lite_cc_test(test_fc_compute_arm SRCS fc_compute_test.cc DEPS fc_compute_arm math_arm)
+lite_cc_test(test_activation_compute_arm SRCS activation_compute_test.cc DEPS activation_compute_arm)
lite_cc_test(test_scale_compute_arm SRCS scale_compute_test.cc DEPS scale_compute_arm)
lite_cc_test(test_softmax_compute_arm SRCS softmax_compute_test.cc DEPS softmax_compute_arm)
lite_cc_test(test_conv_compute_arm SRCS conv_compute_test.cc DEPS conv_compute_arm)
@@ -27,7 +28,7 @@ lite_cc_test(test_split_compute_arm SRCS split_compute_test.cc DEPS split_compute_arm)
set(arm_kernels
    fc_compute_arm
-   relu_compute_arm
+   activation_compute_arm
    mul_compute_arm
    scale_compute_arm
    softmax_compute_arm
paddle/fluid/lite/kernels/arm/relu_compute.h → paddle/fluid/lite/kernels/arm/activation_compute.cc
@@ -12,31 +12,23 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#pragma once
-#include <algorithm>
-#include "paddle/fluid/lite/core/kernel.h"
-#include "paddle/fluid/lite/core/op_registry.h"
+#include "paddle/fluid/lite/kernels/arm/activation_compute.h"
+#include "paddle/fluid/lite/arm/math/funcs.h"

namespace paddle {
namespace lite {
namespace kernels {
namespace arm {

-class ReluCompute : public KernelLite<TARGET(kARM), PRECISION(kFloat)> {
- public:
-  void Run() override {
-    auto& param = Param<operators::ReluParam>();
-    auto n = param.input->dims().production();
-    const float* input = param.input->data<float>();
-    float* output = param.output->mutable_data<float>();
-    for (int i = 0; i < n; i++) {
-      output[i] = std::max(0.f, input[i]);
-    }
-  }
-
-  TargetType target() const override { return TARGET(kARM); }
-  PrecisionType precision() const override { return PRECISION(kFloat); }
-};
+void ReluCompute::Run() {
+  auto& param = this->Param<param_t>();
+  auto& ctx = this->ctx_->template As<ARMContext>();
+  auto x_dims = param.X->dims();
+  auto x_data = param.X->data<float>();
+  auto output_data = param.Out->mutable_data<float>();
+  lite::arm::math::act_relu<float>(x_data, output_data, x_dims.production(),
+                                   ctx.threads());
+}

}  // namespace arm
}  // namespace kernels
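Editor's note: the collapsed region of this diff presumably carries the kernel registration that the test file's USE_LITE_KERNEL(relu, kARM, kFloat, kNCHW, def) relies on. As a hedged sketch only, a Paddle-Lite registration of this kernel would look roughly like the following; the exact input/output bindings are not visible in the diff.

// Sketch only: registration inferred from USE_LITE_KERNEL in the test,
// not shown in this diff.
REGISTER_LITE_KERNEL(relu, kARM, kFloat, kNCHW,
                     paddle::lite::kernels::arm::ReluCompute, def)
    .Finalize();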
paddle/fluid/lite/kernels/arm/relu_compute.cc → paddle/fluid/lite/kernels/arm/activation_compute.h
...
...
@@ -12,4 +12,26 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/lite/kernels/arm/relu_compute.h"
#pragma once
#include <algorithm>
#include "paddle/fluid/lite/core/kernel.h"
#include "paddle/fluid/lite/core/op_registry.h"
namespace
paddle
{
namespace
lite
{
namespace
kernels
{
namespace
arm
{
class
ReluCompute
:
public
KernelLite
<
TARGET
(
kARM
),
PRECISION
(
kFloat
)
>
{
public:
using
param_t
=
operators
::
ActivationParam
;
void
Run
()
override
;
virtual
~
ReluCompute
()
=
default
;
};
}
// namespace arm
}
// namespace kernels
}
// namespace lite
}
// namespace paddle
paddle/fluid/lite/kernels/arm/activation_compute_test.cc
0 → 100644
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/lite/kernels/arm/activation_compute.h"
#include <gtest/gtest.h>
#include <memory>
#include <utility>
#include <vector>
#include "paddle/fluid/lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace arm {

template <typename dtype>
void activation_compute_ref(const operators::ActivationParam& param) {
  auto x_data = param.X->data<dtype>();
  auto output_data = param.Out->mutable_data<dtype>();
  DDim x_dims = param.X->dims();
  DDim output_dims = param.Out->dims();
  ASSERT_EQ(x_dims.data(), output_dims.data());
  for (int i = 0; i < output_dims.production(); i++) {
    output_data[i] = std::max(0.f, x_data[i]);
  }
}

TEST(activation_arm, retrive_op) {
  auto activation =
      KernelRegistry::Global().Create<TARGET(kARM), PRECISION(kFloat)>("relu");
  ASSERT_FALSE(activation.empty());
  ASSERT_TRUE(activation.front());
}

TEST(activation_arm, init) {
  ReluCompute activation;
  ASSERT_EQ(activation.precision(), PRECISION(kFloat));
  ASSERT_EQ(activation.target(), TARGET(kARM));
}

TEST(activation_arm, compute) {
  DeviceInfo::Init();
  for (auto n : {1, 2}) {
    for (auto c : {6, 32 /*, 128*/}) {
      for (auto h : {9, 18 /*, 56, 112, 224, 512*/}) {
        for (auto w : {9, 18 /*, 56, 112, 224, 512*/}) {
          Tensor x;
          Tensor output;
          Tensor output_ref;
          // set the dims of input, output, ref output tensors
          x.Resize({n, c, h, w});
          output.Resize({n, c, h, w});
          output_ref.Resize({n, c, h, w});
          // initialize the data of input tensors
          auto* x_data = x.mutable_data<float>();
          auto* output_data = output.mutable_data<float>();
          for (int i = 0; i < x.dims().production(); i++) {
            float sign = i % 3 == 0 ? -1.0f : 1.0f;
            x_data[i] = sign * static_cast<float>(i % 128) * 0.013f;
          }
          // prepare kernel params and run
          ReluCompute activation;
          std::unique_ptr<KernelContext> ctx(new KernelContext);
          ctx->As<ARMContext>();
          activation.SetContext(std::move(ctx));
          operators::ActivationParam param;
          param.X = &x;
          param.Out = &output;
          activation.SetParam(param);
          activation.Launch();
          // invoking ref implementation and compare results
          param.Out = &output_ref;
          activation_compute_ref<float>(param);
          auto* output_ref_data = output_ref.mutable_data<float>();
          for (int i = 0; i < output.dims().production(); i++) {
            EXPECT_NEAR(output_data[i], output_ref_data[i], 1e-5);
          }
        }
      }
    }
  }
}

}  // namespace arm
}  // namespace kernels
}  // namespace lite
}  // namespace paddle

USE_LITE_KERNEL(relu, kARM, kFloat, kNCHW, def);
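Editor's note: assuming the lite_cc_test rule above yields a standalone gtest binary named test_activation_compute_arm, the new cases can be run selectively with gtest's standard filter, e.g. ./test_activation_compute_arm --gtest_filter='activation_arm.*'.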