PaddlePaddle / Paddle-Lite · commit c0af965c

[arm]add gemm + relu6/leakyrelu fusion (#2674)

add gemm + relu6/leakyrelu fusion

Authored by HappyAngel on Jan 14, 2020; committed by xiaogang on Jan 14, 2020.
Parent commit: 7a8118b0

Showing 19 changed files with 405 additions and 694 deletions (+405, -694).
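For context, "gemm + relu6/leakyrelu fusion" means the activation is applied in the same pass that produces the GEMM output, instead of running a separate activation kernel afterwards. The following scalar sketch is illustrative only (it is not code from this commit, and all names in it are hypothetical):

#include <algorithm>

// Hypothetical activation selector mirroring relu / relu6 / leaky_relu.
enum class Act { kNone, kRelu, kRelu6, kLeakyRelu };

// Reference (unoptimized) GEMM with the activation fused into the epilogue:
// C = act(A * B + bias), applied while C is written.
void sgemm_act_ref(const float* A, const float* B, const float* bias,
                   float* C, int M, int N, int K, Act act,
                   float six = 6.f, float alpha = 0.1f) {
  for (int m = 0; m < M; ++m) {
    for (int n = 0; n < N; ++n) {
      float acc = bias ? bias[m] : 0.f;
      for (int k = 0; k < K; ++k) acc += A[m * K + k] * B[k * N + n];
      switch (act) {
        case Act::kRelu:      acc = std::max(acc, 0.f); break;
        case Act::kRelu6:     acc = std::min(std::max(acc, 0.f), six); break;
        case Act::kLeakyRelu: acc = acc >= 0.f ? acc : acc * alpha; break;
        default: break;
      }
      C[m * N + n] = acc;
    }
  }
}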
Changed files (19):

lite/backends/arm/math/conv3x3s1p01_depthwise_fp32.cc    +55  -55
lite/backends/arm/math/conv_block_utils.h                +3   -1
lite/backends/arm/math/conv_impl.cc                      +6   -2
lite/backends/arm/math/conv_winograd_3x3.cc              +3   -1
lite/backends/arm/math/fill_bias_relu.cc                 +235 -1
lite/backends/arm/math/fill_bias_relu.h                  +16  -1
lite/backends/arm/math/gru_utils.h                       +4   -2
lite/backends/arm/math/packed_sgemm.cc                   +42  -79
lite/backends/arm/math/packed_sgemm.h                    +2   -1
lite/backends/arm/math/sgemm.cc                          +2   -2
lite/backends/arm/math/sgemm.h                           +1   -1
lite/kernels/arm/conv_transpose_compute.cc               +5   -2
lite/kernels/arm/fc_compute.cc                           +3   -1
lite/kernels/arm/matmul_compute.cc                       +7   -5
lite/kernels/arm/mul_compute.cc                          +3   -1
lite/tests/kernels/CMakeLists.txt                        +0   -2
lite/tests/kernels/conv2d_transpose_compute_test.cc      +0   -535
lite/tests/math/conv_transpose_compute_test.cc           +8   -0
lite/tests/math/sgemm_compute_test.cc                    +10  -2
lite/backends/arm/math/conv3x3s1p01_depthwise_fp32.cc
@@ -924,58 +924,58 @@ void conv_depthwise_3x3s1_fp32(const float *din,
  "st1 {v15.4s}, [%[doutr3]], #16     \n"

The diff removes the old definition of RIGHT_RESULT_S1_RELU6 and re-adds it; the functional change is that the final r3 multiply-accumulate now uses the weight lane %[w2].s[0] instead of v20.s[0], and the bif of v12 moved to its own line. The new definition:

#define RIGHT_RESULT_S1_RELU6                                                \
  "fmax v12.4s, v12.4s, v20.4s        \n" /* relu */                        \
  "fmla v15.4s, v16.4s, %[w0].s[1]    \n" /* outr00 += din0_1234 * w0[1] */ \
  "fmla v14.4s, v16.4s, %[w1].s[1]    \n" /* outr00 += din0_1234 * w0[1] */ \
  "fmla v13.4s, v16.4s, %[w2].s[1]    \n" /* outr00 += din0_1234 * w0[1] */ \
  "fmin v12.4s, v12.4s, %[vsix].4s    \n" /* relu6 */                       \
  "fmla v15.4s, v17.4s, %[w0].s[2]    \n" /* outr00 += din0_2345 * w0[2] */ \
  "fmla v14.4s, v17.4s, %[w1].s[2]    \n" /* outr00 += din0_2345 * w0[2] */ \
  "fmla v13.4s, v17.4s, %[w2].s[2]    \n" /* outr00 += din0_2345 * w0[2] */ \
  "ext  v16.16b, v8.16b, v9.16b, #4   \n" /* v16 = 1234 */                  \
  "ext  v17.16b, v8.16b, v9.16b, #8   \n" /* v16 = 2345 */ /* r3 */         \
  "bif  v12.16b, v22.16b, v18.16b     \n"                                   \
  "fmla v15.4s, v8.4s, %[w1].s[0]     \n" /* outr00 += din0_0123 * w0[0] */ \
  "fmla v14.4s, v8.4s, %[w2].s[0]     \n" /* outr00 += din0_0123 * w0[0] */ \
  "fmax v13.4s, v13.4s, v20.4s        \n" /* relu */                        \
  "fmla v15.4s, v16.4s, %[w1].s[1]    \n" /* outr00 += din0_1234 * w0[1] */ \
  "fmla v14.4s, v16.4s, %[w2].s[1]    \n" /* outr00 += din0_1234 * w0[1] */ \
  "st1  {v12.4s}, [%[doutr0]], #16    \n"                                   \
  "fmin v13.4s, v13.4s, %[vsix].4s    \n" /* relu6 */                       \
  "fmla v15.4s, v17.4s, %[w1].s[2]    \n" /* outr00 += din0_2345 * w0[2] */ \
  "fmla v14.4s, v17.4s, %[w2].s[2]    \n" /* outr00 += din0_2345 * w0[2] */ \
  "ext  v16.16b, v10.16b, v11.16b, #4 \n" /* v16 = 1234 */                  \
  "ext  v17.16b, v10.16b, v11.16b, #8 \n" /* v16 = 2345 */                  \
  "bif  v13.16b, v23.16b, v18.16b     \n"                                   \
  "fmla v15.4s, v10.4s, %[w2].s[0]    \n" /* outr00 += din0_0123 * w0[0] */ \
  "fmax v14.4s, v14.4s, v20.4s        \n" /* relu */                        \
  "st1  {v13.4s}, [%[doutr1]], #16    \n" /* r3 */                          \
  "fmla v15.4s, v16.4s, %[w2].s[1]    \n" /* outr00 += din0_1234 * w0[1] */ \
  "fmin v14.4s, v14.4s, %[vsix].4s    \n" /* relu6 */                       \
  "fmla v15.4s, v17.4s, %[w2].s[2]    \n" /* outr00 += din0_2345 * w0[2] */ \
  "bif  v14.16b, v24.16b, v18.16b     \n"                                   \
  "fmax v15.4s, v15.4s, v20.4s        \n" /* relu */                        \
  "st1  {v14.4s}, [%[doutr2]], #16    \n"                                   \
  "fmin v15.4s, v15.4s, %[vsix].4s    \n" /* relu6 */                       \
  "bif  v15.16b, v25.16b, v18.16b     \n"                                   \
  "st1  {v15.4s}, [%[doutr3]], #16    \n"
#define RIGHT_RESULT_S1_LEAKY_RELU \
@@ -1586,7 +1586,7 @@ void conv_depthwise_3x3s1_fp32(const float *din,
  /* r3 */                                                        \
  "vmla.f32 q5, q14, %e[wr2][0] @ q4 += 0123 * wr0[0]\n"          \
- "vld1.32 {d28-d29}, [%[six_ptr]]! @ load din r0\n"              \
+ "vld1.32 {d28-d29}, [%[six_ptr]] @ load din r0\n"               \
  "vmax.f32 q4, q4, %q[vzero] @ relu \n"                          \
  "vmla.f32 q5, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n"           \

@@ -1617,7 +1617,7 @@ void conv_depthwise_3x3s1_fp32(const float *din,
  /* r3 */                                                        \
  "vmla.f32 q5, q14, %e[wr2][0] @ q4 += 0123 * wr0[0]\n"          \
- "vld1.32 {d28-d29}, [%[scale_ptr]]! @ load din r0\n"            \
+ "vld1.32 {d28-d29}, [%[scale_ptr]] @ load din r0\n"             \
  "vmla.f32 q5, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n"           \

@@ -1694,7 +1694,7 @@ void conv_depthwise_3x3s1_fp32(const float *din,
  /* r3 */                                                        \
  "vmla.f32 q5, q14, %e[wr2][0] @ q4 += 0123 * wr0[0]\n"          \
- "vld1.32 {d28-d29}, [%[scale_ptr]]! @ load din r0\n"            \
+ "vld1.32 {d28-d29}, [%[scale_ptr]] @ load din r0\n"             \
  "vmla.f32 q5, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n"           \
lite/backends/arm/math/conv_block_utils.h
@@ -2237,7 +2237,7 @@ inline void act_switch_process(float* src,
   int cnt = size >> 4;
   int remain = size % 16;
   float32x4_t vzero = vdupq_n_f32(0.f);
-  if (act_param != nullptr && act_param->has_active) {
+  if (act_param != nullptr) {
     float32x4_t vsix = vdupq_n_f32(act_param->Relu_clipped_coef);
     float32x4_t vscale = vdupq_n_f32(act_param->Leaky_relu_alpha);
     if (cnt > 0) {

@@ -2327,6 +2327,7 @@ inline void act_switch_process(float* src,
           src++;
+          dst++;
         }
         break;
       case lite_api::ActivationType::kRelu6:
         for (int i = 0; i < remain; i++) {
           float tmp = *src >= 0.f ? *src : 0.f;

@@ -2336,6 +2337,7 @@ inline void act_switch_process(float* src,
           src++;
+          dst++;
         }
         break;
       case lite_api::ActivationType::kLeakyRelu:
         for (int i = 0; i < remain; i++) {
           if (*src >= 0.f) {
lite/backends/arm/math/conv_impl.cc
@@ -180,6 +180,8 @@ void conv1x1s1_gemm(const float* i_data,
   bool flag_relu = param.fuse_relu;
   bool flag_bias = param.bias != nullptr;
+  auto act_param = param.activation_param;
   int hblock = get_hblock(ctx);
   int m_roundup = hblock * ((m + hblock - 1) / hblock);
   int weights_size_per_group = m * k;

@@ -223,7 +225,7 @@ void conv1x1s1_gemm(const float* i_data,
                           n,
                           bias_group,
                           flag_bias,
-                          flag_relu,
+                          act_param,
                           ctx);
     }
   }

@@ -361,6 +363,8 @@ void conv_im2col_gemm(const float* i_data,
   int hblock = get_hblock(ctx);
   int m_roundup = hblock * ((m + hblock - 1) / hblock);
   int weights_size_per_group = m * k;
+  auto act_param = param.activation_param;
   if (n > 1) {
     weights_size_per_group = ((m_roundup * k + 15) / 16) * 16;
   }

@@ -422,7 +426,7 @@ void conv_im2col_gemm(const float* i_data,
                           n,
                           bias_group,
                           flag_bias,
-                          flag_relu,
+                          act_param,
                           ctx);
     }
   }
lite/backends/arm/math/conv_winograd_3x3.cc
@@ -44,6 +44,8 @@ void conv_winograd3x3(const float* din,
   int size_out_channel = wout * hout;
   bool flag_relu = param.fuse_relu;
   bool flag_bias = param.bias != nullptr;
+  auto act_param = param.activation_param;
+  act_param.has_active = false;
   //! transform input
   int tile_w = (wout + 5) / 6;

@@ -127,7 +129,7 @@ void conv_winograd3x3(const float* din,
                         size_tile,
                         nullptr,
                         false,
-                        false,
+                        act_param,
                         ctx);
   }
lite/backends/arm/math/fill_bias_relu.cc
@@ -115,7 +115,241 @@ void fill_bias_relu<int>(int* tensor,
    }
  }
}

#ifdef __aarch64__
#define FILL_BIAS                                                     \
  "1:                                   \n"                           \
  "ld1 {v0.4s}, [%[din_ptr]], #16       \n" /*vld1q_f32(din_ptr0)*/   \
  "ld1 {v1.4s}, [%[din_ptr]], #16       \n" /*vld1q_f32(din_ptr0)*/   \
  "ld1 {v2.4s}, [%[din_ptr]], #16       \n" /*vld1q_f32(din_ptr0)*/   \
  "ld1 {v3.4s}, [%[din_ptr]], #16       \n" /*vld1q_f32(din_ptr0)*/   \
  "add v0.4s, v0.4s, %[vbias].4s        \n"                           \
  "add v1.4s, v1.4s, %[vbias].4s        \n"                           \
  "add v2.4s, v2.4s, %[vbias].4s        \n"                           \
  "add v3.4s, v3.4s, %[vbias].4s        \n"
#define FILL_RELU                                                     \
  "fmax v0.4s, v0.4s, %[vzero].4s       \n" /* vmaxq_f32() */         \
  "fmax v1.4s, v1.4s, %[vzero].4s       \n" /* vmaxq_f32() */         \
  "fmax v2.4s, v2.4s, %[vzero].4s       \n" /* vmaxq_f32() */         \
  "fmax v3.4s, v3.4s, %[vzero].4s       \n" /* vmaxq_f32() */
#define FILL_RELU6                                                    \
  "fmin v0.4s, v0.4s, %[vsix].4s        \n" /* vminq_f32() */         \
  "fmin v1.4s, v1.4s, %[vsix].4s        \n" /* vminq_f32() */         \
  "fmin v2.4s, v2.4s, %[vsix].4s        \n" /* vminq_f32() */         \
  "fmin v3.4s, v3.4s, %[vsix].4s        \n" /* vminq_f32() */
#define FILL_LEAKY_RELU                                               \
  "cmhs v4.4s, v0.4s, %[vzero].4s       \n" /* vcgeq_u32 */           \
  "fmul v5.4s, v0.4s, %[vscale].4s      \n" /* vmulq_f32 */           \
  "cmhs v6.4s, v1.4s, %[vzero].4s       \n" /* vcgeq_u32 */           \
  "fmul v7.4s, v1.4s, %[vscale].4s      \n" /* vmulq_f32 */           \
  "cmhs v8.4s, v2.4s, %[vzero].4s       \n" /* vcgeq_u32 */           \
  "fmul v9.4s, v2.4s, %[vscale].4s      \n" /* vmulq_f32 */           \
  "cmhs v10.4s, v3.4s, %[vzero].4s      \n" /* vcgeq_u32 */           \
  "fmul v11.4s, v3.4s, %[vscale].4s     \n" /* vmulq_f32 */           \
  "bif  v0.16b, v5.16b, v4.16b          \n" /* choose */              \
  "bif  v1.16b, v7.16b, v6.16b          \n" /* choose */              \
  "bif  v2.16b, v9.16b, v8.16b          \n" /* choose */              \
  "bif  v3.16b, v11.16b, v10.16b        \n" /* choose */
#define FILL_STORE                                                    \
  "subs %w[cnt], %w[cnt], #1            \n"                           \
  "st1 {v0.4s}, [%[dout_ptr]], #16      \n" /* vst1q_f32() */         \
  "st1 {v1.4s}, [%[dout_ptr]], #16      \n" /* vst1q_f32() */         \
  "st1 {v2.4s}, [%[dout_ptr]], #16      \n" /* vst1q_f32() */         \
  "st1 {v3.4s}, [%[dout_ptr]], #16      \n" /* vst1q_f32() */         \
  "bne 1b                               \n"
#else
#define FILL_BIAS \
"1: \n" \
"vld1.32 {d6-d7}, [%[din_ptr]]! @ vld1q_f32(din_ptr) \n" \
"vld1.32 {d8-d9}, [%[din_ptr]]! @ vld1q_f32(din_ptr) \n" \
"vld1.32 {d10-d11}, [%[din_ptr]]! @ vld1q_f32(din_ptr) \n" \
"vld1.32 {d12-d13}, [%[din_ptr]]! @ vld1q_f32(din_ptr) \n" \
"vadd.f32 q3, q3, %q[vbias] @ add \n" \
"vadd.f32 q4, q4, %q[vbias] @ add \n" \
"vadd.f32 q5, q5, %q[vbias] @ add \n" \
"vadd.f32 q6, q6, %q[vbias] @ add \n"
#define FILL_RELU \
"vmax.f32 q3, q3, %q[vzero] @ vmaxq_f32() \n" \
"vmax.f32 q4, q4, %q[vzero] @ vmaxq_f32() \n" \
"vmax.f32 q5, q5, %q[vzero] @ vmaxq_f32() \n" \
"vmax.f32 q6, q6, %q[vzero] @ vmaxq_f32() \n"
#define FILL_RELU6 \
"vmin.f32 q3, q3, %q[vsix] @ vminq_f32() \n" \
"vmin.f32 q4, q4, %q[vsix] @ vmaxq_f32() \n" \
"vmin.f32 q5, q5, %q[vsix] @ vmaxq_f32() \n" \
"vmin.f32 q6, q6, %q[vsix] @ vmaxq_f32() \n"
#define FILL_LEAKY_RELU \
"vcge.f32 q7, q3, %q[vzero] @ vcgeq_u32 \n" \
"vmul.f32 q8, q3, %q[vscale] @ vmulq_f32 \n" \
"vcge.f32 q9, q4, %q[vzero] @ vcgeq_u32 \n" \
"vmul.f32 q10, q4, %q[vscale] @ vmulq_f32 \n" \
"vcge.f32 q11, q5, %q[vzero] @ vcgeq_u32 \n" \
"vmul.f32 q12, q5, %q[vscale] @ vmulq_f32 \n" \
"vcge.f32 q13, q6, %q[vzero] @ vcgeq_u32 \n" \
"vmul.f32 q14, q6, %q[vscale] @ vmulq_f32 \n" \
"vbif q3, q8, q7 @ choose \n" \
"vbif q4, q10, q9 @ choose \n" \
"vbif q5, q12, q11 @ choose \n" \
"vbif q6, q14, q13 @ choose \n"
#define FILL_STORE \
"subs %[cnt], #1 \n" \
"vst1.32 {d6-d7}, [%[dout_ptr]]! @ vst1q_f32() \n" \
"vst1.32 {d8-d9}, [%[dout_ptr]]! @ vst1q_f32() \n" \
"vst1.32 {d10-d11}, [%[dout_ptr]]! @ vst1q_f32() \n" \
"vst1.32 {d12-d13}, [%[dout_ptr]]! @ vst1q_f32() \n" \
"bne 1b \n"
#endif
template <>
void fill_bias_act<float>(float* tensor,
                          const float* bias,
                          int channel,
                          int channel_size,
                          bool flag_bias,
                          const operators::ActivationParam* act_param) {
  float* data = tensor;
  int cnt = channel_size >> 4;
  int remain = channel_size % 16;
  float32x4_t vzero = vdupq_n_f32(0.f);
  if (act_param != nullptr && act_param->has_active) {
    float32x4_t vsix = vdupq_n_f32(act_param->Relu_clipped_coef);
    float32x4_t vscale = vdupq_n_f32(act_param->Leaky_relu_alpha);
    for (int j = 0; j < channel; j++) {
      float bias_data = flag_bias ? bias[j] : 0.f;
      float* src = data + j * channel_size;
      float* dst = data + j * channel_size;
      float32x4_t vbias = vdupq_n_f32(bias_data);
      if (cnt > 0) {
        switch (act_param->active_type) {
          case lite_api::ActivationType::kRelu:
#ifdef __aarch64__
            asm volatile(
                FILL_BIAS FILL_RELU FILL_STORE
                : [din_ptr] "+r"(src), [dout_ptr] "+r"(dst), [cnt] "+r"(cnt)
                : [vzero] "w"(vzero), [vbias] "w"(vbias)
                : "memory", "cc", "v0", "v1", "v2", "v3");
#else
            asm volatile(
                FILL_BIAS FILL_RELU FILL_STORE
                : [din_ptr] "+r"(src), [dout_ptr] "+r"(dst), [cnt] "+r"(cnt)
                : [vzero] "w"(vzero), [vbias] "w"(vbias)
                : "memory", "cc", "q3", "q4", "q5", "q6");
#endif
            break;
          case lite_api::ActivationType::kRelu6:
#ifdef __aarch64__
            asm volatile(
                FILL_BIAS FILL_RELU FILL_RELU6 FILL_STORE
                : [din_ptr] "+r"(src), [dout_ptr] "+r"(dst), [cnt] "+r"(cnt)
                : [vzero] "w"(vzero), [vsix] "w"(vsix), [vbias] "w"(vbias)
                : "memory", "cc", "v0", "v1", "v2", "v3");
#else
            asm volatile(
                FILL_BIAS FILL_RELU FILL_RELU6 FILL_STORE
                : [din_ptr] "+r"(src), [dout_ptr] "+r"(dst), [cnt] "+r"(cnt)
                : [vzero] "w"(vzero), [vsix] "w"(vsix), [vbias] "w"(vbias)
                : "memory", "cc", "q3", "q4", "q5", "q6");
#endif
            break;
          case lite_api::ActivationType::kLeakyRelu:
#ifdef __aarch64__
            asm volatile(
                FILL_BIAS FILL_LEAKY_RELU FILL_STORE
                : [din_ptr] "+r"(src), [dout_ptr] "+r"(dst), [cnt] "+r"(cnt)
                : [vzero] "w"(vzero), [vscale] "w"(vscale), [vbias] "w"(vbias)
                : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6",
                  "v7", "v8", "v9", "v10", "v11");
#else
            asm volatile(
                FILL_BIAS FILL_LEAKY_RELU FILL_STORE
                : [din_ptr] "+r"(src), [dout_ptr] "+r"(dst), [cnt] "+r"(cnt)
                : [vzero] "w"(vzero), [vscale] "w"(vscale), [vbias] "w"(vbias)
                : "memory", "cc", "q3", "q4", "q5", "q6", "q7", "q8", "q9",
                  "q10", "q11", "q12", "q13", "q14");
#endif
            break;
          default:
            LOG(FATAL) << "this act_type: "
                       << static_cast<int>(act_param->active_type)
                       << " fuse not support";
        }
      }
      // remain
      switch (act_param->active_type) {
        case lite_api::ActivationType::kRelu:
          for (int i = 0; i < remain; i++) {
            *dst = *src >= 0.f ? *src : 0.f;
            src++;
            dst++;
          }
        case lite_api::ActivationType::kRelu6:
          for (int i = 0; i < remain; i++) {
            float tmp = *src >= 0.f ? *src : 0.f;
            *dst = tmp <= act_param->Relu_clipped_coef
                       ? tmp
                       : act_param->Relu_clipped_coef;
            src++;
            dst++;
          }
        case lite_api::ActivationType::kLeakyRelu:
          for (int i = 0; i < remain; i++) {
            if (*src >= 0.f) {
              *dst = *src;
            } else {
              *dst = *src * act_param->Leaky_relu_alpha;
            }
            src++;
            dst++;
          }
          break;
        default:
          LOG(FATAL) << "this act_type: "
                     << static_cast<int>(act_param->active_type)
                     << " fuse not support";
      }
    }
  } else {
    for (int j = 0; j < channel; ++j) {
      float bias_data = flag_bias ? bias[j] : 0.f;
      float32x4_t vbias = vdupq_n_f32(bias_data);
      float* src = data + j * channel_size;
      float* dst = data + j * channel_size;
#ifdef __aarch64__
      asm volatile(FILL_BIAS FILL_STORE
                   : [din_ptr] "+r"(src), [dout_ptr] "+r"(dst), [cnt] "+r"(cnt)
                   : [vbias] "w"(vbias)
                   : "memory", "cc", "v0", "v1", "v2", "v3");
#else
      asm volatile(FILL_BIAS FILL_STORE
                   : [din_ptr] "+r"(src), [dout_ptr] "+r"(dst), [cnt] "+r"(cnt)
                   : [vbias] "w"(vbias)
                   : "memory", "cc", "q3", "q4", "q5", "q6");
#endif
    }
  }
}

}  // namespace math
}  // namespace arm
}  // namespace lite
lite/backends/arm/math/fill_bias_relu.h
@@ -37,7 +37,22 @@ void fill_bias_relu(Dtype* tensor,
                    int channel_size,
                    bool flag_bias,
                    bool flag_relu);

/**
 * \brief neon implementation to add bias and activation(relu, relu6,
 * leakyrelu)
 * @param tensor
 * @param bias
 * @param channel
 * @param channel_size
 */
template <typename Dtype>
void fill_bias_act(Dtype* tensor,
                   const Dtype* bias,
                   int channel,
                   int channel_size,
                   bool flag_bias,
                   const operators::ActivationParam* act_param);

}  // namespace math
}  // namespace arm
}  // namespace lite
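A minimal usage sketch of the new fill_bias_act entry point declared above. It is illustrative only: the wrapper function name, buffer sizes, and values are hypothetical, and it assumes operators::ActivationParam is visible through the included headers.

#include <vector>
#include "lite/backends/arm/math/fill_bias_relu.h"

// Hypothetical helper: add a per-channel bias and apply ReLU6 in one pass
// over a CHW float buffer, using the fused fill_bias_act<float> path.
void add_bias_relu6(float* out, const float* bias, int channel,
                    int channel_size) {
  paddle::lite::operators::ActivationParam act_param;
  act_param.has_active = true;
  act_param.active_type = paddle::lite_api::ActivationType::kRelu6;
  act_param.Relu_clipped_coef = 6.f;
  paddle::lite::arm::math::fill_bias_act<float>(
      out, bias, channel, channel_size, /*flag_bias=*/true, &act_param);
}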
lite/backends/arm/math/gru_utils.h
@@ -383,6 +383,8 @@ struct GRUUnitFunctor {
                     const lite_api::ActivationType active_gate,
                     bool origin_mode,
                     ARMContext* ctx) {
+    operators::ActivationParam act_param;
+    act_param.has_active = false;
     if (value.prev_out_value) {
       sgemm(false,
             false,

@@ -399,7 +401,7 @@ struct GRUUnitFunctor {
             frame_size * 3,
             nullptr,
             false,
-            false,
+            act_param,
             ctx);
     }
     gru_unit_reset_act(active_gate, value, frame_size, batch_size);

@@ -420,7 +422,7 @@ struct GRUUnitFunctor {
             frame_size * 3,
             nullptr,
             false,
-            false,
+            act_param,
             ctx);
     }
lite/backends/arm/math/packed_sgemm.cc
@@ -14,6 +14,7 @@
 #include "lite/backends/arm/math/packed_sgemm.h"
 #include <arm_neon.h>
+#include "lite/backends/arm/math/conv_block_utils.h"
 namespace paddle {
 namespace lite {

@@ -51,7 +52,7 @@ void sgemm_prepacked_8x12(bool is_transB,
                           int ldc,
                           const float* bias,
                           bool has_bias,
-                          bool has_relu,
+                          const operators::ActivationParam act_param,
                           ARMContext* ctx);
 void pack_m4(float* out,

@@ -83,7 +84,7 @@ void sgemm_prepacked_4x4(bool is_transB,
                          int ldc,
                          const float* bias,
                          bool has_bias,
-                         bool has_relu,
+                         const operators::ActivationParam act_param,
                          ARMContext* ctx);
 #else
 // for kA72

@@ -136,7 +137,7 @@ void sgemm_prepacked_6x8(bool is_transB,
                          int ldc,
                          const float* bias,
                          bool has_bias,
-                         bool has_relu,
+                         const operators::ActivationParam act_param,
                          ARMContext* ctx);
 // for kA73, 4x8
 void sgemm_prepacked_4x8(bool is_transB,

@@ -151,7 +152,7 @@ void sgemm_prepacked_4x8(bool is_transB,
                          int ldc,
                          const float* bias,
                          bool has_bias,
-                         bool has_relu,
+                         const operators::ActivationParam act_param,
                          ARMContext* ctx);
 #endif  // __aarch64__

@@ -249,7 +250,7 @@ void sgemm_prepack(bool is_transB,
                    int ldc,
                    const float* bias,
                    bool has_bias,
-                   bool has_relu,
+                   const operators::ActivationParam act_param,
                    ARMContext* ctx) {
 #ifdef __aarch64__
   if (M <= 4) {

@@ -265,7 +266,7 @@ void sgemm_prepack(bool is_transB,
                         ldc,
                         bias,
                         has_bias,
-                        has_relu,
+                        act_param,
                         ctx);
   } else {
     sgemm_prepacked_8x12(is_transB,

@@ -280,7 +281,7 @@ void sgemm_prepack(bool is_transB,
                          ldc,
                          bias,
                          has_bias,
-                         has_relu,
+                         act_param,
                          ctx);
   }
 #else  // armv7

@@ -297,7 +298,7 @@ void sgemm_prepack(bool is_transB,
                         ldc,
                         bias,
                         has_bias,
-                        has_relu,
+                        act_param,
                         ctx);
   } else {
     sgemm_prepacked_6x8(is_transB,

@@ -312,7 +313,7 @@ void sgemm_prepack(bool is_transB,
                         ldc,
                         bias,
                         has_bias,
-                        has_relu,
+                        act_param,
                         ctx);
   }
 #endif  // arm64

@@ -2283,7 +2284,7 @@ void sgemm_prepacked_8x12(bool is_transB,
                           int ldc,
                           const float* bias,
                           bool has_bias,
-                          bool has_relu,
+                          const operators::ActivationParam act_param,
                           ARMContext* ctx) {
   size_t l2_cache = ctx->llc_size() > 0 ? ctx->llc_size() : 512 * 1024;
   auto workspace = ctx->workspace_data<float>();

@@ -2837,33 +2838,6 @@ void sgemm_prepacked_8x12(bool is_transB,
       "fmla v28.4s, v4.4s, v1.s[2] \n" /* out22 = b2 * a10[0], b2 =q7*/
       "fmla v31.4s, v4.4s, v1.s[3] \n" /* out23 = b2 * a10[0], b2 =q7*/
       "11:                         \n" /* check if relu */
-      "cbz  %w[relu], 12f          \n" /* skip relu */
-      "movi v2.4s, #0              \n" /* for relu*/
-      "fmax v8.4s, v8.4s, v2.4s    \n" /* relu*/
-      "fmax v9.4s, v9.4s, v2.4s    \n" /* relu*/
-      "fmax v10.4s, v10.4s, v2.4s  \n" /* relu*/
-      "fmax v11.4s, v11.4s, v2.4s  \n" /* relu*/
-      "fmax v12.4s, v12.4s, v2.4s  \n" /* relu*/
-      "fmax v13.4s, v13.4s, v2.4s  \n" /* relu*/
-      "fmax v14.4s, v14.4s, v2.4s  \n" /* relu*/
-      "fmax v15.4s, v15.4s, v2.4s  \n" /* relu*/
-      "fmax v16.4s, v16.4s, v2.4s  \n" /* relu*/
-      "fmax v17.4s, v17.4s, v2.4s  \n" /* relu*/
-      "fmax v18.4s, v18.4s, v2.4s  \n" /* relu*/
-      "fmax v19.4s, v19.4s, v2.4s  \n" /* relu*/
-      "fmax v20.4s, v20.4s, v2.4s  \n" /* relu*/
-      "fmax v21.4s, v21.4s, v2.4s  \n" /* relu*/
-      "fmax v22.4s, v22.4s, v2.4s  \n" /* relu*/
-      "fmax v23.4s, v23.4s, v2.4s  \n" /* relu*/
-      "fmax v24.4s, v24.4s, v2.4s  \n" /* relu*/
-      "fmax v25.4s, v25.4s, v2.4s  \n" /* relu*/
-      "fmax v26.4s, v26.4s, v2.4s  \n" /* relu*/
-      "fmax v27.4s, v27.4s, v2.4s  \n" /* relu*/
-      "fmax v28.4s, v28.4s, v2.4s  \n" /* relu*/
-      "fmax v29.4s, v29.4s, v2.4s  \n" /* relu*/
-      "fmax v30.4s, v30.4s, v2.4s  \n" /* relu*/
-      "fmax v31.4s, v31.4s, v2.4s  \n" /* relu*/
-      "12:                         \n"
       "st1 {v8.4s, v9.4s, v10.4s},  [%[c_ptr0]], #48 \n" /* store r0 */
       "st1 {v11.4s, v12.4s, v13.4s},[%[c_ptr1]], #48 \n" /* store r1 */
       "st1 {v14.4s, v15.4s, v16.4s},[%[c_ptr2]], #48 \n" /* store r2 */

@@ -2886,7 +2860,6 @@ void sgemm_prepacked_8x12(bool is_transB,
         [c_ptr6] "+r"(c_ptr6),
         [c_ptr7] "+r"(c_ptr7)
       : [bias_ptr] "r"(bias_local),
-        [relu] "r"(has_relu),
         [has_beta] "r"(has_beta),
         [beta] "r"(beta)
       : "cc", "memory",

@@ -2911,6 +2884,13 @@ void sgemm_prepacked_8x12(bool is_transB,
       }
     }
   }
+  if (act_param.has_active) {
+#pragma omp parallel for num_threads(threads)
+    for (unsigned int x = 0; x < M; x++) {
+      float* dst = C + x * ldc;
+      act_switch_process(dst, dst, N, &act_param);
+    }
+  }
 }

 void sgemm_prepacked_4x4(bool is_transB,

@@ -2925,7 +2905,7 @@ void sgemm_prepacked_4x4(bool is_transB,
                          int ldc,
                          const float* bias,
                          bool has_bias,
-                         bool has_relu,
+                         const operators::ActivationParam act_param,
                          ARMContext* ctx) {
   size_t l2_cache = ctx->llc_size() > 0 ? ctx->llc_size() : 512 * 1024;
   auto workspace = ctx->workspace_data<float>();

@@ -3158,13 +3138,6 @@ void sgemm_prepacked_4x4(bool is_transB,
       "fmla v11.4s, v6.4s, v2.s[3] \n" /* out3 = b2 * a20[3], b1 =q6*/
       "11:                         \n" /* check if relu */
-      "cbz  %w[relu], 12f          \n" /* skip relu */
-      "movi v2.4s, #0              \n" /* for relu*/
-      "fmax v8.4s, v8.4s, v2.4s    \n" /* relu*/
-      "fmax v9.4s, v9.4s, v2.4s    \n" /* relu*/
-      "fmax v10.4s, v10.4s, v2.4s  \n" /* relu*/
-      "fmax v11.4s, v11.4s, v2.4s  \n" /* relu*/
-      "12:                         \n"
       "st1 {v8.4s},  [%[c_ptr0]], #16 \n" /* store r0 */
       "st1 {v9.4s},  [%[c_ptr1]], #16 \n" /* store r1 */
       "st1 {v10.4s}, [%[c_ptr2]], #16 \n" /* store r2 */

@@ -3179,7 +3152,6 @@ void sgemm_prepacked_4x4(bool is_transB,
         [c_ptr2] "+r"(c_ptr2),
         [c_ptr3] "+r"(c_ptr3)
       : [bias_ptr] "r"(bias_local),
-        [relu] "r"(has_relu),
         [has_beta] "r"(has_beta),
         [beta] "r"(beta)
       : "cc", "memory",

@@ -3197,6 +3169,13 @@ void sgemm_prepacked_4x4(bool is_transB,
       }
     }
   }
+  if (act_param.has_active) {
+#pragma omp parallel for num_threads(threads)
+    for (unsigned int x = 0; x < M; x++) {
+      float* dst = C + x * ldc;
+      act_switch_process(dst, dst, N, &act_param);
+    }
+  }
 }
 #else  // __aarch64__
 /**

@@ -3222,7 +3201,7 @@ void sgemm_prepacked_6x8(bool is_transB,
                          int ldc,
                          const float* bias,
                          bool has_bias,
-                         bool has_relu,
+                         const operators::ActivationParam act_param,
                          ARMContext* ctx) {
   size_t l2_cache = ctx->llc_size() > 0 ? ctx->llc_size() : 512 * 1024;
   auto* workspace = ctx->workspace_data<float>();

@@ -3601,22 +3580,6 @@ void sgemm_prepacked_6x8(bool is_transB,
       "vmla.f32 q13, q3, d0[0] @ out10 += b2 * a4 \n"
       "vmla.f32 q15, q3, d0[1] @ out11 += b2 * a5 \n"
-      "2: @ check relu \n"
-      "cmp %[relu], #0 @ check if has relu \n"
-      "ble 6f @ skip relu if relu <= 0 \n"
-      "vmov.u32 q0, #0 @ for relu \n"
-      "vmax.f32 q4, q4, q0 @ for relu \n"
-      "vmax.f32 q5, q5, q0 @ for relu \n"
-      "vmax.f32 q6, q6, q0 @ for relu \n"
-      "vmax.f32 q7, q7, q0 @ for relu \n"
-      "vmax.f32 q8, q8, q0 @ for relu \n"
-      "vmax.f32 q9, q9, q0 @ for relu \n"
-      "vmax.f32 q10, q10, q0 @ for relu \n"
-      "vmax.f32 q11, q11, q0 @ for relu \n"
-      "vmax.f32 q12, q12, q0 @ for relu \n"
-      "vmax.f32 q13, q13, q0 @ for relu \n"
-      "vmax.f32 q14, q14, q0 @ for relu \n"
-      "vmax.f32 q15, q15, q0 @ for relu \n"
       "6: @ store result \n"
       "vst1.32 {d8-d11},  [%[c_ptr0]]! @ store r0 \n"
       "vst1.32 {d12-d15}, [%[c_ptr1]]! @ store r1 \n"
       "vst1.32 {d16-d19}, [%[c_ptr2]]! @ store r2 \n"

@@ -3634,7 +3597,6 @@ void sgemm_prepacked_6x8(bool is_transB,
         [k] "+r"(k),
         [tails] "+r"(tails)
       : [bias_ptr] "r"(bias_local),
-        [relu] "r"(has_relu),
         [beta] "r"(beta)
       : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11",

@@ -3654,6 +3616,13 @@ void sgemm_prepacked_6x8(bool is_transB,
       }
     }
   }
+  if (act_param.has_active) {
+#pragma omp parallel for num_threads(threads)
+    for (unsigned int x = 0; x < M; x++) {
+      float* dst = C + x * ldc;
+      act_switch_process(dst, dst, N, &act_param);
+    }
+  }
 }

 void sgemm_prepacked_4x8(bool is_transB,

@@ -3668,7 +3637,7 @@ void sgemm_prepacked_4x8(bool is_transB,
                          int ldc,
                          const float* bias,
                          bool has_bias,
-                         bool has_relu,
+                         const operators::ActivationParam act_param,
                          ARMContext* ctx) {
   size_t l2_cache = ctx->llc_size() > 0 ? ctx->llc_size() : 512 * 1024;
   auto* workspace = ctx->workspace_data<float>();

@@ -3953,18 +3922,6 @@ void sgemm_prepacked_4x8(bool is_transB,
       /*aptr - 16*/
       "sub %[a_ptr], %[a_ptr], #16 @ tail-- \n"
-      "2: @ check relu \n"
-      "cmp %[relu], #0 @ check if has relu \n"
-      "ble 6f @ skip relu if relu <= 0 \n"
-      "vmov.u32 q0, #0 @ for relu \n"
-      "vmax.f32 q8, q8, q0 @ for relu \n"
-      "vmax.f32 q9, q9, q0 @ for relu \n"
-      "vmax.f32 q10, q10, q0 @ for relu \n"
-      "vmax.f32 q11, q11, q0 @ for relu \n"
-      "vmax.f32 q12, q12, q0 @ for relu \n"
-      "vmax.f32 q13, q13, q0 @ for relu \n"
-      "vmax.f32 q14, q14, q0 @ for relu \n"
-      "vmax.f32 q15, q15, q0 @ for relu \n"
       "6: @ store result \n"
       "vst1.32 {d16-d19}, [%[c_ptr0]]! @ store r0 \n"
       "vst1.32 {d20-d23}, [%[c_ptr1]]! @ store r1 \n"
       "vst1.32 {d24-d27}, [%[c_ptr2]]! @ store r2 \n"

@@ -3978,7 +3935,6 @@ void sgemm_prepacked_4x8(bool is_transB,
         [k] "+r"(k),
         [tails] "+r"(tails)
       : [bias_ptr] "r"(bias_local),
-        [relu] "r"(has_relu),
         [beta] "r"(beta)
       : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10",

@@ -3995,6 +3951,13 @@ void sgemm_prepacked_4x8(bool is_transB,
       }
     }
   }
+  if (act_param.has_active) {
+#pragma omp parallel for num_threads(threads)
+    for (unsigned int x = 0; x < M; x++) {
+      float* dst = C + x * ldc;
+      act_switch_process(dst, dst, N, &act_param);
+    }
+  }
 }
 #endif  // __aarch64__
lite/backends/arm/math/packed_sgemm.h
@@ -17,6 +17,7 @@
 #include <cmath>
 #include "lite/core/context.h"
 #include "lite/core/tensor.h"
+#include "lite/operators/op_params.h"
 namespace paddle {
 namespace lite {

@@ -74,7 +75,7 @@ void sgemm_prepack(bool is_transB,
                    int ldc,
                    const float* bias,
                    bool has_bias,
-                   bool has_relu,
+                   const operators::ActivationParam act_param,
                    ARMContext* ctx);

}  // namespace math
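The prepacked GEMM entry points now take an operators::ActivationParam instead of a bare has_relu flag. A hedged sketch of the call-site migration (the leading GEMM arguments are abbreviated, as in the hunks above; only the trailing ones shown in this diff are spelled out):

// Configure the fused activation once, then pass it where has_relu used to go.
paddle::lite::operators::ActivationParam act_param;
act_param.has_active = true;
act_param.active_type = paddle::lite_api::ActivationType::kLeakyRelu;
act_param.Leaky_relu_alpha = 0.1f;  // illustrative slope

// old: sgemm_prepack(..., ldc, bias, has_bias, /*has_relu=*/true, ctx);
// new: sgemm_prepack(..., ldc, bias, has_bias, act_param,         ctx);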
lite/backends/arm/math/sgemm.cc
@@ -34,7 +34,7 @@ void sgemm(bool is_transA,
            int ldc,
            const float* bias,
            bool is_bias,
-           bool is_relu,
+           const operators::ActivationParam act_param,
            ARMContext* ctx) {
   int hblock = get_hblock(ctx);
   int m_roundup = hblock * ((M + hblock - 1) / hblock);

@@ -56,7 +56,7 @@ void sgemm(bool is_transA,
                 ldc,
                 bias,
                 is_bias,
-                is_relu,
+                act_param,
                 ctx);
   TargetFree(TargetType::kARM, packed_A);
 }
lite/backends/arm/math/sgemm.h
@@ -39,7 +39,7 @@ void sgemm(bool is_transA,
            int ldc,
            const float* bias,
            bool is_bias,
-           bool is_relu,
+           const operators::ActivationParam act_param,
            ARMContext* ctx);

}  // namespace math
lite/kernels/arm/conv_transpose_compute.cc
@@ -103,6 +103,7 @@ void Conv2DTransposeCompute::Run() {
   auto din = param.x->data<float>();
   auto dout = param.output->mutable_data<float>();
   auto weights = param.filter->data<float>();
+  auto act_param = param.activation_param;
   for (int i = 0; i < num; i++) {
     const float* din_batch = din + i * chin * hin * win;
     float* dout_batch = dout + i * chout * hout * wout;

@@ -115,7 +116,9 @@ void Conv2DTransposeCompute::Run() {
       const float* din_group = din_batch + g * group_size_in;
       const float* weights_group = weights + g * group_size_weights;
       float* coldata_group = col_data + g * group_size_coldata;
+      if (flag_bias) {
+        act_param.has_active = false;
+      }
       lite::arm::math::sgemm_prepack(false,
                                      m,
                                      n,

@@ -128,7 +131,7 @@ void Conv2DTransposeCompute::Run() {
                                      n,
                                      nullptr,
                                      false,
-                                     fuse_relu && (!flag_bias),
+                                     act_param,
                                      &ctx);
     }
     if (!flag_1x1s1p1) {
lite/kernels/arm/fc_compute.cc
@@ -94,6 +94,8 @@ void FcCompute<PRECISION(kFloat), PRECISION(kFloat)>::Run() {
     b_data = bias_.data<float>();
   }
   if (flag_gemm_) {
+    operators::ActivationParam act_param;
+    act_param.has_active = false;
     lite::arm::math::sgemm(false,
                            false,
                            m_,

@@ -109,7 +111,7 @@ void FcCompute<PRECISION(kFloat), PRECISION(kFloat)>::Run() {
                            n_,
                            nullptr,
                            false,
-                           false,
+                           act_param,
                            &ctx);
     if (param.bias) {
       CHECK_EQ(param.bias->numel(), n_);
lite/kernels/arm/matmul_compute.cc
@@ -42,6 +42,9 @@ void MatMulCompute::Run() {
   float alpha = param.alpha;
   auto& ctx = this->ctx_->template As<ARMContext>();
+  operators::ActivationParam act_param;
+  act_param.has_active = false;
+
   if (x_dims.size() > 2 && y_dims.size() >= 2) {
     // x: [B, ..., M, K], y: [B, ..., K, N], out: [B, ..., M, N]
     // x: [B, M, K], y: [K, N], out: [B, M, N]

@@ -97,7 +100,6 @@ void MatMulCompute::Run() {
     if (x_transpose) {
       x_data_trans = static_cast<float*>(malloc(sizeof(float) * x_inner));
     }
     if (y_dims.size() > 2) {
       for (size_t i = 0; i < x_dims.count(0, x_dims.size() - 2); ++i) {
         lite::arm::math::sgemm(x_transpose,

@@ -115,7 +117,7 @@ void MatMulCompute::Run() {
                                ldc,
                                nullptr,
                                false,
-                               false,
+                               act_param,
                                &ctx);
       }
     } else {

@@ -135,7 +137,7 @@ void MatMulCompute::Run() {
                              ldc,
                              nullptr,
                              false,
-                             false,
+                             act_param,
                              &ctx);
     }
   }

@@ -200,7 +202,7 @@ void MatMulCompute::Run() {
                            ldc,
                            nullptr,
                            false,
-                           false,
+                           act_param,
                            &ctx);
   } else if (x_dims.size() > 2 && y_dims.size() == 1) {
     // x: [B, M, K], y: [K], out: [B, M]

@@ -254,7 +256,7 @@ void MatMulCompute::Run() {
                            ldc,
                            nullptr,
                            false,
-                           false,
+                           act_param,
                            &ctx);
   }
 }
lite/kernels/arm/mul_compute.cc
@@ -67,6 +67,8 @@ void MulCompute::Run() {
     if (is_tranposed_y) {
       ldb = k_;
     }
+    operators::ActivationParam act_param;
+    act_param.has_active = false;
     lite::arm::math::sgemm_prepack(is_tranposed_y,
                                    m_,
                                    n_,

@@ -79,7 +81,7 @@ void MulCompute::Run() {
                                    n_,
                                    nullptr,
                                    false,
-                                   false,
+                                   act_param,
                                    &ctx);
   }
 }
lite/tests/kernels/CMakeLists.txt
@@ -11,8 +11,6 @@ if((NOT LITE_WITH_OPENCL AND NOT LITE_WITH_FPGA AND NOT LITE_WITH_BM) AND (LITE_
   lite_cc_test(test_kernel_activation_compute SRCS activation_compute_test.cc DEPS arena_framework ${npu_kernels} ${xpu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
   lite_cc_test(test_kernel_argmax_compute SRCS argmax_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
   lite_cc_test(test_kernel_axpy_compute SRCS axpy_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
   lite_cc_test(test_kernel_conv_compute SRCS conv_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
-  lite_cc_test(test_kernel_conv2d_transpose_compute SRCS conv2d_transpose_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
   lite_cc_test(test_kernel_norm_compute SRCS norm_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
   lite_cc_test(test_kernel_cast_compute SRCS cast_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
   lite_cc_test(test_kernel_instance_norm_compute SRCS instance_norm_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
lite/tests/kernels/conv2d_transpose_compute_test.cc
deleted (100644 → 0)
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <gtest/gtest.h>
#include "lite/api/paddle_use_kernels.h"
#include "lite/api/paddle_use_ops.h"
#include "lite/core/arena/framework.h"

namespace paddle {
namespace lite {

inline bool is_a_ge_zero_and_a_lt_b(int a, int b) {
  return static_cast<unsigned>(a) < static_cast<unsigned>(b);
}

template <typename Dtype>
void col2im(const Dtype* data_col, const int channels, const int height,
            const int width, const int kernel_h, const int kernel_w,
            const int pad_h0, const int pad_h1, const int pad_w0,
            const int pad_w1, const int stride_h, const int stride_w,
            const int dilation_h, const int dilation_w, Dtype* data_im) {
  memset(data_im, 0, height * width * channels * sizeof(float));
  const int output_h =
      (height + pad_h0 + pad_h1 - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1;
  const int output_w =
      (width + pad_w0 + pad_w1 - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1;
  const int channel_size = height * width;
  for (int channel = channels; channel--; data_im += channel_size) {
    for (int kernel_row = 0; kernel_row < kernel_h; kernel_row++) {
      for (int kernel_col = 0; kernel_col < kernel_w; kernel_col++) {
        int input_row = -pad_h0 + kernel_row * dilation_h;
        for (int output_rows = output_h; output_rows; output_rows--) {
          if (!is_a_ge_zero_and_a_lt_b(input_row, height)) {
            data_col += output_w;
          } else {
            int input_col = -pad_w0 + kernel_col * dilation_w;
            for (int output_col = output_w; output_col; output_col--) {
              if (is_a_ge_zero_and_a_lt_b(input_col, width)) {
                data_im[input_row * width + input_col] += *data_col;
              }
              data_col++;
              input_col += stride_w;
            }
          }
          input_row += stride_h;
        }
      }
    }
  }
}

template <typename Dtype>
void fill_bias_relu(Dtype* tensor, const Dtype* bias, int channel,
                    int channel_size, bool flag_bias, bool flag_relu);

template <>
void fill_bias_relu<float>(float* tensor, const float* bias, int channel,
                           int channel_size, bool flag_bias, bool flag_relu) {
  float* data = tensor;
  if (flag_relu) {
    for (int j = 0; j < channel; ++j) {
      float bias_data = flag_bias ? bias[j] : 0.f;
      for (int i = 0; i < channel_size; i++) {
        data[i] += bias_data;
        data[i] = data[i] > 0 ? data[i] : 0.f;
      }
      data += channel_size;
    }
  } else {
    for (int j = 0; j < channel; ++j) {
      float bias_data = flag_bias ? bias[j] : 0.f;
      for (int i = 0; i < channel_size; i++) {
        data[i] += bias_data;
      }
      data += channel_size;
    }
  }
}

inline void UpdatePaddingAndDilation(std::vector<int>* paddings,
                                     std::vector<int>* dilations,
                                     const std::vector<int>& strides,
                                     const std::string padding_algorithm,
                                     const DDim data_dims,
                                     const std::vector<int>& ksize) {
  // when padding_desc is "VALID" or "SAME"
  if (padding_algorithm == "SAME") {
    for (size_t i = 0; i < strides.size(); ++i) {
      int out_size = (data_dims[i + 2] + strides[i] - 1) / strides[i];
      int pad_sum = std::max(
          (out_size - 1) * strides[i] + ksize[i + 2] - data_dims[i + 2],
          (int64_t)0);
      int pad_0 = pad_sum / 2;
      int pad_1 = pad_sum - pad_0;
      // pad
      *(paddings->begin() + i * 2) = pad_0;
      *(paddings->begin() + i * 2 + 1) = pad_1;
      // dilation
      *(dilations->begin() + i) = 1;
    }
  } else if (padding_algorithm == "VALID") {
    for (auto& it : *paddings) {
      it = 0;
    }
  }
}

template <typename type, typename type2>
static void basic_gemm(int m, int n, int k, const type* a, const type* b,
                       const type2* bias, type2* c, type2 alpha, type2 beta,
                       bool trans_a = false, bool trans_b = false,
                       bool flag_bias = false, bool flag_relu = false) {
#pragma omp parallel for
  for (int i = 0; i < m; ++i) {
    type2 bias_data = (type2)0;
    if (flag_bias) {
      bias_data = bias[i];
    }
    for (int j = 0; j < n; ++j) {
      type2 sum = static_cast<type2>(0);
      for (int l = 0; l < k; ++l) {
        type av;
        type bv;
        if (trans_a) {
          av = a[l * m + i];
        } else {
          av = a[i * k + l];
        }
        if (trans_b) {
          bv = b[j * k + l];
        } else {
          bv = b[l * n + j];
        }
        sum += av * bv;
      }
      type2 tmp = alpha * sum + beta * c[i * n + j] + bias_data;
      if (flag_relu) {
        c[i * n + j] = tmp > (type2)0 ? tmp : (type2)0;
      } else {
        c[i * n + j] = tmp;
      }
    }
  }
}

//! for float, dtype1 and type2 is float
//! for int8, dytpe1 is char, dtype2 is int
template <typename Dtype1, typename Dtype2>
bool deconv_basic(const Dtype1* din, Dtype2* dout, int num, int chout,
                  int hout, int wout, int chin, int hin, int win,
                  const Dtype1* weights, const Dtype2* bias, int group,
                  int kernel_w, int kernel_h, int stride_w, int stride_h,
                  int dila_w, int dila_h, int pad_w0, int pad_w1, int pad_h0,
                  int pad_h1, bool flag_bias, bool flag_relu) {
  int m = chout * kernel_w * kernel_h / group;
  int n = hin * win;
  int k = chin / group;

  if (chin != chout || group != chin) {
    CHECK_OR_FALSE(chin % group == 0);
    CHECK_OR_FALSE(chout % group == 0);
  }

  lite::Tensor workspace_tensor;
  std::vector<int64_t> wt_shape = {1, 1, 1, group * m * n};
  workspace_tensor.Resize(wt_shape);
  auto* workspace_ptr = workspace_tensor.mutable_data<Dtype2>();

  int group_size_in = win * hin * chin / group;
  int group_size_coldata = m * n;
  int group_size_weights = chin * chout * kernel_w * kernel_h / (group * group);
  bool flag_1x1s1p1 = (kernel_w == 1) && (kernel_h == 1) && (stride_h == 1) &&
                      (stride_w == 1) && (pad_w0 == 0) && (pad_h0 == 0) &&
                      (pad_w1 == 0) && (pad_h1 == 0) && (dila_w == 1) &&
                      (dila_h == 1);

  for (int i = 0; i < num; ++i) {
    const Dtype1* din_batch = din + i * chin * hin * win;
    Dtype2* dout_batch = dout + i * chout * hout * wout;

    Dtype2* col_data = workspace_ptr;
    if (flag_1x1s1p1) {
      col_data = dout_batch;
    }
    memset(col_data, 0, sizeof(Dtype2) * group_size_coldata * group);
    for (int g = 0; g < group; ++g) {
      const Dtype1* din_group = din_batch + g * group_size_in;
      const Dtype1* weights_group = weights + g * group_size_weights;
      Dtype2* coldata_group = col_data + g * group_size_coldata;
      basic_gemm<Dtype1, Dtype2>(m, n, k, weights_group, din_group, nullptr,
                                 coldata_group, (Dtype2)1, (Dtype2)0, true,
                                 false, false, (!flag_bias && flag_relu));
    }
    if (!flag_1x1s1p1) {
      col2im(col_data, chout, hout, wout, kernel_h, kernel_w, pad_h0, pad_h1,
             pad_w0, pad_w1, stride_h, stride_w, dila_h, dila_w, dout_batch);
    }
    if (flag_bias) {
      fill_bias_relu(dout_batch, bias, chout, wout * hout, flag_bias, flag_relu);
    }
  }
  return true;
}

class Conv2DTransposeComputeTester : public arena::TestCase {
 protected:
  // common attributes for this op.
  std::string x_ = "x";
  std::string output_ = "out";
  std::string filter_ = "filter";
  std::string bias_ = "bias";
  std::string padding_algorithm_ = "";
  std::vector<int> strides_{1, 1};
  std::vector<int> paddings_{0, 0, 0, 0};
  int groups_{1};
  std::vector<int> dilations_{1, 1};
  bool flag_relu_{false};
  int n_ = 1;
  int ic_ = 1;
  int oc_ = 1;
  int ih_ = 9;
  int iw_ = 9;
  bool flag_bias_ = false;
  int ks_ = 1;

 public:
  Conv2DTransposeComputeTester(const Place& place, const std::string& alias,
                               int n, int ic, int oc, int ih, int iw,
                               bool flag_bias, bool flag_relu, int dilation,
                               int stride, int pad_h0, int pad_h1, int pad_w0,
                               int pad_w1, int ks, int groups,
                               std::string padding_algorithm)
      : TestCase(place, alias) {
    n_ = n;
    ic_ = ic;
    oc_ = oc;
    ih_ = ih;
    iw_ = iw;
    ks_ = ks;
    flag_bias_ = flag_bias;
    padding_algorithm_ = padding_algorithm;
    strides_ = std::vector<int>({stride, stride});
    paddings_ = std::vector<int>({pad_h0, pad_h1, pad_w0, pad_w1});
    dilations_ = std::vector<int>({dilation, dilation});
    groups_ = groups;
    flag_relu_ = flag_relu;
  }

  void RunBaseline(Scope* scope) override {
    auto* out = scope->NewTensor(output_);
    CHECK(out);
    auto* x = scope->FindTensor(x_);
    auto input_dim = x->dims();
    std::vector<int> ksize({1, 1, ks_, ks_});
    UpdatePaddingAndDilation(&paddings_, &dilations_, strides_,
                             padding_algorithm_, input_dim, ksize);
    int oh = (ih_ - 1) * strides_[0] - paddings_[0] - paddings_[1] +
             dilations_[0] * (ks_ - 1) + 1;
    int ow = (iw_ - 1) * strides_[1] - paddings_[2] - paddings_[3] +
             dilations_[1] * (ks_ - 1) + 1;
    CHECK(oh > 0 || ow > 0);

    std::vector<int64_t> output_shape = {n_, oc_, oh, ow};
    DDim output_dims(output_shape);
    out->Resize(output_dims);
    auto* output_data = out->mutable_data<float>();

    const auto* x_data = x->data<float>();
    auto* filter = scope->FindTensor(filter_);
    const auto* filter_data = filter->data<float>();
    const float* bias_data = nullptr;
    if (flag_bias_) {
      auto* bias = scope->FindTensor(bias_);
      bias_data = bias->data<float>();
    }
    deconv_basic<float, float>(x_data, output_data, n_, oc_, oh, ow, ic_, ih_,
                               iw_, filter_data, bias_data, groups_, ks_, ks_,
                               strides_[1], strides_[0], dilations_[1],
                               dilations_[0], paddings_[2], paddings_[3],
                               paddings_[0], paddings_[1], flag_bias_,
                               flag_relu_);
  }

  void PrepareOpDesc(cpp::OpDesc* op_desc) {
    op_desc->SetType("conv2d_transpose");
    op_desc->SetInput("Input", {x_});
    op_desc->SetInput("Filter", {filter_});
    op_desc->SetOutput("Output", {output_});
    op_desc->SetAttr("strides", strides_);
    op_desc->SetAttr("paddings", paddings_);
    op_desc->SetAttr("groups", groups_);
    op_desc->SetAttr("dilations", dilations_);
    if (flag_bias_) {
      op_desc->SetInput("Bias", {bias_});
    }
    op_desc->SetAttr("fuse_relu", flag_relu_);
    op_desc->SetAttr("padding_algorithm", padding_algorithm_);
  }

  void PrepareData() override {
    std::vector<int64_t> input_shape = {n_, ic_, ih_, iw_};
    std::vector<int64_t> filter_shape = {ic_, oc_ / groups_, ks_, ks_};
    std::vector<int64_t> bias_shape = {1, oc_, 1, 1};

    // x tensor
    DDim x_dims(input_shape);
    std::vector<float> x_data(x_dims.production());
    for (int i = 0; i < x_dims.production(); i++) {
      float sign = i % 3 == 0 ? -1.0f : 1.0f;
      x_data[i] = sign * static_cast<float>(i % 128) * 0.013f + 0.001;
    }
    SetCommonTensor(x_, x_dims, x_data.data());

    // filter tensor
    DDim filter_dims(filter_shape);
    std::vector<float> filter_data(filter_dims.production());
    for (int i = 0; i < filter_dims.production(); i++) {
      float sign = i % 3 == 0 ? -1.0f : 1.0f;
      filter_data[i] = sign * static_cast<float>(i % 128) * 0.01f + 0.001;
    }
    SetCommonTensor(filter_, filter_dims, filter_data.data());

    // bias tensor
    if (flag_bias_) {
      DDim bias_dims(bias_shape);
      std::vector<float> bias_data(bias_dims.production());
      for (int i = 0; i < bias_dims.production(); i++) {
        float sign = i % 3 == 0 ? -1.0f : 1.0f;
        bias_data[i] = sign * static_cast<float>(i % 128) * 0.01f + 0.001;
      }
      SetCommonTensor(bias_, bias_dims, bias_data.data());
    }
  }
};

TEST(conv2d_transpose, precision) {
  LOG(INFO) << "test conv2d_transpose op";
#ifdef LITE_WITH_ARM
  Place place(TARGET(kARM));
  for (auto n : {2}) {
    for (auto ic : {1, 4 /*, 128*/}) {
      for (auto oc : {1, 4 /*, 128*/}) {
        LOG(INFO) << "n:" << n << ",ic:" << ic << ",oc:" << oc;
        for (auto ih : {8, 8 /*, 56 , 112, 224, 512*/}) {
          for (auto iw : {8, 16 /*, 56, 112, 224, 512*/}) {
            for (auto flag_bias : {false, true}) {
              for (auto flag_relu : {false, true}) {
                for (auto dilation : {1, 2}) {
                  for (auto stride : {1, 2}) {
                    for (auto pad_h0 : {0, 1}) {
                      for (auto pad_h1 : {0, 1}) {
                        for (auto pad_w0 : {0, 1}) {
                          for (auto pad_w1 : {0, 1}) {
                            for (auto ks : {1, 4}) {
                              for (auto group : {1, 2}) {
                                for (auto padding_algorithm :
                                     {"", "SAME", "VALID"}) {
                                  // obtain shape
                                  if (ic % group != 0 || oc % group != 0) {
                                    group = 1;
                                  }
                                  std::unique_ptr<arena::TestCase> tester(
                                      new Conv2DTransposeComputeTester(
                                          place, "def", n, ic, oc, ih, iw,
                                          flag_bias, flag_relu, dilation,
                                          stride, pad_h0, pad_h1, pad_w0,
                                          pad_w1, ks, group,
                                          padding_algorithm));
                                  arena::Arena arena(std::move(tester), place,
                                                     2e-5);
                                  arena.TestPrecision();
                                }
                              }
                            }
                          }
                        }
                      }
                    }
                  }
                }
              }
            }
          }
        }
      }
    }
  }
#endif
}

}  // namespace lite
}  // namespace paddle
lite/tests/math/conv_transpose_compute_test.cc
@@ -59,6 +59,7 @@ DEFINE_bool(flag_bias, false, "with bias");
 typedef paddle::lite::DDim DDim;
 typedef paddle::lite::Tensor Tensor;
 typedef paddle::lite::operators::ConvParam ConvParam;
+typedef paddle::lite::operators::ActivationParam ActivationParam;
 using paddle::lite::profile::Timer;

 DDim compute_out_dim(const DDim& dim_in,

@@ -117,6 +118,13 @@ void test_conv_transpose_fp32(const std::vector<DDim>& input_dims,
     paddle::lite::fill_tensor_rand(*param.bias, -1.f, 1.f);
     // paddle::lite::fill_tensor_const(*param.bias, 1.f);
   }
+  if (flag_relu) {
+    ActivationParam act_param;
+    act_param.has_active = true;
+    act_param.active_type =
+        (paddle::lite_api::ActivationType)1;  // 2-relu6 4-leakyrelu
+    param.activation_param = act_param;
+  }
   Tensor tmp_weights;
   tmp_weights.Resize(weight_dim);
   tmp_weights.CopyDataFrom(*param.filter);
lite/tests/math/sgemm_compute_test.cc
@@ -22,9 +22,11 @@
 #include "lite/core/context.h"
 #include "lite/core/profile/timer.h"
 #include "lite/core/tensor.h"
+#include "lite/operators/op_params.h"
 #include "lite/tests/utils/tensor_utils.h"

 typedef paddle::lite::Tensor Tensor;
+typedef paddle::lite::operators::ActivationParam ActivationParam;
 using paddle::lite::profile::Timer;

 DEFINE_int32(power_mode,

@@ -136,6 +138,12 @@ bool test_sgemm(bool tra,
                    has_relu);
   }
   Timer t0;
+  ActivationParam act_param;
+  if (has_relu) {
+    act_param.has_active = true;
+    act_param.active_type =
+        (paddle::lite_api::ActivationType)1;  // 2-relu6 4-leakyrelu
+  }
 #ifdef LITE_WITH_ARM
   //! compute
   double ops = 2.0 * m * n * k;

@@ -163,7 +171,7 @@ bool test_sgemm(bool tra,
                 ldc,
                 dbias,
                 has_bias,
-                has_relu,
+                act_param,
                 &ctx);
   }

@@ -184,7 +192,7 @@ bool test_sgemm(bool tra,
                 ldc,
                 dbias,
                 has_bias,
-                has_relu,
+                act_param,
                 &ctx);
     t0.Stop();
   }