Commit dea6154b — PaddlePaddle / Paddle-Lite
Author: Liu Yiqun
Date: Feb 05, 2020

    Merge branch 'develop' into step_rnn/opt_ddim_lite

Parents: 794b01ec, 928e2a24
35 changed files, 2526 additions and 10425 deletions (+2526 −10425)
Changed files (35):

  lite/api/benchmark.cc                                                  +20    −8
  lite/backends/arm/math/conv3x3s1p01_depthwise_fp32.cc                  +31    −31
  lite/backends/arm/math/conv3x3s1px_depthwise_fp32.cc                   +8     −8
  lite/backends/arm/math/conv3x3s2_direct_int8.cc                        +2     −2
  lite/backends/arm/math/conv3x3s2p01_depthwise_fp32.cc                  +81    −81
  lite/backends/arm/math/conv3x3s2px_depthwise_fp32.cc                   +4     −4
  lite/backends/arm/math/conv5x5s1_depthwise_fp32.cc                     +713   −9565
  lite/backends/arm/math/conv5x5s1_depthwise_int8.cc                     +0     −1
  lite/backends/arm/math/conv5x5s2_depthwise_fp32.cc                     +18    −18
  lite/backends/arm/math/conv_block_utils.h                              +55    −55
  lite/backends/arm/math/conv_depthwise.h                                +11    −10
  lite/backends/arm/math/conv_impl.cc                                    +23    −16
  lite/backends/arm/math/fill_bias_relu.cc                               +13    −13
  lite/backends/arm/math/sgemv.cc                                        +584   −411
  lite/backends/arm/math/sgemv.h                                         +10    −7
  lite/backends/opencl/cl_kernel/cl_common.h                             +10    −2
  lite/backends/opencl/cl_kernel/image/depthwise_conv2d_basic_kernel.cl  +1     −3
  lite/backends/opencl/cl_kernel/image/relu6_kernel.cl                   +32    −0
  lite/kernels/arm/conv_compute.cc                                       +12    −14
  lite/kernels/arm/conv_depthwise.cc                                     +7     −3
  lite/kernels/arm/fc_compute.cc                                         +7     −4
  lite/kernels/arm/matmul_compute.cc                                     +11    −2
  lite/kernels/arm/mul_compute.cc                                        +11    −2
  lite/kernels/opencl/conv_compute.cc                                    +11    −5
  lite/kernels/opencl/conv_compute_test.cc                               +32    −8
  lite/kernels/opencl/depthwise_conv2d_compute.cc                        +13    −1
  lite/kernels/opencl/relu_compute.cc                                    +182   −1
  lite/kernels/opencl/relu_compute_test.cc                               +338   −8
  lite/tests/kernels/fc_compute_test.cc                                  +1     −1
  lite/tests/math/conv_compute_test.cc                                   +139   −83
  lite/tests/math/conv_int8_compute_test.cc                              +25    −24
  lite/tests/math/gemv_int8_compute_test.cc                              +1     −1
  lite/tests/math/sgemm_c4_compute_test.cc                               +1     −1
  lite/tests/math/sgemv_compute_test.cc                                  +83    −24
  lite/tests/utils/naive_math_impl.h                                     +36    −8
lite/api/benchmark.cc  (+20 −8)

@@ -30,7 +30,19 @@
 #include "lite/utils/cp_logging.h"
 #include "lite/utils/string.h"
-DEFINE_string(model_dir, "", "model dir");
+DEFINE_string(model_dir,
+              "",
+              "the path of the model, set model_dir when the model is no "
+              "combined formate. This option will be ignored if model_file "
+              "and param_file are exist.");
+DEFINE_string(model_file,
+              "",
+              "the path of model file, set model_file when the model is "
+              "combined formate.");
+DEFINE_string(param_file,
+              "",
+              "the path of param file, set param_file when the model is "
+              "combined formate.");
 DEFINE_string(input_shape,
               "1,3,224,224",
               "set input shapes according to the model, "

@@ -68,11 +80,12 @@ inline double GetCurrentUS() {
   return 1e+6 * time.tv_sec + time.tv_usec;
 }

-void OutputOptModel(const std::string& load_model_dir,
-                    const std::string& save_optimized_model_dir,
+void OutputOptModel(const std::string& save_optimized_model_dir,
                     const std::vector<std::vector<int64_t>>& input_shapes) {
   lite_api::CxxConfig config;
-  config.set_model_dir(load_model_dir);
+  config.set_model_dir(FLAGS_model_dir);
+  config.set_model_file(FLAGS_model_file);
+  config.set_param_file(FLAGS_param_file);
   std::vector<Place> vaild_places = {
       Place{TARGET(kARM), PRECISION(kFloat)},
   };

@@ -91,7 +104,7 @@ void OutputOptModel(const std::string& load_model_dir,
   }
   predictor->SaveOptimizedModel(save_optimized_model_dir,
                                 LiteModelType::kNaiveBuffer);
-  LOG(INFO) << "Load model from " << load_model_dir;
+  LOG(INFO) << "Load model from " << FLAGS_model_dir;
   LOG(INFO) << "Save optimized model to " << save_optimized_model_dir;
 }

@@ -146,7 +159,7 @@ void Run(const std::vector<std::vector<int64_t>>& input_shapes,
     LOG(FATAL) << "open result file failed";
   }
   ofs.precision(5);
-  ofs << std::setw(20) << std::fixed << std::left << model_name;
+  ofs << std::setw(30) << std::fixed << std::left << model_name;
   ofs << "min = " << std::setw(12) << min_res;
   ofs << "max = " << std::setw(12) << max_res;
   ofs << "average = " << std::setw(12) << avg_res;

@@ -209,8 +222,7 @@ int main(int argc, char** argv) {
   // Output optimized model if needed
   if (FLAGS_run_model_optimize) {
-    paddle::lite_api::OutputOptModel(
-        FLAGS_model_dir, save_optimized_model_dir, input_shapes);
+    paddle::lite_api::OutputOptModel(save_optimized_model_dir, input_shapes);
   }
 #ifdef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK
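The three model-path flags map directly onto CxxConfig: a non-combined model (one file per variable) is selected with model_dir, while a combined-format model uses model_file plus param_file, which take precedence. A minimal sketch of that selection, assuming the CxxConfig setters shown in the hunk above and a generic Paddle-Lite API header path:

#include <string>
#include "lite/api/paddle_api.h"  // assumed include path for CxxConfig

// Sketch only: choose between combined and non-combined model layout the way
// the reworked benchmark flags do.
paddle::lite_api::CxxConfig MakeConfig(const std::string& model_dir,
                                       const std::string& model_file,
                                       const std::string& param_file) {
  paddle::lite_api::CxxConfig config;
  if (!model_file.empty() && !param_file.empty()) {
    config.set_model_file(model_file);  // combined format: single model file
    config.set_param_file(param_file);  // combined format: single param file
  } else {
    config.set_model_dir(model_dir);    // non-combined: directory of files
  }
  return config;
}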
lite/backends/arm/math/conv3x3s1p01_depthwise_fp32.cc  (+31 −31)

In conv_depthwise_3x3s1_fp32, every LeakyReLU mask in the arm64 inline-assembly
macros (LEFT_RESULT_S1_LEAKY_RELU, MID_RESULT_S1_LEAKY_RELU,
RIGHT_RESULT_S1_LEAKY_RELU, RESULT_S_S1_LEAKY_RELU) switches from the unsigned
integer compare `cmhs` to the floating-point compare `fcmge`; the intrinsic
comments change accordingly from vcgeq_u32 to vcgeq_f32. The surrounding
fmul/bif/load instructions are unchanged. Representative hunk:

@@ -614,11 +614,11 @@ void conv_depthwise_3x3s1_fp32(const float *din,
-  "cmhs  v18.4s, v12.4s, %[vzero].4s \n" /* vcgeq_u32 */            \
-  "cmhs  v19.4s, v13.4s, %[vzero].4s \n" /* vcgeq_u32 */            \
+  "fcmge v18.4s, v12.4s, %[vzero].4s \n" /* vcgeq_f32 */            \
+  "fcmge v19.4s, v13.4s, %[vzero].4s \n" /* vcgeq_f32 */            \
   "fmul  v20.4s, v12.4s, %[vscale].4s \n" /* mul */                 \
   "fmul  v21.4s, v12.4s, %[vscale].4s \n" /* mul */                 \
   "ld1   {v8.4s}, [%[din_ptr4]], #16 \n" /* vld1q_f32(din_ptr0) */  \

The same cmhs -> fcmge substitution is applied in the hunks at lines 639, 657,
802, 824, 846, 861, 980, 999, 1017, 1028 and 1128 of the file (registers
v12-v15 compared against %[vzero], or against the zeroed v21/v1 scratch
register); the RESULT_S_S1_LEAKY_RELU block at line 1128 is additionally
re-wrapped so its line-continuation backslashes line up.
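The switch from cmhs to fcmge matters because LeakyReLU needs a sign test on float lanes: cmhs compares the raw bit patterns as unsigned integers, so every negative float (sign bit set) compares as "higher or same" than zero and would wrongly take the identity branch. A minimal intrinsics sketch of the selection the assembly macros implement, assuming one 4-lane tile and a scalar slope alpha; vcgeq_f32 is the intrinsic named in the new comments:

#include <arm_neon.h>

// Sketch of the LeakyReLU select performed on one float32x4_t tile.
static inline float32x4_t leaky_relu_f32x4(float32x4_t v, float alpha) {
  float32x4_t vzero  = vdupq_n_f32(0.f);
  float32x4_t vscale = vdupq_n_f32(alpha);
  uint32x4_t  mask   = vcgeq_f32(v, vzero);   // fcmge: lane-wise v >= 0
  float32x4_t scaled = vmulq_f32(v, vscale);  // fmul: alpha * v
  return vbslq_f32(mask, v, scaled);          // bif/bsl: keep v where v >= 0
}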
lite/backends/arm/math/conv3x3s1px_depthwise_fp32.cc  (+8 −8)

The same fix lands in the LEAKY_RELU macro: the eight `cmhs` mask instructions
(v1-v4 against v15-v18, then against v19-v22) become `fcmge`, with the
comments updated from vcgeq_u32 to vcgeq_f32; the fmul/bif selection sequence
is unchanged. Representative lines:

@@ -179,11 +179,11 @@ namespace math {
-  "cmhs  v1.4s, v15.4s, v0.4s \n" /* vcgeq_u32 */ \
-  "cmhs  v2.4s, v16.4s, v0.4s \n" /* vcgeq_u32 */ \
+  "fcmge v1.4s, v15.4s, v0.4s \n" /* vcgeq_f32 */ \
+  "fcmge v2.4s, v16.4s, v0.4s \n" /* vcgeq_f32 */ \
   "ld1 {v9.4s}, [x0] \n"                          \
-  "cmhs  v3.4s, v17.4s, v0.4s \n" /* vcgeq_u32 */ \
-  "cmhs  v4.4s, v18.4s, v0.4s \n" /* vcgeq_u32 */ \
+  "fcmge v3.4s, v17.4s, v0.4s \n" /* vcgeq_f32 */ \
+  "fcmge v4.4s, v18.4s, v0.4s \n" /* vcgeq_f32 */ \

@@ -193,10 +193,10 @@ namespace math {
   (the four compares against v19-v22 receive the identical substitution)
lite/backends/arm/math/conv3x3s2_direct_int8.cc  (+2 −2)

Both conv_3x3s2_direct_int8 variants read the width padding from the wrong
slot of the 4-element paddings vector:

@@ -50,7 +50,7 @@ void conv_3x3s2_direct_int8(const int8_t* din,
   bool flag_relu = param.fuse_relu;
   bool flag_bias = param.bias;
   int pad_h = paddings[0];
-  int pad_w = paddings[1];
+  int pad_w = paddings[2];
   const int threads = ctx->threads();
   int llc_size = ctx->llc_size() / 4;

@@ -477,7 +477,7 @@ (second kernel variant, same fix)
   int pad_h = paddings[0];
-  int pad_w = paddings[1];
+  int pad_w = paddings[2];
   const int threads = ctx->threads();
   //! set 1/4 l2 cache
   int llc_size = ctx->llc_size() / 4;
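The fix follows the padding layout assumed elsewhere in this commit (for example, lite/kernels/arm/conv_compute.cc checks paddings[0] == paddings[1] and paddings[2] == paddings[3]): the vector holds {top, bottom, left, right}, so index 1 is the bottom pad, not the left one. A small sketch of that convention; the helper name is only for illustration:

#include <vector>

// Sketch: Paddle-Lite convolution paddings are stored as
// {pad_top, pad_bottom, pad_left, pad_right}.
void ReadPads(const std::vector<int>& paddings, int* pad_h, int* pad_w) {
  *pad_h = paddings[0];  // top pad drives the height offset
  *pad_w = paddings[2];  // left pad drives the width offset (was paddings[1])
}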
lite/backends/arm/math/conv3x3s2p01_depthwise_fp32.cc  (+81 −81)

conv_depthwise_3x3s2_fp32 gets the same LeakyReLU mask fix. In
LEFT_RESULT_S2_LEAKY_RELU, MID_RESULT_S2_LEAKY_RELU and
RIGHT_RESULT_S2_LEAKY_RELU the `cmhs v11.4s, v16/v17.4s, %[vzero].4s`
instructions become `fcmge`, and the macro bodies are re-wrapped so the
line-continuation backslashes line up; the loads, fmul/fadd arithmetic and bif
selects are otherwise unchanged. Representative lines from the three hunks
(@@ -451,44 +451,44 @@, @@ -542,30 +542,30 @@, @@ -606,25 +606,25 @@):

-  "cmhs  v11.4s, v16.4s, %[vzero].4s \n" /* vcgeq_u32 */ \
+  "fcmge v11.4s, v16.4s, %[vzero].4s \n" /* vcgeq_f32 */ \
   "fmul  v12.4s, v16.4s, v22.4s \n"                       \
   ...
-  "cmhs  v11.4s, v17.4s, %[vzero].4s \n" /* vcgeq_u32 */ \
+  "fcmge v11.4s, v17.4s, %[vzero].4s \n" /* vcgeq_u32 */ \
   "fmul  v12.4s, v17.4s, v22.4s \n"                       \
lite/backends/arm/math/conv3x3s2px_depthwise_fp32.cc  (+4 −4)

@@ -104,13 +104,13 @@ namespace math {
   "fmin v22.4s, v22.4s, %[vsix].4s\n"
 #define LEAKY_RELU /* LeakyRelu */                         \
   "movi v0.4s, #0\n" /* for relu */                        \
-  "cmhs  v1.4s, v19.4s, v0.4s \n" /* vcgeq_u32 */          \
+  "fcmge v1.4s, v19.4s, v0.4s \n" /* vcgeq_u32 */          \
   "fmul  v2.4s, v19.4s, %[vscale].4s \n" /* mul */         \
-  "cmhs  v3.4s, v20.4s, v0.4s \n" /* vcgeq_u32 */          \
+  "fcmge v3.4s, v20.4s, v0.4s \n" /* vcgeq_u32 */          \
   "fmul  v4.4s, v20.4s, %[vscale].4s \n" /* mul */         \
-  "cmhs  v5.4s, v21.4s, v0.4s \n" /* vcgeq_u32 */          \
+  "fcmge v5.4s, v21.4s, v0.4s \n" /* vcgeq_u32 */          \
   "fmul  v6.4s, v21.4s, %[vscale].4s \n" /* mul */         \
-  "cmhs  v7.4s, v22.4s, v0.4s \n" /* vcgeq_u32 */          \
+  "fcmge v7.4s, v22.4s, v0.4s \n" /* vcgeq_u32 */          \
   "fmul  v8.4s, v22.4s, %[vscale].4s \n" /* mul */         \
   "bif  v19.16b, v2.16b, v1.16b \n" /* choose*/            \
   "bif  v19.16b, v4.16b, v3.16b \n" /* choose*/            \
lite/backends/arm/math/conv5x5s1_depthwise_fp32.cc  (+713 −9565)

This diff is collapsed on the review page because of its size; the 5x5 stride-1
depthwise kernel is largely rewritten.
lite/backends/arm/math/conv5x5s1_depthwise_int8.cc  (+0 −1)

@@ -709,7 +709,6 @@ void conv_depthwise_5x5s1_int8(Dtype* dout,
           "q15");
 #endif
       // clang-format on
-      int32_t* ptr_tmp = ptr_out0 - w_loop * 32;
       block_inr0 = block_inr1;
       block_inr1 = block_inr2;
       block_inr2 = block_inr3;
lite/backends/arm/math/conv5x5s2_depthwise_fp32.cc  (+18 −18)

@@ -198,24 +198,24 @@ namespace math {
The LEAKY_RELU macro gets the same mask fix (cmhs -> fcmge on v1/v3/v5/v7
against v0, comments updated to vcgeq_f32); the STORE macro that follows is
only re-wrapped. Representative lines:

-  "cmhs  v1.4s, v19.4s, v0.4s \n" /* vcgeq_u32 */   \
+  "fcmge v1.4s, v19.4s, v0.4s \n" /* vcgeq_f32 */   \
   "fmul  v2.4s, v19.4s, %[vscale].4s \n" /* mul */  \
-  "cmhs  v3.4s, v20.4s, v0.4s \n" /* vcgeq_u32 */   \
+  "fcmge v3.4s, v20.4s, v0.4s \n" /* vcgeq_f32 */   \
   "fmul  v4.4s, v20.4s, %[vscale].4s \n" /* mul */  \
   ...
 #define STORE /* save result */      \
   "str q19, [%[outc0]], #16\n"       \
   "str q20, [%[outc1]], #16\n"       \
   "str q21, [%[outc2]], #16\n"       \
   "str q22, [%[outc3]], #16\n"
 #else
lite/backends/arm/math/conv_block_utils.h  (+55 −55)

All of the NCHWC{1,2,4,8}_TRANS_FP32_LEAKY_RELU macros and DO_LEAKY_RELU in
act_switch_c8_fp32 switch their mask instructions from `cmhs` to `fcmge`
(hunks at lines 614, 934, 1275, 1754 and 2168, with comments updated from
vcgeq_u32 to vcgeq_f32); the fmul/bif selection and the neighbouring
store/load macros are unchanged apart from re-wrapped line continuations
(hunks at lines 674 and 1839). Representative lines:

@@ -614,16 +614,16 @@ inline void prepack_input_nxwc8_int8_dw(const int8_t* din,
-  "cmhs  v4.4s, v0.4s, v20.4s \n" /* vcgeq_u32 */ \
-  "cmhs  v5.4s, v1.4s, v20.4s \n" /* vcgeq_u32 */ \
+  "fcmge v4.4s, v0.4s, v20.4s \n" /* vcgeq_f32 */ \
+  "fcmge v5.4s, v1.4s, v20.4s \n" /* vcgeq_f32 */ \
   "fmul v8.4s, v0.4s, %[scale].4s \n" /* mul */   \
   "fmul v9.4s, v1.4s, %[scale].4s \n" /* mul */   \

The armv7 path also fixes a wrong operand in the LeakyReLU select of the
c8 write-back:

@@ -2217,7 +2217,7 @@ inline void act_switch_c8_fp32(const float* din_ptr,
   "vbif q3, q8, q7   @ choose \n" \
   "vbif q4, q10, q9  @ choose \n" \
   "vbif q5, q12, q11 @ choose \n" \
-  "vbif q6, q13, q13 @ choose \n"
+  "vbif q6, q14, q13 @ choose \n"
lite/backends/arm/math/conv_depthwise.h  (+11 −10)

@@ -123,20 +123,21 @@ void conv_depthwise_3x3s2_int8(Dtype* dout,
                                int padh,
                                ARMContext* ctx);

-void conv_depthwise_5x5s1_fp32(const float* din,
-                               float* dout,
-                               int num,
-                               int chout,
-                               int hout,
-                               int wout,
-                               int chin,
-                               int hin,
-                               int win,
+void conv_depthwise_5x5s1_fp32(float* dout,
+                               const float* din,
                                const float* weights,
                                const float* bias,
-                               int pad,
                                bool flag_bias,
                                bool flag_relu,
+                               int num,
+                               int chin,
+                               int hin,
+                               int win,
+                               int hout,
+                               int wout,
+                               int padw,
+                               int padh,
+                               const operators::ConvParam& param,
                                ARMContext* ctx);

 void conv_depthwise_5x5s2_fp32(const float* din,
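Since the new parameter list is long and mostly plain ints, the roles are easiest to keep straight when spelled out. A sketch of the same prototype with the argument roles as comments (the comments are annotations added here, not part of the header):

// Roles of the reordered conv_depthwise_5x5s1_fp32 parameters:
void conv_depthwise_5x5s1_fp32(float* dout,           // output feature map
                               const float* din,      // input feature map
                               const float* weights,  // 5x5 depthwise weights
                               const float* bias,     // per-channel bias
                               bool flag_bias,
                               bool flag_relu,
                               int num,               // batch size
                               int chin,              // channels (== groups)
                               int hin, int win,      // input height / width
                               int hout, int wout,    // output height / width
                               int padw, int padh,    // left / top padding
                               const operators::ConvParam& param,
                               ARMContext* ctx);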
lite/backends/arm/math/conv_impl.cc  (+23 −16)

@@ -188,7 +188,6 @@ void conv1x1s1_gemm(const float* i_data,
   if (n > 1) {
     weights_size_per_group = ((m_roundup * k + 15) / 16) * 16;
   }
-  //! use gemv when the output channel size = 1
   for (int b = 0; b < num; ++b) {
     // dC

@@ -210,8 +209,11 @@ void conv1x1s1_gemm(const float* i_data,
               k,
               flag_bias,
               bias_group,
-              flag_relu,
-              ctx);
+              act_param.has_active,
+              act_param.active_type,
+              ctx,
+              act_param.Relu_clipped_coef,
+              act_param.Leaky_relu_alpha);
     } else {
       sgemm_prepack(false,
                     m,

@@ -410,8 +412,11 @@ void conv_im2col_gemm(const float* i_data,
               k,
               flag_bias,
               bias_group,
-              flag_relu,
-              ctx);
+              act_param.has_active,
+              act_param.active_type,
+              ctx,
+              act_param.Relu_clipped_coef,
+              act_param.Leaky_relu_alpha);
     } else {
       int ldb = n;
       sgemm_prepack(false,

@@ -677,7 +682,8 @@ void conv_depthwise_5x5_fp32(const void* din,
                              const float* scale) {
   auto paddings = *param.paddings;
   auto act_param = param.activation_param;
-  int pad = paddings[0];
+  int pad_h = paddings[0];
+  int pad_w = paddings[2];
   int stride = param.strides[1];
   bool flag_relu = param.fuse_relu;
   bool flag_bias = param.bias != nullptr;

@@ -698,20 +704,21 @@ void conv_depthwise_5x5_fp32(const void* din,
                               act_param,
                               ctx);
   } else if (stride == 1) {
-    conv_depthwise_5x5s1_fp32(reinterpret_cast<const float*>(din),
-                              reinterpret_cast<float*>(dout),
-                              num,
-                              ch_out,
-                              h_out,
-                              w_out,
-                              ch_in,
-                              h_in,
-                              w_in,
+    conv_depthwise_5x5s1_fp32(reinterpret_cast<float*>(dout),
+                              reinterpret_cast<const float*>(din),
+                              reinterpret_cast<const float*>(weights),
                               bias,
-                              pad,
                               flag_bias,
                               flag_relu,
+                              num,
+                              ch_in,
+                              h_in,
+                              w_in,
+                              h_out,
+                              w_out,
+                              pad_w,
+                              pad_h,
+                              param,
                               ctx);
   } else {
     LOG(FATAL) << "unsupport this type 5x5 dw conv";
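For context on the two gemm call sites touched above: a 1x1, stride-1 convolution is a per-image matrix multiply with m = output channels, k = input channels and n = output pixels, which is why conv1x1s1_gemm can fall back to a matrix-vector product in the degenerate case. A naive reference sketch of that equivalence (illustration only, not the library routine):

// Naive reference: 1x1 s1 convolution as C[m x n] = A[m x k] * B[k x n].
// A = weights (oc x ic), B = input feature map (ic x (h*w)), C = output.
void conv1x1s1_ref(const float* A, const float* B, float* C,
                   int m, int n, int k) {
  for (int i = 0; i < m; ++i) {
    for (int j = 0; j < n; ++j) {
      float acc = 0.f;
      for (int p = 0; p < k; ++p) acc += A[i * k + p] * B[p * n + j];
      C[i * n + j] = acc;  // a single row or column degenerates to a gemv
    }
  }
}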
lite/backends/arm/math/fill_bias_relu.cc  (+13 −13)

@@ -136,19 +136,19 @@ void fill_bias_relu<int>(int* tensor,
The FILL_LEAKY_RELU macro receives the same mask fix (cmhs -> fcmge on
v4/v6/v8/v10 against %[vzero], comments updated from vcgeq_u32 to vcgeq_f32)
and is re-wrapped; the fmul/bif selection and FILL_STORE are unchanged.
Representative lines:

-  "cmhs  v4.4s, v0.4s, %[vzero].4s \n" /* vcgeq_u32 */   \
+  "fcmge v4.4s, v0.4s, %[vzero].4s \n" /* vcgeq_f32 */   \
   "fmul v5.4s, v0.4s, %[vscale].4s \n" /* vmulq_f32 */   \
-  "cmhs  v6.4s, v1.4s, %[vzero].4s \n" /* vcgeq_u32 */   \
+  "fcmge v6.4s, v1.4s, %[vzero].4s \n" /* vcgeq_f32 */   \
   "fmul v7.4s, v1.4s, %[vscale].4s \n" /* vmulq_f32 */   \
lite/backends/arm/math/sgemv.cc  (+584 −411)

This diff is collapsed on the review page because of its size; the sgemv
kernels are reworked to implement the new activation-aware interface declared
in sgemv.h below.
lite/backends/arm/math/sgemv.h  (+10 −7)

@@ -17,23 +17,26 @@
 #include <cmath>
 #include "lite/core/context.h"
 #include "lite/core/device_info.h"
 #include "lite/operators/op_params.h"

 namespace paddle {
 namespace lite {
 namespace arm {
 namespace math {

 // TODO(xxx): fixme now only support transA = false
 bool sgemv(const float* A,
            const float* x,
            float* y,
            bool transA,
            int M,
            int N,
            bool is_bias,
            const float* bias,
-           bool is_relu,
-           const ARMContext* ctx);
+           bool flag_act,
+           lite_api::ActivationType act,
+           const ARMContext* ctx,
+           float six = 6.f,
+           float alpha = 1.f);

 }  // namespace math
 }  // namespace arm
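Call sites now pass an activation descriptor instead of a bare ReLU flag. A sketch of the mapping the ARM kernels in this commit perform (compare fc_compute.cc and conv_impl.cc below); the helper is hypothetical and assumed to live inside the paddle::lite namespace:

// Hypothetical helper: unpack an ActivationParam into the new sgemv arguments.
void call_sgemv(const float* A, const float* x, float* y, bool transA,
                int M, int N, bool is_bias, const float* bias,
                const operators::ActivationParam& act_param,
                const ARMContext* ctx) {
  bool flag_act = act_param.has_active;
  auto act = act_param.active_type;          // kRelu, kRelu6, kLeakyRelu, ...
  float six = act_param.Relu_clipped_coef;   // clip value used by relu6
  float alpha = act_param.Leaky_relu_alpha;  // slope used by leaky relu
  arm::math::sgemv(A, x, y, transA, M, N, is_bias, bias,
                   flag_act, act, ctx, six, alpha);
}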
lite/backends/opencl/cl_kernel/cl_common.h  (+10 −2)

@@ -72,7 +72,7 @@ inline CL_DTYPE activation(CL_DTYPE in
                            CL_DTYPE prelu_alpha
 #endif
                            ) {
-  CL_DTYPE output;
+  CL_DTYPE output = in;
 #ifdef PRELU
   output = select(prelu_alpha * in, in, in >= (CL_DTYPE)0);
 #endif

@@ -80,6 +80,10 @@
 #ifdef RELU
   output = fmax(in, (CL_DTYPE)0);
 #endif
+
+#ifdef RELU6
+  output = clamp(in, (CL_DTYPE)0, (CL_DTYPE)6);
+#endif
   return output;
 }

@@ -89,7 +93,7 @@ inline CL_DTYPE4 activation_type4(CL_DTYPE4 in
                                   CL_DTYPE4 prelu_alpha
 #endif
                                   ) {
-  CL_DTYPE4 output;
+  CL_DTYPE4 output = in;
 #ifdef PRELU
   output = select(prelu_alpha * in, in, in >= (CL_DTYPE4)0.0);
 #endif

@@ -97,5 +101,9 @@
 #ifdef RELU
   output = fmax(in, (CL_DTYPE4)0);
 #endif
+
+#ifdef RELU6
+  output = clamp(in, (CL_DTYPE4)0, (CL_DTYPE4)6);
+#endif
   return output;
 }
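The new RELU6 branch clamps the activation into [0, 6]. A scalar C++ sketch of the same semantics, useful as a host-side reference when checking the OpenCL kernels in this commit:

#include <algorithm>

// Host-side reference for the RELU6 branch added to activation():
// clamp(x, 0, threshold), with threshold = 6 in the kernel macro.
inline float relu6_ref(float x, float threshold = 6.f) {
  return std::min(std::max(x, 0.f), threshold);
}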
lite/backends/opencl/cl_kernel/image/depthwise_conv2d_basic_kernel.cl  (+1 −3)

activation_type4() is now applied unconditionally (it selects RELU, RELU6 or
PRELU internally via its own macros), and the file gains a trailing newline:

@@ -95,9 +95,7 @@ __kernel void depth_conv2d(__private const int global_size_dim0,
       READ_IMG_TYPE(CL_DTYPE_CHAR, new_biase, sampler, (int2)(out_c, 0));
 #endif

-#ifdef RELU
   output = activation_type4(output);
-#endif

   WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, output);
-}
\ No newline at end of file
+}
lite/backends/opencl/cl_kernel/image/relu6_kernel.cl  (new file, +32 −0)

/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include <cl_common.h>

__kernel void relu6(__read_only image2d_t input,
                    __write_only image2d_t output,
                    __private const float threshold) {
  const int x = get_global_id(0);
  const int y = get_global_id(1);

  const sampler_t sampler =
      CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;

  CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y));
  in = max((CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f), in);
  in = min((CL_DTYPE4)(threshold, threshold, threshold, threshold), in);
  WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), in);
}
lite/kernels/arm/conv_compute.cc  (+12 −14)

@@ -42,8 +42,6 @@ void ConvCompute<PRECISION(kFloat), PRECISION(kFloat)>::PrepareForRun() {
   int stride = param.strides[0];
   int threads = ctx.threads();

-  bool pads_equal =
-      ((paddings[0] == paddings[1]) && (paddings[2] == paddings[3]));
   int chin = param.x->dims()[1];
   int hin = param.x->dims()[2];
   int win = param.x->dims()[3];

@@ -51,28 +49,28 @@ void ConvCompute<PRECISION(kFloat), PRECISION(kFloat)>::PrepareForRun() {
   int hout = param.output->dims()[2];
   int wout = param.output->dims()[3];

+  bool pads_equal =
+      ((paddings[0] == paddings[1]) && (paddings[2] == paddings[3]));
   bool pads_all_equal = (pads_equal && paddings[0] == paddings[2]);

-  bool kps_equal = (param.strides[0] == param.strides[1]) && (kw == kh);
+  bool ks_equal = (param.strides[0] == param.strides[1]) && (kw == kh);
   bool no_dilation = (dilations[0] == 1) && (dilations[1] == 1);
-  bool flag_dw_3x3 = (kw == 3 && kh == 3 && (stride == 1 || stride == 2));
-  bool flag_dw_5x5 = (paddings[0] == paddings[2]) &&
-                     ((kw == 5 && stride == 1) || (kw == 5 && stride == 2));
+  bool flag_dw_3x3 = (kw == 3) && (kh == 3) && (stride == 1 || stride == 2);
+  bool flag_dw_5x5 = (kw == 5) && (kh == 5) && (stride == 1 || stride == 2);
   bool flag_dw = flag_dw_3x3 || flag_dw_5x5;

   /// select conv impl
-  if (param.groups == ic && ic == oc && kps_equal && no_dilation && flag_dw) {
+  if (param.groups == ic && ic == oc && ks_equal && no_dilation && flag_dw) {
     /// dw conv impl
     impl_ = new DepthwiseConv<PRECISION(kFloat), PRECISION(kFloat)>;
     // VLOG(3) << "invoking dw conv";
-  } else if (param.groups == 1 && kw == 3 && stride == 1 && kps_equal &&
+  } else if (param.groups == 1 && kw == 3 && stride == 1 && ks_equal &&
              no_dilation && pads_all_equal) {
     /// winograd conv impl
     // TODO(MyPandaShaoxiang): winograd conv support any pad
     impl_ = new WinogradConv<PRECISION(kFloat), PRECISION(kFloat)>;
     // VLOG(3) << "invoking winograd conv";
   } else if (param.groups == 1 && kw == 3 && stride == 2 &&
-             chin * chout < 4 * hin * win && kps_equal && no_dilation) {
+             chin * chout < 4 * hin * win && ks_equal && no_dilation) {
     /// direct conv impl
     impl_ = new DirectConv<PRECISION(kFloat), PRECISION(kFloat)>;
     // VLOG(3) << "invoking direct conv";
   } else {

The int8 kernels only tighten the 5x5 depthwise eligibility test to square
kernels:

@@ -109,7 +107,7 @@ void ConvCompute<PRECISION(kInt8), PRECISION(kFloat)>::PrepareForRun() {
   bool kps_equal = (pw == ph) && (sh == sw) && (kw == kh);
   bool no_dilation = (dilations[0] == 1) && (dilations[1] == 1);
   bool flag_dw_3x3 = (kw == 3 && kh == 3 && (sw == 1 || sw == 2));
-  bool flag_dw_5x5 = pads_all_equal && (kw == 5 && sw == 1);
+  bool flag_dw_5x5 = pads_all_equal && (kw == 5 && kh == 5 && sw == 1);
   bool flag_dw = flag_dw_3x3 || flag_dw_5x5;

   if (param.groups == ic && ic == oc && kps_equal && pads_equal &&

@@ -154,7 +152,7 @@ void ConvCompute<PRECISION(kInt8), PRECISION(kInt8)>::PrepareForRun() {
   bool flag_dw_3x3 = (kw == 3 && kh == 3 && (sw == 1 || sw == 2));
-  bool flag_dw_5x5 = pads_all_equal && (kw == 5 && sw == 1);
+  bool flag_dw_5x5 = pads_all_equal && (kw == 5 && kh == 5 && sw == 1);
   bool flag_dw = flag_dw_3x3 || flag_dw_5x5;

   if (param.groups == ic && ic == oc && kps_equal && pads_equal &&
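The reworked flag_dw_5x5 requires a square 5x5 kernel; the old int8 expression (kw == 5 && sw == 1) would also have matched a rectangular 5x3 or 5x1 kernel. A small self-contained sketch of the predicate change, with standalone variables used purely for illustration:

#include <cassert>

// Sketch: old vs. new depthwise-5x5 eligibility test from PrepareForRun().
bool flag_dw_5x5_old(int kw, int kh, int sw, bool pads_all_equal) {
  return pads_all_equal && (kw == 5 && sw == 1);             // kh unchecked
}
bool flag_dw_5x5_new(int kw, int kh, int sw, bool pads_all_equal) {
  return pads_all_equal && (kw == 5 && kh == 5 && sw == 1);  // square only
}

int main() {
  assert(flag_dw_5x5_old(5, 3, 1, true));   // rectangular kernel slipped through
  assert(!flag_dw_5x5_new(5, 3, 1, true));  // now rejected
  assert(flag_dw_5x5_new(5, 5, 1, true));
  return 0;
}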
lite/kernels/arm/conv_depthwise.cc  (+7 −3)

The fp32 5x5 depthwise path now transposes weights and selects the
implementation for both stride 1 and stride 2, and fails loudly for any other
stride:

@@ -52,7 +52,10 @@ void DepthwiseConv<PRECISION(kFloat), PRECISION(kFloat)>::PrepareForRun() {
     impl_ = lite::arm::math::conv_depthwise_3x3_fp32;
   } else if (kw == 5) {
     // VLOG(5) << "invoke 5x5 dw conv fp32";
-    if (param.strides[0] == 2) {  // conv5x5s2_dw
+    auto strides = param.strides;
+    if ((strides[0] == 1 && strides[1] == 1) ||
+        (strides[0] == 2 && strides[1] == 2)) {
       // trans weights
       constexpr int cblock = 4;
       auto oc = w_dims[0];
       auto kh = w_dims[2];

@@ -63,10 +66,11 @@ void DepthwiseConv<PRECISION(kFloat), PRECISION(kFloat)>::PrepareForRun() {
       lite::arm::math::conv_trans_weights_numc(
           w_data_in, w_data, oc, 1, cblock, kh * kw);
       flag_trans_weights_ = true;
+      impl_ = lite::arm::math::conv_depthwise_5x5_fp32;
     } else {
-      flag_trans_weights_ = false;
+      LOG(FATAL) << "5x5 depthwise conv only support stride == 1 or stride == 2";
     }
-    impl_ = lite::arm::math::conv_depthwise_5x5_fp32;
   } else {
     LOG(FATAL) << "this type dw conv not impl";
   }
lite/kernels/arm/fc_compute.cc  (+7 −4)

@@ -93,9 +93,11 @@ void FcCompute<PRECISION(kFloat), PRECISION(kFloat)>::Run() {
   if (flag_trans_bias_) {
     b_data = bias_.data<float>();
   }
-  bool flag_relu = false;
+  bool flag_act = false;
+  lite_api::ActivationType act;
   if (param.activation_type == "relu") {
-    flag_relu = true;
+    act = lite_api::ActivationType::kRelu;
+    flag_act = true;
   }
   if (flag_gemm_) {
     operators::ActivationParam act_param;

@@ -119,7 +121,7 @@ void FcCompute<PRECISION(kFloat), PRECISION(kFloat)>::Run() {
                             &ctx);
     if (param.bias) {
       CHECK_EQ(param.bias->numel(), n_);
-      lite::arm::math::fill_bias_fc(o_data, b_data, m_, n_, flag_relu);
+      lite::arm::math::fill_bias_fc(o_data, b_data, m_, n_, flag_act);
     }
   } else {
     for (int i = 0; i < m_; ++i) {

@@ -133,7 +135,8 @@ void FcCompute<PRECISION(kFloat), PRECISION(kFloat)>::Run() {
                              k_,
                              param.bias != nullptr,
                              b_data,
-                             flag_relu,
+                             flag_act,
+                             act,
                              &ctx);
     }
   }
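fill_bias_fc's last argument keeps its meaning here (apply a plain ReLU after adding the bias); only the variable feeding it is renamed. As a rough reference for what that call is expected to do, a hedged sketch, for intuition only, since the routine's exact signature and behaviour are not shown in this diff:

#include <algorithm>

// Hedged reference: add the per-column bias to each of the m rows of the
// m x n output and optionally apply ReLU, as the fc path above assumes.
void fill_bias_fc_ref(float* out, const float* bias, int m, int n, bool relu) {
  for (int i = 0; i < m; ++i) {
    for (int j = 0; j < n; ++j) {
      float v = out[i * n + j] + bias[j];
      out[i * n + j] = relu ? std::max(v, 0.f) : v;
    }
  }
}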
lite/kernels/arm/matmul_compute.cc  (+11 −2)

The n_ == 1 fast path now calls the new sgemv signature, passing
lite_api::ActivationType::kIndentity (the spelling used by the lite_api enum)
as the no-op activation:

@@ -233,8 +233,17 @@ void MatMulCompute::Run() {
     int ldb = n_;
     int ldc = n_;
     if (n_ == 1) {
-      lite::arm::math::sgemv(
-          x_data, y_data, o_data, false, m_, k_, false, nullptr, false, &ctx);
+      lite::arm::math::sgemv(x_data,
+                             y_data,
+                             o_data,
+                             false,
+                             m_,
+                             k_,
+                             false,
+                             nullptr,
+                             false,
+                             lite_api::ActivationType::kIndentity,
+                             &ctx);
       if (fabsf(alpha - 1.f) > 1e-8f) {
         for (size_t i = 0; i < param.Out->dims().production(); ++i) {
           o_data[i] *= alpha;
lite/kernels/arm/mul_compute.cc  (+11 −2)

@@ -50,8 +50,17 @@ void MulCompute::Run() {
   k_ = x_w;
   auto& ctx = this->ctx_->template As<ARMContext>();
   if (n_ == 1) {
-    lite::arm::math::sgemv(
-        x_data, y_data, o_data, false, m_, k_, false, nullptr, false, &ctx);
+    lite::arm::math::sgemv(x_data,
+                           y_data,
+                           o_data,
+                           false,
+                           m_,
+                           k_,
+                           false,
+                           nullptr,
+                           false,
+                           lite_api::ActivationType::kIndentity,
+                           &ctx);
   } else {
     constexpr bool is_tranposed_y = false;
lite/kernels/opencl/conv_compute.cc  (+11 −5)

@@ -70,9 +70,12 @@ void ConvCompute::PrepareForRun() {
     kernel_func_names_.push_back("gemm_batch");
     kernel_func_paths_.push_back("buffer/fc_kernel.cl");
     if (relu_fused) {
-      build_options_.push_back("-DCL_DTYPE=float -DRELU");
+      build_options_.push_back("-DCL_DTYPE_float -DRELU");
+    } else if (param.activation_param.active_type ==
+               lite_api::ActivationType::kRelu6) {
+      build_options_.push_back("-DCL_DTYPE_float -DRELU6");
     } else {
-      build_options_.push_back("-DCL_DTYPE=float");
+      build_options_.push_back("-DCL_DTYPE_float");
     }
     impl_ = &ConvCompute::Conv2d1x1;
   } else if (pad_equal) {

@@ -80,11 +83,14 @@ void ConvCompute::PrepareForRun() {
     kernel_func_names_.push_back("gemm_batch");
     kernel_func_paths_.push_back("buffer/im2col_kernel.cl");
     kernel_func_paths_.push_back("buffer/fc_kernel.cl");
-    build_options_.push_back("-DCL_DTYPE=float");
+    build_options_.push_back("-DCL_DTYPE_float");
     if (relu_fused) {
-      build_options_.push_back("-DCL_DTYPE=float -DRELU");
+      build_options_.push_back("-DCL_DTYPE_float -DRELU");
+    } else if (param.activation_param.active_type ==
+               lite_api::ActivationType::kRelu6) {
+      build_options_.push_back("-DCL_DTYPE_float -DRELU6");
     } else {
-      build_options_.push_back("-DCL_DTYPE=float");
+      build_options_.push_back("-DCL_DTYPE_float");
     }
     impl_ = &ConvCompute::GemmlikeConv2d;
     col_buffer_.reset(new lite::Tensor);
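Two things change in the build options: relu6 fusion now selects -DRELU6, and the data-type option switches from defining CL_DTYPE directly (-DCL_DTYPE=float) to the tag macro -DCL_DTYPE_float (how that tag is expanded into CL_DTYPE on the kernel side is not shown in this diff). A small illustrative helper mirroring the selection done in PrepareForRun(); the helper itself is not part of the kernel code:

#include <string>
#include <vector>

// Illustrative sketch of the build-option selection for the OpenCL conv path.
std::vector<std::string> MakeConvBuildOptions(bool relu_fused, bool relu6_fused) {
  std::vector<std::string> opts;
  if (relu_fused) {
    opts.push_back("-DCL_DTYPE_float -DRELU");
  } else if (relu6_fused) {
    opts.push_back("-DCL_DTYPE_float -DRELU6");  // picks the clamp branch in cl_common.h
  } else {
    opts.push_back("-DCL_DTYPE_float");
  }
  return opts;
}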
lite/kernels/opencl/conv_compute_test.cc  (+32 −8)

The reference helper and both tests switch the ReLU switch from a bool to a
string so that "relu", "relu6" and "None" can all be exercised.

@@ -46,7 +46,7 @@ static void conv_basic(const Dtype1* din,
                        int pad_w,
                        int pad_h,
                        bool flag_bias,
-                       bool flag_relu) {
+                       std::string flag_relu) {
   Dtype2 beta = 0;
   auto src_data = din;
   auto dst_data_ref = dout;

@@ -96,10 +96,15 @@ static void conv_basic(const Dtype1* din,
           }
         }
       }
-      if (flag_relu) {
+      if (flag_relu == "relu") {
         dst_data_ref[out_idx] = dst_data_ref[out_idx] > (Dtype2)0
                                     ? dst_data_ref[out_idx]
                                     : (Dtype2)0;
+      } else if (flag_relu == "relu6") {
+        auto dst_tmp = (dst_data_ref[out_idx] > (Dtype2)0)
+                           ? dst_data_ref[out_idx]
+                           : (Dtype2)0;
+        dst_data_ref[out_idx] = (dst_tmp < 6.f) ? dst_tmp : 6.f;
       }

@@ -186,7 +191,7 @@ TEST(conv2d, compute_conv2d_1x1) {
   /*int iw = ih;*/
   for (int iw = 1; iw < 10; iw += 1) {    // iw
     for (int ic = 1; ic < 10; ic += 1) {  // k
       for (bool bias_flag : {true /*, false*/}) {
-        for (bool relu_flag : {true /*, false*/}) {
+        for (std::string relu_flag : {"relu" /*, "relu6", "None"*/}) {
 #else
   // groups:1 stride_h:1 stride_w:1 pad_h:0 pad_w:0 kernel_h:1 kernel_h:1
   // x_dims:1 32 112 112

@@ -229,7 +234,16 @@ TEST(conv2d, compute_conv2d_1x1) {
           std::vector<int> paddings = {pad, pad, pad, pad};
           param.groups = group;
           std::vector<int> dilations = {dilation, dilation};
-          param.fuse_relu = relu_flag;
+          if (relu_flag == "relu") {
+            param.fuse_relu = true;
+          } else if (relu_flag == "None") {
+            param.fuse_relu = false;
+          } else if (relu_flag == "relu6") {
+            param.activation_param.Relu_clipped_coef = 6.f;
+            param.activation_param.has_active = true;
+            param.activation_param.active_type =
+                lite_api::ActivationType::kRelu6;
+          }
           param.paddings = std::make_shared<std::vector<int>>(paddings);
           param.dilations = std::make_shared<std::vector<int>>(dilations);

@@ -390,7 +404,7 @@ TEST(conv2d, compute_conv2d_1x1) {
 #undef PRINT_RESULT
 // #define PRINT_RESULT
-#define LOOP_TEST
+// #define LOOP_TEST
 TEST(conv2d, compute_conv2d_gemm) {

@@ -411,7 +425,7 @@ TEST(conv2d, compute_conv2d_gemm) {
   for (int iw = 1; iw < 10; iw += 1) {    // iw
     for (int ic = 1; ic < 10; ic += 1) {  // k
       for (bool bias_flag : {true, false}) {
-        for (bool relu_flag : {true, false}) {
+        for (std::string relu_flag : {"relu", "relu6", "None"}) {
 #else
   const int batch_size = 8;

@@ -420,7 +434,8 @@ TEST(conv2d, compute_conv2d_gemm) {
   const int iw = 224;
   const int ic = 3;
   const bool bias_flag = true;
-  const bool relu_flag = true;
+  const std::string relu_flag = "relu6";
+  // "relu", "relu6", "None"
 #endif
   const int oh = (ih + 2 * pad - ksize) / stride + 1;

@@ -458,7 +473,16 @@ TEST(conv2d, compute_conv2d_gemm) {
   (the gemm test receives the same relu_flag -> param mapping block as the
   1x1 test above)
lite/kernels/opencl/depthwise_conv2d_compute.cc  (+13 −1)

All four kernel classes (DepthwiseConv2dCompute, DepthwiseConv2dComputeFP16Image,
DepthwiseConv2d3x3s1ComputeFP16Image, DepthwiseConv2dBasicComputeFP32Image)
gain a relu6 branch in PrepareForRun(); the addition is shown once here, the
other three hunks (@@ -135, @@ -252, @@ -360) are identical:

@@ -39,6 +39,9 @@ class DepthwiseConv2dCompute
     const auto& param = *param_.get_mutable<param_t>();
     if (param.fuse_relu) {
       build_options_ += " -DRELU";
+    } else if (param.activation_param.active_type ==
+               lite_api::ActivationType::kRelu6) {
+      build_options_ += " -DRELU6";
     }
     auto& context = ctx_->As<OpenCLContext>();
     context.cl_context()->AddKernel(

The buffer kernel's default build option is also renamed to the macro-tag
form:

@@ -116,7 +119,7 @@ class DepthwiseConv2dCompute
  private:
   std::string kernel_func_name_{"depthwise_conv2d"};
-  std::string build_options_{"-DCL_DTYPE=float"};
+  std::string build_options_{"-DCL_DTYPE_float"};
   std::shared_ptr<cl::Event> event_{new cl::Event};
 };
lite/kernels/opencl/relu_compute.cc
浏览文件 @
dea6154b
...
...
@@ -220,12 +220,158 @@ class ReluComputeFP16ImageDefault
std
::
shared_ptr
<
cl
::
Event
>
event_
{
new
cl
::
Event
};
};
class
Relu6ComputeFloatImageDefault
:
public
KernelLite
<
TARGET
(
kOpenCL
),
PRECISION
(
kFloat
),
DATALAYOUT
(
kImageDefault
)
>
{
public:
using
param_t
=
operators
::
ActivationParam
;
std
::
string
doc
()
const
override
{
return
"Relu6 using cl::Image2D(ImageDefault/RGBA), kFloat"
;
}
void
PrepareForRun
()
override
{
auto
&
context
=
ctx_
->
As
<
OpenCLContext
>
();
context
.
cl_context
()
->
AddKernel
(
kernel_func_name_
,
"image/relu6_kernel.cl"
,
build_options_
);
}
void
Run
()
override
{
auto
&
param
=
*
param_
.
get_mutable
<
param_t
>
();
const
auto
&
x_dims
=
param
.
X
->
dims
();
auto
*
x_buf
=
param
.
X
->
data
<
float
,
cl
::
Image2D
>
();
auto
image_shape
=
InitImageDimInfoWith
(
x_dims
);
auto
*
out_buf
=
param
.
Out
->
mutable_data
<
float
,
cl
::
Image2D
>
(
image_shape
[
"width"
],
image_shape
[
"height"
]);
const
auto
&
y_dims
=
param
.
Out
->
dims
();
// useless: check dim only
auto
threshold
=
param
.
Relu_clipped_coef
;
auto
&
context
=
ctx_
->
As
<
OpenCLContext
>
();
CHECK
(
context
.
cl_context
()
!=
nullptr
);
STL
::
stringstream
kernel_key
;
kernel_key
<<
kernel_func_name_
<<
build_options_
;
auto
kernel
=
context
.
cl_context
()
->
GetKernel
(
kernel_key
.
str
());
int
arg_idx
=
0
;
cl_int
status
=
kernel
.
setArg
(
arg_idx
,
*
x_buf
);
CL_CHECK_FATAL
(
status
);
status
=
kernel
.
setArg
(
++
arg_idx
,
*
out_buf
);
CL_CHECK_FATAL
(
status
);
status
=
kernel
.
setArg
(
++
arg_idx
,
threshold
);
CL_CHECK_FATAL
(
status
);
VLOG
(
4
)
<<
TargetToStr
(
param
.
X
->
target
());
VLOG
(
4
)
<<
TargetToStr
(
param
.
Out
->
target
());
VLOG
(
4
)
<<
"image_shape(w,h):"
<<
image_shape
[
"width"
]
<<
" "
<<
image_shape
[
"height"
];
VLOG
(
4
)
<<
"x_dims["
<<
x_dims
.
size
()
<<
"D]:"
<<
x_dims
[
0
]
<<
" "
<<
x_dims
[
1
]
<<
" "
<<
x_dims
[
2
]
<<
" "
<<
x_dims
[
3
];
VLOG
(
4
)
<<
"y_dims["
<<
y_dims
.
size
()
<<
"D]:"
<<
y_dims
[
0
]
<<
" "
<<
y_dims
[
1
]
<<
" "
<<
y_dims
[
2
]
<<
" "
<<
y_dims
[
3
];
VLOG
(
4
)
<<
"threshold:"
<<
threshold
;
auto
global_work_size
=
cl
::
NDRange
{
static_cast
<
cl
::
size_type
>
(
image_shape
[
"width"
]),
static_cast
<
cl
::
size_type
>
(
image_shape
[
"height"
])};
status
=
context
.
cl_context
()
->
GetCommandQueue
().
enqueueNDRangeKernel
(
kernel
,
cl
::
NullRange
,
global_work_size
,
cl
::
NullRange
,
nullptr
,
event_
.
get
());
CL_CHECK_FATAL
(
status
);
// TODO(ysh329): io_copy(device->host) jammed if emplace to `cl_wait_list`
// context.cl_wait_list()->emplace(out_buf, event_);
context
.
cl_context
()
->
GetCommandQueue
().
finish
();
}
private:
std
::
string
kernel_func_name_
{
"relu6"
};
std
::
string
build_options_
{
"-DCL_DTYPE_float -DRELU6"
};
std
::
shared_ptr
<
cl
::
Event
>
event_
{
new
cl
::
Event
};
};
class Relu6ComputeFP16ImageDefault
    : public KernelLite<TARGET(kOpenCL),
                        PRECISION(kFP16),
                        DATALAYOUT(kImageDefault)> {
 public:
  using param_t = operators::ActivationParam;

  std::string doc() const override {
    return "Relu6 using cl::Image2D(ImageDefault/RGBA), kFP16";
  }

  void PrepareForRun() override {
    auto& context = ctx_->As<OpenCLContext>();
    context.cl_context()->AddKernel(
        kernel_func_name_, "image/relu6_kernel.cl", build_options_);
  }

  void Run() override {
    auto& param = *param_.get_mutable<param_t>();
    const auto& x_dims = param.X->dims();
    auto* x_buf = param.X->data<int16_t, cl::Image2D>();
    auto image_shape = InitImageDimInfoWith(x_dims);
    auto* out_buf = param.Out->mutable_data<int16_t, cl::Image2D>(
        image_shape["width"], image_shape["height"]);
    const auto& y_dims = param.Out->dims();  // useless: check dim only
    auto threshold = param.Relu_clipped_coef;

    auto& context = ctx_->As<OpenCLContext>();
    CHECK(context.cl_context() != nullptr);
    STL::stringstream kernel_key;
    kernel_key << kernel_func_name_ << build_options_;
    auto kernel = context.cl_context()->GetKernel(kernel_key.str());

    int arg_idx = 0;
    cl_int status = kernel.setArg(arg_idx, *x_buf);
    CL_CHECK_FATAL(status);
    status = kernel.setArg(++arg_idx, *out_buf);
    CL_CHECK_FATAL(status);
    status = kernel.setArg(++arg_idx, threshold);
    CL_CHECK_FATAL(status);

    VLOG(4) << TargetToStr(param.X->target());
    VLOG(4) << TargetToStr(param.Out->target());
    VLOG(4) << "image_shape(w,h):" << image_shape["width"] << " "
            << image_shape["height"];
    VLOG(4) << "x_dims[" << x_dims.size() << "D]:" << x_dims[0] << " "
            << x_dims[1] << " " << x_dims[2] << " " << x_dims[3];
    VLOG(4) << "y_dims[" << y_dims.size() << "D]:" << y_dims[0] << " "
            << y_dims[1] << " " << y_dims[2] << " " << y_dims[3];
    VLOG(4) << "threshold:" << threshold;

    auto global_work_size =
        cl::NDRange{static_cast<cl::size_type>(image_shape["width"]),
                    static_cast<cl::size_type>(image_shape["height"])};
    status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
        kernel,
        cl::NullRange,
        global_work_size,
        cl::NullRange,
        nullptr,
        event_.get());
    CL_CHECK_FATAL(status);
    // TODO(ysh329): io_copy(device->host) jammed if emplace to `cl_wait_list`
    // context.cl_wait_list()->emplace(out_buf, event_);
    context.cl_context()->GetCommandQueue().finish();
  }

 private:
  std::string kernel_func_name_{"relu6"};
  std::string build_options_{"-DCL_DTYPE_half -DRELU6"};
  std::shared_ptr<cl::Event> event_{new cl::Event};
};
}  // namespace opencl
}  // namespace kernels
}  // namespace lite
}  // namespace paddle
// REGISTER_LITE_KERNEL(relu,
//                      kOpenCL,
//                      kFloat,
//                      kNCHW,
...
@@ -267,3 +413,38 @@ REGISTER_LITE_KERNEL(relu,
                                       PRECISION(kFP16),
                                       DATALAYOUT(kImageDefault))})
    .Finalize();

// Relu6
REGISTER_LITE_KERNEL(
    relu6,
    kOpenCL,
    kFloat,
    kImageDefault,
    paddle::lite::kernels::opencl::Relu6ComputeFloatImageDefault,
    ImageDefault)
    .BindInput("X",
               {LiteType::GetTensorTy(TARGET(kOpenCL),
                                      PRECISION(kFloat),
                                      DATALAYOUT(kImageDefault))})
    .BindOutput("Out",
                {LiteType::GetTensorTy(TARGET(kOpenCL),
                                       PRECISION(kFloat),
                                       DATALAYOUT(kImageDefault))})
    .Finalize();
REGISTER_LITE_KERNEL(
    relu6,
    kOpenCL,
    kFP16,
    kImageDefault,
    paddle::lite::kernels::opencl::Relu6ComputeFP16ImageDefault,
    ImageDefault)
    .BindInput("X",
               {LiteType::GetTensorTy(TARGET(kOpenCL),
                                      PRECISION(kFP16),
                                      DATALAYOUT(kImageDefault))})
    .BindOutput("Out",
                {LiteType::GetTensorTy(TARGET(kOpenCL),
                                       PRECISION(kFP16),
                                       DATALAYOUT(kImageDefault))})
    .Finalize();
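With both image2d variants registered, a caller reaches relu6 through the kernel registry in the usual way. A condensed, non-authoritative sketch follows, mirroring the relu6_image2d_fp32 test later in this commit; the layout(buf<->img) steps and error handling are omitted, namespace qualifiers are abbreviated, and run_relu6_on_image is a hypothetical helper name.

// Hedged sketch only: create the registered relu6 image kernel, configure
// ActivationParam (Relu_clipped_coef carries the 6.0 threshold), and launch.
void run_relu6_on_image(lite::Tensor* relu_in, lite::Tensor* relu_out) {
  auto kernels = KernelRegistry::Global().Create(
      "relu6", TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kImageDefault));
  CHECK(!kernels.empty());
  auto relu6_kernel = std::move(kernels.front());

  operators::ActivationParam act_param;
  act_param.X = relu_in;              // cl::Image2D-backed input tensor
  act_param.Out = relu_out;           // cl::Image2D-backed output tensor
  act_param.Relu_clipped_coef = 6.f;  // becomes the kernel's threshold arg
  relu6_kernel->SetParam(act_param);

  std::unique_ptr<KernelContext> context(new KernelContext);
  context->As<OpenCLContext>().InitOnce();
  std::unique_ptr<KernelContext> relu_ctx(new KernelContext);
  context->As<OpenCLContext>().CopySharedTo(&(relu_ctx->As<OpenCLContext>()));
  relu6_kernel->SetContext(std::move(relu_ctx));
  relu6_kernel->Launch();
}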
lite/kernels/opencl/relu_compute_test.cc
...
@@ -23,9 +23,21 @@ namespace paddle {
 namespace lite {

 template <typename dtype>
-void relu_compute_ref(const dtype* x_data,
-                      const DDim& x_dim,
-                      dtype* out_data) {
-  for (int i = 0; i < x_dim.production(); ++i) {
-    out_data[i] = x_data[i] > 0.f ? x_data[i] : 0.f;
+void relu_compute_ref(const dtype* x_data,
+                      const DDim& x_dim,
+                      dtype* out_data,
+                      float threshold = 0.f) {
+  if (abs(threshold) < 1e-5) {
+    // relu
+    for (int i = 0; i < x_dim.production(); ++i) {
+      out_data[i] = (x_data[i] > threshold) ? x_data[i] : threshold;
+    }
+  } else {
+    // relu6 or relu with threshold
+    for (int i = 0; i < x_dim.production(); ++i) {
+      auto out_tmp = (x_data[i] > 0) ? x_data[i] : 0;
+      out_data[i] = (out_tmp < threshold) ? out_tmp : threshold;
+    }
+  }
 }
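To make the two branches concrete, here is a small hedged usage sketch; the tensor values are invented for illustration only, and DDim construction follows the tests below.

// Illustrative only: exercising relu_compute_ref with and without a threshold.
void relu_compute_ref_demo() {
  const std::vector<float> x = {-3.f, -0.5f, 2.f, 9.f};
  std::vector<float> y(x.size());
  const DDim dim(std::vector<DDim::value_type>{4});
  relu_compute_ref<float>(x.data(), dim, y.data());       // plain relu: 0 0 2 9
  relu_compute_ref<float>(x.data(), dim, y.data(), 6.f);  // relu6:      0 0 2 6
}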
...
@@ -252,7 +264,7 @@ TEST(relu_image2d_fp16, compute) {
               "layout(img2buf) "
               "-> host";

-#ifdef LOOP_TEST
+#ifdef RELU_FP16_LOOP_TEST
  for (int n = 1; n <= 100; n += 33) {
    for (auto c : {1, 3}) {
      for (int h = 12; h <= 100; h += 13) {
...
@@ -262,7 +274,7 @@ TEST(relu_image2d_fp16, compute) {
  const int c = 2;
  const int h = 3;
  const int w = 4;
-#endif  // LOOP_TEST
+#endif  // RELU_FP16_LOOP_TEST

  LOG(INFO) << "======== input shape[n,c,h,w]:" << n << " " << c << " " << h
            << " " << w << " ========";
...
@@ -367,13 +379,13 @@ TEST(relu_image2d_fp16, compute) {
  // compute ref cpu
  relu_compute_ref<float>(mapped_x, x_dim, y_data_ref);
// result
-#ifdef PRINT_RESULT
+#ifdef RELU_FP16_PRINT_RESULT
  LOG(INFO) << "---- print kernel result (input -> output) ----";
  for (int eidx = 0; eidx < x_dim.production(); ++eidx) {
    std::cout << mapped_x[eidx] << " -> " << mapped_y[eidx] << std::endl;
  }
-#endif  // PRINT_RESULT
+#endif  // RELU_FP16_PRINT_RESULT

  // check result: compare kernel output and cpu output(y_data_ref)
  for (int eidx = 0; eidx < x_dim.production(); eidx++) {
...
@@ -391,7 +403,321 @@ TEST(relu_image2d_fp16, compute) {
  LOG(INFO) << "free: unmap x, y";
  TargetWrapperCL::Unmap(x_data, mapped_x);
  TargetWrapperCL::Unmap(y_data, mapped_y);
-#ifdef LOOP_TEST
+#ifdef RELU_FP16_LOOP_TEST
        }  // w
      }    // h
    }      // c
  }        // n
#else
// nothing to do.
#endif
}
// #define RELU6_FP32_LOOP_TEST
// #define RELU6_FP32_PRINT_RESULT
TEST(relu6_image2d_fp32, compute) {
  LOG(INFO) << "main steps of test: host -> layout(buf2img) -> relu6(img) -> "
               "layout(img2buf) "
               "-> host";

#ifdef RELU6_FP32_LOOP_TEST
  for (int n = 1; n <= 100; n += 33) {
    for (auto c : {1, 3}) {
      for (int h = 12; h <= 100; h += 13) {
        for (int w = 12; w <= 100; w += 25) {
#else
  const int n = 1;
  const int c = 2;
  const int h = 3;
  const int w = 4;
#endif  // RELU6_FP32_LOOP_TEST

  LOG(INFO) << "======== input shape[n,c,h,w]:" << n << " " << c << " " << h
            << " " << w << " ========";
  // set layout kernels
  auto buf_to_img_kernels =
      KernelRegistry::Global().Create("layout",
                                      TARGET(kOpenCL),
                                      PRECISION(kAny),
                                      DATALAYOUT(kImageDefault));
  auto img_to_buf_kernels = KernelRegistry::Global().Create(
      "layout", TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kNCHW));
  auto relu_img_kernels =
      KernelRegistry::Global().Create("relu6",
                                      TARGET(kOpenCL),
                                      PRECISION(kFloat),
                                      DATALAYOUT(kImageDefault));
  ASSERT_FALSE(buf_to_img_kernels.empty());
  ASSERT_FALSE(buf_to_img_kernels.empty());
  ASSERT_FALSE(relu_img_kernels.empty());

  auto buf_to_img_kernel = std::move(buf_to_img_kernels.front());
  auto img_to_buf_kernel = std::move(img_to_buf_kernels.front());
  auto relu_img_kernel = std::move(relu_img_kernels.front());
  LOG(INFO) << "get 1st kernel: " << buf_to_img_kernel->doc();
  LOG(INFO) << "get 2nd kernel: " << img_to_buf_kernel->doc();
  LOG(INFO) << "get 3rd kernel: " << relu_img_kernel->doc();

  // set tensors about op param
  LOG(INFO) << "set tensors about op param";
  // layout(buf->img): x -> relu_in
  // relu(img): relu_in -> relu_out
  // layout(img->buf): relu_out -> y
  lite::Tensor x, y, relu_in, relu_out, y_ref;
  operators::LayoutParam BufferToImageParam;
  operators::LayoutParam ImageToBufferParam;
  BufferToImageParam.x = &x;
  BufferToImageParam.y = &relu_in;
  ImageToBufferParam.x = &relu_out;
  ImageToBufferParam.y = &y;
  operators::ActivationParam ReluParam;
  ReluParam.X = &relu_in;
  ReluParam.Out = &relu_out;
  ReluParam.Relu_clipped_coef = 6.f;

  const DDim x_dim = DDim(std::vector<DDim::value_type>{n, c, h, w});
  x.Resize(x_dim);
  y.Resize(x_dim);
  relu_in.Resize(x_dim);
  relu_out.Resize(x_dim);
  y_ref.Resize(x_dim);
  auto relu_image2d_shape =
      paddle::lite::kernels::opencl::InitImageDimInfoWith(x_dim);

  // initialize tensors
  LOG(INFO) << "initialize tensors";
  auto* x_data = x.mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
  auto* y_data = y.mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
  auto* y_data_ref = y_ref.mutable_data<float>(TARGET(kARM));
  auto* mapped_x = static_cast<float*>(
      TargetWrapperCL::Map(x_data, 0, sizeof(float) * x_dim.production()));
  auto* mapped_y = static_cast<float*>(
      TargetWrapperCL::Map(y_data, 0, sizeof(float) * x_dim.production()));
  for (int i = 0; i < x_dim.production(); ++i) {
    mapped_x[i] = static_cast<int>(i) - x_dim.production() / 2;
    mapped_y[i] = static_cast<int>(0);
  }
  auto* relu_in_data = relu_in.mutable_data<float, cl::Image2D>(
      relu_image2d_shape["width"], relu_image2d_shape["height"]);
  auto* relu_out_data = relu_out.mutable_data<float, cl::Image2D>(
      relu_image2d_shape["width"], relu_image2d_shape["height"]);

  // set context and kernel args
  LOG(INFO) << "set context and kernel args";
  std::unique_ptr<KernelContext> context(new KernelContext);
  context->As<OpenCLContext>().InitOnce();

  buf_to_img_kernel->SetParam(BufferToImageParam);
  std::unique_ptr<KernelContext> buf_to_img_context(new KernelContext);
  context->As<OpenCLContext>().CopySharedTo(
      &(buf_to_img_context->As<OpenCLContext>()));
  buf_to_img_kernel->SetContext(std::move(buf_to_img_context));

  img_to_buf_kernel->SetParam(ImageToBufferParam);
  std::unique_ptr<KernelContext> img_to_buf_context(new KernelContext);
  context->As<OpenCLContext>().CopySharedTo(
      &(img_to_buf_context->As<OpenCLContext>()));
  img_to_buf_kernel->SetContext(std::move(img_to_buf_context));

  relu_img_kernel->SetParam(ReluParam);
  std::unique_ptr<KernelContext> relu_img_context(new KernelContext);
  context->As<OpenCLContext>().CopySharedTo(
      &(relu_img_context->As<OpenCLContext>()));
  relu_img_kernel->SetContext(std::move(relu_img_context));

  // run kernels
  LOG(INFO) << "run kernel: buf_to_img_kernel";
  buf_to_img_kernel->Launch();
  LOG(INFO) << "run kernel: relu_img_kernel";
  relu_img_kernel->Launch();
  LOG(INFO) << "run kernel: img_to_buf_kernel";
  img_to_buf_kernel->Launch();

  // compute ref cpu
  relu_compute_ref<float>(mapped_x, x_dim, y_data_ref, 6.f);

// result
#ifdef RELU6_FP32_PRINT_RESULT
  LOG(INFO) << "---- print kernel result (input -> output) ----";
  for (int eidx = 0; eidx < x_dim.production(); ++eidx) {
    std::cout << mapped_x[eidx] << " -> " << mapped_y[eidx] << std::endl;
  }
#endif  // RELU6_FP32_PRINT_RESULT

  // check result: compare kernel output and cpu output(y_data_ref)
  for (int eidx = 0; eidx < x_dim.production(); eidx++) {
    EXPECT_NEAR(y_data_ref[eidx], mapped_y[eidx], 1e-6);
    if (abs(y_data_ref[eidx] - mapped_y[eidx]) > 1e-6) {
      LOG(INFO) << "1st diff in this case at eidx[from 0]:" << eidx << " / "
                << x_dim.production() << ", y_data_ref[" << eidx
                << "]:" << y_data_ref[eidx] << ", mapped_y[" << eidx
                << "]:" << mapped_y[eidx];
      break;
    }
  }

  // free
  LOG(INFO) << "free: unmap x, y";
  TargetWrapperCL::Unmap(x_data, mapped_x);
  TargetWrapperCL::Unmap(y_data, mapped_y);
#ifdef RELU6_FP32_LOOP_TEST
        }  // w
      }    // h
    }      // c
  }        // n
#else
// nothing to do.
#endif
}
// #define RELU6_FP16_LOOP_TEST
// #define RELU6_FP16_PRINT_RESULT
TEST(relu6_image2d_fp16, compute) {
  LOG(INFO) << "main steps of test: host -> layout(buf2img) -> relu6(img) -> "
               "layout(img2buf) "
               "-> host";

#ifdef RELU6_FP16_LOOP_TEST
  for (int n = 1; n <= 100; n += 33) {
    for (auto c : {1, 3}) {
      for (int h = 12; h <= 100; h += 13) {
        for (int w = 12; w <= 100; w += 25) {
#else
  const int n = 1;
  const int c = 2;
  const int h = 3;
  const int w = 4;
#endif  // RELU6_FP16_LOOP_TEST

  LOG(INFO) << "======== input shape[n,c,h,w]:" << n << " " << c << " " << h
            << " " << w << " ========";
  // set layout kernels
  auto buf_to_img_kernels =
      KernelRegistry::Global().Create("layout",
                                      TARGET(kOpenCL),
                                      PRECISION(kAny),
                                      DATALAYOUT(kImageDefault));
  auto img_to_buf_kernels = KernelRegistry::Global().Create(
      "layout", TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kNCHW));
  auto relu_img_kernels =
      KernelRegistry::Global().Create("relu6",
                                      TARGET(kOpenCL),
                                      PRECISION(kFloat),
                                      DATALAYOUT(kImageDefault));
  ASSERT_FALSE(buf_to_img_kernels.empty());
  ASSERT_FALSE(buf_to_img_kernels.empty());
  ASSERT_FALSE(relu_img_kernels.empty());

  auto buf_to_img_kernel = std::move(buf_to_img_kernels.front());
  auto img_to_buf_kernel = std::move(img_to_buf_kernels.front());
  auto relu_img_kernel = std::move(relu_img_kernels.front());
  LOG(INFO) << "get 1st kernel: " << buf_to_img_kernel->doc();
  LOG(INFO) << "get 2nd kernel: " << img_to_buf_kernel->doc();
  LOG(INFO) << "get 3rd kernel: " << relu_img_kernel->doc();

  // set tensors about op param
  LOG(INFO) << "set tensors about op param";
  // layout(buf->img): x -> relu_in
  // relu(img): relu_in -> relu_out
  // layout(img->buf): relu_out -> y
  lite::Tensor x, y, relu_in, relu_out, y_ref;
  operators::LayoutParam BufferToImageParam;
  operators::LayoutParam ImageToBufferParam;
  BufferToImageParam.x = &x;
  BufferToImageParam.y = &relu_in;
  ImageToBufferParam.x = &relu_out;
  ImageToBufferParam.y = &y;
  operators::ActivationParam ReluParam;
  ReluParam.X = &relu_in;
  ReluParam.Out = &relu_out;
  ReluParam.Relu_clipped_coef = 6.f;

  const DDim x_dim = DDim(std::vector<DDim::value_type>{n, c, h, w});
  x.Resize(x_dim);
  y.Resize(x_dim);
  relu_in.Resize(x_dim);
  relu_out.Resize(x_dim);
  y_ref.Resize(x_dim);
  auto relu_image2d_shape =
      paddle::lite::kernels::opencl::InitImageDimInfoWith(x_dim);

  // initialize tensors
  LOG(INFO) << "initialize tensors";
  auto* x_data = x.mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
  auto* y_data = y.mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
  auto* y_data_ref = y_ref.mutable_data<float>(TARGET(kARM));
  auto* mapped_x = static_cast<float*>(
      TargetWrapperCL::Map(x_data, 0, sizeof(float) * x_dim.production()));
  auto* mapped_y = static_cast<float*>(
      TargetWrapperCL::Map(y_data, 0, sizeof(float) * x_dim.production()));
  for (int i = 0; i < x_dim.production(); ++i) {
    mapped_x[i] = static_cast<int>(i) - x_dim.production() / 2;
    mapped_y[i] = static_cast<int>(0);
  }
  auto* relu_in_data = relu_in.mutable_data<int16_t, cl::Image2D>(
      relu_image2d_shape["width"], relu_image2d_shape["height"]);
  auto* relu_out_data = relu_out.mutable_data<int16_t, cl::Image2D>(
      relu_image2d_shape["width"], relu_image2d_shape["height"]);

  // set context and kernel args
  LOG(INFO) << "set context and kernel args";
  std::unique_ptr<KernelContext> context(new KernelContext);
  context->As<OpenCLContext>().InitOnce();

  buf_to_img_kernel->SetParam(BufferToImageParam);
  std::unique_ptr<KernelContext> buf_to_img_context(new KernelContext);
  context->As<OpenCLContext>().CopySharedTo(
      &(buf_to_img_context->As<OpenCLContext>()));
  buf_to_img_kernel->SetContext(std::move(buf_to_img_context));

  img_to_buf_kernel->SetParam(ImageToBufferParam);
  std::unique_ptr<KernelContext> img_to_buf_context(new KernelContext);
  context->As<OpenCLContext>().CopySharedTo(
      &(img_to_buf_context->As<OpenCLContext>()));
  img_to_buf_kernel->SetContext(std::move(img_to_buf_context));

  relu_img_kernel->SetParam(ReluParam);
  std::unique_ptr<KernelContext> relu_img_context(new KernelContext);
  context->As<OpenCLContext>().CopySharedTo(
      &(relu_img_context->As<OpenCLContext>()));
  relu_img_kernel->SetContext(std::move(relu_img_context));

  // run kernels
  LOG(INFO) << "run kernel: buf_to_img_kernel";
  buf_to_img_kernel->Launch();
  LOG(INFO) << "run kernel: relu_img_kernel";
  relu_img_kernel->Launch();
  LOG(INFO) << "run kernel: img_to_buf_kernel";
  img_to_buf_kernel->Launch();

  // compute ref cpu
  relu_compute_ref<float>(mapped_x, x_dim, y_data_ref, 6.f);

// result
#ifdef RELU6_FP16_PRINT_RESULT
  LOG(INFO) << "---- print kernel result (input -> output) ----";
  for (int eidx = 0; eidx < x_dim.production(); ++eidx) {
    std::cout << mapped_x[eidx] << " -> " << mapped_y[eidx] << std::endl;
  }
#endif  // RELU6_FP16_PRINT_RESULT

  // check result: compare kernel output and cpu output(y_data_ref)
  for (int eidx = 0; eidx < x_dim.production(); eidx++) {
    EXPECT_NEAR(y_data_ref[eidx], mapped_y[eidx], 1e-6);
    if (abs(y_data_ref[eidx] - mapped_y[eidx]) > 1e-6) {
      LOG(INFO) << "1st diff in this case at eidx[from 0]:" << eidx << " / "
                << x_dim.production() << ", y_data_ref[" << eidx
                << "]:" << y_data_ref[eidx] << ", mapped_y[" << eidx
                << "]:" << mapped_y[eidx];
      break;
    }
  }

  // free
  LOG(INFO) << "free: unmap x, y";
  TargetWrapperCL::Unmap(x_data, mapped_x);
  TargetWrapperCL::Unmap(y_data, mapped_y);
#ifdef RELU6_FP16_LOOP_TEST
        }  // w
      }    // h
    }      // c
...
@@ -414,3 +740,7 @@ USE_LITE_KERNEL(relu, kOpenCL, kFloat, kImageDefault, ImageDefault);
// relu image2d fp16
USE_LITE_KERNEL(relu, kOpenCL, kFP16, kImageDefault, ImageDefault);

// relu6 image2d fp32
USE_LITE_KERNEL(relu6, kOpenCL, kFloat, kImageDefault, ImageDefault);
USE_LITE_KERNEL(relu6, kOpenCL, kFP16, kImageDefault, ImageDefault);
lite/tests/kernels/fc_compute_test.cc
...
@@ -130,7 +130,7 @@ class FcOPTest : public arena::TestCase {
                 1.f,
                 0.f,
                 true,
                 flag_bias,
                 static_cast<int>(flag_bias),
                 false);
    } else {
      basic_gemm(false,
...
...
lite/tests/math/conv_compute_test.cc
...
@@ -46,14 +46,19 @@ DEFINE_int32(out_channel, 32, "output channel");
 DEFINE_int32(group, 1, "group");
 DEFINE_int32(kernel_h, 3, "kernel height");
 DEFINE_int32(kernel_w, 3, "kernel width");
-DEFINE_int32(pad_h, 1, "pad height");
-DEFINE_int32(pad_w, 1, "pad width");
+DEFINE_int32(pad_h0, 1, "pad top");
+DEFINE_int32(pad_h1, 1, "pad bottom");
+DEFINE_int32(pad_w0, 1, "pad left");
+DEFINE_int32(pad_w1, 1, "pad right");
 DEFINE_int32(stride_h, 1, "stride height");
 DEFINE_int32(stride_w, 1, "stride width");
 DEFINE_int32(dila_h, 1, "dilation height");
 DEFINE_int32(dila_w, 1, "dilation width");
-DEFINE_bool(flag_relu, true, "do relu");
+DEFINE_int32(flag_act, 0, "do activation");
+// 0-no act, 1-relu, 2-relu6, 4-leakyrelu
+DEFINE_double(leakey_relu_alpha, 1.0, "leakey relu alpha");
 DEFINE_bool(flag_bias, true, "with bias");

 typedef paddle::lite::DDim DDim;
...
@@ -98,9 +103,10 @@ void test_conv_fp32(const std::vector<DDim>& input_dims,
                     const std::vector<int>& pads,
                     const std::vector<int>& dilas,
                     bool flag_bias,
-                    bool flag_relu,
+                    int flag_act,
                     const std::vector<int>& thread_num,
-                    const std::vector<int>& power_mode) {
+                    const std::vector<int>& power_mode,
+                    const float leakey_relu_scale) {
 #ifdef LITE_WITH_ARM
   paddle::lite::DeviceInfo::Init();
 #endif
...
@@ -118,13 +124,20 @@ void test_conv_fp32(const std::vector<DDim>& input_dims,
   param.strides = strides;
   param.paddings = std::make_shared<std::vector<int>>(pads);
   param.dilations = std::make_shared<std::vector<int>>(dilas);
-  param.fuse_relu = flag_relu;
   param.groups = group;
-  if (flag_relu) {
+  const float six = 6.f;
+  if (flag_act > 0) {
     ActivationParam act_param;
     act_param.has_active = true;
-    act_param.active_type = (paddle::lite_api::ActivationType)1;  // 2-relu6 4-leakyrelu
+    act_param.active_type =
+        (paddle::lite_api::ActivationType)flag_act;  // 1-relu, 2-relu6, 4-leakyrelu
+    if (flag_act == 1) {
+      param.fuse_relu = true;
+    } else if (flag_act == 2) {
+      act_param.Relu_clipped_coef = six;
+    } else if (flag_act == 4) {
+      act_param.Leaky_relu_alpha = leakey_relu_scale;
+    }
     param.activation_param = act_param;
   }
...
@@ -205,7 +218,9 @@ void test_conv_fp32(const std::vector<DDim>& input_dims,
                     pads[2],
                     pads[0],
                     flag_bias,
-                    flag_relu);
+                    flag_act,
+                    six,
+                    leakey_relu_scale);
        }
        /// warm up
        for (int i = 0; i < FLAGS_warmup; ++i) {
...
@@ -254,22 +269,20 @@ void test_conv_fp32(const std::vector<DDim>& input_dims,
                      << ", dila_: " << dilas[0] << ", " << dilas[1]
                      << ", group: " << group
                      << ", bias: " << (flag_bias ? "true" : "false")
-                     << ", relu: " << (flag_relu ? "true" : "false")
-                     << ", threads: " << th << ", power_mode: " << cls
-                     << " failed!!\n";
+                     << ", act: " << flag_act << ", threads: " << th
+                     << ", power_mode: " << cls << " failed!!\n";
          }
        }
      }
      LOG(INFO) << "test fp32 conv: input: " << dim_in
                << ", output: " << dim_out << ", weight dim: " << weight_dim
-               << ", pad: " << pads[0] << ", " << pads[1]
-               << ", stride: " << strides[0] << ", " << strides[1]
-               << ", dila_: " << dilas[0] << ", " << dilas[1]
+               << ", pad: " << pads[0] << ", " << pads[1] << ", " << pads[2]
+               << ", " << pads[3] << ", stride: " << strides[0] << ", "
+               << strides[1] << ", dila_: " << dilas[0] << ", " << dilas[1]
                << ", group: " << group
                << ", bias: " << (flag_bias ? "true" : "false")
-               << ", relu: " << (flag_relu ? "true" : "false")
-               << ", threads: " << th << ", power_mode: " << cls
-               << " successed!!\n";
+               << ", act: " << flag_act << ", threads: " << th
+               << ", power_mode: " << cls << " successed!!\n";
    }
  }
}
...
@@ -287,12 +300,14 @@ void test_conv_fp32(const std::vector<DDim>& input_dims,
                     const std::vector<int>& pads,
                     const std::vector<int>& dilas,
                     bool flag_bias,
-                    bool flag_relu,
+                    int flag_act,
                     const std::vector<int>& thread_num,
-                    const std::vector<int>& power_mode) {}
+                    const std::vector<int>& power_mode,
+                    const float leakey_relu_scale) {}
#endif  // LITE_WITH_ARM
#if 1 /// 3x3dw
// TODO(chenjiaoAngel): fix me, diff: 3x3 depthwise conv
#if 0 /// 3x3dw
TEST(TestConv3x3DW, test_conv3x3_depthwise) {
if (FLAGS_basic_test) {
for (auto& stride : {1, 2}) {
...
...
@@ -301,7 +316,7 @@ TEST(TestConv3x3DW, test_conv3x3_depthwise) {
for (auto& pad_top : {0, 1, 2}) {
for (auto& pad_bottom : {0, 1, 2}) {
for (auto& flag_bias : {false, true}) {
-          for (auto& flag_relu : {false, true}) {
+          for (auto& flag_act : {0, 1, 2, 4}) {
for (auto& c : {1, 3, 5, 8, 16, 32}) {
std::vector<DDim> dims;
DDim weights_dim({c, 1, 3, 3});
...
...
@@ -310,6 +325,7 @@ TEST(TestConv3x3DW, test_conv3x3_depthwise) {
dims.push_back(DDim({batch, c, h, h}));
}
}
const float leakey_relu_scale = 8.88;
test_conv_fp32(dims,
weights_dim,
c,
...
...
@@ -317,9 +333,10 @@ TEST(TestConv3x3DW, test_conv3x3_depthwise) {
{pad_top, pad_bottom, pad_left, pad_right},
{1, 1},
flag_bias,
-                       flag_relu,
+                       flag_act,
                        {1, 2, 4},
-                       {FLAGS_power_mode});
+                       {FLAGS_power_mode},
+                       leakey_relu_scale);
}
}
}
...
...
@@ -335,28 +352,41 @@ TEST(TestConv3x3DW, test_conv3x3_depthwise) {
#if 1 /// 5x5dw
 TEST(TestConv5x5DW, test_conv5x5_depthwise) {
   if (FLAGS_basic_test) {
+#ifdef __aarch64__
+    // TODO(chenjiaoAngel): fix me, diff: arm64 5x5s2 depthwise conv
+    for (auto& stride : {1}) {
+#else
     for (auto& stride : {1, 2}) {
-      for (auto& pad : {0, 1, 2}) {
-        for (auto& flag_bias : {false, true}) {
-          for (auto& flag_relu : {false, true}) {
-            for (auto& c : {1, 3, 5, 8, 16, 32}) {
-              std::vector<DDim> dims;
-              DDim weights_dim({c, 1, 5, 5});
-              for (auto& batch : {1, 2}) {
-                for (auto& h : {1, 3, 15, 19, 28, 32, 75}) {
-                  dims.push_back(DDim({batch, c, h, h}));
-                }
-              }
+#endif
+      for (auto& pad_left : {0, 1, 2}) {
+        for (auto& pad_right : {0, 1, 2}) {
+          for (auto& pad_top : {0, 1, 2}) {
+            for (auto& pad_bottom : {0, 1, 2}) {
+              for (auto& flag_bias : {false, true}) {
+                for (auto& flag_act : {0, 1, 2, 4}) {
+                  for (auto& c : {1, 15, 32}) {
+                    std::vector<DDim> dims;
+                    DDim weights_dim({c, 1, 5, 5});
+                    for (auto& batch : {1, 2}) {
+                      for (auto& h : {1, 3, 15, 56}) {
+                        dims.push_back(DDim({batch, c, h, h}));
+                      }
+                    }
+                    const float leakey_relu_scale = 8.88;
+                    test_conv_fp32(dims,
+                                   weights_dim,
+                                   c,
+                                   {stride, stride},
+                                   {pad_left, pad_right, pad_top, pad_bottom},
+                                   {1, 1},
+                                   flag_bias,
+                                   flag_act,
+                                   {4},
+                                   {FLAGS_power_mode},
+                                   leakey_relu_scale);
+                  }
+                }
+              }
-              test_conv_fp32(dims,
-                             weights_dim,
-                             c,
-                             {stride, stride},
-                             {pad, pad, pad, pad},
-                             {1, 1},
-                             flag_bias,
-                             flag_relu,
-                             {1, 2, 4},
-                             {FLAGS_power_mode});
            }
          }
        }
...
@@ -373,7 +403,7 @@ TEST(TestConv1x1s1, test_conv1x1s1) {
      for (auto& cout : {1, 5, 16, 37}) {
        for (auto& g : {1, 2}) {
          for (auto& flag_bias : {false, true}) {
-            for (auto& flag_relu : {false, true}) {
+            for (auto& flag_act : {0, 1, 2, 4}) {
              std::vector<DDim> dims;
              if (cin % g != 0 || cout % g != 0) {
                continue;
...
@@ -384,6 +414,7 @@ TEST(TestConv1x1s1, test_conv1x1s1) {
                dims.push_back(DDim({batch, cin, h, h}));
              }
            }
+            const float leakey_relu_scale = 8.88;
            test_conv_fp32(dims,
                           weights_dim,
                           g,
...
@@ -391,9 +422,10 @@ TEST(TestConv1x1s1, test_conv1x1s1) {
                           {0, 0, 0, 0},
                           {1, 1},
                           flag_bias,
-                           flag_relu,
+                           flag_act,
                           {1, 2, 4},
-                           {FLAGS_power_mode});
+                           {FLAGS_power_mode},
+                           leakey_relu_scale);
            }
          }
        }
...
...
@@ -403,24 +435,29 @@ TEST(TestConv1x1s1, test_conv1x1s1) {
}
#endif /// conv1x1s1
#if 1 /// conv3x3s1
// TODO(MyPandaShaoxiang): fix me, diff: 3x3s1 winograd
#if 0 /// conv3x3s1
TEST(TestConv3x3s1, test_conv_3x3s1) {
if (FLAGS_basic_test) {
-    for (auto& cin : {1, 3, 8, 32, 48}) {
-      for (auto& cout : {1, 5, 8, 32, 48}) {
-        for (auto& pad_left : {1, 2}) {
-          for (auto& pad_right : {1, 2}) {
-            for (auto& pad_top : {1, 2}) {
-              for (auto& pad_bottom : {1, 2}) {
+    for (auto& cin : {1, 3, 8, 8}) {
+      for (auto& cout : {1, 5, 32, 48}) {
+        for (auto& pad_left : {0, 1, 2}) {
+          for (auto& pad_right : {0, 1, 2}) {
+            for (auto& pad_top : {0, 1, 2}) {
+              for (auto& pad_bottom : {0, 1, 2}) {
                 for (auto& flag_bias : {false, true}) {
-                  for (auto& flag_relu : {false, true}) {
+                  for (auto& flag_act : {0, 1, 2, 4}) {
                     std::vector<DDim> dims;
                     DDim weights_dim({cout, cin, 3, 3});
                     for (auto& batch : {1, 2}) {
-                      for (auto& h : {1, 7, 19, 56, 32}) {
+                      for (auto& h : {1, 3, 17, 33}) {
dims.push_back(DDim({batch, cin, h, h}));
}
}
if (cin == 1 && cout ==1) {
continue;
}
const float leakey_relu_scale = 8.88;
test_conv_fp32(dims,
weights_dim,
1,
...
...
@@ -428,9 +465,10 @@ TEST(TestConv3x3s1, test_conv_3x3s1) {
{pad_top, pad_bottom, pad_left, pad_right},
{1, 1},
flag_bias,
-                   flag_relu,
-                   {1, 2, 4},
-                   {FLAGS_power_mode});
+                   flag_act,
+                   {4},
+                   {FLAGS_power_mode},
+                   leakey_relu_scale);
}
}
}
...
...
@@ -446,21 +484,25 @@ TEST(TestConv3x3s1, test_conv_3x3s1) {
 #if 1  /// conv3x3s2
 TEST(TestConv3x3s2, test_conv_3x3s2) {
   if (FLAGS_basic_test) {
-    for (auto& cin : {1, 3, 8, 32}) {
-      for (auto& cout : {1, 5, 8, 32}) {
-        for (auto& pad_left : {1, 2}) {
-          for (auto& pad_right : {1, 2}) {
-            for (auto& pad_top : {1, 2}) {
-              for (auto& pad_bottom : {1, 2}) {
+    for (auto& cin : {1, 3, 8}) {
+      for (auto& cout : {1, 3, 9, 32}) {
+        for (auto& pad_left : {0, 1, 2}) {
+          for (auto& pad_right : {0, 1, 2}) {
+            for (auto& pad_top : {0, 1, 2}) {
+              for (auto& pad_bottom : {0, 1, 2}) {
                 for (auto& flag_bias : {false, true}) {
-                  for (auto& flag_relu : {false, true}) {
+                  for (auto& flag_act : {0, 1, 2, 4}) {
                     std::vector<DDim> dims;
                     DDim weights_dim({cout, cin, 3, 3});
                     for (auto& batch : {1, 2}) {
-                      for (auto& h : {1, 7, 19, 28, 75, 56, 32}) {
+                      for (auto& h : {3, 7, 15, 56, 32}) {
                         dims.push_back(DDim({batch, cin, h, h}));
                       }
                     }
+                    if (cin == 1 && cout == 1) {
+                      continue;
+                    }
+                    const float leakey_relu_scale = 8.88;
                     test_conv_fp32(dims,
                                    weights_dim,
                                    1,
...
@@ -468,9 +510,10 @@ TEST(TestConv3x3s2, test_conv_3x3s2) {
                                    {pad_top, pad_bottom, pad_left, pad_right},
                                    {1, 1},
                                    flag_bias,
-                                   flag_relu,
+                                   flag_act,
                                    {1, 2, 4},
-                                   {FLAGS_power_mode});
+                                   {FLAGS_power_mode},
+                                   leakey_relu_scale);
                  }
                }
              }
...
@@ -486,29 +529,40 @@ TEST(TestConv3x3s2, test_conv_3x3s2) {
 #if 1  /// random param conv
 TEST(TestConvRand, test_conv_rand) {
   if (FLAGS_basic_test) {
-    for (auto& cin : {1, 3, 8, 16}) {
-      for (auto& cout : {1, 5, 8, 16}) {
+    for (auto& cin : {1, 3, 8}) {
+      for (auto& cout : {1, 5, 16}) {
         for (auto& g : {1, 2}) {
           for (auto& kw : {1, 2, 3}) {
             for (auto& kh : {1, 2, 3}) {
               for (auto& stride : {1, 2}) {
-                for (auto& pad_left : {0, 1, 2}) {
-                  for (auto& pad_right : {0, 1, 2}) {
-                    for (auto& pad_top : {0, 1, 2}) {
-                      for (auto& pad_bottom : {0, 1, 2}) {
+                for (auto& pad_left : {0, 2}) {
+                  for (auto& pad_right : {0, 2}) {
+                    for (auto& pad_top : {0, 2}) {
+                      for (auto& pad_bottom : {0, 2}) {
                         for (auto& dila : {1, 2}) {
                           for (auto& flag_bias : {false, true}) {
-                            for (auto& flag_relu : {false, true}) {
+                            for (auto& flag_act : {0, 1, 2, 4}) {
                               if (cin % g != 0 || cout % g != 0) {
                                 continue;
                               }
                               std::vector<DDim> dims;
                               DDim weights_dim({cout, cin / g, kh, kw});
                               for (auto& batch : {1, 2}) {
-                                for (auto& h : {1, 3, 19, 32, 28}) {
+                                for (auto& h : {1, 3, 19, 32}) {
                                   dims.push_back(DDim({batch, cin, h, h}));
                                 }
                               }
+                              // skip 3x3 depthwise conv
+                              if (g == cin && cin == cout && kw == 3 &&
+                                  kh == 3) {
+                                break;
+                              }
+                              // skip 3x3s1 direct conv
+                              if (g == 1 && (cin != 1 || cout != 1) &&
+                                  kw == 3 && kh == 3 && stride == 1) {
+                                break;
+                              }
+                              const float leakey_relu_scale = 8.88;
                               test_conv_fp32(dims,
                                              weights_dim,
...
@@ -517,9 +571,10 @@ TEST(TestConvRand, test_conv_rand) {
                                  {pad_top, pad_bottom, pad_left, pad_right},
                                  {dila, dila},
                                  flag_bias,
-                                 flag_relu,
-                                 {1, 2, 4},
-                                 {FLAGS_power_mode});
+                                 flag_act,
+                                 {4},
+                                 {FLAGS_power_mode},
+                                 leakey_relu_scale);
                            }
                          }
                        }
...
@@ -551,11 +606,12 @@ TEST(TestConvCustom, test_conv_fp32_custom_size) {
                  FLAGS_kernel_w}),
                 FLAGS_group,
                 {FLAGS_stride_h, FLAGS_stride_w},
-                {FLAGS_pad_h, FLAGS_pad_h, FLAGS_pad_w, FLAGS_pad_w},
+                {FLAGS_pad_h0, FLAGS_pad_h1, FLAGS_pad_w0, FLAGS_pad_w1},
                 {FLAGS_dila_h, FLAGS_dila_w},
                 FLAGS_flag_bias,
-                FLAGS_flag_relu,
+                FLAGS_flag_act,
                 {FLAGS_threads},
-                {FLAGS_power_mode});
+                {FLAGS_power_mode},
+                FLAGS_leakey_relu_alpha);
}
#endif  // custom
lite/tests/math/conv_int8_compute_test.cc
...
@@ -291,7 +291,7 @@ void test_conv_int8(const std::vector<DDim>& input_dims,
                   pads[2],
                   pads[0],
                   flag_bias,
-                  flag_relu);
+                  static_cast<int>(flag_relu));
        paddle::lite::arm::math::fp32_to_int8(dout_basic_fp32,
                                              dout_basic_int8,
                                              scale_out.data(),
...
@@ -362,6 +362,7 @@ void test_conv_int8(const std::vector<DDim>& input_dims,
                << pads[2] << ", " << pads[3]
                << ", stride: " << strides[0] << ", " << strides[1]
                << ", dila_: " << dilas[0] << ", " << dilas[1]
                << ", group: " << group
                << ", bias: " << (flag_bias ? "true" : "false")
                << ", relu: " << (flag_relu ? "true" : "false")
                << ", threads: " << th << ", power_mode: " << cls
...
@@ -467,7 +468,7 @@ TEST(TestConv3x3DWInt8, test_conv3x3_depthwise) {
            std::vector<DDim> dims;
            DDim weights_dim({c, 1, 3, 3});
            for (auto& batch : {1, 2}) {
-              for (auto& h : {1, 3, 15, 19, 75, 32, 28}) {
+              for (auto& h : {1, 3, 15, 33}) {
                dims.push_back(DDim({batch, c, h, h}));
              }
            }
...
@@ -479,7 +480,7 @@ TEST(TestConv3x3DWInt8, test_conv3x3_depthwise) {
                           {1, 1},
                           flag_bias,
                           flag_relu,
-                          {1, 2, 4},
+                          {4},
                           {FLAGS_power_mode});
          }
        }
...
@@ -494,14 +495,14 @@
 TEST(TestConv5x5DWInt8, test_conv5x5_depthwise) {
   if (FLAGS_basic_test) {
     for (auto& stride : {1}) {
-      for (auto& pad : {0, 1, 2}) {
+      for (auto& pad : {0, 1, 2, 3, 4}) {
         for (auto& flag_bias : {false, true}) {
           for (auto& flag_relu : {false, true}) {
-            for (auto& c : {1, 3, 5, 8, 16, 32}) {
+            for (auto& c : {1, 5, 15, 33}) {
               std::vector<DDim> dims;
               DDim weights_dim({c, 1, 5, 5});
               for (auto& batch : {1, 2}) {
-                for (auto& h : {1, 3, 15, 19, 28, 32, 75}) {
+                for (auto& h : {1, 3, 15, 33}) {
                   dims.push_back(DDim({batch, c, h, h}));
                 }
               }
...
@@ -513,7 +514,7 @@ TEST(TestConv5x5DWInt8, test_conv5x5_depthwise) {
                             {1, 1},
                             flag_bias,
                             flag_relu,
-                            {1, 2, 4},
+                            {4},
                             {FLAGS_power_mode});
            }
          }
...
@@ -527,8 +528,8 @@ TEST(TestConv5x5DWInt8, test_conv5x5_depthwise) {
 #if 1  /// conv1x1s1
 TEST(TestConv1x1s1Int8, test_conv1x1s1) {
   if (FLAGS_basic_test) {
-    for (auto& cin : {1, 3, 8, 11, 32}) {
-      for (auto& cout : {1, 5, 16, 37}) {
+    for (auto& cin : {1, 3, 8, 32}) {
+      for (auto& cout : {1, 5, 17}) {
         for (auto& g : {1, 2}) {
           for (auto& flag_bias : {false, true}) {
             for (auto& flag_relu : {false, true}) {
...
@@ -538,7 +539,7 @@ TEST(TestConv1x1s1Int8, test_conv1x1s1) {
              }
              DDim weights_dim({cout, cin / g, 1, 1});
              for (auto& batch : {1, 2}) {
-                for (auto& h : {1, 7, 19, 28, 32, 56, 1}) {
+                for (auto& h : {1, 9, 16, 33}) {
                  dims.push_back(DDim({batch, cin, h, h}));
                }
              }
...
@@ -550,7 +551,7 @@ TEST(TestConv1x1s1Int8, test_conv1x1s1) {
                             {1, 1},
                             flag_bias,
                             flag_relu,
-                            {1, 2, 4},
+                            {4},
                             {FLAGS_power_mode});
            }
          }
...
@@ -564,8 +565,8 @@ TEST(TestConv1x1s1Int8, test_conv1x1s1) {
 #if 1  /// conv3x3s1
 TEST(TestConv3x3s1Int8, test_conv_3x3s1) {
   if (FLAGS_basic_test) {
-    for (auto& cin : {1, 3, 8, 32, 48}) {
-      for (auto& cout : {1, 5, 8, 32, 48}) {
+    for (auto& cin : {1, 3, 8, 33}) {
+      for (auto& cout : {1, 5, 33}) {
         for (auto& pad_top : {1, 2}) {
           for (auto& pad_bottom : {1, 2}) {
             for (auto& pad_left : {1, 2}) {
...
@@ -575,7 +576,7 @@ TEST(TestConv3x3s1Int8, test_conv_3x3s1) {
                    std::vector<DDim> dims;
                    DDim weights_dim({cout, cin, 3, 3});
                    for (auto& batch : {1, 2}) {
-                      for (auto& h : {1, 7, 19, 56, 32}) {
+                      for (auto& h : {1, 7, 17, 33}) {
                        dims.push_back(DDim({batch, cin, h, h}));
                      }
                    }
...
@@ -587,7 +588,7 @@ TEST(TestConv3x3s1Int8, test_conv_3x3s1) {
                                   {1, 1},
                                   flag_bias,
                                   flag_relu,
-                                  {1, 2, 4},
+                                  {4},
                                   {FLAGS_power_mode});
                  }
                }
...
@@ -604,8 +605,8 @@ TEST(TestConv3x3s1Int8, test_conv_3x3s1) {
 #if 1  /// conv3x3s2
 TEST(TestConv3x3s2Int8, test_conv_3x3s2) {
   if (FLAGS_basic_test) {
-    for (auto& cin : {1, 3, 8, 32}) {
-      for (auto& cout : {1, 5, 8, 32}) {
+    for (auto& cin : {1, 3, 31}) {
+      for (auto& cout : {1, 5, 33}) {
         for (auto& pad_top : {1, 2}) {
           for (auto& pad_bottom : {1, 2}) {
             for (auto& pad_left : {1, 2}) {
...
@@ -615,7 +616,7 @@ TEST(TestConv3x3s2Int8, test_conv_3x3s2) {
                    std::vector<DDim> dims;
                    DDim weights_dim({cout, cin, 3, 3});
                    for (auto& batch : {1, 2}) {
-                      for (auto& h : {1, 7, 19, 28, 75, 56, 32}) {
+                      for (auto& h : {1, 7, 19, 33}) {
                        dims.push_back(DDim({batch, cin, h, h}));
                      }
                    }
...
@@ -627,7 +628,7 @@ TEST(TestConv3x3s2Int8, test_conv_3x3s2) {
                                   {1, 1},
                                   flag_bias,
                                   flag_relu,
-                                  {1, 2, 4},
+                                  {4},
                                   {FLAGS_power_mode});
                  }
                }
...
@@ -644,8 +645,8 @@ TEST(TestConv3x3s2Int8, test_conv_3x3s2) {
 #if 1  /// random param conv
 TEST(TestConvRandInt8, test_conv_rand) {
   if (FLAGS_basic_test) {
-    for (auto& cin : {1, 3, 8, 16}) {
-      for (auto& cout : {1, 5, 8, 16}) {
+    for (auto& cin : {1, 17}) {
+      for (auto& cout : {1, 8, 17}) {
         for (auto& g : {1, 2}) {
           for (auto& kw : {1, 2, 3}) {
             for (auto& kh : {1, 2, 3}) {
...
@@ -658,12 +659,12 @@ TEST(TestConvRandInt8, test_conv_rand) {
                  for (auto& flag_bias : {false, true}) {
                    for (auto& flag_relu : {false, true}) {
                      if (cin % g != 0 || cout % g != 0) {
-                        continue;
+                        break;
                      }
                      std::vector<DDim> dims;
                      DDim weights_dim({cout, cin / g, kh, kw});
                      for (auto& batch : {1, 2}) {
-                        for (auto& h : {1, 3, 19, 32, 28}) {
+                        for (auto& h : {1, 3, 5, 19}) {
                          dims.push_back(DDim({batch, cin, h, h}));
                        }
                      }
...
@@ -676,7 +677,7 @@ TEST(TestConvRandInt8, test_conv_rand) {
                                    {dila, dila},
                                    flag_bias,
                                    flag_relu,
-                                   {1, 2, 4},
+                                   {4},
                                    {FLAGS_power_mode});
                    }
                  }
...
lite/tests/math/gemv_int8_compute_test.cc
...
@@ -37,7 +37,7 @@ DEFINE_int32(power_mode,
 DEFINE_int32(threads, 1, "threads num");
 DEFINE_int32(warmup, 0, "warmup times");
 DEFINE_int32(repeats, 1, "repeats times");
-DEFINE_bool(basic_test, false, "do all tests");
+DEFINE_bool(basic_test, true, "do all tests");
 DEFINE_bool(check_result, true, "check the result");
 DEFINE_int32(M, 512, "gemv: M");
...
lite/tests/math/sgemm_c4_compute_test.cc
...
@@ -37,7 +37,7 @@ DEFINE_int32(power_mode,
 DEFINE_int32(threads, 1, "threads num");
 DEFINE_int32(warmup, 0, "warmup times");
 DEFINE_int32(repeats, 1, "repeats times");
-DEFINE_bool(basic_test, false, "do all tests");
+DEFINE_bool(basic_test, true, "do all tests");
 DEFINE_bool(check_result, true, "check the result");
 DEFINE_int32(M, 512, "gemm_c4: M");
...
lite/tests/math/sgemv_compute_test.cc
...
@@ -38,11 +38,19 @@ DEFINE_int32(K, 512, "sgemv: K");
 DEFINE_bool(traA, false, "gemv: A transpose");
-DEFINE_bool(flag_relu, false, "do relu");
+DEFINE_int32(flag_act, 0, "do act");
 DEFINE_bool(flag_bias, false, "with bias");
+DEFINE_double(leakey_relu_alpha, 1.0, "leakey relu alpha");
+DEFINE_double(clipped_coef, 6.0, "clipped relu coef");

-bool test_sgemv(
-    bool tra, int m, int k, bool has_bias, bool has_relu, int cls, int ths) {
+bool test_sgemv(bool tra,
+                int m,
+                int k,
+                bool has_bias,
+                int flag_act,
+                int cls,
+                int ths,
+                float six = 6.f,
+                float alpha = 1.f) {
   Tensor ta;
   Tensor tb;
   Tensor tc;
...
@@ -68,8 +76,7 @@ bool test_sgemv(
   fill_tensor_rand(tbias, -1.f, 1.f);
   LOG(INFO) << "sgemv M: " << m << ", K: " << k
             << ", transA: " << (tra ? "true" : "false")
-            << ", relu: " << (has_relu ? "true" : "false")
+            << ", act: " << flag_act
             << ", bias: " << (has_bias ? "true" : "false");
 #ifdef LITE_WITH_ARM
...
@@ -78,10 +85,29 @@ bool test_sgemv(
   auto dc = tc.mutable_data<float>();
   auto dc_basic = tc_basic.mutable_data<float>();
   auto dbias = tbias.mutable_data<float>();
+
+  paddle::lite_api::ActivationType act =
+      paddle::lite_api::ActivationType::kIndentity;
+  if (flag_act == 1) {
+    act = paddle::lite_api::ActivationType::kRelu;
+  } else if (flag_act == 2) {
+    act = paddle::lite_api::ActivationType::kRelu6;
+  } else if (flag_act == 4) {
+    act = paddle::lite_api::ActivationType::kLeakyRelu;
+  }
   if (FLAGS_check_result) {
-    basic_gemv(
-        m, k, da, db, dbias, dc_basic, 1.f, 0.f, tra, has_bias, has_relu);
+    basic_gemv(m,
+               k,
+               da,
+               db,
+               dbias,
+               dc_basic,
+               1.f,
+               0.f,
+               tra,
+               has_bias,
+               flag_act,
+               six,
+               alpha);
   }
   paddle::lite::profile::Timer t0;
   //! compute
...
@@ -92,15 +118,37 @@ bool test_sgemv(
   ctx.SetRunMode(static_cast<paddle::lite_api::PowerMode>(cls), ths);
   /// warmup
   for (int j = 0; j < FLAGS_warmup; ++j) {
-    paddle::lite::arm::math::sgemv(
-        da, db, dc, tra, m, k, has_bias, dbias, has_relu, &ctx);
+    paddle::lite::arm::math::sgemv(da,
+                                   db,
+                                   dc,
+                                   tra,
+                                   m,
+                                   k,
+                                   has_bias,
+                                   dbias,
+                                   flag_act > 0,
+                                   act,
+                                   &ctx,
+                                   six,
+                                   alpha);
   }

   t0.Reset();
   for (int i = 0; i < FLAGS_repeats; ++i) {
     t0.Start();
-    paddle::lite::arm::math::sgemv(
-        da, db, dc, tra, m, k, has_bias, dbias, has_relu, &ctx);
+    paddle::lite::arm::math::sgemv(da,
+                                   db,
+                                   dc,
+                                   tra,
+                                   m,
+                                   k,
+                                   has_bias,
+                                   dbias,
+                                   flag_act > 0,
+                                   act,
+                                   &ctx,
+                                   six,
+                                   alpha);
     t0.Stop();
   }
   LOG(INFO) << "gemv output: M: " << m << ", K: " << k << ", cluster: " << cls
...
@@ -125,7 +173,7 @@ bool test_sgemv(
     tensor_diff(tc_basic, tc, tdiff);
     LOG(INFO) << "basic result: ";
     print_tensor(tc_basic);
-    LOG(INFO) << "saber result: ";
+    LOG(INFO) << "lite result: ";
     print_tensor(tc);
     LOG(INFO) << "diff result: ";
     print_tensor(tdiff);
...
@@ -144,22 +192,31 @@ TEST(TestLiteSgemv, Sgemv) {
   LOG(INFO) << "run basic sgemv test";
   for (auto& m : {1, 3, 8, 21, 32, 397}) {
     for (auto& k : {1, 3, 8, 17, 59, 234}) {
-      for (auto& tra : {true, false}) {
+      for (auto& tra : {false, true}) {
         for (auto& has_bias : {false, true}) {
-          for (auto& has_relu : {false, true}) {
+          for (auto& flag_act : {0, 1, 2, 4}) {
             for (auto& th : {1, 2, 4}) {
-              auto flag = test_sgemv(
-                  tra, m, k, has_bias, has_relu, FLAGS_cluster, th);
+              float six = 6.f;
+              float alpha = 8.88f;
+              auto flag = test_sgemv(
+                  tra, m, k, has_bias, flag_act, FLAGS_cluster, th, six, alpha);
               if (flag) {
                 LOG(INFO) << "test m = " << m << ", k=" << k
                           << ", bias: " << (has_bias ? "true" : "false")
-                          << ", relu: " << (has_relu ? "true" : "false")
+                          << ", flag act: " << flag_act
                           << ", trans A: " << (tra ? "true" : "false")
                           << ", threads: " << th << " passed\n";
               } else {
                 LOG(FATAL) << "test m = " << m << ", k=" << k
                            << ", bias: " << (has_bias ? "true" : "false")
-                           << ", relu: " << (has_relu ? "true" : "false")
+                           << ", flag_act: " << flag_act
                            << ", trans A: " << (tra ? "true" : "false")
                            << ", threads: " << th << " failed\n";
              }
...
@@ -180,15 +237,17 @@ TEST(TestSgemvCustom, Sgemv_custom) {
                         FLAGS_M,
                         FLAGS_K,
                         FLAGS_flag_bias,
-                        FLAGS_flag_relu,
+                        FLAGS_flag_act,
                         FLAGS_cluster,
-                        FLAGS_threads);
+                        FLAGS_threads,
+                        FLAGS_clipped_coef,
+                        FLAGS_leakey_relu_alpha);
  if (!flag) {
    LOG(FATAL) << "test m = " << FLAGS_M << ", k=" << FLAGS_K
               << ", trans A: " << FLAGS_traA << ", bias: " << FLAGS_flag_bias
-              << ", relu: " << FLAGS_flag_relu << " failed!!";
+              << ", act: " << FLAGS_flag_act << " failed!!";
  }
  LOG(INFO) << "test m = " << FLAGS_M << ", k=" << FLAGS_K
            << ", trans A: " << FLAGS_traA << ", bias: " << FLAGS_flag_bias
-           << ", relu: " << FLAGS_flag_relu << " passed!!";
+           << ", act: " << FLAGS_flag_act << " passed!!";
}
lite/tests/utils/naive_math_impl.h
...
@@ -177,7 +177,9 @@ static void basic_gemv(int m,
                        type2 beta,
                        bool trans_a = false,
                        bool flag_bias = false,
-                       bool flag_relu = false) {
+                       int flag_act = false,
+                       float six = 6.f,
+                       float leakey_relu_alpha = 1.f) {
 #pragma omp parallel for
   for (int i = 0; i < m; ++i) {
     auto bias_data = static_cast<type2>(0);
...
@@ -195,8 +197,15 @@ static void basic_gemv(int m,
       sum += av * b[j];
     }
     type2 tmp = alpha * sum + beta * c[i] + bias_data;
-    if (flag_relu) {
-      c[i] = tmp > (type2)0 ? tmp : (type2)0;
+    if (flag_act > 0) {
+      if (flag_act == 1) {  // relu
+        c[i] = tmp > (type2)0 ? tmp : (type2)0;
+      } else if (flag_act == 2) {  // relu 6
+        c[i] = tmp > (type2)0 ? tmp : (type2)0;
+        c[i] = c[i] < six ? c[i] : six;
+      } else if (flag_act == 4) {  // leakey relu
+        c[i] = tmp < (type2)0 ? (type2)(tmp * leakey_relu_alpha) : tmp;
+      }
     } else {
       c[i] = tmp;
     }
...
@@ -230,7 +239,9 @@ static void conv_basic(const Dtype1* din,
                        int pad_w,
                        int pad_h,
                        bool flag_bias,
-                       bool flag_relu) {
+                       int act_type,
+                       float six = 6.f,
+                       float scale = 1.f) {
   Dtype2 beta = 0;
   auto src_data = din;
   auto dst_data_ref = dout;
...
@@ -280,10 +291,27 @@ static void conv_basic(const Dtype1* din,
             }
           }
         }
-        if (flag_relu) {
-          dst_data_ref[out_idx] = dst_data_ref[out_idx] > (Dtype2)0
-                                      ? dst_data_ref[out_idx]
-                                      : (Dtype2)0;
+        if (act_type > 0) {
+          // 1-relu 2-relu6 4-leakyrelu
+          if (act_type == 1) {
+            dst_data_ref[out_idx] = dst_data_ref[out_idx] > (Dtype2)0
+                                        ? dst_data_ref[out_idx]
+                                        : (Dtype2)0;
+          } else if (act_type == 2) {
+            dst_data_ref[out_idx] = dst_data_ref[out_idx] > (Dtype2)0
+                                        ? dst_data_ref[out_idx]
+                                        : (Dtype2)0;
+            dst_data_ref[out_idx] = dst_data_ref[out_idx] < (Dtype2)six
+                                        ? dst_data_ref[out_idx]
+                                        : (Dtype2)six;
+          } else if (act_type == 4) {
+            dst_data_ref[out_idx] =
+                dst_data_ref[out_idx] > (Dtype2)0
+                    ? dst_data_ref[out_idx]
+                    : (Dtype2)(dst_data_ref[out_idx] * scale);
+          } else {
+            printf("this act type: %d does not support\n", act_type);
+          }
         }
       }
     }
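The same relu / relu6 / leaky-relu branching now appears in both basic_gemv and conv_basic. One possible consolidation, not part of this commit and with a hypothetical name, is a small shared helper; a minimal sketch, assuming the same act codes (1-relu, 2-relu6, 4-leakyrelu):

// Hypothetical helper, not in the diff: applies the activation encoded by
// act_type to a single accumulator value.
template <typename T>
static T apply_act(T v, int act_type, float six = 6.f, float alpha = 1.f) {
  if (act_type == 1) {  // relu
    return v > static_cast<T>(0) ? v : static_cast<T>(0);
  } else if (act_type == 2) {  // relu6: clamp to [0, six]
    T r = v > static_cast<T>(0) ? v : static_cast<T>(0);
    return r < static_cast<T>(six) ? r : static_cast<T>(six);
  } else if (act_type == 4) {  // leaky relu
    return v > static_cast<T>(0) ? v : static_cast<T>(v * alpha);
  }
  return v;  // act_type == 0: no activation
}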
...
...