Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
Paddle-Lite
提交
9f343ac2
P
Paddle-Lite
项目概览
PaddlePaddle
/
Paddle-Lite
通知
338
Star
4
Fork
1
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
271
列表
看板
标记
里程碑
合并请求
78
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle-Lite
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
271
Issue
271
列表
看板
标记
里程碑
合并请求
78
合并请求
78
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
9f343ac2
编写于
8月 04, 2020
作者:
开心的小妮
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
fix opencl concat. test=develop
上级
d51324bf
变更
6
隐藏空白更改
内联
并排
Showing
6 changed file
with
513 addition
and
210 deletion
+513
-210
lite/backends/opencl/cl_kernel/image/concat_kernel.cl
lite/backends/opencl/cl_kernel/image/concat_kernel.cl
+283
-0
lite/core/optimizer.h
lite/core/optimizer.h
+4
-2
lite/core/profile/precision_profiler.h
lite/core/profile/precision_profiler.h
+18
-4
lite/kernels/opencl/concat_image_compute.cc
lite/kernels/opencl/concat_image_compute.cc
+192
-175
lite/kernels/opencl/conv_image_compute.cc
lite/kernels/opencl/conv_image_compute.cc
+15
-28
lite/kernels/opencl/test_helper.h
lite/kernels/opencl/test_helper.h
+1
-1
未找到文件。
lite/backends/opencl/cl_kernel/image/concat_kernel.cl
浏览文件 @
9f343ac2
...
...
@@ -11,6 +11,285 @@ limitations under the License. */
#
include
<cl_common.h>
// Concatenates two image2d inputs along the channel axis.
//
// Each work-item writes one output texel (4 packed channel lanes) at
//   x = get_global_id(0) * out_W + get_global_id(1),  y = get_global_id(2).
// For every lane i the logical output channel is c = out_c * 4 + i; channels
// below C_0 are taken from input_image_0, all others from input_image_1.
//
// Params:
//   output_image   destination image.
//   out_C          number of valid output channels; lanes with c >= out_C are
//                  written as zero.
//   out_W          width used to locate a channel block inside an image row
//                  (also used for the inputs, i.e. inputs are assumed to share
//                  the output width -- TODO confirm against the host code).
//   input_image_0  first input image.
//   C_0            channel count of the first input.
//   input_image_1  second input image.
//   C_1            channel count of the second input (unused by this kernel).
__kernel void concatByCWith2Inputs(__write_only image2d_t output_image,
                                   __private const int out_C,
                                   __private const int out_W,
                                   __read_only image2d_t input_image_0,
                                   __private const int C_0,
                                   __read_only image2d_t input_image_1,
                                   __private const int C_1) {
  const int out_c = get_global_id(0);
  const int out_w = get_global_id(1);
  const int out_nh = get_global_id(2);

  // Integer (int2) coordinates are only defined for samplers that use
  // unnormalized coordinates, so CLK_NORMALIZED_COORDS_FALSE is required.
  const sampler_t sampler =
      CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;

  int2 output_pos;
  output_pos.x = out_c * out_W + out_w;
  output_pos.y = out_nh;

  // Zero-initialised so that lanes skipped by the early `break` below do not
  // leak uninitialised data into the output image.
  CL_DTYPE4 output_data = (CL_DTYPE4)(0.0f);

  for (int i = 0; i < 4; i++) {
    const int c = out_c * 4 + i;
    if (c >= out_C) {
      break;
    }

    int c_in;
    CL_DTYPE4 input_data;
    int2 input_pos;
    input_pos.y = out_nh;

    if (c < C_0) {
      c_in = c;
      input_pos.x = (c_in / 4) * out_W + out_w;
      input_data =
          READ_IMG_TYPE(CL_DTYPE_CHAR, input_image_0, sampler, input_pos);
    } else {
      c_in = c - C_0;
      input_pos.x = (c_in / 4) * out_W + out_w;
      input_data =
          READ_IMG_TYPE(CL_DTYPE_CHAR, input_image_1, sampler, input_pos);
    }

    // Pick the lane of the source texel that holds channel c_in.
    float value = 0.0f;
    switch (c_in % 4) {
      case 0:
        value = input_data.x;
        break;
      case 1:
        value = input_data.y;
        break;
      case 2:
        value = input_data.z;
        break;
      case 3:
        value = input_data.w;
        break;
      default:
        break;
    }

    // Store it into lane i of the destination texel.
    switch (i) {
      case 0:
        output_data.x = value;
        break;
      case 1:
        output_data.y = value;
        break;
      case 2:
        output_data.z = value;
        break;
      default:
        output_data.w = value;
        break;
    }
  }

  WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, output_data);
}
// Concatenates three image2d inputs along the channel axis.
//
// Each work-item writes one output texel (4 packed channel lanes) at
//   x = get_global_id(0) * out_W + get_global_id(1),  y = get_global_id(2).
// For every lane i the logical output channel is c = out_c * 4 + i:
//   c <  C_0            -> input_image_0, channel c
//   c <  C_0 + C_1      -> input_image_1, channel c - C_0
//   otherwise           -> input_image_2, channel c - C_0 - C_1
//
// Params:
//   output_image       destination image.
//   out_C              number of valid output channels; lanes with c >= out_C
//                      are written as zero.
//   out_W              width used to locate a channel block inside an image
//                      row (also used for the inputs, i.e. inputs are assumed
//                      to share the output width -- TODO confirm).
//   input_image_0..2   input images.
//   C_0, C_1           channel counts of the first two inputs.
//   C_2                channel count of the third input (unused by this
//                      kernel).
__kernel void concatByCWith3Inputs(__write_only image2d_t output_image,
                                   __private const int out_C,
                                   __private const int out_W,
                                   __read_only image2d_t input_image_0,
                                   __private const int C_0,
                                   __read_only image2d_t input_image_1,
                                   __private const int C_1,
                                   __read_only image2d_t input_image_2,
                                   __private const int C_2) {
  const int out_c = get_global_id(0);
  const int out_w = get_global_id(1);
  const int out_nh = get_global_id(2);

  // Integer (int2) coordinates are only defined for samplers that use
  // unnormalized coordinates, so CLK_NORMALIZED_COORDS_FALSE is required.
  const sampler_t sampler =
      CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;

  int2 output_pos;
  output_pos.x = out_c * out_W + out_w;
  output_pos.y = out_nh;

  // Zero-initialised so that lanes skipped by the early `break` below do not
  // leak uninitialised data into the output image.
  CL_DTYPE4 output_data = (CL_DTYPE4)(0.0f);

  for (int i = 0; i < 4; i++) {
    const int c = out_c * 4 + i;
    if (c >= out_C) {
      break;
    }

    int c_in;
    CL_DTYPE4 input_data;
    int2 input_pos;
    input_pos.y = out_nh;

    if (c < C_0) {
      c_in = c;
      input_pos.x = (c_in / 4) * out_W + out_w;
      input_data =
          READ_IMG_TYPE(CL_DTYPE_CHAR, input_image_0, sampler, input_pos);
    } else if (c < C_0 + C_1) {
      c_in = c - C_0;
      input_pos.x = (c_in / 4) * out_W + out_w;
      input_data =
          READ_IMG_TYPE(CL_DTYPE_CHAR, input_image_1, sampler, input_pos);
    } else {
      c_in = c - C_0 - C_1;
      input_pos.x = (c_in / 4) * out_W + out_w;
      input_data =
          READ_IMG_TYPE(CL_DTYPE_CHAR, input_image_2, sampler, input_pos);
    }

    // Pick the lane of the source texel that holds channel c_in.
    float value = 0.0f;
    switch (c_in % 4) {
      case 0:
        value = input_data.x;
        break;
      case 1:
        value = input_data.y;
        break;
      case 2:
        value = input_data.z;
        break;
      case 3:
        value = input_data.w;
        break;
      default:
        break;
    }

    // Store it into lane i of the destination texel.
    switch (i) {
      case 0:
        output_data.x = value;
        break;
      case 1:
        output_data.y = value;
        break;
      case 2:
        output_data.z = value;
        break;
      default:
        output_data.w = value;
        break;
    }
  }

  WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, output_data);
}
// Concatenates four image2d inputs along the channel axis.
//
// Each work-item writes one output texel (4 packed channel lanes) at
//   x = get_global_id(0) * out_W + get_global_id(1),  y = get_global_id(2).
// For every lane i the logical output channel is c = out_c * 4 + i:
//   c <  C_0                    -> input_image_0, channel c
//   c <  C_0 + C_1              -> input_image_1, channel c - C_0
//   c <  C_0 + C_1 + C_2        -> input_image_2, channel c - C_0 - C_1
//   c <  C_0 + C_1 + C_2 + C_3  -> input_image_3, channel c - C_0 - C_1 - C_2
// A channel that is below out_C but beyond the sum of all input channel
// counts matches no input and is written as zero.
//
// Params:
//   output_image       destination image.
//   out_C              number of valid output channels; lanes with c >= out_C
//                      are written as zero.
//   out_W              width used to locate a channel block inside an image
//                      row (also used for the inputs, i.e. inputs are assumed
//                      to share the output width -- TODO confirm).
//   input_image_0..3   input images.
//   C_0..C_3           channel counts of the corresponding inputs.
__kernel void concatByCWith4Inputs(__write_only image2d_t output_image,
                                   __private const int out_C,
                                   __private const int out_W,
                                   __read_only image2d_t input_image_0,
                                   __private const int C_0,
                                   __read_only image2d_t input_image_1,
                                   __private const int C_1,
                                   __read_only image2d_t input_image_2,
                                   __private const int C_2,
                                   __read_only image2d_t input_image_3,
                                   __private const int C_3) {
  const int out_c = get_global_id(0);
  const int out_w = get_global_id(1);
  const int out_nh = get_global_id(2);

  // Integer (int2) coordinates are only defined for samplers that use
  // unnormalized coordinates, so CLK_NORMALIZED_COORDS_FALSE is required.
  const sampler_t sampler =
      CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;

  int2 output_pos;
  output_pos.x = out_c * out_W + out_w;
  output_pos.y = out_nh;

  // Zero-initialised so that lanes skipped by the early `break` below do not
  // leak uninitialised data into the output image.
  CL_DTYPE4 output_data = (CL_DTYPE4)(0.0f);

  for (int i = 0; i < 4; i++) {
    const int c = out_c * 4 + i;
    if (c >= out_C) {
      break;
    }

    // Defaults cover the case where c matches none of the four inputs, so
    // nothing below reads an uninitialised variable.
    int c_in = 0;
    CL_DTYPE4 input_data = (CL_DTYPE4)(0.0f);
    int2 input_pos;
    input_pos.y = out_nh;

    if (c < C_0) {
      c_in = c;
      input_pos.x = (c_in / 4) * out_W + out_w;
      input_data =
          READ_IMG_TYPE(CL_DTYPE_CHAR, input_image_0, sampler, input_pos);
    } else if (c < C_0 + C_1) {
      c_in = c - C_0;
      input_pos.x = (c_in / 4) * out_W + out_w;
      input_data =
          READ_IMG_TYPE(CL_DTYPE_CHAR, input_image_1, sampler, input_pos);
    } else if (c < C_0 + C_1 + C_2) {
      c_in = c - C_0 - C_1;
      input_pos.x = (c_in / 4) * out_W + out_w;
      input_data =
          READ_IMG_TYPE(CL_DTYPE_CHAR, input_image_2, sampler, input_pos);
    } else if (c < C_0 + C_1 + C_2 + C_3) {
      c_in = c - C_0 - C_1 - C_2;
      input_pos.x = (c_in / 4) * out_W + out_w;
      input_data =
          READ_IMG_TYPE(CL_DTYPE_CHAR, input_image_3, sampler, input_pos);
    }

    // Pick the lane of the source texel that holds channel c_in.
    float value = 0.0f;
    switch (c_in % 4) {
      case 0:
        value = input_data.x;
        break;
      case 1:
        value = input_data.y;
        break;
      case 2:
        value = input_data.z;
        break;
      case 3:
        value = input_data.w;
        break;
      default:
        break;
    }

    // Store it into lane i of the destination texel.
    switch (i) {
      case 0:
        output_data.x = value;
        break;
      case 1:
        output_data.y = value;
        break;
      case 2:
        output_data.z = value;
        break;
      default:
        output_data.w = value;
        break;
    }
  }

  WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, output_data);
}
// Copies one input image into the output image, shifted down by out_H_Start
// rows.
//
// Each work-item copies a single texel: it reads the input at
//   (get_global_id(0) * out_W + get_global_id(1), get_global_id(2))
// and writes it to the output at the same x and at y + out_H_Start.
//
// Params:
//   input_image   source image.
//   output_image  destination image.
//   out_W         width used to turn (global id 0, global id 1) into an x
//                 coordinate; the same value is used for input and output.
//   out_H_Start   row offset inside the output image at which this input's
//                 rows are placed.
__kernel void concatByH(__read_only image2d_t input_image,
                        __write_only image2d_t output_image,
                        __private const int out_W,
                        __private const int out_H_Start) {
  const int in_c = get_global_id(0);
  const int in_w = get_global_id(1);
  const int in_nh = get_global_id(2);

  // Integer (int2) coordinates are only defined for samplers that use
  // unnormalized coordinates, so CLK_NORMALIZED_COORDS_FALSE is required.
  const sampler_t sampler =
      CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;

  const int2 input_pos = (int2)(in_c * out_W + in_w, in_nh);
  const CL_DTYPE4 input =
      READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, input_pos);

  // Same column, row shifted by the rows already occupied in the output.
  const int2 output_pos = (int2)(input_pos.x, out_H_Start + input_pos.y);
  WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, input);
}
// Copies one input image into the output image, shifted horizontally.
//
// Each work-item copies a single texel: it reads the input at
//   x_in = get_global_id(0) * in_W + get_global_id(1),  y = get_global_id(2)
// and writes it to the output at
//   x_out = x_in + pre_Width + out_Width * get_global_id(0),  same y.
//
// Params:
//   input_image   source image.
//   output_image  destination image.
//   in_W          width used to turn (global id 0, global id 1) into the
//                 input x coordinate.
//   pre_Width     horizontal offset added to the output x coordinate
//                 (presumably the summed width of the inputs already copied
//                 -- TODO confirm against the host code).
//   out_Width     per-global-id-0 stride added to the output x coordinate.
//
// NOTE(review): x_out contains both in_c * in_W (through x_in) and
// in_c * out_Width. This is harmless while global id 0 is always 0; confirm
// it is intended when there is more than one channel block.
__kernel void concatByW(__read_only image2d_t input_image,
                        __write_only image2d_t output_image,
                        __private const int in_W,
                        __private const int pre_Width,
                        __private const int out_Width) {
  const int in_c = get_global_id(0);
  const int in_w = get_global_id(1);
  const int in_nh = get_global_id(2);

  // Integer (int2) coordinates are only defined for samplers that use
  // unnormalized coordinates, so CLK_NORMALIZED_COORDS_FALSE is required.
  const sampler_t sampler =
      CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;

  const int2 input_pos = (int2)(in_c * in_W + in_w, in_nh);
  const CL_DTYPE4 input =
      READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, input_pos);

  const int2 output_pos =
      (int2)(input_pos.x + pre_Width + out_Width * in_c, input_pos.y);
  WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, input);
}
//
deprecated
__kernel
void
concat2
(
__read_only
image2d_t
input0,
__read_only
image2d_t
input1,
__write_only
image2d_t
output,
...
...
@@ -104,6 +383,7 @@ __kernel void concat2(__read_only image2d_t input0,
}
}
//
deprecated
__kernel
void
concat_mul
(
__read_only
image2d_t
input,
__write_only
image2d_t
output,
int
flag,
int
C_0,
int
out_C,
int
out_W,
int
in_W,
int
width
)
{
...
...
@@ -162,3 +442,6 @@ __kernel void concat_mul(__read_only image2d_t input,
WRITE_IMG_TYPE
(
CL_DTYPE_CHAR,
output,
output_pos,
input_data
)
;
}
}
lite/core/optimizer.h
浏览文件 @
9f343ac2
...
...
@@ -161,8 +161,10 @@ class Optimizer {
"runtime_context_assign_pass"
,
"argument_type_display_pass"
,
"memory_optimize_pass"
}};
#ifndef LITE_WITH_PRECISION_PROFILE
"memory_optimize_pass"
#endif
}};
if
(
passes
.
size
()
==
1
)
{
// multi_stream_analysis_pass must be in the front of
...
...
lite/core/profile/precision_profiler.h
浏览文件 @
9f343ac2
...
...
@@ -36,6 +36,10 @@
#include "lite/backends/cuda/math/type_trans.h"
#endif
#ifdef LITE_ON_TINY_PUBLISH
#include "lite/utils/replace_stl/stream.h"
#endif
namespace
paddle
{
namespace
lite
{
namespace
profile
{
...
...
@@ -88,19 +92,25 @@ class PrecisionProfiler {
PrecisionProfiler
()
{}
std
::
string
GetSummaryHeader
()
{
#ifdef LITE_ON_TINY_PUBLISH
using
replace_stl
::
setw
;
#else
using
std
::
setw
;
#endif
using
std
::
left
;
using
std
::
fixed
;
STL
::
stringstream
ss
;
ss
<<
"
\n\n
========================================= "
<<
"Detailed Precision Profiler Summary "
<<
"========================================="
<<
std
::
endl
;
<<
"========================================="
<<
"
\n
"
;
ss
<<
setw
(
45
)
<<
left
<<
"operator:(kernel_info)"
<<
" "
<<
setw
(
70
)
<<
left
<<
"output_tensor_name:(tensor_info)"
<<
" "
<<
setw
(
15
)
<<
left
<<
"dims"
<<
" "
<<
setw
(
15
)
<<
left
<<
"mean"
<<
" "
<<
setw
(
15
)
<<
left
<<
"std_deviation"
<<
" "
<<
setw
(
15
)
<<
left
<<
"ave_grow_rate*"
<<
std
::
endl
;
<<
" "
<<
setw
(
15
)
<<
left
<<
"ave_grow_rate*"
<<
"
\n
"
;
// write to file with path: `log_dir`
if
(
log_dir_
!=
""
)
{
...
...
@@ -368,7 +378,11 @@ class PrecisionProfiler {
}
std
::
string
GetInstPrecision
(
const
Instruction
*
inst
=
nullptr
)
{
#ifdef LITE_ON_TINY_PUBLISH
using
replace_stl
::
setw
;
#else
using
std
::
setw
;
#endif
using
std
::
left
;
using
std
::
fixed
;
STL
::
stringstream
ss
;
...
...
@@ -429,7 +443,7 @@ class PrecisionProfiler {
<<
output_arg_info
<<
" "
<<
setw
(
15
)
<<
left
<<
tout
->
dims
()
<<
" "
<<
setw
(
15
)
<<
left
<<
mean_str
<<
" "
<<
setw
(
15
)
<<
left
<<
std_dev_str
<<
" "
<<
setw
(
15
)
<<
left
<<
ave_grow_rate_str
<<
std
::
endl
;
<<
"
\n
"
;
}
else
if
(
type
->
IsTensorList
())
{
auto
touts
=
op_scope
->
FindVar
(
out_name
)
->
GetMutable
<
std
::
vector
<
Tensor
>>
();
...
...
@@ -466,7 +480,7 @@ class PrecisionProfiler {
<<
output_arg_info
<<
" "
<<
setw
(
15
)
<<
left
<<
tout
->
dims
()
<<
" "
<<
setw
(
15
)
<<
left
<<
mean_str
<<
" "
<<
setw
(
15
)
<<
left
<<
std_dev_str
<<
" "
<<
setw
(
15
)
<<
left
<<
ave_grow_rate_str
<<
std
::
endl
;
<<
"
\n
"
;
}
}
}
...
...
lite/kernels/opencl/concat_image_compute.cc
浏览文件 @
9f343ac2
...
...
@@ -38,213 +38,230 @@ class ConcatComputeImage : public KernelLite<TARGET(kOpenCL),
void
PrepareForRun
()
override
{
auto
&
context
=
ctx_
->
As
<
OpenCLContext
>
();
concat_param_
=
param_
.
get_mutable
<
param_t
>
();
if
(
concat_param_
->
x
.
size
()
==
2
)
{
kernel_func_name_
=
"concat2"
;
}
else
{
kernel_func_name_
=
"concat_mul"
;
}
VLOG
(
1
)
<<
"kernel_func_name_:"
<<
kernel_func_name_
;
context
.
cl_context
()
->
AddKernel
(
kernel_func_name_
,
"image/concat_kernel.cl"
,
build_options_
,
time_stamp_
);
auto
input_num
=
concat_param_
->
x
.
size
();
auto
*
output
=
concat_param_
->
output
;
auto
output_dims_size
=
output
->
dims
().
size
();
auto
axis
=
concat_param_
->
axis
;
auto
inputs
=
concat_param_
->
x
;
auto
out_dims
=
concat_param_
->
output
->
dims
();
auto
*
axis_tensor
=
concat_param_
->
axis_tensor
;
if
(
axis_tensor
!=
nullptr
)
{
// auto* axis_tensor_data = axis_tensor->data<int>(TARGET(kARM));
// axis = axis_tensor_data[0];
}
auto
in_dims
=
inputs
[
0
]
->
dims
();
axis_size_
=
out_dims
[
axis
];
axis_
=
axis
;
if
(
out_dims
.
size
()
<
4
)
{
if
(
out_dims
.
size
()
-
axis
==
1
)
{
// width
width_
=
out_dims
[
1
];
// c
flag_
=
3
;
if
(
output_dims_size
<
4
)
{
if
(
output_dims_size
-
axis
==
1
)
{
kernel_func_name_
=
"concatByW"
;
}
else
{
// height
width_
=
out_dims
[
0
];
// n
flag_
=
2
;
}
}
else
{
switch
(
axis_
)
{
case
0
:
width_
=
out_dims
[
2
];
// h
flag_
=
0
;
break
;
case
1
:
// channel
width_
=
out_dims
[
3
];
// w
flag_
=
1
;
break
;
case
2
:
// height
width_
=
out_dims
[
0
];
// n
flag_
=
2
;
break
;
case
3
:
case
-
1
:
// width
width_
=
out_dims
[
1
];
// c
flag_
=
3
;
break
;
default:
printf
(
"this axis: %d does not support
\n
"
,
axis_
);
kernel_func_name_
=
"concatByH"
;
}
}
for
(
int
i
=
1
;
i
<
inputs
.
size
();
i
++
)
{
auto
dims
=
inputs
[
i
]
->
dims
();
// auto flag = CHECK_EQ_OR_FALSE(in_dims.size(), dims.size());
if
(
in_dims
.
size
()
!=
dims
.
size
())
{
printf
(
"input shape must be same
\n
"
);
return
;
}
for
(
int
i
=
0
;
i
<
dims
.
size
();
i
++
)
{
if
(
i
!=
axis
)
{
if
(
in_dims
[
i
]
!=
dims
[
i
])
{
printf
(
"input shape must be same
\n
"
);
return
;
}
}
}
else
if
(
output_dims_size
==
4
)
{
// output->dims.size() == 4
if
(
input_num
==
2
)
{
kernel_func_name_
=
"concatByCWith2Inputs"
;
}
else
if
(
input_num
==
3
)
{
kernel_func_name_
=
"concatByCWith3Inputs"
;
}
else
if
(
input_num
==
4
)
{
kernel_func_name_
=
"concatByCWith4Inputs"
;
}
else
{
LOG
(
FATAL
)
<<
"Unsupported input tensors number:"
<<
input_num
<<
"."
;
}
}
else
{
// output->dims.size() > 4
LOG
(
FATAL
)
<<
"Unsupported output dims "
<<
output
->
dims
()
<<
", whose dims.size() is bigger than 4."
;
}
VLOG
(
1
)
<<
"kernel_func_name_:"
<<
kernel_func_name_
;
context
.
cl_context
()
->
AddKernel
(
kernel_func_name_
,
"image/concat_kernel.cl"
,
build_options_
,
time_stamp_
);
}
void
Run
()
override
{
auto
&
param
=
*
param_
.
get_mutable
<
param_t
>
();
const
auto
&
x_dims
=
param
.
output
->
dims
();
auto
image_shape
=
InitImageDimInfoWith
(
x_dims
);
auto
*
out_buf
=
param
.
output
->
mutable_data
<
half_t
,
cl
::
Image2D
>
(
image_shape
[
"width"
],
image_shape
[
"height"
]);
const
auto
&
y_dims
=
param
.
output
->
dims
();
// useless: check dim only
auto
output_tensor_dims
=
concat_param_
->
output
->
dims
();
auto
output_image_shape
=
InitImageDimInfoWith
(
output_tensor_dims
);
auto
output_image_p
=
concat_param_
->
output
->
mutable_data
<
half_t
,
cl
::
Image2D
>
(
output_image_shape
[
"width"
],
output_image_shape
[
"height"
]);
auto
inputs
=
concat_param_
->
x
;
auto
axis
=
concat_param_
->
axis
;
auto
&
context
=
ctx_
->
As
<
OpenCLContext
>
();
CHECK
(
context
.
cl_context
()
!=
nullptr
);
STL
::
stringstream
kernel_key
;
kernel_key
<<
kernel_func_name_
<<
build_options_
<<
time_stamp_
;
auto
kernel
=
context
.
cl_context
()
->
GetKernel
(
kernel_key
.
str
());
auto
inputs
=
param
.
x
;
int
arg_idx
=
0
;
int
width
=
inputs
[
0
]
->
dims
()[
inputs
[
0
]
->
dims
().
size
()
-
1
];
if
(
kernel_func_name_
==
"concatByW"
||
kernel_func_name_
==
"concatByH"
)
{
auto
output_tensor_w
=
output_tensor_dims
[
output_tensor_dims
.
size
()
-
1
];
if
(
output_tensor_dims
.
size
()
-
axis
==
1
)
{
for
(
size_t
input_idx
=
0
;
input_idx
<
inputs
.
size
();
++
input_idx
)
{
auto
*
input
=
inputs
[
input_idx
];
auto
input_tensor_dims
=
input
->
dims
();
auto
input_image_shape
=
InitImageDimInfoWith
(
input_tensor_dims
);
auto
input_tensor_w
=
input_tensor_dims
[
input_tensor_dims
.
size
()
-
1
];
auto
*
input_image_p
=
input
->
data
<
half_t
,
cl
::
Image2D
>
();
#ifdef LITE_WITH_LOG
VLOG
(
4
)
<<
"concat input shape: "
;
for
(
size_t
i
=
0
;
i
<
inputs
.
size
();
i
++
)
{
VLOG
(
4
)
<<
"inputs ["
<<
i
<<
"]"
<<
"["
<<
inputs
[
i
]
->
dims
().
size
()
<<
"D]:"
<<
" dims:"
<<
inputs
[
i
]
->
dims
()[
0
]
<<
" "
<<
inputs
[
i
]
->
dims
()[
1
]
<<
" "
<<
inputs
[
i
]
->
dims
()[
2
]
<<
" "
<<
inputs
[
i
]
->
dims
()[
3
];
}
size_t
input_tensor_pre_w
=
0
;
for
(
size_t
ii_idx
=
0
;
ii_idx
<
input_idx
;
++
ii_idx
)
{
auto
input_tensor_dims
=
inputs
[
ii_idx
]
->
dims
();
input_tensor_pre_w
+=
input_tensor_dims
[
input_tensor_dims
.
size
()
-
1
];
}
VLOG
(
4
)
<<
"concat output shape: "
;
VLOG
(
4
)
<<
" out dims: "
<<
"["
<<
x_dims
.
size
()
<<
"D]:"
<<
x_dims
[
0
]
<<
" "
<<
x_dims
[
1
]
<<
" "
<<
x_dims
[
2
]
<<
" "
<<
x_dims
[
3
];
VLOG
(
4
)
<<
"axis_: "
<<
axis_
;
VLOG
(
4
)
<<
"flag_: "
<<
flag_
;
#endif
int
input_special_w
=
input_tensor_dims
[
output_tensor_dims
.
size
()
-
2
];
// not a good var name
auto
global_work_size
=
cl
::
NDRange
{
static_cast
<
cl
::
size_type
>
(
x_dims
[
x_dims
.
size
()
-
1
]),
static_cast
<
cl
::
size_type
>
(
image_shape
[
"width"
]
/
x_dims
[
x_dims
.
size
()
-
1
]),
static_cast
<
cl
::
size_type
>
(
image_shape
[
"height"
])};
const
std
::
vector
<
size_t
>&
default_work_size
=
DefaultWorkSize
(
input_tensor_dims
,
DDim
(
std
::
vector
<
DDim
::
value_type
>
{
static_cast
<
int64_t
>
(
input_image_shape
[
"width"
]),
static_cast
<
int64_t
>
(
input_image_shape
[
"height"
])}));
cl
::
NDRange
global_work_size
=
cl
::
NDRange
{
static_cast
<
size_t
>
(
default_work_size
[
0
]),
static_cast
<
size_t
>
(
default_work_size
[
1
]),
static_cast
<
size_t
>
(
default_work_size
[
2
])};
cl_int
status
;
status
=
kernel
.
setArg
(
0
,
*
input_image_p
);
CL_CHECK_FATAL
(
status
);
status
=
kernel
.
setArg
(
1
,
*
output_image_p
);
CL_CHECK_FATAL
(
status
);
status
=
kernel
.
setArg
(
2
,
input_special_w
);
CL_CHECK_FATAL
(
status
);
status
=
kernel
.
setArg
(
3
,
input_tensor_pre_w
);
CL_CHECK_FATAL
(
status
);
status
=
kernel
.
setArg
(
4
,
output_tensor_w
);
CL_CHECK_FATAL
(
status
);
#ifdef LITE_WITH_LOG
VLOG
(
4
)
<<
TargetToStr
(
param
.
output
->
target
());
VLOG
(
4
)
<<
"image_shape(w,h):"
<<
image_shape
[
"width"
]
<<
" "
<<
image_shape
[
"height"
];
VLOG
(
4
)
<<
"x_dims["
<<
x_dims
.
size
()
<<
"D]:"
<<
x_dims
[
0
]
<<
" "
<<
x_dims
[
1
]
<<
" "
<<
x_dims
[
2
]
<<
" "
<<
x_dims
[
3
]
<<
"x_dims[x_dims.size() - 1]"
<<
x_dims
[
x_dims
.
size
()
-
1
];
VLOG
(
4
)
<<
"y_dims["
<<
y_dims
.
size
()
<<
"D]:"
<<
y_dims
[
0
]
<<
" "
<<
y_dims
[
1
]
<<
" "
<<
y_dims
[
2
]
<<
" "
<<
y_dims
[
3
];
VLOG
(
4
)
<<
"width_: "
<<
width_
<<
", flag_: "
<<
flag_
;
VLOG
(
4
)
<<
"global_work_size: "
<<
x_dims
[
x_dims
.
size
()
-
1
]
<<
" "
<<
(
image_shape
[
"width"
]
/
x_dims
[
x_dims
.
size
()
-
1
])
<<
" "
<<
(
image_shape
[
"height"
]);
#endif
status
=
EnqueueNDRangeKernel
(
context
,
kernel
,
cl
::
NullRange
,
global_work_size
,
cl
::
NullRange
,
nullptr
,
event_
);
CL_CHECK_FATAL
(
status
);
}
}
else
{
size_t
output_image_height_start
=
0
;
// output image height start
for
(
size_t
input_idx
=
0
;
input_idx
<
inputs
.
size
();
++
input_idx
)
{
auto
*
input
=
inputs
[
input_idx
];
auto
input_tensor_dims
=
input
->
dims
();
auto
input_image_shape
=
InitImageDimInfoWith
(
input_tensor_dims
);
auto
input_tensor_w
=
input_tensor_dims
[
input_tensor_dims
.
size
()
-
1
];
auto
*
input_image_p
=
input
->
data
<
half_t
,
cl
::
Image2D
>
();
auto
kernel
=
context
.
cl_context
()
->
GetKernel
(
kernel_key
.
str
());
int
out_w
=
x_dims
[
x_dims
.
size
()
-
1
];
int
out_c
=
x_dims
[
1
];
if
(
inputs
.
size
()
==
2
)
{
auto
*
x_buf0
=
inputs
[
0
]
->
data
<
half_t
,
cl
::
Image2D
>
();
auto
*
x_buf1
=
inputs
[
1
]
->
data
<
half_t
,
cl
::
Image2D
>
();
cl_int
status
=
kernel
.
setArg
(
arg_idx
,
*
x_buf0
);
CL_CHECK_FATAL
(
status
);
status
=
kernel
.
setArg
(
++
arg_idx
,
*
x_buf1
);
CL_CHECK_FATAL
(
status
);
status
=
kernel
.
setArg
(
++
arg_idx
,
*
out_buf
);
const
std
::
vector
<
size_t
>&
default_work_size
=
DefaultWorkSize
(
input_tensor_dims
,
DDim
(
std
::
vector
<
DDim
::
value_type
>
{
static_cast
<
int64_t
>
(
input_image_shape
[
"width"
]),
static_cast
<
int64_t
>
(
input_image_shape
[
"height"
])}));
cl
::
NDRange
global_work_size
=
cl
::
NDRange
{
static_cast
<
size_t
>
(
default_work_size
[
0
]),
static_cast
<
size_t
>
(
default_work_size
[
1
]),
static_cast
<
size_t
>
(
default_work_size
[
2
])};
cl_int
status
;
status
=
kernel
.
setArg
(
0
,
*
input_image_p
);
CL_CHECK_FATAL
(
status
);
status
=
kernel
.
setArg
(
1
,
*
output_image_p
);
CL_CHECK_FATAL
(
status
);
status
=
kernel
.
setArg
(
2
,
output_tensor_w
);
CL_CHECK_FATAL
(
status
);
status
=
kernel
.
setArg
(
3
,
output_image_height_start
);
CL_CHECK_FATAL
(
status
);
status
=
EnqueueNDRangeKernel
(
context
,
kernel
,
cl
::
NullRange
,
global_work_size
,
cl
::
NullRange
,
nullptr
,
event_
);
CL_CHECK_FATAL
(
status
);
// compute new output_image_height_start
if
(
output_tensor_dims
.
size
()
==
3
)
{
output_image_height_start
+=
input_tensor_dims
[
1
];
}
else
if
(
output_tensor_dims
.
size
()
==
2
)
{
output_image_height_start
+=
input_tensor_dims
[
0
];
}
}
}
}
else
if
(
kernel_func_name_
==
"concatByCWith2Inputs"
||
kernel_func_name_
==
"concatByCWith3Inputs"
||
kernel_func_name_
==
"concatByCWith4Inputs"
)
{
auto
*
input0
=
inputs
[
0
];
auto
*
input0_image_p
=
input0
->
data
<
half_t
,
cl
::
Image2D
>
();
size_t
input0_tensor_c
=
input0
->
dims
()[
1
];
auto
*
input1
=
inputs
.
size
()
>=
2
?
inputs
[
1
]
:
nullptr
;
auto
*
input1_image_p
=
input1
?
input1
->
data
<
half_t
,
cl
::
Image2D
>
()
:
nullptr
;
size_t
input1_tensor_c
=
input1
?
input1
->
dims
()[
1
]
:
-
1
;
auto
*
input2
=
inputs
.
size
()
>=
3
?
inputs
[
2
]
:
nullptr
;
auto
*
input2_image_p
=
input2
?
input2
->
data
<
half_t
,
cl
::
Image2D
>
()
:
nullptr
;
size_t
input2_tensor_c
=
input2
?
input2
->
dims
()[
1
]
:
-
1
;
auto
*
input3
=
inputs
.
size
()
>=
4
?
inputs
[
3
]
:
nullptr
;
auto
*
input3_image_p
=
input3
?
input3
->
data
<
half_t
,
cl
::
Image2D
>
()
:
nullptr
;
size_t
input3_tensor_c
=
input3
?
input3
->
dims
()[
1
]
:
-
1
;
LOG
(
INFO
)
<<
"input0_image_p:"
<<
input0_image_p
;
LOG
(
INFO
)
<<
"input0_tensor_c:"
<<
input0_tensor_c
;
LOG
(
INFO
)
<<
"input1_image_p:"
<<
input1_image_p
;
LOG
(
INFO
)
<<
"input1_tensor_c:"
<<
input1_tensor_c
;
LOG
(
INFO
)
<<
"input2_image_p:"
<<
input2_image_p
;
LOG
(
INFO
)
<<
"input2_tensor_c:"
<<
input2_tensor_c
;
LOG
(
INFO
)
<<
"input3_image_p:"
<<
input3_image_p
;
LOG
(
INFO
)
<<
"input3_tensor_c:"
<<
input3_tensor_c
;
const
std
::
vector
<
size_t
>&
default_work_size
=
DefaultWorkSize
(
output_tensor_dims
,
DDim
(
std
::
vector
<
DDim
::
value_type
>
{
static_cast
<
int64_t
>
(
output_image_shape
[
"width"
]),
static_cast
<
int64_t
>
(
output_image_shape
[
"height"
])}));
cl
::
NDRange
global_work_size
=
cl
::
NDRange
{
static_cast
<
size_t
>
(
default_work_size
[
0
]),
static_cast
<
size_t
>
(
default_work_size
[
1
]),
static_cast
<
size_t
>
(
default_work_size
[
2
])};
cl_int
status
;
status
=
kernel
.
setArg
(
0
,
*
output_image_p
);
CL_CHECK_FATAL
(
status
);
status
=
kernel
.
setArg
(
++
arg_idx
,
flag_
);
status
=
kernel
.
setArg
(
1
,
static_cast
<
size_t
>
(
output_tensor_dims
[
1
]));
// output_tensor_c
CL_CHECK_FATAL
(
status
);
status
=
kernel
.
setArg
(
++
arg_idx
,
static_cast
<
int
>
(
inputs
[
0
]
->
dims
()[
axis_
]));
status
=
kernel
.
setArg
(
2
,
static_cast
<
size_t
>
(
output_tensor_dims
[
3
]));
// output_tensor_w
CL_CHECK_FATAL
(
status
);
status
=
kernel
.
setArg
(
++
arg_idx
,
out_c
);
status
=
kernel
.
setArg
(
3
,
*
input0_image_p
);
CL_CHECK_FATAL
(
status
);
status
=
kernel
.
setArg
(
++
arg_idx
,
out_w
);
status
=
kernel
.
setArg
(
4
,
input0_tensor_c
);
CL_CHECK_FATAL
(
status
);
status
=
kernel
.
setArg
(
++
arg_idx
,
width_
);
status
=
kernel
.
setArg
(
5
,
*
input1_image_p
);
CL_CHECK_FATAL
(
status
);
status
=
context
.
cl_context
()
->
GetCommandQueue
().
enqueueNDRangeKernel
(
kernel
,
cl
::
NullRange
,
global_work_size
,
cl
::
NullRange
,
nullptr
,
nullptr
);
status
=
kernel
.
setArg
(
6
,
input1_tensor_c
);
CL_CHECK_FATAL
(
status
);
}
else
{
auto
start
=
0
;
for
(
int
i
=
0
;
i
<
inputs
.
size
();
i
++
)
{
arg_idx
=
0
;
auto
in_dims
=
inputs
[
i
]
->
dims
();
image_shape
=
InitImageDimInfoWith
(
in_dims
);
auto
*
x_buf
=
inputs
[
i
]
->
data
<
half_t
,
cl
::
Image2D
>
();
int
in_w
=
in_dims
[
in_dims
.
size
()
-
1
];
#ifdef LITE_WITH_LOG
VLOG
(
4
)
<<
"image_shape(w,h):"
<<
image_shape
[
"width"
]
<<
" "
<<
image_shape
[
"height"
];
#endif
global_work_size
=
cl
::
NDRange
{
static_cast
<
cl
::
size_type
>
(
in_dims
[
in_dims
.
size
()
-
1
]),
static_cast
<
cl
::
size_type
>
(
image_shape
[
"width"
]
/
in_dims
[
in_dims
.
size
()
-
1
]),
static_cast
<
cl
::
size_type
>
(
image_shape
[
"height"
])};
cl_int
status
=
kernel
.
setArg
(
arg_idx
,
*
x_buf
);
CL_CHECK_FATAL
(
status
);
status
=
kernel
.
setArg
(
++
arg_idx
,
*
out_buf
);
CL_CHECK_FATAL
(
status
);
status
=
kernel
.
setArg
(
++
arg_idx
,
flag_
);
CL_CHECK_FATAL
(
status
);
status
=
kernel
.
setArg
(
++
arg_idx
,
start
);
if
(
inputs
.
size
()
>=
3
)
{
status
=
kernel
.
setArg
(
7
,
*
input2_image_p
);
CL_CHECK_FATAL
(
status
);
status
=
kernel
.
setArg
(
++
arg_idx
,
out
_c
);
status
=
kernel
.
setArg
(
8
,
input2_tensor
_c
);
CL_CHECK_FATAL
(
status
);
status
=
kernel
.
setArg
(
++
arg_idx
,
out_w
);
CL_CHECK_FATAL
(
status
);
status
=
kernel
.
setArg
(
++
arg_idx
,
in_w
);
CL_CHECK_FATAL
(
status
);
status
=
kernel
.
setArg
(
++
arg_idx
,
width_
);
}
if
(
inputs
.
size
()
==
4
)
{
status
=
kernel
.
setArg
(
9
,
*
input3_image_p
);
CL_CHECK_FATAL
(
status
);
status
=
kernel
.
setArg
(
10
,
input3_tensor_c
);
CL_CHECK_FATAL
(
status
);
status
=
context
.
cl_context
()
->
GetCommandQueue
().
enqueueNDRangeKernel
(
kernel
,
cl
::
NullRange
,
global_work_size
,
cl
::
NullRange
,
nullptr
,
nullptr
);
CL_CHECK_FATAL
(
status
);
start
+=
inputs
[
i
]
->
dims
()[
axis_
];
}
status
=
EnqueueNDRangeKernel
(
context
,
kernel
,
cl
::
NullRange
,
global_work_size
,
cl
::
NullRange
,
nullptr
,
event_
);
CL_CHECK_FATAL
(
status
);
}
else
{
LOG
(
FATAL
)
<<
"Unsupported kernel func name: "
<<
kernel_func_name_
;
}
}
...
...
lite/kernels/opencl/conv_image_compute.cc
浏览文件 @
9f343ac2
...
...
@@ -185,47 +185,29 @@ void ConvImageCompute::PrepareForRun() {
impl_
=
&
ConvImageCompute
::
DepthwiseConv2d
;
}
else
if
(
filter_tensor_h_
==
3
&&
filter_tensor_w_
==
3
)
{
// #define CONV3x3OPT_FALL_BACK
#ifndef CONV3x3OPT_FALL_BACK
// conv2d_3x3
kernel_func_names_
.
push_back
(
input_tensor_n_
>
1
?
"conv2d_3x3_multi_batch"
:
"conv2d_3x3_opt"
);
kernel_func_paths_
.
push_back
(
"image/conv2d_3x3_opt_kernel.cl"
);
CLImageConverterFolder
converter
;
const
DDim
&
filter_image_dims
=
converter
.
InitImageDimInfoWith
(
filter_dims
);
filter_image_h_
=
filter_image_dims
[
1
];
filter_image_w_
=
filter_image_dims
[
0
];
tensor_hold_filter_image_
->
Resize
({
1
,
filter_image_w_
,
filter_image_h_
,
4
});
half_t
*
filter_image_data
=
tensor_hold_filter_image_
->
mutable_data
<
half_t
>
();
converter
.
NCHWToImage
(
filter_cpu
,
filter_image_data
,
filter_dims
);
filter_gpu_image_
->
mutable_data
<
half_t
,
cl
::
Image2D
>
(
filter_image_w_
,
filter_image_h_
,
filter_image_data
);
impl_
=
&
ConvImageCompute
::
Conv2d3x3opt
;
#else
kernel_func_names_
.
push_back
(
"conv2d_3x3"
);
kernel_func_paths_
.
push_back
(
"image/conv2d_3x3_kernel.cl"
);
if
(
groups_
==
1
)
{
kernel_func_names_
.
push_back
(
input_tensor_n_
>
1
?
"conv2d_3x3_multi_batch"
:
"conv2d_3x3_opt"
);
kernel_func_paths_
.
push_back
(
"image/conv2d_3x3_opt_kernel.cl"
);
impl_
=
&
ConvImageCompute
::
Conv2d3x3opt
;
}
else
{
// groups_ > 1
kernel_func_names_
.
push_back
(
"conv2d_3x3"
);
kernel_func_paths_
.
push_back
(
"image/conv2d_3x3_kernel.cl"
);
impl_
=
&
ConvImageCompute
::
Conv2d3x3
;
}
CLImageConverterFolder
converter
;
const
DDim
&
filter_image_dims
=
converter
.
InitImageDimInfoWith
(
filter_dims
);
filter_image_h_
=
filter_image_dims
[
1
];
filter_image_w_
=
filter_image_dims
[
0
];
tensor_hold_filter_image_
->
Resize
({
1
,
filter_image_w_
,
filter_image_h_
,
4
});
half_t
*
filter_image_data
=
tensor_hold_filter_image_
->
mutable_data
<
half_t
>
();
converter
.
NCHWToImage
(
filter_cpu
,
filter_image_data
,
filter_dims
);
filter_gpu_image_
->
mutable_data
<
half_t
,
cl
::
Image2D
>
(
filter_image_w_
,
filter_image_h_
,
filter_image_data
);
impl_
=
&
ConvImageCompute
::
Conv2d3x3
;
#endif
#undef CONV3x3OPT_FALL_BACK
}
else
if
(
filter_tensor_h_
==
5
&&
filter_tensor_w_
==
5
)
{
#define CONV_5x5_OPT
#ifndef CONV_5x5_OPT
...
...
@@ -584,6 +566,11 @@ void ConvImageCompute::GetGlobalWorkSize() {
static_cast
<
size_t
>
(
w_blk_
),
static_cast
<
size_t
>
(
nh_blk_
)};
input_c_block_
=
static_cast
<
const
int
>
((
input_tensor_c_
+
3
)
/
4
);
}
else
if
(
kernel_func_names_
[
0
]
==
"conv2d_3x3"
)
{
global_work_size_
=
cl
::
NDRange
{
static_cast
<
size_t
>
(
c_blk_
),
static_cast
<
size_t
>
(
w_blk_
),
static_cast
<
size_t
>
(
nh_blk_
)};
}
else
if
(
kernel_func_names_
[
0
]
==
"conv2d_3x3_multi_batch"
||
kernel_func_names_
[
0
]
==
"conv2d_3x3_opt"
)
{
int
w_blk_size
=
5
;
...
...
lite/kernels/opencl/test_helper.h
浏览文件 @
9f343ac2
...
...
@@ -19,7 +19,7 @@
#define COMPUTE_RELATIVE_DIFF(res0, res1) abs(abs(res0 - res1) / (res1 + 1e-5))
#define IS_DIFF_PASSED(res0, res1, threshold) \
(((COMP
TU
E_ABS_DIFF(res0, res1) < threshold) || \
(((COMP
UT
E_ABS_DIFF(res0, res1) < threshold) || \
(COMPUTE_RELATIVE_DIFF(res0, res1) < threshold)) \
? true \
: false)
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录