Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
Paddle-Lite
提交
8b71275c
P
Paddle-Lite
项目概览
PaddlePaddle
/
Paddle-Lite
通知
331
Star
4
Fork
1
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
271
列表
看板
标记
里程碑
合并请求
78
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle-Lite
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
271
Issue
271
列表
看板
标记
里程碑
合并请求
78
合并请求
78
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
8b71275c
编写于
11月 21, 2018
作者:
Z
zhangyang
浏览文件
操作
浏览文件
下载
差异文件
Merge remote-tracking branch 'upstream/develop' into develop
上级
1c050149
af65b34c
变更
16
隐藏空白更改
内联
并排
Showing
16 changed file
with
659 addition
and
132 deletion
+659
-132
src/framework/cl/cl_helper.h
src/framework/cl/cl_helper.h
+7
-0
src/framework/cl/cl_image.h
src/framework/cl/cl_image.h
+5
-3
src/framework/cl/cl_image_converter.cpp
src/framework/cl/cl_image_converter.cpp
+37
-0
src/framework/cl/cl_image_converter.h
src/framework/cl/cl_image_converter.h
+25
-0
src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl
src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl
+4
-4
src/operators/kernel/cl/cl_kernel/prior_box_kernel.cl
src/operators/kernel/cl/cl_kernel/prior_box_kernel.cl
+69
-42
src/operators/kernel/cl/cl_kernel/reshape.cl
src/operators/kernel/cl/cl_kernel/reshape.cl
+141
-17
src/operators/kernel/cl/cl_kernel/transpose_kernel.cl
src/operators/kernel/cl/cl_kernel/transpose_kernel.cl
+129
-0
src/operators/kernel/cl/fetch_kernel.cpp
src/operators/kernel/cl/fetch_kernel.cpp
+20
-22
src/operators/kernel/cl/prior_box_kernel.cpp
src/operators/kernel/cl/prior_box_kernel.cpp
+53
-19
src/operators/kernel/cl/reshape_kernel.cpp
src/operators/kernel/cl/reshape_kernel.cpp
+58
-22
src/operators/kernel/cl/transpose_kernel.cpp
src/operators/kernel/cl/transpose_kernel.cpp
+37
-1
src/operators/op_param.h
src/operators/op_param.h
+2
-0
test/CMakeLists.txt
test/CMakeLists.txt
+3
-0
test/net/test_super.cpp
test/net/test_super.cpp
+64
-0
test/test_helper.h
test/test_helper.h
+5
-2
未找到文件。
src/framework/cl/cl_helper.h
浏览文件 @
8b71275c
...
...
@@ -61,9 +61,16 @@ class CLHelper {
auto
work_size_2
=
n
*
h
;
return
{
work_size_0
,
work_size_1
,
work_size_2
};
}
else
if
(
image_dim
.
size
()
==
2
)
{
auto
h
=
image_dim
[
0
];
auto
w
=
image_dim
[
1
];
return
{
1
,
image
.
ImageWidth
(),
image
.
ImageHeight
()};
}
else
if
(
image_dim
.
size
()
==
1
)
{
return
{
1
,
image
.
ImageWidth
(),
1
};
}
else
if
(
image_dim
.
size
()
==
3
)
{
int
c
=
image_dim
[
0
];
int
h
=
image_dim
[
1
];
int
w
=
image_dim
[
2
];
return
{(
c
+
3
)
/
4
,
w
,
h
};
}
PADDLE_MOBILE_THROW_EXCEPTION
(
" not support this dim, need imp "
);
}
...
...
src/framework/cl/cl_image.h
浏览文件 @
8b71275c
...
...
@@ -120,17 +120,19 @@ class CLImage {
PADDLE_MOBILE_ENFORCE
(
tensor_data_
==
nullptr
,
" empty image tensor data shouldn't have value"
);
CLImageConverterFolder
*
folder_converter
=
new
CLImageConverterFolder
();
// CLImageConverterFolder *folder_converter = new
// CLImageConverterFolder();
CLImageConverterNormal
*
normal_converter
=
new
CLImageConverterNormal
();
DLOG
<<
" to get image dims "
;
image_dims_
=
folder
_converter
->
InitImageDimInfoWith
(
dim
);
image_dims_
=
normal
_converter
->
InitImageDimInfoWith
(
dim
);
DLOG
<<
" end get image dims "
<<
image_dims_
;
InitCLImage
(
context
,
image_dims_
[
0
],
image_dims_
[
1
],
nullptr
);
tensor_dims_
=
dim
;
command_queue_
=
command_queue
;
image_converter_
=
folder
_converter
;
image_converter_
=
normal
_converter
;
cl_event_
=
CLEngine
::
Instance
()
->
CreateEvent
(
context
);
initialized_
=
true
;
DLOG
<<
" end init cl image"
;
...
...
src/framework/cl/cl_image_converter.cpp
浏览文件 @
8b71275c
...
...
@@ -389,5 +389,42 @@ void CLImageConverterDWBlock::ImageToNCHW(half_t *image, float *tensor,
}
}
/**
 * Computes the 2-D image extent used to hold a tensor of up to four
 * dimensions and caches the per-block geometry on the converter.
 *
 * The tensor shape is right-aligned into {N, C, H, W}; leading dimensions
 * that are absent default to 1. The resulting image is
 * W * ((C + 3) / 4) wide and H * N high.
 *
 * Side effects: stores W in width_of_one_block_, H in height_of_one_block_
 * and the number of 4-channel blocks in c_block_.
 *
 * @param tensor_dim shape of the tensor; must not have more than 4 dimensions.
 * @return the image dimensions as {width, height}.
 */
const DDim &CLImageConverterNormal::InitImageDimInfoWith(
    const DDim &tensor_dim) {
  // new_dims has exactly four slots; a larger rank would make the index
  // below point before the start of the array.
  PADDLE_MOBILE_ENFORCE(tensor_dim.size() <= 4, "tensor dim is not support ");

  size_t new_dims[] = {1, 1, 1, 1};
  for (int j = 0; j < tensor_dim.size(); ++j) {
    new_dims[4 - tensor_dim.size() + j] = tensor_dim[j];
  }

  const size_t N = new_dims[0];
  const size_t C = new_dims[1];
  const size_t H = new_dims[2];
  const size_t W = new_dims[3];

  // Four channels are packed per texel, so round the channel count up.
  const size_t channel_blocks = (C + 3) / 4;

  size_t width = W * channel_blocks;
  size_t height = H * N;

  width_of_one_block_ = W;
  height_of_one_block_ = H;
  // Same value as width / W, but stays well-defined when W is 0.
  c_block_ = channel_blocks;

  // NOTE(review): the result of make_ddim(...) is returned through a const
  // reference. If make_ddim returns by value this hands the caller a
  // reference to a destroyed temporary — confirm, and consider caching the
  // DDim in a member. The return type is fixed by the class declaration, so
  // it is not changed here.
  return make_ddim({width, height});
}
/**
 * Converts a float tensor into the half_t image buffer.
 *
 * The tensor must have between 1 and 4 dimensions; the conversion itself is
 * delegated to a CLImageConverterDefault.
 *
 * @param tensor     source tensor data.
 * @param image      destination image buffer.
 * @param tensor_dim shape of the tensor.
 */
void CLImageConverterNormal::NCHWToImage(float *tensor, half_t *image,
                                         const DDim &tensor_dim) {
  PADDLE_MOBILE_ENFORCE(tensor_dim.size() <= 4 && tensor_dim.size() > 0,
                        "tensor dim is not support ");
  // A short-lived default converter performs the actual conversion.
  CLImageConverterDefault().NCHWToImage(tensor, image, tensor_dim);
}
/**
 * Converts a half_t image buffer back into a float tensor by delegating to
 * a CLImageConverterDefault.
 *
 * @param image      source image buffer.
 * @param tensor     destination tensor data.
 * @param image_dim  dimensions of the image.
 * @param tensor_dim shape of the tensor.
 */
void CLImageConverterNormal::ImageToNCHW(half_t *image, float *tensor,
                                         const DDim &image_dim,
                                         const DDim &tensor_dim) {
  // A short-lived default converter performs the actual conversion.
  CLImageConverterDefault().ImageToNCHW(image, tensor, image_dim, tensor_dim);
}
}
// namespace framework
}
// namespace paddle_mobile
src/framework/cl/cl_image_converter.h
浏览文件 @
8b71275c
...
...
@@ -63,6 +63,31 @@ class CLImageConverterFolder : public CLImageConverterBase {
int
height_of_one_block_
;
};
/**
 * Image converter that records per-block geometry (block width, block
 * height and channel-block count) when the image dimensions are initialised.
 */
class CLImageConverterNormal : public CLImageConverterBase {
 public:
  /// Computes the image dimensions for `tensor_dim`.
  const DDim &InitImageDimInfoWith(const DDim &tensor_dim);

  /// Converts the float `tensor` of shape `tensor_dim` into `image`.
  void NCHWToImage(float *tensor, half_t *image, const DDim &tensor_dim);

  /// Converts `image` (of `image_dim`) back into the float `tensor`.
  void ImageToNCHW(half_t *image, float *tensor, const DDim &image_dim,
                   const DDim &tensor_dim);

  /*
   * width of original tensor
   * */
  inline size_t WidthOfOneBlock() const { return width_of_one_block_; }

  /*
   * height of original tensor
   * */
  inline size_t HeightOfOneBlock() const { return height_of_one_block_; }

  /// Returns the stored channel-block count.
  int GetCBlock() const { return c_block_; }

 private:
  // Default to 0 so the accessors above never return an indeterminate value
  // if they are called before the members have been assigned.
  int c_block_ = 0;
  int width_of_one_block_ = 0;
  int height_of_one_block_ = 0;
};
class
CLImageConverterNWBlock
:
public
CLImageConverterBase
{
const
DDim
&
InitImageDimInfoWith
(
const
DDim
&
tensor_dim
);
void
NCHWToImage
(
float
*
tensor
,
half_t
*
image
,
const
DDim
&
tensor_dim
);
...
...
src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl
浏览文件 @
8b71275c
...
...
@@ -138,19 +138,19 @@ __kernel void conv_3x3(__private const int global_size_dim0,
int2 pos_of_weight;
pos_of_weight.x = i * 3 + j % 3;
pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
float4 weight_x = read_imagef
(filter, sampler, pos_of_weight);
half4 weight_x = read_imageh
(filter, sampler, pos_of_weight);
output.x += dot(input[j], weight_x);
pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
float4 weight_y = read_imagef
(filter, sampler, pos_of_weight);
half4 weight_y = read_imageh
(filter, sampler, pos_of_weight);
output.y += dot(input[j], weight_y);
pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
float4 weight_z = read_imagef
(filter, sampler, pos_of_weight);
half4 weight_z = read_imageh
(filter, sampler, pos_of_weight);
output.z += dot(input[j], weight_z);
pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
float4 weight_w = read_imagef
(filter, sampler, pos_of_weight);
half4 weight_w = read_imageh
(filter, sampler, pos_of_weight);
output.w += dot(input[j], weight_w);
}
*/
...
...
src/operators/kernel/cl/cl_kernel/prior_box_kernel.cl
浏览文件 @
8b71275c
...
...
@@ -19,47 +19,52 @@ __kernel void prior_box(__private const int global_size_dim0,
__private
const
int
global_size_dim2,
__global
float
*box_width,
__global
float
*box_height,
__write_only
image2d_t
output_image,
__global
float
*variances_Buffer,
__write_only
image2d_t
output_boxes,
__write_only
image2d_t
output_variances,
__private
const
float
step_width,
__private
const
float
step_height,
__private
const
float
offset,
__private
const
int
img_width,
__private
const
int
img_height,
__private
const
int
num_priors,
__private
const
int
C
)
{
__private
const
int
C
,
__private
const
int
clip
)
{
const
int
out_c
=
get_global_id
(
0
)
;
const
int
out_nh
=
get_global_id
(
1
)
;
const
int
out_n
=
out_nh/num_priors
;
const
int
out_h
=
out_nh%num_priors
;
if
(
out_c
>=
global_size_dim0
|
|out_nh >= global_size_dim2) {
return;
}
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
CLK_ADDRESS_CLAMP
|
CLK_FILTER_NEAREST
;
int2
output_pos
;
output_pos.x
=
out_c
*
4
;
output_pos.y
=
out_nh
;
float
center_x0
=
(
offset
+
out_c
*
4
)
*
step_width
;
float
center_x1
=
(
offset
+
out_c
*
4
+
1
)
*
step_width
;
float
center_x2
=
(
offset
+
out_c
*
4
+
2
)
*
step_width
;
float
center_x3
=
(
offset
+
out_c
*
4
+
3
)
*
step_width
;
float
center_y
=
(
out_n
+
offset
)
*
step_height
;
float
center_x0
=
(
offset
+
(
float
)(
out_c
*
4
)
)
*
step_width
;
float
center_x1
=
(
offset
+
(
float
)(
out_c
*
4
+
1
)
)
*
step_width
;
float
center_x2
=
(
offset
+
(
float
)(
out_c
*
4
+
2
)
)
*
step_width
;
float
center_x3
=
(
offset
+
(
float
)(
out_c
*
4
+
3
)
)
*
step_width
;
float
center_y
=
(
(
float
)
out_n
+
offset
)
*
step_height
;
half4
output[4]
;
output[0].x
=
convert_half
((
center_x0
-
box_width[out_h]
)
/
img_width
)
;
output[1].x
=
convert_half
((
center_y
-
box_height[out_h]
)
/
img_height
)
;
output[2].x
=
convert_half
((
center_x0
+
box_width[out_h]
)
/
img_width
)
;
output[3].x
=
convert_half
((
center_y
+
box_height[out_h]
)
/
img_height
)
;
half4
variances[4]
;
output[0].x
=
convert_half
((
center_x0
-
box_width[out_h]
)
/
(
float
)
img_width
)
;
output[1].x
=
convert_half
((
center_y
-
box_height[out_h]
)
/
(
float
)
img_height
)
;
output[2].x
=
convert_half
((
center_x0
+
box_width[out_h]
)
/
(
float
)
img_width
)
;
output[3].x
=
convert_half
((
center_y
+
box_height[out_h]
)
/
(
float
)
img_height
)
;
variances[0].x
=
convert_half
(
variances_Buffer[0]
)
;
variances[1].x
=
convert_half
(
variances_Buffer[1]
)
;
variances[2].x
=
convert_half
(
variances_Buffer[2]
)
;
variances[3].x
=
convert_half
(
variances_Buffer[3]
)
;
if
(
C
-
4
*
out_c>=2
)
{
output[0].y
=
convert_half
((
center_x1
-
box_width[out_h]
)
/
img_width
)
;
output[1].y
=
convert_half
((
center_y
-
box_height[out_h]
)
/
img_height
)
;
output[2].y
=
convert_half
((
center_x1
+
box_width[out_h]
)
/
img_width
)
;
output[3].y
=
convert_half
((
center_y
+
box_height[out_h]
)
/
img_height
)
;
output[0].y
=
convert_half
((
center_x1
-
box_width[out_h]
)
/
(
float
)
img_width
)
;
output[1].y
=
convert_half
((
center_y
-
box_height[out_h]
)
/
(
float
)
img_height
)
;
output[2].y
=
convert_half
((
center_x1
+
box_width[out_h]
)
/
(
float
)
img_width
)
;
output[3].y
=
convert_half
((
center_y
+
box_height[out_h]
)
/
(
float
)
img_height
)
;
variances[0].y
=
convert_half
(
variances_Buffer[0]
)
;
variances[1].y
=
convert_half
(
variances_Buffer[1]
)
;
variances[2].y
=
convert_half
(
variances_Buffer[2]
)
;
variances[3].y
=
convert_half
(
variances_Buffer[3]
)
;
}else{
output[0].y
=
0.0f
;
output[1].y
=
0.0f
;
...
...
@@ -67,10 +72,14 @@ __kernel void prior_box(__private const int global_size_dim0,
output[3].y
=
0.0f
;
}
if
(
C
-
4
*
out_c>=3
)
{
output[0].z
=
convert_half
((
center_x2
-
box_width[out_h]
)
/
img_width
)
;
output[1].z
=
convert_half
((
center_y
-
box_height[out_h]
)
/
img_height
)
;
output[2].z
=
convert_half
((
center_x2
+
box_width[out_h]
)
/
img_width
)
;
output[3].z
=
convert_half
((
center_y
+
box_height[out_h]
)
/
img_height
)
;
output[0].z
=
convert_half
((
center_x2
-
box_width[out_h]
)
/
(
float
)
img_width
)
;
output[1].z
=
convert_half
((
center_y
-
box_height[out_h]
)
/
(
float
)
img_height
)
;
output[2].z
=
convert_half
((
center_x2
+
box_width[out_h]
)
/
(
float
)
img_width
)
;
output[3].z
=
convert_half
((
center_y
+
box_height[out_h]
)
/
(
float
)
img_height
)
;
variances[0].z
=
convert_half
(
variances_Buffer[0]
)
;
variances[1].z
=
convert_half
(
variances_Buffer[1]
)
;
variances[2].z
=
convert_half
(
variances_Buffer[2]
)
;
variances[3].z
=
convert_half
(
variances_Buffer[3]
)
;
}else{
output[0].z
=
0.0f
;
output[1].z
=
0.0f
;
...
...
@@ -78,23 +87,41 @@ __kernel void prior_box(__private const int global_size_dim0,
output[3].z
=
0.0f
;
}
if
(
C
-
4
*
out_c>=4
)
{
output[0].w
=
convert_half
((
center_x3
-
box_width[out_h]
)
/
img_width
)
;
output[1].w
=
convert_half
((
center_y
-
box_height[out_h]
)
/
img_height
)
;
output[2].w
=
convert_half
((
center_x3
+
box_width[out_h]
)
/
img_width
)
;
output[3].w
=
convert_half
((
center_y
+
box_height[out_h]
)
/
img_height
)
;
output[0].w
=
convert_half
((
center_x3
-
box_width[out_h]
)
/
(
float
)
img_width
)
;
output[1].w
=
convert_half
((
center_y
-
box_height[out_h]
)
/
(
float
)
img_height
)
;
output[2].w
=
convert_half
((
center_x3
+
box_width[out_h]
)
/
(
float
)
img_width
)
;
output[3].w
=
convert_half
((
center_y
+
box_height[out_h]
)
/
(
float
)
img_height
)
;
variances[0].w
=
convert_half
(
variances_Buffer[0]
)
;
variances[1].w
=
convert_half
(
variances_Buffer[1]
)
;
variances[2].w
=
convert_half
(
variances_Buffer[2]
)
;
variances[3].w
=
convert_half
(
variances_Buffer[3]
)
;
}else{
output[0].z
=
0.0f
;
output[1].z
=
0.0f
;
output[2].z
=
0.0f
;
output[3].z
=
0.0f
;
output[0].w
=
0.0f
;
output[1].w
=
0.0f
;
output[2].w
=
0.0f
;
output[3].w
=
0.0f
;
}
if
(
clip==1
)
{
output[0]
=
min
(
max
((
half4
)(
0.0f,
0.0f,
0.0f,
0.0f
)
,
output[0]
)
,
(
half4
)(
1.0f,
1.0f,
1.0f,
1.0f
))
;
output[1]
=
min
(
max
((
half4
)(
0.0f,
0.0f,
0.0f,
0.0f
)
,
output[1]
)
,
(
half4
)(
1.0f,
1.0f,
1.0f,
1.0f
))
;
output[2]
=
min
(
max
((
half4
)(
0.0f,
0.0f,
0.0f,
0.0f
)
,
output[2]
)
,
(
half4
)(
1.0f,
1.0f,
1.0f,
1.0f
))
;
output[3]
=
min
(
max
((
half4
)(
0.0f,
0.0f,
0.0f,
0.0f
)
,
output[3]
)
,
(
half4
)(
1.0f,
1.0f,
1.0f,
1.0f
))
;
}
output[0]
=
min
(
max
((
half4
)(
0.0f,
0.0f,
0.0f,
0.0f
)
,
output[0]
)
,
(
half4
)(
1.0f,
1.0f,
1.0f,
1.0f
))
;
output[1]
=
min
(
max
((
half4
)(
0.0f,
0.0f,
0.0f,
0.0f
)
,
output[1]
)
,
(
half4
)(
1.0f,
1.0f,
1.0f,
1.0f
))
;
output[2]
=
min
(
max
((
half4
)(
0.0f,
0.0f,
0.0f,
0.0f
)
,
output[2]
)
,
(
half4
)(
1.0f,
1.0f,
1.0f,
1.0f
))
;
output[3]
=
min
(
max
((
half4
)(
0.0f,
0.0f,
0.0f,
0.0f
)
,
output[3]
)
,
(
half4
)(
1.0f,
1.0f,
1.0f,
1.0f
))
;
write_imageh
(
output_image,
(
int2
)(
output_pos.x
+
1
,
output_pos.y
)
,
output[0]
)
;
write_imageh
(
output_image,
(
int2
)(
output_pos.x
+
2
,
output_pos.y
)
,
output[1]
)
;
write_imageh
(
output_image,
(
int2
)(
output_pos.x
+
3
,
output_pos.y
)
,
output[2]
)
;
write_imageh
(
output_image,
(
int2
)(
output_pos.x
+
4
,
output_pos.y
)
,
output[3]
)
;
if
(
output_pos.x
==
0
&&
output_pos.y
==
1
)
{
float4
out
=
(
float4
)(
output[0].x,
output[1].x,
output[2].x,
output[3].x
)
;
printf
(
"output = %v4hlf \n"
,
out
)
;
}
write_imageh
(
output_boxes,
(
int2
)(
output_pos.x
+
0
,
output_pos.y
)
,
output[0]
)
;
write_imageh
(
output_boxes,
(
int2
)(
output_pos.x
+
1
,
output_pos.y
)
,
output[1]
)
;
write_imageh
(
output_boxes,
(
int2
)(
output_pos.x
+
2
,
output_pos.y
)
,
output[2]
)
;
write_imageh
(
output_boxes,
(
int2
)(
output_pos.x
+
3
,
output_pos.y
)
,
output[3]
)
;
write_imageh
(
output_variances,
(
int2
)(
output_pos.x
+
0
,
output_pos.y
)
,
variances[0]
)
;
write_imageh
(
output_variances,
(
int2
)(
output_pos.x
+
1
,
output_pos.y
)
,
variances[1]
)
;
write_imageh
(
output_variances,
(
int2
)(
output_pos.x
+
2
,
output_pos.y
)
,
variances[2]
)
;
write_imageh
(
output_variances,
(
int2
)(
output_pos.x
+
3
,
output_pos.y
)
,
variances[3]
)
;
}
\ No newline at end of file
src/operators/kernel/cl/cl_kernel/reshape.cl
浏览文件 @
8b71275c
...
...
@@ -14,26 +14,150 @@ limitations under the License. */
#
pragma
OPENCL
EXTENSION
cl_khr_fp16
:
enable
__kernel
void
reshape
(
__read_only
image2d_t
input,
__write_only
image2d_t
output,
__private
const
int
d0,
__private
const
int
d1,
__private
const
int
d2,
__private
const
int
d3,
__private
const
int
x0,
__private
const
int
x1,
__private
const
int
x2,
__private
const
int
x3
)
{
const
int
x
=
get_global_id
(
0
)
;
const
int
y
=
get_global_id
(
1
)
;
__kernel
void
reshape
(
__read_only
image2d_t
input_image,
__write_only
image2d_t
output_image,
__private
const
int
out_C,
__private
const
int
out_H,
__private
const
int
out_W,
__private
const
int
in_W,
__private
const
int
in_H,
__private
const
int
in_Stride0,
__private
const
int
in_Stride1,
__private
const
int
in_Stride2,
__private
const
int
out_Stride0,
__private
const
int
out_Stride1,
__private
const
int
out_Stride2
)
{
const
sampler_t
sampler
=
CLK_NORMALIZED_COORDS_TRUE
|
CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST
;
const
int
out_c
=
get_global_id
(
0
)
;
const
int
out_w
=
get_global_id
(
1
)
;
const
int
out_nh
=
get_global_id
(
2
)
;
const
int
out_n
=
out_nh/out_H
;
const
int
out_h
=
out_nh%out_H
;
const
int
out_c0
=
out_c
*
4
;
const
int
out_c1
=
out_c
*
4
+
1
;
const
int
out_c2
=
out_c
*
4+
2
;
const
int
out_c3
=
out_c
*
4+
3
;
int
count0
=
out_n
*
out_Stride2
+
out_c0
*
out_Stride1
+
out_h
*
out_Stride0
+
out_w
;
int
count1
=
out_n
*
out_Stride2
+
out_c1
*
out_Stride1
+
out_h
*
out_Stride0
+
out_w
;
int
count2
=
out_n
*
out_Stride2
+
out_c2
*
out_Stride1
+
out_h
*
out_Stride0
+
out_w
;
int
count3
=
out_n
*
out_Stride2
+
out_c3
*
out_Stride1
+
out_h
*
out_Stride0
+
out_w
;
int
in_n0
=
count0/in_Stride2
;
int
in_n1
=
count1/in_Stride2
;
int
in_n2
=
count1/in_Stride2
;
int
in_n3
=
count2/in_Stride2
;
count0
=
count0%in_Stride2
;
count1
=
count1%in_Stride2
;
count2
=
count2%in_Stride2
;
count3
=
count3%in_Stride2
;
int
in_c0
=
count0/in_Stride1
;
int
in_c1
=
count1/in_Stride1
;
int
in_c2
=
count2/in_Stride1
;
int
in_c3
=
count3/in_Stride1
;
int
in_h0
=
(
count0%in_Stride1
)
/in_Stride0
;
int
in_h1
=
(
count1%in_Stride1
)
/in_Stride0
;
int
in_h2
=
(
count2%in_Stride1
)
/in_Stride0
;
int
in_h3
=
(
count3%in_Stride1
)
/in_Stride0
;
int
in_w0
=
(
count0%in_Stride1
)
%in_Stride0
;
int
in_w1
=
(
count1%in_Stride1
)
%in_Stride0
;
int
in_w2
=
(
count2%in_Stride1
)
%in_Stride0
;
int
in_w3
=
(
count3%in_Stride1
)
%in_Stride0
;
int2
input_pos0
;
int2
input_pos1
;
int2
input_pos2
;
int2
input_pos3
;
input_pos0.x
=
(
in_c0/4
)
*
in_W
+
in_w0
;
input_pos0.y
=
in_n0
*
in_H
+
in_h0
;
input_pos1.x
=
(
in_c1/4
)
*
in_W
+
in_w1
;
input_pos1.y
=
in_n1
*
in_H
+
in_h1
;
input_pos2.x
=
(
in_c2/4
)
*
in_W
+
in_w2
;
input_pos2.y
=
in_n2
*
in_H
+
in_h2
;
input_pos3.x
=
(
in_c3/4
)
*
in_W
+
in_w3
;
input_pos3.y
=
in_n3
*
in_H
+
in_h3
;
int2
output_pos
;
output_pos.x
=
out_c
*
out_W
+
out_w
;
output_pos.y
=
out_nh
;
const
sampler_t
sampler
=
CLK_NORMALIZED_COORDS_TRUE
|
CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST
;
half4
input0
;
half4
input1
;
half4
input2
;
half4
input3
;
half4
output
;
input0
=
read_imageh
(
input_image,
sampler,input_pos0
)
;
if
(
in_c0%4==0
)
{
output.x
=
input0.x
;
}else
if
(
in_c0%4==1
)
{
output.x
=
input0.y
;
}else
if
(
in_c0%4==2
)
{
output.x
=
input0.z
;
}else{
output.x
=
input0.w
;
}
if
(
out_C
-
out_c
*
4>=2
)
{
input1
=
read_imageh
(
input_image,
sampler,input_pos1
)
;
if
(
in_c1%4==0
)
{
output.y
=
input1.x
;
}else
if
(
in_c1%4==1
)
{
output.y
=
input1.y
;
}else
if
(
in_c1%4==2
)
{
output.y
=
input1.z
;
}else{
output.y
=
input1.w
;
}
}else{
output.y
=
0.0f
;
}
if
(
out_C
-
out_c
*
4>=3
)
{
input2
=
read_imageh
(
input_image,
sampler,input_pos2
)
;
if
(
in_c2%4==0
)
{
output.z
=
input2.x
;
}else
if
(
in_c2%4==1
)
{
output.z
=
input1.y
;
}else
if
(
in_c2%4==2
)
{
output.z
=
input2.z
;
}else{
output.z
=
input2.w
;
}
}else{
output.z
=
0.0f
;
}
half4
in
=
read_imageh
(
input,
sampler,
(
int2
)(
x,
y
))
;
if
(
out_C
-
out_c
*
4>=4
)
{
input3
=
read_imageh
(
input_image,
sampler,input_pos3
)
;
if
(
in_c3%4==0
)
{
output.w
=
input3.x
;
}else
if
(
in_c3%4==1
)
{
output.w
=
input3.y
;
}else
if
(
in_c3%4==2
)
{
output.w
=
input3.z
;
}else{
output.w
=
input3.w
;
}
}else{
output.w
=
0.0f
;
}
write_imageh
(
output,
(
int2
)(
x,
y
)
,
in
)
;
write_imageh
(
output_image,
output_pos,
output
)
;
}
...
...
src/operators/kernel/cl/cl_kernel/transpose_kernel.cl
0 → 100644
浏览文件 @
8b71275c
/*
Copyright
(
c
)
2018
PaddlePaddle
Authors.
All
Rights
Reserved.
Licensed
under
the
Apache
License,
Version
2.0
(
the
"License"
)
;
you
may
not
use
this
file
except
in
compliance
with
the
License.
You
may
obtain
a
copy
of
the
License
at
http://www.apache.org/licenses/LICENSE-2.0
Unless
required
by
applicable
law
or
agreed
to
in
writing,
software
distributed
under
the
License
is
distributed
on
an
"AS IS"
BASIS,
WITHOUT
WARRANTIES
OR
CONDITIONS
OF
ANY
KIND,
either
express
or
implied.
See
the
License
for
the
specific
language
governing
permissions
and
limitations
under
the
License.
*/
#
pragma
OPENCL
EXTENSION
cl_khr_fp16
:
enable
/*
 * transpose_4d: each work-item writes one half4 texel of the output image.
 *
 * Work-item ids:
 *   dim 0 - output channel block (four channels per texel),
 *   dim 1 - output column (out_w),
 *   dim 2 - output row (out_nh).
 *
 * For output channel c = out_c * 4 + k (k = 0..3), row h = out_nh % out_H
 * and column w = out_w, the value is component (w % 4) of the input texel at
 * (in_W * (w / 4) + h, c). Channels at or beyond out_C are written as 0.
 *
 * Parameters:
 *   input_image  - image to read from.
 *   output_image - image to write to.
 *   out_C        - number of output channels.
 *   out_H        - output height.
 *   out_W        - output width.
 *   in_W         - input width (texels per channel block in the input row).
 *
 * NOTE(review): the input row is just the output channel index with no batch
 * offset (the original multiplied it by a constant out_n of 1), so this
 * appears to assume a batch size of 1 — confirm against the host-side caller.
 */
__kernel void transpose_4d(__read_only image2d_t input_image,
                           __write_only image2d_t output_image,
                           __private const int out_C,
                           __private const int out_H,
                           __private const int out_W,
                           __private const int in_W) {
  const int out_c = get_global_id(0);
  const int out_w = get_global_id(1);
  const int out_nh = get_global_id(2);
  const int out_h = out_nh % out_H;

  // Integer texel coordinates require unnormalized coordinates with nearest
  // filtering; other sampler settings give undefined results.
  const sampler_t sampler =
      CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;

  // All four source texels share a column; their rows are consecutive.
  const int in_x = in_W * (out_w / 4) + out_h;
  const int in_y = out_c * 4;

  // Component of each source texel that holds the wanted value.
  const int lane = out_w % 4;

  // Number of valid output channels from this block's first channel on.
  const int remaining = out_C - out_c * 4;

  half4 output;

  const half4 input0 = read_imageh(input_image, sampler, (int2)(in_x, in_y));
  output.x = (lane == 0) ? input0.x
           : (lane == 1) ? input0.y
           : (lane == 2) ? input0.z
                         : input0.w;

  if (remaining >= 2) {
    const half4 input1 =
        read_imageh(input_image, sampler, (int2)(in_x, in_y + 1));
    output.y = (lane == 0) ? input1.x
             : (lane == 1) ? input1.y
             : (lane == 2) ? input1.z
                           : input1.w;
  } else {
    output.y = 0.0f;
  }

  if (remaining >= 3) {
    const half4 input2 =
        read_imageh(input_image, sampler, (int2)(in_x, in_y + 2));
    // Every component comes from input2 (the lane == 1 case previously read
    // input1 by mistake).
    output.z = (lane == 0) ? input2.x
             : (lane == 1) ? input2.y
             : (lane == 2) ? input2.z
                           : input2.w;
  } else {
    output.z = 0.0f;
  }

  if (remaining >= 4) {
    const half4 input3 =
        read_imageh(input_image, sampler, (int2)(in_x, in_y + 3));
    output.w = (lane == 0) ? input3.x
             : (lane == 1) ? input3.y
             : (lane == 2) ? input3.z
                           : input3.w;
  } else {
    output.w = 0.0f;
  }

  write_imageh(output_image, (int2)(out_c * out_W + out_w, out_nh), output);
}
\ No newline at end of file
src/operators/kernel/cl/fetch_kernel.cpp
浏览文件 @
8b71275c
...
...
@@ -22,11 +22,11 @@ namespace operators {
template
<
>
bool
FetchKernel
<
GPU_CL
,
float
>::
Init
(
FetchParam
<
GPU_CL
>
*
param
)
{
if
(
param
->
InputX
()
->
dims
().
size
()
<=
2
)
{
this
->
cl_helper_
.
AddKernel
(
"fetch_2d"
,
"fetch_kernel.cl"
);
}
else
{
this
->
cl_helper_
.
AddKernel
(
"fetch"
,
"fetch_kernel.cl"
);
}
//
if (param->InputX()->dims().size() <= 2) {
//
this->cl_helper_.AddKernel("fetch_2d", "fetch_kernel.cl");
//
} else {
this
->
cl_helper_
.
AddKernel
(
"fetch"
,
"fetch_kernel.cl"
);
//
}
return
true
;
}
...
...
@@ -49,11 +49,11 @@ void FetchKernel<GPU_CL, float>::Compute(const FetchParam<GPU_CL> &param) {
C
=
new_dims
[
1
];
in_height
=
new_dims
[
2
];
if
(
dim
.
size
()
<=
2
)
{
in_width
=
param
.
InputX
()
->
ImageWidth
();
}
else
{
in_width
=
new_dims
[
3
];
}
//
if (dim.size() <= 2) {
//
in_width = param.InputX()->ImageWidth();
//
} else {
in_width
=
new_dims
[
3
];
//
}
CLTensor
out_cl_tensor
(
this
->
cl_helper_
.
CLContext
(),
this
->
cl_helper_
.
CLCommandQueue
());
...
...
@@ -64,16 +64,16 @@ void FetchKernel<GPU_CL, float>::Compute(const FetchParam<GPU_CL> &param) {
clSetKernelArg
(
kernel
,
1
,
sizeof
(
int
),
&
in_width
);
clSetKernelArg
(
kernel
,
2
,
sizeof
(
cl_mem
),
&
input
);
clSetKernelArg
(
kernel
,
3
,
sizeof
(
cl_mem
),
&
outBuffer
);
if
(
dim
.
size
()
>
2
)
{
int
size_ch
=
in_height
*
in_width
;
int
size_block
=
size_ch
*
4
;
int
size_batch
=
size_ch
*
C
;
int
out_c
=
new_dims
[
1
];
clSetKernelArg
(
kernel
,
4
,
sizeof
(
int
),
&
size_ch
);
clSetKernelArg
(
kernel
,
5
,
sizeof
(
int
),
&
size_block
);
clSetKernelArg
(
kernel
,
6
,
sizeof
(
int
),
&
size_batch
);
clSetKernelArg
(
kernel
,
7
,
sizeof
(
int
),
&
out_c
);
}
//
if (dim.size() > 2) {
int
size_ch
=
in_height
*
in_width
;
int
size_block
=
size_ch
*
4
;
int
size_batch
=
size_ch
*
C
;
int
out_c
=
new_dims
[
1
];
clSetKernelArg
(
kernel
,
4
,
sizeof
(
int
),
&
size_ch
);
clSetKernelArg
(
kernel
,
5
,
sizeof
(
int
),
&
size_block
);
clSetKernelArg
(
kernel
,
6
,
sizeof
(
int
),
&
size_batch
);
clSetKernelArg
(
kernel
,
7
,
sizeof
(
int
),
&
out_c
);
//
}
// cl_event wait_event = param.InpdutX()->GetClEvent();
clEnqueueNDRangeKernel
(
this
->
cl_helper_
.
CLCommandQueue
(),
kernel
,
3
,
NULL
,
...
...
@@ -93,8 +93,6 @@ void FetchKernel<GPU_CL, float>::Compute(const FetchParam<GPU_CL> &param) {
// << "ms" << std::endl;
memcpy
(
out
->
data
<
float
>
(),
out_cl_tensor
.
Data
<
float
>
(),
out
->
memory_size
());
DLOG
<<
*
param
.
InputX
();
DLOG
<<
*
out
;
}
template
class
FetchKernel
<
GPU_CL
,
float
>;
...
...
src/operators/kernel/cl/prior_box_kernel.cpp
浏览文件 @
8b71275c
...
...
@@ -39,6 +39,10 @@ void PriorBoxKernel<GPU_CL, float>::Compute(
const
auto
&
input_aspect_ratio
=
param
.
AspectRatios
();
const
bool
&
flip
=
param
.
Flip
();
const
bool
&
clip
=
param
.
Clip
();
int
isclip
=
0
;
if
(
clip
)
{
isclip
=
1
;
}
const
float
&
step_w
=
param
.
StepW
();
const
float
&
step_h
=
param
.
StepH
();
const
float
&
offset
=
param
.
Offset
();
...
...
@@ -75,6 +79,8 @@ void PriorBoxKernel<GPU_CL, float>::Compute(
paddle_mobile
::
memory
::
Alloc
(
sizeof
(
float
)
*
num_priors
));
float
*
box_height
=
static_cast
<
float
*>
(
paddle_mobile
::
memory
::
Alloc
(
sizeof
(
float
)
*
num_priors
));
float
*
variancesptr
=
static_cast
<
float
*>
(
paddle_mobile
::
memory
::
Alloc
(
sizeof
(
float
)
*
4
));
int
idx
=
0
;
for
(
size_t
s
=
0
;
s
<
min_sizes
.
size
();
++
s
)
{
auto
min_size
=
min_sizes
[
s
];
...
...
@@ -108,6 +114,9 @@ void PriorBoxKernel<GPU_CL, float>::Compute(
}
}
}
for
(
int
i
=
0
;
i
<
variances
.
size
();
i
++
)
{
variancesptr
[
i
]
=
variances
[
i
];
}
cl_int
status
;
auto
kernel
=
this
->
cl_helper_
.
KernelAt
(
0
);
auto
default_work_size
=
...
...
@@ -116,7 +125,7 @@ void PriorBoxKernel<GPU_CL, float>::Compute(
int
w
=
default_work_size
[
1
];
int
nh
=
default_work_size
[
2
];
std
::
vector
<
int64_t
>
box_shape
({
1
,
1
,
1
,
num_priors
});
std
::
vector
<
int64_t
>
box_shape
({
num_priors
});
framework
::
DDim
ddim
=
framework
::
make_ddim
(
box_shape
);
framework
::
CLTensor
box_width_cl_tensor
(
this
->
cl_helper_
.
CLContext
(),
...
...
@@ -131,16 +140,33 @@ void PriorBoxKernel<GPU_CL, float>::Compute(
cl_mem
box_height_Buffer
=
box_height_cl_tensor
.
mutable_with_data
<
float
>
(
box_height
);
DLOG
<<
"c_block:"
<<
c_block
;
DLOG
<<
"w:"
<<
w
;
DLOG
<<
"nh:"
<<
nh
;
DLOG
<<
"step_width:"
<<
step_width
;
DLOG
<<
"step_height:"
<<
step_height
;
DLOG
<<
"offset:"
<<
offset
;
DLOG
<<
"img_width:"
<<
img_width
;
DLOG
<<
"img_height:"
<<
img_height
;
DLOG
<<
"num_priors:"
<<
num_priors
;
DLOG
<<
"C:"
<<
C
;
framework
::
CLTensor
variances_cl_tensor
(
this
->
cl_helper_
.
CLContext
(),
this
->
cl_helper_
.
CLCommandQueue
());
std
::
vector
<
int64_t
>
variances_shape
({
4
});
framework
::
DDim
vddim
=
framework
::
make_ddim
(
variances_shape
);
variances_cl_tensor
.
Resize
(
vddim
);
cl_mem
variances_Buffer
=
variances_cl_tensor
.
mutable_with_data
<
float
>
(
variancesptr
);
// DLOG << "c_block:" << c_block;
// DLOG << "w:" << w;
// DLOG << "nh:" << nh;
// DLOG << "step_width:" << step_width;
// DLOG << "step_height:" << step_height;
// DLOG << "offset:" << offset;
// DLOG << "img_width:" << img_width;
// DLOG << "img_height:" << img_height;
// DLOG << "num_priors:" << num_priors;
// DLOG << "C:" << C;
// DLOG << "isclip:" << isclip;
// printf("param.MinMaxAspectRatiosOrder() =
// %d\n",param.MinMaxAspectRatiosOrder()); for (int i = 0; i <
// num_priors; i++) {
// DLOG << box_width[i];
// DLOG << box_height[i];
// }
status
=
clSetKernelArg
(
kernel
,
0
,
sizeof
(
int
),
&
c_block
);
CL_CHECK_ERRORS
(
status
);
status
=
clSetKernelArg
(
kernel
,
1
,
sizeof
(
int
),
&
w
);
...
...
@@ -151,28 +177,36 @@ void PriorBoxKernel<GPU_CL, float>::Compute(
CL_CHECK_ERRORS
(
status
);
status
=
clSetKernelArg
(
kernel
,
4
,
sizeof
(
cl_mem
),
&
box_height_Buffer
);
CL_CHECK_ERRORS
(
status
);
status
=
clSetKernelArg
(
kernel
,
5
,
sizeof
(
cl_mem
),
&
output_boxes
);
status
=
clSetKernelArg
(
kernel
,
5
,
sizeof
(
cl_mem
),
&
variances_Buffer
);
CL_CHECK_ERRORS
(
status
);
status
=
clSetKernelArg
(
kernel
,
6
,
sizeof
(
float
),
&
step_width
);
status
=
clSetKernelArg
(
kernel
,
6
,
sizeof
(
cl_mem
),
&
output_boxes
);
CL_CHECK_ERRORS
(
status
);
status
=
clSetKernelArg
(
kernel
,
7
,
sizeof
(
float
),
&
step_height
);
status
=
clSetKernelArg
(
kernel
,
7
,
sizeof
(
cl_mem
),
&
output_variances
);
CL_CHECK_ERRORS
(
status
);
status
=
clSetKernelArg
(
kernel
,
8
,
sizeof
(
float
),
&
offset
);
status
=
clSetKernelArg
(
kernel
,
8
,
sizeof
(
float
),
&
step_width
);
CL_CHECK_ERRORS
(
status
);
status
=
clSetKernelArg
(
kernel
,
9
,
sizeof
(
int
),
&
img_width
);
status
=
clSetKernelArg
(
kernel
,
9
,
sizeof
(
float
),
&
step_height
);
CL_CHECK_ERRORS
(
status
);
status
=
clSetKernelArg
(
kernel
,
10
,
sizeof
(
int
),
&
img_heigh
t
);
status
=
clSetKernelArg
(
kernel
,
10
,
sizeof
(
float
),
&
offse
t
);
CL_CHECK_ERRORS
(
status
);
status
=
clSetKernelArg
(
kernel
,
11
,
sizeof
(
int
),
&
num_priors
);
status
=
clSetKernelArg
(
kernel
,
11
,
sizeof
(
int
),
&
img_width
);
CL_CHECK_ERRORS
(
status
);
status
=
clSetKernelArg
(
kernel
,
12
,
sizeof
(
int
),
&
C
);
status
=
clSetKernelArg
(
kernel
,
12
,
sizeof
(
int
),
&
img_height
);
CL_CHECK_ERRORS
(
status
);
status
=
clSetKernelArg
(
kernel
,
13
,
sizeof
(
int
),
&
num_priors
);
CL_CHECK_ERRORS
(
status
);
status
=
clSetKernelArg
(
kernel
,
14
,
sizeof
(
int
),
&
C
);
CL_CHECK_ERRORS
(
status
);
status
=
clSetKernelArg
(
kernel
,
15
,
sizeof
(
int
),
&
isclip
);
CL_CHECK_ERRORS
(
status
);
size_t
global_work_size
[
2
]
=
{
c_block
,
nh
};
status
=
clEnqueueNDRangeKernel
(
this
->
cl_helper_
.
CLCommandQueue
(),
kernel
,
2
,
NULL
,
global_work_size
,
NULL
,
0
,
NULL
,
NULL
);
CL_CHECK_ERRORS
(
status
);
paddle_mobile
::
memory
::
Free
(
box_width
);
paddle_mobile
::
memory
::
Free
(
box_height
);
paddle_mobile
::
memory
::
Free
(
variancesptr
);
}
template
class
PriorBoxKernel
<
GPU_CL
,
float
>;
...
...
src/operators/kernel/cl/reshape_kernel.cpp
浏览文件 @
8b71275c
...
...
@@ -26,40 +26,76 @@ bool ReshapeKernel<GPU_CL, float>::Init(ReshapeParam<GPU_CL> *param) {
/**
 * Launches the OpenCL reshape kernel (kernel 0 of cl_helper_) for one op.
 *
 * The input and output shapes are right-aligned into 4-element arrays
 * (missing leading dimensions stay 1). Extents and strides derived from them
 * are bound as kernel arguments:
 *   0: input image      1: output image
 *   2..4: out_C, out_H, out_W
 *   5..6: in_W, in_H
 *   7..9: in_Stride0..2      10..12: out_Stride0..2
 * Every OpenCL call is checked with CL_CHECK_ERRORS and the kernel is
 * enqueued exactly once over the default work size of the output.
 *
 * @param param reshape parameters providing InputX() and Out().
 */
template <>
void ReshapeKernel<GPU_CL, float>::Compute(const ReshapeParam<GPU_CL> &param) {
  auto kernel = this->cl_helper_.KernelAt(0);
  auto default_work_size = this->cl_helper_.DefaultWorkSize(*param.Out());
  const auto *input = param.InputX();
  auto *output = param.Out();
  auto input_image = input->GetCLImage();
  auto output_image = output->GetCLImage();
  const auto &inputDim = input->dims();
  const auto &outputDim = output->dims();

  // Right-align both shapes into 4 slots; unused leading slots remain 1.
  int input_dims[4] = {1, 1, 1, 1};
  int output_dims[4] = {1, 1, 1, 1};
  // e.g. 1 1000 1 1
  for (int i = 0; i < inputDim.size(); i++) {
    input_dims[4 - inputDim.size() + i] = inputDim[i];
  }
  // e.g. 1 1 1 1000
  for (int i = 0; i < outputDim.size(); i++) {
    output_dims[4 - outputDim.size() + i] = outputDim[i];
  }

  int out_C = output_dims[1];
  int out_H = output_dims[2];
  int out_W = output_dims[3];
  int in_W = input_dims[3];
  int in_H = input_dims[2];
  // Element strides for the W, H*W and C*H*W steps of each 4-slot shape.
  int in_Stride0 = in_W;
  int in_Stride1 = input_dims[2] * input_dims[3];
  int in_Stride2 = input_dims[1] * input_dims[2] * input_dims[3];
  int out_Stride0 = out_W;
  int out_Stride1 = out_H * out_W;
  int out_Stride2 = out_C * out_H * out_W;

  DLOG << "out_C=" << out_C;
  DLOG << "out_H=" << out_H;
  DLOG << "out_W=" << out_W;
  DLOG << "in_W=" << in_W;
  DLOG << "default_work_size=" << default_work_size;
  DLOG << "in_Stride0=" << in_Stride0;
  DLOG << "in_Stride1=" << in_Stride1;
  DLOG << "out_Stride0=" << out_Stride0;
  DLOG << "out_Stride1=" << out_Stride1;

  cl_int status;
  status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input_image);
  CL_CHECK_ERRORS(status);
  status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &output_image);
  CL_CHECK_ERRORS(status);
  status = clSetKernelArg(kernel, 2, sizeof(int), &out_C);
  CL_CHECK_ERRORS(status);
  status = clSetKernelArg(kernel, 3, sizeof(int), &out_H);
  CL_CHECK_ERRORS(status);
  status = clSetKernelArg(kernel, 4, sizeof(int), &out_W);
  CL_CHECK_ERRORS(status);
  status = clSetKernelArg(kernel, 5, sizeof(int), &in_W);
  CL_CHECK_ERRORS(status);
  status = clSetKernelArg(kernel, 6, sizeof(int), &in_H);
  CL_CHECK_ERRORS(status);
  status = clSetKernelArg(kernel, 7, sizeof(int), &in_Stride0);
  CL_CHECK_ERRORS(status);
  status = clSetKernelArg(kernel, 8, sizeof(int), &in_Stride1);
  CL_CHECK_ERRORS(status);
  status = clSetKernelArg(kernel, 9, sizeof(int), &in_Stride2);
  CL_CHECK_ERRORS(status);
  status = clSetKernelArg(kernel, 10, sizeof(int), &out_Stride0);
  CL_CHECK_ERRORS(status);
  status = clSetKernelArg(kernel, 11, sizeof(int), &out_Stride1);
  CL_CHECK_ERRORS(status);
  status = clSetKernelArg(kernel, 12, sizeof(int), &out_Stride2);
  CL_CHECK_ERRORS(status);

  status = clEnqueueNDRangeKernel(
      this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(), NULL,
      default_work_size.data(), NULL, 0, NULL, NULL);
  CL_CHECK_ERRORS(status);
}
// Explicit instantiation of ReshapeKernel for the GPU_CL device type with
// float data, so the specialization's code is emitted in this translation unit.
template
class
ReshapeKernel
<
GPU_CL
,
float
>;
...
...
src/operators/kernel/cl/transpose_kernel.cpp
浏览文件 @
8b71275c
...
...
@@ -20,12 +20,48 @@ namespace operators {
/**
 * Registers the OpenCL kernel used by the transpose op.
 *
 * The "transpose_4d" kernel from "transpose_kernel.cl" is added only when the
 * output tensor has exactly four dimensions; for any other rank no kernel is
 * registered.
 *
 * @param param transpose parameters; only Out()->dims() is inspected.
 * @return always true.
 */
template <>
bool TransposeKernel<GPU_CL, float>::Init(TransposeParam<GPU_CL> *param) {
  const bool output_is_4d = param->Out()->dims().size() == 4;
  if (output_is_4d) {
    this->cl_helper_.AddKernel("transpose_4d", "transpose_kernel.cl");
  }
  return true;
}
/**
 * Launches the OpenCL 4-D transpose kernel (kernel 0 of cl_helper_).
 *
 * Work is done only when the output tensor has exactly four dimensions; for
 * any other rank this function does nothing. Kernel arguments:
 *   0: input image   1: output image
 *   2..4: out_C, out_H, out_W (output dims 1..3)
 *   5: in_W (input dim 3)
 * Every OpenCL call is checked with CL_CHECK_ERRORS.
 *
 * @param param transpose parameters providing InputX() and Out().
 */
template <>
void TransposeKernel<GPU_CL, float>::Compute(
    const TransposeParam<GPU_CL> &param) {
  if (param.Out()->dims().size() == 4) {
    auto kernel = this->cl_helper_.KernelAt(0);
    auto default_work_size = this->cl_helper_.DefaultWorkSize(*param.Out());
    int out_C = param.Out()->dims()[1];
    int out_H = param.Out()->dims()[2];
    int out_W = param.Out()->dims()[3];
    int in_W = param.InputX()->dims()[3];
    auto output_image = param.Out()->GetCLImage();
    auto input_image = param.InputX()->GetCLImage();

    DLOG << "out_C=" << out_C;
    DLOG << "out_H=" << out_H;
    DLOG << "out_W=" << out_W;
    // The logged value is the input width, so label it as such.
    DLOG << "in_W=" << in_W;
    DLOG << "default_work_size=" << default_work_size;

    cl_int status;
    status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input_image);
    CL_CHECK_ERRORS(status);
    status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &output_image);
    CL_CHECK_ERRORS(status);
    status = clSetKernelArg(kernel, 2, sizeof(int), &out_C);
    CL_CHECK_ERRORS(status);
    status = clSetKernelArg(kernel, 3, sizeof(int), &out_H);
    CL_CHECK_ERRORS(status);
    status = clSetKernelArg(kernel, 4, sizeof(int), &out_W);
    CL_CHECK_ERRORS(status);
    status = clSetKernelArg(kernel, 5, sizeof(int), &in_W);
    CL_CHECK_ERRORS(status);

    status = clEnqueueNDRangeKernel(
        this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(),
        NULL, default_work_size.data(), NULL, 0, NULL, NULL);
    CL_CHECK_ERRORS(status);
  }
}
}
// namespace operators
}
// namespace paddle_mobile
...
...
src/operators/op_param.h
浏览文件 @
8b71275c
...
...
@@ -849,6 +849,8 @@ class PriorBoxParam : public OpParam {
if
(
HasAttr
(
"min_max_aspect_ratios_order"
,
attrs
))
{
min_max_aspect_ratios_order_
=
GetAttr
<
bool
>
(
"min_max_aspect_ratios_order"
,
attrs
);
}
else
{
min_max_aspect_ratios_order_
=
false
;
}
flip_
=
GetAttr
<
bool
>
(
"flip"
,
attrs
);
clip_
=
GetAttr
<
bool
>
(
"clip"
,
attrs
);
...
...
test/CMakeLists.txt
浏览文件 @
8b71275c
...
...
@@ -366,5 +366,8 @@ if (NOT FOUND_MATCH)
ADD_EXECUTABLE
(
test-eng net/test_eng.cpp test_helper.h test_include.h
)
target_link_libraries
(
test-eng paddle-mobile
)
# gen test
ADD_EXECUTABLE
(
test-super net/test_super.cpp test_helper.h test_include.h
)
target_link_libraries
(
test-super paddle-mobile
)
#add_library(test-lib-size SHARED common/test_lib_size.h common/test_lib_size.cpp)
endif
()
test/net/test_super.cpp
0 → 100644
浏览文件 @
8b71275c
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <iostream>
#include "../../src/common/types.h"
#include "../test_helper.h"
#include "../test_include.h"
// Smoke/benchmark test for the GPU_CL backend: loads the model under g_super,
// runs Predict 10 times on a 1x1x300x300 input read from g_yolo_img, and
// prints the load time, the average predict time and the largest output value.
// Always returns 0, including when loading fails.
int main() {
  paddle_mobile::PaddleMobile<paddle_mobile::GPU_CL> paddle_mobile;
  //    paddle_mobile.SetThreadNum(4);
  auto time1 = paddle_mobile::time();
#ifdef PADDLE_MOBILE_CL
  paddle_mobile.SetCLPath("/data/local/tmp/bin");
#endif

  auto isok = paddle_mobile.Load(std::string(g_super) + "/model",
                                 std::string(g_super) + "/params", true, false,
                                 1, true);

  //  auto isok = paddle_mobile.Load(std::string(g_mobilenet_mul), true);
  if (isok) {
    auto time2 = paddle_mobile::time();
    std::cout << "load cost :" << paddle_mobile::time_diff(time1, time2) << "ms"
              << std::endl;

    std::vector<float> input;
    std::vector<int64_t> dims{1, 1, 300, 300};
    GetInput<float>(g_yolo_img, &input, dims);

    std::vector<float> vec_result;

    auto time3 = paddle_mobile::time();
    int max = 10;
    for (int i = 0; i < max; ++i) {
      vec_result = paddle_mobile.Predict(input, dims);
    }
    auto time4 = paddle_mobile::time();

    std::cout << "predict cost :"
              << paddle_mobile::time_diff(time3, time4) / max << "ms"
              << std::endl;

    // max_element returns end() for an empty range; dereferencing that is
    // undefined behaviour, so only report a maximum when there is output.
    if (vec_result.empty()) {
      std::cout << " Predict returned an empty result" << std::endl;
    } else {
      std::vector<float>::iterator biggest =
          std::max_element(std::begin(vec_result), std::end(vec_result));
      std::cout << " Max element is " << *biggest << " at position "
                << std::distance(std::begin(vec_result), biggest) << std::endl;
    }
  }

  std::cout << "如果结果Nan请查看: test/images/g_test_image_1x3x224x224_banana "
               "是否存在?"
            << std::endl;
  return 0;
}
test/test_helper.h
浏览文件 @
8b71275c
...
...
@@ -36,16 +36,19 @@ static const char *g_squeezenet = "../models/squeezenet";
// Relative paths to model directories and test images used by the tests.
static const char *g_googlenet = "../models/googlenet";
static const char *g_googlenet_quali = "../models/googlenet_combine_quali";
static const char *g_mobilenet = "../models/mobilenet";
// NOTE(review): two definitions of g_mobilenet_mul were present
// ("../models/mobilenet_mul" and "../models/r"); the later one is kept so the
// symbol is defined once. Confirm this is the intended path.
static const char *g_mobilenet_mul = "../models/r";
static const char *g_alexnet = "../models/alexnet";
static const char *g_inceptionv4 = "../models/inceptionv4";
static const char *g_inceptionv3 =
    "../models/InceptionV3_Spatial_Attention_Model";
static const char *g_nlp = "../models/nlp";
static const char *g_super = "../models/superresoltion";
static const char *g_resnet_50 = "../models/resnet_50";
static const char *g_resnet = "../models/resnet";
static const char *g_googlenet_combine = "../models/googlenet_combine";
static const char *g_yolo = "../models/yolo";
static const char *g_yolo_combined = "../models/yolo_combined";
// NOTE(review): two definitions of g_yolo_mul were present
// ("../models/yolo_mul" and "../models/d"); the later one is kept so the
// symbol is defined once. Confirm this is the intended path.
static const char *g_yolo_mul = "../models/d";
static const char *g_fluid_fssd_new = "../models/fluid_fssd_new";
static const char *g_test_image_1x3x224x224 =
    "../images/test_image_1x3x224x224_float";
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录