Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
Paddle-Lite
提交
8c4a035f
P
Paddle-Lite
项目概览
PaddlePaddle
/
Paddle-Lite
通知
338
Star
4
Fork
1
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
271
列表
看板
标记
里程碑
合并请求
78
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle-Lite
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
271
Issue
271
列表
看板
标记
里程碑
合并请求
78
合并请求
78
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
8c4a035f
编写于
11月 19, 2018
作者:
qnqinan
浏览文件
操作
浏览文件
下载
差异文件
Merge branch 'develop' of
https://github.com/PaddlePaddle/paddle-mobile
into develop
上级
e75f76b4
672caa36
变更
15
隐藏空白更改
内联
并排
Showing
15 changed file
with
635 addition
and
25 deletion
+635
-25
src/framework/executor.cpp
src/framework/executor.cpp
+3
-0
src/framework/loader.cpp
src/framework/loader.cpp
+12
-6
src/io/jni/paddle_mobile_jni.cpp
src/io/jni/paddle_mobile_jni.cpp
+1
-1
src/io/paddle_mobile.cpp
src/io/paddle_mobile.cpp
+8
-4
src/operators/feed_op.cpp
src/operators/feed_op.cpp
+7
-1
src/operators/kernel/central-arm-func/conv_add_arm_func.h
src/operators/kernel/central-arm-func/conv_add_arm_func.h
+1
-0
src/operators/kernel/cl/cl_kernel/conv_bn_relu_kernel.cl
src/operators/kernel/cl/cl_kernel/conv_bn_relu_kernel.cl
+18
-0
src/operators/kernel/cl/cl_kernel/fetch_kernel.cl
src/operators/kernel/cl/cl_kernel/fetch_kernel.cl
+12
-3
src/operators/kernel/cl/cl_kernel/prior_box_kernel.cl
src/operators/kernel/cl/cl_kernel/prior_box_kernel.cl
+100
-0
src/operators/kernel/cl/conv_add_kernel.cpp
src/operators/kernel/cl/conv_add_kernel.cpp
+3
-3
src/operators/kernel/cl/conv_bn_relu_kernel.cpp
src/operators/kernel/cl/conv_bn_relu_kernel.cpp
+174
-1
src/operators/kernel/cl/dwconv_bn_relu_kernel.cpp
src/operators/kernel/cl/dwconv_bn_relu_kernel.cpp
+140
-1
src/operators/kernel/cl/feed_kernel.cpp
src/operators/kernel/cl/feed_kernel.cpp
+2
-0
src/operators/kernel/cl/fetch_kernel.cpp
src/operators/kernel/cl/fetch_kernel.cpp
+5
-3
src/operators/kernel/cl/prior_box_kernel.cpp
src/operators/kernel/cl/prior_box_kernel.cpp
+149
-2
未找到文件。
src/framework/executor.cpp
浏览文件 @
8c4a035f
...
@@ -284,6 +284,9 @@ std::shared_ptr<framework::Tensor> Executor<Dtype, P>::Predict(
...
@@ -284,6 +284,9 @@ std::shared_ptr<framework::Tensor> Executor<Dtype, P>::Predict(
clock_gettime
(
CLOCK_MONOTONIC
,
&
ts
);
clock_gettime
(
CLOCK_MONOTONIC
,
&
ts
);
profile
[
i
].
runBegin
=
(
uint64_t
)
ts
.
tv_sec
*
1e9
+
ts
.
tv_nsec
;
profile
[
i
].
runBegin
=
(
uint64_t
)
ts
.
tv_sec
*
1e9
+
ts
.
tv_nsec
;
#endif
#endif
if
(
loddable_
)
{
ops
[
i
]
->
InferShape
();
}
// to Run
// to Run
ops
[
i
]
->
Run
();
ops
[
i
]
->
Run
();
#ifdef PADDLE_MOBILE_PROFILE
#ifdef PADDLE_MOBILE_PROFILE
...
...
src/framework/loader.cpp
浏览文件 @
8c4a035f
...
@@ -43,15 +43,21 @@ void Loader<Dtype, P>::InitMemoryFromProgram(
...
@@ -43,15 +43,21 @@ void Loader<Dtype, P>::InitMemoryFromProgram(
tensor
->
Resize
(
make_ddim
(
dim
));
tensor
->
Resize
(
make_ddim
(
dim
));
}
else
{
}
else
{
auto
dim
=
var_desc
->
Tensor_desc
().
Dims
();
auto
dim
=
var_desc
->
Tensor_desc
().
Dims
();
PADDLE_MOBILE_ENFORCE
(
dim
.
size
()
>
0
,
"dim size is 0"
);
//
PADDLE_MOBILE_ENFORCE(dim.size() > 0, "dim size is 0");
// dim[0] = 1;
// dim[0] = 1;
for
(
auto
&
d
:
dim
)
{
if
(
dim
.
size
()
==
0
)
{
if
(
d
<
0
)
{
auto
tensor
=
var
->
GetMutable
<
LoDTensor
>
();
d
*=
-
1
;
framework
::
DDim
dDim
=
{
0
};
tensor
->
Resize
(
dDim
);
}
else
{
for
(
auto
&
d
:
dim
)
{
if
(
d
<
0
)
{
d
*=
-
1
;
}
}
}
auto
tensor
=
var
->
GetMutable
<
LoDTensor
>
();
tensor
->
Resize
(
make_ddim
(
dim
));
}
}
auto
tensor
=
var
->
GetMutable
<
LoDTensor
>
();
tensor
->
Resize
(
make_ddim
(
dim
));
}
}
}
else
{
}
else
{
// TODO(codeWorm): some.
// TODO(codeWorm): some.
...
...
src/io/jni/paddle_mobile_jni.cpp
浏览文件 @
8c4a035f
...
@@ -405,9 +405,9 @@ Java_com_baidu_paddle_PML_predictLod(JNIEnv *env, jclass thiz, jlongArray buf) {
...
@@ -405,9 +405,9 @@ Java_com_baidu_paddle_PML_predictLod(JNIEnv *env, jclass thiz, jlongArray buf) {
ANDROIDLOGE
(
"predict nlp size %d"
,
count
);
ANDROIDLOGE
(
"predict nlp size %d"
,
count
);
result
=
env
->
NewLongArray
(
count
);
result
=
env
->
NewLongArray
(
count
);
env
->
SetLongArrayRegion
(
result
,
0
,
count
,
vec_result
->
data
<
int64_t
>
());
env
->
SetLongArrayRegion
(
result
,
0
,
count
,
vec_result
->
data
<
int64_t
>
());
env
->
ReleaseLongArrayElements
(
buf
,
ddim_ptr
,
0
);
return
result
;
return
result
;
}
}
...
...
src/io/paddle_mobile.cpp
浏览文件 @
8c4a035f
...
@@ -123,9 +123,12 @@ void PaddleMobile<Dtype, P>::Clear() {
...
@@ -123,9 +123,12 @@ void PaddleMobile<Dtype, P>::Clear() {
executor_
=
nullptr
;
executor_
=
nullptr
;
loader_
=
nullptr
;
loader_
=
nullptr
;
}
}
template
<
typename
Dtype
,
Precision
P
>
template
<
typename
Dtype
,
Precision
P
>
double
PaddleMobile
<
Dtype
,
P
>::
GetPredictTime
()
{
double
PaddleMobile
<
Dtype
,
P
>::
GetPredictTime
()
{}
#ifdef PADDLE_MOBILE_CPU
template
<
>
double
PaddleMobile
<
CPU
,
Precision
::
FP32
>::
GetPredictTime
()
{
int
m
=
32
;
int
m
=
32
;
int
n
=
224
*
224
;
int
n
=
224
*
224
;
int
k
=
27
;
int
k
=
27
;
...
@@ -148,8 +151,8 @@ double PaddleMobile<Dtype, P>::GetPredictTime() {
...
@@ -148,8 +151,8 @@ double PaddleMobile<Dtype, P>::GetPredictTime() {
}
}
paddle_mobile
::
operators
::
math
::
Gemm
gemm
;
paddle_mobile
::
operators
::
math
::
Gemm
gemm
;
auto
time1
=
paddle_mobile
::
time
();
auto
time1
=
paddle_mobile
::
time
();
//
gemm.Sgemm(m, n, k, static_cast<float>(1), a, lda, b, ldb,
gemm
.
Sgemm
(
m
,
n
,
k
,
static_cast
<
float
>
(
1
),
a
,
lda
,
b
,
ldb
,
//
static_cast<float>(0), c, ldc, false, nullptr);
static_cast
<
float
>
(
0
),
c
,
ldc
,
false
,
nullptr
);
auto
time2
=
paddle_mobile
::
time
();
auto
time2
=
paddle_mobile
::
time
();
double
cost
=
paddle_mobile
::
time_diff
(
time1
,
time2
);
double
cost
=
paddle_mobile
::
time_diff
(
time1
,
time2
);
paddle_mobile
::
memory
::
Free
(
a
);
paddle_mobile
::
memory
::
Free
(
a
);
...
@@ -157,6 +160,7 @@ double PaddleMobile<Dtype, P>::GetPredictTime() {
...
@@ -157,6 +160,7 @@ double PaddleMobile<Dtype, P>::GetPredictTime() {
paddle_mobile
::
memory
::
Free
(
c
);
paddle_mobile
::
memory
::
Free
(
c
);
return
cost
;
return
cost
;
}
}
#endif
template
<
typename
Dtype
,
Precision
P
>
template
<
typename
Dtype
,
Precision
P
>
PaddleMobile
<
Dtype
,
P
>::~
PaddleMobile
()
{
PaddleMobile
<
Dtype
,
P
>::~
PaddleMobile
()
{
...
...
src/operators/feed_op.cpp
浏览文件 @
8c4a035f
...
@@ -21,7 +21,13 @@ template <typename DeviceType, typename T>
...
@@ -21,7 +21,13 @@ template <typename DeviceType, typename T>
void
FeedOp
<
DeviceType
,
T
>::
InferShape
()
const
{
void
FeedOp
<
DeviceType
,
T
>::
InferShape
()
const
{
auto
out_dims
=
this
->
param_
.
Out
()
->
dims
();
auto
out_dims
=
this
->
param_
.
Out
()
->
dims
();
out_dims
[
0
]
=
this
->
param_
.
BatchSize
();
out_dims
[
0
]
=
this
->
param_
.
BatchSize
();
this
->
param_
.
Out
()
->
Resize
(
out_dims
);
auto
input_dims
=
this
->
param_
.
InputX
()
->
dims
();
DLOG
<<
input_dims
.
size
();
if
(
input_dims
.
size
()
==
4
)
{
this
->
param_
.
Out
()
->
Resize
(
input_dims
);
}
else
{
this
->
param_
.
Out
()
->
Resize
(
out_dims
);
}
}
}
}
// namespace operators
}
// namespace operators
...
...
src/operators/kernel/central-arm-func/conv_add_arm_func.h
浏览文件 @
8c4a035f
...
@@ -115,6 +115,7 @@ void ConvAddBasic(const FusionConvAddParam<CPU> ¶m) {
...
@@ -115,6 +115,7 @@ void ConvAddBasic(const FusionConvAddParam<CPU> ¶m) {
template
<
typename
P
>
template
<
typename
P
>
void
ConvAddCompute
(
const
FusionConvAddParam
<
CPU
>
&
param
)
{
void
ConvAddCompute
(
const
FusionConvAddParam
<
CPU
>
&
param
)
{
param
.
Output
()
->
mutable_data
<
float
>
();
if
(
param
.
Groups
()
==
param
.
Input
()
->
dims
()[
1
]
&&
if
(
param
.
Groups
()
==
param
.
Input
()
->
dims
()[
1
]
&&
param
.
Input
()
->
dims
()[
1
]
==
param
.
Output
()
->
dims
()[
1
]
&&
param
.
Input
()
->
dims
()[
1
]
==
param
.
Output
()
->
dims
()[
1
]
&&
param
.
Filter
()
->
dims
()[
2
]
==
param
.
Filter
()
->
dims
()[
3
]
&&
param
.
Filter
()
->
dims
()[
2
]
==
param
.
Filter
()
->
dims
()[
3
]
&&
...
...
src/operators/kernel/cl/cl_kernel/conv_bn_relu_kernel.cl
0 → 100644
浏览文件 @
8c4a035f
/*
Copyright
(
c
)
2018
PaddlePaddle
Authors.
All
Rights
Reserved.
Licensed
under
the
Apache
License,
Version
2.0
(
the
"License"
)
;
you
may
not
use
this
file
except
in
compliance
with
the
License.
You
may
obtain
a
copy
of
the
License
at
http://www.apache.org/licenses/LICENSE-2.0
Unless
required
by
applicable
law
or
agreed
to
in
writing,
software
distributed
under
the
License
is
distributed
on
an
"AS IS"
BASIS,
WITHOUT
WARRANTIES
OR
CONDITIONS
OF
ANY
KIND,
either
express
or
implied.
See
the
License
for
the
specific
language
governing
permissions
and
limitations
under
the
License.
*/
#
define
BATCH_NORM
#
define
RELU
#
include
"conv_kernel.inc.cl"
src/operators/kernel/cl/cl_kernel/fetch_kernel.cl
浏览文件 @
8c4a035f
...
@@ -20,7 +20,8 @@ __kernel void fetch(__private const int in_height,
...
@@ -20,7 +20,8 @@ __kernel void fetch(__private const int in_height,
__global
float*
out,
__global
float*
out,
__private
const
int
size_ch,
__private
const
int
size_ch,
__private
const
int
size_block,
__private
const
int
size_block,
__private
const
int
size_batch
)
{
__private
const
int
size_batch,
__private
const
int
C
)
{
const
int
in_c
=
get_global_id
(
0
)
;
const
int
in_c
=
get_global_id
(
0
)
;
const
int
in_w
=
get_global_id
(
1
)
;
const
int
in_w
=
get_global_id
(
1
)
;
const
int
in_nh
=
get_global_id
(
2
)
;
const
int
in_nh
=
get_global_id
(
2
)
;
...
@@ -35,9 +36,17 @@ __kernel void fetch(__private const int in_height,
...
@@ -35,9 +36,17 @@ __kernel void fetch(__private const int in_height,
const
int
index
=
in_n
*
size_batch
+
in_c
*
size_block
+
in_h
*
in_width
+
in_w
;
const
int
index
=
in_n
*
size_batch
+
in_c
*
size_block
+
in_h
*
in_width
+
in_w
;
out[index]
=
convert_float
(
in.x
)
;
out[index]
=
convert_float
(
in.x
)
;
out[index
+
size_ch]
=
convert_float
(
in.y
)
;
if
(
C
-
4
*
in_c>=2
)
{
out[index
+
size_ch]
=
convert_float
(
in.y
)
;
}
if
(
C
-
4
*
in_c>=3
)
{
out[index
+
size_ch
*
2]
=
convert_float
(
in.z
)
;
out[index
+
size_ch
*
2]
=
convert_float
(
in.z
)
;
out[index
+
size_ch
*
3]
=
convert_float
(
in.w
)
;
}
if
(
C
-
4
*
in_c>=4
)
{
out[index
+
size_ch
*
3]
=
convert_float
(
in.w
)
;
}
}
}
__kernel
void
fetch_2d
(
__private
const
int
in_height,
__kernel
void
fetch_2d
(
__private
const
int
in_height,
...
...
src/operators/kernel/cl/cl_kernel/prior_box_kernel.cl
0 → 100644
浏览文件 @
8c4a035f
/*
Copyright
(
c
)
2018
PaddlePaddle
Authors.
All
Rights
Reserved.
Licensed
under
the
Apache
License,
Version
2.0
(
the
"License"
)
;
you
may
not
use
this
file
except
in
compliance
with
the
License.
You
may
obtain
a
copy
of
the
License
at
http://www.apache.org/licenses/LICENSE-2.0
Unless
required
by
applicable
law
or
agreed
to
in
writing,
software
distributed
under
the
License
is
distributed
on
an
"AS IS"
BASIS,
WITHOUT
WARRANTIES
OR
CONDITIONS
OF
ANY
KIND,
either
express
or
implied.
See
the
License
for
the
specific
language
governing
permissions
and
limitations
under
the
License.
*/
#
pragma
OPENCL
EXTENSION
cl_khr_fp16
:
enable
/*
 * prior_box: writes normalized, clamped prior-box corner coordinates into
 * output_image.
 *
 * Each work-item is identified by (out_c, out_nh) = (get_global_id(0),
 * get_global_id(1)). out_nh is split into out_n = out_nh / num_priors (used
 * for the box centre along y) and out_h = out_nh % num_priors (used to index
 * box_width / box_height). One work-item covers up to four consecutive centre
 * positions along x (out_c * 4 + 0..3), one per half4 lane.
 *
 * Four texels are produced per work-item:
 *   output[0] = (center_x - box_width [out_h]) / img_width
 *   output[1] = (center_y - box_height[out_h]) / img_height
 *   output[2] = (center_x + box_width [out_h]) / img_width
 *   output[3] = (center_y + box_height[out_h]) / img_height
 * each clamped to [0, 1]. Lanes beyond the number of remaining channels
 * (C - 4 * out_c) are set to 0.
 *
 * Parameters:
 *   global_size_dim0/2 - bounds used by the early-exit guard.
 *   global_size_dim1   - not read by this kernel.
 *   box_width/height   - per-prior extents added to / subtracted from the
 *                        centre, indexed by out_h.
 *   step_width/height  - multipliers turning a position index into a centre.
 *   offset             - added to the position index before scaling.
 *   img_width/height   - divisors used to normalize the coordinates.
 *   num_priors         - divisor used to split out_nh into (out_n, out_h).
 *   C                  - total channel count; decides how many lanes are live.
 *
 * NOTE(review): the guard compares get_global_id(1) against global_size_dim2
 * and the texels are written at x offsets +1..+4 (not +0..+3). Both are kept
 * exactly as in the original; confirm against the host-side launch code and
 * the expected image layout.
 */
__kernel void prior_box(__private const int global_size_dim0,
                        __private const int global_size_dim1,
                        __private const int global_size_dim2,
                        __global float *box_width,
                        __global float *box_height,
                        __write_only image2d_t output_image,
                        __private const float step_width,
                        __private const float step_height,
                        __private const float offset,
                        __private const int img_width,
                        __private const int img_height,
                        __private const int num_priors,
                        __private const int C) {
  const int out_c = get_global_id(0);
  const int out_nh = get_global_id(1);
  const int out_n = out_nh / num_priors;
  const int out_h = out_nh % num_priors;

  if (out_c >= global_size_dim0 || out_nh >= global_size_dim2) {
    return;
  }

  int2 output_pos;
  output_pos.x = out_c * 4;
  output_pos.y = out_nh;

  /* Centres of the (up to) four x positions handled by this work-item. */
  float center_x0 = (offset + out_c * 4) * step_width;
  float center_x1 = (offset + out_c * 4 + 1) * step_width;
  float center_x2 = (offset + out_c * 4 + 2) * step_width;
  float center_x3 = (offset + out_c * 4 + 3) * step_width;
  float center_y = (out_n + offset) * step_height;

  /* Number of channels still to be produced starting at lane x. */
  const int remaining = C - 4 * out_c;

  half4 output[4];

  /* Lane x is always live. */
  output[0].x = convert_half((center_x0 - box_width[out_h]) / img_width);
  output[1].x = convert_half((center_y - box_height[out_h]) / img_height);
  output[2].x = convert_half((center_x0 + box_width[out_h]) / img_width);
  output[3].x = convert_half((center_y + box_height[out_h]) / img_height);

  if (remaining >= 2) {
    output[0].y = convert_half((center_x1 - box_width[out_h]) / img_width);
    output[1].y = convert_half((center_y - box_height[out_h]) / img_height);
    output[2].y = convert_half((center_x1 + box_width[out_h]) / img_width);
    output[3].y = convert_half((center_y + box_height[out_h]) / img_height);
  } else {
    output[0].y = 0.0f;
    output[1].y = 0.0f;
    output[2].y = 0.0f;
    output[3].y = 0.0f;
  }

  if (remaining >= 3) {
    output[0].z = convert_half((center_x2 - box_width[out_h]) / img_width);
    output[1].z = convert_half((center_y - box_height[out_h]) / img_height);
    output[2].z = convert_half((center_x2 + box_width[out_h]) / img_width);
    output[3].z = convert_half((center_y + box_height[out_h]) / img_height);
  } else {
    output[0].z = 0.0f;
    output[1].z = 0.0f;
    output[2].z = 0.0f;
    output[3].z = 0.0f;
  }

  if (remaining >= 4) {
    output[0].w = convert_half((center_x3 - box_width[out_h]) / img_width);
    output[1].w = convert_half((center_y - box_height[out_h]) / img_height);
    output[2].w = convert_half((center_x3 + box_width[out_h]) / img_width);
    output[3].w = convert_half((center_y + box_height[out_h]) / img_height);
  } else {
    /* Fix: zero the .w lanes here. The original zeroed .z a second time,
     * which left .w uninitialized and wiped a valid .z when exactly three
     * channels remained. */
    output[0].w = 0.0f;
    output[1].w = 0.0f;
    output[2].w = 0.0f;
    output[3].w = 0.0f;
  }

  /* Clamp every coordinate to the [0, 1] range. */
  output[0] = min(max((half4)(0.0f, 0.0f, 0.0f, 0.0f), output[0]),
                  (half4)(1.0f, 1.0f, 1.0f, 1.0f));
  output[1] = min(max((half4)(0.0f, 0.0f, 0.0f, 0.0f), output[1]),
                  (half4)(1.0f, 1.0f, 1.0f, 1.0f));
  output[2] = min(max((half4)(0.0f, 0.0f, 0.0f, 0.0f), output[2]),
                  (half4)(1.0f, 1.0f, 1.0f, 1.0f));
  output[3] = min(max((half4)(0.0f, 0.0f, 0.0f, 0.0f), output[3]),
                  (half4)(1.0f, 1.0f, 1.0f, 1.0f));

  write_imageh(output_image, (int2)(output_pos.x + 1, output_pos.y),
               output[0]);
  write_imageh(output_image, (int2)(output_pos.x + 2, output_pos.y),
               output[1]);
  write_imageh(output_image, (int2)(output_pos.x + 3, output_pos.y),
               output[2]);
  write_imageh(output_image, (int2)(output_pos.x + 4, output_pos.y),
               output[3]);
}
\ No newline at end of file
src/operators/kernel/cl/conv_add_kernel.cpp
浏览文件 @
8c4a035f
...
@@ -68,10 +68,10 @@ void ConvAddKernel<GPU_CL, float>::Compute(
...
@@ -68,10 +68,10 @@ void ConvAddKernel<GPU_CL, float>::Compute(
int
nh
=
default_work_size
[
2
];
int
nh
=
default_work_size
[
2
];
auto
input
=
param
.
Input
()
->
GetCLImage
();
auto
input
=
param
.
Input
()
->
GetCLImage
();
auto
filter
=
param
.
Filter
()
->
GetCLImage
();
auto
filter
=
param
.
Filter
()
->
GetCLImage
();
DLOG
<<
"---yangfei30---"
;
DLOG
<<
*
param
.
Filter
();
DLOG
<<
param
.
Paddings
();
auto
biase
=
param
.
Bias
()
->
GetCLImage
();
auto
biase
=
param
.
Bias
()
->
GetCLImage
();
param
.
Output
()
->
InitEmptyImage
(
cl_helper_
.
CLContext
(),
cl_helper_
.
CLCommandQueue
(),
param
.
Output
()
->
dims
());
auto
output
=
param
.
Output
()
->
GetCLImage
();
auto
output
=
param
.
Output
()
->
GetCLImage
();
int
stride
=
param
.
Strides
()[
0
];
int
stride
=
param
.
Strides
()[
0
];
int
offset
=
param
.
Offset
();
int
offset
=
param
.
Offset
();
...
...
src/operators/kernel/cl/conv_bn_relu_kernel.cpp
浏览文件 @
8c4a035f
...
@@ -22,12 +22,185 @@ namespace operators {
...
@@ -22,12 +22,185 @@ namespace operators {
template
<
>
template
<
>
bool
ConvBNReluKernel
<
GPU_CL
,
float
>::
Init
(
bool
ConvBNReluKernel
<
GPU_CL
,
float
>::
Init
(
FusionConvBNReluParam
<
GPU_CL
>
*
param
)
{
FusionConvBNReluParam
<
GPU_CL
>
*
param
)
{
PADDLE_MOBILE_ENFORCE
(
param
->
Filter
()
->
dims
()[
2
]
==
param
->
Filter
()
->
dims
()[
3
]
&&
param
->
Paddings
()[
0
]
==
param
->
Paddings
()[
1
],
"need equal"
);
const
framework
::
CLImage
*
mean
=
param
->
InputMean
();
const
framework
::
CLImage
*
variance
=
param
->
InputVariance
();
const
framework
::
CLImage
*
scale
=
param
->
InputScale
();
const
framework
::
CLImage
*
bias
=
param
->
InputBias
();
const
float
epsilon
=
param
->
Epsilon
();
const
int
C
=
mean
->
numel
();
auto
mean_ptr
=
mean
->
data
<
float
>
();
auto
variance_ptr
=
variance
->
data
<
float
>
();
auto
scale_ptr
=
scale
->
data
<
float
>
();
auto
bias_ptr
=
bias
->
data
<
float
>
();
float
inv_std_ptr
[
C
];
for
(
int
i
=
0
;
i
<
C
;
i
++
)
{
inv_std_ptr
[
i
]
=
1
/
static_cast
<
float
>
(
pow
((
variance_ptr
[
i
]
+
epsilon
),
0.5
));
}
float
*
new_scale_ptr
=
new
float
[
C
];
float
*
new_bias_ptr
=
new
float
[
C
];
for
(
int
i
=
0
;
i
<
C
;
i
++
)
{
new_scale_ptr
[
i
]
=
inv_std_ptr
[
i
]
*
scale_ptr
[
i
];
new_bias_ptr
[
i
]
=
bias_ptr
[
i
]
-
mean_ptr
[
i
]
*
inv_std_ptr
[
i
]
*
scale_ptr
[
i
];
}
framework
::
CLImage
*
new_scale
=
new
framework
::
CLImage
();
// for (int j = 0; j < C; ++j) {
// DLOG << " new scale - " << j << new_scale_ptr[j];
// }
//
// for (int j = 0; j < C; ++j) {
// DLOG << " new bias - " << j << new_bias_ptr[j];
// }
new_scale
->
SetTensorData
(
new_scale_ptr
,
variance
->
dims
());
new_scale
->
InitCLImage
(
this
->
cl_helper_
.
CLContext
(),
cl_helper_
.
CLCommandQueue
());
// DLOG << " climage - y bias: " << *(param->Bias());
//
// DLOG << " climage - new scale: " << *new_scale;
framework
::
CLImage
*
new_bias
=
new
framework
::
CLImage
();
new_bias
->
SetTensorData
(
new_bias_ptr
,
variance
->
dims
());
new_bias
->
InitCLImage
(
this
->
cl_helper_
.
CLContext
(),
cl_helper_
.
CLCommandQueue
());
// DLOG << " climage - new bias: " << *new_bias;
//
// DLOG << " climage - filter: " << *(param->Filter());
param
->
SetNewScale
(
new_scale
);
param
->
SetNewBias
(
new_bias
);
delete
[](
new_scale_ptr
);
delete
[](
new_bias_ptr
);
PADDLE_MOBILE_ENFORCE
(
param
->
Filter
()
->
dims
()[
2
]
==
param
->
Filter
()
->
dims
()[
3
]
&&
param
->
Paddings
()[
0
]
==
param
->
Paddings
()[
1
],
"need equal"
);
int
offset
=
static_cast
<
int
>
(
param
->
Filter
()
->
dims
()[
2
])
/
2
-
static_cast
<
int
>
(
param
->
Paddings
()[
1
]);
param
->
SetOffset
(
offset
);
if
(
param
->
Filter
()
->
dims
()[
2
]
==
1
&&
param
->
Filter
()
->
dims
()[
3
]
==
1
)
{
param
->
Filter
()
->
InitNImage
(
cl_helper_
.
CLContext
(),
cl_helper_
.
CLCommandQueue
());
this
->
cl_helper_
.
AddKernel
(
"conv_1x1"
,
"conv_bn_relu_kernel.cl"
);
DLOG
<<
" conv bn relu conv 1x1"
;
}
else
if
(
param
->
Filter
()
->
dims
()[
1
]
==
1
&&
param
->
Input
()
->
dims
()[
1
]
==
param
->
Output
()
->
dims
()[
1
]
&&
param
->
Filter
()
->
dims
()[
2
]
==
3
)
{
param
->
Filter
()
->
InitDWImage
(
cl_helper_
.
CLContext
(),
cl_helper_
.
CLCommandQueue
());
this
->
cl_helper_
.
AddKernel
(
"depth_conv_3x3"
,
"conv_bn_relu_kernel.cl"
);
DLOG
<<
" conv bn relu depth_conv_3x3"
;
}
else
if
(
param
->
Filter
()
->
dims
()[
2
]
==
3
&&
param
->
Filter
()
->
dims
()[
3
]
==
3
)
{
param
->
Filter
()
->
InitCLImage
(
cl_helper_
.
CLContext
(),
cl_helper_
.
CLCommandQueue
());
this
->
cl_helper_
.
AddKernel
(
"conv_3x3"
,
"conv_bn_relu_kernel.cl"
);
DLOG
<<
" conv bn relu conv_3x3"
;
}
else
{
PADDLE_MOBILE_THROW_EXCEPTION
(
" not support "
);
}
return
true
;
return
true
;
}
}
template
<
>
template
<
>
void
ConvBNReluKernel
<
GPU_CL
,
float
>::
Compute
(
void
ConvBNReluKernel
<
GPU_CL
,
float
>::
Compute
(
const
FusionConvBNReluParam
<
GPU_CL
>
&
param
)
{}
const
FusionConvBNReluParam
<
GPU_CL
>
&
param
)
{
auto
kernel
=
this
->
cl_helper_
.
KernelAt
(
0
);
auto
default_work_size
=
this
->
cl_helper_
.
DefaultWorkSize
(
*
param
.
Output
());
int
c_block
=
default_work_size
[
0
];
int
w
=
default_work_size
[
1
];
int
nh
=
default_work_size
[
2
];
auto
input
=
param
.
Input
()
->
GetCLImage
();
auto
filter
=
param
.
Filter
()
->
GetCLImage
();
auto
new_scale
=
param
.
NewScale
()
->
GetCLImage
();
auto
new_bias
=
param
.
NewBias
()
->
GetCLImage
();
auto
output
=
param
.
Output
()
->
GetCLImage
();
int
stride
=
param
.
Strides
()[
0
];
int
offset
=
param
.
Offset
();
int
input_c
=
reinterpret_cast
<
framework
::
CLImageConverterFolder
*>
(
param
.
Input
()
->
Converter
())
->
GetCBlock
();
int
dilation
=
param
.
Dilations
()[
0
];
int
input_width
=
param
.
Input
()
->
dims
()[
3
];
int
input_height
=
param
.
Input
()
->
dims
()[
2
];
int
output_width
=
param
.
Output
()
->
dims
()[
3
];
int
output_height
=
param
.
Output
()
->
dims
()[
2
];
cl_int
status
;
status
=
clSetKernelArg
(
kernel
,
0
,
sizeof
(
int
),
&
c_block
);
CL_CHECK_ERRORS
(
status
);
status
=
clSetKernelArg
(
kernel
,
1
,
sizeof
(
int
),
&
w
);
CL_CHECK_ERRORS
(
status
);
status
=
clSetKernelArg
(
kernel
,
2
,
sizeof
(
int
),
&
nh
);
CL_CHECK_ERRORS
(
status
);
status
=
clSetKernelArg
(
kernel
,
3
,
sizeof
(
cl_mem
),
&
input
);
CL_CHECK_ERRORS
(
status
);
status
=
clSetKernelArg
(
kernel
,
4
,
sizeof
(
cl_mem
),
&
filter
);
CL_CHECK_ERRORS
(
status
);
status
=
clSetKernelArg
(
kernel
,
5
,
sizeof
(
cl_mem
),
&
new_scale
);
CL_CHECK_ERRORS
(
status
);
status
=
clSetKernelArg
(
kernel
,
6
,
sizeof
(
cl_mem
),
&
new_bias
);
CL_CHECK_ERRORS
(
status
);
status
=
clSetKernelArg
(
kernel
,
7
,
sizeof
(
cl_mem
),
&
output
);
CL_CHECK_ERRORS
(
status
);
status
=
clSetKernelArg
(
kernel
,
8
,
sizeof
(
int
),
&
stride
);
CL_CHECK_ERRORS
(
status
);
status
=
clSetKernelArg
(
kernel
,
9
,
sizeof
(
int
),
&
offset
);
CL_CHECK_ERRORS
(
status
);
status
=
clSetKernelArg
(
kernel
,
10
,
sizeof
(
int
),
&
input_c
);
CL_CHECK_ERRORS
(
status
);
status
=
clSetKernelArg
(
kernel
,
11
,
sizeof
(
int
),
&
dilation
);
CL_CHECK_ERRORS
(
status
);
status
=
clSetKernelArg
(
kernel
,
12
,
sizeof
(
int
),
&
input_width
);
CL_CHECK_ERRORS
(
status
);
status
=
clSetKernelArg
(
kernel
,
13
,
sizeof
(
int
),
&
input_height
);
CL_CHECK_ERRORS
(
status
);
status
=
clSetKernelArg
(
kernel
,
14
,
sizeof
(
int
),
&
output_width
);
CL_CHECK_ERRORS
(
status
);
status
=
clSetKernelArg
(
kernel
,
15
,
sizeof
(
int
),
&
output_height
);
CL_CHECK_ERRORS
(
status
);
status
=
clEnqueueNDRangeKernel
(
this
->
cl_helper_
.
CLCommandQueue
(),
kernel
,
default_work_size
.
size
(),
NULL
,
default_work_size
.
data
(),
NULL
,
0
,
NULL
,
NULL
);
CL_CHECK_ERRORS
(
status
);
}
template
class
ConvBNReluKernel
<
GPU_CL
,
float
>;
template
class
ConvBNReluKernel
<
GPU_CL
,
float
>;
}
// namespace operators
}
// namespace operators
...
...
src/operators/kernel/cl/dwconv_bn_relu_kernel.cpp
浏览文件 @
8c4a035f
...
@@ -22,12 +22,151 @@ namespace operators {
...
@@ -22,12 +22,151 @@ namespace operators {
template
<
>
template
<
>
bool
DWConvBNReluKernel
<
GPU_CL
,
float
>::
Init
(
bool
DWConvBNReluKernel
<
GPU_CL
,
float
>::
Init
(
FusionDWConvBNReluParam
<
GPU_CL
>
*
param
)
{
FusionDWConvBNReluParam
<
GPU_CL
>
*
param
)
{
PADDLE_MOBILE_ENFORCE
(
param
->
Filter
()
->
dims
()[
2
]
==
param
->
Filter
()
->
dims
()[
3
]
&&
param
->
Paddings
()[
0
]
==
param
->
Paddings
()[
1
],
"need equal"
);
const
framework
::
CLImage
*
mean
=
param
->
InputMean
();
const
framework
::
CLImage
*
variance
=
param
->
InputVariance
();
const
framework
::
CLImage
*
scale
=
param
->
InputScale
();
const
framework
::
CLImage
*
bias
=
param
->
InputBias
();
const
float
epsilon
=
param
->
Epsilon
();
const
int
C
=
mean
->
numel
();
auto
mean_ptr
=
mean
->
data
<
float
>
();
auto
variance_ptr
=
variance
->
data
<
float
>
();
auto
scale_ptr
=
scale
->
data
<
float
>
();
auto
bias_ptr
=
bias
->
data
<
float
>
();
float
inv_std_ptr
[
C
];
for
(
int
i
=
0
;
i
<
C
;
i
++
)
{
inv_std_ptr
[
i
]
=
1
/
static_cast
<
float
>
(
pow
((
variance_ptr
[
i
]
+
epsilon
),
0.5
));
}
float
*
new_scale_ptr
=
new
float
[
C
];
float
*
new_bias_ptr
=
new
float
[
C
];
for
(
int
i
=
0
;
i
<
C
;
i
++
)
{
new_scale_ptr
[
i
]
=
inv_std_ptr
[
i
]
*
scale_ptr
[
i
];
new_bias_ptr
[
i
]
=
bias_ptr
[
i
]
-
mean_ptr
[
i
]
*
inv_std_ptr
[
i
]
*
scale_ptr
[
i
];
}
framework
::
CLImage
*
new_scale
=
new
framework
::
CLImage
();
new_scale
->
SetTensorData
(
new_scale_ptr
,
variance
->
dims
());
new_scale
->
InitCLImage
(
this
->
cl_helper_
.
CLContext
(),
cl_helper_
.
CLCommandQueue
());
framework
::
CLImage
*
new_bias
=
new
framework
::
CLImage
();
new_bias
->
SetTensorData
(
new_bias_ptr
,
variance
->
dims
());
new_bias
->
InitCLImage
(
this
->
cl_helper_
.
CLContext
(),
cl_helper_
.
CLCommandQueue
());
param
->
SetNewScale
(
new_scale
);
param
->
SetNewBias
(
new_bias
);
delete
[](
new_scale_ptr
);
delete
[](
new_bias_ptr
);
PADDLE_MOBILE_ENFORCE
(
param
->
Filter
()
->
dims
()[
2
]
==
param
->
Filter
()
->
dims
()[
3
]
&&
param
->
Paddings
()[
0
]
==
param
->
Paddings
()[
1
],
"need equal"
);
int
offset
=
static_cast
<
int
>
(
param
->
Filter
()
->
dims
()[
2
])
/
2
-
static_cast
<
int
>
(
param
->
Paddings
()[
1
]);
param
->
SetOffset
(
offset
);
param
->
Filter
()
->
InitDWImage
(
cl_helper_
.
CLContext
(),
cl_helper_
.
CLCommandQueue
());
this
->
cl_helper_
.
AddKernel
(
"depth_conv_3x3"
,
"conv_bn_relu_kernel.cl"
);
DLOG
<<
" conv bn relu depth_conv_3x3"
;
return
true
;
return
true
;
}
}
template
<
>
template
<
>
void
DWConvBNReluKernel
<
GPU_CL
,
float
>::
Compute
(
void
DWConvBNReluKernel
<
GPU_CL
,
float
>::
Compute
(
const
FusionDWConvBNReluParam
<
GPU_CL
>
&
param
)
{}
const
FusionDWConvBNReluParam
<
GPU_CL
>
&
param
)
{
auto
kernel
=
this
->
cl_helper_
.
KernelAt
(
0
);
auto
default_work_size
=
this
->
cl_helper_
.
DefaultWorkSize
(
*
param
.
Output
());
int
c_block
=
default_work_size
[
0
];
int
w
=
default_work_size
[
1
];
int
nh
=
default_work_size
[
2
];
auto
input
=
param
.
Input
()
->
GetCLImage
();
auto
filter
=
param
.
Filter
()
->
GetCLImage
();
auto
new_scale
=
param
.
NewScale
()
->
GetCLImage
();
auto
new_bias
=
param
.
NewBias
()
->
GetCLImage
();
auto
output
=
param
.
Output
()
->
GetCLImage
();
int
stride
=
param
.
Strides
()[
0
];
int
offset
=
param
.
Offset
();
int
input_c
=
reinterpret_cast
<
framework
::
CLImageConverterFolder
*>
(
param
.
Input
()
->
Converter
())
->
GetCBlock
();
int
dilation
=
param
.
Dilations
()[
0
];
int
input_width
=
param
.
Input
()
->
dims
()[
3
];
int
input_height
=
param
.
Input
()
->
dims
()[
2
];
int
output_width
=
param
.
Output
()
->
dims
()[
3
];
int
output_height
=
param
.
Output
()
->
dims
()[
2
];
cl_int
status
;
status
=
clSetKernelArg
(
kernel
,
0
,
sizeof
(
int
),
&
c_block
);
CL_CHECK_ERRORS
(
status
);
status
=
clSetKernelArg
(
kernel
,
1
,
sizeof
(
int
),
&
w
);
CL_CHECK_ERRORS
(
status
);
status
=
clSetKernelArg
(
kernel
,
2
,
sizeof
(
int
),
&
nh
);
CL_CHECK_ERRORS
(
status
);
status
=
clSetKernelArg
(
kernel
,
3
,
sizeof
(
cl_mem
),
&
input
);
CL_CHECK_ERRORS
(
status
);
status
=
clSetKernelArg
(
kernel
,
4
,
sizeof
(
cl_mem
),
&
filter
);
CL_CHECK_ERRORS
(
status
);
status
=
clSetKernelArg
(
kernel
,
5
,
sizeof
(
cl_mem
),
&
new_scale
);
CL_CHECK_ERRORS
(
status
);
status
=
clSetKernelArg
(
kernel
,
6
,
sizeof
(
cl_mem
),
&
new_bias
);
CL_CHECK_ERRORS
(
status
);
status
=
clSetKernelArg
(
kernel
,
7
,
sizeof
(
cl_mem
),
&
output
);
CL_CHECK_ERRORS
(
status
);
status
=
clSetKernelArg
(
kernel
,
8
,
sizeof
(
int
),
&
stride
);
CL_CHECK_ERRORS
(
status
);
status
=
clSetKernelArg
(
kernel
,
9
,
sizeof
(
int
),
&
offset
);
CL_CHECK_ERRORS
(
status
);
status
=
clSetKernelArg
(
kernel
,
10
,
sizeof
(
int
),
&
input_c
);
CL_CHECK_ERRORS
(
status
);
status
=
clSetKernelArg
(
kernel
,
11
,
sizeof
(
int
),
&
dilation
);
CL_CHECK_ERRORS
(
status
);
status
=
clSetKernelArg
(
kernel
,
12
,
sizeof
(
int
),
&
input_width
);
CL_CHECK_ERRORS
(
status
);
status
=
clSetKernelArg
(
kernel
,
13
,
sizeof
(
int
),
&
input_height
);
CL_CHECK_ERRORS
(
status
);
status
=
clSetKernelArg
(
kernel
,
14
,
sizeof
(
int
),
&
output_width
);
CL_CHECK_ERRORS
(
status
);
status
=
clSetKernelArg
(
kernel
,
15
,
sizeof
(
int
),
&
output_height
);
CL_CHECK_ERRORS
(
status
);
status
=
clEnqueueNDRangeKernel
(
this
->
cl_helper_
.
CLCommandQueue
(),
kernel
,
default_work_size
.
size
(),
NULL
,
default_work_size
.
data
(),
NULL
,
0
,
NULL
,
NULL
);
CL_CHECK_ERRORS
(
status
);
}
template
class
DWConvBNReluKernel
<
GPU_CL
,
float
>;
template
class
DWConvBNReluKernel
<
GPU_CL
,
float
>;
}
// namespace operators
}
// namespace operators
...
...
src/operators/kernel/cl/feed_kernel.cpp
浏览文件 @
8c4a035f
...
@@ -28,6 +28,8 @@ template <>
...
@@ -28,6 +28,8 @@ template <>
void
FeedKernel
<
GPU_CL
,
float
>::
Compute
(
const
FeedParam
<
GPU_CL
>
&
param
)
{
void
FeedKernel
<
GPU_CL
,
float
>::
Compute
(
const
FeedParam
<
GPU_CL
>
&
param
)
{
auto
kernel
=
this
->
cl_helper_
.
KernelAt
(
0
);
auto
kernel
=
this
->
cl_helper_
.
KernelAt
(
0
);
cl_int
status
;
cl_int
status
;
param
.
Out
()
->
InitEmptyImage
(
cl_helper_
.
CLContext
(),
cl_helper_
.
CLCommandQueue
(),
param
.
Out
()
->
dims
());
auto
output
=
param
.
Out
();
auto
output
=
param
.
Out
();
const
Tensor
*
input
=
param
.
InputX
();
const
Tensor
*
input
=
param
.
InputX
();
// DLOG << *input;
// DLOG << *input;
...
...
src/operators/kernel/cl/fetch_kernel.cpp
浏览文件 @
8c4a035f
...
@@ -27,8 +27,6 @@ bool FetchKernel<GPU_CL, float>::Init(FetchParam<GPU_CL> *param) {
...
@@ -27,8 +27,6 @@ bool FetchKernel<GPU_CL, float>::Init(FetchParam<GPU_CL> *param) {
}
else
{
}
else
{
this
->
cl_helper_
.
AddKernel
(
"fetch"
,
"fetch_kernel.cl"
);
this
->
cl_helper_
.
AddKernel
(
"fetch"
,
"fetch_kernel.cl"
);
}
}
auto
*
out
=
param
->
Out
();
out
->
mutable_data
<
float
>
();
return
true
;
return
true
;
}
}
...
@@ -39,7 +37,7 @@ void FetchKernel<GPU_CL, float>::Compute(const FetchParam<GPU_CL> ¶m) {
...
@@ -39,7 +37,7 @@ void FetchKernel<GPU_CL, float>::Compute(const FetchParam<GPU_CL> ¶m) {
auto
input
=
param
.
InputX
()
->
GetCLImage
();
auto
input
=
param
.
InputX
()
->
GetCLImage
();
auto
*
out
=
param
.
Out
();
auto
*
out
=
param
.
Out
();
out
->
mutable_data
<
float
>
();
const
auto
&
dim
=
param
.
InputX
()
->
dims
();
const
auto
&
dim
=
param
.
InputX
()
->
dims
();
size_t
new_dims
[]
=
{
1
,
1
,
1
,
1
};
size_t
new_dims
[]
=
{
1
,
1
,
1
,
1
};
...
@@ -70,9 +68,11 @@ void FetchKernel<GPU_CL, float>::Compute(const FetchParam<GPU_CL> ¶m) {
...
@@ -70,9 +68,11 @@ void FetchKernel<GPU_CL, float>::Compute(const FetchParam<GPU_CL> ¶m) {
int
size_ch
=
in_height
*
in_width
;
int
size_ch
=
in_height
*
in_width
;
int
size_block
=
size_ch
*
4
;
int
size_block
=
size_ch
*
4
;
int
size_batch
=
size_ch
*
C
;
int
size_batch
=
size_ch
*
C
;
int
out_c
=
new_dims
[
1
];
clSetKernelArg
(
kernel
,
4
,
sizeof
(
int
),
&
size_ch
);
clSetKernelArg
(
kernel
,
4
,
sizeof
(
int
),
&
size_ch
);
clSetKernelArg
(
kernel
,
5
,
sizeof
(
int
),
&
size_block
);
clSetKernelArg
(
kernel
,
5
,
sizeof
(
int
),
&
size_block
);
clSetKernelArg
(
kernel
,
6
,
sizeof
(
int
),
&
size_batch
);
clSetKernelArg
(
kernel
,
6
,
sizeof
(
int
),
&
size_batch
);
clSetKernelArg
(
kernel
,
7
,
sizeof
(
int
),
&
out_c
);
}
}
// cl_event wait_event = param.InpdutX()->GetClEvent();
// cl_event wait_event = param.InpdutX()->GetClEvent();
...
@@ -93,6 +93,8 @@ void FetchKernel<GPU_CL, float>::Compute(const FetchParam<GPU_CL> ¶m) {
...
@@ -93,6 +93,8 @@ void FetchKernel<GPU_CL, float>::Compute(const FetchParam<GPU_CL> ¶m) {
// << "ms" << std::endl;
// << "ms" << std::endl;
memcpy
(
out
->
data
<
float
>
(),
out_cl_tensor
.
Data
<
float
>
(),
out
->
memory_size
());
memcpy
(
out
->
data
<
float
>
(),
out_cl_tensor
.
Data
<
float
>
(),
out
->
memory_size
());
DLOG
<<
*
param
.
InputX
();
DLOG
<<
*
out
;
}
}
template
class
FetchKernel
<
GPU_CL
,
float
>;
template
class
FetchKernel
<
GPU_CL
,
float
>;
...
...
src/operators/kernel/cl/prior_box_kernel.cpp
浏览文件 @
8c4a035f
...
@@ -15,18 +15,165 @@ limitations under the License. */
...
@@ -15,18 +15,165 @@ limitations under the License. */
#ifdef PRIORBOX_OP
#ifdef PRIORBOX_OP
#include "operators/kernel/prior_box_kernel.h"
#include "operators/kernel/prior_box_kernel.h"
#include "framework/cl/cl_tensor.h"
namespace
paddle_mobile
{
namespace
paddle_mobile
{
namespace
operators
{
namespace
operators
{
template
<
>
template
<
>
bool
PriorBoxKernel
<
GPU_CL
,
float
>::
Init
(
PriorBoxParam
<
GPU_CL
>
*
param
)
{
bool
PriorBoxKernel
<
GPU_CL
,
float
>::
Init
(
PriorBoxParam
<
GPU_CL
>
*
param
)
{
this
->
cl_helper_
.
AddKernel
(
"prior_box"
,
"prior_box_kernel.cl"
);
return
true
;
return
true
;
}
}
template
<
>
template
<
>
void
PriorBoxKernel
<
GPU_CL
,
float
>::
Compute
(
void
PriorBoxKernel
<
GPU_CL
,
float
>::
Compute
(
const
PriorBoxParam
<
GPU_CL
>
&
param
)
{}
const
PriorBoxParam
<
GPU_CL
>
&
param
)
{
const
auto
*
input_
=
param
.
Input
();
const
auto
&
input_dims
=
input_
->
dims
();
const
auto
&
input_image_dims
=
param
.
InputImage
()
->
dims
();
const
auto
&
min_sizes
=
param
.
MinSizes
();
const
auto
&
max_sizes
=
param
.
MaxSizes
();
const
auto
&
variances
=
param
.
Variances
();
const
auto
&
input_aspect_ratio
=
param
.
AspectRatios
();
const
bool
&
flip
=
param
.
Flip
();
const
bool
&
clip
=
param
.
Clip
();
const
float
&
step_w
=
param
.
StepW
();
const
float
&
step_h
=
param
.
StepH
();
const
float
&
offset
=
param
.
Offset
();
const
int
C
=
param
.
OutputBoxes
()
->
dims
()[
1
];
auto
output_boxes
=
param
.
OutputBoxes
()
->
GetCLImage
();
auto
output_variances
=
param
.
OutputVariances
()
->
GetCLImage
();
std
::
vector
<
float
>
aspect_ratios
;
ExpandAspectRatios
(
input_aspect_ratio
,
flip
,
&
aspect_ratios
);
auto
img_width
=
input_image_dims
[
3
];
auto
img_height
=
input_image_dims
[
2
];
auto
feature_width
=
input_dims
[
3
];
auto
feature_height
=
input_dims
[
2
];
float
step_width
,
step_height
;
/// 300 / 19
if
(
step_w
==
0
||
step_h
==
0
)
{
step_width
=
static_cast
<
float
>
(
img_width
)
/
feature_width
;
step_height
=
static_cast
<
float
>
(
img_height
)
/
feature_height
;
}
else
{
step_width
=
step_w
;
step_height
=
step_h
;
}
int
num_priors
=
aspect_ratios
.
size
()
*
min_sizes
.
size
();
if
(
!
max_sizes
.
empty
())
{
num_priors
+=
max_sizes
.
size
();
}
float
*
box_width
=
static_cast
<
float
*>
(
paddle_mobile
::
memory
::
Alloc
(
sizeof
(
float
)
*
num_priors
));
float
*
box_height
=
static_cast
<
float
*>
(
paddle_mobile
::
memory
::
Alloc
(
sizeof
(
float
)
*
num_priors
));
int
idx
=
0
;
for
(
size_t
s
=
0
;
s
<
min_sizes
.
size
();
++
s
)
{
auto
min_size
=
min_sizes
[
s
];
if
(
param
.
MinMaxAspectRatiosOrder
())
{
box_width
[
idx
]
=
box_height
[
idx
]
=
min_size
/
2.
;
idx
++
;
if
(
max_sizes
.
size
()
>
0
)
{
auto
max_size
=
max_sizes
[
s
];
box_width
[
idx
]
=
box_height
[
idx
]
=
sqrt
(
min_size
*
max_size
)
/
2.
;
idx
++
;
}
for
(
float
ar
:
aspect_ratios
)
{
if
(
fabs
(
ar
-
1.
)
<
1e-6
)
{
continue
;
}
box_width
[
idx
]
=
min_size
*
sqrt
(
ar
)
/
2.
;
box_height
[
idx
]
=
min_size
/
sqrt
(
ar
)
/
2.
;
idx
++
;
}
}
else
{
for
(
float
ar
:
aspect_ratios
)
{
box_width
[
idx
]
=
min_size
*
sqrt
(
ar
)
/
2.
;
box_height
[
idx
]
=
min_size
/
sqrt
(
ar
)
/
2.
;
idx
++
;
}
if
(
!
max_sizes
.
empty
())
{
auto
max_size
=
max_sizes
[
s
];
box_width
[
idx
]
=
box_height
[
idx
]
=
sqrt
(
min_size
*
max_size
)
/
2.
;
idx
++
;
}
}
}
cl_int
status
;
auto
kernel
=
this
->
cl_helper_
.
KernelAt
(
0
);
auto
default_work_size
=
this
->
cl_helper_
.
DefaultWorkSize
(
*
param
.
OutputBoxes
());
int
c_block
=
default_work_size
[
0
];
int
w
=
default_work_size
[
1
];
int
nh
=
default_work_size
[
2
];
std
::
vector
<
int64_t
>
box_shape
({
1
,
1
,
1
,
num_priors
});
framework
::
DDim
ddim
=
framework
::
make_ddim
(
box_shape
);
framework
::
CLTensor
box_width_cl_tensor
(
this
->
cl_helper_
.
CLContext
(),
this
->
cl_helper_
.
CLCommandQueue
());
box_width_cl_tensor
.
Resize
(
ddim
);
cl_mem
box_width_Buffer
=
box_width_cl_tensor
.
mutable_with_data
<
float
>
(
box_width
);
framework
::
CLTensor
box_height_cl_tensor
(
this
->
cl_helper_
.
CLContext
(),
this
->
cl_helper_
.
CLCommandQueue
());
box_height_cl_tensor
.
Resize
(
ddim
);
cl_mem
box_height_Buffer
=
box_height_cl_tensor
.
mutable_with_data
<
float
>
(
box_height
);
DLOG
<<
"c_block:"
<<
c_block
;
DLOG
<<
"w:"
<<
w
;
DLOG
<<
"nh:"
<<
nh
;
DLOG
<<
"step_width:"
<<
step_width
;
DLOG
<<
"step_height:"
<<
step_height
;
DLOG
<<
"offset:"
<<
offset
;
DLOG
<<
"img_width:"
<<
img_width
;
DLOG
<<
"img_height:"
<<
img_height
;
DLOG
<<
"num_priors:"
<<
num_priors
;
DLOG
<<
"C:"
<<
C
;
status
=
clSetKernelArg
(
kernel
,
0
,
sizeof
(
int
),
&
c_block
);
CL_CHECK_ERRORS
(
status
);
status
=
clSetKernelArg
(
kernel
,
1
,
sizeof
(
int
),
&
w
);
CL_CHECK_ERRORS
(
status
);
status
=
clSetKernelArg
(
kernel
,
2
,
sizeof
(
int
),
&
nh
);
CL_CHECK_ERRORS
(
status
);
status
=
clSetKernelArg
(
kernel
,
3
,
sizeof
(
cl_mem
),
&
box_width_Buffer
);
CL_CHECK_ERRORS
(
status
);
status
=
clSetKernelArg
(
kernel
,
4
,
sizeof
(
cl_mem
),
&
box_height_Buffer
);
CL_CHECK_ERRORS
(
status
);
status
=
clSetKernelArg
(
kernel
,
5
,
sizeof
(
cl_mem
),
&
output_boxes
);
CL_CHECK_ERRORS
(
status
);
status
=
clSetKernelArg
(
kernel
,
6
,
sizeof
(
float
),
&
step_width
);
CL_CHECK_ERRORS
(
status
);
status
=
clSetKernelArg
(
kernel
,
7
,
sizeof
(
float
),
&
step_height
);
CL_CHECK_ERRORS
(
status
);
status
=
clSetKernelArg
(
kernel
,
8
,
sizeof
(
float
),
&
offset
);
CL_CHECK_ERRORS
(
status
);
status
=
clSetKernelArg
(
kernel
,
9
,
sizeof
(
int
),
&
img_width
);
CL_CHECK_ERRORS
(
status
);
status
=
clSetKernelArg
(
kernel
,
10
,
sizeof
(
int
),
&
img_height
);
CL_CHECK_ERRORS
(
status
);
status
=
clSetKernelArg
(
kernel
,
11
,
sizeof
(
int
),
&
num_priors
);
CL_CHECK_ERRORS
(
status
);
status
=
clSetKernelArg
(
kernel
,
12
,
sizeof
(
int
),
&
C
);
CL_CHECK_ERRORS
(
status
);
size_t
global_work_size
[
2
]
=
{
c_block
,
nh
};
status
=
clEnqueueNDRangeKernel
(
this
->
cl_helper_
.
CLCommandQueue
(),
kernel
,
2
,
NULL
,
global_work_size
,
NULL
,
0
,
NULL
,
NULL
);
CL_CHECK_ERRORS
(
status
);
paddle_mobile
::
memory
::
Free
(
box_width
);
paddle_mobile
::
memory
::
Free
(
box_height
);
}
// Explicit instantiation of the OpenCL (GPU_CL) float prior-box kernel.
template class PriorBoxKernel<GPU_CL, float>;
}
// namespace operators
}
// namespace operators
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录