PaddlePaddle / Paddle-Lite, commit 4b1a4ca0

Authored Sep 03, 2019 by StarryRain; committed by Jiaying Zhao, Sep 03, 2019
Parent: fc5d91d4

add opencl depthwise_conv_trans_op (#1949)

* add opencl depthwise_conv_trans_op
* test=develop
Showing 7 changed files with 475 additions and 198 deletions (+475 / -198):

  mobile/src/framework/load_ops.h  (+1 / -1)
  mobile/src/operators/kernel/cl/cl-kernel-func/conv_func.cpp  (+170 / -0)
  mobile/src/operators/kernel/cl/cl-kernel-func/conv_func.h  (+13 / -1)
  mobile/src/operators/kernel/cl/cl_kernel/conv_transpose.cl  (+0 / -146)
  mobile/src/operators/kernel/cl/cl_kernel/conv_transpose_kernel.cl  (+248 / -0)
  mobile/src/operators/kernel/cl/conv_transpose_kernel.cpp  (+34 / -48)
  mobile/src/operators/op_param.h  (+9 / -2)
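For reference (an editorial note, not part of the diff): both new GPU paths rely on the standard transposed-convolution shape relation, output = (input - 1) * stride - 2 * pad + dilation * (filter - 1) + 1. A minimal C++ check of that formula for the 3x3, stride-2 case this commit targets (the helper name is illustrative, not Paddle-Lite API):

    // Illustrative sketch only; the formula is the standard conv2d_transpose
    // output-size relation, the function name is hypothetical.
    #include <cstdio>

    int ConvTransposeOutputSize(int in, int stride, int pad, int dilation, int filter) {
      return (in - 1) * stride - 2 * pad + dilation * (filter - 1) + 1;
    }

    int main() {
      // 3x3 filter, stride 2, pad 1, dilation 1: a 16x16 feature map becomes 31x31.
      std::printf("%d\n", ConvTransposeOutputSize(16, 2, 1, 1, 3));  // prints 31
      return 0;
    }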
mobile/src/framework/load_ops.h

@@ -151,7 +151,7 @@ LOAD_OP1(shape, CPU);
 LOAD_OP2(depthwise_conv2d, CPU, GPU_CL);
 #endif
 #ifdef CONV_TRANSPOSE_OP
-LOAD_OP1(conv2d_transpose, CPU);
+LOAD_OP2(conv2d_transpose, CPU, GPU_CL);
 #endif
 #ifdef SCALE_OP
 LOAD_OP2(scale, CPU, GPU_CL);
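The only change above is switching conv2d_transpose from LOAD_OP1(conv2d_transpose, CPU) to LOAD_OP2(conv2d_transpose, CPU, GPU_CL), i.e. the op is now registered for the OpenCL (GPU_CL) target as well as the CPU. The macro's actual expansion is not shown in this diff; conceptually the registration records which device kinds an op name can run on, roughly as in the hypothetical sketch below (names are illustrative, not the real LOAD_OP1/LOAD_OP2 machinery):

    // Hypothetical illustration of per-device op registration; not the actual
    // expansion of the macros in framework/load_ops.h.
    #include <initializer_list>
    #include <map>
    #include <set>
    #include <string>

    static std::map<std::string, std::set<std::string>> g_op_targets;

    void RegisterOp(const std::string &op, std::initializer_list<std::string> targets) {
      g_op_targets[op].insert(targets.begin(), targets.end());
    }

    int main() {
      RegisterOp("depthwise_conv2d", {"CPU", "GPU_CL"});
      RegisterOp("conv2d_transpose", {"CPU", "GPU_CL"});  // was {"CPU"} before this commit
      RegisterOp("scale", {"CPU", "GPU_CL"});
      return g_op_targets["conv2d_transpose"].count("GPU_CL") ? 0 : 1;
    }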
mobile/src/operators/kernel/cl/cl-kernel-func/conv_func.cpp

@@ -468,5 +468,175 @@ void SWConvAddBnRelu(framework::CLHelper *cl_helper,
   }
   CL_CHECK_ERRORS(status);
 }
+
+void DWConvTransposeAddBnRelu(framework::CLHelper *cl_helper,
+                              const ConvTransposeParam<GPU_CL> &param,
+                              bool ifRelu, const framework::CLImage *biase,
+                              const framework::CLImage *new_scale,
+                              const framework::CLImage *new_bias) {
+  auto kernel = cl_helper->KernelAt(0);
+  auto default_work_size = cl_helper->DefaultWorkSize(*param.Output());
+  int c_block = default_work_size[0];
+  int w = default_work_size[1];
+  int nh = default_work_size[2];
+
+  int w_blk_size = 1;
+  int w_blk = (w + w_blk_size - 1) / w_blk_size;
+  default_work_size[1] = w_blk;
+
+  int h_blk_size = 1;
+  int h_blk = (nh + h_blk_size - 1) / h_blk_size;
+  default_work_size[2] = h_blk;
+
+  auto input = param.Input()->GetCLImage();
+  auto filter = param.Filter()->GetCLImage();
+  auto output = param.Output()->GetCLImage();
+
+  int stride = param.Strides()[0];
+  int pad = param.Paddings()[0];
+  int dilation = param.Dilations()[0];
+
+  int input_channel = param.Input()->dims()[1];
+  int input_height = param.Input()->dims()[2];
+  int input_width = param.Input()->dims()[3];
+  int output_height = param.Output()->dims()[2];
+  int output_width = param.Output()->dims()[3];
+  int filter_height = param.Filter()->dims()[2];
+  int filter_width = param.Filter()->dims()[3];
+
+  cl_int status;
+  int index = 0;
+
+  status = clSetKernelArg(kernel, index++, sizeof(int), &c_block);
+  CL_CHECK_ERRORS(status);
+  status = clSetKernelArg(kernel, index++, sizeof(int), &w_blk);
+  CL_CHECK_ERRORS(status);
+  status = clSetKernelArg(kernel, index++, sizeof(int), &h_blk);
+  CL_CHECK_ERRORS(status);
+  status = clSetKernelArg(kernel, index++, sizeof(cl_mem), &input);
+  CL_CHECK_ERRORS(status);
+  status = clSetKernelArg(kernel, index++, sizeof(cl_mem), &filter);
+  CL_CHECK_ERRORS(status);
+  if (biase) {
+    auto bias_mem = biase->GetCLImage();
+    status = clSetKernelArg(kernel, index++, sizeof(cl_mem), &bias_mem);
+    CL_CHECK_ERRORS(status);
+  }
+  if (new_scale && new_bias) {
+    auto new_scale_mem = new_scale->GetCLImage();
+    status = clSetKernelArg(kernel, index++, sizeof(cl_mem), &new_scale_mem);
+    CL_CHECK_ERRORS(status);
+    auto new_bias_mem = new_bias->GetCLImage();
+    status = clSetKernelArg(kernel, index++, sizeof(cl_mem), &new_bias_mem);
+    CL_CHECK_ERRORS(status);
+  }
+  status = clSetKernelArg(kernel, index++, sizeof(cl_mem), &output);
+  CL_CHECK_ERRORS(status);
+  status = clSetKernelArg(kernel, index++, sizeof(int), &stride);
+  CL_CHECK_ERRORS(status);
+  status = clSetKernelArg(kernel, index++, sizeof(int), &pad);
+  CL_CHECK_ERRORS(status);
+  status = clSetKernelArg(kernel, index++, sizeof(int), &dilation);
+  CL_CHECK_ERRORS(status);
+  status = clSetKernelArg(kernel, index++, sizeof(int), &input_channel);
+  CL_CHECK_ERRORS(status);
+  status = clSetKernelArg(kernel, index++, sizeof(int), &input_width);
+  CL_CHECK_ERRORS(status);
+  status = clSetKernelArg(kernel, index++, sizeof(int), &input_height);
+  CL_CHECK_ERRORS(status);
+  status = clSetKernelArg(kernel, index++, sizeof(int), &output_width);
+  CL_CHECK_ERRORS(status);
+  status = clSetKernelArg(kernel, index++, sizeof(int), &output_height);
+  CL_CHECK_ERRORS(status);
+  status = clSetKernelArg(kernel, index++, sizeof(int), &filter_width);
+  CL_CHECK_ERRORS(status);
+  status = clSetKernelArg(kernel, index++, sizeof(int), &filter_height);
+  CL_CHECK_ERRORS(status);
+
+  if (default_work_size.data()[1] % 60 == 0 && use_lws) {
+    const size_t local_work_size[3] = {static_cast<const uint32_t>(1),
+                                       static_cast<const uint32_t>(60),
+                                       static_cast<const uint32_t>(1)};
+    status = clEnqueueNDRangeKernel(
+        cl_helper->CLCommandQueue(), kernel, default_work_size.size(), NULL,
+        default_work_size.data(), local_work_size, 0, NULL, NULL);
+  } else {
+    status = clEnqueueNDRangeKernel(
+        cl_helper->CLCommandQueue(), kernel, default_work_size.size(), NULL,
+        default_work_size.data(), NULL, 0, NULL, NULL);
+  }
+  CL_CHECK_ERRORS(status);
+}
+
+void ConvTransposeAddBnRelu(framework::CLHelper *cl_helper,
+                            const ConvTransposeParam<GPU_CL> &param,
+                            bool ifRelu, const framework::CLImage *biase,
+                            const framework::CLImage *new_scale,
+                            const framework::CLImage *new_bias) {
+  auto kernel = cl_helper->KernelAt(0);
+  const auto *input = param.Input();
+  auto *output = param.Output();
+  auto *filter = param.Filter();
+  const int n = input->dims()[0];
+  const int input_c = input->dims()[1];
+  const int input_c_block = (input_c + 3) / 4;
+  const int input_width = input->dims()[3];
+  const int input_height = input->dims()[2];
+  const int output_c = output->dims()[1];
+  const int output_c_block = (output_c + 3) / 4;
+  const int output_width = output->dims()[3];
+  const int output_height = output->dims()[2];
+  auto inputImage = input->GetCLImage();
+  auto outputImage = output->GetCLImage();
+  auto filterImage = filter->GetCLImage();
+  cl_int status;
+  status = clSetKernelArg(kernel, 0, sizeof(int), &input_c_block);
+  CL_CHECK_ERRORS(status);
+  status = clSetKernelArg(kernel, 1, sizeof(int), &input_width);
+  CL_CHECK_ERRORS(status);
+  status = clSetKernelArg(kernel, 2, sizeof(int), &input_height);
+  CL_CHECK_ERRORS(status);
+  status = clSetKernelArg(kernel, 3, sizeof(int), &output_width);
+  CL_CHECK_ERRORS(status);
+  status = clSetKernelArg(kernel, 4, sizeof(int), &output_height);
+  CL_CHECK_ERRORS(status);
+  status = clSetKernelArg(kernel, 5, sizeof(cl_mem), &inputImage);
+  CL_CHECK_ERRORS(status);
+  status = clSetKernelArg(kernel, 6, sizeof(cl_mem), &filterImage);
+  CL_CHECK_ERRORS(status);
+  status = clSetKernelArg(kernel, 7, sizeof(cl_mem), &outputImage);
+  CL_CHECK_ERRORS(status);
+  const size_t work_size[3] = {(size_t)output_c_block, (size_t)input_width,
+                               (size_t)(n * input_height)};
+  DLOG << "conv transpose " << input_c_block << input_width << input_height
+       << output_width << output_height << work_size[0] << work_size[1]
+       << work_size[2];
+  clEnqueueNDRangeKernel(cl_helper->CLCommandQueue(), kernel, 3, NULL,
+                         work_size, NULL, 0, NULL, NULL);
+}
 }  // namespace operators
 }  // namespace paddle_mobile
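A note on the launch configuration in DWConvTransposeAddBnRelu above (explanation added here, not part of the commit): the global work size starts from DefaultWorkSize(*param.Output()) = {c_block, w, nh}, the width and height dimensions are re-blocked with a ceiling division (block size 1 here, so the values are unchanged, but the pattern allows coarser blocks), and the explicit local work size {1, 60, 1} is requested only when the second global dimension is divisible by 60, since OpenCL requires each global size to be a multiple of the matching local size. A small C++ sketch of that rule, with use_lws standing in for the flag of the same name in conv_func.cpp:

    // Sketch of the ceil-division blocking and the {1, 60, 1} local-size guard
    // used by DWConvTransposeAddBnRelu (illustration only).
    #include <cstddef>
    #include <cstdio>

    int CeilDiv(int n, int block) { return (n + block - 1) / block; }

    int main() {
      bool use_lws = true;
      int c_block = 4, w = 120, nh = 240;   // example DefaultWorkSize values
      size_t global[3] = {(size_t)c_block, (size_t)CeilDiv(w, 1), (size_t)CeilDiv(nh, 1)};
      if (global[1] % 60 == 0 && use_lws) {
        size_t local[3] = {1, 60, 1};       // every global dim is a multiple of local
        std::printf("launch %zux%zux%zu with local %zux%zux%zu\n", global[0],
                    global[1], global[2], local[0], local[1], local[2]);
      } else {
        std::printf("launch %zux%zux%zu, local size left to the driver\n",
                    global[0], global[1], global[2]);
      }
      return 0;
    }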
mobile/src/operators/kernel/cl/cl-kernel-func/conv_func.h

@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#ifdef CONV_OP
+#if defined(CONV_OP) || defined(CONV_TRANSPOSE_OP)
 
 #pragma once

@@ -52,6 +52,18 @@ void SWConvAddBnRelu(framework::CLHelper *cl_helper,
                      const framework::CLImage *biase = nullptr,
                      const framework::CLImage *new_scale = nullptr,
                      const framework::CLImage *new_bias = nullptr);
 
+void DWConvTransposeAddBnRelu(framework::CLHelper *cl_helper,
+                              const ConvTransposeParam<GPU_CL> &param,
+                              bool ifRelu = false,
+                              const framework::CLImage *biase = nullptr,
+                              const framework::CLImage *new_scale = nullptr,
+                              const framework::CLImage *new_bias = nullptr);
+
+void ConvTransposeAddBnRelu(framework::CLHelper *cl_helper,
+                            const ConvTransposeParam<GPU_CL> &param,
+                            bool ifRelu = false,
+                            const framework::CLImage *biase = nullptr,
+                            const framework::CLImage *new_scale = nullptr,
+                            const framework::CLImage *new_bias = nullptr);
 }  // namespace operators
 }  // namespace paddle_mobile
mobile/src/operators/kernel/cl/cl_kernel/conv_transpose.cl
deleted file, mode 100644 → 0 (content as of parent fc5d91d4), -146 lines

[The whole file is removed: the Apache-2.0 license header, the #include "cl_common.h", and the __kernel void conv_transpose(...) implementation. The same kernel reappears verbatim at the top of the new conv_transpose_kernel.cl shown below; only the file name changes. The old file ended without a trailing newline.]
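For orientation (editorial note, not part of the diff): in the conv_transpose kernel each work item is pinned to one input position (in_w, h) of one output-channel block and writes a 2x2 patch of the output starting at pos_out = (out_c * output_width + 2 * in_w, n * output_height + 2 * h). That is why the host enqueues it over {output_c_block, input_width, n * input_height} and why Init selects it only for 3x3, stride-2 transposed convolutions. A minimal index sketch (example values only):

    // Sketch of the output coordinates covered by one conv_transpose work item;
    // mirrors pos_out in the kernel (image x packs channel blocks * width,
    // image y packs batch * height).
    #include <cstdio>

    int main() {
      int out_c = 1, in_w = 5, h = 3, n = 0;      // one work item's ids (example values)
      int output_width = 32, output_height = 32;  // spatial size of the output image
      int x0 = out_c * output_width + 2 * in_w;   // left column of the 2x2 patch
      int y0 = n * output_height + 2 * h;         // top row of the 2x2 patch
      std::printf("writes (%d,%d) (%d,%d) (%d,%d) (%d,%d)\n",
                  x0, y0, x0 + 1, y0, x0, y0 + 1, x0 + 1, y0 + 1);
      return 0;
    }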
mobile/src/operators/kernel/cl/cl_kernel/conv_transpose_kernel.cl
new file, mode 0 → 100644, +248 lines

/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "cl_common.h"

__kernel void conv_transpose(__private const int input_c_block,
                             __private const int input_width,  /* of one block */
                             __private const int input_height, /* of one block */
                             __private const int output_width,
                             __private const int output_height,
                             __read_only image2d_t input_image,
                             __read_only image2d_t filter,
                             __write_only image2d_t output_image) {
  const int out_c = get_global_id(0);
  const int in_w = get_global_id(1);
  const int in_nh = get_global_id(2);
  const int n = in_nh / input_height;
  const int h = in_nh % input_height;
  const sampler_t sampler =
      CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
  half4 input1, input2, input3, input4;
  half4 output1 = 0.0f, output2 = 0.0f, output3 = 0.0f, output4 = 0.0f;
  half4 w = 0.0f;
  int2 pos_in;
  for (int i = 0; i < input_c_block; i += 1) {
    pos_in = (int2)(mad24(i, input_width, in_w), in_nh);
    input1 = select(read_imageh(input_image, sampler, (int2)(pos_in.x, pos_in.y)),
                    (half4)(0.0f),
                    (ushort4)((in_w < 0 || h < 0 || in_w >= input_width ||
                               h >= input_height) << 15));
    input2 = select(read_imageh(input_image, sampler, (int2)(pos_in.x + 1, pos_in.y)),
                    (half4)(0.0f),
                    (ushort4)((in_w + 1 < 0 || h < 0 || in_w + 1 >= input_width ||
                               h >= input_height) << 15));
    input3 = select(read_imageh(input_image, sampler, (int2)(pos_in.x, pos_in.y + 1)),
                    (half4)(0.0f),
                    (ushort4)((in_w < 0 || h + 1 < 0 || in_w >= input_width ||
                               h + 1 >= input_height) << 15));
    input4 = select(read_imageh(input_image, sampler, (int2)(pos_in.x + 1, pos_in.y + 1)),
                    (half4)(0.0f),
                    (ushort4)((in_w + 1 < 0 || h + 1 < 0 || in_w + 1 >= input_width ||
                               h + 1 >= input_height) << 15));
int wx = i * 3;
int wy = out_c * 4 * 3;
w = read_imageh(filter, sampler, (int2)(wx, wy));
output4.x += dot(input4, w);
w = read_imageh(filter, sampler, (int2)(wx + 1, wy));
output3.x += dot(input3, w);
w = read_imageh(filter, sampler, (int2)(wx + 2, wy));
output4.x += dot(input3, w);
w = read_imageh(filter, sampler, (int2)(wx, wy + 1));
output2.x += dot(input2, w);
w = read_imageh(filter, sampler, (int2)(wx + 1, wy + 1));
output1.x += dot(input1, w);
w = read_imageh(filter, sampler, (int2)(wx + 2, wy + 1));
output2.x += dot(input1, w);
w = read_imageh(filter, sampler, (int2)(wx, wy + 2));
output4.x += dot(input2, w);
w = read_imageh(filter, sampler, (int2)(wx + 1, wy + 2));
output3.x += dot(input1, w);
w = read_imageh(filter, sampler, (int2)(wx + 2, wy + 2));
output4.x += dot(input1, w);
wy = (out_c * 4 + 1) * 3;
w = read_imageh(filter, sampler, (int2)(wx, wy));
output4.y += dot(input4, w);
w = read_imageh(filter, sampler, (int2)(wx + 1, wy));
output3.y += dot(input3, w);
w = read_imageh(filter, sampler, (int2)(wx + 2, wy));
output4.y += dot(input3, w);
w = read_imageh(filter, sampler, (int2)(wx, wy + 1));
output2.y += dot(input2, w);
w = read_imageh(filter, sampler, (int2)(wx + 1, wy + 1));
output1.y += dot(input1, w);
w = read_imageh(filter, sampler, (int2)(wx + 2, wy + 1));
output2.y += dot(input1, w);
w = read_imageh(filter, sampler, (int2)(wx, wy + 2));
output4.y += dot(input2, w);
w = read_imageh(filter, sampler, (int2)(wx + 1, wy + 2));
output3.y += dot(input1, w);
w = read_imageh(filter, sampler, (int2)(wx + 2, wy + 2));
output4.y += dot(input1, w);
wy = (out_c * 4 + 2) * 3;
w = read_imageh(filter, sampler, (int2)(wx, wy));
output4.z += dot(input4, w);
w = read_imageh(filter, sampler, (int2)(wx + 1, wy));
output3.z += dot(input3, w);
w = read_imageh(filter, sampler, (int2)(wx + 2, wy));
output4.z += dot(input3, w);
w = read_imageh(filter, sampler, (int2)(wx, wy + 1));
output2.z += dot(input2, w);
w = read_imageh(filter, sampler, (int2)(wx + 1, wy + 1));
output1.z += dot(input1, w);
w = read_imageh(filter, sampler, (int2)(wx + 2, wy + 1));
output2.z += dot(input1, w);
w = read_imageh(filter, sampler, (int2)(wx, wy + 2));
output4.z += dot(input2, w);
w = read_imageh(filter, sampler, (int2)(wx + 1, wy + 2));
output3.z += dot(input1, w);
w = read_imageh(filter, sampler, (int2)(wx + 2, wy + 2));
output4.z += dot(input1, w);
wy = (out_c * 4 + 3) * 3;
w = read_imageh(filter, sampler, (int2)(wx, wy));
output4.w += dot(input4, w);
w = read_imageh(filter, sampler, (int2)(wx + 1, wy));
output3.w += dot(input3, w);
w = read_imageh(filter, sampler, (int2)(wx + 2, wy));
output4.w += dot(input3, w);
w = read_imageh(filter, sampler, (int2)(wx, wy + 1));
output2.w += dot(input2, w);
w = read_imageh(filter, sampler, (int2)(wx + 1, wy + 1));
output1.w += dot(input1, w);
w = read_imageh(filter, sampler, (int2)(wx + 2, wy + 1));
output2.w += dot(input1, w);
w = read_imageh(filter, sampler, (int2)(wx, wy + 2));
output4.w += dot(input2, w);
w = read_imageh(filter, sampler, (int2)(wx + 1, wy + 2));
output3.w += dot(input1, w);
w = read_imageh(filter, sampler, (int2)(wx + 2, wy + 2));
output4.w += dot(input1, w);
}
int2 pos_out = (int2)(out_c * output_width + 2 * in_w, n * output_height + 2 * h);
write_imageh(output_image, pos_out, output1);
write_imageh(output_image, (int2)(pos_out.x + 1, pos_out.y), output2);
write_imageh(output_image, (int2)(pos_out.x, pos_out.y + 1), output3);
write_imageh(output_image, (int2)(pos_out.x + 1, pos_out.y + 1), output4);
}
__kernel void depthwise_transpose(__private const int item_ch,
__private const int item_w,
__private const int item_h,
__read_only image2d_t input_image,
__read_only image2d_t filter_image,
#if defined(BIASE_CH) || defined(BIASE_ELE)
__read_only image2d_t bias,
#endif
#ifdef BATCH_NORM
__read_only image2d_t new_scale,
__read_only image2d_t new_biase,
#endif
__write_only image2d_t output_image,
__private const int stride,
__private const int pad,
__private const int dilation,
__private const int in_ch,
__private const int in_w,
__private const int in_h,
__private const int out_w,
__private const int out_h,
__private const int filter_w,
__private const int filter_h) {
  const sampler_t sampler =
      CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
// item_id
const int item_ch_id = get_global_id(0);
const int item_w_id = get_global_id(1);
const int item_h_id = get_global_id(2);
// out_id
int out_b_id = item_h_id / out_h;
int out_w_id_per_ch_blk = item_w_id;
int out_h_id_per_batch = item_h_id % out_h;
int out_w_id = item_ch_id * out_w + out_w_id_per_ch_blk;
// in_id
int in_w_id_per_ch_blk = (out_w_id_per_ch_blk + pad - filter_w + stride) / stride;
in_w_id_per_ch_blk = in_w_id_per_ch_blk > 0 ? in_w_id_per_ch_blk : 0;
int in_h_id_per_batch = (out_h_id_per_batch + pad - filter_h + stride) / stride;
in_h_id_per_batch = in_h_id_per_batch > 0 ? in_h_id_per_batch : 0;
// filter_id
int align_w_i = out_w_id_per_ch_blk + pad - filter_w + 1;
int align_w = align_w_i % stride > 0 ?
align_w_i % stride - stride : align_w_i % stride;
int filter_w_id_per_ch_blk = out_w_id_per_ch_blk + pad < filter_w ? out_w_id_per_ch_blk + pad : filter_w + align_w - 1;
int align_h_i = out_h_id_per_batch + pad - filter_h + 1;
int align_h = align_h_i % stride > 0 ?
align_h_i % stride - stride : align_h_i % stride;
int filter_h_id = out_h_id_per_batch + pad < filter_h ? out_h_id_per_batch + pad : filter_h + align_h - 1;
#ifdef BIASE_CH
half4 output;
output = read_imageh(bias, sampler, (int2)(item_ch_id, 0));
#elif defined(BIASE_ELE)
half4 output;
output = read_imageh(bias, sampler, (int2)(out_w_id, item_h_id));
#else
half4 output = 0.0f;
#endif
half4 filter = 0.0f;
half4 input = 0.0f;
for (int h = filter_h_id; h >= 0; h -= stride) {
int in_h_id = select(out_b_id * in_h + in_h_id_per_batch, -1,
in_h_id_per_batch < 0 || in_h_id_per_batch >= in_h);
for (int w = filter_w_id_per_ch_blk; w >= 0; w -= stride) {
      int in_w_id = select(item_ch_id * in_w + in_w_id_per_ch_blk, -1,
                           in_w_id_per_ch_blk < 0 || in_w_id_per_ch_blk >= in_w);
      int filter_w_id = item_ch_id * filter_w + w;
      input = read_imageh(input_image, sampler, (int2)(in_w_id, in_h_id));
      filter = read_imageh(filter_image, sampler, (int2)(filter_w_id, h));
      output = mad(input, filter, output);
      in_w_id_per_ch_blk++;
    }
    in_h_id_per_batch++;
  }

#ifdef BATCH_NORM
  half4 scale = read_imageh(new_scale, sampler, (int2)(item_ch_id, 0));
  half4 biase = read_imageh(new_biase, sampler, (int2)(item_ch_id, 0));
  output = mad(scale, output, biase);
#endif

#ifdef RELU
  output = activation(output);
#endif

  write_imageh(output_image, (int2)(out_w_id, item_h_id), output);
}
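The depthwise_transpose kernel above computes each output pixel by walking backwards over the filter taps in steps of stride while advancing the input index: it starts from in = max(0, (out + pad - filter + stride) / stride) and the matching tap chosen through the align_w/align_h arithmetic, and every visited (input, tap) pair satisfies out = in * stride - pad + tap. The scalar C++ sketch below (illustration only, 1-D, zero padding, not part of the commit) reproduces that gather along one axis and can be checked against a hand-computed transposed convolution:

    // Scalar sketch of the per-output-pixel gather that depthwise_transpose
    // performs along one axis: out(o) = sum over taps k of in[i] * w[k],
    // with o = i*stride - pad + k (illustration only).
    #include <cstdio>
    #include <vector>

    float gather_one_axis(const std::vector<float>& in, const std::vector<float>& w,
                          int o, int stride, int pad) {
      int f = static_cast<int>(w.size());
      int in_size = static_cast<int>(in.size());
      // Same index setup as the kernel: first contributing input element ...
      int i = (o + pad - f + stride) / stride;
      if (i < 0) i = 0;
      // ... and the matching (largest) filter tap, walked down in steps of stride.
      int align_i = o + pad - f + 1;
      int align = align_i % stride > 0 ? align_i % stride - stride : align_i % stride;
      int k = (o + pad < f) ? o + pad : f + align - 1;
      float acc = 0.0f;
      for (; k >= 0; k -= stride, ++i) {
        float x = (i >= 0 && i < in_size) ? in[i] : 0.0f;  // zero padding outside
        acc += x * w[k];
      }
      return acc;
    }

    int main() {
      std::vector<float> in = {1, 2, 3};  // 1-D input
      std::vector<float> w = {1, 1, 1};   // 3-tap filter, stride 2, pad 1
      for (int o = 0; o < 5; ++o)         // output length (3-1)*2 - 2*1 + 3 = 5
        std::printf("out[%d] = %g\n", o, gather_one_axis(in, w, o, 2, 1));
      return 0;                           // prints 1 3 2 5 3
    }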
mobile/src/operators/kernel/cl/conv_transpose_kernel.cpp

@@ -14,6 +14,7 @@ limitations under the License. */
 #ifdef CONV_TRANSPOSE_OP
 
 #include "operators/kernel/conv_transpose_kernel.h"
+#include "operators/kernel/cl/cl-kernel-func/conv_func.h"
 
 namespace paddle_mobile {
 namespace operators {

@@ -21,60 +22,45 @@ namespace operators {
 template <>
 bool ConvTransposeKernel<GPU_CL, float>::Init(
     ConvTransposeParam<GPU_CL>* param) {
-  param->Filter()->InitConv2dTransposeFilterCLImage(cl_helper_.CLContext(),
-                                                    cl_helper_.CLCommandQueue());
-  this->cl_helper_.AddKernel("conv_transpose", "conv_transpose.cl");
+  PADDLE_MOBILE_ENFORCE(param->Strides()[0] == param->Strides()[1] &&
+                            param->Paddings()[0] == param->Paddings()[1] &&
+                            param->Dilations()[0] == param->Dilations()[1] &&
+                            param->Dilations()[0] == 1,
+                        "need equal");
+  if (param->Filter()->dims()[1] == 1 &&
+      param->Input()->dims()[1] == param->Output()->dims()[1]) {
+    param->ExecMode() = ConvTransposeParam<GPU_CL>::EXEC_DEPTHWISETRANS_FLOAT;
+    param->Filter()->InitDWImage(cl_helper_.CLContext(),
+                                 cl_helper_.CLCommandQueue());
+    this->cl_helper_.AddKernel("depthwise_transpose",
+                               "conv_transpose_kernel.cl");
+  } else if (param->Filter()->dims()[2] == 3 &&
+             param->Filter()->dims()[3] == 3 && param->Strides()[0] == 2) {
+    param->ExecMode() = ConvTransposeParam<GPU_CL>::EXEC_CONVTRANS3x3s2_FLOAT;
+    param->Filter()->InitConv2dTransposeFilterCLImage(
+        cl_helper_.CLContext(), cl_helper_.CLCommandQueue());
+    this->cl_helper_.AddKernel("conv_transpose", "conv_transpose_kernel.cl");
+  } else {
+    PADDLE_MOBILE_THROW_EXCEPTION(" not support ");
+  }
   return true;
 }
 
 template <>
 void ConvTransposeKernel<GPU_CL, float>::Compute(
     const ConvTransposeParam<GPU_CL>& param) {
-  auto kernel = this->cl_helper_.KernelAt(0);
-  const auto *input = param.Input();
-  auto *output = param.Output();
-  auto *filter = param.Filter();
-  const int n = input->dims()[0];
-  const int input_c = input->dims()[1];
-  const int input_c_block = (input_c + 3) / 4;
-  const int input_width = input->dims()[3];
-  const int input_height = input->dims()[2];
-  const int output_c = output->dims()[1];
-  const int output_c_block = (output_c + 3) / 4;
-  const int output_width = output->dims()[3];
-  const int output_height = output->dims()[2];
-  auto inputImage = input->GetCLImage();
-  auto outputImage = output->GetCLImage();
-  auto filterImage = filter->GetCLImage();
-  cl_int status;
-  status = clSetKernelArg(kernel, 0, sizeof(int), &input_c_block);
-  CL_CHECK_ERRORS(status);
-  status = clSetKernelArg(kernel, 1, sizeof(int), &input_width);
-  CL_CHECK_ERRORS(status);
-  status = clSetKernelArg(kernel, 2, sizeof(int), &input_height);
-  CL_CHECK_ERRORS(status);
-  status = clSetKernelArg(kernel, 3, sizeof(int), &output_width);
-  CL_CHECK_ERRORS(status);
-  status = clSetKernelArg(kernel, 4, sizeof(int), &output_height);
-  CL_CHECK_ERRORS(status);
-  status = clSetKernelArg(kernel, 5, sizeof(cl_mem), &inputImage);
-  CL_CHECK_ERRORS(status);
-  status = clSetKernelArg(kernel, 6, sizeof(cl_mem), &filterImage);
-  CL_CHECK_ERRORS(status);
-  status = clSetKernelArg(kernel, 7, sizeof(cl_mem), &outputImage);
-  CL_CHECK_ERRORS(status);
-  const size_t work_size[3] = {(size_t)output_c_block, (size_t)input_width,
-                               (size_t)(n * input_height)};
-  DLOG << "conv transpose " << input_c_block << input_width << input_height
-       << output_width << output_height << work_size[0] << work_size[1]
-       << work_size[2];
-  clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 3, NULL,
-                         work_size, NULL, 0, NULL, NULL);
+  switch (param.ExecMode()) {
+    case ConvTransposeParam<GPU_CL>::EXEC_DEPTHWISETRANS_FLOAT:
+      DWConvTransposeAddBnRelu(&this->cl_helper_, param);
+      break;
+    case ConvTransposeParam<GPU_CL>::EXEC_CONVTRANS3x3s2_FLOAT:
+      ConvTransposeAddBnRelu(&this->cl_helper_, param);
+      break;
+    default:
+      PADDLE_MOBILE_THROW_EXCEPTION(
+          "Invalid convolution transpose execute mode %d", param.ExecMode());
+  }
 }
 
 template class ConvTransposeKernel<GPU_CL, float>;
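The new Init above chooses between the two OpenCL paths: the depthwise_transpose kernel when the filter has a single channel per group (Filter()->dims()[1] == 1) and the input and output channel counts match, the 3x3 stride-2 conv_transpose kernel otherwise, and an exception for every other configuration. A standalone C++ sketch of that selection (the enum values mirror the diff; the Shape struct and function are illustrative, not Paddle-Lite API):

    // Sketch of the ExecMode selection performed by
    // ConvTransposeKernel<GPU_CL, float>::Init (illustration only).
    #include <stdexcept>

    enum ExecMode { EXEC_DEPTHWISETRANS_FLOAT, EXEC_CONVTRANS3x3s2_FLOAT };

    struct Shape { int n, c, h, w; };  // hypothetical NCHW dims holder

    ExecMode ChooseExecMode(const Shape &filter, const Shape &input,
                            const Shape &output, int stride) {
      if (filter.c == 1 && input.c == output.c) {
        return EXEC_DEPTHWISETRANS_FLOAT;         // depthwise_transpose kernel
      } else if (filter.h == 3 && filter.w == 3 && stride == 2) {
        return EXEC_CONVTRANS3x3s2_FLOAT;         // conv_transpose kernel
      }
      throw std::runtime_error(" not support ");  // mirrors PADDLE_MOBILE_THROW_EXCEPTION
    }

    int main() {
      Shape filter{32, 1, 3, 3}, input{1, 32, 16, 16}, output{1, 32, 32, 32};
      return ChooseExecMode(filter, input, output, 2) == EXEC_DEPTHWISETRANS_FLOAT ? 0 : 1;
    }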
mobile/src/operators/op_param.h

@@ -2492,8 +2492,8 @@ class ConvTransposeParam : public OpParam {
                      const VariableNameMap &outputs, const AttributeMap &attrs,
                      Scope *scope)
       : OpParam(inputs, outputs, attrs, scope) {
-    filter_ = FilterFrom<GType>(inputs, *scope);
-    input_ = InputFrom<GType>(inputs, *scope);
+    filter_ = OpParam::FilterFrom<GType>(inputs, *scope);
+    input_ = OpParam::InputFrom<GType>(inputs, *scope);
     // output_ = OutputFrom<GType>(outputs, scope);
     if (outputs.count("Output")) {
       output_ = OpParam::OutputFrom<GType>(outputs, *scope);

@@ -2518,6 +2518,10 @@ class ConvTransposeParam : public OpParam {
 
   const vector<int> &Paddings() const { return paddings_; }
 
+  const vector<int> &Filters() const { return filter_; }
+
+  const vector<int> &TransFilters() const { return transformed_filter_; }
+
   const vector<int> &Dilations() const { return dilations_; }
 
   const vector<int> &OutputSize() const { return output_size_; }

@@ -2529,6 +2533,8 @@ class ConvTransposeParam : public OpParam {
     EXEC_GEMM_FLOAT,
     EXEC_DECONV3X3_FLOAT,
     EXEC_DECONV4X4_FLOAT,
+    EXEC_DEPTHWISETRANS_FLOAT,
+    EXEC_CONVTRANS3x3s2_FLOAT,
   };
 
   ExecMode &ExecMode() const { return exec_mode_; }

@@ -2537,6 +2543,7 @@ class ConvTransposeParam : public OpParam {
   GType *input_;
   GType *output_;
   GType *filter_;
+  GType *transformed_filter_;
   vector<int> strides_;
   vector<int> paddings_;
   vector<int> dilations_;