Commit d4f82d6c
Authored on Aug 03, 2020 by mindspore-ci-bot; committed by Gitee on Aug 03, 2020

!3823 optimize opencl conv2d transpose

Merge pull request !3823 from chenzupeng/master-lite

Parents: 61639d90, 39e4900e
4 changed files with 117 additions and 98 deletions (+117, -98).
mindspore/lite/src/runtime/kernel/opencl/cl/fp16/conv2d_transpose2x2.cl   +43  -36
mindspore/lite/src/runtime/kernel/opencl/cl/fp32/conv2d_transpose2x2.cl   +43  -35
mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d_transpose.cc       +28  -24
mindspore/lite/src/runtime/kernel/opencl/kernel/matmul.cc                 +3   -3
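At its core, the change below moves the conv2d transpose kernel's input, bias, and output from __global buffers to image2d_t objects sampled through a CLK_ADDRESS_CLAMP sampler. As generic OpenCL background only (this snippet is illustrative, not code from the commit; the commit itself uses the cl::Image2D C++ wrapper), a 2D RGBA float image holding one FLT4 per pixel can be created on the host like this:

#include <CL/cl.h>

// Illustrative only (not from this commit): create a 2D RGBA float image that a kernel
// can sample with read_imagef and a CLK_ADDRESS_CLAMP sampler. Each pixel holds one
// float4, i.e. one 4-channel slice in the layout the rewritten kernels use.
cl_mem CreateRgbaFloatImage(cl_context ctx, size_t width, size_t height, void *host_ptr, cl_int *err) {
  cl_image_format format = {CL_RGBA, CL_FLOAT};
  cl_image_desc desc = {};
  desc.image_type = CL_MEM_OBJECT_TYPE_IMAGE2D;
  desc.image_width = width;
  desc.image_height = height;
  return clCreateImage(ctx, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, &format, &desc, host_ptr, err);
}

A kernel then fetches a pixel with read_imagef(img, sampler, (int2)(x, y)), which is how the rewritten kernels below read src_data and biases.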
mindspore/lite/src/runtime/kernel/opencl/cl/fp16/conv2d_transpose2x2.cl
 #pragma OPENCL EXTENSION cl_khr_fp16 : enable
 #define FLT half
 #define FLT4 half4
 #define FLT16 half16
-__kernel void conv2d_transpose2x2(__global FLT4 *inputx, __global FLT16 *weight, __global FLT4 *bias,
-                                  __global FLT4 *output, int2 kernel_size, int2 stride, int2 padding,
-                                  int4 src_size, int4 dst_size) {
+__constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
+__kernel void conv2d_transpose2x2(__read_only image2d_t src_data, __global FLT16 *weight, __read_only image2d_t biases,
+                                  __write_only image2d_t dst_data, int2 kernel_size, int2 stride, int2 padding,
+                                  int4 src_size, int4 dst_size) {
   int h = get_global_id(0);
+  int kh = h % 2;
+  int src_h = h / 2;
+  src_h = src_h * 2;
   int w = get_global_id(1);
+  int kw = w % 2;
+  int src_w = w / 2;
+  src_w = src_w * 2;
   int co = get_global_id(2);
   if (h * 2 >= dst_size.x || w * 2 >= dst_size.y || co >= dst_size.z) return;
   FLT4 r0 = (FLT4)(0.f);
   FLT4 r1 = (FLT4)(0.f);
   FLT4 r2 = (FLT4)(0.f);
   FLT4 r3 = (FLT4)(0.f);
-  int base_x = (h * src_size.y + w) * src_size.z;
-  int base_w = co * src_size.z;
+  int base_w = (co * 4 + kh + kw * 2) * src_size.z;
   for (int ci = 0; ci < src_size.z; ++ci) {
-    FLT4 x = inputx[base_x + ci];
-    FLT16 w0 = weight[(base_w + ci) * 4];
-    FLT16 w1 = weight[(base_w + ci) * 4 + 1];
-    FLT16 w2 = weight[(base_w + ci) * 4 + 2];
-    FLT16 w3 = weight[(base_w + ci) * 4 + 3];
-    r0 += x.x * w0.s0123;
-    r0 += x.y * w0.s4567;
-    r0 += x.z * w0.s89ab;
-    r0 += x.w * w0.scdef;
+    FLT4 x0 = read_imagef(src_data, smp_zero, (int2)(src_w * src_size.z + ci, src_h));
+    FLT4 x1 = read_imagef(src_data, smp_zero, (int2)(src_w * src_size.z + ci, src_h + 1));
+    FLT4 x2 = read_imagef(src_data, smp_zero, (int2)((src_w + 1) * src_size.z + ci, src_h));
+    FLT4 x3 = read_imagef(src_data, smp_zero, (int2)((src_w + 1) * src_size.z + ci, src_h + 1));
+    FLT16 weight_cache = weight[base_w++];
+    r0 += x0.x * weight_cache.s0123;
+    r0 += x0.y * weight_cache.s4567;
+    r0 += x0.z * weight_cache.s89ab;
+    r0 += x0.w * weight_cache.scdef;
-    r1 += x.x * w1.s0123;
-    r1 += x.y * w1.s4567;
-    r1 += x.z * w1.s89ab;
-    r1 += x.w * w1.scdef;
+    r1 += x1.x * weight_cache.s0123;
+    r1 += x1.y * weight_cache.s4567;
+    r1 += x1.z * weight_cache.s89ab;
+    r1 += x1.w * weight_cache.scdef;
-    r2 += x.x * w2.s0123;
-    r2 += x.y * w2.s4567;
-    r2 += x.z * w2.s89ab;
-    r2 += x.w * w2.scdef;
+    r2 += x2.x * weight_cache.s0123;
+    r2 += x2.y * weight_cache.s4567;
+    r2 += x2.z * weight_cache.s89ab;
+    r2 += x2.w * weight_cache.scdef;
-    r3 += x.x * w3.s0123;
-    r3 += x.y * w3.s4567;
-    r3 += x.z * w3.s89ab;
-    r3 += x.w * w3.scdef;
+    r3 += x3.x * weight_cache.s0123;
+    r3 += x3.y * weight_cache.s4567;
+    r3 += x3.z * weight_cache.s89ab;
+    r3 += x3.w * weight_cache.scdef;
   }
-  r0 += bias[co];
-  r1 += bias[co];
-  r2 += bias[co];
-  r3 += bias[co];
-  output[((2 * h + 0) * dst_size.y + 2 * w + 0) * dst_size.z + co] = r0;
-  output[((2 * h + 0) * dst_size.y + 2 * w + 1) * dst_size.z + co] = r1;
-  output[((2 * h + 1) * dst_size.y + 2 * w + 0) * dst_size.z + co] = r2;
-  output[((2 * h + 1) * dst_size.y + 2 * w + 1) * dst_size.z + co] = r3;
-}
\ No newline at end of file
+  FLT4 bias_val = read_imagef(biases, smp_zero, (int2)(co, 0));
+  r0 += bias_val;
+  r1 += bias_val;
+  r2 += bias_val;
+  r3 += bias_val;
+  write_imagef(dst_data, (int2)((2 * src_w + kw) * dst_size.z + co, 2 * src_h + kh), r0);
+  write_imagef(dst_data, (int2)((2 * src_w + kw) * dst_size.z + co, 2 * src_h + kh + 2), r1);
+  write_imagef(dst_data, (int2)((2 * src_w + kw + 2) * dst_size.z + co, 2 * src_h + kh), r2);
+  write_imagef(dst_data, (int2)((2 * src_w + kw + 2) * dst_size.z + co, 2 * src_h + kh + 2), r3);
+}
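For orientation (this sketch is not part of the commit; MapWorkItem, dst_w4, and the sample ids are made-up illustrations): each work-item of the new kernel owns one (kh, kw) position of the 2x2 deconvolution window and one 4-channel output slice, samples the four neighbouring input pixels once, reuses a single FLT16 weight block per input-channel step, and scatters its four accumulators to four output pixels. The host-side C++ below mirrors that index arithmetic:

#include <cstdio>

// For a work-item (h, w, co) of the new kernel, compute which 2x2 sub-position it handles
// and which output pixels it writes, mirroring conv2d_transpose2x2 above.
// dst_w4 stands for dst_size.z (output channels / 4).
void MapWorkItem(int h, int w, int co, int dst_w4) {
  int kh = h % 2;            // row inside the 2x2 kernel window
  int src_h = (h / 2) * 2;   // top input row of the sampled 2x2 input block
  int kw = w % 2;            // column inside the 2x2 kernel window
  int src_w = (w / 2) * 2;   // left input column of the sampled 2x2 input block
  // Starting FLT16 weight block walked by weight[base_w++] (times src_size.z in the kernel).
  int base_w = co * 4 + kh + kw * 2;
  // The four results r0..r3 land on these image coordinates:
  int x0 = (2 * src_w + kw) * dst_w4 + co, y0 = 2 * src_h + kh;      // r0
  int y1 = y0 + 2;                                                   // r1 at (x0, y1)
  int x2 = (2 * src_w + kw + 2) * dst_w4 + co, y2 = y0;              // r2
  int y3 = y0 + 2;                                                   // r3 at (x2, y3)
  std::printf("kh=%d kw=%d src=(%d,%d) weight block=%d writes: (%d,%d) (%d,%d) (%d,%d) (%d,%d)\n",
              kh, kw, src_h, src_w, base_w, x0, y0, x0, y1, x2, y2, x2, y3);
}

int main() {
  MapWorkItem(3, 2, 1, /*dst_w4=*/8);  // arbitrary example work-item
  return 0;
}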
mindspore/lite/src/runtime/kernel/opencl/cl/fp32/conv2d_transpose2x2.cl
 #define FLT float
 #define FLT4 float4
 #define FLT16 float16
-__kernel void conv2d_transpose2x2(__global FLT4 *inputx, __global FLT16 *weight, __global FLT4 *bias,
-                                  __global FLT4 *output, int2 kernel_size, int2 stride, int2 padding,
-                                  int4 src_size, int4 dst_size) {
+__constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
+__kernel void conv2d_transpose2x2(__read_only image2d_t src_data, __global FLT16 *weight, __read_only image2d_t biases,
+                                  __write_only image2d_t dst_data, int2 kernel_size, int2 stride, int2 padding,
+                                  int4 src_size, int4 dst_size) {
   int h = get_global_id(0);
+  int kh = h % 2;
+  int src_h = h / 2;
+  src_h = src_h * 2;
   int w = get_global_id(1);
+  int kw = w % 2;
+  int src_w = w / 2;
+  src_w = src_w * 2;
   int co = get_global_id(2);
   if (h * 2 >= dst_size.x || w * 2 >= dst_size.y || co >= dst_size.z) return;
   FLT4 r0 = (FLT4)(0.f);
   FLT4 r1 = (FLT4)(0.f);
   FLT4 r2 = (FLT4)(0.f);
   FLT4 r3 = (FLT4)(0.f);
-  int base_x = (h * src_size.y + w) * src_size.z;
-  int base_w = co * src_size.z;
+  int base_w = (co * 4 + kh + kw * 2) * src_size.z;
   for (int ci = 0; ci < src_size.z; ++ci) {
-    FLT4 x = inputx[base_x + ci];
-    FLT16 w0 = weight[(base_w + ci) * 4];
-    FLT16 w1 = weight[(base_w + ci) * 4 + 1];
-    FLT16 w2 = weight[(base_w + ci) * 4 + 2];
-    FLT16 w3 = weight[(base_w + ci) * 4 + 3];
-    r0 += x.x * w0.s0123;
-    r0 += x.y * w0.s4567;
-    r0 += x.z * w0.s89ab;
-    r0 += x.w * w0.scdef;
+    FLT4 x0 = read_imagef(src_data, smp_zero, (int2)(src_w * src_size.z + ci, src_h));
+    FLT4 x1 = read_imagef(src_data, smp_zero, (int2)(src_w * src_size.z + ci, src_h + 1));
+    FLT4 x2 = read_imagef(src_data, smp_zero, (int2)((src_w + 1) * src_size.z + ci, src_h));
+    FLT4 x3 = read_imagef(src_data, smp_zero, (int2)((src_w + 1) * src_size.z + ci, src_h + 1));
+    FLT16 weight_cache = weight[base_w++];
+    r0 += x0.x * weight_cache.s0123;
+    r0 += x0.y * weight_cache.s4567;
+    r0 += x0.z * weight_cache.s89ab;
+    r0 += x0.w * weight_cache.scdef;
-    r1 += x.x * w1.s0123;
-    r1 += x.y * w1.s4567;
-    r1 += x.z * w1.s89ab;
-    r1 += x.w * w1.scdef;
+    r1 += x1.x * weight_cache.s0123;
+    r1 += x1.y * weight_cache.s4567;
+    r1 += x1.z * weight_cache.s89ab;
+    r1 += x1.w * weight_cache.scdef;
-    r2 += x.x * w2.s0123;
-    r2 += x.y * w2.s4567;
-    r2 += x.z * w2.s89ab;
-    r2 += x.w * w2.scdef;
+    r2 += x2.x * weight_cache.s0123;
+    r2 += x2.y * weight_cache.s4567;
+    r2 += x2.z * weight_cache.s89ab;
+    r2 += x2.w * weight_cache.scdef;
-    r3 += x.x * w3.s0123;
-    r3 += x.y * w3.s4567;
-    r3 += x.z * w3.s89ab;
-    r3 += x.w * w3.scdef;
+    r3 += x3.x * weight_cache.s0123;
+    r3 += x3.y * weight_cache.s4567;
+    r3 += x3.z * weight_cache.s89ab;
+    r3 += x3.w * weight_cache.scdef;
   }
-  r0 += bias[co];
-  r1 += bias[co];
-  r2 += bias[co];
-  r3 += bias[co];
-  output[((2 * h + 0) * dst_size.y + 2 * w + 0) * dst_size.z + co] = r0;
-  output[((2 * h + 0) * dst_size.y + 2 * w + 1) * dst_size.z + co] = r1;
-  output[((2 * h + 1) * dst_size.y + 2 * w + 0) * dst_size.z + co] = r2;
-  output[((2 * h + 1) * dst_size.y + 2 * w + 1) * dst_size.z + co] = r3;
-}
\ No newline at end of file
+  FLT4 bias_val = read_imagef(biases, smp_zero, (int2)(co, 0));
+  r0 += bias_val;
+  r1 += bias_val;
+  r2 += bias_val;
+  r3 += bias_val;
+  write_imagef(dst_data, (int2)((2 * src_w + kw) * dst_size.z + co, 2 * src_h + kh), r0);
+  write_imagef(dst_data, (int2)((2 * src_w + kw) * dst_size.z + co, 2 * src_h + kh + 2), r1);
+  write_imagef(dst_data, (int2)((2 * src_w + kw + 2) * dst_size.z + co, 2 * src_h + kh), r2);
+  write_imagef(dst_data, (int2)((2 * src_w + kw + 2) * dst_size.z + co, 2 * src_h + kh + 2), r3);
+}
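The fp32 variant is identical to the fp16 kernel above apart from the FLT/FLT4/FLT16 typedefs and the absent cl_khr_fp16 pragma. On the host, the precision choice has to be paired with the matching image channel type, which the conv2d_transpose.cc change below does under an ENABLE_FP16 guard. A hedged sketch of that pairing (Conv2dTransposeVariant, SelectConv2dTransposeVariant, and the relative paths are illustrative, not MindSpore API):

#include <string>
#include <CL/cl.h>

// Illustrative sketch: pair the precision-specific kernel source with the image channel
// type expected by the host-side image allocation. The ENABLE_FP16 switch mirrors the one
// added to conv2d_transpose.cc below; the paths point at the two kernels shown above.
struct Conv2dTransposeVariant {
  std::string cl_path;
  cl_channel_type channel_type;
};

inline Conv2dTransposeVariant SelectConv2dTransposeVariant() {
#ifdef ENABLE_FP16
  return {"cl/fp16/conv2d_transpose2x2.cl", CL_HALF_FLOAT};  // half / half4 / half16 kernel
#else
  return {"cl/fp32/conv2d_transpose2x2.cl", CL_FLOAT};       // float / float4 / float16 kernel
#endif
}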
mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d_transpose.cc
@@ -64,11 +64,8 @@ int Conv2dTransposeOpenCLKernel::Init() {
   auto allocator = ocl_runtime->GetAllocator();
   padWeight_ = reinterpret_cast<FLOAT_T *>(allocator->Malloc(div_ci * div_co * 16 * kh * kw * sizeof(FLOAT_T)));
   padWeight_ = reinterpret_cast<FLOAT_T *>(allocator->MapBuffer(padWeight_, CL_MAP_WRITE, nullptr, true));
-  bias_ = reinterpret_cast<FLOAT_T *>(allocator->Malloc(div_co * 4 * sizeof(FLOAT_T)));
-  bias_ = reinterpret_cast<FLOAT_T *>(allocator->MapBuffer(bias_, CL_MAP_WRITE, nullptr, true));
   PadWeight();
   allocator->UnmapBuffer(padWeight_);
-  allocator->UnmapBuffer(bias_);
   outputs_[0]->SetFormat(schema::Format_NHWC4);
   MS_LOG(DEBUG) << kernel_name << " Init Done!";
   return 0;
@@ -77,7 +74,7 @@ int Conv2dTransposeOpenCLKernel::Init() {
 int Conv2dTransposeOpenCLKernel::ReSize() { return 0; }

 void Conv2dTransposeOpenCLKernel::PadWeight() {
-  // OHWI to OIHW4(I)4(O)
+  // OHWI to OHWI4(I)4(O)
   ConvParameter *param = reinterpret_cast<ConvParameter *>(opParameter);
   int ci = param->input_channel_;
   int co = param->output_channel_;
@@ -86,13 +83,11 @@ void Conv2dTransposeOpenCLKernel::PadWeight() {
   int div_ci = UP_DIV(ci, 4);
   int div_co = UP_DIV(co, 4);
   auto origin_weight = reinterpret_cast<FLOAT_T *>(inputs_.at(kWeightIndex)->Data());
-  auto origin_bias = reinterpret_cast<FLOAT_T *>(inputs_.at(kBiasIndex)->Data());
-  bool has_bias = origin_bias != nullptr;
   int index = 0;
   for (int co_i = 0; co_i < div_co; co_i++) {
-    for (int ci_i = 0; ci_i < div_ci; ci_i++) {
-      for (int kw_i = 0; kw_i < kw; kw_i++) {
-        for (int kh_i = 0; kh_i < kh; kh_i++) {
+    for (int kh_i = 0; kh_i < kh; kh_i++) {
+      for (int kw_i = 0; kw_i < kw; kw_i++) {
+        for (int ci_i = 0; ci_i < div_ci; ci_i++) {
           for (int ci4_i = 0; ci4_i < 4; ci4_i++) {
             for (int co4_i = 0; co4_i < 4; co4_i++) {
               int co_offset = co_i * 4 + co4_i;
@@ -109,16 +104,6 @@ void Conv2dTransposeOpenCLKernel::PadWeight() {
       }
     }
   }
-  for (int co_i = 0; co_i < div_co; co_i++) {
-    for (int co4_i = 0; co4_i < 4; co4_i++) {
-      int co_offset = co_i * 4 + co4_i;
-      if (has_bias && co_offset < co) {
-        bias_[co_offset] = origin_bias[co_offset];
-      } else {
-        bias_[co_offset] = 0.;
-      }
-    }
-  }
 }

 int Conv2dTransposeOpenCLKernel::Run() {
@@ -134,14 +119,31 @@ int Conv2dTransposeOpenCLKernel::Run() {
   int co = param->output_channel_;
   int kh = param->kernel_h_;
   int kw = param->kernel_w_;
-  int pad = kh - 1 - param->pad_h_;
+  int pad = param->pad_h_;
   int oh = outputs_[0]->shape()[1];
   int ow = outputs_[0]->shape()[2];
   int h = inputs_[0]->shape()[1];
   int w = inputs_[0]->shape()[2];
   auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
+  cl::ImageFormat image_format;
+  {
+    image_format.image_channel_order = CL_RGBA;
+#ifdef ENABLE_FP16
+    image_format.image_channel_data_type = CL_HALF_FLOAT;
+#else
+    image_format.image_channel_data_type = CL_FLOAT;
+#endif
+  }
+  cl_int in_error_code, in_error_code_weight, in_error_code_bias, out_error_code;
+  cl::Image2D img_x(*ocl_runtime->Context(), CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, image_format, w * ci / 4, h, 0,
+                    inputs_[0]->Data(), &in_error_code);
+  cl::Image2D img_bias(*ocl_runtime->Context(), CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, image_format, co / 4, 1, 0,
+                       inputs_[2]->Data(), &in_error_code_bias);
+  cl::Image2D out_mem(*ocl_runtime->Context(), CL_MEM_WRITE_ONLY, image_format, ow * co / 4, oh, 0, nullptr,
+                      &out_error_code);
   // local size should less than MAX_GROUP_SIZE
-  std::vector<size_t> local = {4, 4, 32};
+  std::vector<size_t> local = {16, 1, 16};
   std::vector<size_t> global = {UP_ROUND((size_t)oh / 2, local[0]), UP_ROUND((size_t)ow / 2, local[1]),
                                 UP_ROUND((size_t)co / 4, local[2])};
@@ -150,16 +152,19 @@ int Conv2dTransposeOpenCLKernel::Run() {
   cl_int2 padding = {pad, pad};
   cl_int4 src_size = {h, w, UP_DIV(ci, 4), 1};
   cl_int4 dst_size = {oh, ow, UP_DIV(co, 4), 1};
-  ocl_runtime->SetKernelArg(kernel_, 0, inputs_[0]->Data());
+  ocl_runtime->SetKernelArg(kernel_, 0, img_x);
   ocl_runtime->SetKernelArg(kernel_, 1, padWeight_);
-  ocl_runtime->SetKernelArg(kernel_, 2, bias_);
-  ocl_runtime->SetKernelArg(kernel_, 3, outputs_[0]->Data());
+  ocl_runtime->SetKernelArg(kernel_, 2, img_bias);
+  ocl_runtime->SetKernelArg(kernel_, 3, out_mem);
   ocl_runtime->SetKernelArg(kernel_, 4, kernel_size);
   ocl_runtime->SetKernelArg(kernel_, 5, stride);
   ocl_runtime->SetKernelArg(kernel_, 6, padding);
   ocl_runtime->SetKernelArg(kernel_, 7, src_size);
   ocl_runtime->SetKernelArg(kernel_, 8, dst_size);
   ocl_runtime->RunKernel(kernel_, global, local, nullptr);
+  auto origin = cl::array<cl::size_type, 3U>{0, 0, 0};
+  auto region = cl::array<cl::size_type, 3U>{(size_t)(ow * co / 4), (size_t)(oh), 1};
+  ocl_runtime->GetDefaultCommandQueue()->enqueueReadImage(out_mem, CL_TRUE, origin, region, 0, 0,
+                                                          outputs_[0]->Data());
   return 0;
 }
@@ -180,4 +185,3 @@ kernel::LiteKernel *OpenCLConv2dTransposeKernelCreator(const std::vector<lite::t
 REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_DeConv2D, OpenCLConv2dTransposeKernelCreator)
 }  // namespace mindspore::kernel
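The new Run() sizes the NDRange from the output shape: each work-item produces four output pixels and one 4-channel slice, so the grid is (oh/2, ow/2, co/4) rounded up to the work-group size {16, 1, 16}. A small sketch of that computation follows, assuming the usual ceiling-division definitions for UP_DIV and UP_ROUND (assumed here, not copied from the repository; UpDiv/UpRound/GlobalSize are illustrative names):

#include <cstddef>
#include <vector>

// Ceiling division and round-up-to-multiple, matching the common definitions of the
// UP_DIV / UP_ROUND macros used in Run() above (assumed, not taken from the repo).
inline size_t UpDiv(size_t x, size_t y) { return (x + y - 1) / y; }
inline size_t UpRound(size_t x, size_t y) { return UpDiv(x, y) * y; }

// Global work size as computed in the new Run(): one work-item per (kh, kw) position and
// 4-channel output slice, rounded up to the local size {16, 1, 16}.
std::vector<size_t> GlobalSize(size_t oh, size_t ow, size_t co) {
  std::vector<size_t> local = {16, 1, 16};
  return {UpRound(oh / 2, local[0]), UpRound(ow / 2, local[1]), UpRound(co / 4, local[2])};
}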
mindspore/lite/src/runtime/kernel/opencl/kernel/matmul.cc
@@ -94,8 +94,8 @@ void MatMulOpenCLKernel::PadWeight() {
     }
   }
   if (hasBias_) {
-    memcpy(inputs_[2]->Data(), bias_, sizeof(FLOAT_T) * sizeCI.s[0]);
-    for (int i = sizeCI.s[0]; i < sizeCI.s[1] * 4; i++) {
+    memcpy(bias_, inputs_[2]->Data(), sizeof(FLOAT_T) * sizeCO.s[0]);
+    for (int i = sizeCO.s[0]; i < sizeCO.s[1] * 4; i++) {
       bias_[i] = 0;
     }
   }
@@ -118,7 +118,7 @@ int MatMulOpenCLKernel::Run() {
   ocl_runtime->SetKernelArg(kernel_, 1, padWeight_);
   ocl_runtime->SetKernelArg(kernel_, 2, outputs_[0]->Data());
   if (hasBias_) {
-    ocl_runtime->SetKernelArg(kernel_, 3, inputs_[2]->Data());
+    ocl_runtime->SetKernelArg(kernel_, 3, bias_);
   } else {
     ocl_runtime->SetKernelArg(kernel_, 3, nullptr);
   }
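The matmul fix reverses the direction of the bias copy: the bias values now flow from the input tensor into the padded bias_ buffer, whose length appears to be rounded up to a multiple of 4 for the C4 layout, with the tail zero-filled, and the kernel receives bias_ instead of the raw tensor pointer. A minimal sketch of that padding step (PadBiasToC4 and its names are illustrative; in the diff the destination is the mapped bias_ buffer and the sizes come from sizeCO):

#include <cstring>
#include <vector>

// Copy the real co bias values into a buffer padded to a multiple of 4 and zero the tail,
// mirroring the corrected memcpy/loop in MatMulOpenCLKernel::PadWeight() above.
std::vector<float> PadBiasToC4(const float *origin_bias, int co) {
  int co4 = ((co + 3) / 4) * 4;           // co rounded up to a multiple of 4
  std::vector<float> padded(co4, 0.0f);   // tail stays zero
  std::memcpy(padded.data(), origin_bias, sizeof(float) * co);
  return padded;
}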