Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
magicwindyyd
mindspore
提交
12102ae3
M
mindspore
项目概览
magicwindyyd
/
mindspore
与 Fork 源项目一致
Fork自
MindSpore / mindspore
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
M
mindspore
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
12102ae3
编写于
8月 03, 2020
作者:
M
mindspore-ci-bot
提交者:
Gitee
8月 03, 2020
浏览文件
操作
浏览文件
下载
差异文件
!3830 lite gpu opencl convolutoin kernel speed up
Merge pull request !3830 from 王东旭/master
上级
67005d42
04e4cba6
变更
3
隐藏空白更改
内联
并排
Showing
3 changed files
with
252 additions
and
108 deletions
+252
-108
mindspore/lite/src/runtime/kernel/opencl/cl/fp32/convolution.cl
...ore/lite/src/runtime/kernel/opencl/cl/fp32/convolution.cl
+149
-45
mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.cc
...pore/lite/src/runtime/kernel/opencl/kernel/convolution.cc
+97
-58
mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.h
...spore/lite/src/runtime/kernel/opencl/kernel/convolution.h
+6
-5
未找到文件。
mindspore/lite/src/runtime/kernel/opencl/cl/fp32/convolution.cl
浏览文件 @
12102ae3
#
define
CI_TILE
4
#
define
CO_TILE
4
#
define
UP_DIV
(
x,
y
)
(((
x
)
+
(
y
)
-
(
1
))
/
(
y
))
//#define
__global
//#pragma
OPENCL
EXTENSION
cl_arm_printf
:
enable
__kernel
void
convolution_NHWC_OHWI
(
__global
float
*input,
__global
float
*weight,
__global
float
*bias,
__global
float
*output,
const
uint4
input_shape,
//
NHWC
const
uint4
weight_shape,
//
OHWI
const
uint4
output_shape,
//
NHWC
const
uint2
stride,
//
HW
const
uint4
pad
)
//
top
bottom
left
right
const
int4
input_shape,
//
NHWC
const
int4
output_shape,
//
NHWC
const
int4
kernel_stride,
//
kernelHW_strideHW
const
int4
pad
)
//
top
bottom
left
right
{
uint
ow
=
get_global_id
(
0
)
;
uint
oh
=
get_global_id
(
1
)
;
uint
co_outer
=
get_global_id
(
2
)
;
int
ow
=
get_global_id
(
0
)
;
int
oh
=
get_global_id
(
1
)
;
int
co_slice
=
get_global_id
(
2
)
;
int
CI
=
input_shape.w,
IH
=
input_shape.y,
IW
=
input_shape.z
;
int
CO
=
output_shape.w,
OH
=
output_shape.y,
OW
=
output_shape.z
;
int
KH
=
kernel_stride.x,
KW
=
kernel_stride.y
;
int
strideH
=
kernel_stride.z,
strideW
=
kernel_stride.w
;
int
padTop
=
pad.x,
padLeft
=
pad.z
;
int
CI_SLICES
=
UP_DIV
(
CI,
CI_TILE
)
;
int
CO_SLICES
=
UP_DIV
(
CO,
CO_TILE
)
;
uint
CI
=
input_shape.w,
IH
=
input_shape.y,
IW
=
input_shape.z
;
uint
CO
=
output_shape.w,
OW
=
output_shape.z
;
uint
KH
=
weight_shape.y,
KW
=
weight_shape.z
;
uint
stride_h
=
stride.x,
stride_w
=
stride.y
;
uint
pad_top
=
pad.x,
pad_left
=
pad.z
;
uint
CI_TILE_NUM
=
UP_DIV
(
CI,
CI_TILE
)
;
uint
CO_TILE_NUM
=
UP_DIV
(
CO,
CO_TILE
)
;
if
(
oh
>=
OH
|
| ow >= OW || co_slice >= CO_SLICES)
return;
float4 acc = (float4)(0.0f, 0.0f, 0.0f, 0.0f);
for
(
u
int
kh
=
0
; kh < KH; ++kh)
for (int kh = 0; kh < KH; ++kh)
{
uint
ih
=
kh
+
oh
*
stride_h
-
pad_t
op
;
for
(
u
int
kw
=
0
; kw < KW; ++kw)
int ih = kh + oh * strideH - padT
op;
for (int kw = 0; kw < KW; ++kw)
{
uint
iw
=
kw
+
ow
*
stride_w
-
pad_l
eft
;
for
(
uint
ci_outer
=
0
; ci_outer < CI_TILE_NUM; ++ci_outer
)
int iw = kw + ow * strideW - padL
eft;
for (
int ci_slice = 0; ci_slice < CI_SLICES; ++ci_slice
)
{
for
(
u
int
ci_inner
=
0
; ci_inner < CI_TILE; ++ci_inner)
for (int ci_inner = 0; ci_inner < CI_TILE; ++ci_inner)
{
uint
ci
=
ci_outer
*
CI_TILE
+
ci_inner
;
int ci = ci_slice
* CI_TILE + ci_inner;
if (ci >= CI)
break;
u
int
input_idx
=
ih
*
IW
*
CI
+
iw
*
CI
+
ci
;
int input_idx = ih * IW * CI + iw * CI + ci;
float value = 0;
if (ih < 0 || ih >= IH || iw < 0 || iw >= IW)
value = 0;
else
value = input[input_idx];
uint CO_TILE
_OFFSET = KH * KW * CI;
uint weight_idx = (co_outer * CO_TILE) * CO_TILE
_OFFSET +
kh * KW * CI +
kw * CI +
ci;
acc.x += weight[weight_idx + 0 * CO_
TILE_
OFFSET] * value;
acc.y += weight[weight_idx + 1 * CO_
TILE_
OFFSET] * value;
acc.z += weight[weight_idx + 2 * CO_
TILE_
OFFSET] * value;
acc.w += weight[weight_idx + 3 * CO_
TILE_
OFFSET] * value;
int CO
_OFFSET = KH * KW * CI;
int weight_idx = (co_slice * CO_TILE) * CO
_OFFSET +
kh * KW * CI +
kw * CI +
ci;
acc.x += weight[weight_idx + 0 * CO_OFFSET] * value;
acc.y += weight[weight_idx + 1 * CO_OFFSET] * value;
acc.z += weight[weight_idx + 2 * CO_OFFSET] * value;
acc.w += weight[weight_idx + 3 * CO_OFFSET] * value;
}
}
}
}
uint output_idx = oh * OW * CO + ow * CO + (co_outer
* CO_TILE);
if (co_
outer < CO_TILE_NUM
- 1 |
|
CO
%
CO_TILE
==
0
)
int output_idx = oh * OW * CO + ow * CO + (co_slice
* CO_TILE);
if (co_
slice < CO_SLICES
- 1 || CO % CO_TILE == 0)
{
output[output_idx
+
0]
=
acc.x
+
bias[co_
outer
*
CO_TILE
+
0]
;
output[output_idx
+
1]
=
acc.y
+
bias[co_
outer
*
CO_TILE
+
1]
;
output[output_idx
+
2]
=
acc.z
+
bias[co_
outer
*
CO_TILE
+
2]
;
output[output_idx
+
3]
=
acc.w
+
bias[co_
outer
*
CO_TILE
+
3]
;
output[output_idx + 0] = acc.x + bias[co_
slice
* CO_TILE + 0];
output[output_idx + 1] = acc.y + bias[co_
slice
* CO_TILE + 1];
output[output_idx + 2] = acc.z + bias[co_
slice
* CO_TILE + 2];
output[output_idx + 3] = acc.w + bias[co_
slice
* CO_TILE + 3];
}
else if (CO % CO_TILE == 1)
{
output[output_idx
+
0]
=
acc.x
+
bias[co_
outer
*
CO_TILE
+
0]
;
output[output_idx + 0] = acc.x + bias[co_
slice
* CO_TILE + 0];
}
else if (CO % CO_TILE == 2)
{
output[output_idx
+
0]
=
acc.x
+
bias[co_
outer
*
CO_TILE
+
0]
;
output[output_idx
+
1]
=
acc.y
+
bias[co_
outer
*
CO_TILE
+
1]
;
output[output_idx + 0] = acc.x + bias[co_
slice
* CO_TILE + 0];
output[output_idx + 1] = acc.y + bias[co_
slice
* CO_TILE + 1];
}
else if (CO % CO_TILE == 3)
{
output[output_idx
+
0]
=
acc.x
+
bias[co_outer
*
CO_TILE
+
0]
;
output[output_idx
+
1]
=
acc.y
+
bias[co_outer
*
CO_TILE
+
1]
;
output[output_idx
+
2]
=
acc.z
+
bias[co_outer
*
CO_TILE
+
2]
;
output[output_idx + 0] = acc.x + bias[co_slice * CO_TILE + 0];
output[output_idx + 1] = acc.y + bias[co_slice * CO_TILE + 1];
output[output_idx + 2] = acc.z + bias[co_slice * CO_TILE + 2];
}
}
//#pragma OPENCL EXTENSION cl_khr_fp16 : enable
//#define FLT4 half4
#define FLT4 float4

// NHWC4 convolution computing two output channel slices (8 channels) per
// work-item ("float8"). input/output are channel-sliced NHWC4; the weight is
// pre-packed (see the host-side InitBuffer) so that the weight pointers simply
// advance by 4 FLT4s per input slice as the inner loop runs.
__kernel void convolution_NHWC4_OHWIIO_float8(__global FLT4 *input,
                                              __global FLT4 *weight,
                                              __global FLT4 *bias,
                                              __global FLT4 *output,
                                              const int4 input_shape,    // NHWC
                                              const int4 output_shape,   // NHWC
                                              const int4 kernel_stride,  // kernelHW_strideHW
                                              const int4 pad)            // top bottom left right
{
    int oh = get_global_id(0);        // [0, OH)
    int ow = get_global_id(1);        // [0, OW)
    int co_slice = get_global_id(2);  // [0, UP_DIV(CO, CO_TILE) )

    int CI = input_shape.w, IH = input_shape.y, IW = input_shape.z;
    int CO = output_shape.w, OH = output_shape.y, OW = output_shape.z;
    int CI_SLICES = UP_DIV(CI, CI_TILE);
    int CO_SLICES = UP_DIV(CO, CO_TILE);
    int KH = kernel_stride.x, KW = kernel_stride.y;
    int strideH = kernel_stride.z, strideW = kernel_stride.w;
    int padTop = pad.x, padLeft = pad.z;

    if (oh >= OH || ow >= OW || 2 * co_slice >= CO_SLICES)
        return;

    if (2 * co_slice + 1 >= CO_SLICES)
    {
        // Odd tail: only a single output slice remains for this work-item.
        FLT4 out0_c4 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
        __global FLT4 *w0_ic1_oc4 = weight + (2 * co_slice + 0) * KH * KW * CI_SLICES * CI_TILE;
        for (int kh = 0; kh < KH; ++kh)
        {
            int ih = kh + oh * strideH - padTop;
            for (int kw = 0; kw < KW; ++kw)
            {
                int iw = kw + ow * strideW - padLeft;
                if (ih >= 0 && ih < IH && iw >= 0 && iw < IW)
                {
                    for (int ci_slice = 0; ci_slice < CI_SLICES; ci_slice++)
                    {
                        FLT4 in_c4 = input[ih * IW * CI_SLICES + iw * CI_SLICES + ci_slice];
                        out0_c4 += w0_ic1_oc4[0] * in_c4.x;
                        out0_c4 += w0_ic1_oc4[1] * in_c4.y;
                        out0_c4 += w0_ic1_oc4[2] * in_c4.z;
                        out0_c4 += w0_ic1_oc4[3] * in_c4.w;
                        w0_ic1_oc4 += 4;
                    }
                }
                else
                {
                    // Padding region contributes zero: just skip the weights.
                    w0_ic1_oc4 += 4 * CI_SLICES;
                }
            }
        }
        output[oh * OW * CO_SLICES + ow * CO_SLICES + 2 * co_slice + 0] = out0_c4 + bias[2 * co_slice + 0];
    }
    else
    {
        // Main path: accumulate two adjacent output slices at once so every
        // loaded input FLT4 is reused for 8 output channels.
        FLT4 out0_c4 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
        FLT4 out1_c4 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
        __global FLT4 *w0_ic1_oc4 = weight + (2 * co_slice + 0) * KH * KW * CI_SLICES * CI_TILE;
        __global FLT4 *w1_ic1_oc4 = weight + (2 * co_slice + 1) * KH * KW * CI_SLICES * CI_TILE;
        for (int kh = 0; kh < KH; ++kh)
        {
            int ih = kh + oh * strideH - padTop;
            for (int kw = 0; kw < KW; ++kw)
            {
                int iw = kw + ow * strideW - padLeft;
                if (ih >= 0 && ih < IH && iw >= 0 && iw < IW)
                {
                    int idx = ih * IW * CI_SLICES + iw * CI_SLICES;
                    for (int ci_slice = 0; ci_slice < CI_SLICES; ci_slice++)
                    {
                        FLT4 in_c4 = input[idx + ci_slice];
                        out0_c4 += w0_ic1_oc4[0] * in_c4.x;
                        out0_c4 += w0_ic1_oc4[1] * in_c4.y;
                        out0_c4 += w0_ic1_oc4[2] * in_c4.z;
                        out0_c4 += w0_ic1_oc4[3] * in_c4.w;
                        w0_ic1_oc4 += 4;
                        out1_c4 += w1_ic1_oc4[0] * in_c4.x;
                        out1_c4 += w1_ic1_oc4[1] * in_c4.y;
                        out1_c4 += w1_ic1_oc4[2] * in_c4.z;
                        out1_c4 += w1_ic1_oc4[3] * in_c4.w;
                        w1_ic1_oc4 += 4;
                    }
                }
                else
                {
                    w0_ic1_oc4 += 4 * CI_SLICES;
                    w1_ic1_oc4 += 4 * CI_SLICES;
                }
            }
        }
        output[oh * OW * CO_SLICES + ow * CO_SLICES + 2 * co_slice + 0] = out0_c4 + bias[2 * co_slice + 0];
        output[oh * OW * CO_SLICES + ow * CO_SLICES + 2 * co_slice + 1] = out1_c4 + bias[2 * co_slice + 1];
    }
}
\ No newline at end of file
mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.cc
浏览文件 @
12102ae3
...
...
@@ -14,16 +14,12 @@
* limitations under the License.
*/
#include "src/runtime/kernel/opencl/kernel/convolution.h"
#include <vector>
#include <string>
#include <set>
#include "schema/model_generated.h"
#include "src/kernel_registry.h"
#include "src/runtime/opencl/opencl_runtime.h"
#ifndef PROGRAM_WITH_IL
#include <algorithm>
#include "src/runtime/kernel/opencl/kernel/convolution.h"
#include "src/runtime/kernel/opencl/cl/fp32/convolution.cl.inc"
#
endif
#
include "src/kernel_registry.h"
using
mindspore
::
kernel
::
KERNEL_ARCH
::
kGPU
;
using
mindspore
::
lite
::
KernelRegistrar
;
...
...
@@ -38,27 +34,27 @@ int ConvolutionOpenCLKernel::Init() {
MS_LOG
(
ERROR
)
<<
"ConvolutionOpenCLKernel only support Batch=1!"
;
}
outputs_
[
0
]
->
SetFormat
(
schema
::
Format_NHWC4
);
io_dataformat_
=
outputs_
[
0
]
->
GetFormat
();
auto
io_NHWC
=
inputs_
[
0
]
->
GetFormat
()
==
schema
::
Format_NHWC
&&
outputs_
[
0
]
->
GetFormat
()
==
schema
::
Format_NHWC
;
auto
io_NHWC4
=
inputs_
[
0
]
->
GetFormat
()
==
schema
::
Format_NHWC4
&&
outputs_
[
0
]
->
GetFormat
()
==
schema
::
Format_NHWC4
;
if
(
!
io_NHWC
&&
!
io_NHWC4
)
{
MS_LOG
(
ERROR
)
<<
"input and output data_format is invalid!"
;
}
io_dataformat_
=
inputs_
[
0
]
->
GetFormat
();
if
(
inputs_
[
1
]
->
GetFormat
()
!=
schema
::
Format_KHWC
)
{
MS_LOG
(
ERROR
)
<<
"weight data_format is invalid!"
;
}
auto
ocl_runtime
=
lite
::
opencl
::
OpenCLRuntime
::
GetInstance
();
std
::
string
kernel_name
=
"convolution_NHWC_OHWI"
;
#ifdef PROGRAM_WITH_IL
ocl_runtime
->
CreateKernelFromIL
(
kernel_
(),
kernel_name
);
#else
std
::
set
<
std
::
string
>
build_options
;
std
::
string
source
=
convolution_source_fp32
;
std
::
string
program_name
=
"convolution"
;
std
::
string
kernel_name
=
io_NHWC4
?
"convolution_NHWC4_OHWIIO_float8"
:
"convolution_NHWC_OHWI"
;
auto
ocl_runtime
=
lite
::
opencl
::
OpenCLRuntime
::
GetInstance
();
ocl_runtime
->
LoadSource
(
program_name
,
source
);
ocl_runtime
->
BuildKernel
(
kernel_
,
program_name
,
kernel_name
,
build_options
);
#endif
this
->
InitBuffer
();
MS_LOG
(
DEBUG
)
<<
kernel_name
<<
" Init Done!"
;
return
0
;
}
int
ConvolutionOpenCLKernel
::
InitBuffer
()
{
...
...
@@ -78,35 +74,41 @@ int ConvolutionOpenCLKernel::InitBuffer() {
memcpy_s
(
packed_bias_
,
bias_tensor
->
Size
(),
bias_tensor
->
Data
(),
bias_tensor
->
Size
());
allocator
->
UnmapBuffer
(
packed_bias_
);
}
else
if
(
io_dataformat_
==
schema
::
Format_NHWC4
)
{
// OHWI -> OHWIIO
auto
weight_shape
=
weight_tensor
->
shape
();
size_t
CO
=
weight_shape
[
0
];
size_t
KH
=
weight_shape
[
1
];
size_t
KW
=
weight_shape
[
2
];
size_t
CI
=
weight_shape
[
3
];
size_t
CI_ALIGN
=
UP_DIV
(
CI
,
C4NUM
)
*
C4NUM
;
size_t
CO_ALIGN
=
UP_DIV
(
CO
,
C4NUM
)
*
C4NUM
;
size_t
weight_size_tiled
=
CO_ALIGN
*
KH
*
KW
*
CI_ALIGN
*
sizeof
(
float
);
size_t
CI_SLICES
=
UP_DIV
(
CI
,
C4NUM
);
size_t
CO_SLICES
=
UP_DIV
(
CO
,
C4NUM
);
constexpr
size_t
CI_TILE
=
C4NUM
;
constexpr
size_t
CO_TILE
=
C4NUM
;
size_t
packed_weight_size
=
CO_SLICES
*
KH
*
KW
*
CI_SLICES
*
CI_TILE
*
CO_TILE
*
sizeof
(
float
);
packed_weight_
=
reinterpret_cast
<
float
*>
(
allocator
->
Malloc
(
weight_size_tiled
));
packed_weight_
=
reinterpret_cast
<
float
*>
(
allocator
->
Malloc
(
packed_weight_size
));
packed_weight_
=
reinterpret_cast
<
float
*>
(
allocator
->
MapBuffer
(
packed_weight_
,
CL_MAP_WRITE
,
nullptr
,
true
));
memset_s
(
packed_weight_
,
weight_size_tiled
,
0x00
,
weight_size_tiled
);
memset_s
(
packed_weight_
,
packed_weight_size
,
0x00
,
packed_weight_size
);
auto
weight_data
=
reinterpret_cast
<
float
*>
(
weight_tensor
->
Data
());
for
(
int
co
=
0
;
co
<
CO
;
++
co
)
{
for
(
int
kh
=
0
;
kh
<
KH
;
++
kh
)
{
for
(
int
kw
=
0
;
kw
<
KW
;
++
kw
)
{
for
(
int
ci
=
0
;
ci
<
CI
;
++
ci
)
{
packed_weight_
[
co
*
KH
*
KW
*
CI_ALIGN
+
kh
*
KW
*
CI_ALIGN
+
kw
*
CI_ALIGN
+
ci
]
=
weight_data
[
co
*
KH
*
KW
*
CI
+
kh
*
KW
*
CI
+
kw
*
CI
+
ci
];
auto
co_outer
=
co
/
CO_TILE
;
auto
co_inner
=
co
%
CO_TILE
;
auto
ci_outer
=
ci
/
CI_TILE
;
auto
ci_inner
=
ci
%
CI_TILE
;
packed_weight_
[((((
co_outer
*
KH
+
kh
)
*
KW
+
kw
)
*
CI_SLICES
+
ci_outer
)
*
CI_TILE
+
ci_inner
)
*
CO_TILE
+
co_inner
]
=
*
(
weight_data
++
);
}
}
}
}
allocator
->
UnmapBuffer
(
packed_weight_
);
size_t
bias_size_tiled
=
CO_ALIGN
*
sizeof
(
float
);
packed_bias_
=
reinterpret_cast
<
float
*>
(
allocator
->
Malloc
(
bias_size_tiled
));
size_t
packed_bias_size
=
CO_SLICES
*
CO_TILE
*
sizeof
(
float
);
packed_bias_
=
reinterpret_cast
<
float
*>
(
allocator
->
Malloc
(
packed_bias_size
));
packed_bias_
=
reinterpret_cast
<
float
*>
(
allocator
->
MapBuffer
(
packed_bias_
,
CL_MAP_WRITE
,
nullptr
,
true
));
memset_s
(
packed_bias_
,
bias_size_tiled
,
0x00
,
bias_size_tiled
);
memset_s
(
packed_bias_
,
packed_bias_size
,
0x00
,
packed_bias_size
);
auto
bias_data
=
reinterpret_cast
<
float
*>
(
bias_tensor
->
Data
());
for
(
int
co
=
0
;
co
<
CO
;
++
co
)
{
packed_bias_
[
co
]
=
bias_data
[
co
];
...
...
@@ -115,47 +117,80 @@ int ConvolutionOpenCLKernel::InitBuffer() {
}
return
0
;
}
}
// namespace mindspore::kernel
// ReSize is a no-op for this kernel: buffers are (re)built in Init/InitBuffer.
// NOTE(review): returns 0 unconditionally — presumably 0 means success; confirm
// against the LiteKernel return-code convention.
int ConvolutionOpenCLKernel::ReSize() { return 0; }
// Return the largest divisor of x that is <= y (used by GetLocalSize to pick a
// local work size that evenly divides the padded global size).
// Returns 1 when no divisor in [1, y] exists or when y <= 0. Note that for
// x == 0 every candidate divides (0 % i == 0), so the result is y itself.
static int GetBiggestDivider(int x, int y) {
  // Guard: without this, the descending loop below never reaches 0 for a
  // negative y and spins forever.
  if (y <= 0) {
    return 1;
  }
  for (int i = y; i != 0; i--) {
    if (x % i == 0) {
      return i;
    }
  }
  return 1;
}
// Choose OpenCL global/local work sizes for the convolution kernel.
// On return, *global is {H, W, C-slices} each rounded up to a multiple of the
// matching local dimension, and *local is sized so local_h*local_w*local_c
// stays within the hard-coded budget below.
// NOTE(review): work_group_size / max_work_item_sizes / max_work_group_size are
// hard-coded rather than queried from the device — confirm they match the
// target GPU's actual limits.
static void GetLocalSize(const ConvParameter *param, std::vector<size_t> *global, std::vector<size_t> *local) {
  constexpr size_t work_group_size[] = {4, 4, 1};
  constexpr size_t max_work_item_sizes[] = {512, 512, 512};
  constexpr size_t max_work_group_size = 512;
  const size_t max_z_size = std::min<size_t>(16, max_work_item_sizes[2]);

  // First initialize global with OH / OW / CO_SLICES, aligned up to the
  // {4, 4, 1} work-group size.
  size_t global_h = UP_DIV(param->output_h_, work_group_size[0]) * work_group_size[0];
  size_t global_w = UP_DIV(param->output_w_, work_group_size[1]) * work_group_size[1];
  size_t global_c = UP_DIV(UP_DIV(param->output_channel_, C4NUM), work_group_size[2]) * work_group_size[2];

  // Then derive local: largest divisor of global_c up to max_z_size, and the
  // remaining work-group budget (<=256) split between W first, then H.
  size_t local_c = GetBiggestDivider(global_c, max_z_size);
  size_t local_hw_size = std::min<size_t>(256, max_work_group_size) / local_c;
  size_t local_w = std::min(global_w, local_hw_size);
  size_t local_h = std::min(local_hw_size / local_w, global_h);
  if (local_h == global_h && global_h % 2 == 0) {
    // Halve so there are at least two work-groups along H.
    local_h = global_h / 2;
  }

  // Re-align global to the final local sizes before publishing both.
  global->clear();
  global->push_back(UP_DIV(param->output_h_, local_h) * local_h);
  global->push_back(UP_DIV(param->output_w_, local_w) * local_w);
  global->push_back(UP_DIV(UP_DIV(param->output_channel_, C4NUM), local_c) * local_c);
  local->clear();
  local->push_back(local_h);
  local->push_back(local_w);
  local->push_back(local_c);
}
int
ConvolutionOpenCLKernel
::
Run
()
{
MS_LOG
(
DEBUG
)
<<
this
->
Name
()
<<
" Running!
"
;
MS_LOG
(
INFO
)
<<
"ConvolutionOpenCLKernel::Run()
"
;
auto
ocl_runtime
=
lite
::
opencl
::
OpenCLRuntime
::
GetInstance
();
auto
param
=
reinterpret_cast
<
ConvParameter
*>
(
opParameter
);
auto
input0_shape
=
inputs_
[
0
]
->
shape
();
// NHWC
auto
input1_shape
=
inputs_
[
1
]
->
shape
();
// OHWI
auto
outpu0_shape
=
outputs_
[
0
]
->
shape
();
// NHWC
cl_uint
N
=
input0_shape
[
0
];
cl_uint
CI
=
input0_shape
[
3
];
cl_uint
IH
=
input0_shape
[
1
];
cl_uint
IW
=
input0_shape
[
2
];
cl_uint
CO
=
outpu0_shape
[
3
];
cl_uint
OH
=
outpu0_shape
[
1
];
cl_uint
OW
=
outpu0_shape
[
2
];
cl_uint
KH
=
input1_shape
[
1
];
cl_uint
KW
=
input1_shape
[
2
];
cl_uint
CI_TILE_NUM
=
UP_DIV
(
CI
,
C4NUM
);
cl_uint
CO_TILE_NUM
=
UP_DIV
(
CO
,
C4NUM
);
cl_uint
CI_ALIGN
=
CI_TILE_NUM
*
C4NUM
;
cl_uint
CO_ALIGN
=
CO_TILE_NUM
*
C4NUM
;
cl_uint4
input_shape
;
cl_uint4
weight_shape
;
cl_uint4
output_shape
;
cl_int
N
=
input0_shape
[
0
];
cl_int
CI
=
input0_shape
[
3
];
cl_int
IH
=
input0_shape
[
1
];
cl_int
IW
=
input0_shape
[
2
];
cl_int
CO
=
outpu0_shape
[
3
];
cl_int
OH
=
outpu0_shape
[
1
];
cl_int
OW
=
outpu0_shape
[
2
];
cl_int
KH
=
input1_shape
[
1
];
cl_int
KW
=
input1_shape
[
2
];
cl_int
CI_ALIGN
=
UP_DIV
(
CI
,
C4NUM
)
*
C4NUM
;
cl_int
CO_ALIGN
=
UP_DIV
(
CO
,
C4NUM
)
*
C4NUM
;
cl_int4
input_shape
;
cl_int4
output_shape
;
if
(
io_dataformat_
==
schema
::
Format_NHWC
)
{
input_shape
=
{
N
,
IH
,
IW
,
CI
};
weight_shape
=
{
CO
,
KH
,
KW
,
CI
};
output_shape
=
{
N
,
OH
,
OW
,
CO
};
}
else
if
(
io_dataformat_
==
schema
::
Format_NHWC4
)
{
input_shape
=
{
N
,
IH
,
IW
,
CI_ALIGN
};
weight_shape
=
{
CO_ALIGN
,
KH
,
KW
,
CI_ALIGN
};
output_shape
=
{
N
,
OH
,
OW
,
CO_ALIGN
};
}
cl_uint2
stride
=
{
static_cast
<
cl_uint
>
(
param
->
stride_h_
),
static_cast
<
cl_uint
>
(
param
->
stride_w_
)};
cl_uint4
pad
=
{
static_cast
<
cl_uint
>
(
param
->
pad_u_
),
static_cast
<
cl_uint
>
(
param
->
pad_d_
),
static_cast
<
cl_uint
>
(
param
->
pad_l_
),
static_cast
<
cl_uint
>
(
param
->
pad_r_
)};
cl_int4
kernel_stride
=
{
KH
,
KW
,
param
->
stride_h_
,
param
->
stride_w_
};
cl_int4
pad
=
{
param
->
pad_u_
,
param
->
pad_d_
,
param
->
pad_l_
,
param
->
pad_r_
};
int
arg_cn
=
0
;
ocl_runtime
->
SetKernelArg
(
kernel_
,
arg_cn
++
,
inputs_
[
0
]
->
Data
());
...
...
@@ -163,14 +198,19 @@ int ConvolutionOpenCLKernel::Run() {
ocl_runtime
->
SetKernelArg
(
kernel_
,
arg_cn
++
,
packed_bias_
);
ocl_runtime
->
SetKernelArg
(
kernel_
,
arg_cn
++
,
outputs_
[
0
]
->
Data
());
ocl_runtime
->
SetKernelArg
(
kernel_
,
arg_cn
++
,
input_shape
);
ocl_runtime
->
SetKernelArg
(
kernel_
,
arg_cn
++
,
weight_shape
);
ocl_runtime
->
SetKernelArg
(
kernel_
,
arg_cn
++
,
output_shape
);
ocl_runtime
->
SetKernelArg
(
kernel_
,
arg_cn
++
,
stride
);
ocl_runtime
->
SetKernelArg
(
kernel_
,
arg_cn
++
,
kernel_
stride
);
ocl_runtime
->
SetKernelArg
(
kernel_
,
arg_cn
++
,
pad
);
std
::
vector
<
size_t
>
global
=
{
OW
,
OH
,
CO_TILE_NUM
};
std
::
vector
<
size_t
>
local
=
{
1
,
1
,
CO_TILE_NUM
};
std
::
vector
<
size_t
>
global
;
std
::
vector
<
size_t
>
local
;
GetLocalSize
(
reinterpret_cast
<
ConvParameter
*>
(
this
->
opParameter
),
&
global
,
&
local
);
// float8 per thread
if
(
io_dataformat_
==
schema
::
Format_NHWC4
)
{
local
[
2
]
=
UP_DIV
(
local
[
2
],
2
);
global
[
2
]
=
UP_DIV
(
global
[
2
],
2
);
global
[
2
]
=
UP_DIV
(
global
[
2
],
global
[
2
])
*
global
[
2
];
}
ocl_runtime
->
RunKernel
(
kernel_
,
global
,
local
,
nullptr
);
return
0
;
...
...
@@ -196,4 +236,3 @@ kernel::LiteKernel *OpenCLConvolutionKernelCreator(const std::vector<lite::tenso
REG_KERNEL
(
kGPU
,
kNumberTypeFloat32
,
PrimitiveType_Conv2D
,
OpenCLConvolutionKernelCreator
)
}
// namespace mindspore::kernel
mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.h
浏览文件 @
12102ae3
...
...
@@ -14,11 +14,13 @@
* limitations under the License.
*/
#ifndef MINDSPORE_LITE_SRC_
BACKEND_OPENCL_CONVOLUTIONOPENCLKERNEL
_H_
#define MINDSPORE_LITE_SRC_
BACKEND_OPENCL_CONVOLUTIONOPENCLKERNEL
_H_
#ifndef MINDSPORE_LITE_SRC_
RUNTIME_KERNEL_OPENCL_KERNEL_CONVOLUTION
_H_
#define MINDSPORE_LITE_SRC_
RUNTIME_KERNEL_OPENCL_KERNEL_CONVOLUTION
_H_
#include <vector>
#include "src/runtime/kernel/arm/fp32/convolution.h"
#include "src/ir/tensor.h"
#include "src/lite_kernel.h"
#include "schema/model_generated.h"
#include "src/runtime/opencl/opencl_runtime.h"
#include "src/runtime/kernel/arm/opclib/conv_parameter.h"
...
...
@@ -44,5 +46,4 @@ class ConvolutionOpenCLKernel : public LiteKernel {
};
}
// namespace mindspore::kernel
#endif // MINDSPORE_LITE_SRC_BACKEND_OPENCL_CONVOLUTIONOPENCLKERNEL_H_
#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_KERNEL_CONVOLUTION_H_
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录