Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
magicwindyyd
mindspore
提交
59b1138a
M
mindspore
项目概览
magicwindyyd
/
mindspore
与 Fork 源项目一致
Fork自
MindSpore / mindspore
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
M
mindspore
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
59b1138a
编写于
4年前
作者:
M
mindspore-ci-bot
提交者:
Gitee
4年前
浏览文件
操作
浏览文件
下载
差异文件
!5155 opencl convolution support fp16
Merge pull request !5155 from 王东旭/opencl_convolution_fp16
上级
dda115c0
925c2465
变更
3
隐藏空白更改
内联
并排
Showing
3 changed file
with
124 addition
and
160 deletion
+124
-160
mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.cc
...pore/lite/src/runtime/kernel/opencl/kernel/convolution.cc
+74
-92
mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.h
...spore/lite/src/runtime/kernel/opencl/kernel/convolution.h
+4
-2
mindspore/lite/test/ut/src/runtime/kernel/opencl/convolution_tests.cc
...te/test/ut/src/runtime/kernel/opencl/convolution_tests.cc
+46
-66
未找到文件。
mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.cc
浏览文件 @
59b1138a
...
...
@@ -17,9 +17,11 @@
#include <string>
#include <set>
#include <algorithm>
#include "src/common/utils.h"
#include "src/runtime/kernel/opencl/kernel/convolution.h"
#include "src/kernel_registry.h"
#include "include/errorcode.h"
#include "nnacl/fp32/common_func.h"
using
mindspore
::
kernel
::
KERNEL_ARCH
::
kGPU
;
using
mindspore
::
lite
::
KernelRegistrar
;
...
...
@@ -31,6 +33,7 @@ namespace mindspore::kernel {
int
ConvolutionOpenCLKernel
::
Init
()
{
static
int
init_count
=
0
;
auto
ocl_runtime
=
lite
::
opencl
::
OpenCLRuntime
::
GetInstance
();
use_fp16_
=
ocl_runtime
->
GetFp16Enable
();
auto
allocator
=
ocl_runtime
->
GetAllocator
();
std
::
set
<
std
::
string
>
build_options
;
init_count
++
;
...
...
@@ -73,19 +76,15 @@ int ConvolutionOpenCLKernel::Init() {
// allocate winograd memory
if
(
use_winograd_
)
{
#ifdef ENABLE_FP16
size_t
img_dtype
=
CL_HALF_FLOAT
;
size_t
sizeof_datatype
=
2
;
#else
size_t
img_dtype
=
CL_FLOAT
;
size_t
sizeof_datatype
=
4
;
#endif
size_t
size
=
TILES_XY
*
CI_SLICES
*
36
*
sizeof_datatype
;
size_t
img_dtype
=
use_fp16_
?
CL_HALF_FLOAT
:
CL_FLOAT
;
size_t
sizeof_FLT
=
use_fp16_
?
2
:
4
;
size_t
size
=
TILES_XY
*
CI_SLICES
*
36
*
sizeof_FLT
;
size_t
width
=
TILES_XY
;
size_t
height
=
CI_SLICES
*
36
;
winograd_mem0_
=
allocator
->
Malloc
(
size
,
{
width
,
height
,
img_dtype
});
size
=
TILES_XY
*
CO_SLICES
*
36
*
sizeof_
datatype
;
size
=
TILES_XY
*
CO_SLICES
*
36
*
sizeof_
FLT
;
width
=
TILES_XY
;
height
=
CO_SLICES
*
36
;
winograd_mem1_
=
allocator
->
Malloc
(
size
,
{
width
,
height
,
img_dtype
});
...
...
@@ -103,6 +102,7 @@ int ConvolutionOpenCLKernel::Init() {
int
ConvolutionOpenCLKernel
::
InitBuffer
()
{
auto
ocl_runtime
=
lite
::
opencl
::
OpenCLRuntime
::
GetInstance
();
auto
allocator
=
ocl_runtime
->
GetAllocator
();
size_t
sizeof_FLT
=
use_fp16_
?
2
:
4
;
auto
param
=
reinterpret_cast
<
ConvParameter
*>
(
op_parameter_
);
size_t
KH
=
param
->
kernel_h_
;
...
...
@@ -111,15 +111,18 @@ int ConvolutionOpenCLKernel::InitBuffer() {
constexpr
size_t
CO_TILE
=
C4NUM
;
size_t
packed_weight_size
;
if
(
use_winograd_
)
{
packed_weight_size
=
UP_DIV
(
CO
,
8
)
*
6
*
6
*
CI_SLICES
*
2
*
CI_TILE
*
CO_TILE
*
sizeof
(
float
)
;
packed_weight_size
=
UP_DIV
(
CO
,
8
)
*
6
*
6
*
CI_SLICES
*
2
*
CI_TILE
*
CO_TILE
*
sizeof
_FLT
;
}
else
{
packed_weight_size
=
CO_SLICES
*
KH
*
KW
*
CI_SLICES
*
CI_TILE
*
CO_TILE
*
sizeof
(
float
)
;
packed_weight_size
=
CO_SLICES
*
KH
*
KW
*
CI_SLICES
*
CI_TILE
*
CO_TILE
*
sizeof
_FLT
;
}
packed_weight_
=
reinterpret_cast
<
float
*>
(
allocator
->
Malloc
(
packed_weight_size
));
packed_weight_
=
allocator
->
Malloc
(
packed_weight_size
);
auto
packed_weight_fp32
=
reinterpret_cast
<
float
*>
(
packed_weight_
);
auto
packed_weight_fp16
=
reinterpret_cast
<
uint16_t
*>
(
packed_weight_
);
allocator
->
MapBuffer
(
packed_weight_
,
CL_MAP_WRITE
,
nullptr
,
true
);
memset
(
packed_weight_
,
0x00
,
packed_weight_size
);
auto
weight_tensor
=
in_tensors_
[
1
];
auto
origin_weight
=
reinterpret_cast
<
float
*>
(
weight_tensor
->
Data
());
auto
origin_weight_fp32
=
reinterpret_cast
<
float
*>
(
weight_tensor
->
Data
());
auto
origin_weight_fp16
=
reinterpret_cast
<
uint16_t
*>
(
weight_tensor
->
Data
());
if
(
use_winograd_
)
{
// weight: OHWI -> O66I -> O/8 6 6 I/4 O2 I4 O4
...
...
@@ -141,7 +144,11 @@ int ConvolutionOpenCLKernel::InitBuffer() {
for
(
int
kh
=
0
;
kh
<
3
;
++
kh
)
{
for
(
int
kw
=
0
;
kw
<
3
;
++
kw
)
{
const
int
f_index
=
((
co
*
3
+
kh
)
*
3
+
kw
)
*
CI
+
ci
;
in_vals
[
kh
*
3
+
kw
]
=
origin_weight
[
f_index
];
if
(
use_fp16_
)
{
in_vals
[
kh
*
3
+
kw
]
=
ShortToFloat32
(
origin_weight_fp16
[
f_index
]);
}
else
{
in_vals
[
kh
*
3
+
kw
]
=
origin_weight_fp32
[
f_index
];
}
}
}
...
...
@@ -169,14 +176,18 @@ int ConvolutionOpenCLKernel::InitBuffer() {
(((((
co_outer
*
6
+
kh
)
*
6
+
kw
)
*
CI_SLICES
+
ci_outer
)
*
2
+
co_inner_group
)
*
CI_TILE
+
ci_inner
)
*
CO_TILE
+
co_inner
;
packed_weight_
[
dst_idx
]
=
encoded_weight
[
src_idx
++
];
if
(
use_fp16_
)
{
packed_weight_fp16
[
dst_idx
]
=
Float32ToShort
(
encoded_weight
[
src_idx
++
]);
}
else
{
packed_weight_fp32
[
dst_idx
]
=
encoded_weight
[
src_idx
++
];
}
}
}
}
}
}
else
{
// weight: OHWI -> O/4 H W I/4 I4 O4
for
(
int
co
=
0
;
co
<
CO
;
++
co
)
{
for
(
int
co
=
0
,
src_idx
=
0
;
co
<
CO
;
++
co
)
{
for
(
int
kh
=
0
;
kh
<
KH
;
++
kh
)
{
for
(
int
kw
=
0
;
kw
<
KW
;
++
kw
)
{
for
(
int
ci
=
0
;
ci
<
CI
;
++
ci
)
{
...
...
@@ -184,8 +195,13 @@ int ConvolutionOpenCLKernel::InitBuffer() {
auto
co_inner
=
co
%
CO_TILE
;
auto
ci_outer
=
ci
/
CI_TILE
;
auto
ci_inner
=
ci
%
CI_TILE
;
packed_weight_
[((((
co_outer
*
KH
+
kh
)
*
KW
+
kw
)
*
CI_SLICES
+
ci_outer
)
*
CI_TILE
+
ci_inner
)
*
CO_TILE
+
co_inner
]
=
*
(
origin_weight
++
);
size_t
dst_idx
=
((((
co_outer
*
KH
+
kh
)
*
KW
+
kw
)
*
CI_SLICES
+
ci_outer
)
*
CI_TILE
+
ci_inner
)
*
CO_TILE
+
co_inner
;
if
(
use_fp16_
)
{
packed_weight_fp16
[
dst_idx
]
=
origin_weight_fp16
[
src_idx
++
];
}
else
{
packed_weight_fp32
[
dst_idx
]
=
origin_weight_fp32
[
src_idx
++
];
}
}
}
}
...
...
@@ -195,14 +211,11 @@ int ConvolutionOpenCLKernel::InitBuffer() {
// align bias from C to C4
auto
bias_tensor
=
in_tensors_
[
2
];
size_t
packed_bias_size
=
CO_SLICES
*
CO_TILE
*
sizeof
(
float
)
;
packed_bias_
=
reinterpret_cast
<
float
*>
(
allocator
->
Malloc
(
packed_bias_size
)
);
packed_bias_
=
reinterpret_cast
<
float
*>
(
allocator
->
MapBuffer
(
packed_bias_
,
CL_MAP_WRITE
,
nullptr
,
true
)
);
size_t
packed_bias_size
=
CO_SLICES
*
CO_TILE
*
sizeof
_FLT
;
packed_bias_
=
allocator
->
Malloc
(
packed_bias_size
);
allocator
->
MapBuffer
(
packed_bias_
,
CL_MAP_WRITE
,
nullptr
,
true
);
memset
(
packed_bias_
,
0x00
,
packed_bias_size
);
auto
bias_data
=
reinterpret_cast
<
float
*>
(
bias_tensor
->
Data
());
for
(
int
co
=
0
;
co
<
CO
;
++
co
)
{
packed_bias_
[
co
]
=
bias_data
[
co
];
}
memcpy
(
packed_bias_
,
bias_tensor
->
Data
(),
CO
*
sizeof_FLT
);
allocator
->
UnmapBuffer
(
packed_bias_
);
return
RET_OK
;
...
...
@@ -224,11 +237,7 @@ int ConvolutionOpenCLKernel::GetImageSize(size_t idx, std::vector<size_t> *img_s
im_dst_y
=
out_tensors_
[
0
]
->
Height
()
*
CO_SLICES
;
im_dst_x
=
out_tensors_
[
0
]
->
Width
();
}
#ifdef ENABLE_FP16
size_t
img_dtype
=
CL_HALF_FLOAT
;
#else
size_t
img_dtype
=
CL_FLOAT
;
#endif
size_t
img_dtype
=
use_fp16_
?
CL_HALF_FLOAT
:
CL_FLOAT
;
img_size
->
clear
();
img_size
->
push_back
(
im_dst_x
);
img_size
->
push_back
(
im_dst_y
);
...
...
@@ -321,18 +330,9 @@ std::string ConvolutionOpenCLKernel::CodeGenConvolution() {
code
+=
"#define CI_SLICES "
+
std
::
to_string
(
CI_SLICES
)
+
"
\n
"
;
code
+=
"#define CO_SLICES "
+
std
::
to_string
(
CO_SLICES
)
+
"
\n\n
"
;
#ifdef ENABLE_FP16
code
+=
"#pragma OPENCL EXTENSION cl_khr_fp16 : enable
\n
"
"#define FLT4 half4
\n
"
"#define READ_FLT4 read_imageh
\n
"
"#define WRITE_FLT4 write_imageh
\n\n
"
;
#else
code
+=
"#define FLT4 float4
\n
"
"#define READ_FLT4 read_imagef
\n
"
"#define WRITE_FLT4 write_imagef
\n\n
"
;
#endif
if
(
use_fp16_
)
{
code
+=
"#pragma OPENCL EXTENSION cl_khr_fp16 : enable
\n
"
;
}
code
+=
"__constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
\n\n
"
;
...
...
@@ -365,7 +365,7 @@ std::string ConvolutionOpenCLKernel::CodeGenConvolution() {
" {
\n
"
" for (int ci_slice = 0; ci_slice < CI_SLICES; ci_slice++)
\n
"
" {
\n
"
;
code
+=
"FLT4 in_c4 = READ_
FLT4
(input, smp_zero, (int2)(iw * CI_SLICES + ci_slice, ih)); // NHWC4: H WC
\n\n
"
;
code
+=
"FLT4 in_c4 = READ_
IMAGE
(input, smp_zero, (int2)(iw * CI_SLICES + ci_slice, ih)); // NHWC4: H WC
\n\n
"
;
code
+=
" out0_c4 += w0_ic1_oc4[0] * in_c4.x;
\n
"
" out0_c4 += w0_ic1_oc4[1] * in_c4.y;
\n
"
...
...
@@ -389,21 +389,22 @@ std::string ConvolutionOpenCLKernel::CodeGenConvolution() {
}
if
(
OW
*
CO_SLICES
<
65536
)
{
code
+=
" WRITE_
FLT4
(output, (int2)(ow * CO_SLICES + co_slice, oh), out0_c4_bias);// NHWC4: H WC
\n
}"
;
code
+=
" WRITE_
IMAGE
(output, (int2)(ow * CO_SLICES + co_slice, oh), out0_c4_bias);// NHWC4: H WC
\n
}"
;
}
else
{
code
+=
" WRITE_
FLT4
(output, (int2)(oh * CO_SLICES + co_slice, ow), out0_c4_bias);// NHWC4: H WC
\n
}"
;
code
+=
" WRITE_
IMAGE
(output, (int2)(oh * CO_SLICES + co_slice, ow), out0_c4_bias);// NHWC4: H WC
\n
}"
;
}
return
code
;
}
std
::
string
ConvolutionOpenCLKernel
::
CodeGenWinograd4x4To36
()
{
return
"#define UP_DIV(x, y) (((x) + (y) - (1)) / (y))
\n
"
return
"#pragma OPENCL EXTENSION cl_khr_fp16 : enable
\n
"
"#define UP_DIV(x, y) (((x) + (y) - (1)) / (y))
\n
"
"#define PAD 1
\n
"
"
\n
"
"__constant sampler_t
\n
"
"smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;
\n
"
"
\n
"
"constant
float
Bt[36] = {
\n
"
"constant
FLT
Bt[36] = {
\n
"
" 1.0000000000f, 0.0000000000f, -2.5000004768f, -0.0000001192f, 1.0000001192f, 0.0000000000f,
\n
"
" 0.0000000000f, 0.9428091049f, 1.3333333731f, -0.4714044929f, -0.6666667461f, 0.0000000000f,
\n
"
" 0.0000000000f, -0.9428089857f, 1.3333334923f, 0.4714045525f, -0.6666667461f, 0.0000000000f,
\n
"
...
...
@@ -433,52 +434,40 @@ std::string ConvolutionOpenCLKernel::CodeGenWinograd4x4To36() {
" int tile_x = tile_xy % TILE_X;
\n
"
" int tile_y = tile_xy / TILE_X;
\n
"
"
\n
"
" constant
float
*Bt_row = Bt + row * 6;
\n
"
"
float
4 BtD_row[6] = {0};
\n
"
" constant
FLT
*Bt_row = Bt + row * 6;
\n
"
"
FLT
4 BtD_row[6] = {0};
\n
"
" for (int y = 0; y < 6; y++)
\n
"
" {
\n
"
" int y_idx = tile_y * 4 - PAD + y;
\n
"
" for (int x = 0; x < 6; x++)
\n
"
" {
\n
"
" int x_idx = (tile_x * 4 - PAD + x) * SLICES + slice;
\n
"
" BtD_row[x] += Bt_row[y] *
read_imagef
(input, smp_none, (int2)(x_idx, y_idx));
\n
"
" BtD_row[x] += Bt_row[y] *
READ_IMAGE
(input, smp_none, (int2)(x_idx, y_idx));
\n
"
" }
\n
"
" }
\n
"
"
\n
"
" for (int y = 0; y < 6; y++)
\n
"
" {
\n
"
"
float4 acc = (float
4)(0.0f, 0.0f, 0.0f, 0.0f);
\n
"
"
FLT4 acc = (FLT
4)(0.0f, 0.0f, 0.0f, 0.0f);
\n
"
" for (int x = 0; x < 6; x++)
\n
"
" {
\n
"
" acc += BtD_row[x] * Bt[y * 6 + x];
\n
"
" }
\n
"
"// write_imagef(output, (int2)((row * 6 + y) * SLICES + slice, tile_xy), acc); // H WC W=36
\n
"
" write_imagef(output, (int2)(tile_xy, slice * 36 + (row * 6 + y)), acc); // CH W H=36
\n
"
" WRITE_IMAGE(output, (int2)(tile_xy, slice * 36 + (row * 6 + y)), acc); // CH W H=36
\n
"
" }
\n
"
"}"
;
}
std
::
string
ConvolutionOpenCLKernel
::
CodeGenWinogradConvolution
()
{
return
"#define CI_TILE 4
\n
"
return
"#pragma OPENCL EXTENSION cl_khr_fp16 : enable
\n
"
"#define CI_TILE 4
\n
"
"#define H 36
\n
"
"//#define W 256
\n
"
"//#define CI 96
\n
"
"//#define CO 80s
\n
"
"//#define CI_SLICES 24
\n
"
"//#define CO_SLICES 20
\n
"
"
\n
"
"#define FLT4 float4
\n
"
"#define READ_FLT4 read_imagef
\n
"
"#define WRITE_FLT4 write_imagef
\n
"
"
\n
"
"//#define __global
\n
"
"
\n
"
"__constant sampler_t
\n
"
"smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;
\n
"
"
\n
"
"__kernel void WinogradConvolution(__read_only image2d_t input,
\n
"
" __write_only image2d_t output,
\n
"
" __global
float
16 *weight,
\n
"
" __global
FLT
16 *weight,
\n
"
" int4 input_shape, // N 36 H/4*W/4 CI_SLICES
\n
"
" int4 output_shape) // N 36 H/4*W/4 CO_SLICES
\n
"
"{
\n
"
...
...
@@ -501,14 +490,14 @@ std::string ConvolutionOpenCLKernel::CodeGenWinogradConvolution() {
" FLT4 out11 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
\n
"
"
\n
"
" int y_idx = h;
\n
"
" __global
float
16 *weight_ptr = weight + (co_slice / 2 * 36 + h) * CI_SLICES * 2;
\n
"
" __global
FLT
16 *weight_ptr = weight + (co_slice / 2 * 36 + h) * CI_SLICES * 2;
\n
"
" for (int ci_slice = 0; ci_slice < CI_SLICES; ci_slice++)
\n
"
" {
\n
"
" FLT4 in0 = READ_
FLT4
(input, smp_none, (int2)(w + 0, y_idx));
\n
"
" FLT4 in1 = READ_
FLT4
(input, smp_none, (int2)(w + 1, y_idx));
\n
"
" FLT4 in0 = READ_
IMAGE
(input, smp_none, (int2)(w + 0, y_idx));
\n
"
" FLT4 in1 = READ_
IMAGE
(input, smp_none, (int2)(w + 1, y_idx));
\n
"
" y_idx += 36;
\n
"
"
\n
"
"
float
16 weight0 = weight_ptr[0], weight1 = weight_ptr[1];
\n
"
"
FLT
16 weight0 = weight_ptr[0], weight1 = weight_ptr[1];
\n
"
" weight_ptr += 2;
\n
"
"
\n
"
"
\n
"
...
...
@@ -533,18 +522,18 @@ std::string ConvolutionOpenCLKernel::CodeGenWinogradConvolution() {
" out11 += in1.w * weight1.scdef;
\n
"
" }
\n
"
"
\n
"
" WRITE_
FLT4
(output, (int2)(w + 0, (co_slice + 0) * H + h), out00);
\n
"
" WRITE_
IMAGE
(output, (int2)(w + 0, (co_slice + 0) * H + h), out00);
\n
"
" if (w + 1 < W)
\n
"
" {
\n
"
" WRITE_
FLT4
(output, (int2)(w + 1, (co_slice + 0) * H + h), out01);
\n
"
" WRITE_
IMAGE
(output, (int2)(w + 1, (co_slice + 0) * H + h), out01);
\n
"
" }
\n
"
"
\n
"
" if (co_slice + 1 < CO_SLICES)
\n
"
" {
\n
"
" WRITE_
FLT4
(output, (int2)(w + 0, (co_slice + 1) * H + h), out10);
\n
"
" WRITE_
IMAGE
(output, (int2)(w + 0, (co_slice + 1) * H + h), out10);
\n
"
" if (w + 1 < W)
\n
"
" {
\n
"
" WRITE_
FLT4
(output, (int2)(w + 1, (co_slice + 1) * H + h), out11);
\n
"
" WRITE_
IMAGE
(output, (int2)(w + 1, (co_slice + 1) * H + h), out11);
\n
"
" }
\n
"
" }
\n
"
"}"
;
...
...
@@ -552,16 +541,11 @@ std::string ConvolutionOpenCLKernel::CodeGenWinogradConvolution() {
std
::
string
ConvolutionOpenCLKernel
::
CodeGenWinograd36To4x4
()
{
std
::
string
code
=
"//#define TILE_XY 256
\n
"
"//#define SLICES 20
\n
"
"//#define OH 16
\n
"
"//#define OW 256
\n
"
"
\n
"
"//#define __global
\n
"
"#pragma OPENCL EXTENSION cl_khr_fp16 : enable
\n
"
"__constant sampler_t
\n
"
"smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;
\n
"
"
\n
"
"constant
float
At[24] = {
\n
"
"constant
FLT
At[24] = {
\n
"
" 1.0000000000f, 1.0000000000f, 1.0000000000f, 1.0000000000f, 1.0000000000f, 0.0000000000f,
\n
"
" 0.0000000000f, 0.7071067691f, -0.7071067691f, 1.4142135382f, -1.4142135382f, 0.0000000000f,
\n
"
" 0.0000000000f, 0.4999999702f, 0.4999999702f, 1.9999998808f, 1.9999998808f, 0.0000000000f,
\n
"
...
...
@@ -570,7 +554,7 @@ std::string ConvolutionOpenCLKernel::CodeGenWinograd36To4x4() {
"
\n
"
"__kernel void Winograd36To4x4(__read_only image2d_t input,
\n
"
" __write_only image2d_t output,
\n
"
" __global
float
4 *bias,
\n
"
" __global
FLT
4 *bias,
\n
"
" int4 input_shape, // N 36 H/4*W/4 CO_SLICES
\n
"
" int4 output_shape) // N H W CO_SLICES
\n
"
"{
\n
"
...
...
@@ -588,25 +572,25 @@ std::string ConvolutionOpenCLKernel::CodeGenWinograd36To4x4() {
" return;
\n
"
" }
\n
"
"
\n
"
" constant
float
*At_row = At + row * 6;
\n
"
"
float
4 AtM_row[6] = {0};
\n
"
" constant
FLT
*At_row = At + row * 6;
\n
"
"
FLT
4 AtM_row[6] = {0};
\n
"
" for (int y = 0; y < 6; y++)
\n
"
" {
\n
"
" for (int x = 0; x < 6; x++)
\n
"
" {
\n
"
" AtM_row[x] += At_row[y] * read_imagef(input, smp_none, (int2)(tile_xy, slice * 36 + y * 6 + "
"x));
\n
"
" AtM_row[x] += At_row[y] * READ_IMAGE(input, smp_none, (int2)(tile_xy, slice * 36 + y * 6 + x));
\n
"
" }
\n
"
" }
\n
"
"
\n
"
" for (int x = 0; x < 4; x++)
\n
"
" {
\n
"
"
float4 acc = (float
4)(0.0f, 0.0f, 0.0f, 0.0f);
\n
"
"
FLT4 acc = (FLT
4)(0.0f, 0.0f, 0.0f, 0.0f);
\n
"
" for (int y = 0; y < 6; y++)
\n
"
" {
\n
"
" acc += AtM_row[y] * At[x * 6 + y];
\n
"
" }
\n
"
" acc += bias[slice];
\n
"
;
" acc += bias[slice];
\n
"
"
\n
"
;
auto
param
=
reinterpret_cast
<
ConvParameter
*>
(
op_parameter_
);
if
(
param
->
act_type_
==
ActType_Relu
)
{
...
...
@@ -619,9 +603,7 @@ std::string ConvolutionOpenCLKernel::CodeGenWinograd36To4x4() {
" int TILE_X = OW / 4;
\n
"
" int tile_x = tile_xy % TILE_X * 4;
\n
"
" int tile_y = tile_xy / TILE_X * 4;
\n
"
"// write_imagef(output, (int2)(tile_x + x, slice * OH + tile_y + row), acc); // height=CH width=W
\n
"
" write_imagef(output, (int2)((tile_x + x) * SLICES + slice, tile_y + row), acc); // height=H "
"width=WC
\n
"
" WRITE_IMAGE(output, (int2)((tile_x + x) * SLICES + slice, tile_y + row), acc); // height=H width=WC
\n
"
" }
\n
"
"}"
;
return
code
;
...
...
This diff is collapsed.
Click to expand it.
mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.h
浏览文件 @
59b1138a
...
...
@@ -40,6 +40,8 @@ class ConvolutionOpenCLKernel : public OpenCLKernel {
int
GetImageSize
(
size_t
idx
,
std
::
vector
<
size_t
>
*
img_size
)
override
;
private:
bool
use_fp16_
=
false
;
int
CI
;
int
IH
;
int
IW
;
...
...
@@ -48,8 +50,8 @@ class ConvolutionOpenCLKernel : public OpenCLKernel {
int
OW
;
int
CI_SLICES
;
int
CO_SLICES
;
float
*
packed_weight_
=
nullptr
;
float
*
packed_bias_
=
nullptr
;
void
*
packed_weight_
=
nullptr
;
void
*
packed_bias_
=
nullptr
;
bool
use_winograd_
=
false
;
int
TILES_X
;
...
...
This diff is collapsed.
Click to expand it.
mindspore/lite/test/ut/src/runtime/kernel/opencl/convolution_tests.cc
浏览文件 @
59b1138a
...
...
@@ -18,9 +18,10 @@
#include "common/common_test.h"
#include "mindspore/lite/src/common/file_utils.h"
#include "mindspore/lite/src/runtime/opencl/opencl_runtime.h"
#include "nnacl/pack.h"
#include "mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.h"
#include "mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.h"
#include "nnacl/pack.h"
#include "nnacl/fp32/common_func.h"
using
mindspore
::
kernel
::
ConvolutionOpenCLKernel
;
using
mindspore
::
kernel
::
LiteKernel
;
...
...
@@ -34,22 +35,37 @@ void LoadData(void *dst, size_t dst_size, const std::string &file_path) {
if
(
file_path
.
empty
())
{
memset
(
dst
,
0x00
,
dst_size
);
}
else
{
auto
src_data
=
reinterpret_cast
<
float
*>
(
mindspore
::
lite
::
ReadFile
(
file_path
.
c_str
(),
&
dst_size
)
);
auto
src_data
=
mindspore
::
lite
::
ReadFile
(
file_path
.
c_str
(),
&
dst_size
);
memcpy
(
dst
,
src_data
,
dst_size
);
}
}
void
MyCompareOutput
(
lite
::
tensor
::
Tensor
*
output_tensor
,
const
std
::
string
&
file_path
)
{
auto
*
output_data
=
reinterpret_cast
<
float
*>
(
output_tensor
->
Data
());
void
MyCompareOutput
(
lite
::
tensor
::
Tensor
*
output_tensor
,
const
std
::
string
&
file_path
,
const
TypeId
data_type
,
const
float
atol
)
{
size_t
output_size
=
output_tensor
->
Size
();
auto
output_data_ori
=
output_tensor
->
Data
();
auto
expect_data_ori
=
mindspore
::
lite
::
ReadFile
(
file_path
.
c_str
(),
&
output_size
);
std
::
vector
<
float
>
output_data_vec
(
output_tensor
->
ElementsC4Num
());
std
::
vector
<
float
>
expect_data_vec
(
output_tensor
->
ElementsC4Num
());
float
*
output_data
,
*
expect_data
;
if
(
data_type
==
kNumberTypeFloat16
)
{
for
(
int
i
=
0
;
i
<
output_data_vec
.
size
();
++
i
)
{
output_data_vec
[
i
]
=
ShortToFloat32
(
reinterpret_cast
<
uint16_t
*>
(
output_data_ori
)[
i
]);
expect_data_vec
[
i
]
=
ShortToFloat32
(
reinterpret_cast
<
uint16_t
*>
(
expect_data_ori
)[
i
]);
}
output_data
=
output_data_vec
.
data
();
expect_data
=
expect_data_vec
.
data
();
}
else
{
output_data
=
reinterpret_cast
<
float
*>
(
output_data_ori
);
expect_data
=
reinterpret_cast
<
float
*>
(
expect_data_ori
);
}
printf
(
"
\n
output[0:10]:"
);
for
(
int
i
=
0
;
i
<
10
;
i
++
)
{
printf
(
"%d:%.3f "
,
i
,
output_data
[
i
]);
}
printf
(
"
\n
"
);
size_t
output_size
=
output_tensor
->
Size
();
auto
expect_data
=
reinterpret_cast
<
float
*>
(
mindspore
::
lite
::
ReadFile
(
file_path
.
c_str
(),
&
output_size
));
constexpr
float
atol
=
0.5
;
for
(
int
i
=
0
;
i
<
output_tensor
->
ElementsNum
();
++
i
)
{
if
(
std
::
fabs
(
output_data
[
i
]
-
expect_data
[
i
])
>
atol
)
{
printf
(
"error at idx[%d] expect=%.3f output=%.3f
\n
"
,
i
,
expect_data
[
i
],
output_data
[
i
]);
...
...
@@ -61,8 +77,8 @@ void MyCompareOutput(lite::tensor::Tensor *output_tensor, const std::string &fil
printf
(
"COMPARE SUCCESS!
\n\n\n
"
);
}
void
TEST_MAIN
(
schema
::
Format
input_format
,
schema
::
Format
output_format
,
const
std
::
string
&
data_path
,
std
::
string
attr_str
)
{
void
TEST_MAIN
(
schema
::
Format
input_format
,
schema
::
Format
output_format
,
const
TypeId
data_type
,
const
std
::
string
&
data_path
,
std
::
string
attr_str
)
{
auto
param
=
new
(
std
::
nothrow
)
ConvParameter
;
if
(
param
==
nullptr
)
{
return
;
...
...
@@ -87,6 +103,7 @@ void TEST_MAIN(schema::Format input_format, schema::Format output_format, const
std
::
cout
<<
"initialize OpenCLRuntime and OpenCLAllocator"
;
auto
ocl_runtime
=
lite
::
opencl
::
OpenCLRuntime
::
GetInstance
();
ocl_runtime
->
Init
();
ocl_runtime
->
SetFp16Enable
(
data_type
==
kNumberTypeFloat16
);
auto
allocator
=
ocl_runtime
->
GetAllocator
();
std
::
cout
<<
"create Tensors"
;
...
...
@@ -94,7 +111,6 @@ void TEST_MAIN(schema::Format input_format, schema::Format output_format, const
std
::
vector
<
int
>
weight_shape
=
{
param
->
output_channel_
,
param
->
kernel_h_
,
param
->
kernel_w_
,
param
->
input_channel_
};
std
::
vector
<
int
>
bias_shape
=
{
param
->
output_channel_
};
std
::
vector
<
int
>
output_shape
=
{
param
->
output_batch_
,
param
->
output_h_
,
param
->
output_w_
,
param
->
output_channel_
};
auto
data_type
=
kNumberTypeFloat32
;
auto
tensor_type
=
schema
::
NodeType_ValueNode
;
auto
input_tensor
=
lite
::
tensor
::
Tensor
(
data_type
,
input_shape
,
input_format
,
tensor_type
);
auto
weight_tensor
=
lite
::
tensor
::
Tensor
(
data_type
,
weight_shape
,
schema
::
Format_KHWC
,
tensor_type
);
...
...
@@ -121,11 +137,17 @@ void TEST_MAIN(schema::Format input_format, schema::Format output_format, const
input_tensor
.
MallocData
(
allocator
);
// before MapBuffer()
sub_graph
->
Init
();
LoadData
(
input_tensor
.
Data
(),
input_tensor
.
Size
(),
input_file
);
// after MapBuffer()
printf
(
"input[0-2] =%.3f
\n
"
,
reinterpret_cast
<
float
*>
(
input_tensor
.
Data
())[
0
]);
printf
(
"weight[0-2]=%.3f
\n
"
,
reinterpret_cast
<
float
*>
(
weight_tensor
.
Data
())[
0
]);
printf
(
"bias[0-2] =%.3f
\n
"
,
reinterpret_cast
<
float
*>
(
bias_tensor
.
Data
())[
0
]);
if
(
data_type
==
kNumberTypeFloat16
)
{
printf
(
"input[0] =%.3f
\n
"
,
ShortToFloat32
(
reinterpret_cast
<
uint16_t
*>
(
input_tensor
.
Data
())[
0
]));
printf
(
"weight[0]=%.3f
\n
"
,
ShortToFloat32
(
reinterpret_cast
<
uint16_t
*>
(
weight_tensor
.
Data
())[
0
]));
printf
(
"bias[0] =%.3f
\n
"
,
ShortToFloat32
(
reinterpret_cast
<
uint16_t
*>
(
bias_tensor
.
Data
())[
0
]));
}
else
{
printf
(
"input[0] =%.3f
\n
"
,
reinterpret_cast
<
float
*>
(
input_tensor
.
Data
())[
0
]);
printf
(
"weight[0]=%.3f
\n
"
,
reinterpret_cast
<
float
*>
(
weight_tensor
.
Data
())[
0
]);
printf
(
"bias[0] =%.3f
\n
"
,
reinterpret_cast
<
float
*>
(
bias_tensor
.
Data
())[
0
]);
}
sub_graph
->
Run
();
MyCompareOutput
(
&
output_tensor
,
expect_file
);
MyCompareOutput
(
&
output_tensor
,
expect_file
,
data_type
,
(
data_type
==
kNumberTypeFloat16
?
0.7
f
:
0.1
f
)
);
std
::
cout
<<
"release resources"
;
weight_tensor
.
FreeData
();
...
...
@@ -138,72 +160,30 @@ void TEST_MAIN(schema::Format input_format, schema::Format output_format, const
delete
sub_graph
;
}
TEST_F
(
TestConvolutionOpenCL
,
in1x1x64x512_out1x1x64x7358_k11_s11_p0000
)
{
// change W/H
TEST_F
(
TestConvolutionOpenCL
,
in1x224x224x3_out1x112x112x32_k33_s22_p0101_fp32
)
{
TEST_MAIN
(
schema
::
Format_NHWC
,
schema
::
Format_NHWC4
,
"testcases/0
2_fp32/"
,
"inputNHWC_1x
1x64x512_outputNHWC_1x1x64x7358_kernelHW_1x1_strideHW_1x1_padTopBottomLeftRight_0x0x0x0
_dilationHW_"
schema
::
Format_NHWC
,
schema
::
Format_NHWC4
,
kNumberTypeFloat32
,
"testcases/mobilenetv
2_fp32/"
,
"inputNHWC_1x
224x224x3_outputNHWC_1x112x112x32_kernelHW_3x3_strideHW_2x2_padTopBottomLeftRight_0x1x0x1
_dilationHW_"
"1x1"
);
}
TEST_F
(
TestConvolutionOpenCL
,
winograd_02_other_inputNHWC_1x32x512x1_outputNHWC_1x32x512x50
)
{
// speed up
TEST_MAIN
(
schema
::
Format_NHWC
,
schema
::
Format_NHWC4
,
"testcases/test_fp32/"
,
"inputNHWC_1x32x512x1_outputNHWC_1x32x512x50_kernelHW_3x3_strideHW_1x1_padTopBottomLeftRight_1x1x1x1_"
"dilationHW_1x1"
);
}
TEST_F
(
TestConvolutionOpenCL
,
in1x224x224x3_out1x112x112x32_k33_s22_p0101
)
{
TEST_F
(
TestConvolutionOpenCL
,
in1x224x224x3_out1x112x112x32_k33_s22_p0101_fp16
)
{
TEST_MAIN
(
schema
::
Format_NHWC
,
schema
::
Format_NHWC4
,
"testcases/mobilenetv2_fp32
/"
,
schema
::
Format_NHWC
,
schema
::
Format_NHWC4
,
kNumberTypeFloat16
,
"testcases/mobilenetv2_fp16
/"
,
"inputNHWC_1x224x224x3_outputNHWC_1x112x112x32_kernelHW_3x3_strideHW_2x2_padTopBottomLeftRight_0x1x0x1_dilationHW_"
"1x1"
);
}
TEST_F
(
TestConvolutionOpenCL
,
winograd_02_origin_inputNHWC_1x16x256x96_outputNHWC_1x16x256x80
)
{
TEST_MAIN
(
schema
::
Format_NHWC
,
schema
::
Format_NHWC4
,
"testcases/test_fp32/"
,
TEST_F
(
TestConvolutionOpenCL
,
winograd_02_origin_inputNHWC_1x16x256x96_outputNHWC_1x16x256x80
_fp32
)
{
TEST_MAIN
(
schema
::
Format_NHWC
,
schema
::
Format_NHWC4
,
kNumberTypeFloat32
,
"testcases/test_fp32/"
,
"inputNHWC_1x16x256x96_outputNHWC_1x16x256x80_kernelHW_3x3_strideHW_1x1_padTopBottomLeftRight_1x1x1x1_"
"dilationHW_1x1"
);
}
TEST_F
(
TestConvolutionOpenCL
,
winograd_02_origin_inputNHWC_1x16x256x100_outputNHWC_1x16x256x96
)
{
TEST_MAIN
(
schema
::
Format_NHWC
,
schema
::
Format_NHWC4
,
"testcases/test_fp32/"
,
"inputNHWC_1x16x256x100_outputNHWC_1x16x256x96_kernelHW_3x3_strideHW_1x1_padTopBottomLeftRight_1x1x1x1_"
"dilationHW_1x1"
);
}
TEST_F
(
TestConvolutionOpenCL
,
winograd_02_other_inputNHWC_1x32x512x50_outputNHWC_1x32x512x48
)
{
TEST_MAIN
(
schema
::
Format_NHWC
,
schema
::
Format_NHWC4
,
"testcases/02_fp32/"
,
"inputNHWC_1x32x512x50_outputNHWC_1x32x512x48_kernelHW_3x3_strideHW_1x1_padTopBottomLeftRight_1x1x1x1_"
"dilationHW_1x1"
);
}
TEST_F
(
TestConvolutionOpenCL
,
winograd_02_other_inputNHWC_1x8x128x100_outputNHWC_1x8x128x250
)
{
TEST_MAIN
(
schema
::
Format_NHWC
,
schema
::
Format_NHWC4
,
"testcases/02_fp32/"
,
"inputNHWC_1x8x128x100_outputNHWC_1x8x128x250_kernelHW_3x3_strideHW_1x1_padTopBottomLeftRight_1x1x1x1_"
"dilationHW_1x1"
);
}
TEST_F
(
TestConvolutionOpenCL
,
winograd_02_other_inputNHWC_1x8x128x100_outputNHWC_1x8x128x300
)
{
TEST_MAIN
(
schema
::
Format_NHWC
,
schema
::
Format_NHWC4
,
"testcases/02_fp32/"
,
"inputNHWC_1x8x128x100_outputNHWC_1x8x128x300_kernelHW_3x3_strideHW_1x1_padTopBottomLeftRight_1x1x1x1_"
"dilationHW_1x1"
);
}
TEST_F
(
TestConvolutionOpenCL
,
winograd_02_other_inputNHWC_1x4x64x150_outputNHWC_1x4x64x350
)
{
TEST_MAIN
(
schema
::
Format_NHWC
,
schema
::
Format_NHWC4
,
"testcases/02_fp32/"
,
"inputNHWC_1x4x64x150_outputNHWC_1x4x64x350_kernelHW_3x3_strideHW_1x1_padTopBottomLeftRight_1x1x1x1_"
"dilationHW_1x1"
);
}
TEST_F
(
TestConvolutionOpenCL
,
winograd_02_other_inputNHWC_1x4x64x150_outputNHWC_1x4x64x400
)
{
TEST_MAIN
(
schema
::
Format_NHWC
,
schema
::
Format_NHWC4
,
"testcases/02_fp32/"
,
"inputNHWC_1x4x64x150_outputNHWC_1x4x64x400_kernelHW_3x3_strideHW_1x1_padTopBottomLeftRight_1x1x1x1_"
TEST_F
(
TestConvolutionOpenCL
,
winograd_02_origin_inputNHWC_1x16x256x96_outputNHWC_1x16x256x80_fp16
)
{
TEST_MAIN
(
schema
::
Format_NHWC
,
schema
::
Format_NHWC4
,
kNumberTypeFloat16
,
"testcases/test_fp16/"
,
"inputNHWC_1x16x256x96_outputNHWC_1x16x256x80_kernelHW_3x3_strideHW_1x1_padTopBottomLeftRight_1x1x1x1_"
"dilationHW_1x1"
);
}
TEST_F
(
TestConvolutionOpenCL
,
winograd_08_origin_inputNHWC_1x480x480x128_outputNHWC_1x480x480x128
)
{
TEST_MAIN
(
schema
::
Format_NHWC
,
schema
::
Format_NHWC4
,
"testcases/test_fp32/"
,
"inputNHWC_1x480x480x128_outputNHWC_1x480x480x128_kernelHW_3x3_strideHW_1x1_padTopBottomLeftRight_"
"1x1x1x1_dilationHW_1x1"
);
}
}
// namespace mindspore
This diff is collapsed.
Click to expand it.
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录
新手
引导
客服
返回
顶部