Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
magicwindyyd
mindspore
提交
9001dc48
M
mindspore
项目概览
magicwindyyd
/
mindspore
与 Fork 源项目一致
Fork自
MindSpore / mindspore
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
M
mindspore
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
9001dc48
编写于
8月 19, 2020
作者:
M
mindspore-ci-bot
提交者:
Gitee
8月 19, 2020
浏览文件
操作
浏览文件
下载
差异文件
!4672 optimize opencl winograd kernel performance
Merge pull request !4672 from 王东旭/opencl_winograd_optimize
上级
147c7457
d0ff232a
变更
3
隐藏空白更改
内联
并排
Showing 3 changed files with 103 additions and 65 deletions
+103
-65
mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.cc
...pore/lite/src/runtime/kernel/opencl/kernel/convolution.cc
+46
-36
mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.h
...spore/lite/src/runtime/kernel/opencl/kernel/convolution.h
+2
-2
mindspore/lite/test/ut/src/runtime/kernel/opencl/convolution_tests.cc
...te/test/ut/src/runtime/kernel/opencl/convolution_tests.cc
+55
-27
未找到文件。
mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.cc
浏览文件 @
9001dc48
...
...
@@ -476,12 +476,10 @@ std::string ConvolutionOpenCLKernel::CodeGenWinograd4x4To36() {
std
::
string
ConvolutionOpenCLKernel
::
CodeGenWinogradConvolution
()
{
return
"#define CI_TILE 4
\n
"
"#define H 36
\n
"
"//#define W 256
\n
"
"//#define CI 96
\n
"
"#define IH 36
\n
"
"//#define IW 256
\n
"
"//#define CO 80
\n
"
"#define OH 36
\n
"
"//#define OW 256
\n
"
"//#define CO 80s
\n
"
"//#define CI_SLICES 24
\n
"
"//#define CO_SLICES 20
\n
"
"
\n
"
...
...
@@ -500,59 +498,71 @@ std::string ConvolutionOpenCLKernel::CodeGenWinogradConvolution() {
" int4 input_shape, // N 36 H/4*W/4 CI_SLICES
\n
"
" int4 output_shape) // N 36 H/4*W/4 CO_SLICES
\n
"
"{
\n
"
" int
o
w = get_global_id(0) * 2;
\n
"
" int
o
h = get_global_id(1);
\n
"
" int w = get_global_id(0) * 2;
\n
"
" int h = get_global_id(1);
\n
"
" int co_slice = get_global_id(2) * 2;
\n
"
"
\n
"
" int CI_SLICES = input_shape.w;
\n
"
" int
I
W = input_shape.z;
\n
"
" int W = input_shape.z;
\n
"
" int CO_SLICES = output_shape.w;
\n
"
" int OW = IW;
\n
"
"
\n
"
" if (
oh >= OH || ow >= O
W || co_slice >= CO_SLICES)
\n
"
" if (
h >= H || w >=
W || co_slice >= CO_SLICES)
\n
"
" {
\n
"
" return;
\n
"
" }
\n
"
"
\n
"
" __global float16 *w_ptr = weight + (co_slice / 2 * 36 + oh) * CI_SLICES * 2;
\n
"
" int y_idx = oh;
\n
"
" FLT4 out00 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
\n
"
" FLT4 out01 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
\n
"
" FLT4 out10 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
\n
"
" FLT4 out11 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
\n
"
"
\n
"
" int y_idx = h;
\n
"
" __global float16 *weight_ptr = weight + (co_slice / 2 * 36 + h) * CI_SLICES * 2;
\n
"
" for (int ci_slice = 0; ci_slice < CI_SLICES; ci_slice++)
\n
"
" {
\n
"
" FLT4 in0 = READ_FLT4(input, smp_none, (int2)(
o
w + 0, y_idx));
\n
"
" FLT4 in1 = READ_FLT4(input, smp_none, (int2)(
o
w + 1, y_idx));
\n
"
" FLT4 in0 = READ_FLT4(input, smp_none, (int2)(w + 0, y_idx));
\n
"
" FLT4 in1 = READ_FLT4(input, smp_none, (int2)(w + 1, y_idx));
\n
"
" y_idx += 36;
\n
"
"
\n
"
" float16 w0 = w_ptr[0], w1 = w_ptr[1];
\n
"
" w_ptr += 2;
\n
"
" float16 weight0 = weight_ptr[0], weight1 = weight_ptr[1];
\n
"
" weight_ptr += 2;
\n
"
"
\n
"
"
\n
"
" out00 += in0.x * w0.s0123;
\n
"
" out00 += in0.y * w0.s4567;
\n
"
" out00 += in0.z * w0.s89ab;
\n
"
" out00 += in0.w * w0.scdef;
\n
"
" out00 += in0.x * w
eight
0.s0123;
\n
"
" out00 += in0.y * w
eight
0.s4567;
\n
"
" out00 += in0.z * w
eight
0.s89ab;
\n
"
" out00 += in0.w * w
eight
0.scdef;
\n
"
"
\n
"
" out01 += in1.x * w0.s0123;
\n
"
" out01 += in1.y * w0.s4567;
\n
"
" out01 += in1.z * w0.s89ab;
\n
"
" out01 += in1.w * w0.scdef;
\n
"
" out01 += in1.x * w
eight
0.s0123;
\n
"
" out01 += in1.y * w
eight
0.s4567;
\n
"
" out01 += in1.z * w
eight
0.s89ab;
\n
"
" out01 += in1.w * w
eight
0.scdef;
\n
"
"
\n
"
" out10 += in0.x * w1.s0123;
\n
"
" out10 += in0.y * w1.s4567;
\n
"
" out10 += in0.z * w1.s89ab;
\n
"
" out10 += in0.w * w1.scdef;
\n
"
" out10 += in0.x * w
eight
1.s0123;
\n
"
" out10 += in0.y * w
eight
1.s4567;
\n
"
" out10 += in0.z * w
eight
1.s89ab;
\n
"
" out10 += in0.w * w
eight
1.scdef;
\n
"
"
\n
"
" out11 += in1.x * w1.s0123;
\n
"
" out11 += in1.y * w1.s4567;
\n
"
" out11 += in1.z * w1.s89ab;
\n
"
" out11 += in1.w * w1.scdef;
\n
"
" out11 += in1.x * weight1.s0123;
\n
"
" out11 += in1.y * weight1.s4567;
\n
"
" out11 += in1.z * weight1.s89ab;
\n
"
" out11 += in1.w * weight1.scdef;
\n
"
" }
\n
"
"
\n
"
" WRITE_FLT4(output, (int2)(w + 0, (co_slice + 0) * H + h), out00);
\n
"
" if (w + 1 < W)
\n
"
" {
\n
"
" WRITE_FLT4(output, (int2)(w + 1, (co_slice + 0) * H + h), out01);
\n
"
" }
\n
"
"
\n
"
" if (co_slice + 1 < CO_SLICES)
\n
"
" {
\n
"
" WRITE_FLT4(output, (int2)(w + 0, (co_slice + 1) * H + h), out10);
\n
"
" if (w + 1 < W)
\n
"
" {
\n
"
" WRITE_FLT4(output, (int2)(w + 1, (co_slice + 1) * H + h), out11);
\n
"
" }
\n
"
" }
\n
"
" WRITE_FLT4(output, (int2)(ow + 0, (co_slice + 0) * 36 + oh), out00);
\n
"
" WRITE_FLT4(output, (int2)(ow + 1, (co_slice + 0) * 36 + oh), out01);
\n
"
" WRITE_FLT4(output, (int2)(ow + 0, (co_slice + 1) * 36 + oh), out10);
\n
"
" WRITE_FLT4(output, (int2)(ow + 1, (co_slice + 1) * 36 + oh), out11);
\n
"
"}"
;
}
...
...
mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.h
浏览文件 @
9001dc48
...
...
@@ -66,8 +66,8 @@ class ConvolutionOpenCLKernel : public OpenCLKernel {
auto
param
=
reinterpret_cast
<
ConvParameter
*>
(
op_parameter_
);
const
bool
attr_valid
=
param
->
kernel_h_
==
3
&&
param
->
kernel_w_
==
3
&&
param
->
dilation_h_
==
1
&&
param
->
dilation_w_
==
1
&&
param
->
stride_h_
==
1
&&
param
->
stride_w_
==
1
;
const
bool
channel_good
=
C
O_SLICES
%
4
==
0
&&
CI_SLICES
>=
16
&&
CO_SLICES
>=
16
;
const
bool
hw_good
=
TILES_X
*
TILES_Y
>=
32
;
const
bool
channel_good
=
C
I_SLICES
>=
12
&&
CO_SLICES
>=
12
;
const
bool
hw_good
=
TILES_X
*
TILES_Y
>=
16
;
return
attr_valid
&&
channel_good
&&
hw_good
;
}
...
...
mindspore/lite/test/ut/src/runtime/kernel/opencl/convolution_tests.cc
浏览文件 @
9001dc48
...
...
@@ -77,33 +77,33 @@ void TEST_MAIN(schema::Format input_format, schema::Format output_format, const
auto
testcase_path
=
data_path
+
"/"
+
attr_str
+
"/"
;
auto
input_file
=
testcase_path
+
(
input_format
==
schema
::
Format_NHWC4
?
"input_NHWC4.bin"
:
"input_NHWC.bin"
);
auto
weight_file
=
testcase_path
+
"weight_OHWI.bin"
;
auto
bias_file
=
testcase_path
+
"bias_C
4
.bin"
;
auto
bias_file
=
testcase_path
+
"bias_C.bin"
;
auto
expect_file
=
testcase_path
+
(
output_format
==
schema
::
Format_NHWC4
?
"expect_NHWC4.bin"
:
"expect_NHWC.bin"
);
std
::
cout
<<
input_file
<<
std
::
endl
;
std
::
cout
<<
weight_file
<<
std
::
endl
;
std
::
cout
<<
bias_file
<<
std
::
endl
;
std
::
cout
<<
expect_file
<<
std
::
endl
;
std
::
cout
<<
"input_file:"
<<
input_file
<<
std
::
endl
;
std
::
cout
<<
"weight_file:"
<<
weight_file
<<
std
::
endl
;
std
::
cout
<<
"bias_file:"
<<
bias_file
<<
std
::
endl
;
std
::
cout
<<
"expect_file:"
<<
expect_file
<<
std
::
endl
;
std
::
cout
<<
"initialize OpenCLRuntime"
;
auto
ocl_runtime
=
lite
::
opencl
::
OpenCLRuntime
::
GetInstance
();
ocl_runtime
->
Init
();
auto
allocator
=
ocl_runtime
->
GetAllocator
();
std
::
cout
<<
"create
inputs/weights/outputs Tensors(framework do
)"
;
std
::
cout
<<
"create
Tensors(framework will do!!!
)"
;
std
::
vector
<
int
>
input_shape
=
{
param
->
input_batch_
,
param
->
input_h_
,
param
->
input_w_
,
param
->
input_channel_
};
std
::
vector
<
int
>
weight_shape
=
{
param
->
output_channel_
,
param
->
kernel_h_
,
param
->
kernel_w_
,
param
->
input_channel_
};
std
::
vector
<
int
>
bias_shape
=
{
param
->
output_channel_
};
std
::
vector
<
int
>
output_shape
=
{
param
->
output_batch_
,
param
->
output_h_
,
param
->
output_w_
,
param
->
output_channel_
};
auto
data_type
=
kNumberTypeFloat32
;
auto
tensor
T
ype
=
schema
::
NodeType_ValueNode
;
auto
input_tensor
=
new
lite
::
tensor
::
Tensor
(
data_type
,
input_shape
,
input_format
,
tensor
T
ype
);
auto
weight_tensor
=
new
lite
::
tensor
::
Tensor
(
data_type
,
weight_shape
,
schema
::
Format_KHWC
,
tensor
T
ype
);
auto
bias_tensor
=
new
lite
::
tensor
::
Tensor
(
data_type
,
bias_shape
,
schema
::
Format_KHWC
,
tensor
T
ype
);
auto
output_tensor
=
new
lite
::
tensor
::
Tensor
(
data_type
,
output_shape
,
output_format
,
tensor
T
ype
);
auto
tensor
_t
ype
=
schema
::
NodeType_ValueNode
;
auto
input_tensor
=
new
lite
::
tensor
::
Tensor
(
data_type
,
input_shape
,
input_format
,
tensor
_t
ype
);
auto
weight_tensor
=
new
lite
::
tensor
::
Tensor
(
data_type
,
weight_shape
,
schema
::
Format_KHWC
,
tensor
_t
ype
);
auto
bias_tensor
=
new
lite
::
tensor
::
Tensor
(
data_type
,
bias_shape
,
schema
::
Format_KHWC
,
tensor
_t
ype
);
auto
output_tensor
=
new
lite
::
tensor
::
Tensor
(
data_type
,
output_shape
,
output_format
,
tensor
_t
ype
);
std
::
vector
<
lite
::
tensor
::
Tensor
*>
inputs
{
input_tensor
,
weight_tensor
,
bias_tensor
};
std
::
vector
<
lite
::
tensor
::
Tensor
*>
outputs
{
output_tensor
};
std
::
cout
<<
"
initialize weight Tensors data(framework do
)"
;
std
::
cout
<<
"
allocate and initialize weight/bias memory by hand here(framework will do!!!
)"
;
std
::
vector
<
float
>
weight_vec
(
weight_tensor
->
ElementsNum
());
std
::
vector
<
float
>
bias_vec
(
weight_tensor
->
ElementsNum
());
weight_tensor
->
SetData
(
weight_vec
.
data
());
...
...
@@ -111,25 +111,18 @@ void TEST_MAIN(schema::Format input_format, schema::Format output_format, const
LoadData
(
weight_tensor
->
Data
(),
weight_tensor
->
Size
(),
weight_file
);
LoadData
(
bias_tensor
->
Data
(),
bias_tensor
->
Size
(),
bias_file
);
std
::
cout
<<
"create OpenCL Kernel"
;
// weight has been allocated by framework
std
::
cout
<<
"create OpenCL Kernel"
;
// weight/bias has been allocated by framework
auto
*
conv_kernel
=
new
ConvolutionOpenCLKernel
(
reinterpret_cast
<
OpParameter
*>
(
param
),
inputs
,
outputs
);
conv_kernel
->
Init
();
std
::
vector
<
LiteKernel
*>
kernels
{
conv_kernel
};
// framework to do!!! allocate memory by hand
inputs
[
0
]
->
MallocData
(
allocator
);
std
::
cout
<<
"create SubGraphOpenCLKernel"
;
auto
*
sub_graph
=
new
SubGraphOpenCLKernel
({
input_tensor
},
outputs
,
kernels
,
kernels
,
kernels
);
inputs
[
0
]
->
MallocData
(
allocator
);
// allocate input memory by hand here, framework will do!!!
auto
*
sub_graph
=
new
SubGraphOpenCLKernel
({
input_tensor
},
outputs
,
{
conv_kernel
},
{
conv_kernel
},
{
conv_kernel
});
sub_graph
->
Init
();
std
::
cout
<<
"initialize input Tensors data"
;
// inputs have been allocated by sub_graph->Init()
LoadData
(
input_tensor
->
Data
(),
input_tensor
->
Size
(),
input_file
);
LoadData
(
input_tensor
->
Data
(),
input_tensor
->
Size
(),
input_file
);
// initialize input Tensors data
printf
(
"input[0] =%.3f
\n
"
,
reinterpret_cast
<
float
*>
(
input_tensor
->
Data
())[
0
]);
printf
(
"weight[0]=%.3f
\n
"
,
reinterpret_cast
<
float
*>
(
weight_tensor
->
Data
())[
0
]);
printf
(
"bias[0] =%.3f
\n
"
,
reinterpret_cast
<
float
*>
(
bias_tensor
->
Data
())[
0
]);
std
::
cout
<<
"sub_graph->Run()"
;
sub_graph
->
Run
();
std
::
cout
<<
"compare result"
;
...
...
@@ -144,7 +137,7 @@ void TEST_MAIN(schema::Format input_format, schema::Format output_format, const
}
delete
conv_kernel
;
delete
sub_graph
;
mindspore
::
lite
::
opencl
::
OpenCLRuntime
::
DeleteInstance
();
lite
::
opencl
::
OpenCLRuntime
::
DeleteInstance
();
}
TEST_F
(
TestConvolutionOpenCL
,
in1x224x224x3_out1x112x112x32_k33_s22_p0101
)
{
...
...
@@ -161,18 +154,53 @@ TEST_F(TestConvolutionOpenCL, in1x224x224x3_out1x112x112x32_k33_s22_p0101) {
// "1x1");
//}
TEST_F
(
TestConvolutionOpenCL
,
winograd_inputNHWC_1x16x256x96_outputNHWC_1x16x256x80
)
{
// Runs TEST_MAIN for the case whose name encodes an NHWC input of 1x16x256x96 and an output of 1x16x256x80.
// The input tensor format is NHWC and the output tensor format is NHWC4.
TEST_F(TestConvolutionOpenCL, winograd_02_origin_inputNHWC_1x16x256x96_outputNHWC_1x16x256x80) {
  const auto input_layout = schema::Format_NHWC;
  const auto output_layout = schema::Format_NHWC4;
  // Data root and case attribute string are forwarded to TEST_MAIN unchanged.
  const char *const data_root = "testcases/test_fp32/";
  const char *const case_attr =
    "inputNHWC_1x16x256x96_outputNHWC_1x16x256x80_kernelHW_3x3_strideHW_1x1_"
    "padTopBottomLeftRight_1x1x1x1_dilationHW_1x1";
  TEST_MAIN(input_layout, output_layout, data_root, case_attr);
}
TEST_F
(
TestConvolutionOpenCL
,
winograd_inputNHWC_1x16x256x100_outputNHWC_1x16x256x96
)
{
// Runs TEST_MAIN for the case whose name encodes an NHWC input of 1x16x256x100 and an output of 1x16x256x96.
// The input tensor format is NHWC and the output tensor format is NHWC4.
TEST_F(TestConvolutionOpenCL, winograd_02_origin_inputNHWC_1x16x256x100_outputNHWC_1x16x256x96) {
  const auto input_layout = schema::Format_NHWC;
  const auto output_layout = schema::Format_NHWC4;
  // Data root and case attribute string are forwarded to TEST_MAIN unchanged.
  const char *const data_root = "testcases/test_fp32/";
  const char *const case_attr =
    "inputNHWC_1x16x256x100_outputNHWC_1x16x256x96_kernelHW_3x3_strideHW_1x1_"
    "padTopBottomLeftRight_1x1x1x1_dilationHW_1x1";
  TEST_MAIN(input_layout, output_layout, data_root, case_attr);
}
TEST_F
(
TestConvolutionOpenCL
,
winograd_inputNHWC_1x480x480x128_outputNHWC_1x480x480x128
)
{
// TEST_F(TestConvolutionOpenCL, winograd_02_other_inputNHWC_1x32x512x1_outputNHWC_1x32x512x50) {
// TEST_MAIN(schema::Format_NHWC, schema::Format_NHWC4, "testcases/test_fp32/",
// "inputNHWC_1x32x512x1_outputNHWC_1x32x512x50_kernelHW_3x3_strideHW_1x1_padTopBottomLeftRight_1x1x1x1_"
// "dilationHW_1x1");
//}
// Runs TEST_MAIN for the case whose name encodes an NHWC input of 1x32x512x50 and an output of 1x32x512x48.
// The input tensor format is NHWC and the output tensor format is NHWC4.
TEST_F(TestConvolutionOpenCL, winograd_02_other_inputNHWC_1x32x512x50_outputNHWC_1x32x512x48) {
  const auto input_layout = schema::Format_NHWC;
  const auto output_layout = schema::Format_NHWC4;
  // Data root and case attribute string are forwarded to TEST_MAIN unchanged.
  const char *const data_root = "testcases/02_fp32/";
  const char *const case_attr =
    "inputNHWC_1x32x512x50_outputNHWC_1x32x512x48_kernelHW_3x3_strideHW_1x1_"
    "padTopBottomLeftRight_1x1x1x1_dilationHW_1x1";
  TEST_MAIN(input_layout, output_layout, data_root, case_attr);
}
// Runs TEST_MAIN for the case whose name encodes an NHWC input of 1x8x128x100 and an output of 1x8x128x250.
// The input tensor format is NHWC and the output tensor format is NHWC4.
TEST_F(TestConvolutionOpenCL, winograd_02_other_inputNHWC_1x8x128x100_outputNHWC_1x8x128x250) {
  const auto input_layout = schema::Format_NHWC;
  const auto output_layout = schema::Format_NHWC4;
  // Data root and case attribute string are forwarded to TEST_MAIN unchanged.
  const char *const data_root = "testcases/02_fp32/";
  const char *const case_attr =
    "inputNHWC_1x8x128x100_outputNHWC_1x8x128x250_kernelHW_3x3_strideHW_1x1_"
    "padTopBottomLeftRight_1x1x1x1_dilationHW_1x1";
  TEST_MAIN(input_layout, output_layout, data_root, case_attr);
}
// Runs TEST_MAIN for the case whose name encodes an NHWC input of 1x8x128x100 and an output of 1x8x128x300.
// The input tensor format is NHWC and the output tensor format is NHWC4.
TEST_F(TestConvolutionOpenCL, winograd_02_other_inputNHWC_1x8x128x100_outputNHWC_1x8x128x300) {
  const auto input_layout = schema::Format_NHWC;
  const auto output_layout = schema::Format_NHWC4;
  // Data root and case attribute string are forwarded to TEST_MAIN unchanged.
  const char *const data_root = "testcases/02_fp32/";
  const char *const case_attr =
    "inputNHWC_1x8x128x100_outputNHWC_1x8x128x300_kernelHW_3x3_strideHW_1x1_"
    "padTopBottomLeftRight_1x1x1x1_dilationHW_1x1";
  TEST_MAIN(input_layout, output_layout, data_root, case_attr);
}
// Runs TEST_MAIN for the case whose name encodes an NHWC input of 1x4x64x150 and an output of 1x4x64x350.
// The input tensor format is NHWC and the output tensor format is NHWC4.
TEST_F(TestConvolutionOpenCL, winograd_02_other_inputNHWC_1x4x64x150_outputNHWC_1x4x64x350) {
  const auto input_layout = schema::Format_NHWC;
  const auto output_layout = schema::Format_NHWC4;
  // Data root and case attribute string are forwarded to TEST_MAIN unchanged.
  const char *const data_root = "testcases/02_fp32/";
  const char *const case_attr =
    "inputNHWC_1x4x64x150_outputNHWC_1x4x64x350_kernelHW_3x3_strideHW_1x1_"
    "padTopBottomLeftRight_1x1x1x1_dilationHW_1x1";
  TEST_MAIN(input_layout, output_layout, data_root, case_attr);
}
// Runs TEST_MAIN for the case whose name encodes an NHWC input of 1x4x64x150 and an output of 1x4x64x400.
// The input tensor format is NHWC and the output tensor format is NHWC4.
TEST_F(TestConvolutionOpenCL, winograd_02_other_inputNHWC_1x4x64x150_outputNHWC_1x4x64x400) {
  const auto input_layout = schema::Format_NHWC;
  const auto output_layout = schema::Format_NHWC4;
  // Data root and case attribute string are forwarded to TEST_MAIN unchanged.
  const char *const data_root = "testcases/02_fp32/";
  const char *const case_attr =
    "inputNHWC_1x4x64x150_outputNHWC_1x4x64x400_kernelHW_3x3_strideHW_1x1_"
    "padTopBottomLeftRight_1x1x1x1_dilationHW_1x1";
  TEST_MAIN(input_layout, output_layout, data_root, case_attr);
}
TEST_F
(
TestConvolutionOpenCL
,
winograd_08_origin_inputNHWC_1x480x480x128_outputNHWC_1x480x480x128
)
{
TEST_MAIN
(
schema
::
Format_NHWC
,
schema
::
Format_NHWC4
,
"testcases/test_fp32/"
,
"inputNHWC_1x480x480x128_outputNHWC_1x480x480x128_kernelHW_3x3_strideHW_1x1_padTopBottomLeftRight_"
"1x1x1x1_dilationHW_1x1"
);
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录