magicwindyyd / mindspore (fork of MindSpore / mindspore)
Commit a6dc9490
Authored Aug 08, 2020 by mindspore-ci-bot; committed via Gitee on Aug 08, 2020
!4138 change buffer to image2d for arithmetic
Merge pull request !4138 from liuchao/arith_image
Parents: 729d847d 71dbdddb
Showing 5 changed files with 152 additions and 86 deletions (+152 −86)
mindspore/lite/src/runtime/kernel/opencl/cl/fp32/arithmetic_buffer.cl    +3   −20
mindspore/lite/src/runtime/kernel/opencl/cl/fp32/arithmetic_image2d.cl   +59  −9
mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.cc            +75  −40
mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.h             +5   −2
mindspore/lite/test/ut/src/runtime/kernel/opencl/arithmetic_tests.cc     +10  −15
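For orientation: this change moves the element-wise arithmetic kernels from __global float* buffers to image2d_t objects. An NHWC4 tensor is packed into a 2D image whose texels are float4 (four channels per texel); as the GetImageSize added below computes, the image width is W * UP_DIV(C, 4) and the height is N * H. A minimal standalone sketch of that sizing rule (plain C++, not the MindSpore helper itself; the TensorDesc struct is hypothetical, for illustration only):

#include <cstddef>

// Hypothetical NHWC descriptor, only for this illustration.
struct TensorDesc {
  int n, h, w, c;
};

// Sizing rule assumed by the image2d kernels in this commit:
//   width  = W * ceil(C / 4)   (one float4 texel per group of 4 channels)
//   height = N * H
inline void Image2dSizeForNHWC4(const TensorDesc &t, size_t *width, size_t *height) {
  const int c4 = (t.c + 3) / 4;  // UP_DIV(C, C4NUM) with C4NUM == 4
  *width = static_cast<size_t>(t.w) * c4;
  *height = static_cast<size_t>(t.n) * t.h;
}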
mindspore/lite/src/runtime/kernel/opencl/cl/fp32/arithmetic_buffer.cl
@@ -26,26 +26,9 @@ __kernel void ElementDiv(__global float *input_a, __global float *input_b, __global float *output,
  output[idx] = input_a[idx] * input_b[idx];
}

-__kernel void BoardcastAdd(__global float *input_a, float input_b, __global float *output, const unsigned int n) {
+__kernel void BoardcastArith(__global float *input_a, float weight, float bias, __global float *output,
+                             const unsigned int n) {
  int idx = get_global_id(0);
  if (idx >= n) return;
-  output[idx] = input_a[idx] + input_b;
-}
-
-__kernel void BoardcastSub(__global float *input_a, float input_b, __global float *output, const unsigned int n) {
-  int idx = get_global_id(0);
-  if (idx >= n) return;
-  output[idx] = input_a[idx] - input_b;
-}
-
-__kernel void BoardcastMul(__global float *input_a, float input_b, __global float *output, const unsigned int n) {
-  int idx = get_global_id(0);
-  if (idx >= n) return;
-  output[idx] = input_a[idx] * input_b;
-}
-
-__kernel void BoardcastDiv(__global float *input_a, float input_b, __global float *output, const unsigned int n) {
-  int idx = get_global_id(0);
-  if (idx >= n) return;
-  output[idx] = input_a[idx] * input_b;
+  output[idx] = weight * input_a[idx] + bias;
}
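The four scalar-broadcast kernels above are folded into the single BoardcastArith kernel, which computes output = weight * input + bias, so each scalar op reduces to a choice of weight and bias on the host (see the switch added to ArithmeticOpenCLKernel::Run() further down). A hedged sketch of how that folding works algebraically, in plain C++ (illustrative only, not the MindSpore code):

#include <utility>

// Algebraic mapping of "x <op> v" (v a scalar) onto weight * x + bias.
// Sketch of the math only; the commit's Run() sets its own weight_/bias_ members.
inline std::pair<float, float> FoldScalarOp(char op, float v) {
  switch (op) {
    case '*': return {v, 0.0f};          // x * v  ==  v * x + 0
    case '+': return {1.0f, v};          // x + v  ==  1 * x + v
    case '-': return {1.0f, -v};         // x - v  ==  1 * x - v
    case '/': return {1.0f / v, 0.0f};   // x / v  ==  (1/v) * x + 0
    default:  return {1.0f, 0.0f};       // identity
  }
}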
mindspore/lite/src/runtime/kernel/opencl/cl/fp32/arithmetic_image2d.cl
__constant sampler_t smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;

-__kernel void ElementAdd(__read_only image2d_t *input_a, __read_only image2d_t *input_b, __write_only image2d_t *output,
-                         const int4 output_shape) {
+__kernel void ElementAdd(__read_only image2d_t input_a, __read_only image2d_t input_b, __write_only image2d_t output,
+                         const int2 output_shape) {
  int X = get_global_id(0);
  int Y = get_global_id(1);
-  int Z = get_global_id(2);
-  if (X >= output_shape.x || Y >= output_shape.y || Z >= output_shape.w) return;
+  if (X >= output_shape.x || Y >= output_shape.y) {
+    return;
+  }
-  if (idx >= n) return;
-  float4 a = read_imagef(input_a, smp_none, (int2)(X, Y * output_shape.w + Z));
-  float4 b = read_imagef(input_b, smp_none, (int2)(X, Y * output_shape.w + Z));
-  src = a + b;
-  write_imagef(output, (int2)(0, 0), src);
+  float4 a = read_imagef(input_a, smp_none, (int2)(X, Y));
+  float4 b = read_imagef(input_b, smp_none, (int2)(X, Y));
+  write_imagef(output, (int2)(X, Y), a + b);
}
__kernel void ElementSub(__read_only image2d_t input_a, __read_only image2d_t input_b, __write_only image2d_t output,
const int2 output_shape) {
int X = get_global_id(0);
int Y = get_global_id(1);
if (X >= output_shape.x || Y >= output_shape.y) {
return;
}
float4 a = read_imagef(input_a, smp_none, (int2)(X, Y));
float4 b = read_imagef(input_b, smp_none, (int2)(X, Y));
write_imagef(output, (int2)(X, Y), a - b);
}
__kernel void ElementMul(__read_only image2d_t input_a, __read_only image2d_t input_b, __write_only image2d_t output,
const int2 output_shape) {
int X = get_global_id(0);
int Y = get_global_id(1);
if (X >= output_shape.x || Y >= output_shape.y) {
return;
}
float4 a = read_imagef(input_a, smp_none, (int2)(X, Y));
float4 b = read_imagef(input_b, smp_none, (int2)(X, Y));
write_imagef(output, (int2)(X, Y), a * b);
}
__kernel void ElementDiv(__read_only image2d_t input_a, __read_only image2d_t input_b, __write_only image2d_t output,
const int2 output_shape) {
int X = get_global_id(0);
int Y = get_global_id(1);
if (X >= output_shape.x || Y >= output_shape.y) {
return;
}
float4 a = read_imagef(input_a, smp_none, (int2)(X, Y));
float4 b = read_imagef(input_b, smp_none, (int2)(X, Y));
write_imagef(output, (int2)(X, Y), a / b);
}
__kernel void BoardcastArith(__read_only image2d_t input_a, float weight, float bias, __write_only image2d_t output,
const int2 output_shape) {
int X = get_global_id(0);
int Y = get_global_id(1);
  if (X >= output_shape.x || Y >= output_shape.y) {
    return;
  }
  float4 a = read_imagef(input_a, smp_none, (int2)(X, Y));
  write_imagef(output, (int2)(X, Y), weight * a + bias);
}
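Because these kernels now read and write image2d_t objects, the host side has to allocate 2D images rather than flat buffers. A hedged sketch using the raw OpenCL C API (the MindSpore allocator wraps this; names are illustrative), with CL_RGBA/CL_FLOAT matching the float4 texels and the CL_FLOAT dtype reported by GetImageSize in arithmetic.cc:

#include <CL/cl.h>

// Allocate a width x height RGBA float image (one float4 texel per pixel).
// Assumes a valid cl_context `ctx`; error handling is omitted for brevity.
cl_mem MakeImage2D(cl_context ctx, size_t width, size_t height) {
  cl_image_format fmt = {CL_RGBA, CL_FLOAT};
  cl_image_desc desc = {};
  desc.image_type = CL_MEM_OBJECT_IMAGE2D;
  desc.image_width = width;
  desc.image_height = height;
  cl_int err = CL_SUCCESS;
  return clCreateImage(ctx, CL_MEM_READ_WRITE, &fmt, &desc, nullptr, &err);
}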
mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.cc
@@ -40,10 +40,10 @@ std::vector<size_t> ArithmeticOpenCLKernel::InitGlobalSize() const {
}

void ArithmeticOpenCLKernel::Image2dGetWorkGroupSize() {
-  global_size_ = InitGlobalSize();
-  int max_work_group_size = runtime_->GetKernelMaxWorkGroupSize(kernel_(), (*runtime_->Device())());
-  local_size_ = GetCommonLocalSize(global_size_, max_work_group_size);
-  global_size_ = GetCommonGlobalSize(local_size_, global_size_);
+  size_t H = outputs_[0]->Batch() * outputs_[0]->Height();
+  size_t W = outputs_[0]->Width() * UP_DIV(outputs_[0]->Channel(), C4NUM);
+  local_size_ = {16, 16};
+  global_size_ = {H, W};
}

void ArithmeticOpenCLKernel::BufferGetWorkGroupSize() {
@@ -51,63 +51,75 @@ void ArithmeticOpenCLKernel::BufferGetWorkGroupSize() {
  global_size_ = {element_num};
}

int ArithmeticOpenCLKernel::GetImageSize(size_t idx, std::vector<size_t> *img_size) {
  size_t CO4 = UP_DIV(outputs_[0]->Channel(), C4NUM);
  int H = outputs_[0]->Batch() * outputs_[0]->Height();
  int W = outputs_[0]->Width() * CO4;
  size_t im_dst_x, im_dst_y;
  if (inputs_[0]->GetFormat() == schema::Format_NHWC4) {
    im_dst_x = W;
    im_dst_y = H;
  } else {
    im_dst_y = outputs_[0]->Batch() * outputs_[0]->Height() * CO4;
    im_dst_x = outputs_[0]->Width();
  }
#ifdef ENABLE_FP16
  size_t img_dtype = CL_HALF_FLOAT;
#else
  size_t img_dtype = CL_FLOAT;
#endif
  img_size->clear();
  std::vector<size_t> vec{im_dst_x, im_dst_y, img_dtype};
  *img_size = vec;
  return 0;
}

int ArithmeticOpenCLKernel::Init() {
  runtime_ = lite::opencl::OpenCLRuntime::GetInstance();
-  std::string element_name;
-  std::string boardcast_name;
  std::string kernel_name;
+  if (inputs_[1]->TensorType() == schema::NodeType_ValueNode && inputs_[1]->Data() != nullptr) {
+    element_flag_ = false;
+    kernel_name = "BoardcastArith";
+  } else {
+    element_flag_ = true;
+    switch (opParameter->type_) {
+      case PrimitiveType_Mul:
+        kernel_name = "ElementMul";
+        break;
+      case PrimitiveType_Add:
+        kernel_name = "ElementAdd";
+        break;
+      case PrimitiveType_Sub:
+        kernel_name = "ElementSub";
+        break;
+      case PrimitiveType_Div:
+        kernel_name = "ElementDiv";
+        break;
+      default:
+        MS_LOG(ERROR) << "Error Operator type " << opParameter->type_;
+        break;
+    }
+  }
-  switch (opParameter->type_) {
-    case PrimitiveType_Mul:
-      element_name = "ElementMul";
-      boardcast_name = "BoardcastMul";
-      break;
-    case PrimitiveType_Add:
-      element_name = "ElementAdd";
-      boardcast_name = "BoardcastAdd";
-      break;
-    case PrimitiveType_Sub:
-      element_name = "ElementSub";
-      boardcast_name = "BoardcastSub";
-      break;
-    case PrimitiveType_Div:
-      element_name = "ElementDiv";
-      boardcast_name = "BoardcastDiv";
-      break;
-    default:
-      MS_LOG(ERROR) << "Error Operator type " << opParameter->type_;
-      break;
-  }
#ifdef PROGRAM_WITH_IL
  runtime_->CreateKernelFromIL(kernel_(), kernel_name);
#else
  std::string program_name = "Arithmetic";
  std::set<std::string> build_options;
-  std::string source = arithmetic_buffer_source_fp32;
+  std::string source = arithmetic_image2d_source_fp32;
  runtime_->LoadSource(program_name, source);
-  if (element_flag_) {
-    runtime_->BuildKernel(kernel_, program_name, element_name, build_options);
-    MS_LOG(DEBUG) << element_name << " Init Done!";
-  } else {
-    runtime_->BuildKernel(kernel_, program_name, boardcast_name, build_options);
-    MS_LOG(DEBUG) << boardcast_name << " Init Done!";
-  }
+  runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options);
#endif
  outputs_[0]->SetFormat(schema::Format_NHWC4);
  Image2dGetWorkGroupSize();
  return 0;
}

int ArithmeticOpenCLKernel::Run() {
  MS_LOG(DEBUG) << this->Name() << " Running!";
  auto runtime_ = lite::opencl::OpenCLRuntime::GetInstance();
  BufferGetWorkGroupSize();
  int arg_idx = 0;
  uint32_t element_num = outputs_[0]->ElementsC4Num();
@@ -116,11 +128,34 @@ int ArithmeticOpenCLKernel::Run() {
  if (element_flag_) {
    runtime_->SetKernelArg(kernel_, arg_idx++, inputs_[1]->Data());
  } else {
-    runtime_->SetKernelArg(kernel_, arg_idx++, static_cast<float *>(inputs_[1]->Data())[0]);
+    float value = static_cast<float *>(inputs_[1]->Data())[0];
+    switch (opParameter->type_) {
+      case PrimitiveType_Mul:
+        weight_ = value;
+        break;
+      case PrimitiveType_Add:
+        bias_ = value;
+        break;
+      case PrimitiveType_Sub:
+        bias_ = -1 * value;
+        break;
+      case PrimitiveType_Div:
+        bias_ = 1 / value;
+        break;
+      default:
+        MS_LOG(ERROR) << "Error Operator type " << opParameter->type_;
+        break;
+    }
+    runtime_->SetKernelArg(kernel_, arg_idx++, weight_);
+    runtime_->SetKernelArg(kernel_, arg_idx++, bias_);
+    MS_LOG(DEBUG) << arg_idx - 2 << " " << weight_;
+    MS_LOG(DEBUG) << arg_idx - 1 << " " << bias_;
  }
  runtime_->SetKernelArg(kernel_, arg_idx++, outputs_[0]->Data());
-  runtime_->SetKernelArg(kernel_, arg_idx++, element_num);
+  int H = outputs_[0]->Batch() * outputs_[0]->Height();
+  int W = outputs_[0]->Width() * UP_DIV(outputs_[0]->Channel(), C4NUM);
+  cl_int2 output_shape{H, W};
+  runtime_->SetKernelArg(kernel_, arg_idx++, output_shape);
  runtime_->RunKernel(kernel_, global_size_, local_size_, nullptr);
  return 0;
}
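For reference, the image2d ElementAdd kernel takes (input_a, input_b, output, output_shape), and Run() above packs output_shape as {H, W} with H = N*H and W = W*C4 while launching an {H, W} global range. A hedged sketch of an equivalent dispatch through the raw OpenCL C API (not the OpenCLRuntime wrapper; variable names are illustrative):

#include <CL/cl.h>

// Assumes a valid cl_kernel built from the image2d "ElementAdd", a
// cl_command_queue `queue`, cl_mem images `img_a`, `img_b`, `img_out`,
// and the shape values H = N*H, W = W*C4 as computed in this commit.
void DispatchElementAdd(cl_kernel kernel, cl_command_queue queue,
                        cl_mem img_a, cl_mem img_b, cl_mem img_out,
                        cl_int H, cl_int W) {
  cl_int2 output_shape = {{H, W}};  // mirrors cl_int2 output_shape{H, W} in Run()
  clSetKernelArg(kernel, 0, sizeof(cl_mem), &img_a);
  clSetKernelArg(kernel, 1, sizeof(cl_mem), &img_b);
  clSetKernelArg(kernel, 2, sizeof(cl_mem), &img_out);
  clSetKernelArg(kernel, 3, sizeof(cl_int2), &output_shape);
  size_t global[2] = {static_cast<size_t>(H), static_cast<size_t>(W)};
  clEnqueueNDRangeKernel(queue, kernel, 2, nullptr, global, nullptr, 0, nullptr, nullptr);
}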
mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.h
@@ -24,15 +24,16 @@
namespace mindspore::kernel {

-class ArithmeticOpenCLKernel : public ArithmeticCPUKernel {
+class ArithmeticOpenCLKernel : public OpenCLKernel {
 public:
  explicit ArithmeticOpenCLKernel(OpParameter *parameter, const std::vector<lite::tensor::Tensor *> &inputs,
                                  const std::vector<lite::tensor::Tensor *> &outputs, const lite::Context *ctx)
-      : ArithmeticCPUKernel(parameter, inputs, outputs, ctx) {}
+      : OpenCLKernel(parameter, inputs, outputs) {}
  ~ArithmeticOpenCLKernel() override {};

  int Init() override;
  int Run() override;
+  int GetImageSize(size_t idx, std::vector<size_t> *img_size) override;

 private:
  std::vector<size_t> InitGlobalSize() const;
@@ -42,6 +43,8 @@ class ArithmeticOpenCLKernel : public ArithmeticCPUKernel {
  cl::Kernel kernel_;
  lite::opencl::OpenCLRuntime *runtime_;
  bool element_flag_{true};
  float weight_{1.f};
  float bias_{.0f};
  std::vector<size_t> local_size_;
  std::vector<size_t> global_size_;
mindspore/lite/test/ut/src/runtime/kernel/opencl/arithmetic_tests.cc
@@ -61,13 +61,12 @@ void LogData(void *data, const int size, const std::string prefix) {
}

void TestCase(const std::vector<int> &shape_a, const std::vector<int> &shape_b) {
  std::cout << "TestCase" << std::endl;
  auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
  auto allocator = ocl_runtime->GetAllocator();
  bool is_bias_add = shape_b.empty();
  auto tensorType = schema::NodeType_ValueNode;
  std::cout << "TestCase tensor" << std::endl;
  lite::tensor::Tensor *tensor_a =
    new lite::tensor::Tensor(kNumberTypeFloat32, shape_a, schema::Format_NHWC4, tensorType);
  lite::tensor::Tensor *tensor_b =
@@ -77,7 +76,6 @@ void TestCase(const std::vector<int> &shape_a, const std::vector<int> &shape_b)
  int64_t element_num = tensor_a->ElementsC4Num();
  int64_t element_num_b = is_bias_add ? 1 : tensor_b->ElementsC4Num();
  std::cout << "TestCase new data" << std::endl;
  float *data_a = new float[element_num];
  float *data_b = new float[element_num_b];
  float *data_c_cpu = new float[element_num];
@@ -87,14 +85,12 @@ void TestCase(const std::vector<int> &shape_a, const std::vector<int> &shape_b)
  InitData(data_b, element_num_b);
  memset(data_c_ocl, 0, sizeof(float) * element_num);
  std::cout << "TestCase run cpu" << std::endl;
  if (is_bias_add) {
    BoardcaseAdd(data_a, static_cast<float *>(data_b)[0], data_c_cpu, element_num);
  } else {
    ElementAdd(data_a, data_b, data_c_cpu, element_num);
  }
  std::cout << "TestCase set data" << std::endl;
  std::vector<lite::tensor::Tensor *> inputs = {tensor_a};
  if (!is_bias_add) {
    inputs.push_back(tensor_b);
@@ -114,9 +110,10 @@ void TestCase(const std::vector<int> &shape_a, const std::vector<int> &shape_b)
    new kernel::ArithmeticOpenCLKernel(reinterpret_cast<OpParameter *>(param), arithmetic_inputs, outputs, &ctx);
  arith_kernel->Init();
  tensor_a->MallocData(allocator);
  tensor_b->MallocData(allocator);
  std::vector<kernel::LiteKernel *> kernels{arith_kernel};
  auto *kernel = new kernel::SubGraphOpenCLKernel(inputs, outputs, kernels, kernels, kernels);
  std::cout << "TestCase Init" << std::endl;
  kernel->Init();
  memcpy(inputs[0]->Data(), data_a, sizeof(float) * element_num);
@@ -124,7 +121,6 @@ void TestCase(const std::vector<int> &shape_a, const std::vector<int> &shape_b)
    memcpy(inputs[1]->Data(), data_b, sizeof(float) * element_num_b);
  }
  std::cout << "TestCase Run" << std::endl;
  kernel->Run();
  memcpy(data_c_ocl, outputs[0]->Data(), sizeof(float) * element_num);
@@ -136,7 +132,6 @@ void TestCase(const std::vector<int> &shape_a, const std::vector<int> &shape_b)
  LogData(outputs[0]->Data(), 10, "OpenCL compute : ");
  bool cmp = DataCompare(data_c_cpu, data_c_ocl, element_num);
  MS_LOG(INFO) << "Compare " << (cmp ? "success!" : "failed!");
  std::cout << "TestCase End" << std::endl;

  // free
  delete[] data_a;
@@ -162,15 +157,15 @@ class TestArithmeticOpenCL : public mindspore::Common {
};

TEST_F(TestArithmeticOpenCL, AddElementwiseTest) {
-  const std::vector<int> &shape_a = {1, 32, 32, 4};
-  const std::vector<int> &shape_b = {1, 32, 32, 4};
+  const std::vector<int> &shape_a = {1, 1024, 1024, 4};
+  const std::vector<int> &shape_b = {1, 1024, 1024, 4};
  TestCase(shape_a, shape_b);
}

-// TEST_F(TestOpenCLKernel, AddBoardcaseTest) {
-//   const std::vector<int> &shape_a = {1, 4, 128, 128};
-//   const std::vector<int> &shape_b = {};
-//   TestCase(shape_a, shape_b);
-// }
+TEST_F(TestArithmeticOpenCL, AddBoardcaseTest) {
+  const std::vector<int> &shape_a = {1, 128, 128, 4};
+  const std::vector<int> &shape_b = {};
+  TestCase(shape_a, shape_b);
+}
}  // namespace mindspore
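The CPU reference path in TestCase calls ElementAdd and BoardcaseAdd helpers whose definitions sit in the elided part of this file; a hedged sketch of their presumed form (an assumption, not the actual test code):

#include <cstdint>

// Presumed CPU references used by TestCase above (illustrative only).
void ElementAdd(const float *a, const float *b, float *out, int64_t n) {
  for (int64_t i = 0; i < n; ++i) out[i] = a[i] + b[i];
}

void BoardcaseAdd(const float *a, float b, float *out, int64_t n) {
  for (int64_t i = 0; i < n; ++i) out[i] = a[i] + b;
}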