Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
magicwindyyd
mindspore
提交
7a3b6667
M
mindspore
项目概览
magicwindyyd
/
mindspore
与 Fork 源项目一致
Fork自
MindSpore / mindspore
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
M
mindspore
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
7a3b6667
编写于
8月 20, 2020
作者:
M
mindspore-ci-bot
提交者:
Gitee
8月 20, 2020
浏览文件
操作
浏览文件
下载
差异文件
!4815 [MS][LITE][GPU]change opencl code path
Merge pull request !4815 from chenzupeng/master-lite
上级
1793a07e
605c2b0b
变更
40
隐藏空白更改
内联
并排
Showing
40 changed file
with
216 addition
and
511 deletion
+216
-511
build.sh
build.sh
+13
-17
mindspore/lite/src/runtime/kernel/opencl/cl/activation.cl
mindspore/lite/src/runtime/kernel/opencl/cl/activation.cl
+8
-11
mindspore/lite/src/runtime/kernel/opencl/cl/arithmetic.cl
mindspore/lite/src/runtime/kernel/opencl/cl/arithmetic.cl
+101
-0
mindspore/lite/src/runtime/kernel/opencl/cl/avg_pool2d.cl
mindspore/lite/src/runtime/kernel/opencl/cl/avg_pool2d.cl
+0
-0
mindspore/lite/src/runtime/kernel/opencl/cl/batchnorm.cl
mindspore/lite/src/runtime/kernel/opencl/cl/batchnorm.cl
+0
-1
mindspore/lite/src/runtime/kernel/opencl/cl/caffe_prelu.cl
mindspore/lite/src/runtime/kernel/opencl/cl/caffe_prelu.cl
+2
-5
mindspore/lite/src/runtime/kernel/opencl/cl/concat.cl
mindspore/lite/src/runtime/kernel/opencl/cl/concat.cl
+0
-1
mindspore/lite/src/runtime/kernel/opencl/cl/conv2d_transpose2x2.cl
.../lite/src/runtime/kernel/opencl/cl/conv2d_transpose2x2.cl
+0
-5
mindspore/lite/src/runtime/kernel/opencl/cl/convolution.cl
mindspore/lite/src/runtime/kernel/opencl/cl/convolution.cl
+0
-0
mindspore/lite/src/runtime/kernel/opencl/cl/depthwise_conv2d.cl
...ore/lite/src/runtime/kernel/opencl/cl/depthwise_conv2d.cl
+0
-9
mindspore/lite/src/runtime/kernel/opencl/cl/fp16/conv2d_transpose2x2.cl
.../src/runtime/kernel/opencl/cl/fp16/conv2d_transpose2x2.cl
+0
-61
mindspore/lite/src/runtime/kernel/opencl/cl/fp16/depthwise_conv2d.cl
...ite/src/runtime/kernel/opencl/cl/fp16/depthwise_conv2d.cl
+0
-76
mindspore/lite/src/runtime/kernel/opencl/cl/fp16/transpose.cl
...spore/lite/src/runtime/kernel/opencl/cl/fp16/transpose.cl
+0
-45
mindspore/lite/src/runtime/kernel/opencl/cl/fp32/arithmetic_buffer.cl
...te/src/runtime/kernel/opencl/cl/fp32/arithmetic_buffer.cl
+0
-34
mindspore/lite/src/runtime/kernel/opencl/cl/fp32/arithmetic_image2d.cl
...e/src/runtime/kernel/opencl/cl/fp32/arithmetic_image2d.cl
+0
-66
mindspore/lite/src/runtime/kernel/opencl/cl/fp32/matmul.cl
mindspore/lite/src/runtime/kernel/opencl/cl/fp32/matmul.cl
+0
-32
mindspore/lite/src/runtime/kernel/opencl/cl/fp32/reshape.cl
mindspore/lite/src/runtime/kernel/opencl/cl/fp32/reshape.cl
+0
-14
mindspore/lite/src/runtime/kernel/opencl/cl/matmul.cl
mindspore/lite/src/runtime/kernel/opencl/cl/matmul.cl
+0
-4
mindspore/lite/src/runtime/kernel/opencl/cl/max_pool2d.cl
mindspore/lite/src/runtime/kernel/opencl/cl/max_pool2d.cl
+0
-0
mindspore/lite/src/runtime/kernel/opencl/cl/reshape.cl
mindspore/lite/src/runtime/kernel/opencl/cl/reshape.cl
+0
-4
mindspore/lite/src/runtime/kernel/opencl/cl/softmax.cl
mindspore/lite/src/runtime/kernel/opencl/cl/softmax.cl
+0
-0
mindspore/lite/src/runtime/kernel/opencl/cl/softmax1x1.cl
mindspore/lite/src/runtime/kernel/opencl/cl/softmax1x1.cl
+5
-5
mindspore/lite/src/runtime/kernel/opencl/cl/to_format.cl
mindspore/lite/src/runtime/kernel/opencl/cl/to_format.cl
+0
-19
mindspore/lite/src/runtime/kernel/opencl/cl/transpose.cl
mindspore/lite/src/runtime/kernel/opencl/cl/transpose.cl
+0
-4
mindspore/lite/src/runtime/kernel/opencl/kernel/activation.cc
...spore/lite/src/runtime/kernel/opencl/kernel/activation.cc
+2
-2
mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.cc
...spore/lite/src/runtime/kernel/opencl/kernel/arithmetic.cc
+7
-3
mindspore/lite/src/runtime/kernel/opencl/kernel/batchnorm.cc
mindspore/lite/src/runtime/kernel/opencl/kernel/batchnorm.cc
+2
-2
mindspore/lite/src/runtime/kernel/opencl/kernel/caffe_prelu.cc
...pore/lite/src/runtime/kernel/opencl/kernel/caffe_prelu.cc
+2
-2
mindspore/lite/src/runtime/kernel/opencl/kernel/concat.cc
mindspore/lite/src/runtime/kernel/opencl/kernel/concat.cc
+3
-3
mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d_transpose.cc
...lite/src/runtime/kernel/opencl/kernel/conv2d_transpose.cc
+2
-7
mindspore/lite/src/runtime/kernel/opencl/kernel/depthwise_conv2d.cc
...lite/src/runtime/kernel/opencl/kernel/depthwise_conv2d.cc
+2
-7
mindspore/lite/src/runtime/kernel/opencl/kernel/matmul.cc
mindspore/lite/src/runtime/kernel/opencl/kernel/matmul.cc
+2
-7
mindspore/lite/src/runtime/kernel/opencl/kernel/pooling2d.cc
mindspore/lite/src/runtime/kernel/opencl/kernel/pooling2d.cc
+4
-4
mindspore/lite/src/runtime/kernel/opencl/kernel/prelu.cc
mindspore/lite/src/runtime/kernel/opencl/kernel/prelu.cc
+2
-2
mindspore/lite/src/runtime/kernel/opencl/kernel/reshape.cc
mindspore/lite/src/runtime/kernel/opencl/kernel/reshape.cc
+2
-7
mindspore/lite/src/runtime/kernel/opencl/kernel/softmax.cc
mindspore/lite/src/runtime/kernel/opencl/kernel/softmax.cc
+4
-4
mindspore/lite/src/runtime/kernel/opencl/kernel/to_format.cc
mindspore/lite/src/runtime/kernel/opencl/kernel/to_format.cc
+45
-37
mindspore/lite/src/runtime/kernel/opencl/kernel/to_format.h
mindspore/lite/src/runtime/kernel/opencl/kernel/to_format.h
+2
-0
mindspore/lite/src/runtime/kernel/opencl/kernel/transpose.cc
mindspore/lite/src/runtime/kernel/opencl/kernel/transpose.cc
+2
-7
mindspore/lite/src/runtime/opencl/opencl_runtime.cc
mindspore/lite/src/runtime/opencl/opencl_runtime.cc
+4
-4
未找到文件。
build.sh
浏览文件 @
7a3b6667
...
...
@@ -460,24 +460,20 @@ build_gtest() {
gene_clhpp
()
{
CL_SRC_DIR
=
"
${
BASEPATH
}
/mindspore/lite/src/runtime/kernel/opencl/cl"
for
sub_dir
in
"
${
CL_SRC_DIR
}
"
/
*
if
[
!
-d
${
CL_SRC_DIR
}
]
;
then
return
fi
cd
${
CL_SRC_DIR
}
/
rm
-rf
*
.inc
echo
"
$(
cd
"
$(
dirname
$0
)
"
;
pwd
)
"
for
file_path
in
"
${
CL_SRC_DIR
}
"
/
*
do
data_type
=
"
$(
basename
${
sub_dir
}
)
"
if
[
!
-d
${
CL_SRC_DIR
}
/
${
data_type
}
]
;
then
continue
fi
cd
${
CL_SRC_DIR
}
/
${
data_type
}
rm
-rf
*
.inc
echo
"
$(
cd
"
$(
dirname
$0
)
"
;
pwd
)
"
for
file_path
in
"
${
CL_SRC_DIR
}
/
${
data_type
}
"
/
*
do
file
=
"
$(
basename
${
file_path
}
)
"
inc_file
=
`
echo
${
CL_SRC_DIR
}
/
${
data_type
}
/
${
file
}
|
sed
's/$/.inc/'
`
sed
's/^/\"/;s/$/ \\n\" \\/'
${
CL_SRC_DIR
}
/
${
data_type
}
/
${
file
}
>
${
inc_file
}
kernel_name
=
`
echo
${
file
}
|
sed
s
'/.\{3\}$//'
`
sed
-i
"1i
\s
tatic const char *
${
kernel_name
}
_source_
${
data_type
}
=
\"\\
n
\"
\\
"
${
inc_file
}
sed
-i
'$a\;'
${
inc_file
}
done
file
=
"
$(
basename
${
file_path
}
)
"
inc_file
=
`
echo
${
CL_SRC_DIR
}
/
${
file
}
|
sed
's/$/.inc/'
`
sed
's/^/\"/;s/$/ \\n\" \\/'
${
CL_SRC_DIR
}
/
${
file
}
>
${
inc_file
}
kernel_name
=
`
echo
${
file
}
|
sed
s
'/.\{3\}$//'
`
sed
-i
"1i
\s
tatic const char *
${
kernel_name
}
_source =
\"\\
n
\"
\\
"
${
inc_file
}
sed
-i
'$a\;'
${
inc_file
}
done
}
...
...
mindspore/lite/src/runtime/kernel/opencl/cl/
fp32/
activation.cl
→
mindspore/lite/src/runtime/kernel/opencl/cl/activation.cl
浏览文件 @
7a3b6667
...
...
@@ -2,10 +2,7 @@
#
define
SLICES
4
#
define
UP_DIV
(
x,
y
)
(((
x
)
+
(
y
)
-
(
1
))
/
(
y
))
#
define
FLT4
float4
#
define
MIN
(
X,
Y
)
(
X
<
Y
?
X
:
Y
)
#
define
READ_FLT4
read_imagef
#
define
WRITE_FLT4
write_imagef
__constant
sampler_t
smp_zero
=
CLK_NORMALIZED_COORDS_FALSE
| CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST
;
__kernel
void
ReluScalar
(
__read_only
image2d_t
input,
__write_only
image2d_t
output,
const
int4
input_shape,
...
...
@@ -14,13 +11,13 @@ __kernel void ReluScalar(__read_only image2d_t input, __write_only image2d_t out
int
Y
=
get_global_id
(
0
)
; // height id
int
X
=
get_global_id
(
1
)
; // weight id
for
(
int
num
=
0
; num < UP_DIV(C, SLICES); ++num) {
FLT4
in_c4
=
READ_
FLT4
(
input,
smp_zero,
(
int2
)(
X
*
UP_DIV
(
C,
SLICES
)
+
num,
Y
))
; // NHWC4: H WC
FLT4
in_c4
=
READ_
IMAGE
(
input,
smp_zero,
(
int2
)(
X
*
UP_DIV
(
C,
SLICES
)
+
num,
Y
))
; // NHWC4: H WC
FLT4
tmp
;
tmp.x
=
in_c4.x
>=
0
?
in_c4.x
:
in_c4.x
*
alpha
;
tmp.y
=
in_c4.y
>=
0
?
in_c4.y
:
in_c4.y
*
alpha
;
tmp.z
=
in_c4.z
>=
0
?
in_c4.z
:
in_c4.z
*
alpha
;
tmp.w
=
in_c4.w
>=
0
?
in_c4.w
:
in_c4.w
*
alpha
;
WRITE_
FLT4
(
output,
(
int2
)(
X
*
UP_DIV
(
C,
SLICES
)
+
num,
Y
)
,
tmp
)
; // NHWC4: H WC
WRITE_
IMAGE
(
output,
(
int2
)(
X
*
UP_DIV
(
C,
SLICES
)
+
num,
Y
)
,
tmp
)
; // NHWC4: H WC
}
}
...
...
@@ -29,13 +26,13 @@ __kernel void Relu(__read_only image2d_t input, __write_only image2d_t output, c
int
Y
=
get_global_id
(
0
)
; // height id
int
X
=
get_global_id
(
1
)
; // weight id
for
(
int
num
=
0
; num < UP_DIV(C, SLICES); ++num) {
FLT4
in_c4
=
READ_
FLT4
(
input,
smp_zero,
(
int2
)(
X
*
UP_DIV
(
C,
SLICES
)
+
num,
Y
))
; // NHWC4: H WC
FLT4
in_c4
=
READ_
IMAGE
(
input,
smp_zero,
(
int2
)(
X
*
UP_DIV
(
C,
SLICES
)
+
num,
Y
))
; // NHWC4: H WC
FLT4
tmp
;
tmp.x
=
in_c4.x
>=
0
?
in_c4.x
:
0
;
tmp.y
=
in_c4.y
>=
0
?
in_c4.y
:
0
;
tmp.z
=
in_c4.z
>=
0
?
in_c4.z
:
0
;
tmp.w
=
in_c4.w
>=
0
?
in_c4.w
:
0
;
WRITE_
FLT4
(
output,
(
int2
)(
X
*
UP_DIV
(
C,
SLICES
)
+
num,
Y
)
,
tmp
)
; // NHWC4: H WC
WRITE_
IMAGE
(
output,
(
int2
)(
X
*
UP_DIV
(
C,
SLICES
)
+
num,
Y
)
,
tmp
)
; // NHWC4: H WC
}
}
...
...
@@ -44,13 +41,13 @@ __kernel void Relu6(__read_only image2d_t input, __write_only image2d_t output,
int
Y
=
get_global_id
(
0
)
; // height id
int
X
=
get_global_id
(
1
)
; // weight id
for
(
int
num
=
0
; num < UP_DIV(C, SLICES); ++num) {
FLT4
in_c4
=
READ_
FLT4
(
input,
smp_zero,
(
int2
)(
X
*
UP_DIV
(
C,
SLICES
)
+
num,
Y
))
; // NHWC4: H WC
FLT4
in_c4
=
READ_
IMAGE
(
input,
smp_zero,
(
int2
)(
X
*
UP_DIV
(
C,
SLICES
)
+
num,
Y
))
; // NHWC4: H WC
FLT4
tmp
;
tmp.x
=
in_c4.x
>=
0
?
MIN
(
in_c4.x,
6
)
:
0
;
tmp.y
=
in_c4.y
>=
0
?
MIN
(
in_c4.y,
6
)
:
0
;
tmp.z
=
in_c4.z
>=
0
?
MIN
(
in_c4.z,
6
)
:
0
;
tmp.w
=
in_c4.w
>=
0
?
MIN
(
in_c4.w,
6
)
:
0
;
WRITE_
FLT4
(
output,
(
int2
)(
X
*
UP_DIV
(
C,
SLICES
)
+
num,
Y
)
,
tmp
)
; // NHWC4: H WC
WRITE_
IMAGE
(
output,
(
int2
)(
X
*
UP_DIV
(
C,
SLICES
)
+
num,
Y
)
,
tmp
)
; // NHWC4: H WC
}
}
...
...
@@ -59,12 +56,12 @@ __kernel void Sigmoid(__read_only image2d_t input, __write_only image2d_t output
int
Y
=
get_global_id
(
0
)
; // height id
int
X
=
get_global_id
(
1
)
; // weight id
for
(
int
num
=
0
; num < UP_DIV(C, SLICES); ++num) {
FLT4
in_c4
=
READ_
FLT4
(
input,
smp_zero,
(
int2
)(
X
*
UP_DIV
(
C,
SLICES
)
+
num,
Y
))
; // NHWC4: H WC
FLT4
in_c4
=
READ_
IMAGE
(
input,
smp_zero,
(
int2
)(
X
*
UP_DIV
(
C,
SLICES
)
+
num,
Y
))
; // NHWC4: H WC
FLT4
tmp
;
tmp.x
=
1
/
(
1
+
exp
(
-in_c4.x
))
;
tmp.y
=
1
/
(
1
+
exp
(
-in_c4.y
))
;
tmp.z
=
1
/
(
1
+
exp
(
-in_c4.z
))
;
tmp.w
=
1
/
(
1
+
exp
(
-in_c4.w
))
;
WRITE_
FLT4
(
output,
(
int2
)(
X
*
UP_DIV
(
C,
SLICES
)
+
num,
Y
)
,
tmp
)
; // NHWC4: H WC
WRITE_
IMAGE
(
output,
(
int2
)(
X
*
UP_DIV
(
C,
SLICES
)
+
num,
Y
)
,
tmp
)
; // NHWC4: H WC
}
}
mindspore/lite/src/runtime/kernel/opencl/cl/arithmetic.cl
0 → 100644
浏览文件 @
7a3b6667
#
define
divide_no_check
(
a,
b
)
(
a
/
b
)
__constant
sampler_t
smp_none
=
CLK_NORMALIZED_COORDS_FALSE
| CLK_ADDRESS_NONE |
CLK_FILTER_NEAREST
;
__kernel
void
ElementAdd_IMG
(
__read_only
image2d_t
input_a,
__read_only
image2d_t
input_b,
__write_only
image2d_t
output,
const
int2
output_shape
)
{
int
X
=
get_global_id
(
0
)
;
int
Y
=
get_global_id
(
1
)
;
if
(
X
>=
output_shape.x
|
| Y >= output_shape.y) {
return;
}
FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y));
WRITE_IMAGE(output, (int2)(X, Y), a + b);
}
__kernel void ElementSub_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b,
__write_only image2d_t output, const int2 output_shape) {
int X = get_global_id(0);
int Y = get_global_id(1);
if (X >= output_shape.x || Y >= output_shape.y) {
return;
}
FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y));
WRITE_IMAGE(output, (int2)(X, Y), a - b);
}
__kernel void ElementMul_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b,
__write_only image2d_t output, const int2 output_shape) {
int X = get_global_id(0);
int Y = get_global_id(1);
if (X >= output_shape.x || Y >= output_shape.y) {
return;
}
FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y));
WRITE_IMAGE(output, (int2)(X, Y), a * b);
}
__kernel void ElementDiv_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b,
__write_only image2d_t output, const int2 output_shape) {
int X = get_global_id(0);
int Y = get_global_id(1);
if (X >= output_shape.x || Y >= output_shape.y) {
return;
}
FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y));
WRITE_IMAGE(output, (int2)(X, Y), divide_no_check(a, b));
}
__kernel void BoardcastArith_IMG(__read_only image2d_t input_a, float weight, float bias, __write_only image2d_t output,
const int2 output_shape) {
int X = get_global_id(0);
int Y = get_global_id(1);
if (X >= output_shape.x |
|
Y
>=
output_shape.y
)
{
return
;
}
FLT4
a
=
READ_IMAGE
(
input_a,
smp_none,
(
int2
)(
X,
Y
))
;
WRITE_IMAGE
(
output,
(
int2
)(
X,
Y
)
,
weight
*
a
+
bias
)
;
}
__kernel
void
ElementAdd_BUF
(
__global
float
*input_a,
__global
float
*input_b,
__global
float
*output,
const
unsigned
int
n
)
{
int
idx
=
get_global_id
(
0
)
;
if
(
idx
>=
n
)
return
;
output[idx]
=
input_a[idx]
+
input_b[idx]
;
}
__kernel
void
ElementSub_BUF
(
__global
float
*input_a,
__global
float
*input_b,
__global
float
*output,
const
unsigned
int
n
)
{
int
idx
=
get_global_id
(
0
)
;
if
(
idx
>=
n
)
return
;
output[idx]
=
input_a[idx]
-
input_b[idx]
;
}
__kernel
void
ElementMul_BUF
(
__global
float
*input_a,
__global
float
*input_b,
__global
float
*output,
const
unsigned
int
n
)
{
int
idx
=
get_global_id
(
0
)
;
if
(
idx
>=
n
)
return
;
output[idx]
=
input_a[idx]
*
input_b[idx]
;
}
__kernel
void
ElementDiv_BUF
(
__global
float
*input_a,
__global
float
*input_b,
__global
float
*output,
const
unsigned
int
n
)
{
int
idx
=
get_global_id
(
0
)
;
if
(
idx
>=
n
)
return
;
output[idx]
=
input_a[idx]
*
input_b[idx]
;
}
__kernel
void
BoardcastArith_BUF
(
__global
float
*input_a,
float
weight,
float
bias,
__global
float
*output,
const
unsigned
int
n
)
{
int
idx
=
get_global_id
(
0
)
;
if
(
idx
>=
n
)
return
;
output[idx]
=
weight
*
input_a[idx]
+
bias
;
}
mindspore/lite/src/runtime/kernel/opencl/cl/
fp32/
avg_pool2d.cl
→
mindspore/lite/src/runtime/kernel/opencl/cl/avg_pool2d.cl
浏览文件 @
7a3b6667
文件已移动
mindspore/lite/src/runtime/kernel/opencl/cl/
fp32/
batchnorm.cl
→
mindspore/lite/src/runtime/kernel/opencl/cl/batchnorm.cl
浏览文件 @
7a3b6667
#
define
FLT4
float4
#
define
INT4
int4
#
define
INT2
int2
__constant
sampler_t
smp_none
=
CLK_NORMALIZED_COORDS_FALSE
| CLK_ADDRESS_NONE |
CLK_FILTER_NEAREST
;
...
...
mindspore/lite/src/runtime/kernel/opencl/cl/
fp32/
caffe_prelu.cl
→
mindspore/lite/src/runtime/kernel/opencl/cl/caffe_prelu.cl
浏览文件 @
7a3b6667
...
...
@@ -2,9 +2,6 @@
#
define
SLICES
4
#
define
UP_DIV
(
x,
y
)
(((
x
)
+
(
y
)
-
(
1
))
/
(
y
))
#
define
FLT4
float4
#
define
READ_FLT4
read_imagef
#
define
WRITE_FLT4
write_imagef
__constant
sampler_t
smp_zero
=
CLK_NORMALIZED_COORDS_FALSE
| CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST
;
__kernel
void
CaffePRelu
(
__read_only
image2d_t
input,
__write_only
image2d_t
output,
const
int4
input_shape,
...
...
@@ -14,13 +11,13 @@ __kernel void CaffePRelu(__read_only image2d_t input, __write_only image2d_t out
int
Y
=
get_global_id
(
0
)
; // height id
int
X
=
get_global_id
(
1
)
; // weight id
for
(
int
num
=
0
; num < UP_DIV(C, SLICES); ++num) {
FLT4
in_c4
=
READ_
FLT4
(
input,
smp_zero,
(
int2
)(
X
*
UP_DIV
(
C,
SLICES
)
+
num,
Y
))
; // NHWC4: H WC
FLT4
in_c4
=
READ_
IMAGE
(
input,
smp_zero,
(
int2
)(
X
*
UP_DIV
(
C,
SLICES
)
+
num,
Y
))
; // NHWC4: H WC
FLT4
tmp
;
int
index
=
num
*
4
;
tmp.x
=
in_c4.x
*
alpha[index]
;
tmp.y
=
in_c4.y
*
alpha[index
+
1]
;
tmp.z
=
in_c4.z
*
alpha[index
+
2]
;
tmp.w
=
in_c4.w
*
alpha[index
+
3]
;
WRITE_
FLT4
(
output,
(
int2
)(
X
*
UP_DIV
(
C,
SLICES
)
+
num,
Y
)
,
tmp
)
; // NHWC4: H WC
WRITE_
IMAGE
(
output,
(
int2
)(
X
*
UP_DIV
(
C,
SLICES
)
+
num,
Y
)
,
tmp
)
; // NHWC4: H WC
}
}
mindspore/lite/src/runtime/kernel/opencl/cl/
fp32/
concat.cl
→
mindspore/lite/src/runtime/kernel/opencl/cl/concat.cl
浏览文件 @
7a3b6667
//
#
pragma
OPENCL
EXTENSION
cl_khr_fp16
:
enable
#
define
FLT4
float4
__constant
sampler_t
smp_none
=
CLK_NORMALIZED_COORDS_FALSE
| CLK_ADDRESS_NONE |
CLK_FILTER_NEAREST
;
__kernel
void
Concat
(
__read_only
image2d_t
input0,
__read_only
image2d_t
input1,
__write_only
image2d_t
output,
...
...
mindspore/lite/src/runtime/kernel/opencl/cl/
fp32/
conv2d_transpose2x2.cl
→
mindspore/lite/src/runtime/kernel/opencl/cl/conv2d_transpose2x2.cl
浏览文件 @
7a3b6667
#
define
FLT
float
#
define
FLT4
float4
#
define
FLT16
float16
#
define
READ_IMAGE
read_imagef
#
define
WRITE_IMAGE
write_imagef
__constant
sampler_t
smp_zero
=
CLK_NORMALIZED_COORDS_FALSE
| CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST
;
__kernel
void
conv2d_transpose2x2
(
__read_only
image2d_t
src_data,
__global
FLT16
*weight,
__read_only
image2d_t
biases,
__write_only
image2d_t
dst_data,
int2
kernel_size,
int2
stride,
int2
padding,
...
...
mindspore/lite/src/runtime/kernel/opencl/cl/
fp32/
convolution.cl
→
mindspore/lite/src/runtime/kernel/opencl/cl/convolution.cl
浏览文件 @
7a3b6667
文件已移动
mindspore/lite/src/runtime/kernel/opencl/cl/
fp32/
depthwise_conv2d.cl
→
mindspore/lite/src/runtime/kernel/opencl/cl/depthwise_conv2d.cl
浏览文件 @
7a3b6667
#
ifdef
ENABLE_FP16
#
define
FLT
half
#
define
FLT4
half4
#
define
TO_FLT4
convert_half4
#
else
#
define
FLT
float
#
define
FLT4
float4
#
define
TO_FLT4
convert_float4
#
endif
__constant
sampler_t
sampler_zero
=
CLK_NORMALIZED_COORDS_FALSE
| CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST
;
__kernel
void
DepthwiseConv2d_IMG_NC4HW4
(
__read_only
image2d_t
src_data,
__global
FLT4
*filter,
__global
FLT4
*bias,
float
relu_clip1,
__write_only
image2d_t
dst_data,
int2
kernel_size,
...
...
mindspore/lite/src/runtime/kernel/opencl/cl/fp16/conv2d_transpose2x2.cl
已删除
100644 → 0
浏览文件 @
1793a07e
#
define
FLT
half
#
define
FLT4
half4
#
define
FLT16
half16
#
define
READ_IMAGE
read_imageh
#
define
WRITE_IMAGE
write_imageh
__constant
sampler_t
smp_zero
=
CLK_NORMALIZED_COORDS_FALSE
| CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST
;
__kernel
void
conv2d_transpose2x2
(
__read_only
image2d_t
src_data,
__global
FLT16
*weight,
__read_only
image2d_t
biases,
__write_only
image2d_t
dst_data,
int2
kernel_size,
int2
stride,
int2
padding,
int4
src_size,
int4
dst_size
)
{
int
h
=
get_global_id
(
0
)
;
int
kh
=
h
%
2
;
int
src_h
=
h
/
2
;
src_h
=
src_h
*
2
;
int
w
=
get_global_id
(
1
)
;
int
kw
=
w
%
2
;
int
src_w
=
w
/
2
;
src_w
=
src_w
*
2
;
int
co
=
get_global_id
(
2
)
;
if
(
src_h
*
2
>=
dst_size.x
|
| src_w * 2 >= dst_size.y |
|
co
>=
dst_size.z
)
return
;
FLT4
r0
=
(
FLT4
)(
0.f
)
;
FLT4
r1
=
(
FLT4
)(
0.f
)
;
FLT4
r2
=
(
FLT4
)(
0.f
)
;
FLT4
r3
=
(
FLT4
)(
0.f
)
;
int
base_w
=
(
co
*
4
+
kh
+
kw
*
2
)
*
src_size.z
;
for
(
int
ci
=
0
; ci < src_size.z; ++ci) {
FLT4
x0
=
READ_IMAGE
(
src_data,
smp_zero,
(
int2
)(
src_w
*
src_size.z
+
ci,
src_h
))
;
FLT4
x1
=
READ_IMAGE
(
src_data,
smp_zero,
(
int2
)(
src_w
*
src_size.z
+
ci,
src_h
+
1
))
;
FLT4
x2
=
READ_IMAGE
(
src_data,
smp_zero,
(
int2
)((
src_w
+
1
)
*
src_size.z
+
ci,
src_h
))
;
FLT4
x3
=
READ_IMAGE
(
src_data,
smp_zero,
(
int2
)((
src_w
+
1
)
*
src_size.z
+
ci,
src_h
+
1
))
;
FLT16
weight_cache
=
weight[base_w++]
;
r0
+=
x0.x
*
weight_cache.s0123
;
r0
+=
x0.y
*
weight_cache.s4567
;
r0
+=
x0.z
*
weight_cache.s89ab
;
r0
+=
x0.w
*
weight_cache.scdef
;
r1
+=
x1.x
*
weight_cache.s0123
;
r1
+=
x1.y
*
weight_cache.s4567
;
r1
+=
x1.z
*
weight_cache.s89ab
;
r1
+=
x1.w
*
weight_cache.scdef
;
r2
+=
x2.x
*
weight_cache.s0123
;
r2
+=
x2.y
*
weight_cache.s4567
;
r2
+=
x2.z
*
weight_cache.s89ab
;
r2
+=
x2.w
*
weight_cache.scdef
;
r3
+=
x3.x
*
weight_cache.s0123
;
r3
+=
x3.y
*
weight_cache.s4567
;
r3
+=
x3.z
*
weight_cache.s89ab
;
r3
+=
x3.w
*
weight_cache.scdef
;
}
FLT4
bias_val
=
READ_IMAGE
(
biases,
smp_zero,
(
int2
)(
co,
0
))
;
r0
+=
bias_val
;
r1
+=
bias_val
;
r2
+=
bias_val
;
r3
+=
bias_val
;
WRITE_IMAGE
(
dst_data,
(
int2
)((
2
*
src_w
+
kw
)
*
dst_size.z
+
co,
2
*
src_h
+
kh
)
,
r0
)
;
WRITE_IMAGE
(
dst_data,
(
int2
)((
2
*
src_w
+
kw
)
*
dst_size.z
+
co,
2
*
src_h
+
kh
+
2
)
,
r1
)
;
WRITE_IMAGE
(
dst_data,
(
int2
)((
2
*
src_w
+
kw
+
2
)
*
dst_size.z
+
co,
2
*
src_h
+
kh
)
,
r2
)
;
WRITE_IMAGE
(
dst_data,
(
int2
)((
2
*
src_w
+
kw
+
2
)
*
dst_size.z
+
co,
2
*
src_h
+
kh
+
2
)
,
r3
)
;
}
mindspore/lite/src/runtime/kernel/opencl/cl/fp16/depthwise_conv2d.cl
已删除
100644 → 0
浏览文件 @
1793a07e
#
pragma
OPENCL
EXTENSION
cl_khr_3d_image_writes
:
enable
#
pragma
OPENCL
EXTENSION
cl_khr_fp16
:
enable
#
define
ACCUM_FLT4
half4
#
define
FLT
half
#
define
FLT2
half2
#
define
FLT3
half3
#
define
FLT4
half4
#
define
TO_FLT4
convert_half4
#
define
TO_ACCUM_TYPE
convert_half4
#
define
TO_ACCUM_FLT
convert_half
#
define
READ_IMAGE
read_imagef
#
define
WRITE_IMAGE
write_imagef
__constant
sampler_t
smp_edge
=
CLK_NORMALIZED_COORDS_FALSE
| CLK_ADDRESS_CLAMP_TO_EDGE |
CLK_FILTER_NEAREST
;
__constant
sampler_t
smp_none
=
CLK_NORMALIZED_COORDS_FALSE
| CLK_ADDRESS_NONE |
CLK_FILTER_NEAREST
;
__constant
sampler_t
smp_zero
=
CLK_NORMALIZED_COORDS_FALSE
| CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST
;
__kernel
void
DepthwiseConv2d_NC4HW4
(
__global
FLT4
*src_data,
__global
FLT4
*filters,
__global
FLT4
*biases,
float
relu_clip1,
__global
FLT4
*dst_data,
int2
kernel_size,
int2
stride,
int2
padding,
int2
dilation,
int4
src_size,
int4
dst_size
)
{
int
X
=
get_global_id
(
0
)
;
int
Y
=
get_global_id
(
1
)
;
int
Z
=
get_global_id
(
2
)
;
if
(
X
>=
dst_size.x
|
| Y >= dst_size.y || Z >= dst_size.z) return;
ACCUM_FLT4 r = (ACCUM_FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
int x_offseted = X * stride.x + padding.x;
int y_offseted = Y * stride.y + padding.y;
int fx_c = Z * kernel_size.x * kernel_size.y;
for (int ky = 0; ky < kernel_size.y; ++ky) {
int y_c = y_offseted + ky * dilation.y;
bool outside_y = y_c < 0 || y_c >= src_size.y;
for (int kx = 0; kx < kernel_size.x; ++kx) {
int x_c = x_offseted + kx * dilation.x;
bool outside_x = x_c < 0 || x_c >= src_size.x;
if (!outside_x && !outside_y) {
FLT4 f = filters[fx_c];
FLT4 src_final = src_data[(((Z)*src_size.y + (y_c)) * src_size.x + (x_c))];
r += TO_ACCUM_TYPE(src_final * f);
}
fx_c++;
}
}
FLT4 bias_val = biases[Z];
FLT4 res0 = TO_FLT4(r) + bias_val;
res0 = clamp(res0, (FLT)(0.0f), (FLT)(relu_clip1));
dst_data[(((Z)*dst_size.y + (Y)) * dst_size.x + (X))] = res0;
}
__kernel void DepthwiseConv2d_NHWC4(__global FLT4 *src_data, __global FLT4 *filters, __global FLT4 *biases,
float relu_clip1, __global FLT4 *dst_data, int2 kernel_size, int2 stride,
int2 padding, int2 dilation, int4 src_size, int4 dst_size) {
int X = get_global_id(0);
int Y = get_global_id(1);
int Z = get_global_id(2);
if (X >= dst_size.x || Y >= dst_size.y || Z >= dst_size.z) return;
ACCUM_FLT4 r = (ACCUM_FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
int x_offseted = X * stride.x + padding.x;
int y_offseted = Y * stride.y + padding.y;
int fx_c = Z * kernel_size.x * kernel_size.y;
for (int ky = 0; ky < kernel_size.y; ++ky) {
int y_c = y_offseted + ky * dilation.y;
bool outside_y = y_c < 0 || y_c >= src_size.y;
for (int kx = 0; kx < kernel_size.x; ++kx) {
int x_c = x_offseted + kx * dilation.x;
bool outside_x = x_c < 0 |
|
x_c
>=
src_size.x
;
if
(
!outside_x
&&
!outside_y
)
{
FLT4
f
=
filters[fx_c]
;
FLT4
src_final
=
src_data[
((
y_c
*
src_size.x
+
x_c
)
*
src_size.z
+
Z
)
]
;
r
+=
TO_ACCUM_TYPE
(
src_final
*
f
)
;
}
fx_c++
;
}
}
FLT4
bias_val
=
biases[Z]
;
FLT4
res0
=
TO_FLT4
(
r
)
+
bias_val
;
res0
=
clamp
(
res0,
(
FLT
)(
0.0f
)
,
(
FLT
)(
relu_clip1
))
;
dst_data[
((
Y
*
dst_size.x
+
X
)
*
dst_size.z
+
Z
)
]
=
res0
;
}
mindspore/lite/src/runtime/kernel/opencl/cl/fp16/transpose.cl
已删除
100644 → 0
浏览文件 @
1793a07e
#
define
FLT
half
#
define
FLT4
half4
#
define
READ_IMAGE
read_imageh
#
define
WRITE_IMAGE
write_imageh
__constant
sampler_t
smp_zero
=
CLK_NORMALIZED_COORDS_FALSE
| CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST
;
__kernel
void
transpose
(
__read_only
image2d_t
src_data,
__write_only
image2d_t
dst_data,
int2
HW,
int2
C
)
{
int
X
=
get_global_id
(
0
)
;
int
Y
=
get_global_id
(
1
)
;
if
(
X
>=
HW.y
||
Y
>=
C.y
)
{
return
;
}
FLT4
result[4]
;
result[0]
=
(
FLT4
)(
0.0f
)
;
result[1]
=
(
FLT4
)(
0.0f
)
;
result[2]
=
(
FLT4
)(
0.0f
)
;
result[3]
=
(
FLT4
)(
0.0f
)
;
FLT4
x0
=
READ_IMAGE
(
src_data,
smp_zero,
(
int2
)(
Y,
4
*
X
))
;
FLT4
x1
=
READ_IMAGE
(
src_data,
smp_zero,
(
int2
)(
Y,
4
*
X
+
1
))
;
FLT4
x2
=
READ_IMAGE
(
src_data,
smp_zero,
(
int2
)(
Y,
4
*
X
+
2
))
;
FLT4
x3
=
READ_IMAGE
(
src_data,
smp_zero,
(
int2
)(
Y,
4
*
X
+
3
))
;
result[0].x
=
x0.x
;
result[0].y
=
x1.x
;
result[0].z
=
x2.x
;
result[0].w
=
x3.x
;
result[1].x
=
x0.y
;
result[1].y
=
x1.y
;
result[1].z
=
x2.y
;
result[1].w
=
x3.y
;
result[2].x
=
x0.z
;
result[2].y
=
x1.z
;
result[2].z
=
x2.z
;
result[2].w
=
x3.z
;
result[3].x
=
x0.w
;
result[3].y
=
x1.w
;
result[3].z
=
x2.w
;
result[3].w
=
x3.w
;
WRITE_IMAGE
(
dst_data,
(
int2
)(
X,
4
*
Y
)
,
result[0]
)
;
WRITE_IMAGE
(
dst_data,
(
int2
)(
X,
4
*
Y
+
1
)
,
result[1]
)
;
WRITE_IMAGE
(
dst_data,
(
int2
)(
X,
4
*
Y
+
2
)
,
result[2]
)
;
WRITE_IMAGE
(
dst_data,
(
int2
)(
X,
4
*
Y
+
3
)
,
result[3]
)
;
}
mindspore/lite/src/runtime/kernel/opencl/cl/fp32/arithmetic_buffer.cl
已删除
100644 → 0
浏览文件 @
1793a07e
__kernel
void
ElementAdd
(
__global
float
*input_a,
__global
float
*input_b,
__global
float
*output,
const
unsigned
int
n
)
{
int
idx
=
get_global_id
(
0
)
;
if
(
idx
>=
n
)
return
;
output[idx]
=
input_a[idx]
+
input_b[idx]
;
}
__kernel
void
ElementSub
(
__global
float
*input_a,
__global
float
*input_b,
__global
float
*output,
const
unsigned
int
n
)
{
int
idx
=
get_global_id
(
0
)
;
if
(
idx
>=
n
)
return
;
output[idx]
=
input_a[idx]
-
input_b[idx]
;
}
__kernel
void
ElementMul
(
__global
float
*input_a,
__global
float
*input_b,
__global
float
*output,
const
unsigned
int
n
)
{
int
idx
=
get_global_id
(
0
)
;
if
(
idx
>=
n
)
return
;
output[idx]
=
input_a[idx]
*
input_b[idx]
;
}
__kernel
void
ElementDiv
(
__global
float
*input_a,
__global
float
*input_b,
__global
float
*output,
const
unsigned
int
n
)
{
int
idx
=
get_global_id
(
0
)
;
if
(
idx
>=
n
)
return
;
output[idx]
=
input_a[idx]
/
input_b[idx]
;
}
__kernel
void
BoardcastArith
(
__global
float
*input_a,
float
weight,
float
bias,
__global
float
*output,
const
unsigned
int
n
)
{
int
idx
=
get_global_id
(
0
)
;
if
(
idx
>=
n
)
return
;
output[idx]
=
weight
*
input_a[idx]
+
bias
;
}
mindspore/lite/src/runtime/kernel/opencl/cl/fp32/arithmetic_image2d.cl
已删除
100644 → 0
浏览文件 @
1793a07e
#
define
divide_no_check
(
a,
b
)
(
a/b
)
__constant
sampler_t
smp_none
=
CLK_NORMALIZED_COORDS_FALSE
| CLK_ADDRESS_NONE |
CLK_FILTER_NEAREST
;
__kernel
void
ElementAdd
(
__read_only
image2d_t
input_a,
__read_only
image2d_t
input_b,
__write_only
image2d_t
output,
const
int2
output_shape
)
{
int
X
=
get_global_id
(
0
)
;
int
Y
=
get_global_id
(
1
)
;
if
(
X
>=
output_shape.x
|
| Y >= output_shape.y) {
return;
}
float4 a = read_imagef(input_a, smp_none, (int2)(X, Y));
float4 b = read_imagef(input_b, smp_none, (int2)(X, Y));
write_imagef(output, (int2)(X, Y), a + b);
}
__kernel void ElementSub(__read_only image2d_t input_a, __read_only image2d_t input_b, __write_only image2d_t output,
const int2 output_shape) {
int X = get_global_id(0);
int Y = get_global_id(1);
if (X >= output_shape.x || Y >= output_shape.y) {
return;
}
float4 a = read_imagef(input_a, smp_none, (int2)(X, Y));
float4 b = read_imagef(input_b, smp_none, (int2)(X, Y));
write_imagef(output, (int2)(X, Y), a - b);
}
__kernel void ElementMul(__read_only image2d_t input_a, __read_only image2d_t input_b, __write_only image2d_t output,
const int2 output_shape) {
int X = get_global_id(0);
int Y = get_global_id(1);
if (X >= output_shape.x || Y >= output_shape.y) {
return;
}
float4 a = read_imagef(input_a, smp_none, (int2)(X, Y));
float4 b = read_imagef(input_b, smp_none, (int2)(X, Y));
write_imagef(output, (int2)(X, Y), a * b);
}
__kernel void ElementDiv(__read_only image2d_t input_a, __read_only image2d_t input_b, __write_only image2d_t output,
const int2 output_shape) {
int X = get_global_id(0);
int Y = get_global_id(1);
if (X >= output_shape.x || Y >= output_shape.y) {
return;
}
float4 a = read_imagef(input_a, smp_none, (int2)(X, Y));
float4 b = read_imagef(input_b, smp_none, (int2)(X, Y));
write_imagef(output, (int2)(X, Y), divide_no_check(a, b));
}
__kernel void BoardcastArith(__read_only image2d_t input_a, float weight, float bias, __write_only image2d_t output,
const int2 output_shape) {
int X = get_global_id(0);
int Y = get_global_id(1);
if (X >= output_shape.x |
|
Y
>=
output_shape.y
)
{
return
;
}
float4
a
=
read_imagef
(
input_a,
smp_none,
(
int2
)(
X,
Y
))
;
write_imagef
(
output,
(
int2
)(
X,
Y
)
,
weight
*
a
+
bias
)
;
}
mindspore/lite/src/runtime/kernel/opencl/cl/fp32/matmul.cl
已删除
100644 → 0
浏览文件 @
1793a07e
#
define
FLT4
float4
#
define
FLT16
float16
#
define
READ_IMAGE
read_imagef
#
define
WRITE_IMAGE
write_imagef
__constant
sampler_t
smp_zero
=
CLK_NORMALIZED_COORDS_FALSE
| CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST
;
__kernel
void
MatMul
(
__read_only
image2d_t
input,
__global
FLT16
*weight,
__read_only
image2d_t
bias,
__write_only
image2d_t
output,
int2
offset_ci,
int2
offset_co,
int
has_bias
)
{
int2
gid
=
(
int2
)(
get_global_id
(
0
)
,
get_global_id
(
1
))
;
int2
lid
=
(
int2
)(
get_local_id
(
0
)
,
get_local_id
(
1
))
;
FLT4
result
=
(
FLT4
)(
0.0f
)
;
bool
inside
=
gid.x
<
offset_co.y
;
for
(
uint
i
=
lid.y
; i < offset_ci.y && inside; i += 4) {
FLT4
v
=
READ_IMAGE
(
input,
smp_zero,
(
int2
)(
i,
0
))
;
FLT16
w
=
weight[gid.x
+
i
*
offset_co.y]
;
result.x
+=
dot
(
v,
w.s0123
)
;
result.y
+=
dot
(
v,
w.s4567
)
;
result.z
+=
dot
(
v,
w.s89ab
)
;
result.w
+=
dot
(
v,
w.scdef
)
;
}
__local
FLT4
temp[64][4]
;
temp[lid.x][lid.y]
=
result
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
lid.y
==
0
&&
inside
)
{
result
+=
temp[lid.x][1]
;
result
+=
temp[lid.x][2]
;
result
+=
temp[lid.x][3]
;
if
(
has_bias
!=
0
)
{
result
+=
READ_IMAGE
(
bias,
smp_zero,
(
int2
)(
gid.x,
0
))
;
}
WRITE_IMAGE
(
output,
(
int2
)(
gid.x,
0
)
,
result
)
;
}
}
mindspore/lite/src/runtime/kernel/opencl/cl/fp32/reshape.cl
已删除
100644 → 0
浏览文件 @
1793a07e
#
define
FLT
float
#
define
FLT4
float4
#
define
READ_IMAGE
read_imagef
#
define
WRITE_IMAGE
write_imagef
__constant
sampler_t
smp_zero
=
CLK_NORMALIZED_COORDS_FALSE
| CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST
;
__kernel
void
reshape
(
__read_only
image2d_t
src_data,
__write_only
image2d_t
dst_data,
int4
size
)
{
int
X
=
get_global_id
(
0
)
;
int
Y
=
get_global_id
(
1
)
;
int
Z
=
get_global_id
(
2
)
;
if
(
X
>=
size.x
|
| Y >= size.y |
|
Z
>=
size.z
)
{
return
;
}
WRITE_IMAGE
(
dst_data,
(
int2
)(
Y
*
size.z
+
Z,
X
)
,
READ_IMAGE
(
src_data,
smp_zero,
(
int2
)(
Y
*
size.z
+
Z,
X
)))
;
}
mindspore/lite/src/runtime/kernel/opencl/cl/
fp16/
matmul.cl
→
mindspore/lite/src/runtime/kernel/opencl/cl/matmul.cl
浏览文件 @
7a3b6667
#
define
FLT4
half4
#
define
FLT16
half16
#
define
READ_IMAGE
read_imageh
#
define
WRITE_IMAGE
write_imageh
__constant
sampler_t
smp_zero
=
CLK_NORMALIZED_COORDS_FALSE
| CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST
;
__kernel
void
MatMul
(
__read_only
image2d_t
input,
__global
FLT16
*weight,
__read_only
image2d_t
bias,
__write_only
image2d_t
output,
int2
offset_ci,
int2
offset_co,
int
has_bias
)
{
...
...
mindspore/lite/src/runtime/kernel/opencl/cl/
fp32/
max_pool2d.cl
→
mindspore/lite/src/runtime/kernel/opencl/cl/max_pool2d.cl
浏览文件 @
7a3b6667
文件已移动
mindspore/lite/src/runtime/kernel/opencl/cl/
fp16/
reshape.cl
→
mindspore/lite/src/runtime/kernel/opencl/cl/reshape.cl
浏览文件 @
7a3b6667
#
define
FLT
half
#
define
FLT4
half4
#
define
READ_IMAGE
read_imageh
#
define
WRITE_IMAGE
write_imageh
__constant
sampler_t
smp_zero
=
CLK_NORMALIZED_COORDS_FALSE
| CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST
;
__kernel
void
reshape
(
__read_only
image2d_t
src_data,
__write_only
image2d_t
dst_data,
int4
size
)
{
int
X
=
get_global_id
(
0
)
;
...
...
mindspore/lite/src/runtime/kernel/opencl/cl/
fp32/
softmax.cl
→
mindspore/lite/src/runtime/kernel/opencl/cl/softmax.cl
浏览文件 @
7a3b6667
文件已移动
mindspore/lite/src/runtime/kernel/opencl/cl/
fp32/
softmax1x1.cl
→
mindspore/lite/src/runtime/kernel/opencl/cl/softmax1x1.cl
浏览文件 @
7a3b6667
__constant
sampler_t
smp_none
=
CLK_NORMALIZED_COORDS_FALSE
| CLK_ADDRESS_NONE |
CLK_FILTER_NEAREST
;
__constant
sampler_t
smp_zero
=
CLK_NORMALIZED_COORDS_FALSE
| CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST
;
//
what
is
mask
and
args.slices_x32
__kernel
void
SoftMax1x1_IMG
(
__read_only
image2d_t
input,
__write_only
image2d_t
output,
const
float4
mask,
const
int
slices,
const
int
slices_x32
)
{
...
...
@@ -54,11 +54,11 @@ __kernel void SoftMax1x1_BUF(__read_only image2d_t input, __global float4 *outpu
int
tid
=
get_local_id
(
0
)
;
float
sum
=
0.0f
;
for
(
size_t
i
=
tid
; i < slices - 1; i += 32) {
float4
src
=
read_imagef
(
input,
smp_
none
,
(
int2
)(
i,
0
))
;
float4
src
=
read_imagef
(
input,
smp_
zero
,
(
int2
)(
i,
0
))
;
sum
+=
dot
((
float4
)(
1.0f
)
,
exp
(
src
))
;
}
if
((
slices
-
1
)
%
32
==
tid
)
{
float4
src
=
read_imagef
(
input,
smp_
none
,
(
int2
)(
slices
-
1
,
0
))
;
float4
src
=
read_imagef
(
input,
smp_
zero
,
(
int2
)(
slices
-
1
,
0
))
;
sum
+=
dot
(
mask,
exp
(
src
))
;
}
...
...
@@ -80,12 +80,12 @@ __kernel void SoftMax1x1_BUF(__read_only image2d_t input, __global float4 *outpu
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
sum
=
tmpx1[0]
;
for
(
size_t
i
=
tid
; i < slices - 1; i += 32) {
float4
result
=
read_imagef
(
input,
smp_
none
,
(
int2
)(
i,
0
))
;
float4
result
=
read_imagef
(
input,
smp_
zero
,
(
int2
)(
i,
0
))
;
result
=
exp
(
result
)
*
sum
;
output[i]
=
result
;
}
if
((
slices
-
1
)
%
32
==
tid
)
{
float4
result
=
read_imagef
(
input,
smp_
none
,
(
int2
)(
slices
-
1
,
0
))
;
float4
result
=
read_imagef
(
input,
smp_
zero
,
(
int2
)(
slices
-
1
,
0
))
;
result
=
exp
(
result
)
*
sum
;
__global
float4
*remain_ptr4
=
output
;
remain_ptr4
+=
slices
-
1
;
...
...
mindspore/lite/src/runtime/kernel/opencl/cl/
fp32/
to_format.cl
→
mindspore/lite/src/runtime/kernel/opencl/cl/to_format.cl
浏览文件 @
7a3b6667
#
define
FLT
float
#
define
FLT4
float4
#
define
READ_IMAGE
read_imagef
#
define
WRITE_IMAGE
write_imagef
//
enum
Format
{
//
Format_NCHW
=
0
,
//
Format_NHWC
=
1
,
//
Format_NHWC4
=
2
,
//
Format_HWKC
=
3
,
//
Format_HWCK
=
4
,
//
Format_KCHW
=
5
,
//
Format_CKHW
=
6
,
//
Format_KHWC
=
7
,
//
Format_CHWK
=
8
,
//
Format_NC4HW4
=
100
,
//
Format_NUM_OF_FORMAT
=
101
,
//
Format_MIN
=
Format_NCHW,
//
Format_MAX
=
Format_NUM_OF_FORMAT
//}
;
__constant
sampler_t
smp_zero
=
CLK_NORMALIZED_COORDS_FALSE
| CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST
;
__kernel
void
to_format_NCHW_to_NHWC4_IMG
(
__global
FLT4
*src_data,
__write_only
image2d_t
dst_data,
int4
size,
int4
shape
)
{
...
...
mindspore/lite/src/runtime/kernel/opencl/cl/
fp32/
transpose.cl
→
mindspore/lite/src/runtime/kernel/opencl/cl/transpose.cl
浏览文件 @
7a3b6667
#
define
FLT
float
#
define
FLT4
float4
#
define
READ_IMAGE
read_imagef
#
define
WRITE_IMAGE
write_imagef
__constant
sampler_t
smp_zero
=
CLK_NORMALIZED_COORDS_FALSE
| CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST
;
__kernel
void
transpose_IMG
(
__read_only
image2d_t
src_data,
__write_only
image2d_t
dst_data,
int2
HW,
int2
C
)
{
int
X
=
get_global_id
(
0
)
;
...
...
mindspore/lite/src/runtime/kernel/opencl/kernel/activation.cc
浏览文件 @
7a3b6667
...
...
@@ -24,7 +24,7 @@
#include "src/runtime/runtime_api.h"
#include "include/errorcode.h"
#include "src/runtime/kernel/opencl/cl/
fp32/
activation.cl.inc"
#include "src/runtime/kernel/opencl/cl/activation.cl.inc"
using
mindspore
::
kernel
::
KERNEL_ARCH
::
kGPU
;
using
mindspore
::
lite
::
KernelRegistrar
;
...
...
@@ -46,7 +46,7 @@ int ActivationOpenClKernel::Init() {
}
std
::
string
program_name
=
""
;
std
::
string
kernel_name
=
""
;
std
::
string
source
=
activation_source
_fp32
;
std
::
string
source
=
activation_source
;
if
(
type_
==
ActivationType_RELU
)
{
program_name
=
"RELU"
;
kernel_name
=
"Relu"
;
...
...
mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.cc
浏览文件 @
7a3b6667
...
...
@@ -22,8 +22,7 @@
#include "src/kernel_registry.h"
#include "src/runtime/kernel/opencl/utils.h"
#ifndef PROGRAM_WITH_IL
#include "src/runtime/kernel/opencl/cl/fp32/arithmetic_buffer.cl.inc"
#include "src/runtime/kernel/opencl/cl/fp32/arithmetic_image2d.cl.inc"
#include "src/runtime/kernel/opencl/cl/arithmetic.cl.inc"
#endif
using
mindspore
::
kernel
::
KERNEL_ARCH
::
kGPU
;
...
...
@@ -109,9 +108,14 @@ int ArithmeticOpenCLKernel::Init() {
error_code
=
RET_ERROR
;
}
#else
if
(
out_mem_type_
==
OpenCLMemType
::
IMG
)
{
kernel_name
+=
"_IMG"
;
}
else
{
kernel_name
+=
"_BUF"
;
}
std
::
string
program_name
=
"Arithmetic"
;
std
::
set
<
std
::
string
>
build_options
;
std
::
string
source
=
arithmetic_
image2d_source_fp32
;
std
::
string
source
=
arithmetic_
source
;
runtime_
->
LoadSource
(
program_name
,
source
);
error_code
=
runtime_
->
BuildKernel
(
kernel_
,
program_name
,
kernel_name
,
build_options
);
#endif
...
...
mindspore/lite/src/runtime/kernel/opencl/kernel/batchnorm.cc
浏览文件 @
7a3b6667
...
...
@@ -20,7 +20,7 @@
#include "src/kernel_registry.h"
#include "src/runtime/opencl/opencl_runtime.h"
#include "src/runtime/kernel/opencl/kernel/batchnorm.h"
#include "src/runtime/kernel/opencl/cl/
fp32/
batchnorm.cl.inc"
#include "src/runtime/kernel/opencl/cl/batchnorm.cl.inc"
using
mindspore
::
kernel
::
KERNEL_ARCH
::
kGPU
;
using
mindspore
::
lite
::
KernelRegistrar
;
...
...
@@ -50,7 +50,7 @@ int BatchNormOpenCLKernel::GetImageSize(size_t idx, std::vector<size_t> *img_siz
}
int
BatchNormOpenCLKernel
::
Init
()
{
std
::
set
<
std
::
string
>
build_options
;
std
::
string
source
=
batchnorm_source
_fp32
;
std
::
string
source
=
batchnorm_source
;
std
::
string
program_name
=
"batch_normalization"
;
std
::
string
kernel_name
=
"batch_normalization"
;
auto
ocl_runtime
=
lite
::
opencl
::
OpenCLRuntime
::
GetInstance
();
...
...
mindspore/lite/src/runtime/kernel/opencl/kernel/caffe_prelu.cc
浏览文件 @
7a3b6667
...
...
@@ -24,7 +24,7 @@
#include "include/errorcode.h"
#include "src/runtime/kernel/opencl/kernel/caffe_prelu.h"
#include "src/runtime/opencl/opencl_runtime.h"
#include "src/runtime/kernel/opencl/cl/
fp32/
caffe_prelu.cl.inc"
#include "src/runtime/kernel/opencl/cl/caffe_prelu.cl.inc"
#include "src/runtime/kernel/arm/nnacl/caffeprelu.h"
using
mindspore
::
kernel
::
KERNEL_ARCH
::
kGPU
;
...
...
@@ -57,7 +57,7 @@ int CaffePReluOpenCLKernel::Init() {
}
CaffeWeight
();
std
::
set
<
std
::
string
>
build_options
;
std
::
string
source
=
caffe_prelu_source
_fp32
;
std
::
string
source
=
caffe_prelu_source
;
std
::
string
program_name
=
"CaffePRelu"
;
std
::
string
kernel_name
=
"CaffePRelu"
;
auto
ocl_runtime
=
lite
::
opencl
::
OpenCLRuntime
::
GetInstance
();
...
...
mindspore/lite/src/runtime/kernel/opencl/kernel/concat.cc
浏览文件 @
7a3b6667
...
...
@@ -20,7 +20,7 @@
#include "src/kernel_registry.h"
#include "src/runtime/opencl/opencl_runtime.h"
#include "src/runtime/kernel/opencl/kernel/concat.h"
#include "src/runtime/kernel/opencl/cl/
fp32/
concat.cl.inc"
#include "src/runtime/kernel/opencl/cl/concat.cl.inc"
using
mindspore
::
kernel
::
KERNEL_ARCH
::
kGPU
;
using
mindspore
::
lite
::
KernelRegistrar
;
...
...
@@ -66,7 +66,7 @@ int ConcatOpenCLKernel::Init() {
}
if
(
in_tensors_
.
size
()
==
2
)
{
std
::
set
<
std
::
string
>
build_options
;
std
::
string
source
=
concat_source
_fp32
;
std
::
string
source
=
concat_source
;
std
::
string
program_name
=
"Concat"
;
std
::
string
kernel_name
=
"Concat"
;
auto
ocl_runtime
=
lite
::
opencl
::
OpenCLRuntime
::
GetInstance
();
...
...
@@ -76,7 +76,7 @@ int ConcatOpenCLKernel::Init() {
if
(
in_tensors_
.
size
()
==
3
)
{
std
::
set
<
std
::
string
>
build_options
;
std
::
string
source
=
concat_source
_fp32
;
std
::
string
source
=
concat_source
;
std
::
string
program_name
=
"Concat3input"
;
std
::
string
kernel_name
=
"Concat3input"
;
auto
ocl_runtime
=
lite
::
opencl
::
OpenCLRuntime
::
GetInstance
();
...
...
mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d_transpose.cc
浏览文件 @
7a3b6667
...
...
@@ -20,8 +20,7 @@
#include "src/runtime/opencl/opencl_runtime.h"
#include "src/runtime/kernel/opencl/kernel/conv2d_transpose.h"
#ifndef PROGRAM_WITH_IL
#include "src/runtime/kernel/opencl/cl/fp16/conv2d_transpose2x2.cl.inc"
#include "src/runtime/kernel/opencl/cl/fp32/conv2d_transpose2x2.cl.inc"
#include "src/runtime/kernel/opencl/cl/conv2d_transpose2x2.cl.inc"
#endif
using
mindspore
::
kernel
::
KERNEL_ARCH
::
kGPU
;
...
...
@@ -45,11 +44,7 @@ int Conv2dTransposeOpenCLKernel::Init() {
#ifdef PROGRAM_WITH_IL
ocl_runtime
->
CreateKernelFromIL
(
kernel_
(),
kernel_name
);
#else
#ifdef ENABLE_FP16
std
::
string
source
=
conv2d_transpose2x2_source_fp16
;
#else
std
::
string
source
=
conv2d_transpose2x2_source_fp32
;
#endif
std
::
string
source
=
conv2d_transpose2x2_source
;
std
::
set
<
std
::
string
>
build_options
;
std
::
string
program_name
=
"conv2d_transpose2x2"
;
ocl_runtime
->
LoadSource
(
program_name
,
source
);
...
...
mindspore/lite/src/runtime/kernel/opencl/kernel/depthwise_conv2d.cc
浏览文件 @
7a3b6667
...
...
@@ -26,8 +26,7 @@
#ifndef PROGRAM_WITH_IL
#include "src/runtime/kernel/opencl/cl/fp16/depthwise_conv2d.cl.inc"
#include "src/runtime/kernel/opencl/cl/fp32/depthwise_conv2d.cl.inc"
#include "src/runtime/kernel/opencl/cl/depthwise_conv2d.cl.inc"
#endif
...
...
@@ -68,11 +67,7 @@ int DepthwiseConv2dOpenCLKernel::Init() {
#else
std
::
string
program_name
=
"DepthwiseConv2d"
;
std
::
set
<
std
::
string
>
build_options
;
#ifdef ENABLE_FP16
std
::
string
source
=
depthwise_conv2d_source_fp16
;
#else
std
::
string
source
=
depthwise_conv2d_source_fp32
;
#endif
std
::
string
source
=
depthwise_conv2d_source
;
ocl_runtime
->
LoadSource
(
program_name
,
source
);
ocl_runtime
->
BuildKernel
(
kernel_
,
program_name
,
kernel_name
,
build_options
);
#endif
...
...
mindspore/lite/src/runtime/kernel/opencl/kernel/matmul.cc
浏览文件 @
7a3b6667
...
...
@@ -21,8 +21,7 @@
#include "src/runtime/kernel/arm/nnacl/fp32/matmul.h"
#include "src/runtime/kernel/opencl/kernel/matmul.h"
#ifndef PROGRAM_WITH_IL
#include "src/runtime/kernel/opencl/cl/fp16/matmul.cl.inc"
#include "src/runtime/kernel/opencl/cl/fp32/matmul.cl.inc"
#include "src/runtime/kernel/opencl/cl/matmul.cl.inc"
#endif
using
mindspore
::
kernel
::
KERNEL_ARCH
::
kGPU
;
...
...
@@ -40,11 +39,7 @@ int MatMulOpenCLKernel::Init() {
ocl_runtime
->
CreateKernelFromIL
(
kernel_
(),
kernel_name
);
#else
std
::
set
<
std
::
string
>
build_options
;
#ifdef ENABLE_FP16
std
::
string
source
=
matmul_source_fp16
;
#else
std
::
string
source
=
matmul_source_fp32
;
#endif
std
::
string
source
=
matmul_source
;
std
::
string
program_name
=
"MatMul"
;
ocl_runtime
->
LoadSource
(
program_name
,
source
);
ocl_runtime
->
BuildKernel
(
kernel_
,
program_name
,
kernel_name
,
build_options
);
...
...
mindspore/lite/src/runtime/kernel/opencl/kernel/pooling2d.cc
浏览文件 @
7a3b6667
...
...
@@ -24,8 +24,8 @@
#include "src/runtime/opencl/opencl_runtime.h"
#include "src/runtime/kernel/opencl/image_format.h"
#ifndef PROGRAM_WITH_IL
#include "src/runtime/kernel/opencl/cl/
fp32/max
_pool2d.cl.inc"
#include "src/runtime/kernel/opencl/cl/
fp32/avg
_pool2d.cl.inc"
#include "src/runtime/kernel/opencl/cl/
avg
_pool2d.cl.inc"
#include "src/runtime/kernel/opencl/cl/
max
_pool2d.cl.inc"
#endif
using
mindspore
::
kernel
::
KERNEL_ARCH
::
kGPU
;
...
...
@@ -46,13 +46,13 @@ int PoolingOpenCLKernel::Init() {
if
(
parameter_
->
max_pooling_
)
{
kernel_name
=
"MaxPooling2d"
;
#ifndef PROGRAM_WITH_IL
source
=
max_pool2d_source
_fp32
;
source
=
max_pool2d_source
;
program_name
=
"MaxPooling2d"
;
#endif
}
else
if
(
parameter_
->
avg_pooling_
)
{
kernel_name
=
"AvgPooling2d"
;
#ifndef PROGRAM_WITH_IL
source
=
avg_pool2d_source
_fp32
;
source
=
avg_pool2d_source
;
program_name
=
"AvgPooling2d"
;
#endif
}
else
{
...
...
mindspore/lite/src/runtime/kernel/opencl/kernel/prelu.cc
浏览文件 @
7a3b6667
...
...
@@ -23,7 +23,7 @@
#include "include/errorcode.h"
#include "src/runtime/kernel/opencl/kernel/prelu.h"
#include "src/runtime/opencl/opencl_runtime.h"
#include "src/runtime/kernel/opencl/cl/
fp32/
activation.cl.inc"
#include "src/runtime/kernel/opencl/cl/activation.cl.inc"
#include "src/runtime/kernel/arm/nnacl/prelu_parameter.h"
using
mindspore
::
kernel
::
KERNEL_ARCH
::
kGPU
;
...
...
@@ -40,7 +40,7 @@ int PReluOpenCLKernel::Init() {
return
RET_ERROR
;
}
std
::
set
<
std
::
string
>
build_options
;
std
::
string
source
=
activation_source
_fp32
;
std
::
string
source
=
activation_source
;
std
::
string
program_name
=
"PRelu"
;
std
::
string
kernel_name
=
"ReluScalar"
;
auto
ocl_runtime
=
lite
::
opencl
::
OpenCLRuntime
::
GetInstance
();
...
...
mindspore/lite/src/runtime/kernel/opencl/kernel/reshape.cc
浏览文件 @
7a3b6667
...
...
@@ -20,8 +20,7 @@
#include "src/kernel_registry.h"
#include "src/runtime/opencl/opencl_runtime.h"
#include "src/runtime/kernel/opencl/kernel/reshape.h"
#include "src/runtime/kernel/opencl/cl/fp16/reshape.cl.inc"
#include "src/runtime/kernel/opencl/cl/fp32/reshape.cl.inc"
#include "src/runtime/kernel/opencl/cl/reshape.cl.inc"
using
mindspore
::
kernel
::
KERNEL_ARCH
::
kGPU
;
using
mindspore
::
lite
::
KernelRegistrar
;
...
...
@@ -39,11 +38,7 @@ int ReshapeOpenCLKernel::Init() {
ocl_runtime
->
CreateKernelFromIL
(
kernel_
(),
kernel_name
);
#else
std
::
set
<
std
::
string
>
build_options
;
#ifdef ENABLE_FP16
std
::
string
source
=
reshape_source_fp16
;
#else
std
::
string
source
=
reshape_source_fp32
;
#endif
std
::
string
source
=
reshape_source
;
std
::
string
program_name
=
"reshape"
;
ocl_runtime
->
LoadSource
(
program_name
,
source
);
ocl_runtime
->
BuildKernel
(
kernel_
,
program_name
,
kernel_name
,
build_options
);
...
...
mindspore/lite/src/runtime/kernel/opencl/kernel/softmax.cc
浏览文件 @
7a3b6667
...
...
@@ -22,8 +22,8 @@
#include "src/runtime/opencl/opencl_runtime.h"
#include "src/runtime/kernel/opencl/utils.h"
#ifndef PROGRAM_WITH_IL
#include "src/runtime/kernel/opencl/cl/
fp32/
softmax.cl.inc"
#include "src/runtime/kernel/opencl/cl/
fp32/
softmax1x1.cl.inc"
#include "src/runtime/kernel/opencl/cl/softmax.cl.inc"
#include "src/runtime/kernel/opencl/cl/softmax1x1.cl.inc"
#endif
using
mindspore
::
kernel
::
KERNEL_ARCH
::
kGPU
;
...
...
@@ -88,7 +88,7 @@ int SoftmaxOpenCLKernel::GetImageSize(size_t idx, std::vector<size_t> *img_size)
int
SoftmaxOpenCLKernel
::
Init
()
{
std
::
string
kernel_name
=
"SoftMax"
;
std
::
string
program_name
=
"SoftMax"
;
std
::
string
source
=
softmax_source
_fp32
;
std
::
string
source
=
softmax_source
;
runtime_
=
lite
::
opencl
::
OpenCLRuntime
::
GetInstance
();
// framework not set this param yet! just use default.
if
(
parameter_
->
axis_
==
-
1
)
{
...
...
@@ -101,7 +101,7 @@ int SoftmaxOpenCLKernel::Init() {
// support 2d tensor
kernel_name
+=
"1x1"
;
program_name
+=
"1x1"
;
source
=
softmax1x1_source
_fp32
;
source
=
softmax1x1_source
;
onexone_flag_
=
true
;
}
else
{
MS_LOG
(
EXCEPTION
)
<<
"Init `Softmax` kernel failed: Unsupported axis: "
<<
parameter_
->
axis_
;
...
...
mindspore/lite/src/runtime/kernel/opencl/kernel/to_format.cc
浏览文件 @
7a3b6667
...
...
@@ -22,7 +22,7 @@
#include "include/errorcode.h"
#include "src/kernel_registry.h"
#include "src/runtime/opencl/opencl_runtime.h"
#include "src/runtime/kernel/opencl/cl/
fp32/
to_format.cl.inc"
#include "src/runtime/kernel/opencl/cl/to_format.cl.inc"
using
mindspore
::
kernel
::
KERNEL_ARCH
::
kGPU
;
using
mindspore
::
lite
::
KernelRegistrar
;
...
...
@@ -37,10 +37,9 @@ int ToFormatOpenCLKernel::Init() {
auto
parameter
=
reinterpret_cast
<
OpenCLToFormatParameter
*>
(
op_parameter_
);
out_mem_type_
=
parameter
->
out_mem_type
;
std
::
string
program_name
=
"to_format"
;
std
::
map
<
schema
::
Format
,
std
::
string
>
format_str
{{
schema
::
Format_NCHW
,
"NCHW"
},
{
schema
::
Format_NHWC
,
"NHWC"
},
{
schema
::
Format_NC4HW4
,
"NC4HW4"
},
{
schema
::
Format_NHWC4
,
"NHWC4"
}};
std
::
map
<
schema
::
Format
,
std
::
string
>
format_str
{{
schema
::
Format_NCHW
,
"NCHW"
},
{
schema
::
Format_NHWC
,
"NHWC"
},
{
schema
::
Format_NC4HW4
,
"NC4HW4"
},
{
schema
::
Format_NC4
,
"NHWC4"
},
{
schema
::
Format_NC
,
"NHWC"
},
{
schema
::
Format_NHWC4
,
"NHWC4"
}};
std
::
string
kernel_name
=
"to_format_"
+
format_str
[
in_tensors_
[
0
]
->
GetFormat
()]
+
"_to_"
+
format_str
[
out_tensors_
[
0
]
->
GetFormat
()];
if
(
out_mem_type_
==
OpenCLMemType
::
IMG
)
{
...
...
@@ -49,49 +48,54 @@ int ToFormatOpenCLKernel::Init() {
kernel_name
+=
"_BUF"
;
}
this
->
set_name
(
kernel_name
);
#ifdef PROGRAM_WITH_IL
ocl_runtime
->
CreateKernelFromIL
(
kernel_
(),
kernel_name
);
#else
std
::
set
<
std
::
string
>
build_options
;
#ifdef ENABLE_FP16
std
::
string
source
=
to_format_source_fp16
;
#else
std
::
string
source
=
to_format_source_fp32
;
#endif
std
::
string
source
=
to_format_source
;
ocl_runtime
->
LoadSource
(
program_name
,
source
);
ocl_runtime
->
BuildKernel
(
kernel_
,
program_name
,
kernel_name
,
build_options
);
#endif
InitNHWCShape
();
MS_LOG
(
DEBUG
)
<<
kernel_name
<<
" Init Done!"
;
return
RET_OK
;
}
int
ToFormatOpenCLKernel
::
ReSize
()
{
return
RET_OK
;
}
int
ToFormatOpenCLKernel
::
GetGlobalSize
(
size_t
idx
,
std
::
vector
<
size_t
>
*
global_size
)
{
int
ToFormatOpenCLKernel
::
InitNHWCShape
()
{
std
::
vector
<
int
>
shapex
=
out_tensors_
[
0
]
->
shape
();
size_t
n
,
h
,
w
,
c
;
if
(
out_tensors_
[
0
]
->
GetFormat
()
==
schema
::
Format_NHWC4
||
out_tensors_
[
0
]
->
GetFormat
()
==
schema
::
Format_NHWC
)
{
int
h
=
shapex
[
1
];
int
w
=
shapex
[
2
];
int
c
=
shapex
[
3
];
int
c4
=
UP_DIV
(
c
,
C4NUM
);
std
::
vector
<
size_t
>
vec
=
{(
size_t
)
h
,
(
size_t
)
w
,
(
size_t
)
c4
};
*
global_size
=
std
::
move
(
vec
);
n
=
shapex
[
0
];
h
=
shapex
[
1
];
w
=
shapex
[
2
];
c
=
shapex
[
3
];
}
else
if
(
out_tensors_
[
0
]
->
GetFormat
()
==
schema
::
Format_NC4HW4
||
out_tensors_
[
0
]
->
GetFormat
()
==
schema
::
Format_NCHW
)
{
int
h
=
shapex
[
2
];
int
w
=
shapex
[
3
];
int
c
=
shapex
[
1
];
int
c4
=
UP_DIV
(
c
,
C4NUM
);
std
::
vector
<
size_t
>
vec
=
{(
size_t
)
c4
,
(
size_t
)
h
,
(
size_t
)
w
};
*
global_size
=
std
::
move
(
vec
);
}
else
if
(
out_tensors_
[
0
]
->
GetFormat
()
==
out_tensors_
[
0
]
->
GetFormat
()
==
schema
::
Format_NCHW
)
{
int
h
=
shapex
[
2
];
int
w
=
shapex
[
3
];
int
c
=
shapex
[
1
];
int
w4
=
UP_DIV
(
w
,
C4NUM
);
std
::
vector
<
size_t
>
vec
=
{(
size_t
)
w4
,
(
size_t
)
h
,
(
size_t
)
c
};
*
global_size
=
std
::
move
(
vec
);
n
=
shapex
[
0
];
h
=
shapex
[
2
];
w
=
shapex
[
3
];
c
=
shapex
[
1
];
}
else
if
(
out_tensors_
[
0
]
->
GetFormat
()
==
schema
::
Format_NC4
||
out_tensors_
[
0
]
->
GetFormat
()
==
schema
::
Format_NC
)
{
n
=
shapex
[
0
];
h
=
1
;
w
=
1
;
c
=
shapex
[
1
];
}
else
{
n
=
shapex
[
0
];
h
=
shapex
[
1
];
w
=
shapex
[
2
];
c
=
shapex
[
3
];
}
nhwc_shape_
=
{
n
,
h
,
w
,
c
};
return
RET_OK
;
}
int
ToFormatOpenCLKernel
::
ReSize
()
{
return
RET_OK
;
}
int
ToFormatOpenCLKernel
::
GetGlobalSize
(
size_t
idx
,
std
::
vector
<
size_t
>
*
global_size
)
{
std
::
vector
<
size_t
>
vec
=
{
nhwc_shape_
[
1
],
nhwc_shape_
[
2
],
UP_DIV
(
nhwc_shape_
[
3
],
C4NUM
)};
*
global_size
=
std
::
move
(
vec
);
return
RET_OK
;
}
int
ToFormatOpenCLKernel
::
GetLocalSize
(
size_t
idx
,
const
std
::
vector
<
size_t
>
&
global_size
,
...
...
@@ -114,6 +118,12 @@ int ToFormatOpenCLKernel::GetImageSize(size_t idx, std::vector<size_t> *img_size
int
c
=
shapex
[
3
];
im_dst_x
=
w
*
UP_DIV
(
c
,
C4NUM
);
im_dst_y
=
h
;
}
else
if
(
out_tensors_
[
0
]
->
GetFormat
()
==
schema
::
Format_NC4
)
{
int
h
=
1
;
int
w
=
1
;
int
c
=
shapex
[
1
];
im_dst_x
=
w
*
UP_DIV
(
c
,
C4NUM
);
im_dst_y
=
h
;
}
else
{
MS_LOG
(
ERROR
)
<<
"Unsupported format. "
<<
out_tensors_
[
0
]
->
GetFormat
();
}
...
...
@@ -128,15 +138,13 @@ int ToFormatOpenCLKernel::GetImageSize(size_t idx, std::vector<size_t> *img_size
return
RET_OK
;
}
int
ToFormatOpenCLKernel
::
Run
()
{
MS_LOG
(
DEBUG
)
<<
"ToFormat"
<<
" Running!"
;
MS_LOG
(
DEBUG
)
<<
this
->
name
()
<<
" Running!"
;
auto
ocl_runtime
=
lite
::
opencl
::
OpenCLRuntime
::
GetInstance
();
std
::
vector
<
size_t
>
local
=
{};
std
::
vector
<
size_t
>
global
;
GetGlobalSize
(
0
,
&
global
);
auto
shapex
=
in_tensors_
[
0
]
->
shape
();
cl_int4
shape
{
shapex
.
size
()
>
0
?
shapex
[
0
]
:
1
,
shapex
.
size
()
>
1
?
shapex
[
1
]
:
1
,
shapex
.
size
()
>
2
?
shapex
[
2
]
:
1
,
shapex
.
size
()
>
3
?
shapex
[
3
]
:
1
};
cl_int4
shape
{(
cl_int
)
nhwc_shape_
[
0
],
(
cl_int
)
nhwc_shape_
[
1
],
(
cl_int
)
nhwc_shape_
[
2
],
(
cl_int
)
nhwc_shape_
[
3
]};
cl_int4
gsize
{(
cl_int
)
global
[
0
],
(
cl_int
)
global
[
1
],
(
cl_int
)
global
[
2
],
1
};
ocl_runtime
->
SetKernelArg
(
kernel_
,
0
,
in_tensors_
[
0
]
->
Data
());
ocl_runtime
->
SetKernelArg
(
kernel_
,
1
,
out_tensors_
[
0
]
->
Data
());
...
...
mindspore/lite/src/runtime/kernel/opencl/kernel/to_format.h
浏览文件 @
7a3b6667
...
...
@@ -37,9 +37,11 @@ class ToFormatOpenCLKernel : public OpenCLKernel {
int
GetImageSize
(
size_t
idx
,
std
::
vector
<
size_t
>
*
img_size
)
override
;
int
GetGlobalSize
(
size_t
idx
,
std
::
vector
<
size_t
>
*
global_size
)
override
;
int
GetLocalSize
(
size_t
idx
,
const
std
::
vector
<
size_t
>
&
global_size
,
std
::
vector
<
size_t
>
*
local_size
)
override
;
int
InitNHWCShape
();
private:
cl
::
Kernel
kernel_
;
std
::
vector
<
size_t
>
nhwc_shape_
;
};
}
// namespace mindspore::kernel
...
...
mindspore/lite/src/runtime/kernel/opencl/kernel/transpose.cc
浏览文件 @
7a3b6667
...
...
@@ -21,8 +21,7 @@
#include "src/runtime/opencl/opencl_runtime.h"
#include "src/runtime/kernel/opencl/kernel/transpose.h"
#ifndef PROGRAM_WITH_IL
#include "src/runtime/kernel/opencl/cl/fp16/transpose.cl.inc"
#include "src/runtime/kernel/opencl/cl/fp32/transpose.cl.inc"
#include "src/runtime/kernel/opencl/cl/transpose.cl.inc"
#endif
using
mindspore
::
kernel
::
KERNEL_ARCH
::
kGPU
;
...
...
@@ -45,11 +44,7 @@ int TransposeOpenCLKernel::Init() {
ocl_runtime
->
CreateKernelFromIL
(
kernel_
(),
kernel_name
);
#else
std
::
set
<
std
::
string
>
build_options
;
#ifdef ENABLE_FP16
std
::
string
source
=
transpose_source_fp16
;
#else
std
::
string
source
=
transpose_source_fp32
;
#endif
std
::
string
source
=
transpose_source
;
std
::
string
program_name
=
"transpose"
;
ocl_runtime
->
LoadSource
(
program_name
,
source
);
ocl_runtime
->
BuildKernel
(
kernel_
,
program_name
,
kernel_name
,
build_options
);
...
...
mindspore/lite/src/runtime/opencl/opencl_runtime.cc
浏览文件 @
7a3b6667
...
...
@@ -288,13 +288,13 @@ int OpenCLRuntime::BuildKernel(cl::Kernel &kernel, const std::string &program_na
if
(
fp16_enable_
)
{
// fp16 enable, kernel will use half and read_imageh and write_imageh.
build_options_str
=
"-DFL
OAT=half -DFLOAT4=half4 -DRI_F=read_imageh
"
"-DW
I_F=write_imageh
"
;
"-DFL
T=half -DFLT4=half4 -DFLT16=half16
"
"-DW
RITE_IMAGE=write_imageh -DREAD_IMAGE=read_imageh -DTO_FLT4=convert_half4
"
;
}
else
{
// fp16 not enable, kernel will use float and read_imagef and write_imagef.
build_options_str
=
"-DFL
OAT=float -DFLOAT4=float4 -DRI_F=read_imagef
"
"-DW
I_F=write_imagef
"
;
"-DFL
T=float -DFLT4=float4 -DFLT16=float16
"
"-DW
RITE_IMAGE=write_imagef -DREAD_IMAGE=read_imagef -DTO_FLT4=convert_float4
"
;
}
build_options_str
=
std
::
accumulate
(
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录