Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
magicwindyyd
mindspore
提交
58877370
M
mindspore
项目概览
magicwindyyd
/
mindspore
与 Fork 源项目一致
Fork自
MindSpore / mindspore
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
M
mindspore
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
58877370
编写于
4年前
作者:
M
mindspore-ci-bot
提交者:
Gitee
4年前
浏览文件
操作
浏览文件
下载
差异文件
!5143 support fp16 for opencl depthwise
Merge pull request !5143 from wandongdong/master
上级
4653728e
b972ea62
变更
8
展开全部
隐藏空白更改
内联
并排
Showing
8 changed file
with
335 addition
and
552 deletion
+335
-552
mindspore/lite/src/runtime/kernel/opencl/cl/depthwise_conv2d.cl
...ore/lite/src/runtime/kernel/opencl/cl/depthwise_conv2d.cl
+76
-79
mindspore/lite/src/runtime/kernel/opencl/cl/to_format.cl
mindspore/lite/src/runtime/kernel/opencl/cl/to_format.cl
+10
-120
mindspore/lite/src/runtime/kernel/opencl/kernel/depthwise_conv2d.cc
...lite/src/runtime/kernel/opencl/kernel/depthwise_conv2d.cc
+40
-19
mindspore/lite/src/runtime/kernel/opencl/kernel/depthwise_conv2d.h
.../lite/src/runtime/kernel/opencl/kernel/depthwise_conv2d.h
+2
-3
mindspore/lite/src/runtime/kernel/opencl/kernel/to_format.cc
mindspore/lite/src/runtime/kernel/opencl/kernel/to_format.cc
+1
-0
mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.cc
.../lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.cc
+3
-4
mindspore/lite/src/runtime/kernel/opencl/utils.h
mindspore/lite/src/runtime/kernel/opencl/utils.h
+68
-0
mindspore/lite/test/ut/src/runtime/kernel/opencl/depthwise_conv2d_tests.cc
...st/ut/src/runtime/kernel/opencl/depthwise_conv2d_tests.cc
+135
-327
未找到文件。
mindspore/lite/src/runtime/kernel/opencl/cl/depthwise_conv2d.cl
浏览文件 @
58877370
__constant
sampler_t
sampler_zero
=
CLK_NORMALIZED_COORDS_FALSE
| CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST
;
#
pragma
OPENCL
EXTENSION
cl_khr_fp16
:
enable
__constant
sampler_t
smp_zero
=
CLK_NORMALIZED_COORDS_FALSE
| CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST
;
__kernel
void
DepthwiseConv2d_IMG_NC4HW4
(
__read_only
image2d_t
src_data,
__global
FLT4
*filter,
__global
FLT4
*bias,
float
relu_clip
1
,
__write_only
image2d_t
dst_data,
int2
kernel_size,
float
relu_clip,
__write_only
image2d_t
dst_data,
int2
kernel_size,
int2
stride,
int2
padding,
int2
dilation,
int4
src_size,
int4
dst_size
)
{
int
X
=
get_global_id
(
0
)
;
int
Y
=
get_global_id
(
1
)
;
int
Z
=
get_global_id
(
2
)
;
if
(
X
>=
dst_size.x
|
| Y >= dst_size.y || Z >= dst_size.z) return;
FLT4 r = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
int x_offset
ed
= X * stride.x + padding.x;
int y_offset
ed
= Y * stride.y + padding.y;
int x_offset = X * stride.x + padding.x;
int y_offset = Y * stride.y + padding.y;
int fx_c = Z * kernel_size.x * kernel_size.y;
for (int ky = 0; ky < kernel_size.y; ++ky) {
int y_c = y_offset
ed
+ ky * dilation.y;
int y_c = y_offset + ky * dilation.y;
bool outside_y = y_c < 0 || y_c >= src_size.y;
for (int kx = 0; kx < kernel_size.x; ++kx) {
int x_c = x_offset
ed
+ kx * dilation.x;
int x_c = x_offset + kx * dilation.x;
bool outside_x = x_c < 0 || x_c >= src_size.x;
if (!outside_x && !outside_y) {
FLT4 f = filter[fx_c];
// FLT4 src_final =src_data[(((Z) * src_size.y + (y_c)) * src_size.x + (x_c))];
FLT4 src_final = read_imagef(src_data, sampler_zero, (int2)(x_c, (Z * src_size.y + y_c)));
r += TO_FLT4(src_final * f);
FLT4 flt_p = filter[fx_c];
FLT4 src_p = READ_IMAGE(src_data, smp_zero, (int2)(x_c, (Z * src_size.y + y_c)));
r += TO_FLT4(src_p * flt_p);
}
fx_c++;
}
}
FLT4 bias_val = bias[Z];
FLT4 res0 = TO_FLT4(r) + bias_val;
res0 = clamp(res0, (FLT)(0.0f), (FLT)(relu_clip1));
// dst_data[(((Z) * dst_size.y + (Y)) * dst_size.x + (X))] = res0;
write_imagef(dst_data, (int2)(X, (Z * dst_size.y + Y)), res0);
FLT4 bias_p = bias[Z];
FLT4 res = TO_FLT4(r) + bias_p;
res = clamp(res, (FLT)(0.0f), (FLT)(relu_clip));
WRITE_IMAGE(dst_data, (int2)(X, (Z * dst_size.y + Y)), res);
}
__kernel void DepthwiseConv2d_IMG_NHWC4(__read_only image2d_t src_data, __global FLT4 *filter, __global FLT4 *bias,
float relu_clip
1
, __write_only image2d_t dst_data, int2 kernel_size,
float relu_clip, __write_only image2d_t dst_data, int2 kernel_size,
int2 stride, int2 padding, int2 dilation, int4 src_size, int4 dst_size) {
int X = get_global_id(0);
int Y = get_global_id(1);
int Z = get_global_id(2);
if (X >= dst_size.x || Y >= dst_size.y || Z >= dst_size.z) return;
FLT4 r = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
int x_offset
ed
= X * stride.x + padding.x;
int y_offset
ed
= Y * stride.y + padding.y;
int x_offset = X * stride.x + padding.x;
int y_offset = Y * stride.y + padding.y;
int fx_c = Z * kernel_size.x * kernel_size.y;
for (int ky = 0; ky < kernel_size.y; ++ky) {
int y_c = y_offset
ed
+ ky * dilation.y;
int y_c = y_offset + ky * dilation.y;
bool outside_y = y_c < 0 || y_c >= src_size.y;
for (int kx = 0; kx < kernel_size.x; ++kx) {
int x_c = x_offset
ed
+ kx * dilation.x;
int x_c = x_offset + kx * dilation.x;
bool outside_x = x_c < 0 || x_c >= src_size.x;
if (!outside_x && !outside_y) {
FLT4 f = filter[fx_c];
// FLT4 src_final =src_data[((y_c * src_size.x + x_c) * src_size.z + Z)];
FLT4 src_final = read_imagef(src_data, sampler_zero, (int2)(Z + x_c * src_size.z, y_c));
r += TO_FLT4(src_final * f);
FLT4 flt_p = filter[fx_c];
FLT4 src_p = READ_IMAGE(src_data, smp_zero, (int2)(Z + x_c * src_size.z, y_c));
r += TO_FLT4(src_p * flt_p);
}
fx_c++;
}
}
FLT4 bias_val = bias[Z];
FLT4 res0 = TO_FLT4(r) + bias_val;
res0 = clamp(res0, (FLT)(0.0f), (FLT)(relu_clip1));
// dst_data[((Y * dst_size.x + X) * dst_size.z + Z)] = res0;
write_imagef(dst_data, (int2)(X * dst_size.z + Z, Y), res0);
FLT4 bias_p = bias[Z];
FLT4 res = TO_FLT4(r) + bias_p;
res = clamp(res, (FLT)(0.0f), (FLT)(relu_clip));
WRITE_IMAGE(dst_data, (int2)(X * dst_size.z + Z, Y), res);
}
__kernel void DepthwiseConv2d_IMG_NHWC4_1x1(__read_only image2d_t src_data, __global FLT4 *filter, __global FLT4 *bias,
float relu_clip
1
, __write_only image2d_t dst_data, int2 kernel_size,
float relu_clip, __write_only image2d_t dst_data, int2 kernel_size,
int2 stride, int2 padding, int2 dilation, int4 src_size, int4 dst_size) {
int X = get_global_id(0);
int Y = get_global_id(1);
int Z = get_global_id(2);
if (X >= dst_size.x || Y >= dst_size.y || Z >= dst_size.z) return;
FLT4 r = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
int x_offset
ed
= X * stride.x + padding.x;
int y_offset
ed
= Y * stride.y + padding.y;
int x_offset = X * stride.x + padding.x;
int y_offset = Y * stride.y + padding.y;
int fx_c = Z;
{
int y_c = y_offset
ed
;
int y_c = y_offset;
bool outside_y = y_c < 0 || y_c >= src_size.y;
{
int x_c = x_offset
ed
;
int x_c = x_offset;
bool outside_x = x_c < 0 || x_c >= src_size.x;
if (!outside_x && !outside_y) {
FLT4 f = filter[fx_c];
// FLT4 src_
final
=src_data[((y_c * src_size.x + x_c) * src_size.z + Z)];
FLT4 src_
final = read_imagef(src_data, sampler
_zero, (int2)(Z, (y_c * src_size.x + x_c) * src_size.z));
r += TO_FLT4(src_
final * f
);
FLT4 f
lt_p
= filter[fx_c];
// FLT4 src_
p
=src_data[((y_c * src_size.x + x_c) * src_size.z + Z)];
FLT4 src_
p = READ_IMAGE(src_data, smp
_zero, (int2)(Z, (y_c * src_size.x + x_c) * src_size.z));
r += TO_FLT4(src_
p * flt_p
);
}
}
}
FLT4 bias_
val
= bias[Z];
FLT4 res
0 = TO_FLT4(r) + bias_val
;
res
0 = clamp(res0, (FLT)(0.0f), (FLT)(relu_clip1
));
// dst_data[((Y * dst_size.x + X) * dst_size.z + Z)] = res
0
;
write_imagef(dst_data, (int2)(Z, (Y * dst_size.x + X) * dst_size.z), res0
);
FLT4 bias_
p
= bias[Z];
FLT4 res
= TO_FLT4(r) + bias_p
;
res
= clamp(res, (FLT)(0.0f), (FLT)(relu_clip
));
// dst_data[((Y * dst_size.x + X) * dst_size.z + Z)] = res;
WRITE_IMAGE(dst_data, (int2)(Z, (Y * dst_size.x + X) * dst_size.z), res
);
}
__kernel void DepthwiseConv2d_BUF_NC4HW4(__global FLT4 *src_data, __global FLT4 *filter, __global FLT4 *bias,
float relu_clip
1
, __global FLT4 *dst_data, int2 kernel_size, int2 stride,
float relu_clip, __global FLT4 *dst_data, int2 kernel_size, int2 stride,
int2 padding, int2 dilation, int4 src_size, int4 dst_size) {
int X = get_global_id(0);
int Y = get_global_id(1);
int Z = get_global_id(2);
if (X >= dst_size.x || Y >= dst_size.y || Z >= dst_size.z) return;
FLT4 r = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
int x_offset
ed
= X * stride.x + padding.x;
int y_offset
ed
= Y * stride.y + padding.y;
int x_offset = X * stride.x + padding.x;
int y_offset = Y * stride.y + padding.y;
int fx_c = Z * kernel_size.x * kernel_size.y;
for (int ky = 0; ky < kernel_size.y; ++ky) {
int y_c = y_offset
ed
+ ky * dilation.y;
int y_c = y_offset + ky * dilation.y;
bool outside_y = y_c < 0 || y_c >= src_size.y;
for (int kx = 0; kx < kernel_size.x; ++kx) {
int x_c = x_offset
ed
+ kx * dilation.x;
int x_c = x_offset + kx * dilation.x;
bool outside_x = x_c < 0 || x_c >= src_size.x;
if (!outside_x && !outside_y) {
FLT4 f = filter[fx_c];
FLT4 src_
final
= src_data[(((Z)*src_size.y + (y_c)) * src_size.x + (x_c))];
r += TO_FLT4(src_
final * f
);
FLT4 f
lt_p
= filter[fx_c];
FLT4 src_
p
= src_data[(((Z)*src_size.y + (y_c)) * src_size.x + (x_c))];
r += TO_FLT4(src_
p * flt_p
);
}
fx_c++;
}
}
FLT4 bias_
val
= bias[Z];
FLT4 res
0 = TO_FLT4(r) + bias_val
;
res
0 = clamp(res0, (FLT)(0.0f), (FLT)(relu_clip1
));
dst_data[(((Z)*dst_size.y + (Y)) * dst_size.x + (X))] = res
0
;
FLT4 bias_
p
= bias[Z];
FLT4 res
= TO_FLT4(r) + bias_p
;
res
= clamp(res, (FLT)(0.0f), (FLT)(relu_clip
));
dst_data[(((Z)*dst_size.y + (Y)) * dst_size.x + (X))] = res;
}
__kernel void DepthwiseConv2d_BUF_NHWC4(__global FLT4 *src_data, __global FLT4 *filter, __global FLT4 *bias,
float relu_clip
1
, __global FLT4 *dst_data, int2 kernel_size, int2 stride,
float relu_clip, __global FLT4 *dst_data, int2 kernel_size, int2 stride,
int2 padding, int2 dilation, int4 src_size, int4 dst_size) {
int X = get_global_id(0);
int Y = get_global_id(1);
int Z = get_global_id(2);
if (X >= dst_size.x || Y >= dst_size.y || Z >= dst_size.z) return;
FLT4 r = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
int x_offset
ed
= X * stride.x + padding.x;
int y_offset
ed
= Y * stride.y + padding.y;
int x_offset = X * stride.x + padding.x;
int y_offset = Y * stride.y + padding.y;
int fx_c = Z * kernel_size.x * kernel_size.y;
for (int ky = 0; ky < kernel_size.y; ++ky) {
int y_c = y_offset
ed
+ ky * dilation.y;
int y_c = y_offset + ky * dilation.y;
bool outside_y = y_c < 0 || y_c >= src_size.y;
for (int kx = 0; kx < kernel_size.x; ++kx) {
int x_c = x_offset
ed
+ kx * dilation.x;
int x_c = x_offset + kx * dilation.x;
bool outside_x = x_c < 0 || x_c >= src_size.x;
if (!outside_x && !outside_y) {
FLT4 f = filter[fx_c];
FLT4 src_
final
= src_data[((y_c * src_size.x + x_c) * src_size.z + Z)];
r += TO_FLT4(src_
final * f
);
FLT4 f
lt_p
= filter[fx_c];
FLT4 src_
p
= src_data[((y_c * src_size.x + x_c) * src_size.z + Z)];
r += TO_FLT4(src_
p * flt_p
);
}
fx_c++;
}
}
FLT4 bias_
val
= bias[Z];
FLT4 res
0 = TO_FLT4(r) + bias_val
;
res
0 = clamp(res0, (FLT)(0.0f), (FLT)(relu_clip1
));
dst_data[((Y * dst_size.x + X) * dst_size.z + Z)] = res
0
;
FLT4 bias_
p
= bias[Z];
FLT4 res
= TO_FLT4(r) + bias_p
;
res
= clamp(res, (FLT)(0.0f), (FLT)(relu_clip
));
dst_data[((Y * dst_size.x + X) * dst_size.z + Z)] = res;
}
__kernel void DepthwiseConv2d_BUF_NHWC4_1x1(__global FLT4 *src_data, __global FLT4 *filter, __global FLT4 *bias,
float relu_clip
1
, __global FLT4 *dst_data, int2 kernel_size, int2 stride,
float relu_clip, __global FLT4 *dst_data, int2 kernel_size, int2 stride,
int2 padding, int2 dilation, int4 src_size, int4 dst_size) {
int X = get_global_id(0);
int Y = get_global_id(1);
int Z = get_global_id(2);
if (X >= dst_size.x || Y >= dst_size.y || Z >= dst_size.z) return;
FLT4 r = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
int x_offset
ed
= X * stride.x + padding.x;
int y_offset
ed
= Y * stride.y + padding.y;
int x_offset = X * stride.x + padding.x;
int y_offset = Y * stride.y + padding.y;
int fx_c = Z;
{
int y_c = y_offset
ed
;
int y_c = y_offset;
bool outside_y = y_c < 0 || y_c >= src_size.y;
{
int x_c = x_offset
ed
;
int x_c = x_offset;
bool outside_x = x_c < 0 |
|
x_c
>=
src_size.x
;
if
(
!outside_x
&&
!outside_y
)
{
FLT4
f
=
filter[fx_c]
;
FLT4
src_
final
=
src_data[
((
y_c
*
src_size.x
+
x_c
)
*
src_size.z
+
Z
)
]
;
r
+=
TO_FLT4
(
src_
final
*
f
)
;
FLT4
f
lt_p
=
filter[fx_c]
;
FLT4
src_
p
=
src_data[
((
y_c
*
src_size.x
+
x_c
)
*
src_size.z
+
Z
)
]
;
r
+=
TO_FLT4
(
src_
p
*
flt_p
)
;
}
}
}
FLT4
bias_
val
=
bias[Z]
;
FLT4
res
0
=
TO_FLT4
(
r
)
+
bias_val
;
res
0
=
clamp
(
res0,
(
FLT
)(
0.0f
)
,
(
FLT
)(
relu_clip1
))
;
dst_data[
((
Y
*
dst_size.x
+
X
)
*
dst_size.z
+
Z
)
]
=
res
0
;
FLT4
bias_
p
=
bias[Z]
;
FLT4
res
=
TO_FLT4
(
r
)
+
bias_p
;
res
=
clamp
(
res,
(
FLT
)(
0.0f
)
,
(
FLT
)(
relu_clip
))
;
dst_data[
((
Y
*
dst_size.x
+
X
)
*
dst_size.z
+
Z
)
]
=
res
;
}
This diff is collapsed.
Click to expand it.
mindspore/lite/src/runtime/kernel/opencl/cl/to_format.cl
浏览文件 @
58877370
#
pragma
OPENCL
EXTENSION
cl_khr_fp16
:
enable
__constant
sampler_t
smp_zero
=
CLK_NORMALIZED_COORDS_FALSE
| CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST
;
__kernel
void
to_format_NCHW_to_NHWC4_IMG
(
__global
FLT4
*src_data,
__write_only
image2d_t
dst_data,
int4
size,
int4
shape
)
{
int
X
=
get_global_id
(
0
)
;
int
Y
=
get_global_id
(
1
)
;
int
Z
=
get_global_id
(
2
)
;
if
(
X
>=
size.x
|
| Y >= size.y || Z >= size.z) {
return;
}
// WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X)));
}
__kernel
void
to_format_NHWC_to_NHWC4_IMG
(
__global
FLT4
*src_data,
__write_only
image2d_t
dst_data,
int4
size,
int4
shape
)
{
int
X
=
get_global_id
(
0
)
;
...
...
@@ -47,58 +37,17 @@ __kernel void to_format_NHWC4_to_NHWC4_IMG(__global FLT4 *src_data, __write_only
}
WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), src_data[(X * size.y + Y) * size.z + Z]);
}
__kernel void to_format_NC4HW4_to_NHWC4_IMG(__global FLT4 *src_data, __write_only image2d_t dst_data, int4 size,
int4 shape) {
int X = get_global_id(0);
int Y = get_global_id(1);
int Z = get_global_id(2);
if (X >= size.x || Y >= size.y || Z >= size.z) {
return;
}
// WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X)));
}
__kernel void to_format_NCHW_to_NC4HW4_IMG(__global FLT4 *src_data, __write_only image2d_t dst_data, int4 size,
int4 shape) {
int X = get_global_id(0);
int Y = get_global_id(1);
int Z = get_global_id(2);
if (X >= size.x || Y >= size.y || Z >= size.z) {
return;
}
// WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X)));
}
__kernel void to_format_NHWC_to_NC4HW4_IMG(__global FLT4 *src_data, __write_only image2d_t dst_data, int4 size,
int4 shape) {
int X = get_global_id(0);
int Y = get_global_id(1);
int Z = get_global_id(2);
if (X >= size.x || Y >= size.y || Z >= size.z) {
return;
}
// WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X)));
}
__kernel void to_format_NHWC4_to_NC4HW4_IMG(__global FLT4 *src_data, __write_only image2d_t dst_data, int4 size,
int4 shape) {
int X = get_global_id(0);
int Y = get_global_id(1);
int Z = get_global_id(2);
if (X >= size.x || Y >= size.y || Z >= size.z) {
return;
}
// WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X)));
}
__kernel void to_format_NC4HW4_to_NC4HW4_IMG(__global FLT4 *src_data, __write_only image2d_t dst_data, int4 size,
int4 shape) {
int X = get_global_id(0);
int Y = get_global_id(1);
int Z = get_global_id(2);
// size(h, w, c4, 1), shape(n, c, h, w)
int X = get_global_id(0); // h
int Y = get_global_id(1); // w
int Z = get_global_id(2); // c4
if (X >= size.x || Y >= size.y || Z >= size.z) {
return;
}
// FLT4 src_final = src_data[(((Z)*src_size.y + (y_c)) * src_size.x + (x_c))];
WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), src_data[(Y * size.z + Z) * size.x + X]);
WRITE_IMAGE(dst_data, (int2)(Y, Z * size.x + X), src_data[(Z * size.x + X) * size.y + Y]);
}
__kernel void to_format_NCHW_to_NCHW_BUF(__read_only image2d_t src_data, __global FLT4 *dst_data, int4 size,
int4 shape) {
int X = get_global_id(0);
...
...
@@ -109,56 +58,6 @@ __kernel void to_format_NCHW_to_NCHW_BUF(__read_only image2d_t src_data, __globa
}
dst_data[(Z * size.y + Y) * size.x + X] = READ_IMAGE(src_data, smp_zero, (int2)(Y * size.x + X, Z));
}
__kernel void to_format_NHWC_to_NCHW_BUF(__read_only image2d_t src_data, __global FLT4 *dst_data, int4 size,
int4 shape) {
int X = get_global_id(0);
int Y = get_global_id(1);
int Z = get_global_id(2);
if (X >= size.x || Y >= size.y || Z >= size.z) {
return;
}
// WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X)));
}
__kernel void to_format_NHWC4_to_NCHW_BUF(__read_only image2d_t src_data, __global FLT4 *dst_data, int4 size,
int4 shape) {
int X = get_global_id(0);
int Y = get_global_id(1);
int Z = get_global_id(2);
if (X >= size.x || Y >= size.y || Z >= size.z) {
return;
}
// WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X)));
}
__kernel void to_format_NC4HW4_to_NCHW_BUF(__read_only image2d_t src_data, __global FLT4 *dst_data, int4 size,
int4 shape) {
int X = get_global_id(0);
int Y = get_global_id(1);
int Z = get_global_id(2);
if (X >= size.x || Y >= size.y || Z >= size.z) {
return;
}
// WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X)));
}
__kernel void to_format_NCHW_to_NHWC_BUF(__read_only image2d_t src_data, __global FLT4 *dst_data, int4 size,
int4 shape) {
int X = get_global_id(0);
int Y = get_global_id(1);
int Z = get_global_id(2);
if (X >= size.x || Y >= size.y || Z >= size.z) {
return;
}
// WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X)));
}
__kernel void to_format_NHWC_to_NHWC_BUF(__read_only image2d_t src_data, __global FLT4 *dst_data, int4 size,
int4 shape) {
int X = get_global_id(0);
int Y = get_global_id(1);
int Z = get_global_id(2);
if (X >= size.x || Y >= size.y || Z >= size.z) {
return;
}
// WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X)));
}
__kernel void to_format_NHWC4_to_NHWC_BUF(__read_only image2d_t src_data, __global FLT4 *dst_data, int4 size,
int4 shape) {
int X = get_global_id(0);
...
...
@@ -185,25 +84,16 @@ __kernel void to_format_NHWC4_to_NHWC_BUF(__read_only image2d_t src_data, __glob
}
}
}
__kernel void to_format_NC4HW4_to_to_NHWC_BUF(__read_only image2d_t src_data, __global FLT4 *dst_data, int4 size,
int4 shape) {
int X = get_global_id(0);
int Y = get_global_id(1);
int Z = get_global_id(2);
if (X >= size.x || Y >= size.y || Z >= size.z) {
return;
}
// WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X)));
}
__kernel void to_format_NC4HW4_to_NC4HW4_BUF(__read_only image2d_t src_data, __global FLT4 *dst_data, int4 size,
int4 shape) {
int X = get_global_id(0);
int Y = get_global_id(1);
int Z = get_global_id(2);
// size(h, w, c, 1), shape(n, c, h, w)
int X = get_global_id(0); // h
int Y = get_global_id(1); // w
int Z = get_global_id(2); // c
if (X >= size.x || Y >= size.y || Z >= size.z) {
return;
}
dst_data[(
Y * size.z + Z) * size.x + X] = READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z,
X));
dst_data[(
Z * size.x + X) * size.y + Y] = READ_IMAGE(src_data, smp_zero, (int2)(Y, Z * size.x +
X));
}
__kernel void to_format_NHWC4_to_NHWC4_BUF(__read_only image2d_t src_data, __global FLT4 *dst_data, int4 size,
int4 shape) {
...
...
This diff is collapsed.
Click to expand it.
mindspore/lite/src/runtime/kernel/opencl/kernel/depthwise_conv2d.cc
浏览文件 @
58877370
...
...
@@ -20,9 +20,10 @@
#include <utility>
#include "src/kernel_registry.h"
#include "src/runtime/opencl/opencl_runtime.h"
#include "src/runtime/kernel/arm/fp32/convolution_depthwise.h"
#include "src/runtime/kernel/opencl/utils.h"
#include "nnacl/fp32/common_func.h"
#include "nnacl/op_base.h"
#include "include/errorcode.h"
#include "nnacl/pack.h"
#ifndef PROGRAM_WITH_IL
...
...
@@ -81,30 +82,50 @@ int DepthwiseConv2dOpenCLKernel::InitBuffer() {
auto
parameter
=
reinterpret_cast
<
ConvParameter
*>
(
op_parameter_
);
auto
ocl_runtime
=
lite
::
opencl
::
OpenCLRuntime
::
GetInstance
();
auto
allocator
=
ocl_runtime
->
GetAllocator
();
bool
is_fp16
=
ocl_runtime
->
GetFp16Enable
();
// weight: o, h, w, i; o == group, i == 1
auto
origin_weight
=
reinterpret_cast
<
FLOAT_t
*>
(
in_tensors_
.
at
(
kWeightIndex
)
->
Data
()
);
void
*
origin_weight
=
in_tensors_
.
at
(
kWeightIndex
)
->
Data
(
);
int
CO4
=
UP_DIV
(
out_tensors_
[
0
]
->
Channel
(),
C4NUM
);
int
pack_weight_size
=
C4NUM
*
CO4
*
parameter
->
kernel_h_
*
parameter
->
kernel_w_
;
packed_weight_
=
reinterpret_cast
<
FLOAT_t
*>
(
allocator
->
Malloc
(
pack_weight_size
*
sizeof
(
FLOAT_t
)));
packed_weight_
=
reinterpret_cast
<
FLOAT_t
*>
(
allocator
->
MapBuffer
(
packed_weight_
,
CL_MAP_WRITE
,
nullptr
,
true
));
int
plane
=
parameter
->
kernel_h_
*
parameter
->
kernel_w_
;
#ifdef ENABLE_FP16
PackNCHWToNC4HW4Fp16
(
origin_weight
,
packed_weight_
,
1
,
plane
,
out_tensors_
[
0
]
->
Channel
());
#else
PackNCHWToNC4HW4Fp32
(
origin_weight
,
packed_weight_
,
1
,
plane
,
out_tensors_
[
0
]
->
Channel
());
#endif
if
(
is_fp16
)
{
packed_weight_
=
allocator
->
Malloc
(
pack_weight_size
*
sizeof
(
int16_t
));
packed_weight_
=
allocator
->
MapBuffer
(
packed_weight_
,
CL_MAP_WRITE
,
nullptr
,
true
);
if
(
in_tensors_
.
at
(
kWeightIndex
)
->
data_type
()
==
kNumberTypeFloat16
)
{
std
::
function
<
int16_t
(
int16_t
)
>
to_dtype
=
[](
int16_t
x
)
->
int16_t
{
return
x
;
};
PackNCHWToNC4HW4
<
int16_t
,
int16_t
>
(
origin_weight
,
packed_weight_
,
1
,
plane
,
out_tensors_
[
0
]
->
Channel
(),
to_dtype
);
}
else
if
(
in_tensors_
.
at
(
kWeightIndex
)
->
data_type
()
==
kNumberTypeFloat32
)
{
std
::
function
<
int16_t
(
float
)
>
to_dtype
=
Float32ToShort
;
PackNCHWToNC4HW4
<
float
,
int16_t
>
(
origin_weight
,
packed_weight_
,
1
,
plane
,
out_tensors_
[
0
]
->
Channel
(),
to_dtype
);
}
else
{
MS_LOG
(
ERROR
)
<<
"Only support float16/float32, actual data type "
<<
in_tensors_
.
at
(
kWeightIndex
)
->
data_type
();
}
}
else
{
packed_weight_
=
allocator
->
Malloc
(
pack_weight_size
*
sizeof
(
float
));
packed_weight_
=
allocator
->
MapBuffer
(
packed_weight_
,
CL_MAP_WRITE
,
nullptr
,
true
);
if
(
in_tensors_
.
at
(
kWeightIndex
)
->
data_type
()
==
kNumberTypeFloat32
)
{
std
::
function
<
float
(
float
)
>
to_dtype
=
[](
float
x
)
->
float
{
return
(
float
)
x
;
};
PackNCHWToNC4HW4
<
float
,
float
>
(
origin_weight
,
packed_weight_
,
1
,
plane
,
out_tensors_
[
0
]
->
Channel
(),
to_dtype
);
}
else
{
MS_LOG
(
ERROR
)
<<
"Only support float16/float32, actual data type "
<<
in_tensors_
.
at
(
kWeightIndex
)
->
data_type
();
}
}
allocator
->
UnmapBuffer
(
packed_weight_
);
if
(
in_tensors_
.
size
()
==
kInputSize2
)
{
bias_data_
=
reinterpret_cast
<
FLOAT_t
*>
(
allocator
->
Malloc
(
C4NUM
*
CO4
*
sizeof
(
FLOAT_t
)));
bias_data_
=
reinterpret_cast
<
FLOAT_t
*>
(
allocator
->
MapBuffer
(
bias_data_
,
CL_MAP_WRITE
,
nullptr
,
true
));
size_t
up_co_size
=
C4NUM
*
CO4
*
sizeof
(
FLOAT_t
);
size_t
dtype_size
=
sizeof
(
float
);
if
(
is_fp16
&&
in_tensors_
.
at
(
kBiasIndex
)
->
data_type
()
==
kNumberTypeFloat16
)
{
dtype_size
=
sizeof
(
int16_t
);
}
bias_data_
=
allocator
->
Malloc
(
C4NUM
*
CO4
*
dtype_size
);
bias_data_
=
allocator
->
MapBuffer
(
bias_data_
,
CL_MAP_WRITE
,
nullptr
,
true
);
size_t
up_co_size
=
C4NUM
*
CO4
*
dtype_size
;
memset
(
bias_data_
,
0
,
up_co_size
);
auto
ori_bias
=
reinterpret_cast
<
FLOAT_t
*>
(
in_tensors_
.
at
(
kBiasIndex
)
->
Data
()
);
memcpy
(
bias_data_
,
ori_bias
,
out_tensors_
[
0
]
->
Channel
()
*
sizeof
(
FLOAT_t
)
);
auto
ori_bias
=
in_tensors_
.
at
(
kBiasIndex
)
->
Data
(
);
memcpy
(
bias_data_
,
ori_bias
,
out_tensors_
[
0
]
->
Channel
()
*
dtype_size
);
allocator
->
UnmapBuffer
(
bias_data_
);
}
else
{
MS_ASSERT
(
in_tensors_
.
size
()
==
kInputSize1
);
...
...
@@ -124,11 +145,10 @@ int DepthwiseConv2dOpenCLKernel::GetImageSize(size_t idx, std::vector<size_t> *i
im_dst_y
=
out_tensors_
[
0
]
->
Height
()
*
CO4
;
im_dst_x
=
out_tensors_
[
0
]
->
Width
();
}
#ifdef ENABLE_FP16
size_t
img_dtype
=
CL_HALF_FLOAT
;
#else
size_t
img_dtype
=
CL_FLOAT
;
#endif
if
(
lite
::
opencl
::
OpenCLRuntime
::
GetInstance
()
->
GetFp16Enable
())
{
img_dtype
=
CL_HALF_FLOAT
;
}
img_size
->
clear
();
std
::
vector
<
size_t
>
vec
{
im_dst_x
,
im_dst_y
,
img_dtype
};
*
img_size
=
vec
;
...
...
@@ -204,5 +224,6 @@ kernel::LiteKernel *OpenCLDepthwiseConv2dKernelCreator(const std::vector<lite::t
return
kernel
;
}
REG_KERNEL
(
kGPU
,
kNumberTypeFloat16
,
PrimitiveType_DepthwiseConv2D
,
OpenCLDepthwiseConv2dKernelCreator
)
REG_KERNEL
(
kGPU
,
kNumberTypeFloat32
,
PrimitiveType_DepthwiseConv2D
,
OpenCLDepthwiseConv2dKernelCreator
)
}
// namespace mindspore::kernel
This diff is collapsed.
Click to expand it.
mindspore/lite/src/runtime/kernel/opencl/kernel/depthwise_conv2d.h
浏览文件 @
58877370
...
...
@@ -20,7 +20,6 @@
#include <vector>
#include "src/runtime/kernel/opencl/opencl_kernel.h"
#include "nnacl/conv_parameter.h"
#include "src/runtime/opencl/opencl_runtime.h"
namespace
mindspore
::
kernel
{
...
...
@@ -46,8 +45,8 @@ class DepthwiseConv2dOpenCLKernel : public OpenCLKernel {
int
GetLocalSize
(
size_t
idx
,
const
std
::
vector
<
size_t
>
&
global_size
,
std
::
vector
<
size_t
>
*
local_size
)
override
;
private:
FLOAT_t
*
packed_weight_
;
FLOAT_t
*
bias_data_
;
void
*
packed_weight_
;
void
*
bias_data_
;
cl
::
Kernel
kernel_
;
};
}
// namespace mindspore::kernel
...
...
This diff is collapsed.
Click to expand it.
mindspore/lite/src/runtime/kernel/opencl/kernel/to_format.cc
浏览文件 @
58877370
...
...
@@ -172,5 +172,6 @@ kernel::LiteKernel *OpenCLToFormatKernelCreator(const std::vector<lite::tensor::
return
kernel
;
}
REG_KERNEL
(
kGPU
,
kNumberTypeFloat16
,
PrimitiveType_ToFormat
,
OpenCLToFormatKernelCreator
)
REG_KERNEL
(
kGPU
,
kNumberTypeFloat32
,
PrimitiveType_ToFormat
,
OpenCLToFormatKernelCreator
)
}
// namespace mindspore::kernel
This diff is collapsed.
Click to expand it.
mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.cc
浏览文件 @
58877370
...
...
@@ -93,11 +93,10 @@ int SubGraphOpenCLKernel::GenToFormatOp(const std::vector<lite::tensor::Tensor *
}
out_tensors
->
emplace_back
(
new_tensor
);
#ifdef ENABLE_FP16
KernelKey
desc
{
kGPU
,
kNumberTypeFloat16
,
schema
::
PrimitiveType_ToFormat
};
#else
KernelKey
desc
{
kGPU
,
kNumberTypeFloat32
,
schema
::
PrimitiveType_ToFormat
};
#endif
if
(
lite
::
opencl
::
OpenCLRuntime
::
GetInstance
()
->
GetFp16Enable
())
{
desc
.
data_type
=
kNumberTypeFloat16
;
}
OpenCLToFormatParameter
*
parameter
=
new
(
std
::
nothrow
)
OpenCLToFormatParameter
;
MS_ASSERT
(
parameter
);
if
(
parameter
==
nullptr
)
{
...
...
This diff is collapsed.
Click to expand it.
mindspore/lite/src/runtime/kernel/opencl/utils.h
浏览文件 @
58877370
...
...
@@ -23,6 +23,7 @@
#include "utils/log_adapter.h"
#include "nnacl/op_base.h"
#include "src/lite_kernel.h"
#include "src/common//utils.h"
namespace
mindspore
::
lite
{
kernel
::
LiteKernel
*
GetOpenCLKernel
(
const
std
::
vector
<
tensor
::
Tensor
*>
&
in_tensors
,
...
...
@@ -89,6 +90,73 @@ std::vector<size_t> GetCommonLocalSize(const std::vector<size_t> &global, int ma
std
::
string
CLErrorCode
(
cl_int
error_code
);
template
<
class
T1
,
class
T2
>
void
PackNCHWToNC4HW4
(
void
*
src
,
void
*
dst
,
int
batch
,
int
plane
,
int
channel
,
const
std
::
function
<
T2
(
T1
)
>
&
to_dtype
)
{
int
c4
=
UP_DIV
(
channel
,
C4NUM
);
for
(
int
b
=
0
;
b
<
batch
;
b
++
)
{
int
src_offset
=
b
*
plane
*
channel
;
int
dst_offset
=
b
*
plane
*
c4
*
C4NUM
;
for
(
int
c
=
0
;
c
<
channel
;
c
++
)
{
int
c4_block_num
=
c
/
C4NUM
;
int
c4_block_rem
=
c
%
C4NUM
;
int
src_c_offset
=
src_offset
+
c
*
plane
;
int
dst_c_offset
=
dst_offset
+
c4_block_num
*
plane
*
C4NUM
;
for
(
int
k
=
0
;
k
<
plane
;
k
++
)
{
int
src_kernel_offset
=
src_c_offset
+
k
;
int
dst_kernel_offset
=
dst_c_offset
+
C4NUM
*
k
+
c4_block_rem
;
(
static_cast
<
T2
*>
(
dst
)
+
dst_kernel_offset
)[
0
]
=
to_dtype
((
static_cast
<
T1
*>
(
src
)
+
src_kernel_offset
)[
0
]);
}
}
}
}
template
<
class
T1
,
class
T2
>
void
PackNHWCToNHWC4
(
void
*
src
,
void
*
dst
,
int
batch
,
int
plane
,
int
channel
,
const
std
::
function
<
T2
(
T1
)
>
&
to_dtype
)
{
int
c4
=
UP_DIV
(
channel
,
C4NUM
);
int
nhwc4_batch_unit_offset
=
c4
*
C4NUM
*
plane
;
int
ic_remainder_
=
channel
%
C4NUM
;
if
(
ic_remainder_
!=
0
)
{
int
nhwc4_batch_offset
=
0
;
for
(
int
b
=
0
;
b
<
batch
;
b
++
)
{
int
batch_offset
=
b
*
channel
*
plane
;
for
(
int
i
=
0
;
i
<
plane
;
++
i
)
{
for
(
int
c
=
0
;
c
<
channel
;
++
c
)
{
(
static_cast
<
T2
*>
(
dst
)
+
nhwc4_batch_offset
+
i
*
c4
*
C4NUM
+
c
)[
0
]
=
to_dtype
((
static_cast
<
T1
*>
(
src
)
+
batch_offset
+
i
*
channel
+
c
)[
0
]);
}
}
nhwc4_batch_offset
+=
nhwc4_batch_unit_offset
;
}
}
else
{
size_t
ori_input_size
=
batch
*
plane
*
channel
;
for
(
size_t
n
=
0
;
n
<
ori_input_size
;
++
n
)
{
(
static_cast
<
T2
*>
(
dst
)
+
n
)[
0
]
=
to_dtype
((
static_cast
<
T1
*>
(
src
)
+
n
)[
0
]);
}
}
}
template
<
class
T1
,
class
T2
>
void
PackNHWCToNC4HW4
(
void
*
src
,
void
*
dst
,
int
batch
,
int
plane
,
int
channel
,
const
std
::
function
<
T2
(
T1
)
>
&
to_dtype
)
{
int
c4
=
UP_DIV
(
channel
,
C4NUM
);
for
(
int
b
=
0
;
b
<
batch
;
b
++
)
{
int
src_oc_offset
=
b
*
plane
*
channel
;
int
dst_oc_offset
=
b
*
plane
*
c4
*
C4NUM
;
for
(
int
k
=
0
;
k
<
plane
;
k
++
)
{
int
src_kernel_offset
=
src_oc_offset
+
k
*
channel
;
int
dst_kernel_offset
=
dst_oc_offset
+
k
*
C4NUM
;
for
(
int
i
=
0
;
i
<
channel
;
i
++
)
{
int
c4_block_num
=
i
/
C4NUM
;
int
c4_block_rem
=
i
%
C4NUM
;
int
src_ic_offset
=
src_kernel_offset
+
i
;
int
dst_ic_offset
=
dst_kernel_offset
+
c4_block_num
*
plane
*
C4NUM
+
c4_block_rem
;
(
static_cast
<
T2
*>
(
dst
)
+
dst_ic_offset
)[
0
]
=
to_dtype
((
static_cast
<
T1
*>
(
src
)
+
src_ic_offset
)[
0
]);
}
}
}
}
}
// namespace mindspore::kernel
#endif // MINDSPORE_LITE_SRC_BACKEND_OPENCL_UTILS_H_
This diff is collapsed.
Click to expand it.
mindspore/lite/test/ut/src/runtime/kernel/opencl/depthwise_conv2d_tests.cc
浏览文件 @
58877370
此差异已折叠。
点击以展开。
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录
新手
引导
客服
返回
顶部