Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
magicwindyyd
mindspore
提交
22927dc4
M
mindspore
项目概览
magicwindyyd
/
mindspore
与 Fork 源项目一致
Fork自
MindSpore / mindspore
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
M
mindspore
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
22927dc4
编写于
8月 04, 2020
作者:
M
mindspore-ci-bot
提交者:
Gitee
8月 04, 2020
浏览文件
操作
浏览文件
下载
差异文件
!3862 add judgement for cl
Merge pull request !3862 from pengyongrong/cl_format
上级
6a1e6b01
d831f8df
变更
9
隐藏空白更改
内联
并排
Showing
9 changed file
with
194 addition
and
319 deletion
+194
-319
build.sh
build.sh
+2
-6
mindspore/lite/src/runtime/kernel/opencl/cl/fp16/depthwise_conv2d.cl
...ite/src/runtime/kernel/opencl/cl/fp16/depthwise_conv2d.cl
+12
-32
mindspore/lite/src/runtime/kernel/opencl/cl/fp16/matmul.cl
mindspore/lite/src/runtime/kernel/opencl/cl/fp16/matmul.cl
+3
-4
mindspore/lite/src/runtime/kernel/opencl/cl/fp32/avg_pool2d.cl
...pore/lite/src/runtime/kernel/opencl/cl/fp32/avg_pool2d.cl
+1
-2
mindspore/lite/src/runtime/kernel/opencl/cl/fp32/concat.cl
mindspore/lite/src/runtime/kernel/opencl/cl/fp32/concat.cl
+1
-1
mindspore/lite/src/runtime/kernel/opencl/cl/fp32/convolution.cl
...ore/lite/src/runtime/kernel/opencl/cl/fp32/convolution.cl
+127
-168
mindspore/lite/src/runtime/kernel/opencl/cl/fp32/depthwise_conv2d.cl
...ite/src/runtime/kernel/opencl/cl/fp32/depthwise_conv2d.cl
+38
-92
mindspore/lite/src/runtime/kernel/opencl/cl/fp32/matmul.cl
mindspore/lite/src/runtime/kernel/opencl/cl/fp32/matmul.cl
+3
-4
mindspore/lite/src/runtime/kernel/opencl/cl/fp32/softmax.cl
mindspore/lite/src/runtime/kernel/opencl/cl/fp32/softmax.cl
+7
-10
未找到文件。
build.sh
浏览文件 @
22927dc4
...
...
@@ -510,12 +510,8 @@ gene_ocl_program() {
build_opencl
()
{
cd
${
BASEPATH
}
if
[[
!
-d
"third_party/OpenCL-Headers"
]]
;
then
git submodule update
--init
third_party/OpenCL-Headers
fi
if
[[
!
-d
"third_party/OpenCL-CLHPP"
]]
;
then
git submodule update
--init
third_party/OpenCL-CLHPP
fi
git submodule update
--init
third_party/OpenCL-Headers
git submodule update
--init
third_party/OpenCL-CLHPP
if
[[
"
${
OPENCL_OFFLINE_COMPILE
}
"
==
"on"
]]
;
then
gene_ocl_program
else
...
...
mindspore/lite/src/runtime/kernel/opencl/cl/fp16/depthwise_conv2d.cl
浏览文件 @
22927dc4
...
...
@@ -13,19 +13,9 @@
__constant
sampler_t
smp_edge
=
CLK_NORMALIZED_COORDS_FALSE
| CLK_ADDRESS_CLAMP_TO_EDGE |
CLK_FILTER_NEAREST
;
__constant
sampler_t
smp_none
=
CLK_NORMALIZED_COORDS_FALSE
| CLK_ADDRESS_NONE |
CLK_FILTER_NEAREST
;
__constant
sampler_t
smp_zero
=
CLK_NORMALIZED_COORDS_FALSE
| CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST
;
__kernel
void
DepthwiseConv2d_NC4HW4
(
__global
FLT4*
src_data,
__global
FLT4*
filters,
__global
FLT4*
biases,
float
relu_clip1,
__global
FLT4*
dst_data,
int2
kernel_size,
int2
stride,
int2
padding,
int2
dilation,
int4
src_size,
int4
dst_size
)
{
__kernel
void
DepthwiseConv2d_NC4HW4
(
__global
FLT4
*src_data,
__global
FLT4
*filters,
__global
FLT4
*biases,
float
relu_clip1,
__global
FLT4
*dst_data,
int2
kernel_size,
int2
stride,
int2
padding,
int2
dilation,
int4
src_size,
int4
dst_size
)
{
int
X
=
get_global_id
(
0
)
;
int
Y
=
get_global_id
(
1
)
;
int
Z
=
get_global_id
(
2
)
;
...
...
@@ -42,31 +32,21 @@ __global FLT4* dst_data,
bool outside_x = x_c < 0 || x_c >= src_size.x;
if (!outside_x && !outside_y) {
FLT4 f = filters[fx_c];
FLT4 src_final =src_data[(((Z) *
src_size.y + (y_c)) * src_size.x + (x_c))];
FLT4 src_final = src_data[(((Z)*
src_size.y + (y_c)) * src_size.x + (x_c))];
r += TO_ACCUM_TYPE(src_final * f);
}
;
}
fx_c++;
}
}
FLT4 bias_val = biases[Z];
FLT4 res0 = TO_FLT4(r) + bias_val;
res0 = clamp(res0, (FLT)(0.0f), (FLT)(relu_clip1));
dst_data[(((Z)
*
dst_size.y + (Y)) * dst_size.x + (X))] = res0;
dst_data[(((Z)
*
dst_size.y + (Y)) * dst_size.x + (X))] = res0;
}
__kernel void DepthwiseConv2d_NHWC4(
__global FLT4* src_data,
__global FLT4* filters,
__global FLT4* biases,
float relu_clip1,
__global FLT4* dst_data,
int2 kernel_size,
int2 stride,
int2 padding,
int2 dilation,
int4 src_size,
int4 dst_size
) {
__kernel void DepthwiseConv2d_NHWC4(__global FLT4 *src_data, __global FLT4 *filters, __global FLT4 *biases,
float relu_clip1, __global FLT4 *dst_data, int2 kernel_size, int2 stride,
int2 padding, int2 dilation, int4 src_size, int4 dst_size) {
int X = get_global_id(0);
int Y = get_global_id(1);
int Z = get_global_id(2);
...
...
@@ -83,9 +63,9 @@ __global FLT4* dst_data,
bool outside_x = x_c < 0 |
|
x_c
>=
src_size.x
;
if
(
!outside_x
&&
!outside_y
)
{
FLT4
f
=
filters[fx_c]
;
FLT4
src_final
=
src_data[
((
y_c
*
src_size.x
+
x_c
)
*
src_size.z
+
Z
)
]
;
FLT4
src_final
=
src_data[
((
y_c
*
src_size.x
+
x_c
)
*
src_size.z
+
Z
)
]
;
r
+=
TO_ACCUM_TYPE
(
src_final
*
f
)
;
}
;
}
fx_c++
;
}
}
...
...
@@ -93,4 +73,4 @@ __global FLT4* dst_data,
FLT4
res0
=
TO_FLT4
(
r
)
+
bias_val
;
res0
=
clamp
(
res0,
(
FLT
)(
0.0f
)
,
(
FLT
)(
relu_clip1
))
;
dst_data[
((
Y
*
dst_size.x
+
X
)
*
dst_size.z
+
Z
)
]
=
res0
;
}
\ No newline at end of file
}
mindspore/lite/src/runtime/kernel/opencl/cl/fp16/matmul.cl
浏览文件 @
22927dc4
#
pragma
OPENCL
EXTENSION
cl_khr_fp16
:
enable
#
define
FLT4
half4
#
define
FLT16
half16
__kernel
void
MatMul
(
__global
FLT4
*x,
__global
FLT16
*weight,
__global
FLT4
*buffer,
__global
FLT4
*bias,
int2
offset_ci,
int2
offset_co,
int
has_bias
)
{
__kernel
void
MatMul
(
__global
FLT4
*x,
__global
FLT16
*weight,
__global
FLT4
*buffer,
__global
FLT4
*bias,
int2
offset_ci,
int2
offset_co,
int
has_bias
)
{
int2
gid
=
(
int2
)(
get_global_id
(
0
)
,
get_global_id
(
1
))
;
int2
lid
=
(
int2
)(
get_local_id
(
0
)
,
get_local_id
(
1
))
;
FLT4
s
=
(
FLT4
)(
0.0f
)
;
...
...
@@ -29,4 +28,4 @@ __kernel void MatMul(__global FLT4 *x, __global FLT16 *weight,
buffer[gid.x]
=
s
;
//
memory
pollution?
or
protected
by
opencl
}
}
\ No newline at end of file
}
mindspore/lite/src/runtime/kernel/opencl/cl/fp32/avg_pool2d.cl
浏览文件 @
22927dc4
...
...
@@ -31,7 +31,6 @@ __kernel void AvgPooling2d(__global float4 *input, __global float4 *output, cons
__constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE |
CLK_ADDRESS_CLAMP
| CLK_FILTER_NEAREST;
__kernel void AvgPooling2dImage2d(__read_only image2d_t input, __write_only image2d_t output, const int4 input_shape,
const int4 output_shape, const int2 stride, const int2 kernel_size,
const int2 padding) {
...
...
@@ -63,4 +62,4 @@ __kernel void AvgPooling2dImage2d(__read_only image2d_t input, __write_only imag
}
float4
result
=
convert_float4
(
r
/
window_size
)
;
write_imagef
(
output,
(
int2
)(
X,
Y
*
output_shape.w
+
Z
)
,
result
)
;
}
\ No newline at end of file
}
mindspore/lite/src/runtime/kernel/opencl/cl/fp32/concat.cl
浏览文件 @
22927dc4
...
...
@@ -51,4 +51,4 @@ __kernel void Concat3input(__global float *input0, __global float *input1, __glo
output[index_output]
=
input2[input_idx]
;
}
}
}
\ No newline at end of file
}
mindspore/lite/src/runtime/kernel/opencl/cl/fp32/convolution.cl
浏览文件 @
22927dc4
#
define
CI_TILE
4
#
define
CO_TILE
4
#
define
UP_DIV
(
x,
y
)
(((
x
)
+
(
y
)
-
(
1
))
/
(
y
))
//#define
__global
//#pragma
OPENCL
EXTENSION
cl_arm_printf
:
enable
__kernel
void
convolution_NHWC_OHWI
(
__global
float
*input,
__global
float
*weight,
__global
float
*bias,
//
#
define
__global
//
#
pragma
OPENCL
EXTENSION
cl_arm_printf
:
enable
__kernel
void
convolution_NHWC_OHWI
(
__global
float
*input,
__global
float
*weight,
__global
float
*bias,
__global
float
*output,
const
int4
input_shape,
//
NHWC
const
int4
output_shape,
//
NHWC
const
int4
kernel_stride,
//
kernelHW_strideHW
const
int4
pad
)
//
top
bottom
left
right
{
int
ow
=
get_global_id
(
0
)
;
int
oh
=
get_global_id
(
1
)
;
int
co_slice
=
get_global_id
(
2
)
;
const
int4
input_shape,
//
NHWC
const
int4
output_shape,
//
NHWC
const
int4
kernel_stride,
//
kernelHW_strideHW
const
int4
pad
)
{
int
ow
=
get_global_id
(
0
)
;
int
oh
=
get_global_id
(
1
)
;
int
co_slice
=
get_global_id
(
2
)
;
int
CI
=
input_shape.w,
IH
=
input_shape.y,
IW
=
input_shape.z
;
int
CO
=
output_shape.w,
OH
=
output_shape.y,
OW
=
output_shape.z
;
int
KH
=
kernel_stride.x,
KW
=
kernel_stride.y
;
int
strideH
=
kernel_stride.z,
strideW
=
kernel_stride.w
;
int
padTop
=
pad.x,
padLeft
=
pad.z
;
int
CI_SLICES
=
UP_DIV
(
CI,
CI_TILE
)
;
int
CO_SLICES
=
UP_DIV
(
CO,
CO_TILE
)
;
int
CI
=
input_shape.w,
IH
=
input_shape.y,
IW
=
input_shape.z
;
int
CO
=
output_shape.w,
OH
=
output_shape.y,
OW
=
output_shape.z
;
int
KH
=
kernel_stride.x,
KW
=
kernel_stride.y
;
int
strideH
=
kernel_stride.z,
strideW
=
kernel_stride.w
;
int
padTop
=
pad.x,
padLeft
=
pad.z
;
int
CI_SLICES
=
UP_DIV
(
CI,
CI_TILE
)
;
int
CO_SLICES
=
UP_DIV
(
CO,
CO_TILE
)
;
if
(
oh
>=
OH
|
| ow >= OW || co_slice >= CO_SLICES)
return;
if
(
oh
>=
OH
|
| ow >= OW || co_slice >= CO_SLICES) return;
float4 acc = (float4)(0.0f, 0.0f, 0.0f, 0.0f);
for (int kh = 0; kh < KH; ++kh)
{
int ih = kh + oh * strideH - padTop;
for (int kw = 0; kw < KW; ++kw)
{
int iw = kw + ow * strideW - padLeft;
for (int ci_slice = 0; ci_slice < CI_SLICES; ++ci_slice)
{
for (int ci_inner = 0; ci_inner < CI_TILE; ++ci_inner)
{
int ci = ci_slice * CI_TILE + ci_inner;
if (ci >= CI)
break;
float4 acc = (float4)(0.0f, 0.0f, 0.0f, 0.0f);
for (int kh = 0; kh < KH; ++kh) {
int ih = kh + oh * strideH - padTop;
for (int kw = 0; kw < KW; ++kw) {
int iw = kw + ow * strideW - padLeft;
for (int ci_slice = 0; ci_slice < CI_SLICES; ++ci_slice) {
for (int ci_inner = 0; ci_inner < CI_TILE; ++ci_inner) {
int ci = ci_slice * CI_TILE + ci_inner;
if (ci >= CI) break;
int input_idx = ih * IW * CI + iw * CI + ci;
float value = 0;
if (ih < 0 || ih >= IH || iw < 0 || iw >= IW)
value = 0;
else
value = input[input_idx];
int input_idx = ih * IW * CI + iw * CI + ci;
float value = 0;
if (ih < 0 || ih >= IH || iw < 0 || iw >= IW)
value = 0;
else
value = input[input_idx];
int CO_OFFSET = KH * KW * CI;
int weight_idx = (co_slice * CO_TILE) * CO_OFFSET +
kh * KW * CI +
kw * CI +
ci;
acc.x += weight[weight_idx + 0 * CO_OFFSET] * value;
acc.y += weight[weight_idx + 1 * CO_OFFSET] * value;
acc.z += weight[weight_idx + 2 * CO_OFFSET] * value;
acc.w += weight[weight_idx + 3 * CO_OFFSET] * value;
}
}
int CO_OFFSET = KH * KW * CI;
int weight_idx = (co_slice * CO_TILE) * CO_OFFSET + kh * KW * CI + kw * CI + ci;
acc.x += weight[weight_idx + 0 * CO_OFFSET] * value;
acc.y += weight[weight_idx + 1 * CO_OFFSET] * value;
acc.z += weight[weight_idx + 2 * CO_OFFSET] * value;
acc.w += weight[weight_idx + 3 * CO_OFFSET] * value;
}
}
}
int output_idx = oh * OW * CO + ow * CO + (co_slice * CO_TILE);
if (co_slice < CO_SLICES - 1 || CO % CO_TILE == 0)
{
output[output_idx + 0] = acc.x + bias[co_slice * CO_TILE + 0];
output[output_idx + 1] = acc.y + bias[co_slice * CO_TILE + 1];
output[output_idx + 2] = acc.z + bias[co_slice * CO_TILE + 2];
output[output_idx + 3] = acc.w + bias[co_slice * CO_TILE + 3];
}
else if (CO % CO_TILE == 1)
{
output[output_idx + 0] = acc.x + bias[co_slice * CO_TILE + 0];
}
else if (CO % CO_TILE == 2)
{
output[output_idx + 0] = acc.x + bias[co_slice * CO_TILE + 0];
output[output_idx + 1] = acc.y + bias[co_slice * CO_TILE + 1];
}
else if (CO % CO_TILE == 3)
{
output[output_idx + 0] = acc.x + bias[co_slice * CO_TILE + 0];
output[output_idx + 1] = acc.y + bias[co_slice * CO_TILE + 1];
output[output_idx + 2] = acc.z + bias[co_slice * CO_TILE + 2];
}
}
int output_idx = oh * OW * CO + ow * CO + (co_slice * CO_TILE);
if (co_slice < CO_SLICES - 1 || CO % CO_TILE == 0) {
output[output_idx + 0] = acc.x + bias[co_slice * CO_TILE + 0];
output[output_idx + 1] = acc.y + bias[co_slice * CO_TILE + 1];
output[output_idx + 2] = acc.z + bias[co_slice * CO_TILE + 2];
output[output_idx + 3] = acc.w + bias[co_slice * CO_TILE + 3];
} else if (CO % CO_TILE == 1) {
output[output_idx + 0] = acc.x + bias[co_slice * CO_TILE + 0];
} else if (CO % CO_TILE == 2) {
output[output_idx + 0] = acc.x + bias[co_slice * CO_TILE + 0];
output[output_idx + 1] = acc.y + bias[co_slice * CO_TILE + 1];
} else if (CO % CO_TILE == 3) {
output[output_idx + 0] = acc.x + bias[co_slice * CO_TILE + 0];
output[output_idx + 1] = acc.y + bias[co_slice * CO_TILE + 1];
output[output_idx + 2] = acc.z + bias[co_slice * CO_TILE + 2];
}
}
//#pragma OPENCL EXTENSION cl_khr_fp16 : enable
//#define FLT4 half4
// #pragma OPENCL EXTENSION cl_khr_fp16 : enable
// #define FLT4 half4
#define FLT4 float4
__kernel void convolution_NHWC4_OHWIIO_float8(__global FLT4 *input,
__global FLT4 *weight,
__global FLT4 *bias,
__kernel void convolution_NHWC4_OHWIIO_float8(__global FLT4 *input, __global FLT4 *weight, __global FLT4 *bias,
__global FLT4 *output,
const int4 input_shape, // NHWC
const int4 output_shape, // NHWC
const int4 kernel_stride, // kernelHW_strideHW
const int4 pad) // top bottom left right
{
int oh = get_global_id(0); // [0, OH)
int ow = get_global_id(1); // [0, OW)
int co_slice = get_global_id(2); // [0, UP_DIV(CO, CO_TILE) )
const int4 input_shape, // NHWC
const int4 output_shape, // NHWC
const int4 kernel_stride, // kernelHW_strideHW
const int4 pad) {
int oh = get_global_id(0); // [0, OH)
int ow = get_global_id(1); // [0, OW)
int co_slice = get_global_id(2); // [0, UP_DIV(CO, CO_TILE) )
int CI = input_shape.w, IH = input_shape.y, IW = input_shape.z;
int CO = output_shape.w, OH = output_shape.y, OW = output_shape.z;
int CI_SLICES = UP_DIV(CI, CI_TILE);
int CO_SLICES = UP_DIV(CO, CO_TILE);
int KH = kernel_stride.x, KW = kernel_stride.y;
int strideH = kernel_stride.z, strideW = kernel_stride.w;
int padTop = pad.x, padLeft = pad.z;
int CI = input_shape.w, IH = input_shape.y, IW = input_shape.z;
int CO = output_shape.w, OH = output_shape.y, OW = output_shape.z;
int CI_SLICES = UP_DIV(CI, CI_TILE);
int CO_SLICES = UP_DIV(CO, CO_TILE);
int KH = kernel_stride.x, KW = kernel_stride.y;
int strideH = kernel_stride.z, strideW = kernel_stride.w;
int padTop = pad.x, padLeft = pad.z;
if (oh >= OH || ow >= OW |
|
2
*
co_slice
>=
CO_SLICES
)
return
;
if
(
2
*
co_slice
+
1
>=
CO_SLICES
)
{
FLT4
out0_c4
=
(
FLT4
)(
0.0f,
0.0f,
0.0f,
0.0f
)
;
__global
FLT4
*w0_ic1_oc4
=
weight
+
(
2
*
co_slice
+
0
)
*
KH
*
KW
*
CI_SLICES
*
CI_TILE
;
for
(
int
kh
=
0
; kh < KH; ++kh)
{
int
ih
=
kh
+
oh
*
strideH
-
padTop
;
for
(
int
kw
=
0
; kw < KW; ++kw)
{
int
iw
=
kw
+
ow
*
strideW
-
padLeft
;
if
(
ih
>=
0
&&
ih
<
IH
&&
iw
>=
0
&&
iw
<
IW
)
{
for
(
int
ci_slice
=
0
; ci_slice < CI_SLICES; ci_slice++)
{
FLT4
in_c4
=
input[ih
*
IW
*
CI_SLICES
+
iw
*
CI_SLICES
+
ci_slice]
;
out0_c4
+=
w0_ic1_oc4[0]
*
in_c4.x
;
out0_c4
+=
w0_ic1_oc4[1]
*
in_c4.y
;
out0_c4
+=
w0_ic1_oc4[2]
*
in_c4.z
;
out0_c4
+=
w0_ic1_oc4[3]
*
in_c4.w
;
w0_ic1_oc4
+=
4
;
}
}
else
{
w0_ic1_oc4
+=
4
*
CI_SLICES
;
}
}
if (oh >= OH || ow >= OW |
|
2
*
co_slice
>=
CO_SLICES
)
return
;
if
(
2
*
co_slice
+
1
>=
CO_SLICES
)
{
FLT4
out0_c4
=
(
FLT4
)(
0.0f,
0.0f,
0.0f,
0.0f
)
;
__global
FLT4
*w0_ic1_oc4
=
weight
+
(
2
*
co_slice
+
0
)
*
KH
*
KW
*
CI_SLICES
*
CI_TILE
;
for
(
int
kh
=
0
; kh < KH; ++kh) {
int
ih
=
kh
+
oh
*
strideH
-
padTop
;
for
(
int
kw
=
0
; kw < KW; ++kw) {
int
iw
=
kw
+
ow
*
strideW
-
padLeft
;
if
(
ih
>=
0
&&
ih
<
IH
&&
iw
>=
0
&&
iw
<
IW
)
{
for
(
int
ci_slice
=
0
; ci_slice < CI_SLICES; ci_slice++) {
FLT4
in_c4
=
input[ih
*
IW
*
CI_SLICES
+
iw
*
CI_SLICES
+
ci_slice]
;
out0_c4
+=
w0_ic1_oc4[0]
*
in_c4.x
;
out0_c4
+=
w0_ic1_oc4[1]
*
in_c4.y
;
out0_c4
+=
w0_ic1_oc4[2]
*
in_c4.z
;
out0_c4
+=
w0_ic1_oc4[3]
*
in_c4.w
;
w0_ic1_oc4
+=
4
;
}
}
else
{
w0_ic1_oc4
+=
4
*
CI_SLICES
;
}
output[oh
*
OW
*
CO_SLICES
+
ow
*
CO_SLICES
+
2
*
co_slice
+
0]
=
out0_c4
+
bias[2
*
co_slice
+
0]
;
}
}
else
{
FLT4
out0_c4
=
(
FLT4
)(
0.0f,
0.0f,
0.0f,
0.0f
)
;
FLT4
out1_c4
=
(
FLT4
)(
0.0f,
0.0f,
0.0f,
0.0f
)
;
__global
FLT4
*w0_ic1_oc4
=
weight
+
(
2
*
co_slice
+
0
)
*
KH
*
KW
*
CI_SLICES
*
CI_TILE
;
__global
FLT4
*w1_ic1_oc4
=
weight
+
(
2
*
co_slice
+
1
)
*
KH
*
KW
*
CI_SLICES
*
CI_TILE
;
for
(
int
kh
=
0
; kh < KH; ++kh)
{
int
ih
=
kh
+
oh
*
strideH
-
padTop
;
for
(
int
kw
=
0
; kw < KW; ++kw)
{
int
iw
=
kw
+
ow
*
strideW
-
padLeft
;
if
(
ih
>=
0
&&
ih
<
IH
&&
iw
>=
0
&&
iw
<
IW
)
{
int
idx
=
ih
*
IW
*
CI_SLICES
+
iw
*
CI_SLICES
;
for
(
int
ci_slice
=
0
; ci_slice < CI_SLICES; ci_slice++)
{
FLT4
in_c4
=
input[idx
+
ci_slice]
;
output[oh
*
OW
*
CO_SLICES
+
ow
*
CO_SLICES
+
2
*
co_slice
+
0]
=
out0_c4
+
bias[2
*
co_slice
+
0]
;
}
else
{
FLT4
out0_c4
=
(
FLT4
)(
0.0f,
0.0f,
0.0f,
0.0f
)
;
FLT4
out1_c4
=
(
FLT4
)(
0.0f,
0.0f,
0.0f,
0.0f
)
;
__global
FLT4
*w0_ic1_oc4
=
weight
+
(
2
*
co_slice
+
0
)
*
KH
*
KW
*
CI_SLICES
*
CI_TILE
;
__global
FLT4
*w1_ic1_oc4
=
weight
+
(
2
*
co_slice
+
1
)
*
KH
*
KW
*
CI_SLICES
*
CI_TILE
;
for
(
int
kh
=
0
; kh < KH; ++kh) {
int
ih
=
kh
+
oh
*
strideH
-
padTop
;
for
(
int
kw
=
0
; kw < KW; ++kw) {
int
iw
=
kw
+
ow
*
strideW
-
padLeft
;
if
(
ih
>=
0
&&
ih
<
IH
&&
iw
>=
0
&&
iw
<
IW
)
{
int
idx
=
ih
*
IW
*
CI_SLICES
+
iw
*
CI_SLICES
;
for
(
int
ci_slice
=
0
; ci_slice < CI_SLICES; ci_slice++) {
FLT4
in_c4
=
input[idx
+
ci_slice]
;
out0_c4
+=
w0_ic1_oc4[0]
*
in_c4.x
;
out0_c4
+=
w0_ic1_oc4[1]
*
in_c4.y
;
out0_c4
+=
w0_ic1_oc4[2]
*
in_c4.z
;
out0_c4
+=
w0_ic1_oc4[3]
*
in_c4.w
;
w0_ic1_oc4
+=
4
;
out0_c4
+=
w0_ic1_oc4[0]
*
in_c4.x
;
out0_c4
+=
w0_ic1_oc4[1]
*
in_c4.y
;
out0_c4
+=
w0_ic1_oc4[2]
*
in_c4.z
;
out0_c4
+=
w0_ic1_oc4[3]
*
in_c4.w
;
w0_ic1_oc4
+=
4
;
out1_c4
+=
w1_ic1_oc4[0]
*
in_c4.x
;
out1_c4
+=
w1_ic1_oc4[1]
*
in_c4.y
;
out1_c4
+=
w1_ic1_oc4[2]
*
in_c4.z
;
out1_c4
+=
w1_ic1_oc4[3]
*
in_c4.w
;
w1_ic1_oc4
+=
4
;
}
}
else
{
w0_ic1_oc4
+=
4
*
CI_SLICES
;
w1_ic1_oc4
+=
4
*
CI_SLICES
;
}
}
out1_c4
+=
w1_ic1_oc4[0]
*
in_c4.x
;
out1_c4
+=
w1_ic1_oc4[1]
*
in_c4.y
;
out1_c4
+=
w1_ic1_oc4[2]
*
in_c4.z
;
out1_c4
+=
w1_ic1_oc4[3]
*
in_c4.w
;
w1_ic1_oc4
+=
4
;
}
}
else
{
w0_ic1_oc4
+=
4
*
CI_SLICES
;
w1_ic1_oc4
+=
4
*
CI_SLICES
;
}
output[oh
*
OW
*
CO_SLICES
+
ow
*
CO_SLICES
+
2
*
co_slice
+
0]
=
out0_c4
+
bias[2
*
co_slice
+
0]
;
output[oh
*
OW
*
CO_SLICES
+
ow
*
CO_SLICES
+
2
*
co_slice
+
1]
=
out1_c4
+
bias[2
*
co_slice
+
1]
;
}
}
}
\ No newline at end of file
output[oh
*
OW
*
CO_SLICES
+
ow
*
CO_SLICES
+
2
*
co_slice
+
0]
=
out0_c4
+
bias[2
*
co_slice
+
0]
;
output[oh
*
OW
*
CO_SLICES
+
ow
*
CO_SLICES
+
2
*
co_slice
+
1]
=
out1_c4
+
bias[2
*
co_slice
+
1]
;
}
}
mindspore/lite/src/runtime/kernel/opencl/cl/fp32/depthwise_conv2d.cl
浏览文件 @
22927dc4
...
...
@@ -8,18 +8,9 @@
#
define
TO_FLT4
convert_float4
#
endif
__constant
sampler_t
sampler_zero
=
CLK_NORMALIZED_COORDS_FALSE
| CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST
;
__kernel
void
DepthwiseConv2d_IMG_NC4HW4
(
__read_only
image2d_t
src_data,
__global
FLT4*
filter,
__global
FLT4*
bias,
float
relu_clip1,
__write_only
image2d_t
dst_data,
int2
kernel_size,
int2
stride,
int2
padding,
int2
dilation,
int4
src_size,
int4
dst_size
)
{
__kernel
void
DepthwiseConv2d_IMG_NC4HW4
(
__read_only
image2d_t
src_data,
__global
FLT4
*filter,
__global
FLT4
*bias,
float
relu_clip1,
__write_only
image2d_t
dst_data,
int2
kernel_size,
int2
stride,
int2
padding,
int2
dilation,
int4
src_size,
int4
dst_size
)
{
int
X
=
get_global_id
(
0
)
;
int
Y
=
get_global_id
(
1
)
;
int
Z
=
get_global_id
(
2
)
;
...
...
@@ -36,32 +27,23 @@ __write_only image2d_t dst_data,
bool outside_x = x_c < 0 || x_c >= src_size.x;
if (!outside_x && !outside_y) {
FLT4 f = filter[fx_c];
//FLT4 src_final =src_data[(((Z) * src_size.y + (y_c)) * src_size.x + (x_c))];
FLT4 src_final =read_imagef(src_data, sampler_zero, (int2)(x_c, (Z * src_size.y + y_c)));
//
FLT4 src_final =src_data[(((Z) * src_size.y + (y_c)) * src_size.x + (x_c))];
FLT4 src_final =
read_imagef(src_data, sampler_zero, (int2)(x_c, (Z * src_size.y + y_c)));
r += TO_FLT4(src_final * f);
}
;
}
fx_c++;
}
}
FLT4 bias_val = bias[Z];
FLT4 res0 = TO_FLT4(r) + bias_val;
res0 = clamp(res0, (FLT)(0.0f), (FLT)(relu_clip1));
//dst_data[(((Z) * dst_size.y + (Y)) * dst_size.x + (X))] = res0;
//
dst_data[(((Z) * dst_size.y + (Y)) * dst_size.x + (X))] = res0;
write_imagef(dst_data, (int2)(X, (Z * dst_size.y + Y)), res0);
}
__kernel void DepthwiseConv2d_IMG_NHWC4(
__read_only image2d_t src_data,
__global FLT4* filter,
__global FLT4* bias,
float relu_clip1,
__write_only image2d_t dst_data,
int2 kernel_size,
int2 stride,
int2 padding,
int2 dilation,
int4 src_size,
int4 dst_size) {
__kernel void DepthwiseConv2d_IMG_NHWC4(__read_only image2d_t src_data, __global FLT4 *filter, __global FLT4 *bias,
float relu_clip1, __write_only image2d_t dst_data, int2 kernel_size,
int2 stride, int2 padding, int2 dilation, int4 src_size, int4 dst_size) {
int X = get_global_id(0);
int Y = get_global_id(1);
int Z = get_global_id(2);
...
...
@@ -78,32 +60,23 @@ __write_only image2d_t dst_data,
bool outside_x = x_c < 0 || x_c >= src_size.x;
if (!outside_x && !outside_y) {
FLT4 f = filter[fx_c];
//FLT4 src_final =src_data[((y_c * src_size.x + x_c) * src_size.z + Z)];
FLT4 src_final =
read_imagef(src_data, sampler_zero, (int2)(Z+x_c*
src_size.z, y_c));
//
FLT4 src_final =src_data[((y_c * src_size.x + x_c) * src_size.z + Z)];
FLT4 src_final =
read_imagef(src_data, sampler_zero, (int2)(Z + x_c *
src_size.z, y_c));
r += TO_FLT4(src_final * f);
}
;
}
fx_c++;
}
}
FLT4 bias_val = bias[Z];
FLT4 res0 = TO_FLT4(r) + bias_val;
res0 = clamp(res0, (FLT)(0.0f), (FLT)(relu_clip1));
//dst_data[((Y * dst_size.x + X) * dst_size.z + Z)] = res0;
write_imagef(dst_data, (int2)(X
*dst_size.z+
Z, Y), res0);
//
dst_data[((Y * dst_size.x + X) * dst_size.z + Z)] = res0;
write_imagef(dst_data, (int2)(X
* dst_size.z +
Z, Y), res0);
}
__kernel void DepthwiseConv2d_IMG_NHWC4_1x1(
__read_only image2d_t src_data,
__global FLT4* filter,
__global FLT4* bias,
float relu_clip1,
__write_only image2d_t dst_data,
int2 kernel_size,
int2 stride,
int2 padding,
int2 dilation,
int4 src_size,
int4 dst_size) {
__kernel void DepthwiseConv2d_IMG_NHWC4_1x1(__read_only image2d_t src_data, __global FLT4 *filter, __global FLT4 *bias,
float relu_clip1, __write_only image2d_t dst_data, int2 kernel_size,
int2 stride, int2 padding, int2 dilation, int4 src_size, int4 dst_size) {
int X = get_global_id(0);
int Y = get_global_id(1);
int Z = get_global_id(2);
...
...
@@ -120,30 +93,21 @@ __write_only image2d_t dst_data,
bool outside_x = x_c < 0 || x_c >= src_size.x;
if (!outside_x && !outside_y) {
FLT4 f = filter[fx_c];
//FLT4 src_final =src_data[((y_c * src_size.x + x_c) * src_size.z + Z)];
//
FLT4 src_final =src_data[((y_c * src_size.x + x_c) * src_size.z + Z)];
FLT4 src_final = read_imagef(src_data, sampler_zero, (int2)(Z, (y_c * src_size.x + x_c) * src_size.z));
r += TO_FLT4(src_final * f);
}
;
}
}
}
FLT4 bias_val = bias[Z];
FLT4 res0 = TO_FLT4(r) + bias_val;
res0 = clamp(res0, (FLT)(0.0f), (FLT)(relu_clip1));
//dst_data[((Y * dst_size.x + X) * dst_size.z + Z)] = res0;
//
dst_data[((Y * dst_size.x + X) * dst_size.z + Z)] = res0;
write_imagef(dst_data, (int2)(Z, (Y * dst_size.x + X) * dst_size.z), res0);
}
__kernel void DepthwiseConv2d_BUF_NC4HW4(
__global FLT4* src_data,
__global FLT4* filter,
__global FLT4* bias,
float relu_clip1,
__global FLT4* dst_data,
int2 kernel_size,
int2 stride,
int2 padding,
int2 dilation,
int4 src_size,
int4 dst_size) {
__kernel void DepthwiseConv2d_BUF_NC4HW4(__global FLT4 *src_data, __global FLT4 *filter, __global FLT4 *bias,
float relu_clip1, __global FLT4 *dst_data, int2 kernel_size, int2 stride,
int2 padding, int2 dilation, int4 src_size, int4 dst_size) {
int X = get_global_id(0);
int Y = get_global_id(1);
int Z = get_global_id(2);
...
...
@@ -160,30 +124,21 @@ __global FLT4* dst_data,
bool outside_x = x_c < 0 || x_c >= src_size.x;
if (!outside_x && !outside_y) {
FLT4 f = filter[fx_c];
FLT4 src_final =
src_data[(((Z) *
src_size.y + (y_c)) * src_size.x + (x_c))];
FLT4 src_final =
src_data[(((Z)*
src_size.y + (y_c)) * src_size.x + (x_c))];
r += TO_FLT4(src_final * f);
}
;
}
fx_c++;
}
}
FLT4 bias_val = bias[Z];
FLT4 res0 = TO_FLT4(r) + bias_val;
res0 = clamp(res0, (FLT)(0.0f), (FLT)(relu_clip1));
dst_data[(((Z)
*
dst_size.y + (Y)) * dst_size.x + (X))] = res0;
dst_data[(((Z)
*
dst_size.y + (Y)) * dst_size.x + (X))] = res0;
}
__kernel void DepthwiseConv2d_BUF_NHWC4(
__global FLT4* src_data,
__global FLT4* filter,
__global FLT4* bias,
float relu_clip1,
__global FLT4* dst_data,
int2 kernel_size,
int2 stride,
int2 padding,
int2 dilation,
int4 src_size,
int4 dst_size) {
__kernel void DepthwiseConv2d_BUF_NHWC4(__global FLT4 *src_data, __global FLT4 *filter, __global FLT4 *bias,
float relu_clip1, __global FLT4 *dst_data, int2 kernel_size, int2 stride,
int2 padding, int2 dilation, int4 src_size, int4 dst_size) {
int X = get_global_id(0);
int Y = get_global_id(1);
int Z = get_global_id(2);
...
...
@@ -200,9 +155,9 @@ __global FLT4* dst_data,
bool outside_x = x_c < 0 || x_c >= src_size.x;
if (!outside_x && !outside_y) {
FLT4 f = filter[fx_c];
FLT4 src_final =src_data[((y_c * src_size.x + x_c) * src_size.z + Z)];
FLT4 src_final =
src_data[((y_c * src_size.x + x_c) * src_size.z + Z)];
r += TO_FLT4(src_final * f);
}
;
}
fx_c++;
}
}
...
...
@@ -212,18 +167,9 @@ __global FLT4* dst_data,
dst_data[((Y * dst_size.x + X) * dst_size.z + Z)] = res0;
}
__kernel void DepthwiseConv2d_BUF_NHWC4_1x1(
__global FLT4* src_data,
__global FLT4* filter,
__global FLT4* bias,
float relu_clip1,
__global FLT4* dst_data,
int2 kernel_size,
int2 stride,
int2 padding,
int2 dilation,
int4 src_size,
int4 dst_size) {
__kernel void DepthwiseConv2d_BUF_NHWC4_1x1(__global FLT4 *src_data, __global FLT4 *filter, __global FLT4 *bias,
float relu_clip1, __global FLT4 *dst_data, int2 kernel_size, int2 stride,
int2 padding, int2 dilation, int4 src_size, int4 dst_size) {
int X = get_global_id(0);
int Y = get_global_id(1);
int Z = get_global_id(2);
...
...
@@ -240,13 +186,13 @@ __global FLT4* dst_data,
bool outside_x = x_c < 0 |
|
x_c
>=
src_size.x
;
if
(
!outside_x
&&
!outside_y
)
{
FLT4
f
=
filter[fx_c]
;
FLT4
src_final
=src_data[
((
y_c
*
src_size.x
+
x_c
)
*
src_size.z
+
Z
)
]
;
FLT4
src_final
=
src_data[
((
y_c
*
src_size.x
+
x_c
)
*
src_size.z
+
Z
)
]
;
r
+=
TO_FLT4
(
src_final
*
f
)
;
}
;
}
}
}
FLT4
bias_val
=
bias[Z]
;
FLT4
res0
=
TO_FLT4
(
r
)
+
bias_val
;
res0
=
clamp
(
res0,
(
FLT
)(
0.0f
)
,
(
FLT
)(
relu_clip1
))
;
dst_data[
((
Y
*
dst_size.x
+
X
)
*
dst_size.z
+
Z
)
]
=
res0
;
}
\ No newline at end of file
}
mindspore/lite/src/runtime/kernel/opencl/cl/fp32/matmul.cl
浏览文件 @
22927dc4
#
define
FLT4
float4
#
define
FLT16
float16
__kernel
void
MatMul
(
__global
FLT4
*x,
__global
FLT16
*weight,
__global
FLT4
*buffer,
__global
FLT4
*bias,
int2
offset_ci,
int2
offset_co,
int
has_bias
)
{
__kernel
void
MatMul
(
__global
FLT4
*x,
__global
FLT16
*weight,
__global
FLT4
*buffer,
__global
FLT4
*bias,
int2
offset_ci,
int2
offset_co,
int
has_bias
)
{
int2
gid
=
(
int2
)(
get_global_id
(
0
)
,
get_global_id
(
1
))
;
int2
lid
=
(
int2
)(
get_local_id
(
0
)
,
get_local_id
(
1
))
;
FLT4
s
=
(
FLT4
)(
0.0f
)
;
...
...
@@ -28,4 +27,4 @@ __kernel void MatMul(__global FLT4 *x, __global FLT16 *weight,
buffer[gid.x]
=
s
;
//
memory
pollution?
or
protected
by
opencl
}
}
\ No newline at end of file
}
mindspore/lite/src/runtime/kernel/opencl/cl/fp32/softmax.cl
浏览文件 @
22927dc4
#
define
SLICES
4
int
DivideRoundUp
(
int
n,
int
div
)
{
int
q
=
n
/
div
;
return
n
%
div
==
0
?
q
:
q
+
1
;
int
DivideRoundUp
(
int
n,
int
div
)
{
int
q
=
n
/
div
;
return
n
%
div
==
0
?
q
:
q
+
1
;
}
__kernel
void
SoftMax
(
__global
float4
*input,
__global
float4
*output,
const
int4
input_shape
)
{
int
X
=
get_global_id
(
0
)
; // width
int
Y
=
get_global_id
(
1
)
; // height
__kernel
void
SoftMax
(
__global
float4
*input,
__global
float4
*output,
const
int4
input_shape
)
{
int
X
=
get_global_id
(
0
)
; // width
int
Y
=
get_global_id
(
1
)
; // height
int
H
=
input_shape.y
;
int
W
=
input_shape.z
;
int
C
=
input_shape.w
;
...
...
@@ -32,4 +29,4 @@ __kernel void SoftMax(__global float4 *input,
float4
result
=
convert_float4
(
t
)
;
output[
(
Y
*
W
+
X
*
H
)
*
C
+
d]
=
result
;
}
}
\ No newline at end of file
}
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录