Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
magicwindyyd
mindspore
提交
f91d80e0
M
mindspore
项目概览
magicwindyyd
/
mindspore
与 Fork 源项目一致
Fork自
MindSpore / mindspore
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
M
mindspore
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
f91d80e0
编写于
9月 05, 2020
作者:
M
mindspore-ci-bot
提交者:
Gitee
9月 05, 2020
浏览文件
操作
浏览文件
下载
差异文件
!5787 [MSLITE] add cast to opencl to_format op
Merge pull request !5787 from wandongdong/master
上级
6d9501d5
d922befb
变更
6
隐藏空白更改
内联
并排
Showing
6 changed file
with
217 addition
and
44 deletion
+217
-44
mindspore/lite/src/runtime/kernel/opencl/cl/arithmetic.cl
mindspore/lite/src/runtime/kernel/opencl/cl/arithmetic.cl
+2
-1
mindspore/lite/src/runtime/kernel/opencl/cl/to_format.cl
mindspore/lite/src/runtime/kernel/opencl/cl/to_format.cl
+187
-39
mindspore/lite/src/runtime/kernel/opencl/kernel/to_format.cc
mindspore/lite/src/runtime/kernel/opencl/kernel/to_format.cc
+4
-2
mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.cc
.../lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.cc
+21
-0
mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.h
...e/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.h
+1
-0
mindspore/lite/src/runtime/opencl/opencl_runtime.cc
mindspore/lite/src/runtime/opencl/opencl_runtime.cc
+2
-2
未找到文件。
mindspore/lite/src/runtime/kernel/opencl/cl/arithmetic.cl
浏览文件 @
f91d80e0
#
pragma
OPENCL
EXTENSION
cl_khr_fp16
:
enable
#
define
divide_no_check
(
a,
b
)
(
a
/
b
)
__constant
sampler_t
smp_none
=
CLK_NORMALIZED_COORDS_FALSE
| CLK_ADDRESS_NONE |
CLK_FILTER_NEAREST
;
...
...
@@ -62,7 +63,7 @@ __kernel void BoardcastArith_IMG(__read_only image2d_t input_a, float weight, fl
}
FLT4
a
=
READ_IMAGE
(
input_a,
smp_none,
(
int2
)(
X,
Y
))
;
WRITE_IMAGE
(
output,
(
int2
)(
X,
Y
)
,
weight
*
a
+
bias
)
;
WRITE_IMAGE
(
output,
(
int2
)(
X,
Y
)
,
((
FLT
)
weight
)
*
a
+
(
FLT
)
bias
)
;
}
__kernel
void
ElementAdd_BUF
(
__global
float
*input_a,
__global
float
*input_b,
__global
float
*output,
...
...
mindspore/lite/src/runtime/kernel/opencl/cl/to_format.cl
浏览文件 @
f91d80e0
#
pragma
OPENCL
EXTENSION
cl_khr_fp16
:
enable
__constant
sampler_t
smp_zero
=
CLK_NORMALIZED_COORDS_FALSE
| CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST
;
__kernel
void
to_format_NHWC_to_NHWC4_IMG
(
__global
FLT
4
*src_data,
__write_only
image2d_t
dst_data,
int4
size,
int4
shape
)
{
__kernel
void
to_format_NHWC_to_NHWC4_IMG
_float
(
__global
float
4
*src_data,
__write_only
image2d_t
dst_data,
int4
size,
int4
shape
)
{
int
X
=
get_global_id
(
0
)
;
int
Y
=
get_global_id
(
1
)
;
int
Z
=
get_global_id
(
2
)
;
if
(
X
>=
size.x
|
| Y >= size.y || Z >= size.z) {
return;
}
FLT4 data = (FLT4)(0.f);
int offset = (X * shape.z + Y) * shape.w + Z * 4;
__global
FLT *src_addr = (__global FLT
*)src_data;
__global
float *src_addr = (__global float
*)src_data;
src_addr += offset;
if ((Z + 1) * 4 <= shape.w) {
data = TO_FLT4(((__global float4 *)src_addr)[0]);
} else {
if ((shape.w - Z * 4) >= 1) {
data.x = (FLT)src_addr[0];
}
if ((shape.w - Z * 4) >= 2) {
data.y = (FLT)src_addr[1];
}
if ((shape.w - Z * 4) >= 3) {
data.z = (FLT)src_addr[2];
}
}
WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), data);
}
__kernel void to_format_NHWC_to_NHWC4_IMG_half(__global half4 *src_data, __write_only image2d_t dst_data, int4 size,
int4 shape) {
int X = get_global_id(0);
int Y = get_global_id(1);
int Z = get_global_id(2);
if (X >= size.x || Y >= size.y || Z >= size.z) {
return;
}
FLT4 data = (FLT4)(0.f);
int offset = (X * shape.z + Y) * shape.w + Z * 4;
__global half *src_addr = (__global half *)src_data;
src_addr += offset;
if ((Z + 1) * 4 <= shape.w) {
data =
((__global FLT4 *)src_addr)[0]
;
data =
TO_FLT4(((__global half4 *)src_addr)[0])
;
} else {
if ((shape.w - Z * 4) >= 1) {
data.x = src_addr[0];
data.x =
(FLT)
src_addr[0];
}
if ((shape.w - Z * 4) >= 2) {
data.y = src_addr[1];
data.y =
(FLT)
src_addr[1];
}
if ((shape.w - Z * 4) >= 3) {
data.z = src_addr[2];
data.z =
(FLT)
src_addr[2];
}
}
WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), data);
}
__kernel void to_format_NHWC_to_NC4HW4_IMG(__global FLT4 *src_data, __write_only image2d_t dst_data, int4 size,
int4 shape) {
__kernel void to_format_NHWC_to_NC4HW4_IMG_float(__global float4 *src_data, __write_only image2d_t dst_data, int4 size,
int4 shape) {
int X = get_global_id(0);
int Y = get_global_id(1);
int Z = get_global_id(2);
if (X >= size.x || Y >= size.y || Z >= size.z) {
return;
}
int offset = (X * shape.z + Y) * shape.w + Z * 4;
__global float *src_addr = (__global float *)src_data;
src_addr += offset;
FLT4 data = (FLT4)(0.f);
if ((Z + 1) * 4 <= shape.w) {
data = TO_FLT4(((__global float4 *)src_addr)[0]);
} else {
if ((shape.w - Z * 4) >= 1) {
data.x = (FLT)src_addr[0];
}
if ((shape.w - Z * 4) >= 2) {
data.y = (FLT)src_addr[1];
}
if ((shape.w - Z * 4) >= 3) {
data.z = (FLT)src_addr[2];
}
}
WRITE_IMAGE(dst_data, (int2)(Y, Z * size.x + X), data);
}
__kernel void to_format_NHWC_to_NC4HW4_IMG_half(__global half4 *src_data, __write_only image2d_t dst_data, int4 size,
int4 shape) {
int X = get_global_id(0);
int Y = get_global_id(1);
int Z = get_global_id(2);
...
...
@@ -36,36 +90,57 @@ __kernel void to_format_NHWC_to_NC4HW4_IMG(__global FLT4 *src_data, __write_only
return;
}
int offset = (X * shape.z + Y) * shape.w + Z * 4;
__global
FLT *src_addr = (__global FLT
*)src_data;
__global
half *src_addr = (__global half
*)src_data;
src_addr += offset;
FLT4 data = (FLT4)(0.f);
if ((Z + 1) * 4 <= shape.w) {
data =
((__global FLT4 *)src_addr)[0]
;
data =
TO_FLT4(((__global half4 *)src_addr)[0])
;
} else {
if ((shape.w - Z * 4) >= 1) {
data.x = src_addr[0];
data.x =
(FLT)
src_addr[0];
}
if ((shape.w - Z * 4) >= 2) {
data.y = src_addr[1];
data.y =
(FLT)
src_addr[1];
}
if ((shape.w - Z * 4) >= 3) {
data.z = src_addr[2];
data.z =
(FLT)
src_addr[2];
}
}
WRITE_IMAGE(dst_data, (int2)(Y, Z * size.x + X), data);
}
__kernel void to_format_NHWC4_to_NHWC4_IMG(__global FLT4 *src_data, __write_only image2d_t dst_data, int4 size,
int4 shape) {
__kernel void to_format_NHWC4_to_NHWC4_IMG_float(__global float4 *src_data, __write_only image2d_t dst_data, int4 size,
int4 shape) {
int X = get_global_id(0);
int Y = get_global_id(1);
int Z = get_global_id(2);
if (X >= size.x || Y >= size.y || Z >= size.z) {
return;
}
WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), TO_FLT4(src_data[(X * size.y + Y) * size.z + Z]));
}
__kernel void to_format_NHWC4_to_NHWC4_IMG_half(__global half4 *src_data, __write_only image2d_t dst_data, int4 size,
int4 shape) {
int X = get_global_id(0);
int Y = get_global_id(1);
int Z = get_global_id(2);
if (X >= size.x || Y >= size.y || Z >= size.z) {
return;
}
WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), src_data[(X * size.y + Y) * size.z + Z]);
WRITE_IMAGE(dst_data, (int2)(Y * size.z + Z, X), TO_FLT4(src_data[(X * size.y + Y) * size.z + Z]));
}
__kernel void to_format_NC4HW4_to_NC4HW4_IMG_float(__global float4 *src_data, __write_only image2d_t dst_data,
int4 size, int4 shape) {
// size(h, w, c4, 1), shape(n, c, h, w)
int X = get_global_id(0); // h
int Y = get_global_id(1); // w
int Z = get_global_id(2); // c4
if (X >= size.x || Y >= size.y || Z >= size.z) {
return;
}
WRITE_IMAGE(dst_data, (int2)(Y, Z * size.x + X), TO_FLT4(src_data[(Z * size.x + X) * size.y + Y]));
}
__kernel void to_format_NC4HW4_to_NC4HW4_IMG
(__global FLT
4 *src_data, __write_only image2d_t dst_data, int4 size,
int4 shape) {
__kernel void to_format_NC4HW4_to_NC4HW4_IMG
_half(__global half
4 *src_data, __write_only image2d_t dst_data, int4 size,
int4 shape) {
// size(h, w, c4, 1), shape(n, c, h, w)
int X = get_global_id(0); // h
int Y = get_global_id(1); // w
...
...
@@ -73,32 +148,94 @@ __kernel void to_format_NC4HW4_to_NC4HW4_IMG(__global FLT4 *src_data, __write_on
if (X >= size.x || Y >= size.y || Z >= size.z) {
return;
}
WRITE_IMAGE(dst_data, (int2)(Y, Z * size.x + X), src_data[(Z * size.x + X) * size.y + Y]);
WRITE_IMAGE(dst_data, (int2)(Y, Z * size.x + X), TO_FLT4(src_data[(Z * size.x + X) * size.y + Y]));
}
__kernel void to_format_NCHW_to_NCHW_BUF_float(__read_only image2d_t src_data, __global float4 *dst_data, int4 size,
int4 shape) {
int X = get_global_id(0);
int Y = get_global_id(1);
int Z = get_global_id(2);
if (X >= size.x || Y >= size.y || Z >= size.z) {
return;
}
dst_data[(Z * size.y + Y) * size.x + X] = convert_float4(READ_IMAGE(src_data, smp_zero, (int2)(Y * size.x + X, Z)));
}
__kernel void to_format_NCHW_to_NCHW_BUF_half(__read_only image2d_t src_data, __global half4 *dst_data, int4 size,
int4 shape) {
int X = get_global_id(0);
int Y = get_global_id(1);
int Z = get_global_id(2);
if (X >= size.x || Y >= size.y || Z >= size.z) {
return;
}
dst_data[(Z * size.y + Y) * size.x + X] = convert_half4(READ_IMAGE(src_data, smp_zero, (int2)(Y * size.x + X, Z)));
}
__kernel void to_format_NHWC4_to_NHWC_BUF_float(__read_only image2d_t src_data, __global float4 *dst_data, int4 size,
int4 shape) {
int X = get_global_id(0);
int Y = get_global_id(1);
int Z = get_global_id(2);
if (X >= size.x || Y >= size.y || Z >= size.z) {
return;
}
float4 data = convert_float4(READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X)));
int offset = (X * shape.z + Y) * shape.w + Z * 4;
__global float *dst_addr = (__global float *)dst_data;
dst_addr += offset;
if ((Z + 1) * 4 <= shape.w) {
((__global float4 *)dst_addr)[0] = data;
} else {
if (shape.w - Z * 4 >= 1) {
dst_addr[0] = data.x;
}
if (shape.w - Z * 4 >= 2) {
dst_addr[1] = data.y;
}
if (shape.w - Z * 4 >= 3) {
dst_addr[2] = data.z;
}
}
}
__kernel void to_format_N
CHW_to_NCHW_BUF(__read_only image2d_t src_data, __global FLT
4 *dst_data, int4 size,
int4 shape) {
__kernel void to_format_N
HWC4_to_NHWC_BUF_half(__read_only image2d_t src_data, __global half
4 *dst_data, int4 size,
int4 shape) {
int X = get_global_id(0);
int Y = get_global_id(1);
int Z = get_global_id(2);
if (X >= size.x || Y >= size.y || Z >= size.z) {
return;
}
dst_data[(Z * size.y + Y) * size.x + X] = READ_IMAGE(src_data, smp_zero, (int2)(Y * size.x + X, Z));
half4 data = convert_half4(READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X)));
int offset = (X * shape.z + Y) * shape.w + Z * 4;
__global half *dst_addr = (__global half *)dst_data;
dst_addr += offset;
if ((Z + 1) * 4 <= shape.w) {
((__global half4 *)dst_addr)[0] = data;
} else {
if (shape.w - Z * 4 >= 1) {
dst_addr[0] = data.x;
}
if (shape.w - Z * 4 >= 2) {
dst_addr[1] = data.y;
}
if (shape.w - Z * 4 >= 3) {
dst_addr[2] = data.z;
}
}
}
__kernel void to_format_N
HWC4_to_NHWC_BUF(__read_only image2d_t src_data, __global FLT
4 *dst_data, int4 size,
int4 shape) {
__kernel void to_format_N
C4HW4_to_NHWC_BUF_float(__read_only image2d_t src_data, __global float
4 *dst_data, int4 size,
int4 shape) {
int X = get_global_id(0);
int Y = get_global_id(1);
int Z = get_global_id(2);
if (X >= size.x || Y >= size.y || Z >= size.z) {
return;
}
FLT4 data = READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X
));
float4 data = convert_float4(READ_IMAGE(src_data, smp_zero, (int2)(Y, Z * size.x + X)
));
int offset = (X * shape.z + Y) * shape.w + Z * 4;
__global
FLT *dst_addr = (__global FLT
*)dst_data;
__global
float *dst_addr = (__global float
*)dst_data;
dst_addr += offset;
if ((Z + 1) * 4 <= shape.w) {
((__global
FLT
4 *)dst_addr)[0] = data;
((__global
float
4 *)dst_addr)[0] = data;
} else {
if (shape.w - Z * 4 >= 1) {
dst_addr[0] = data.x;
...
...
@@ -111,20 +248,20 @@ __kernel void to_format_NHWC4_to_NHWC_BUF(__read_only image2d_t src_data, __glob
}
}
}
__kernel void to_format_NC4HW4_to_NHWC_BUF
(__read_only image2d_t src_data, __global FLT
4 *dst_data, int4 size,
int4 shape) {
__kernel void to_format_NC4HW4_to_NHWC_BUF
_half(__read_only image2d_t src_data, __global half
4 *dst_data, int4 size,
int4 shape) {
int X = get_global_id(0);
int Y = get_global_id(1);
int Z = get_global_id(2);
if (X >= size.x || Y >= size.y || Z >= size.z) {
return;
}
FLT4 data = READ_IMAGE(src_data, smp_zero, (int2)(Y, Z * size.x + X
));
half4 data = convert_half4(READ_IMAGE(src_data, smp_zero, (int2)(Y, Z * size.x + X)
));
int offset = (X * shape.z + Y) * shape.w + Z * 4;
__global
FLT *dst_addr = (__global FLT
*)dst_data;
__global
half *dst_addr = (__global half
*)dst_data;
dst_addr += offset;
if ((Z + 1) * 4 <= shape.w) {
((__global
FLT
4 *)dst_addr)[0] = data;
((__global
half
4 *)dst_addr)[0] = data;
} else {
if (shape.w - Z * 4 >= 1) {
dst_addr[0] = data.x;
...
...
@@ -137,8 +274,19 @@ __kernel void to_format_NC4HW4_to_NHWC_BUF(__read_only image2d_t src_data, __glo
}
}
}
__kernel void to_format_NC4HW4_to_NC4HW4_BUF(__read_only image2d_t src_data, __global FLT4 *dst_data, int4 size,
int4 shape) {
__kernel void to_format_NC4HW4_to_NC4HW4_BUF_float(__read_only image2d_t src_data, __global float4 *dst_data, int4 size,
int4 shape) {
// size(h, w, c, 1), shape(n, c, h, w)
int X = get_global_id(0); // h
int Y = get_global_id(1); // w
int Z = get_global_id(2); // c
if (X >= size.x || Y >= size.y || Z >= size.z) {
return;
}
dst_data[(Z * size.x + X) * size.y + Y] = convert_float4(READ_IMAGE(src_data, smp_zero, (int2)(Y, Z * size.x + X)));
}
__kernel void to_format_NC4HW4_to_NC4HW4_BUF_half(__read_only image2d_t src_data, __global half4 *dst_data, int4 size,
int4 shape) {
// size(h, w, c, 1), shape(n, c, h, w)
int X = get_global_id(0); // h
int Y = get_global_id(1); // w
...
...
@@ -146,15 +294,15 @@ __kernel void to_format_NC4HW4_to_NC4HW4_BUF(__read_only image2d_t src_data, __g
if (X >= size.x || Y >= size.y || Z >= size.z) {
return;
}
dst_data[(Z * size.x + X) * size.y + Y] =
READ_IMAGE(src_data, smp_zero, (int2)(Y, Z * size.x + X
));
dst_data[(Z * size.x + X) * size.y + Y] =
convert_half4(READ_IMAGE(src_data, smp_zero, (int2)(Y, Z * size.x + X)
));
}
__kernel void to_format_NHWC4_to_NHWC4_BUF
(__read_only image2d_t src_data, __global FLT
4 *dst_data, int4 size,
int4 shape) {
__kernel void to_format_NHWC4_to_NHWC4_BUF
_float(__read_only image2d_t src_data, __global float
4 *dst_data, int4 size,
int4 shape) {
int X = get_global_id(0);
int Y = get_global_id(1);
int Z = get_global_id(2);
if (X >= size.x || Y >= size.y |
|
Z
>=
size.z
)
{
return
;
}
dst_data[
(
X
*
size.y
+
Y
)
*
size.z
+
Z]
=
READ_IMAGE
(
src_data,
smp_zero,
(
int2
)(
Y
*
size.z
+
Z,
X
))
;
dst_data[
(
X
*
size.y
+
Y
)
*
size.z
+
Z]
=
convert_float4
(
READ_IMAGE
(
src_data,
smp_zero,
(
int2
)(
Y
*
size.z
+
Z,
X
)
))
;
}
mindspore/lite/src/runtime/kernel/opencl/kernel/to_format.cc
浏览文件 @
f91d80e0
...
...
@@ -42,10 +42,12 @@ int ToFormatOpenCLKernel::Init() {
{
schema
::
Format_NC
,
"NHWC"
},
{
schema
::
Format_NHWC4
,
"NHWC4"
}};
std
::
string
kernel_name
=
"to_format_"
+
format_str
[
in_tensors_
[
0
]
->
GetFormat
()]
+
"_to_"
+
format_str
[
out_tensors_
[
0
]
->
GetFormat
()];
std
::
map
<
TypeId
,
std
::
string
>
dtype_str
{
{
kNumberTypeFloat32
,
"float"
},
{
kNumberTypeFloat16
,
"half"
},
{
kNumberTypeInt8
,
"Int8"
}};
if
(
out_mem_type_
==
OpenCLMemType
::
IMG
)
{
kernel_name
+=
"_IMG
"
;
kernel_name
+=
"_IMG
_"
+
dtype_str
[
in_tensors_
[
0
]
->
data_type
()]
;
}
else
{
kernel_name
+=
"_BUF
"
;
kernel_name
+=
"_BUF
_"
+
dtype_str
[
out_tensors_
[
0
]
->
data_type
()]
;
}
this
->
set_name
(
kernel_name
);
...
...
mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.cc
浏览文件 @
f91d80e0
...
...
@@ -15,6 +15,7 @@
*/
#include "src/runtime/kernel/opencl/subgraph_opencl_kernel.h"
#include <set>
#include "src/runtime/opencl/opencl_executor.h"
#include "src/runtime/opencl/opencl_runtime.h"
#include "src/runtime/kernel/opencl/utils.h"
...
...
@@ -181,11 +182,31 @@ int SubGraphOpenCLKernel::Init() {
}
nodes_
.
insert
(
nodes_
.
end
(),
out_convert_ops_
.
begin
(),
out_convert_ops_
.
end
());
UpdateTensorDataType
();
MallocTensorWithReuse
();
return
RET_OK
;
}
int
SubGraphOpenCLKernel
::
UpdateTensorDataType
()
{
auto
ocl_runtime
=
lite
::
opencl
::
OpenCLRuntime
::
GetInstance
();
bool
is_fp16
=
ocl_runtime
->
GetFp16Enable
();
if
(
is_fp16
&&
(
in_tensors_
[
0
]
->
data_type
()
==
kNumberTypeFloat32
))
{
std
::
set
<
lite
::
tensor
::
Tensor
*>
out_set
;
out_set
.
insert
(
in_tensors_
.
begin
(),
in_tensors_
.
end
());
out_set
.
insert
(
out_tensors_
.
begin
(),
out_tensors_
.
end
());
for
(
auto
iv
:
nodes_
)
{
auto
cur_outs
=
iv
->
out_tensors
();
for
(
auto
jv
:
cur_outs
)
{
if
(
out_set
.
count
(
jv
)
==
0
)
{
jv
->
set_data_type
(
kNumberTypeFloat16
);
}
}
}
}
return
RET_OK
;
}
int
SubGraphOpenCLKernel
::
MallocTensorWithReuse
()
{
kernel
::
LiteKernelUtil
::
InitTensorRefCount
(
nodes_
);
for
(
auto
*
kernel
:
nodes_
)
{
...
...
mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.h
浏览文件 @
f91d80e0
...
...
@@ -46,6 +46,7 @@ class SubGraphOpenCLKernel : public SubGraphKernel {
int
UnInit
();
protected:
int
UpdateTensorDataType
();
int
MallocTensorWithReuse
();
int
GenToFormatOp
(
const
std
::
vector
<
lite
::
tensor
::
Tensor
*>
&
in_tensors
,
const
std
::
vector
<
std
::
vector
<
kernel
::
LiteKernel
*>>
in_kernels
,
...
...
mindspore/lite/src/runtime/opencl/opencl_runtime.cc
浏览文件 @
f91d80e0
...
...
@@ -301,12 +301,12 @@ int OpenCLRuntime::BuildKernel(cl::Kernel &kernel, const std::string &program_na
// fp16 enable, kernel will use half and read_imageh and write_imageh.
build_options_str
=
"-DFLT=half -DFLT4=half4 -DFLT16=half16 "
"-DWRITE_IMAGE=write_imageh -DREAD_IMAGE=read_imageh -DTO_FLT4=convert_half4 "
;
"-DWRITE_IMAGE=write_imageh -DREAD_IMAGE=read_imageh -DTO_FLT
=convert_half -DTO_FLT
4=convert_half4 "
;
}
else
{
// fp16 not enable, kernel will use float and read_imagef and write_imagef.
build_options_str
=
"-DFLT=float -DFLT4=float4 -DFLT16=float16 "
"-DWRITE_IMAGE=write_imagef -DREAD_IMAGE=read_imagef -DTO_FLT4=convert_float4 "
;
"-DWRITE_IMAGE=write_imagef -DREAD_IMAGE=read_imagef -DTO_FLT
=convert_float -DTO_FLT
4=convert_float4 "
;
}
auto
build_options_ext
=
std
::
accumulate
(
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录