Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
Xiaomi
Mace
提交
0e078d19
Mace
项目概览
Xiaomi
/
Mace
通知
106
Star
40
Fork
27
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
Mace
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
提交
0e078d19
编写于
3月 26, 2018
作者:
Y
yejianwu
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
compatible with opencl1.1 and 1.2
上级
69baafe9
变更
41
显示空白变更内容
内联
并排
Showing
41 changed file
with
614 addition
and
190 deletion
+614
-190
mace/kernels/opencl/activation_opencl.cc
mace/kernels/opencl/activation_opencl.cc
+12
-5
mace/kernels/opencl/addn.cc
mace/kernels/opencl/addn.cc
+10
-4
mace/kernels/opencl/batch_norm_opencl.cc
mace/kernels/opencl/batch_norm_opencl.cc
+12
-5
mace/kernels/opencl/bias_add_opencl.cc
mace/kernels/opencl/bias_add_opencl.cc
+16
-5
mace/kernels/opencl/buffer_to_image.cc
mace/kernels/opencl/buffer_to_image.cc
+14
-3
mace/kernels/opencl/channel_shuffle.cc
mace/kernels/opencl/channel_shuffle.cc
+14
-6
mace/kernels/opencl/cl/activation.cl
mace/kernels/opencl/cl/activation.cl
+10
-2
mace/kernels/opencl/cl/addn.cl
mace/kernels/opencl/cl/addn.cl
+4
-1
mace/kernels/opencl/cl/batch_norm.cl
mace/kernels/opencl/cl/batch_norm.cl
+10
-2
mace/kernels/opencl/cl/bias_add.cl
mace/kernels/opencl/cl/bias_add.cl
+10
-2
mace/kernels/opencl/cl/buffer_to_image.cl
mace/kernels/opencl/cl/buffer_to_image.cl
+82
-13
mace/kernels/opencl/cl/channel_shuffle.cl
mace/kernels/opencl/cl/channel_shuffle.cl
+10
-2
mace/kernels/opencl/cl/concat.cl
mace/kernels/opencl/cl/concat.cl
+20
-4
mace/kernels/opencl/cl/conv_2d.cl
mace/kernels/opencl/cl/conv_2d.cl
+11
-2
mace/kernels/opencl/cl/conv_2d_1x1.cl
mace/kernels/opencl/cl/conv_2d_1x1.cl
+11
-2
mace/kernels/opencl/cl/conv_2d_3x3.cl
mace/kernels/opencl/cl/conv_2d_3x3.cl
+11
-2
mace/kernels/opencl/cl/depthwise_conv2d.cl
mace/kernels/opencl/cl/depthwise_conv2d.cl
+19
-3
mace/kernels/opencl/cl/eltwise.cl
mace/kernels/opencl/cl/eltwise.cl
+4
-1
mace/kernels/opencl/cl/fully_connected.cl
mace/kernels/opencl/cl/fully_connected.cl
+18
-3
mace/kernels/opencl/cl/matmul.cl
mace/kernels/opencl/cl/matmul.cl
+5
-1
mace/kernels/opencl/cl/pooling.cl
mace/kernels/opencl/cl/pooling.cl
+9
-2
mace/kernels/opencl/cl/resize_bilinear.cl
mace/kernels/opencl/cl/resize_bilinear.cl
+11
-3
mace/kernels/opencl/cl/slice.cl
mace/kernels/opencl/cl/slice.cl
+11
-2
mace/kernels/opencl/cl/softmax.cl
mace/kernels/opencl/cl/softmax.cl
+11
-3
mace/kernels/opencl/cl/space_to_batch.cl
mace/kernels/opencl/cl/space_to_batch.cl
+16
-2
mace/kernels/opencl/cl/winograd_transform.cl
mace/kernels/opencl/cl/winograd_transform.cl
+16
-4
mace/kernels/opencl/concat.cc
mace/kernels/opencl/concat.cc
+26
-13
mace/kernels/opencl/conv_2d_opencl_1x1.cc
mace/kernels/opencl/conv_2d_opencl_1x1.cc
+12
-5
mace/kernels/opencl/conv_2d_opencl_3x3.cc
mace/kernels/opencl/conv_2d_opencl_3x3.cc
+13
-5
mace/kernels/opencl/conv_2d_opencl_general.cc
mace/kernels/opencl/conv_2d_opencl_general.cc
+13
-5
mace/kernels/opencl/depthwise_conv_opencl.cc
mace/kernels/opencl/depthwise_conv_opencl.cc
+13
-5
mace/kernels/opencl/eltwise_opencl.cc
mace/kernels/opencl/eltwise_opencl.cc
+10
-4
mace/kernels/opencl/fully_connected_opencl.cc
mace/kernels/opencl/fully_connected_opencl.cc
+20
-9
mace/kernels/opencl/helper.cc
mace/kernels/opencl/helper.cc
+20
-12
mace/kernels/opencl/matmul.cc
mace/kernels/opencl/matmul.cc
+11
-6
mace/kernels/opencl/pooling_opencl.cc
mace/kernels/opencl/pooling_opencl.cc
+31
-13
mace/kernels/opencl/resize_bilinear_opencl.cc
mace/kernels/opencl/resize_bilinear_opencl.cc
+12
-5
mace/kernels/opencl/slice.cc
mace/kernels/opencl/slice.cc
+9
-2
mace/kernels/opencl/softmax_opencl.cc
mace/kernels/opencl/softmax_opencl.cc
+12
-6
mace/kernels/opencl/space_to_batch_opencl.cc
mace/kernels/opencl/space_to_batch_opencl.cc
+13
-6
mace/kernels/opencl/winograd_transform.cc
mace/kernels/opencl/winograd_transform.cc
+22
-10
未找到文件。
mace/kernels/opencl/activation_opencl.cc
浏览文件 @
0e078d19
...
...
@@ -24,9 +24,10 @@ void ActivationFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
const
index_t
channel_blocks
=
RoundUpDiv4
(
channels
);
if
(
kernel_
.
get
()
==
nullptr
)
{
auto
runtime
=
OpenCLRuntime
::
Global
();
if
(
kernel_
.
get
()
==
nullptr
)
{
std
::
set
<
std
::
string
>
built_options
;
std
::
string
kernel_name
=
MACE_OBFUSCATE_SYMBOL
(
"activation"
);
built_options
.
emplace
(
"-Dactivation="
+
kernel_name
);
...
...
@@ -60,6 +61,10 @@ void ActivationFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
kernel_
=
runtime
->
BuildKernel
(
"activation"
,
kernel_name
,
built_options
);
}
const
uint32_t
gws
[
3
]
=
{
static_cast
<
uint32_t
>
(
channel_blocks
),
static_cast
<
uint32_t
>
(
width
),
static_cast
<
uint32_t
>
(
height
*
batch
)};
if
(
!
IsVecEqual
(
input_shape_
,
input
->
shape
()))
{
int
idx
=
0
;
kernel_
.
setArg
(
idx
++
,
*
(
input
->
opencl_image
()));
...
...
@@ -69,14 +74,16 @@ void ActivationFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
}
kernel_
.
setArg
(
idx
++
,
static_cast
<
float
>
(
relux_max_limit_
));
kernel_
.
setArg
(
idx
++
,
*
(
output
->
opencl_image
()));
kernel_
.
setArg
(
idx
++
,
gws
[
0
]);
kernel_
.
setArg
(
idx
++
,
gws
[
1
]);
kernel_
.
setArg
(
idx
++
,
gws
[
2
]);
input_shape_
=
input
->
shape
();
}
const
uint32_t
gws
[
3
]
=
{
static_cast
<
uint32_t
>
(
channel_blocks
),
static_cast
<
uint32_t
>
(
width
),
static_cast
<
uint32_t
>
(
height
*
batch
)};
const
std
::
vector
<
uint32_t
>
lws
=
{
8
,
16
,
8
,
1
};
const
uint32_t
kwg_size
=
static_cast
<
uint32_t
>
(
runtime
->
GetKernelMaxWorkGroupSize
(
kernel_
));
const
std
::
vector
<
uint32_t
>
lws
=
{
8
,
kwg_size
/
64
,
8
,
1
};
std
::
string
tuning_key
=
Concat
(
tuning_key_prefix_
,
output
->
dim
(
0
),
output
->
dim
(
1
),
output
->
dim
(
2
),
output
->
dim
(
3
));
...
...
mace/kernels/opencl/addn.cc
浏览文件 @
0e078d19
...
...
@@ -24,6 +24,8 @@ void AddNFunctor<DeviceType::OPENCL, T>::operator()(
const
index_t
width
=
input_tensors
[
0
]
->
dim
(
2
);
const
index_t
channels
=
input_tensors
[
0
]
->
dim
(
3
);
auto
runtime
=
OpenCLRuntime
::
Global
();
for
(
int
i
=
1
;
i
<
size
;
++
i
)
{
MACE_CHECK_NOTNULL
(
input_tensors
[
i
]);
MACE_CHECK
(
batch
==
input_tensors
[
i
]
->
dim
(
0
));
...
...
@@ -36,7 +38,6 @@ void AddNFunctor<DeviceType::OPENCL, T>::operator()(
if
(
input_tensors
.
size
()
>
4
)
{
MACE_NOT_IMPLEMENTED
;
}
auto
runtime
=
OpenCLRuntime
::
Global
();
std
::
set
<
std
::
string
>
built_options
;
auto
dt
=
DataTypeToEnum
<
T
>::
value
;
std
::
string
kernel_name
=
MACE_OBFUSCATE_SYMBOL
(
"addn"
);
...
...
@@ -53,6 +54,9 @@ void AddNFunctor<DeviceType::OPENCL, T>::operator()(
const
index_t
width_pixels
=
channel_blocks
*
width
;
const
index_t
batch_height_pixels
=
batch
*
height
;
const
uint32_t
gws
[
2
]
=
{
static_cast
<
uint32_t
>
(
width_pixels
),
static_cast
<
uint32_t
>
(
batch_height_pixels
)};
if
(
!
IsVecEqual
(
input_shape_
,
input_tensors
[
0
]
->
shape
()))
{
std
::
vector
<
size_t
>
output_image_shape
;
CalImage2DShape
(
output_shape
,
BufferType
::
IN_OUT_CHANNEL
,
...
...
@@ -64,13 +68,15 @@ void AddNFunctor<DeviceType::OPENCL, T>::operator()(
kernel_
.
setArg
(
idx
++
,
*
(
input
->
opencl_image
()));
}
kernel_
.
setArg
(
idx
++
,
*
(
output_tensor
->
opencl_image
()));
kernel_
.
setArg
(
idx
++
,
gws
[
0
]);
kernel_
.
setArg
(
idx
++
,
gws
[
1
]);
input_shape_
=
input_tensors
[
0
]
->
shape
();
}
const
uint32_t
gws
[
2
]
=
{
static_cast
<
uint32_t
>
(
width_pixels
),
static_cast
<
uint32_t
>
(
batch_height_pixels
)}
;
const
std
::
vector
<
uint32_t
>
lws
=
{
64
,
16
,
1
};
const
uint32_t
kwg_size
=
static_cast
<
uint32_t
>
(
runtime
->
GetKernelMaxWorkGroupSize
(
kernel_
))
;
const
std
::
vector
<
uint32_t
>
lws
=
{
kwg_size
/
16
,
16
,
1
};
std
::
stringstream
ss
;
ss
<<
"addn_opencl_kernel_"
<<
output_shape
[
0
]
<<
"_"
<<
output_shape
[
1
]
<<
"_"
<<
output_shape
[
2
]
<<
"_"
<<
output_shape
[
3
];
...
...
mace/kernels/opencl/batch_norm_opencl.cc
浏览文件 @
0e078d19
...
...
@@ -30,8 +30,13 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
const
index_t
channel_blocks
=
RoundUpDiv4
(
channels
);
if
(
kernel_
.
get
()
==
nullptr
)
{
const
uint32_t
gws
[
3
]
=
{
static_cast
<
uint32_t
>
(
channel_blocks
),
static_cast
<
uint32_t
>
(
width
),
static_cast
<
uint32_t
>
(
height
*
batch
)};
auto
runtime
=
OpenCLRuntime
::
Global
();
if
(
kernel_
.
get
()
==
nullptr
)
{
std
::
set
<
std
::
string
>
built_options
;
auto
dt
=
DataTypeToEnum
<
T
>::
value
;
std
::
string
kernel_name
=
MACE_OBFUSCATE_SYMBOL
(
"batch_norm"
);
...
...
@@ -74,14 +79,16 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
}
kernel_
.
setArg
(
idx
++
,
*
(
output
->
opencl_image
()));
kernel_
.
setArg
(
idx
++
,
relux_max_limit_
);
kernel_
.
setArg
(
idx
++
,
gws
[
0
]);
kernel_
.
setArg
(
idx
++
,
gws
[
1
]);
kernel_
.
setArg
(
idx
++
,
gws
[
2
]);
input_shape_
=
input
->
shape
();
}
const
uint32_t
gws
[
3
]
=
{
static_cast
<
uint32_t
>
(
channel_blocks
),
static_cast
<
uint32_t
>
(
width
),
static_cast
<
uint32_t
>
(
height
*
batch
)};
const
std
::
vector
<
uint32_t
>
lws
=
{
8
,
16
,
8
,
1
};
const
uint32_t
kwg_size
=
static_cast
<
uint32_t
>
(
runtime
->
GetKernelMaxWorkGroupSize
(
kernel_
));
const
std
::
vector
<
uint32_t
>
lws
=
{
8
,
kwg_size
/
64
,
8
,
1
};
std
::
string
tuning_key
=
Concat
(
"batch_norm_opencl_kernel_"
,
activation_
,
output
->
dim
(
0
),
output
->
dim
(
1
),
output
->
dim
(
2
),
output
->
dim
(
3
),
folded_constant_
);
...
...
mace/kernels/opencl/bias_add_opencl.cc
浏览文件 @
0e078d19
...
...
@@ -23,6 +23,10 @@ void BiasAddFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
const
index_t
channel_blocks
=
RoundUpDiv4
(
channels
);
const
uint32_t
gws
[
3
]
=
{
static_cast
<
uint32_t
>
(
channel_blocks
),
static_cast
<
uint32_t
>
(
width
),
static_cast
<
uint32_t
>
(
height
*
batch
)};
auto
runtime
=
OpenCLRuntime
::
Global
();
if
(
kernel_
.
get
()
==
nullptr
)
{
std
::
set
<
std
::
string
>
built_options
;
...
...
@@ -38,17 +42,24 @@ void BiasAddFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
kernel_
.
setArg
(
idx
++
,
*
(
input
->
opencl_image
()));
kernel_
.
setArg
(
idx
++
,
*
(
bias
->
opencl_image
()));
kernel_
.
setArg
(
idx
++
,
*
(
output
->
opencl_image
()));
kernel_
.
setArg
(
idx
++
,
gws
[
0
]);
kernel_
.
setArg
(
idx
++
,
gws
[
1
]);
kernel_
.
setArg
(
idx
++
,
gws
[
2
]);
input_shape_
=
input
->
shape
();
}
const
uint32_t
gws
[
3
]
=
{
static_cast
<
uint32_t
>
(
channel_blocks
),
static_cast
<
uint32_t
>
(
width
),
static_cast
<
uint32_t
>
(
height
*
batch
)};
const
std
::
vector
<
uint32_t
>
lws
=
{
8
,
16
,
8
};
const
uint32_t
kwg_size
=
static_cast
<
uint32_t
>
(
runtime
->
GetKernelMaxWorkGroupSize
(
kernel_
));
const
std
::
vector
<
uint32_t
>
lws
=
{
8
,
kwg_size
/
64
,
8
};
std
::
vector
<
uint32_t
>
roundup_gws
(
lws
.
size
());
for
(
size_t
i
=
0
;
i
<
lws
.
size
();
++
i
)
{
roundup_gws
[
i
]
=
RoundUp
(
gws
[
i
],
lws
[
i
]);
}
cl
::
Event
event
;
cl_int
error
=
runtime
->
command_queue
().
enqueueNDRangeKernel
(
kernel_
,
cl
::
NullRange
,
cl
::
NDRange
(
gws
[
0
],
gws
[
1
],
gws
[
2
]),
kernel_
,
cl
::
NullRange
,
cl
::
NDRange
(
roundup_gws
[
0
],
roundup_gws
[
1
],
roundup_
gws
[
2
]),
cl
::
NDRange
(
lws
[
0
],
lws
[
1
],
lws
[
2
]),
nullptr
,
&
event
);
MACE_CHECK
(
error
==
CL_SUCCESS
);
if
(
future
!=
nullptr
)
{
...
...
mace/kernels/opencl/buffer_to_image.cc
浏览文件 @
0e078d19
...
...
@@ -26,7 +26,8 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(
buffer
->
Resize
(
image
->
shape
());
}
size_t
gws
[
2
]
=
{
image_shape
[
0
],
image_shape
[
1
]};
uint32_t
gws
[
2
]
=
{
static_cast
<
uint32_t
>
(
image_shape
[
0
]),
static_cast
<
uint32_t
>
(
image_shape
[
1
])};
std
::
string
kernel_name
;
switch
(
type
)
{
case
CONV2D_FILTER
:
...
...
@@ -98,10 +99,20 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(
b2f_kernel
.
setArg
(
idx
++
,
static_cast
<
uint32_t
>
(
buffer
->
dim
(
3
)));
}
b2f_kernel
.
setArg
(
idx
++
,
*
(
image
->
opencl_image
()));
const
std
::
vector
<
uint32_t
>
lws
=
{
16
,
64
};
b2f_kernel
.
setArg
(
idx
++
,
gws
[
0
]);
b2f_kernel
.
setArg
(
idx
++
,
gws
[
1
]);
const
uint32_t
kwg_size
=
static_cast
<
uint32_t
>
(
runtime
->
GetKernelMaxWorkGroupSize
(
b2f_kernel
));
const
std
::
vector
<
uint32_t
>
lws
=
{
16
,
kwg_size
/
16
};
std
::
vector
<
uint32_t
>
roundup_gws
(
lws
.
size
());
for
(
size_t
i
=
0
;
i
<
lws
.
size
();
++
i
)
{
roundup_gws
[
i
]
=
RoundUp
(
gws
[
i
],
lws
[
i
]);
}
cl
::
Event
event
;
cl_int
error
=
runtime
->
command_queue
().
enqueueNDRangeKernel
(
b2f_kernel
,
cl
::
NullRange
,
cl
::
NDRange
(
gws
[
0
],
gws
[
1
]),
b2f_kernel
,
cl
::
NullRange
,
cl
::
NDRange
(
roundup_gws
[
0
],
roundup_
gws
[
1
]),
cl
::
NDRange
(
lws
[
0
],
lws
[
1
]),
nullptr
,
&
event
);
MACE_CHECK
(
error
==
CL_SUCCESS
)
<<
"Error code: "
<<
error
;
...
...
mace/kernels/opencl/channel_shuffle.cc
浏览文件 @
0e078d19
...
...
@@ -30,9 +30,13 @@ void ChannelShuffleFunctor<DeviceType::OPENCL, T>::operator()(
"groups must be multiple of 4"
);
const
index_t
group_channel_blocks
=
RoundUpDiv4
(
channels_per_group
);
if
(
kernel_
.
get
()
==
nullptr
)
{
const
uint32_t
gws
[
3
]
=
{
static_cast
<
uint32_t
>
(
group_channel_blocks
),
static_cast
<
uint32_t
>
(
width
),
static_cast
<
uint32_t
>
(
height
*
batch
)};
auto
runtime
=
OpenCLRuntime
::
Global
();
if
(
kernel_
.
get
()
==
nullptr
)
{
std
::
set
<
std
::
string
>
built_options
;
std
::
string
kernel_name
=
MACE_OBFUSCATE_SYMBOL
(
"channel_shuffle"
);
built_options
.
emplace
(
"-Dchannel_shuffle="
+
kernel_name
);
...
...
@@ -42,19 +46,23 @@ void ChannelShuffleFunctor<DeviceType::OPENCL, T>::operator()(
kernel_
=
runtime
->
BuildKernel
(
"channel_shuffle"
,
kernel_name
,
built_options
);
}
if
(
!
IsVecEqual
(
input_shape_
,
input
->
shape
()))
{
uint32_t
idx
=
0
;
kernel_
.
setArg
(
idx
++
,
*
(
input
->
opencl_image
()));
kernel_
.
setArg
(
idx
++
,
groups_
);
kernel_
.
setArg
(
idx
++
,
static_cast
<
uint32_t
>
(
channels_per_group
));
kernel_
.
setArg
(
idx
++
,
*
(
output
->
opencl_image
()));
kernel_
.
setArg
(
idx
++
,
gws
[
0
]);
kernel_
.
setArg
(
idx
++
,
gws
[
1
]);
kernel_
.
setArg
(
idx
++
,
gws
[
2
]);
input_shape_
=
input
->
shape
();
}
const
uint32_t
gws
[
3
]
=
{
static_cast
<
uint32_t
>
(
group_channel_blocks
),
static_cast
<
uint32_t
>
(
width
),
static_cast
<
uint32_t
>
(
height
*
batch
)}
;
const
std
::
vector
<
uint32_t
>
lws
=
{
8
,
16
,
8
,
1
};
const
uint32_t
kwg_size
=
static_cast
<
uint32_t
>
(
runtime
->
GetKernelMaxWorkGroupSize
(
kernel_
))
;
const
std
::
vector
<
uint32_t
>
lws
=
{
8
,
kwg_size
/
64
,
8
,
1
};
std
::
stringstream
ss
;
ss
<<
"channel_shuffle_opencl_kernel_"
<<
output
->
dim
(
0
)
<<
"_"
...
...
mace/kernels/opencl/cl/activation.cl
浏览文件 @
0e078d19
...
...
@@ -5,11 +5,19 @@ __kernel void activation(__read_only image2d_t input,
__read_only
image2d_t
alpha,
#
endif
__private
const
float
relux_max_limit,
__write_only
image2d_t
output
)
{
__write_only
image2d_t
output,
__private
const
int
global_size_dim0,
__private
const
int
global_size_dim1,
__private
const
int
global_size_dim2
)
{
const
int
ch_blk
=
get_global_id
(
0
)
;
const
int
w
=
get_global_id
(
1
)
;
const
int
hb
=
get_global_id
(
2
)
;
const
int
width
=
get_global_size
(
1
)
;
if
(
ch_blk
>=
global_size_dim0
|
| w >= global_size_dim1
|
|
hb
>=
global_size_dim2
)
{
return
;
}
const
int
width
=
global_size_dim1
;
const
int
pos
=
mad24
(
ch_blk,
width,
w
)
;
DATA_TYPE4
in
=
READ_IMAGET
(
input,
SAMPLER,
(
int2
)(
pos,
hb
))
;
...
...
mace/kernels/opencl/cl/addn.cl
浏览文件 @
0e078d19
...
...
@@ -8,9 +8,12 @@ __kernel void addn(__read_only image2d_t input0, /* [c%4 * w * c/4, h * b] */
#
if
INPUT_NUM
>
3
__read_only
image2d_t
input3,
#
endif
__write_only
image2d_t
output
)
{
__write_only
image2d_t
output,
__private
const
int
global_size_dim0,
__private
const
int
global_size_dim1
)
{
const
int
w
=
get_global_id
(
0
)
;
const
int
hb
=
get_global_id
(
1
)
;
if
(
w
>=
global_size_dim0
||
hb
>=
global_size_dim1
)
return
;
DATA_TYPE4
in0
=
READ_IMAGET
(
input0,
SAMPLER,
(
int2
)(
w,
hb
))
;
DATA_TYPE4
in1
=
READ_IMAGET
(
input1,
SAMPLER,
(
int2
)(
w,
hb
))
;
...
...
mace/kernels/opencl/cl/batch_norm.cl
浏览文件 @
0e078d19
...
...
@@ -9,11 +9,19 @@ __kernel void batch_norm(__read_only image2d_t input,
__private
const
float
epsilon,
#
endif
__write_only
image2d_t
output,
__private
const
float
relux_max_limit
)
{
__private
const
float
relux_max_limit,
__private
const
int
global_size_dim0,
__private
const
int
global_size_dim1,
__private
const
int
global_size_dim2
)
{
const
int
ch_blk
=
get_global_id
(
0
)
;
const
int
w
=
get_global_id
(
1
)
;
const
int
hb
=
get_global_id
(
2
)
;
const
int
width
=
get_global_size
(
1
)
;
if
(
ch_blk
>=
global_size_dim0
|
| w >= global_size_dim1
|| hb >= global_size_dim2) {
return;
}
const int width = global_size_dim1;
#ifdef FOLDED_CONSTANT
DATA_TYPE4 bn_scale = READ_IMAGET(scale, SAMPLER, (int2)(ch_blk, 0));
...
...
mace/kernels/opencl/cl/bias_add.cl
浏览文件 @
0e078d19
...
...
@@ -2,11 +2,19 @@
//
Supported
data
types:
half/float
__kernel
void
bias_add
(
__read_only
image2d_t
input,
__read_only
image2d_t
bias,
__write_only
image2d_t
output
)
{
__write_only
image2d_t
output,
__private
const
int
global_size_dim0,
__private
const
int
global_size_dim1,
__private
const
int
global_size_dim2
)
{
const
int
ch_blk
=
get_global_id
(
0
)
;
const
int
w
=
get_global_id
(
1
)
;
const
int
hb
=
get_global_id
(
2
)
;
const
int
width
=
get_global_size
(
1
)
;
if
(
ch_blk
>=
global_size_dim0
|
| w >= global_size_dim1
|
|
hb
>=
global_size_dim2
)
{
return
;
}
const
int
width
=
global_size_dim1
;
const
int
pos
=
mad24
(
ch_blk,
width,
w
)
;
DATA_TYPE4
in
=
READ_IMAGET
(
input,
SAMPLER,
(
int2
)(
pos,
hb
))
;
...
...
mace/kernels/opencl/cl/buffer_to_image.cl
浏览文件 @
0e078d19
...
...
@@ -5,9 +5,15 @@ __kernel void filter_buffer_to_image(__global const DATA_TYPE *input, /* h, w, o
__private
const
int
filter_w,
__private
const
int
out_channel,
__private
const
int
in_channel,
__write_only
image2d_t
output
)
{
__write_only
image2d_t
output,
__private
const
int
global_size_dim0,
__private
const
int
global_size_dim1
)
{
int
w
=
get_global_id
(
0
)
;
int
h
=
get_global_id
(
1
)
;
if
(
w
>=
global_size_dim0
|
| h >= global_size_dim1) {
return;
}
const int out_channel_idx = h * 4;
const int rounded_in_channel = ((in_channel + 3) / 4) * 4;
const int hw_idx = w / rounded_in_channel;
...
...
@@ -45,9 +51,15 @@ __kernel void filter_image_to_buffer(__global DATA_TYPE *output, /* h, w, oc, ic
__private const int filter_w,
__private const int out_channel,
__private const int in_channel,
__read_only
image2d_t
input
)
{
__read_only image2d_t input,
__private const int global_size_dim0,
__private const int global_size_dim1) {
int w = get_global_id(0);
int h = get_global_id(1);
if (w >= global_size_dim0 || h >= global_size_dim1) {
return;
}
const int out_channel_idx = h * 4;
const int rounded_in_channel = ((in_channel + 3) / 4) * 4;
const int hw_idx = w / rounded_in_channel;
...
...
@@ -84,9 +96,14 @@ __kernel void dw_filter_buffer_to_image(__global const DATA_TYPE *input, /* h, w
__private const int filter_w,
__private const int in_channel,
__private const int multiplier,
__write_only
image2d_t
output
)
{
/*
ic%4
*
kh
*
kw
*
m,
ic/4
*/
__write_only image2d_t output,
__private const int global_size_dim0,
__private const int global_size_dim1) { /* ic%4 * kh * kw * m, ic/4 */
const int w = get_global_id(0);
const int h = get_global_id(1);
if (w >= global_size_dim0 || h >= global_size_dim1) {
return;
}
DATA_TYPE4 values = 0;
if (multiplier == 1) {
...
...
@@ -134,9 +151,15 @@ __kernel void in_out_buffer_to_image(__global const DATA_TYPE *input, /* nhwc */
__private const int height,
__private const int width,
__private const int channels,
__write_only
image2d_t
output
)
{
__write_only image2d_t output,
__private const int global_size_dim0,
__private const int global_size_dim1) {
int w = get_global_id(0);
int h = get_global_id(1);
if (w >= global_size_dim0 || h >= global_size_dim1) {
return;
}
const int batch_idx = h / height;
const int height_idx = h % height;
const int width_idx = w % width;
...
...
@@ -166,9 +189,15 @@ __kernel void in_out_image_to_buffer(__global DATA_TYPE *output, /* nhwc */
__private const int height,
__private const int width,
__private const int channels,
__read_only
image2d_t
input
)
{
__read_only image2d_t input,
__private const int global_size_dim0,
__private const int global_size_dim1) {
int w = get_global_id(0);
int h = get_global_id(1);
if (w >= global_size_dim0 || h >= global_size_dim1) {
return;
}
const int batch_idx = h / height;
const int height_idx = h % height;
const int width_idx = w % width;
...
...
@@ -196,9 +225,14 @@ __kernel void in_out_image_to_buffer(__global DATA_TYPE *output, /* nhwc */
__kernel void arg_buffer_to_image(__global const DATA_TYPE *input, /* nhwc */
__private const int input_offset,
__private const int count,
__write_only
image2d_t
output
)
{
__write_only image2d_t output,
__private const int global_size_dim0,
__private const int global_size_dim1) {
int w = get_global_id(0);
int h = get_global_id(1);
if (w >= global_size_dim0 || h >= global_size_dim1) {
return;
}
const int offset = input_offset + w * 4;
const int size = count - w * 4;
...
...
@@ -223,9 +257,14 @@ __kernel void arg_buffer_to_image(__global const DATA_TYPE *input, /* nhwc */
__kernel void arg_image_to_buffer(__global DATA_TYPE *output, /* nhwc */
__private const int count,
__read_only
image2d_t
input
)
{
__read_only image2d_t input,
__private const int global_size_dim0,
__private const int global_size_dim1) {
int w = get_global_id(0);
int h = get_global_id(1);
if (w >= global_size_dim0 || h >= global_size_dim1) {
return;
}
const int offset = w * 4;
int2 coord = (int2)(w, h);
...
...
@@ -251,9 +290,15 @@ __kernel void in_out_height_buffer_to_image(__global const DATA_TYPE *input, //n
__private const int height,
__private const int width,
__private const int channels,
__write_only
image2d_t
output
)
{
__write_only image2d_t output,
__private const int global_size_dim0,
__private const int global_size_dim1) {
int w = get_global_id(0);
int h = get_global_id(1);
if (w >= global_size_dim0 || h >= global_size_dim1) {
return;
}
const int wc = width * channels;
const int height_blks = (height + 3) / 4;
const int batch_idx = h / height_blks;
...
...
@@ -284,9 +329,15 @@ __kernel void in_out_height_image_to_buffer(__global DATA_TYPE *output, //nhwc
__private const int height,
__private const int width,
__private const int channels,
__read_only
image2d_t
input
)
{
__read_only image2d_t input,
__private const int global_size_dim0,
__private const int global_size_dim1) {
int w = get_global_id(0);
int h = get_global_id(1);
if (w >= global_size_dim0 || h >= global_size_dim1) {
return;
}
const int height_blks = (height + 3) / 4;
const int batch_idx = h / height_blks;
const int height_idx = (h % height_blks) << 2;
...
...
@@ -315,9 +366,15 @@ __kernel void in_out_width_buffer_to_image(__global const DATA_TYPE *input, /* n
__private const int height,
__private const int width,
__private const int channels,
__write_only
image2d_t
output
)
{
__write_only image2d_t output,
__private const int global_size_dim0,
__private const int global_size_dim1) {
int w = get_global_id(0);
int h = get_global_id(1);
if (w >= global_size_dim0 || h >= global_size_dim1) {
return;
}
const int width_blks = (width + 3) / 4;
const int batch_idx = h / height;
const int height_idx = h % height;
...
...
@@ -349,10 +406,16 @@ __kernel void winograd_filter_buffer_to_image(__global const DATA_TYPE *input, /
__private const int in_channels,
__private const int height,
__private const int width,
__write_only
image2d_t
output
)
{
__write_only image2d_t output,
__private const int global_size_dim0,
__private const int global_size_dim1) {
int w = get_global_id(0);
int h = get_global_id(1);
const
int
out_channels
=
get_global_size
(
1
)
;
if (w >= global_size_dim0 || h >= global_size_dim1) {
return;
}
const int out_channels = global_size_dim1;
const int out_channel_idx = h;
const int in_channel_idx = w << 2;
const int offset = input_offset + (out_channel_idx * in_channels + in_channel_idx) * height * width;
...
...
@@ -429,9 +492,15 @@ __kernel void winograd_filter_image_to_buffer(__global DATA_TYPE *output, //Oc,
__private const int height,
__private const int width,
__private const int channel,
__read_only
image2d_t
input
)
{
__read_only image2d_t input,
__private const int global_size_dim0,
__private const int global_size_dim1) {
const int w = get_global_id(0);
const int h = get_global_id(1);
if (w >= global_size_dim0 |
|
h
>=
global_size_dim1
)
{
return
;
}
const
int
width_idx
=
w
<<
2
;
const
int
size
=
width
-
width_idx
;
int
offset
=
h
*
width
+
width_idx
;
...
...
mace/kernels/opencl/cl/channel_shuffle.cl
浏览文件 @
0e078d19
...
...
@@ -4,11 +4,19 @@
__kernel
void
channel_shuffle
(
__read_only
image2d_t
input,
__private
const
int
groups,
__private
const
int
channels_per_group,
__write_only
image2d_t
output
)
{
__write_only
image2d_t
output,
__private
const
int
global_size_dim0,
__private
const
int
global_size_dim1,
__private
const
int
global_size_dim2
)
{
const
int
group_chan_blk_idx
=
get_global_id
(
0
)
;
const
int
width_idx
=
get_global_id
(
1
)
;
const
int
width
=
get_global_size
(
1
)
;
const
int
hb_idx
=
get_global_id
(
2
)
;
if
(
group_chan_blk_idx
>=
global_size_dim0
|
| width_idx >= global_size_dim1
|
|
hb_idx
>=
global_size_dim2
)
{
return
;
}
const
int
width
=
global_size_dim1
;
const
int
group_blks
=
groups
/
4
;
const
int
groups_blks_width
=
group_blks
*
width
;
const
int
channels_per_group_blks
=
channels_per_group
/
4
;
...
...
mace/kernels/opencl/cl/concat.cl
浏览文件 @
0e078d19
...
...
@@ -25,11 +25,19 @@ DATA_TYPE4 stitch_vector(DATA_TYPE4 left,
__kernel
void
concat_channel
(
__read_only
image2d_t
input0,
__read_only
image2d_t
input1,
__private
const
int
input0_chan,
__write_only
image2d_t
output
)
{
__write_only
image2d_t
output,
__private
const
int
global_size_dim0,
__private
const
int
global_size_dim1,
__private
const
int
global_size_dim2
)
{
const
int
chan_blk_idx
=
get_global_id
(
0
)
;
const
int
width_idx
=
get_global_id
(
1
)
;
const
int
width
=
get_global_size
(
1
)
;
const
int
hb_idx
=
get_global_id
(
2
)
;
if
(
chan_blk_idx
>=
global_size_dim0
|
| width_idx >= global_size_dim1
|| hb_idx >= global_size_dim2) {
return;
}
const int width = global_size_dim1;
const int input0_chan_blk = (input0_chan + 3) >> 2;
DATA_TYPE4 data = 0;
...
...
@@ -74,11 +82,19 @@ __kernel void concat_channel(__read_only image2d_t input0,
// Required: All input channels are divisible by 4
__kernel void concat_channel_multi(__read_only image2d_t input,
__private const int chan_blk_offset,
__write_only
image2d_t
output
)
{
__write_only image2d_t output,
__private const int global_size_dim0,
__private const int global_size_dim1,
__private const int global_size_dim2) {
const int chan_blk_idx = get_global_id(0);
const int width_idx = get_global_id(1);
const
int
width
=
get_global_size
(
1
)
;
const int hb_idx = get_global_id(2);
if (chan_blk_idx >= global_size_dim0 || width_idx >= global_size_dim1
|| hb_idx >= global_size_dim2) {
return;
}
const int width = global_size_dim1;
DATA_TYPE4 data = 0;
data = READ_IMAGET(input,
SAMPLER,
...
...
mace/kernels/opencl/cl/conv_2d.cl
浏览文件 @
0e078d19
...
...
@@ -18,11 +18,20 @@ __kernel void conv_2d(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
__private
const
int
padding_top,
__private
const
int
padding_left,
__private
const
int
dilation_h,
__private
const
int
dilation_w
)
{
__private
const
int
dilation_w,
__private
const
int
global_size_dim0,
__private
const
int
global_size_dim1,
__private
const
int
global_size_dim2
)
{
const
int
out_ch_blk
=
get_global_id
(
0
)
;
const
int
out_w_blk
=
get_global_id
(
1
)
;
const
int
out_w_blks
=
get_global_size
(
1
)
;
const
int
out_hb
=
get_global_id
(
2
)
;
if
(
out_ch_blk
>=
global_size_dim0
|
| out_w_blk >= global_size_dim1
|| out_hb >= global_size_dim2) {
return;
}
const int out_w_blks = global_size_dim1;
const int rounded_in_ch = in_ch_blks << 2;
#ifdef BIAS
...
...
mace/kernels/opencl/cl/conv_2d_1x1.cl
浏览文件 @
0e078d19
...
...
@@ -12,12 +12,21 @@ __kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
__private
const
int
in_ch_blks,
__private
const
int
height,
__private
const
int
width,
__private
const
int
stride
)
{
__private
const
int
stride,
__private
const
int
global_size_dim0,
__private
const
int
global_size_dim1,
__private
const
int
global_size_dim2
)
{
const
int
out_ch_blk
=
get_global_id
(
0
)
;
const
int
out_w_blk
=
get_global_id
(
1
)
;
const
int
out_w_blks
=
get_global_size
(
1
)
;
const
int
out_hb
=
get_global_id
(
2
)
;
if
(
out_ch_blk
>=
global_size_dim0
|
| out_w_blk >= global_size_dim1
|| out_hb >= global_size_dim2) {
return;
}
const int out_w_blks = global_size_dim1;
#ifdef BIAS
DATA_TYPE4 out0 = READ_IMAGET(bias, SAMPLER, (int2)(out_ch_blk, 0));
DATA_TYPE4 out1 = out0;
...
...
mace/kernels/opencl/cl/conv_2d_3x3.cl
浏览文件 @
0e078d19
...
...
@@ -16,11 +16,20 @@ __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
__private
const
int
padding_top,
__private
const
int
padding_left,
__private
const
int
dilation_h,
__private
const
int
dilation_w
)
{
__private
const
int
dilation_w,
__private
const
int
global_size_dim0,
__private
const
int
global_size_dim1,
__private
const
int
global_size_dim2
)
{
const
int
out_ch_blk
=
get_global_id
(
0
)
;
const
int
out_w_blk
=
get_global_id
(
1
)
;
const
int
out_w_blks
=
get_global_size
(
1
)
;
const
int
out_hb
=
get_global_id
(
2
)
;
if
(
out_ch_blk
>=
global_size_dim0
|
| out_w_blk >= global_size_dim1
|| out_hb >= global_size_dim2) {
return;
}
const int out_w_blks = global_size_dim1;
const int rounded_in_ch = in_ch_blks << 2;
#ifdef BIAS
...
...
mace/kernels/opencl/cl/depthwise_conv2d.cl
浏览文件 @
0e078d19
...
...
@@ -18,11 +18,19 @@ __kernel void depthwise_conv2d(__read_only image2d_t input, /* [c%4 * w * c/4, h
__private
const
short
padding_top,
__private
const
short
padding_left,
__private
const
short
dilation_h,
__private
const
short
dilation_w
)
{
__private
const
short
dilation_w,
__private
const
int
global_size_dim0,
__private
const
int
global_size_dim1,
__private
const
int
global_size_dim2
)
{
const
short
out_ch_blk
=
get_global_id
(
0
)
;
const
short
out_w_blk
=
get_global_id
(
1
)
;
const
short
out_w_blks
=
get_global_size
(
1
)
;
const
short
out_hb
=
get_global_id
(
2
)
;
if
(
out_ch_blk
>=
global_size_dim0
|
| out_w_blk >= global_size_dim1
|| out_hb >= global_size_dim2) {
return;
}
const short out_w_blks = global_size_dim1;
const short rounded_in_ch = in_ch_blks << 2;
const short in_ch_blk = out_ch_blk; // multiplier = 1
...
...
@@ -141,10 +149,18 @@ __kernel void depthwise_conv2d_s1(__read_only image2d_t input, /* [c%4 * w * c/4
__private const short filter_height,
__private const short filter_width,
__private const short padding_top,
__private const short padding_left) {
__private const short padding_left,
__private const int global_size_dim0,
__private const int global_size_dim1,
__private const int global_size_dim2) {
const short out_ch_blk = get_global_id(0);
const short out_w_blk = get_global_id(1) << 2;
const short out_hb = get_global_id(2);
if (out_ch_blk >= global_size_dim0 || get_global_id(1) >= global_size_dim1
|| out_hb >= global_size_dim2) {
return;
}
const short rounded_in_ch = in_ch_blks << 2;
const short in_ch_blk = out_ch_blk; // multiplier = 1
...
...
mace/kernels/opencl/cl/eltwise.cl
浏览文件 @
0e078d19
...
...
@@ -6,9 +6,12 @@ __kernel void eltwise(__read_only image2d_t input0, /* [c%4 * w * c/4, h * b] */
__private
const
float
coeff0,
__private
const
float
coeff1,
#
endif
__write_only
image2d_t
output
)
{
__write_only
image2d_t
output,
__private
const
int
global_size_dim0,
__private
const
int
global_size_dim1
)
{
const
int
w
=
get_global_id
(
0
)
;
const
int
hb
=
get_global_id
(
1
)
;
if
(
w
>=
global_size_dim0
||
hb
>=
global_size_dim1
)
return
;
DATA_TYPE4
in0
=
READ_IMAGET
(
input0,
SAMPLER,
(
int2
)(
w,
hb
))
;
DATA_TYPE4
in1
=
READ_IMAGET
(
input1,
SAMPLER,
(
int2
)(
w,
hb
))
;
...
...
mace/kernels/opencl/cl/fully_connected.cl
浏览文件 @
0e078d19
...
...
@@ -10,9 +10,15 @@ __kernel void fully_connected(__read_only image2d_t input,
__private
const
int
input_height,
__private
const
int
input_width,
__private
const
int
input_channel,
__private
const
float
relux_max_limit
)
{
__private
const
float
relux_max_limit,
__private
const
int
global_size_dim0,
__private
const
int
global_size_dim1
)
{
const
int
batch_idx
=
get_global_id
(
0
)
;
const
int
out_blk_idx
=
get_global_id
(
1
)
;
if
(
batch_idx
>=
global_size_dim0
|
| out_blk_idx >= global_size_dim1) {
return;
}
const int input_chan_blk = (input_channel + 3) >> 2;
float4 input_value;
...
...
@@ -68,11 +74,20 @@ __kernel void fully_connected_width(__read_only image2d_t input,
__private const int input_width,
__private const int in_chan_blks,
__private const int out_blks,
__private const float relux_max_limit) {
__private const float relux_max_limit,
__private const int global_size_dim0,
__private const int global_size_dim1,
__private const int global_size_dim2) {
const int inter_out_idx = get_global_id(0);
const int width_blk_idx = get_global_id(1);
const int width_blk_count = get_global_size(1);
const int batch_out_blk_idx = get_global_id(2);
if (inter_out_idx >= global_size_dim0 || width_blk_idx >= global_size_dim1
|| batch_out_blk_idx >= global_size_dim2) {
return;
}
const int width_blk_count = global_size_dim1;
const int batch_idx = batch_out_blk_idx / out_blks;
const int out_blk_idx = batch_out_blk_idx % out_blks;
...
...
mace/kernels/opencl/cl/matmul.cl
浏览文件 @
0e078d19
...
...
@@ -8,9 +8,13 @@ __kernel void matmul(__read_only image2d_t A,
__private
const
int
N,
__private
const
int
K,
__private
const
int
height_blocks,
__private
const
int
k_blocks
)
{
__private
const
int
k_blocks,
__private
const
int
global_size_dim0,
__private
const
int
global_size_dim1
)
{
const
int
gx
=
get_global_id
(
0
)
<<
2
;
const
int
hb
=
get_global_id
(
1
)
;
if
(
get_global_id
(
0
)
>=
global_size_dim0
||
hb
>=
global_size_dim1
)
return
;
const
int
batch
=
hb
/
height_blocks
;
const
int
ty
=
(
hb
%
height_blocks
)
;
const
int
gy
=
mad24
(
batch,
height_blocks,
ty
)
;
...
...
mace/kernels/opencl/cl/pooling.cl
浏览文件 @
0e078d19
...
...
@@ -27,12 +27,19 @@ __kernel void pooling(__read_only image2d_t input,
__private
const
int
pad_left,
__private
const
int
stride,
__private
const
int
pooling_size,
__write_only
image2d_t
output
)
{
__write_only
image2d_t
output,
__private
const
int
global_size_dim0,
__private
const
int
global_size_dim1,
__private
const
int
global_size_dim2
)
{
const
int
out_chan_idx
=
get_global_id
(
0
)
;
const
int
out_width_idx
=
get_global_id
(
1
)
;
const
int
out_width
=
get_global_size
(
1
)
;
const
int
out_hb_idx
=
get_global_id
(
2
)
;
if
(
out_chan_idx
>=
global_size_dim0
|
| out_width_idx >= global_size_dim1
|| out_hb_idx >= global_size_dim2) {
return;
}
const int out_width = global_size_dim1;
const int batch_idx = mul24((out_hb_idx / out_height), in_height);
const int in_height_start = mul24((out_hb_idx % out_height), stride) - pad_top;
const int in_width_start = mul24(out_width_idx, stride) - pad_left;
...
...
mace/kernels/opencl/cl/resize_bilinear.cl
浏览文件 @
0e078d19
...
...
@@ -6,12 +6,20 @@ __kernel void resize_bilinear_nocache(__read_only image2d_t input, /* [c%4 * w *
__private
const
float
width_scale,
__private
const
int
in_height,
__private
const
int
in_width,
__private
const
int
out_height
)
{
__private
const
int
out_height,
__private
const
int
global_size_dim0,
__private
const
int
global_size_dim1,
__private
const
int
global_size_dim2
)
{
const
int
ch_blk
=
get_global_id
(
0
)
;
const
int
ch_blks
=
get_global_size
(
0
)
;
const
int
w
=
get_global_id
(
1
)
;
const
int
out_width
=
get_global_size
(
1
)
;
const
int
hb
=
get_global_id
(
2
)
;
if
(
ch_blk
>=
global_size_dim0
|
| w >= global_size_dim1
|
|
hb
>=
global_size_dim2
)
{
return
;
}
const
int
ch_blks
=
global_size_dim0
;
const
int
out_width
=
global_size_dim1
;
const
int
b
=
hb
/
out_height
;
const
int
h
=
hb
%
out_height
;
...
...
mace/kernels/opencl/cl/slice.cl
浏览文件 @
0e078d19
...
...
@@ -2,11 +2,20 @@
__kernel
void
slice
(
__read_only
image2d_t
input,
__private
const
int
chan_blk_offset,
__write_only
image2d_t
output
)
{
__write_only
image2d_t
output,
__private
const
int
global_size_dim0,
__private
const
int
global_size_dim1,
__private
const
int
global_size_dim2
)
{
const
int
chan_blk_idx
=
get_global_id
(
0
)
;
const
int
width_idx
=
get_global_id
(
1
)
;
const
int
width
=
get_global_size
(
1
)
;
const
int
hb_idx
=
get_global_id
(
2
)
;
if
(
chan_blk_idx
>=
global_size_dim0
|
| width_idx >= global_size_dim1
|
|
hb_idx
>=
global_size_dim2
)
{
return
;
}
const
int
width
=
global_size_dim1
;
DATA_TYPE4
data
=
READ_IMAGET
(
input,
SAMPLER,
(
int2
)(
mad24
(
chan_blk_idx
+
chan_blk_offset,
width,
width_idx
)
,
hb_idx
))
;
...
...
mace/kernels/opencl/cl/softmax.cl
浏览文件 @
0e078d19
...
...
@@ -3,12 +3,20 @@
__kernel
void
softmax
(
__read_only
image2d_t
input,
__private
const
int
channels,
__private
const
int
remain_channels,
__write_only
image2d_t
output
)
{
__write_only
image2d_t
output,
__private
const
int
global_size_dim0,
__private
const
int
global_size_dim1,
__private
const
int
global_size_dim2
)
{
const
int
chan_blk_idx
=
get_global_id
(
0
)
;
const
int
width_idx
=
get_global_id
(
1
)
;
const
int
hb_idx
=
get_global_id
(
2
)
;
const
int
chan_blks
=
get_global_size
(
0
)
-
1
;
const
int
width
=
get_global_size
(
1
)
;
if
(
chan_blk_idx
>=
global_size_dim0
|
| width_idx >= global_size_dim1
|
|
hb_idx
>=
global_size_dim2
)
{
return
;
}
const
int
chan_blks
=
global_size_dim0
-
1
;
const
int
width
=
global_size_dim1
;
int
pos
=
width_idx
;
DATA_TYPE
max_value
=
-FLT_MAX
;
...
...
mace/kernels/opencl/cl/space_to_batch.cl
浏览文件 @
0e078d19
...
...
@@ -9,10 +9,17 @@ __kernel void space_to_batch(__read_only image2d_t space_data,
__private
const
int
space_height,
__private
const
int
space_width,
__private
const
int
batch_height,
__private
const
int
batch_width
)
{
__private
const
int
batch_width,
__private
const
int
global_size_dim0,
__private
const
int
global_size_dim1,
__private
const
int
global_size_dim2
)
{
const
int
chan_idx
=
get_global_id
(
0
)
;
const
int
batch_w_idx
=
get_global_id
(
1
)
;
const
int
batch_hb_idx
=
get_global_id
(
2
)
;
if
(
chan_idx
>=
global_size_dim0
|
| batch_w_idx >= global_size_dim1
|| batch_hb_idx >= global_size_dim2) {
return;
}
const int batch_b_idx = batch_hb_idx / batch_height;
const int batch_h_idx = batch_hb_idx % batch_height;
...
...
@@ -48,10 +55,17 @@ __kernel void batch_to_space(__read_only image2d_t batch_data,
__private const int space_height,
__private const int space_width,
__private const int batch_height,
__private
const
int
batch_width
)
{
__private const int batch_width,
__private const int global_size_dim0,
__private const int global_size_dim1,
__private const int global_size_dim2) {
const int chan_idx = get_global_id(0);
const int batch_w_idx = get_global_id(1);
const int batch_hb_idx = get_global_id(2);
if (chan_idx >= global_size_dim0 || batch_w_idx >= global_size_dim1
|
|
batch_hb_idx
>=
global_size_dim2
)
{
return
;
}
const
int
batch_b_idx
=
batch_hb_idx
/
batch_height
;
const
int
batch_h_idx
=
batch_hb_idx
%
batch_height
;
...
...
mace/kernels/opencl/cl/winograd_transform.cl
浏览文件 @
0e078d19
...
...
@@ -8,10 +8,16 @@ __kernel void winograd_transform_2x2(__read_only image2d_t input,
__private
const
int
round_hw,
__private
const
int
round_w,
__private
const
int
padding_top,
__private
const
int
padding_left
)
{
__private
const
int
padding_left,
__private
const
int
global_size_dim0,
__private
const
int
global_size_dim1
)
{
int
out_width_idx
=
get_global_id
(
0
)
;
int
chan_blk_idx
=
get_global_id
(
1
)
;
const
int
chan_blk_size
=
get_global_size
(
1
)
;
if
(
out_width_idx
>=
global_size_dim0
|
| chan_blk_idx >= global_size_dim1) {
return;
}
const int chan_blk_size = global_size_dim1;
const int batch_idx = out_width_idx / round_hw;
const int t_idx = out_width_idx % round_hw;
...
...
@@ -115,10 +121,16 @@ __kernel void winograd_inverse_transform_2x2(__read_only image2d_t input,
__private const int out_width,
__private const int round_hw,
__private const int round_w,
__private const float relux_max_limit) {
__private const float relux_max_limit,
__private const int global_size_dim0,
__private const int global_size_dim1) {
const int width_idx = get_global_id(0);
const int height_idx = get_global_id(1);
const int out_channel = get_global_size(1);
if (width_idx >= global_size_dim0 || height_idx >= global_size_dim1) {
return;
}
const int out_channel = global_size_dim1;
int width = width_idx;
int height = height_idx;
...
...
mace/kernels/opencl/concat.cc
浏览文件 @
0e078d19
...
...
@@ -24,9 +24,14 @@ static void Concat2(cl::Kernel *kernel,
const
index_t
channel
=
output
->
dim
(
3
);
const
int
channel_blk
=
RoundUpDiv4
(
channel
);
const
uint32_t
gws
[
3
]
=
{
static_cast
<
uint32_t
>
(
channel_blk
),
static_cast
<
uint32_t
>
(
width
),
static_cast
<
uint32_t
>
(
batch
*
height
),
};
if
(
kernel
->
get
()
==
nullptr
)
{
auto
runtime
=
OpenCLRuntime
::
Global
();
if
(
kernel
->
get
()
==
nullptr
)
{
std
::
set
<
std
::
string
>
built_options
;
std
::
string
kernel_name
=
MACE_OBFUSCATE_SYMBOL
(
"concat_channel"
);
built_options
.
emplace
(
"-Dconcat_channel="
+
kernel_name
);
...
...
@@ -51,14 +56,16 @@ static void Concat2(cl::Kernel *kernel,
kernel
->
setArg
(
idx
++
,
static_cast
<
int32_t
>
(
input0
->
dim
(
3
)));
kernel
->
setArg
(
idx
++
,
*
(
static_cast
<
cl
::
Image2D
*>
(
output
->
opencl_image
())));
kernel
->
setArg
(
idx
++
,
gws
[
0
]);
kernel
->
setArg
(
idx
++
,
gws
[
1
]);
kernel
->
setArg
(
idx
++
,
gws
[
2
]);
*
prev_input_shape
=
input0
->
shape
();
}
const
uint32_t
gws
[
3
]
=
{
static_cast
<
uint32_t
>
(
channel_blk
),
static_cast
<
uint32_t
>
(
width
),
static_cast
<
uint32_t
>
(
batch
*
height
),
};
const
std
::
vector
<
uint32_t
>
lws
=
{
8
,
16
,
8
,
1
};
const
uint32_t
kwg_size
=
static_cast
<
uint32_t
>
(
runtime
->
GetKernelMaxWorkGroupSize
(
*
kernel
));
const
std
::
vector
<
uint32_t
>
lws
=
{
8
,
kwg_size
/
64
,
8
,
1
};
std
::
stringstream
ss
;
ss
<<
"concat_opencl_kernel_"
<<
output
->
dim
(
0
)
<<
"_"
<<
output
->
dim
(
1
)
<<
"_"
<<
output
->
dim
(
2
)
<<
"_"
<<
output
->
dim
(
3
);
...
...
@@ -75,8 +82,8 @@ static void ConcatN(cl::Kernel *kernel,
const
index_t
width
=
output
->
dim
(
2
);
const
index_t
channel
=
output
->
dim
(
3
);
if
(
kernel
->
get
()
==
nullptr
)
{
auto
runtime
=
OpenCLRuntime
::
Global
();
if
(
kernel
->
get
()
==
nullptr
)
{
std
::
set
<
std
::
string
>
built_options
;
std
::
string
kernel_name
=
MACE_OBFUSCATE_SYMBOL
(
"concat_channel_multi"
);
built_options
.
emplace
(
"-Dconcat_channel_multi="
+
kernel_name
);
...
...
@@ -89,18 +96,24 @@ static void ConcatN(cl::Kernel *kernel,
index_t
chan_blk_offset
=
0
;
for
(
int
i
=
0
;
i
<
inputs_count
;
++
i
)
{
const
Tensor
*
input
=
input_list
[
i
];
index_t
input_channel_blk
=
input
->
dim
(
3
)
/
4
;
const
uint32_t
gws
[
3
]
=
{
static_cast
<
uint32_t
>
(
input_channel_blk
),
static_cast
<
uint32_t
>
(
width
),
static_cast
<
uint32_t
>
(
batch
*
height
),
};
uint32_t
idx
=
0
;
kernel
->
setArg
(
idx
++
,
*
(
input
->
opencl_image
()));
kernel
->
setArg
(
idx
++
,
static_cast
<
int32_t
>
(
chan_blk_offset
));
kernel
->
setArg
(
idx
++
,
*
(
output
->
opencl_image
()));
kernel
->
setArg
(
idx
++
,
gws
[
0
]);
kernel
->
setArg
(
idx
++
,
gws
[
1
]);
kernel
->
setArg
(
idx
++
,
gws
[
2
]);
index_t
input_channel_blk
=
input
->
dim
(
3
)
/
4
;
chan_blk_offset
+=
input_channel_blk
;
const
uint32_t
gws
[
3
]
=
{
static_cast
<
uint32_t
>
(
input_channel_blk
),
static_cast
<
uint32_t
>
(
width
),
static_cast
<
uint32_t
>
(
batch
*
height
),
};
const
std
::
vector
<
uint32_t
>
lws
=
{
8
,
16
,
8
,
1
};
const
uint32_t
kwg_size
=
static_cast
<
uint32_t
>
(
runtime
->
GetKernelMaxWorkGroupSize
(
*
kernel
));
const
std
::
vector
<
uint32_t
>
lws
=
{
8
,
kwg_size
/
64
,
8
,
1
};
std
::
stringstream
ss
;
ss
<<
"concat_n_opencl_kernel_"
<<
input_channel_blk
<<
"_"
<<
width
<<
"_"
<<
batch
*
height
;
...
...
mace/kernels/opencl/conv_2d_opencl_1x1.cc
浏览文件 @
0e078d19
...
...
@@ -36,6 +36,7 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
const
index_t
width_blocks
=
RoundUpDiv4
(
width
);
const
index_t
input_channel_blocks
=
RoundUpDiv4
(
input_channels
);
auto
runtime
=
OpenCLRuntime
::
Global
();
if
(
kernel
->
get
()
==
nullptr
)
{
MACE_CHECK
(
input_batch
==
batch
);
...
...
@@ -66,9 +67,13 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
LOG
(
FATAL
)
<<
"Unknown activation type: "
<<
activation
;
}
auto
runtime
=
OpenCLRuntime
::
Global
();
*
kernel
=
runtime
->
BuildKernel
(
"conv_2d_1x1"
,
kernel_name
,
built_options
);
}
const
uint32_t
gws
[
3
]
=
{
static_cast
<
uint32_t
>
(
channel_blocks
),
static_cast
<
uint32_t
>
(
width_blocks
),
static_cast
<
uint32_t
>
(
height
*
batch
)};
if
(
!
IsVecEqual
(
*
prev_input_shape
,
input
->
shape
()))
{
uint32_t
idx
=
0
;
kernel
->
setArg
(
idx
++
,
*
(
input
->
opencl_image
()));
...
...
@@ -85,14 +90,16 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
kernel
->
setArg
(
idx
++
,
static_cast
<
int
>
(
height
));
kernel
->
setArg
(
idx
++
,
static_cast
<
int
>
(
width
));
kernel
->
setArg
(
idx
++
,
stride
);
kernel
->
setArg
(
idx
++
,
gws
[
0
]);
kernel
->
setArg
(
idx
++
,
gws
[
1
]);
kernel
->
setArg
(
idx
++
,
gws
[
2
]);
*
prev_input_shape
=
input
->
shape
();
}
const
uint32_t
gws
[
3
]
=
{
static_cast
<
uint32_t
>
(
channel_blocks
),
static_cast
<
uint32_t
>
(
width_blocks
),
static_cast
<
uint32_t
>
(
height
*
batch
)};
const
std
::
vector
<
uint32_t
>
lws
=
{
8
,
15
,
8
,
1
};
const
uint32_t
kwg_size
=
static_cast
<
uint32_t
>
(
runtime
->
GetKernelMaxWorkGroupSize
(
*
kernel
));
const
std
::
vector
<
uint32_t
>
lws
=
{
8
,
kwg_size
/
64
,
8
,
1
};
std
::
string
tuning_key
=
Concat
(
"conv2d_1x1_opencl_kernel_"
,
activation
,
output
->
dim
(
0
),
output
->
dim
(
1
),
output
->
dim
(
2
),
output
->
dim
(
3
));
...
...
mace/kernels/opencl/conv_2d_opencl_3x3.cc
浏览文件 @
0e078d19
...
...
@@ -35,6 +35,8 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
const
index_t
input_channel_blocks
=
RoundUpDiv4
(
input_channels
);
const
index_t
width_blocks
=
RoundUpDiv
<
index_t
,
5
>
(
width
);
auto
runtime
=
OpenCLRuntime
::
Global
();
if
(
kernel
->
get
()
==
nullptr
)
{
std
::
set
<
std
::
string
>
built_options
;
std
::
string
kernel_name
=
MACE_OBFUSCATE_SYMBOL
(
"conv_2d_3x3"
);
...
...
@@ -61,9 +63,13 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
LOG
(
FATAL
)
<<
"Unknown activation type: "
<<
activation
;
}
auto
runtime
=
OpenCLRuntime
::
Global
();
*
kernel
=
runtime
->
BuildKernel
(
"conv_2d_3x3"
,
kernel_name
,
built_options
);
}
const
uint32_t
gws
[
3
]
=
{
static_cast
<
uint32_t
>
(
channel_blocks
),
static_cast
<
uint32_t
>
(
width_blocks
),
static_cast
<
uint32_t
>
(
height
*
batch
)};
if
(
!
IsVecEqual
(
*
prev_input_shape
,
input
->
shape
()))
{
uint32_t
idx
=
0
;
kernel
->
setArg
(
idx
++
,
*
(
input
->
opencl_image
()));
...
...
@@ -83,14 +89,16 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
kernel
->
setArg
(
idx
++
,
padding
[
1
]
/
2
);
kernel
->
setArg
(
idx
++
,
dilations
[
0
]);
kernel
->
setArg
(
idx
++
,
dilations
[
1
]);
kernel
->
setArg
(
idx
++
,
gws
[
0
]);
kernel
->
setArg
(
idx
++
,
gws
[
1
]);
kernel
->
setArg
(
idx
++
,
gws
[
2
]);
*
prev_input_shape
=
input
->
shape
();
}
const
uint32_t
gws
[
3
]
=
{
static_cast
<
uint32_t
>
(
channel_blocks
),
static_cast
<
uint32_t
>
(
width_blocks
),
static_cast
<
uint32_t
>
(
height
*
batch
)};
const
std
::
vector
<
uint32_t
>
lws
=
{
4
,
15
,
8
,
1
};
const
uint32_t
kwg_size
=
static_cast
<
uint32_t
>
(
runtime
->
GetKernelMaxWorkGroupSize
(
*
kernel
));
const
std
::
vector
<
uint32_t
>
lws
=
{
4
,
kwg_size
/
32
,
8
,
1
};
std
::
string
tuning_key
=
Concat
(
"conv2d_3x3_opencl_kernel_"
,
activation
,
output
->
dim
(
0
),
output
->
dim
(
1
),
output
->
dim
(
2
),
output
->
dim
(
3
));
...
...
mace/kernels/opencl/conv_2d_opencl_general.cc
浏览文件 @
0e078d19
...
...
@@ -35,6 +35,8 @@ extern void Conv2dOpencl(cl::Kernel *kernel,
const
index_t
input_channel_blocks
=
RoundUpDiv4
(
input_channels
);
const
index_t
width_blocks
=
RoundUpDiv4
(
width
);
auto
runtime
=
OpenCLRuntime
::
Global
();
if
(
kernel
->
get
()
==
nullptr
)
{
std
::
set
<
std
::
string
>
built_options
;
std
::
string
kernel_name
=
MACE_OBFUSCATE_SYMBOL
(
"conv_2d"
);
...
...
@@ -61,9 +63,13 @@ extern void Conv2dOpencl(cl::Kernel *kernel,
LOG
(
FATAL
)
<<
"Unknown activation type: "
<<
activation
;
}
auto
runtime
=
OpenCLRuntime
::
Global
();
*
kernel
=
runtime
->
BuildKernel
(
"conv_2d"
,
kernel_name
,
built_options
);
}
const
uint32_t
gws
[
3
]
=
{
static_cast
<
uint32_t
>
(
channel_blocks
),
static_cast
<
uint32_t
>
(
width_blocks
),
static_cast
<
uint32_t
>
(
height
*
batch
)};
if
(
!
IsVecEqual
(
*
prev_input_shape
,
input
->
shape
()))
{
uint32_t
idx
=
0
;
kernel
->
setArg
(
idx
++
,
*
(
input
->
opencl_image
()));
...
...
@@ -85,14 +91,16 @@ extern void Conv2dOpencl(cl::Kernel *kernel,
kernel
->
setArg
(
idx
++
,
padding
[
1
]
/
2
);
kernel
->
setArg
(
idx
++
,
dilations
[
0
]);
kernel
->
setArg
(
idx
++
,
dilations
[
1
]);
kernel
->
setArg
(
idx
++
,
gws
[
0
]);
kernel
->
setArg
(
idx
++
,
gws
[
1
]);
kernel
->
setArg
(
idx
++
,
gws
[
2
]);
*
prev_input_shape
=
input
->
shape
();
}
const
uint32_t
gws
[
3
]
=
{
static_cast
<
uint32_t
>
(
channel_blocks
),
static_cast
<
uint32_t
>
(
width_blocks
),
static_cast
<
uint32_t
>
(
height
*
batch
)};
const
std
::
vector
<
uint32_t
>
lws
=
{
8
,
16
,
8
,
1
};
const
uint32_t
kwg_size
=
static_cast
<
uint32_t
>
(
runtime
->
GetKernelMaxWorkGroupSize
(
*
kernel
));
const
std
::
vector
<
uint32_t
>
lws
=
{
8
,
kwg_size
/
64
,
8
,
1
};
std
::
string
tuning_key
=
Concat
(
"conv2d_general_opencl_kernel_"
,
activation
,
output
->
dim
(
0
),
output
->
dim
(
1
),
output
->
dim
(
2
),
output
->
dim
(
3
));
...
...
mace/kernels/opencl/depthwise_conv_opencl.cc
浏览文件 @
0e078d19
...
...
@@ -35,8 +35,14 @@ void DepthwiseConv2d(cl::Kernel *kernel,
const
index_t
channel_blocks
=
RoundUpDiv4
(
channels
);
const
index_t
input_channel_blocks
=
RoundUpDiv4
(
input_channels
);
const
index_t
width_blocks
=
RoundUpDiv4
(
width
);
if
(
kernel
->
get
()
==
nullptr
)
{
const
uint32_t
gws
[
3
]
=
{
static_cast
<
uint32_t
>
(
channel_blocks
),
static_cast
<
uint32_t
>
(
width_blocks
),
static_cast
<
uint32_t
>
(
height
*
batch
)};
auto
runtime
=
OpenCLRuntime
::
Global
();
if
(
kernel
->
get
()
==
nullptr
)
{
std
::
set
<
std
::
string
>
built_options
;
std
::
string
kernel_name
=
MACE_OBFUSCATE_SYMBOL
(
"depthwise_conv2d"
);
if
(
stride
==
1
&&
dilations
[
0
]
==
1
&&
dilations
[
1
]
==
1
)
{
...
...
@@ -104,13 +110,15 @@ void DepthwiseConv2d(cl::Kernel *kernel,
kernel
->
setArg
(
idx
++
,
static_cast
<
int16_t
>
(
dilations
[
0
]));
kernel
->
setArg
(
idx
++
,
static_cast
<
int16_t
>
(
dilations
[
1
]));
}
kernel
->
setArg
(
idx
++
,
gws
[
0
]);
kernel
->
setArg
(
idx
++
,
gws
[
1
]);
kernel
->
setArg
(
idx
++
,
gws
[
2
]);
*
prev_input_shape
=
input
->
shape
();
}
const
uint32_t
gws
[
3
]
=
{
static_cast
<
uint32_t
>
(
channel_blocks
),
static_cast
<
uint32_t
>
(
width_blocks
),
static_cast
<
uint32_t
>
(
height
*
batch
)};
const
std
::
vector
<
uint32_t
>
lws
=
{
8
,
16
,
8
,
1
};
const
uint32_t
kwg_size
=
static_cast
<
uint32_t
>
(
runtime
->
GetKernelMaxWorkGroupSize
(
*
kernel
));
const
std
::
vector
<
uint32_t
>
lws
=
{
8
,
kwg_size
/
64
,
8
,
1
};
std
::
string
tuning_key
=
Concat
(
"depthwise_conv2d_ocl_kernel_"
,
activation
,
batch
,
height
,
width
,
channels
,
multiplier
);
TuningOrRun3DKernel
(
*
kernel
,
tuning_key
,
gws
,
lws
,
future
);
...
...
mace/kernels/opencl/eltwise_opencl.cc
浏览文件 @
0e078d19
...
...
@@ -24,8 +24,12 @@ void EltwiseFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input0,
const
index_t
width_pixels
=
channel_blocks
*
width
;
const
index_t
batch_height_pixels
=
batch
*
height
;
if
(
kernel_
.
get
()
==
nullptr
)
{
const
uint32_t
gws
[
2
]
=
{
static_cast
<
uint32_t
>
(
width_pixels
),
static_cast
<
uint32_t
>
(
batch_height_pixels
)};
auto
runtime
=
OpenCLRuntime
::
Global
();
if
(
kernel_
.
get
()
==
nullptr
)
{
std
::
set
<
std
::
string
>
built_options
;
auto
dt
=
DataTypeToEnum
<
T
>::
value
;
std
::
string
kernel_name
=
MACE_OBFUSCATE_SYMBOL
(
"eltwise"
);
...
...
@@ -45,12 +49,14 @@ void EltwiseFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input0,
kernel_
.
setArg
(
idx
++
,
coeff_
[
1
]);
}
kernel_
.
setArg
(
idx
++
,
*
(
output
->
opencl_image
()));
kernel_
.
setArg
(
idx
++
,
gws
[
0
]);
kernel_
.
setArg
(
idx
++
,
gws
[
1
]);
input_shape_
=
input0
->
shape
();
}
const
uint32_t
gws
[
2
]
=
{
static_cast
<
uint32_t
>
(
width_pixels
),
static_cast
<
uint32_t
>
(
batch_height_pixels
)}
;
const
std
::
vector
<
uint32_t
>
lws
=
{
64
,
16
,
1
};
const
uint32_t
kwg_size
=
static_cast
<
uint32_t
>
(
runtime
->
GetKernelMaxWorkGroupSize
(
kernel_
))
;
const
std
::
vector
<
uint32_t
>
lws
=
{
kwg_size
/
16
,
16
,
1
};
std
::
stringstream
ss
;
ss
<<
"eltwise_opencl_kernel_"
<<
output
->
dim
(
0
)
<<
"_"
<<
output
->
dim
(
1
)
<<
"_"
<<
output
->
dim
(
2
)
<<
"_"
<<
output
->
dim
(
3
);
...
...
mace/kernels/opencl/fully_connected_opencl.cc
浏览文件 @
0e078d19
...
...
@@ -75,6 +75,7 @@ void FCWXKernel(cl::Kernel *kernel,
if
(
!
IsVecEqual
(
*
prev_input_shape
,
input
->
shape
()))
{
const
index_t
batch
=
output
->
dim
(
0
);
const
index_t
output_blocks
=
RoundUpDiv4
(
output
->
dim
(
3
));
(
*
gws
)[
2
]
=
static_cast
<
uint32_t
>
(
batch
*
output_blocks
);
uint32_t
idx
=
0
;
kernel
->
setArg
(
idx
++
,
*
(
input
->
opencl_image
()));
...
...
@@ -90,14 +91,21 @@ void FCWXKernel(cl::Kernel *kernel,
kernel
->
setArg
(
idx
++
,
static_cast
<
int
>
(
RoundUpDiv4
(
input
->
dim
(
3
))));
kernel
->
setArg
(
idx
++
,
static_cast
<
int
>
(
output_blocks
));
kernel
->
setArg
(
idx
++
,
relux_max_limit
);
(
*
gws
)[
2
]
=
static_cast
<
uint32_t
>
(
batch
*
output_blocks
);
kernel
->
setArg
(
idx
++
,
(
*
gws
)[
0
]);
kernel
->
setArg
(
idx
++
,
(
*
gws
)[
1
]);
kernel
->
setArg
(
idx
++
,
(
*
gws
)[
2
]);
*
prev_input_shape
=
input
->
shape
();
}
std
::
vector
<
uint32_t
>
roundup_gws
(
lws
->
size
());
for
(
size_t
i
=
0
;
i
<
lws
->
size
();
++
i
)
{
roundup_gws
[
i
]
=
RoundUp
((
*
gws
)[
i
],
(
*
lws
)[
i
]);
}
cl
::
Event
event
;
cl_int
error
=
runtime
->
command_queue
().
enqueueNDRangeKernel
(
*
kernel
,
cl
::
NullRange
,
cl
::
NDRange
(
(
*
gws
)[
0
],
(
*
gws
)[
1
],
(
*
gws
)
[
2
]),
*
kernel
,
cl
::
NullRange
,
cl
::
NDRange
(
roundup_gws
[
0
],
roundup_gws
[
1
],
roundup_gws
[
2
]),
cl
::
NDRange
((
*
lws
)[
0
],
(
*
lws
)[
1
],
(
*
lws
)[
2
]),
nullptr
,
&
event
);
MACE_CHECK
(
error
==
CL_SUCCESS
)
<<
"Error code: "
<<
error
;
...
...
@@ -161,6 +169,13 @@ void FCWTXKernel(cl::Kernel *kernel,
}
if
(
!
IsVecEqual
(
*
prev_input_shape
,
input
->
shape
()))
{
uint32_t
idx
=
0
;
const
index_t
batch
=
output
->
dim
(
0
);
const
index_t
output_blocks
=
RoundUpDiv4
(
output
->
dim
(
3
));
*
gws
=
{
static_cast
<
uint32_t
>
(
batch
),
static_cast
<
uint32_t
>
(
output_blocks
),
};
kernel
->
setArg
(
idx
++
,
*
(
input
->
opencl_image
()));
kernel
->
setArg
(
idx
++
,
*
(
weight
->
opencl_image
()));
if
(
bias
!=
nullptr
)
{
...
...
@@ -172,13 +187,9 @@ void FCWTXKernel(cl::Kernel *kernel,
kernel
->
setArg
(
idx
++
,
static_cast
<
int
>
(
input
->
dim
(
3
)));
// FIXME handle flexable data type: half not supported
kernel
->
setArg
(
idx
++
,
relux_max_limit
);
kernel
->
setArg
(
idx
++
,
(
*
gws
)[
0
]);
kernel
->
setArg
(
idx
++
,
(
*
gws
)[
1
]);
const
index_t
batch
=
output
->
dim
(
0
);
const
index_t
output_blocks
=
RoundUpDiv4
(
output
->
dim
(
3
));
*
gws
=
{
static_cast
<
uint32_t
>
(
batch
),
static_cast
<
uint32_t
>
(
output_blocks
),
};
*
prev_input_shape
=
input
->
shape
();
}
...
...
mace/kernels/opencl/helper.cc
浏览文件 @
0e078d19
...
...
@@ -226,12 +226,7 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel,
{
4
,
kwg_size
/
28
,
7
,
1
},
{
4
,
kwg_size
/
32
,
8
,
1
},
{
4
,
kwg_size
/
56
,
14
,
1
},
{
3
,
15
,
9
,
1
},
{
7
,
15
,
9
,
1
},
{
9
,
7
,
15
,
1
},
{
15
,
7
,
9
,
1
},
{
1
,
kwg_size
,
1
,
1
},
{
4
,
15
,
8
,
1
},
};
};
cl
::
Event
event
;
...
...
@@ -240,6 +235,11 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel,
MACE_CHECK
(
params
.
size
()
==
4
)
<<
"Tuning parameters of 3D kernel must be 4D"
;
cl_int
error
=
CL_SUCCESS
;
std
::
vector
<
uint32_t
>
roundup_gws
(
3
);
for
(
size_t
i
=
0
;
i
<
3
;
++
i
)
{
roundup_gws
[
i
]
=
RoundUp
(
gws
[
i
],
params
[
i
]);
}
if
(
timer
==
nullptr
)
{
uint32_t
num_blocks
=
params
[
3
];
const
uint32_t
block_size
=
gws
[
2
]
/
num_blocks
;
...
...
@@ -247,16 +247,17 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel,
for
(
uint32_t
i
=
0
;
i
<
num_blocks
;
++
i
)
{
uint32_t
gws2
=
(
i
==
num_blocks
-
1
)
?
(
gws
[
2
]
-
(
i
*
block_size
))
:
block_size
;
uint32_t
roundup_gws2
=
RoundUp
(
gws2
,
params
[
2
]);
error
=
runtime
->
command_queue
().
enqueueNDRangeKernel
(
kernel
,
cl
::
NDRange
(
0
,
0
,
i
*
block_size
),
cl
::
NDRange
(
gws
[
0
],
gws
[
1
],
gws2
),
cl
::
NDRange
(
roundup_gws
[
0
],
roundup_gws
[
1
],
roundup_
gws2
),
cl
::
NDRange
(
params
[
0
],
params
[
1
],
params
[
2
]),
nullptr
,
&
event
);
MACE_CHECK
(
error
==
CL_SUCCESS
)
<<
"Error code: "
<<
error
;
}
}
else
{
timer
->
ClearTiming
();
error
=
runtime
->
command_queue
().
enqueueNDRangeKernel
(
kernel
,
cl
::
NullRange
,
cl
::
NDRange
(
gws
[
0
],
gws
[
1
],
gws
[
2
]),
kernel
,
cl
::
NullRange
,
cl
::
NDRange
(
roundup_gws
[
0
],
roundup_gws
[
1
],
roundup_
gws
[
2
]),
cl
::
NDRange
(
params
[
0
],
params
[
1
],
params
[
2
]),
nullptr
,
&
event
);
MACE_CHECK
(
error
==
CL_SUCCESS
)
<<
"Error code: "
<<
error
;
timer
->
AccumulateTiming
();
...
...
@@ -273,9 +274,10 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel,
for
(
uint32_t
i
=
0
;
i
<
num_blocks
;
++
i
)
{
uint32_t
gws2
=
(
i
==
num_blocks
-
1
)
?
(
gws
[
2
]
-
(
i
*
block_size
))
:
block_size
;
uint32_t
roundup_gws2
=
RoundUp
(
gws2
,
params
[
2
]);
error
=
runtime
->
command_queue
().
enqueueNDRangeKernel
(
kernel
,
cl
::
NDRange
(
0
,
0
,
i
*
block_size
),
cl
::
NDRange
(
gws
[
0
],
gws
[
1
],
gws2
),
cl
::
NDRange
(
roundup_gws
[
0
],
roundup_gws
[
1
],
roundup_
gws2
),
cl
::
NDRange
(
params
[
0
],
params
[
1
],
params
[
2
]),
nullptr
,
&
event
);
MACE_CHECK
(
error
==
CL_SUCCESS
)
<<
"Error code: "
<<
error
;
timer
->
AccumulateTiming
();
...
...
@@ -318,7 +320,6 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel,
{
kwg_size
/
64
,
64
,
1
},
{
kwg_size
/
128
,
128
,
1
},
{
kwg_size
/
256
,
256
,
1
},
{
kwg_size
/
512
,
512
,
1
},
{
kwg_size
,
1
,
1
},
{
1
,
kwg_size
,
1
}};
};
...
...
@@ -328,6 +329,11 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel,
MACE_CHECK
(
params
.
size
()
==
3
)
<<
"Tuning parameters of 2D kernel must be 3d"
;
cl_int
error
=
CL_SUCCESS
;
std
::
vector
<
uint32_t
>
roundup_gws
(
2
);
for
(
size_t
i
=
0
;
i
<
2
;
++
i
)
{
roundup_gws
[
i
]
=
RoundUp
(
gws
[
i
],
params
[
i
]);
}
if
(
timer
==
nullptr
)
{
uint32_t
num_blocks
=
params
[
2
];
const
uint32_t
block_size
=
gws
[
1
]
/
num_blocks
;
...
...
@@ -335,15 +341,16 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel,
for
(
uint32_t
i
=
0
;
i
<
num_blocks
;
++
i
)
{
uint32_t
gws1
=
(
i
==
num_blocks
-
1
)
?
(
gws
[
1
]
-
(
i
*
block_size
))
:
block_size
;
uint32_t
roundup_gws1
=
RoundUp
(
gws1
,
params
[
1
]);
error
=
runtime
->
command_queue
().
enqueueNDRangeKernel
(
kernel
,
cl
::
NDRange
(
0
,
i
*
block_size
),
cl
::
NDRange
(
gws
[
0
],
gws1
),
kernel
,
cl
::
NDRange
(
0
,
i
*
block_size
),
cl
::
NDRange
(
roundup_gws
[
0
],
roundup_
gws1
),
cl
::
NDRange
(
params
[
0
],
params
[
1
]),
nullptr
,
&
event
);
MACE_CHECK
(
error
==
CL_SUCCESS
)
<<
"Error code: "
<<
error
;
}
}
else
{
timer
->
ClearTiming
();
error
=
runtime
->
command_queue
().
enqueueNDRangeKernel
(
kernel
,
cl
::
NullRange
,
cl
::
NDRange
(
gws
[
0
],
gws
[
1
]),
kernel
,
cl
::
NullRange
,
cl
::
NDRange
(
roundup_gws
[
0
],
roundup_
gws
[
1
]),
cl
::
NDRange
(
params
[
0
],
params
[
1
]),
nullptr
,
&
event
);
MACE_CHECK
(
error
==
CL_SUCCESS
)
<<
"Error code: "
<<
error
;
timer
->
AccumulateTiming
();
...
...
@@ -360,8 +367,9 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel,
for
(
uint32_t
i
=
0
;
i
<
num_blocks
;
++
i
)
{
uint32_t
gws1
=
(
i
==
num_blocks
-
1
)
?
(
gws
[
1
]
-
(
i
*
block_size
))
:
block_size
;
uint32_t
roundup_gws1
=
RoundUp
(
gws1
,
params
[
1
]);
error
=
runtime
->
command_queue
().
enqueueNDRangeKernel
(
kernel
,
cl
::
NDRange
(
0
,
i
*
block_size
),
cl
::
NDRange
(
gws
[
0
],
gws1
),
kernel
,
cl
::
NDRange
(
0
,
i
*
block_size
),
cl
::
NDRange
(
roundup_gws
[
0
],
roundup_
gws1
),
cl
::
NDRange
(
params
[
0
],
params
[
1
]),
nullptr
,
&
event
);
MACE_CHECK
(
error
==
CL_SUCCESS
)
<<
"Error code: "
<<
error
;
timer
->
AccumulateTiming
();
...
...
mace/kernels/opencl/matmul.cc
浏览文件 @
0e078d19
...
...
@@ -26,9 +26,14 @@ void MatMulFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *A,
const
index_t
height_blocks
=
RoundUpDiv4
(
height
);
const
index_t
width_blocks
=
RoundUpDiv4
(
width
);
const
uint32_t
gws
[
2
]
=
{
static_cast
<
uint32_t
>
(
width_blocks
),
static_cast
<
uint32_t
>
(
height_blocks
*
batch
),
};
if
(
kernel_
.
get
()
==
nullptr
)
{
auto
runtime
=
OpenCLRuntime
::
Global
();
if
(
kernel_
.
get
()
==
nullptr
)
{
std
::
set
<
std
::
string
>
built_options
;
auto
dt
=
DataTypeToEnum
<
T
>::
value
;
std
::
string
kernel_name
=
MACE_OBFUSCATE_SYMBOL
(
"matmul"
);
...
...
@@ -46,12 +51,12 @@ void MatMulFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *A,
kernel_
.
setArg
(
idx
++
,
static_cast
<
int
>
(
A
->
dim
(
2
)));
kernel_
.
setArg
(
idx
++
,
static_cast
<
int
>
(
height_blocks
));
kernel_
.
setArg
(
idx
++
,
static_cast
<
int
>
(
RoundUpDiv4
(
A
->
dim
(
2
))));
kernel_
.
setArg
(
idx
++
,
gws
[
0
]);
kernel_
.
setArg
(
idx
++
,
gws
[
1
]);
const
uint32_t
gws
[
2
]
=
{
static_cast
<
uint32_t
>
(
width_blocks
),
static_cast
<
uint32_t
>
(
height_blocks
*
batch
),
};
const
std
::
vector
<
uint32_t
>
lws
=
{
16
,
64
,
1
};
const
uint32_t
kwg_size
=
static_cast
<
uint32_t
>
(
runtime
->
GetKernelMaxWorkGroupSize
(
kernel_
));
const
std
::
vector
<
uint32_t
>
lws
=
{
kwg_size
/
64
,
64
,
1
};
std
::
stringstream
ss
;
ss
<<
"matmul_opencl_kernel_"
<<
C
->
dim
(
0
)
<<
"_"
<<
C
->
dim
(
1
)
<<
"_"
<<
C
->
dim
(
2
)
<<
"_"
<<
C
->
dim
(
3
);
...
...
mace/kernels/opencl/pooling_opencl.cc
浏览文件 @
0e078d19
...
...
@@ -18,9 +18,10 @@ void PoolingFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
MACE_CHECK
(
dilations_
[
0
]
==
1
&&
dilations_
[
1
]
==
1
)
<<
"Pooling opencl kernel not support dilation yet"
;
auto
runtime
=
OpenCLRuntime
::
Global
();
if
(
kernel_
.
get
()
==
nullptr
)
{
const
DataType
dt
=
DataTypeToEnum
<
T
>::
value
;
auto
runtime
=
OpenCLRuntime
::
Global
();
std
::
set
<
std
::
string
>
built_options
;
std
::
string
kernel_name
=
MACE_OBFUSCATE_SYMBOL
(
"pooling"
);
built_options
.
emplace
(
"-Dpooling="
+
kernel_name
);
...
...
@@ -37,6 +38,8 @@ void PoolingFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
}
kernel_
=
runtime
->
BuildKernel
(
"pooling"
,
kernel_name
,
built_options
);
}
uint32_t
gws
[
3
];
if
(
!
IsVecEqual
(
input_shape_
,
input
->
shape
()))
{
std
::
vector
<
index_t
>
output_shape
(
4
);
std
::
vector
<
index_t
>
filter_shape
=
{
kernels_
[
0
],
kernels_
[
1
],
...
...
@@ -59,6 +62,17 @@ void PoolingFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
&
output_image_shape
);
output
->
ResizeImage
(
output_shape
,
output_image_shape
);
index_t
batch
=
output
->
dim
(
0
);
index_t
out_height
=
output
->
dim
(
1
);
index_t
out_width
=
output
->
dim
(
2
);
index_t
channels
=
output
->
dim
(
3
);
index_t
channel_blocks
=
(
channels
+
3
)
/
4
;
gws
[
0
]
=
static_cast
<
uint32_t
>
(
channel_blocks
);
gws
[
1
]
=
static_cast
<
uint32_t
>
(
out_width
);
gws
[
2
]
=
static_cast
<
uint32_t
>
(
batch
*
out_height
);
uint32_t
idx
=
0
;
kernel_
.
setArg
(
idx
++
,
*
(
input
->
opencl_image
()));
kernel_
.
setArg
(
idx
++
,
static_cast
<
int32_t
>
(
input
->
dim
(
1
)));
...
...
@@ -69,10 +83,12 @@ void PoolingFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
kernel_
.
setArg
(
idx
++
,
strides_
[
0
]);
kernel_
.
setArg
(
idx
++
,
kernels_
[
0
]);
kernel_
.
setArg
(
idx
++
,
*
(
output
->
opencl_image
()));
kernel_
.
setArg
(
idx
++
,
gws
[
0
]);
kernel_
.
setArg
(
idx
++
,
gws
[
1
]);
kernel_
.
setArg
(
idx
++
,
gws
[
2
]);
input_shape_
=
input
->
shape
();
}
}
else
{
index_t
batch
=
output
->
dim
(
0
);
index_t
out_height
=
output
->
dim
(
1
);
index_t
out_width
=
output
->
dim
(
2
);
...
...
@@ -80,12 +96,14 @@ void PoolingFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
index_t
channel_blocks
=
(
channels
+
3
)
/
4
;
gws
[
0
]
=
static_cast
<
uint32_t
>
(
channel_blocks
);
gws
[
1
]
=
static_cast
<
uint32_t
>
(
out_width
);
gws
[
2
]
=
static_cast
<
uint32_t
>
(
batch
*
out_height
);
}
const
uint32_t
gws
[
3
]
=
{
static_cast
<
uint32_t
>
(
channel_blocks
),
static_cast
<
uint32_t
>
(
out_width
),
static_cast
<
uint32_t
>
(
batch
*
out_height
),
};
std
::
vector
<
uint32_t
>
lws
=
{
8
,
16
,
8
,
1
};
const
uint32_t
kwg_size
=
static_cast
<
uint32_t
>
(
runtime
->
GetKernelMaxWorkGroupSize
(
kernel_
));
std
::
vector
<
uint32_t
>
lws
=
{
8
,
kwg_size
/
64
,
8
,
1
};
std
::
stringstream
ss
;
ss
<<
"pooling_opencl_kernel_"
<<
output
->
dim
(
0
)
<<
"_"
<<
output
->
dim
(
1
)
<<
"_"
<<
output
->
dim
(
2
)
<<
"_"
<<
output
->
dim
(
3
);
...
...
mace/kernels/opencl/resize_bilinear_opencl.cc
浏览文件 @
0e078d19
...
...
@@ -24,8 +24,13 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
const
index_t
out_height
=
out_height_
;
const
index_t
out_width
=
out_width_
;
if
(
kernel_
.
get
()
==
nullptr
)
{
const
uint32_t
gws
[
3
]
=
{
static_cast
<
uint32_t
>
(
channel_blocks
),
static_cast
<
uint32_t
>
(
out_width
),
static_cast
<
uint32_t
>
(
out_height
*
batch
)};
auto
runtime
=
OpenCLRuntime
::
Global
();
if
(
kernel_
.
get
()
==
nullptr
)
{
std
::
set
<
std
::
string
>
built_options
;
std
::
string
kernel_name
=
MACE_OBFUSCATE_SYMBOL
(
"resize_bilinear_nocache"
);
built_options
.
emplace
(
"-Dresize_bilinear_nocache="
+
kernel_name
);
...
...
@@ -57,14 +62,16 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
kernel_
.
setArg
(
idx
++
,
static_cast
<
int32_t
>
(
in_height
));
kernel_
.
setArg
(
idx
++
,
static_cast
<
int32_t
>
(
in_width
));
kernel_
.
setArg
(
idx
++
,
static_cast
<
int32_t
>
(
out_height
));
kernel_
.
setArg
(
idx
++
,
gws
[
0
]);
kernel_
.
setArg
(
idx
++
,
gws
[
1
]);
kernel_
.
setArg
(
idx
++
,
gws
[
2
]);
input_shape_
=
input
->
shape
();
}
const
uint32_t
gws
[
3
]
=
{
static_cast
<
uint32_t
>
(
channel_blocks
),
static_cast
<
uint32_t
>
(
out_width
),
static_cast
<
uint32_t
>
(
out_height
*
batch
)};
const
std
::
vector
<
uint32_t
>
lws
=
{
8
,
16
,
8
,
1
};
const
uint32_t
kwg_size
=
static_cast
<
uint32_t
>
(
runtime
->
GetKernelMaxWorkGroupSize
(
kernel_
));
const
std
::
vector
<
uint32_t
>
lws
=
{
8
,
kwg_size
/
64
,
8
,
1
};
std
::
stringstream
ss
;
ss
<<
"resize_bilinear_opencl_kernel_"
<<
output
->
dim
(
0
)
<<
"_"
<<
output
->
dim
(
1
)
<<
"_"
<<
output
->
dim
(
2
)
<<
"_"
<<
output
->
dim
(
3
);
...
...
mace/kernels/opencl/slice.cc
浏览文件 @
0e078d19
...
...
@@ -29,8 +29,9 @@ void SliceFunctor<DeviceType::OPENCL, T>::operator()(
output_list
[
i
]
->
ResizeImage
(
output_shape
,
image_shape
);
}
if
(
kernel_
.
get
()
==
nullptr
)
{
auto
runtime
=
OpenCLRuntime
::
Global
();
if
(
kernel_
.
get
()
==
nullptr
)
{
std
::
set
<
std
::
string
>
built_options
;
std
::
string
kernel_name
=
MACE_OBFUSCATE_SYMBOL
(
"slice"
);
built_options
.
emplace
(
"-Dslice="
+
kernel_name
);
...
...
@@ -46,7 +47,10 @@ void SliceFunctor<DeviceType::OPENCL, T>::operator()(
static_cast
<
uint32_t
>
(
input
->
dim
(
2
)),
static_cast
<
uint32_t
>
(
input
->
dim
(
0
)
*
input
->
dim
(
1
)),
};
const
std
::
vector
<
uint32_t
>
lws
=
{
8
,
16
,
8
,
1
};
const
uint32_t
kwg_size
=
static_cast
<
uint32_t
>
(
runtime
->
GetKernelMaxWorkGroupSize
(
kernel_
));
const
std
::
vector
<
uint32_t
>
lws
=
{
8
,
kwg_size
/
64
,
8
,
1
};
std
::
stringstream
ss
;
ss
<<
"slice_opencl_kernel_"
<<
input
->
dim
(
0
)
<<
"_"
...
...
@@ -59,6 +63,9 @@ void SliceFunctor<DeviceType::OPENCL, T>::operator()(
kernel_
.
setArg
(
idx
++
,
*
(
input
->
opencl_image
()));
kernel_
.
setArg
(
idx
++
,
static_cast
<
int32_t
>
(
channel_blk
*
i
));
kernel_
.
setArg
(
idx
++
,
*
(
output_list
[
i
]
->
opencl_image
()));
kernel_
.
setArg
(
idx
++
,
gws
[
0
]);
kernel_
.
setArg
(
idx
++
,
gws
[
1
]);
kernel_
.
setArg
(
idx
++
,
gws
[
2
]);
TuningOrRun3DKernel
(
kernel_
,
ss
.
str
(),
gws
,
lws
,
future
);
}
...
...
mace/kernels/opencl/softmax_opencl.cc
浏览文件 @
0e078d19
...
...
@@ -23,9 +23,12 @@ void SoftmaxFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *logits,
const
index_t
channel_blocks
=
RoundUpDiv4
(
channels
);
const
int
remain_channels
=
channel_blocks
*
4
-
channels
;
if
(
kernel_
.
get
()
==
nullptr
)
{
auto
runtime
=
OpenCLRuntime
::
Global
();
const
uint32_t
gws
[
3
]
=
{
static_cast
<
uint32_t
>
(
channel_blocks
),
static_cast
<
uint32_t
>
(
width
),
static_cast
<
uint32_t
>
(
height
*
batch
)};
auto
runtime
=
OpenCLRuntime
::
Global
();
if
(
kernel_
.
get
()
==
nullptr
)
{
std
::
set
<
std
::
string
>
built_options
;
std
::
string
kernel_name
=
MACE_OBFUSCATE_SYMBOL
(
"softmax"
);
built_options
.
emplace
(
"-Dsoftmax="
+
kernel_name
);
...
...
@@ -40,12 +43,15 @@ void SoftmaxFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *logits,
kernel_
.
setArg
(
idx
++
,
static_cast
<
int
>
(
channels
));
kernel_
.
setArg
(
idx
++
,
remain_channels
);
kernel_
.
setArg
(
idx
++
,
*
(
output
->
opencl_image
()));
kernel_
.
setArg
(
idx
++
,
gws
[
0
]);
kernel_
.
setArg
(
idx
++
,
gws
[
1
]);
kernel_
.
setArg
(
idx
++
,
gws
[
2
]);
input_shape_
=
logits
->
shape
();
}
const
uint32_t
gws
[
3
]
=
{
static_cast
<
uint32_t
>
(
channel_blocks
),
static_cast
<
uint32_t
>
(
width
),
static_cast
<
uint32_t
>
(
height
*
batch
)}
;
const
std
::
vector
<
uint32_t
>
lws
=
{
8
,
16
,
8
,
1
};
const
uint32_t
kwg_size
=
static_cast
<
uint32_t
>
(
runtime
->
GetKernelMaxWorkGroupSize
(
kernel_
))
;
const
std
::
vector
<
uint32_t
>
lws
=
{
8
,
kwg_size
/
64
,
8
,
1
};
std
::
stringstream
ss
;
ss
<<
"softmax_opencl_kernel_"
<<
output
->
dim
(
0
)
<<
"_"
<<
output
->
dim
(
1
)
<<
"_"
<<
output
->
dim
(
2
)
<<
"_"
<<
output
->
dim
(
3
);
...
...
mace/kernels/opencl/space_to_batch_opencl.cc
浏览文件 @
0e078d19
...
...
@@ -31,9 +31,15 @@ void SpaceToBatchFunctor<DeviceType::OPENCL, T>::operator()(
batch_tensor
->
ResizeImage
(
output_shape
,
output_image_shape
);
kernel_name
=
"space_to_batch"
;
}
const
uint32_t
chan_blk
=
RoundUpDiv4
<
uint32_t
>
(
batch_tensor
->
dim
(
3
));
const
uint32_t
gws
[
3
]
=
{
chan_blk
,
static_cast
<
uint32_t
>
(
batch_tensor
->
dim
(
2
)),
static_cast
<
uint32_t
>
(
batch_tensor
->
dim
(
0
)
*
batch_tensor
->
dim
(
1
))};
auto
runtime
=
OpenCLRuntime
::
Global
();
if
(
kernel_
.
get
()
==
nullptr
)
{
std
::
string
obfuscated_kernel_name
=
MACE_OBFUSCATE_SYMBOL
(
kernel_name
);
auto
runtime
=
OpenCLRuntime
::
Global
();
std
::
set
<
std
::
string
>
built_options
;
std
::
stringstream
kernel_name_ss
;
kernel_name_ss
<<
"-D"
<<
kernel_name
<<
"="
<<
obfuscated_kernel_name
;
...
...
@@ -61,15 +67,16 @@ void SpaceToBatchFunctor<DeviceType::OPENCL, T>::operator()(
kernel_
.
setArg
(
idx
++
,
static_cast
<
int32_t
>
(
space_tensor
->
dim
(
2
)));
kernel_
.
setArg
(
idx
++
,
static_cast
<
int32_t
>
(
batch_tensor
->
dim
(
1
)));
kernel_
.
setArg
(
idx
++
,
static_cast
<
int32_t
>
(
batch_tensor
->
dim
(
2
)));
kernel_
.
setArg
(
idx
++
,
gws
[
0
]);
kernel_
.
setArg
(
idx
++
,
gws
[
1
]);
kernel_
.
setArg
(
idx
++
,
gws
[
2
]);
space_shape_
=
space_tensor
->
shape
();
}
const
uint32_t
chan_blk
=
RoundUpDiv4
<
uint32_t
>
(
batch_tensor
->
dim
(
3
));
const
uint32_t
gws
[
3
]
=
{
chan_blk
,
static_cast
<
uint32_t
>
(
batch_tensor
->
dim
(
2
)),
static_cast
<
uint32_t
>
(
batch_tensor
->
dim
(
0
)
*
batch_tensor
->
dim
(
1
))};
const
std
::
vector
<
uint32_t
>
lws
=
{
8
,
16
,
8
,
1
};
const
uint32_t
kwg_size
=
static_cast
<
uint32_t
>
(
runtime
->
GetKernelMaxWorkGroupSize
(
kernel_
));
const
std
::
vector
<
uint32_t
>
lws
=
{
8
,
kwg_size
/
64
,
8
,
1
};
std
::
stringstream
ss
;
ss
<<
kernel_name
<<
"_"
<<
batch_tensor
->
dim
(
0
)
<<
"_"
<<
batch_tensor
->
dim
(
1
)
<<
"_"
<<
batch_tensor
->
dim
(
2
)
<<
"_"
...
...
mace/kernels/opencl/winograd_transform.cc
浏览文件 @
0e078d19
...
...
@@ -15,6 +15,8 @@ template <typename T>
void
WinogradTransformFunctor
<
DeviceType
::
OPENCL
,
T
>::
operator
()(
const
Tensor
*
input_tensor
,
Tensor
*
output_tensor
,
StatsFuture
*
future
)
{
auto
runtime
=
OpenCLRuntime
::
Global
();
if
(
kernel_
.
get
()
==
nullptr
)
{
std
::
string
obfuscated_kernel_name
=
MACE_OBFUSCATE_SYMBOL
(
"winograd_transform_2x2"
);
...
...
@@ -24,7 +26,6 @@ void WinogradTransformFunctor<DeviceType::OPENCL, T>::operator()(
DtToUpstreamCLDt
(
DataTypeToEnum
<
T
>::
value
));
built_options
.
emplace
(
"-DCMD_DATA_TYPE="
+
DtToUpstreamCLCMDDt
(
DataTypeToEnum
<
T
>::
value
));
auto
runtime
=
OpenCLRuntime
::
Global
();
kernel_
=
runtime
->
BuildKernel
(
"winograd_transform"
,
obfuscated_kernel_name
,
built_options
);
}
...
...
@@ -44,6 +45,9 @@ void WinogradTransformFunctor<DeviceType::OPENCL, T>::operator()(
const
index_t
round_h
=
(
output_shape
[
1
]
+
1
)
/
2
;
const
index_t
round_w
=
(
output_shape
[
2
]
+
1
)
/
2
;
const
index_t
out_width
=
input_tensor
->
dim
(
0
)
*
round_h
*
round_w
;
const
uint32_t
gws
[
2
]
=
{
static_cast
<
uint32_t
>
(
out_width
),
static_cast
<
uint32_t
>
(
RoundUpDiv4
(
input_tensor
->
dim
(
3
)))};
if
(
!
IsVecEqual
(
input_shape_
,
input_tensor
->
shape
()))
{
output_shape
=
{
16
,
input_tensor
->
dim
(
3
),
out_width
,
1
};
...
...
@@ -61,14 +65,15 @@ void WinogradTransformFunctor<DeviceType::OPENCL, T>::operator()(
kernel_
.
setArg
(
idx
++
,
static_cast
<
uint32_t
>
(
round_w
));
kernel_
.
setArg
(
idx
++
,
static_cast
<
uint32_t
>
(
paddings
[
0
]
/
2
));
kernel_
.
setArg
(
idx
++
,
static_cast
<
uint32_t
>
(
paddings
[
1
]
/
2
));
kernel_
.
setArg
(
idx
++
,
gws
[
0
]);
kernel_
.
setArg
(
idx
++
,
gws
[
1
]);
input_shape_
=
input_tensor
->
shape
();
}
const
uint32_t
gws
[
2
]
=
{
static_cast
<
uint32_t
>
(
out_width
),
static_cast
<
uint32_t
>
(
RoundUpDiv4
(
input_tensor
->
dim
(
3
)))};
const
std
::
vector
<
uint32_t
>
lws
=
{
128
,
8
,
1
};
const
uint32_t
kwg_size
=
static_cast
<
uint32_t
>
(
runtime
->
GetKernelMaxWorkGroupSize
(
kernel_
));
const
std
::
vector
<
uint32_t
>
lws
=
{
kwg_size
/
8
,
8
,
1
};
std
::
stringstream
ss
;
ss
<<
"winograd_transform_kernel_"
<<
input_tensor
->
dim
(
0
)
<<
"_"
<<
input_tensor
->
dim
(
1
)
<<
"_"
<<
input_tensor
->
dim
(
2
)
<<
"_"
...
...
@@ -82,6 +87,9 @@ void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()(
const
Tensor
*
bias
,
Tensor
*
output_tensor
,
StatsFuture
*
future
)
{
auto
runtime
=
OpenCLRuntime
::
Global
();
if
(
kernel_
.
get
()
==
nullptr
)
{
std
::
string
obfuscated_kernel_name
=
MACE_OBFUSCATE_SYMBOL
(
"winograd_inverse_transform_2x2"
);
...
...
@@ -115,10 +123,13 @@ void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()(
LOG
(
FATAL
)
<<
"Unknown activation type: "
<<
activation_
;
}
auto
runtime
=
OpenCLRuntime
::
Global
();
kernel_
=
runtime
->
BuildKernel
(
"winograd_transform"
,
obfuscated_kernel_name
,
built_options
);
}
const
uint32_t
gws
[
2
]
=
{
static_cast
<
uint32_t
>
(
input_tensor
->
dim
(
2
)),
static_cast
<
uint32_t
>
(
RoundUpDiv4
(
input_tensor
->
dim
(
1
)))};
if
(
!
IsVecEqual
(
input_shape_
,
input_tensor
->
shape
()))
{
std
::
vector
<
index_t
>
output_shape
=
{
batch_
,
height_
,
width_
,
input_tensor
->
dim
(
1
)};
...
...
@@ -143,14 +154,15 @@ void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()(
kernel_
.
setArg
(
idx
++
,
static_cast
<
uint32_t
>
(
round_h
*
round_w
));
kernel_
.
setArg
(
idx
++
,
static_cast
<
uint32_t
>
(
round_w
));
kernel_
.
setArg
(
idx
++
,
relux_max_limit_
);
kernel_
.
setArg
(
idx
++
,
gws
[
0
]);
kernel_
.
setArg
(
idx
++
,
gws
[
1
]);
input_shape_
=
input_tensor
->
shape
();
}
const
uint32_t
gws
[
2
]
=
{
static_cast
<
uint32_t
>
(
input_tensor
->
dim
(
2
)),
static_cast
<
uint32_t
>
(
RoundUpDiv4
(
input_tensor
->
dim
(
1
)))};
const
std
::
vector
<
uint32_t
>
lws
=
{
128
,
8
,
1
};
const
uint32_t
kwg_size
=
static_cast
<
uint32_t
>
(
runtime
->
GetKernelMaxWorkGroupSize
(
kernel_
));
const
std
::
vector
<
uint32_t
>
lws
=
{
kwg_size
/
8
,
8
,
1
};
std
::
stringstream
ss
;
ss
<<
"winograd_inverse_transform_kernel_"
<<
input_tensor
->
dim
(
0
)
<<
"_"
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录