Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
Xiaomi
Mace
提交
33b049e6
Mace
项目概览
Xiaomi
/
Mace
通知
107
Star
40
Fork
27
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
Mace
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
提交
33b049e6
编写于
3月 29, 2018
作者:
刘
刘琦
浏览文件
操作
浏览文件
下载
差异文件
Merge branch 'compatible_with_opencl_1.1_and_1.2' into 'master'
Compatible with opencl 1.1 and 1.2 See merge request !333
上级
aeda046c
0fd7ce42
变更
65
隐藏空白更改
内联
并排
Showing
65 changed file
with
1084 addition
and
241 deletion
+1084
-241
mace/core/runtime/opencl/opencl_runtime.cc
mace/core/runtime/opencl/opencl_runtime.cc
+32
-6
mace/core/runtime/opencl/opencl_runtime.h
mace/core/runtime/opencl/opencl_runtime.h
+11
-0
mace/kernels/activation.h
mace/kernels/activation.h
+1
-0
mace/kernels/addn.h
mace/kernels/addn.h
+1
-0
mace/kernels/batch_norm.h
mace/kernels/batch_norm.h
+1
-0
mace/kernels/bias_add.h
mace/kernels/bias_add.h
+1
-0
mace/kernels/channel_shuffle.h
mace/kernels/channel_shuffle.h
+1
-0
mace/kernels/concat.h
mace/kernels/concat.h
+1
-0
mace/kernels/conv_2d.h
mace/kernels/conv_2d.h
+1
-0
mace/kernels/depth_to_space.h
mace/kernels/depth_to_space.h
+1
-0
mace/kernels/depthwise_conv2d.h
mace/kernels/depthwise_conv2d.h
+1
-0
mace/kernels/eltwise.h
mace/kernels/eltwise.h
+1
-0
mace/kernels/matmul.h
mace/kernels/matmul.h
+1
-0
mace/kernels/opencl/activation_opencl.cc
mace/kernels/opencl/activation_opencl.cc
+18
-6
mace/kernels/opencl/addn.cc
mace/kernels/opencl/addn.cc
+17
-4
mace/kernels/opencl/batch_norm_opencl.cc
mace/kernels/opencl/batch_norm_opencl.cc
+19
-5
mace/kernels/opencl/bias_add_opencl.cc
mace/kernels/opencl/bias_add_opencl.cc
+33
-7
mace/kernels/opencl/buffer_to_image.cc
mace/kernels/opencl/buffer_to_image.cc
+32
-7
mace/kernels/opencl/channel_shuffle.cc
mace/kernels/opencl/channel_shuffle.cc
+20
-6
mace/kernels/opencl/cl/activation.cl
mace/kernels/opencl/cl/activation.cl
+12
-1
mace/kernels/opencl/cl/addn.cl
mace/kernels/opencl/cl/addn.cl
+7
-1
mace/kernels/opencl/cl/batch_norm.cl
mace/kernels/opencl/cl/batch_norm.cl
+12
-1
mace/kernels/opencl/cl/bias_add.cl
mace/kernels/opencl/cl/bias_add.cl
+12
-1
mace/kernels/opencl/cl/buffer_to_image.cl
mace/kernels/opencl/cl/buffer_to_image.cl
+120
-12
mace/kernels/opencl/cl/channel_shuffle.cl
mace/kernels/opencl/cl/channel_shuffle.cl
+17
-5
mace/kernels/opencl/cl/common.h
mace/kernels/opencl/cl/common.h
+17
-0
mace/kernels/opencl/cl/concat.cl
mace/kernels/opencl/cl/concat.cl
+28
-4
mace/kernels/opencl/cl/conv_2d.cl
mace/kernels/opencl/cl/conv_2d.cl
+13
-2
mace/kernels/opencl/cl/conv_2d_1x1.cl
mace/kernels/opencl/cl/conv_2d_1x1.cl
+13
-2
mace/kernels/opencl/cl/conv_2d_3x3.cl
mace/kernels/opencl/cl/conv_2d_3x3.cl
+13
-2
mace/kernels/opencl/cl/depth_to_space.cl
mace/kernels/opencl/cl/depth_to_space.cl
+26
-2
mace/kernels/opencl/cl/depthwise_conv2d.cl
mace/kernels/opencl/cl/depthwise_conv2d.cl
+25
-3
mace/kernels/opencl/cl/eltwise.cl
mace/kernels/opencl/cl/eltwise.cl
+7
-1
mace/kernels/opencl/cl/matmul.cl
mace/kernels/opencl/cl/matmul.cl
+8
-1
mace/kernels/opencl/cl/pooling.cl
mace/kernels/opencl/cl/pooling.cl
+14
-2
mace/kernels/opencl/cl/resize_bilinear.cl
mace/kernels/opencl/cl/resize_bilinear.cl
+17
-3
mace/kernels/opencl/cl/slice.cl
mace/kernels/opencl/cl/slice.cl
+14
-2
mace/kernels/opencl/cl/softmax.cl
mace/kernels/opencl/cl/softmax.cl
+13
-1
mace/kernels/opencl/cl/space_to_batch.cl
mace/kernels/opencl/cl/space_to_batch.cl
+20
-2
mace/kernels/opencl/cl/winograd_transform.cl
mace/kernels/opencl/cl/winograd_transform.cl
+23
-2
mace/kernels/opencl/concat.cc
mace/kernels/opencl/concat.cc
+45
-17
mace/kernels/opencl/conv_2d_opencl.cc
mace/kernels/opencl/conv_2d_opencl.cc
+12
-6
mace/kernels/opencl/conv_2d_opencl_1x1.cc
mace/kernels/opencl/conv_2d_opencl_1x1.cc
+21
-6
mace/kernels/opencl/conv_2d_opencl_3x3.cc
mace/kernels/opencl/conv_2d_opencl_3x3.cc
+21
-6
mace/kernels/opencl/conv_2d_opencl_general.cc
mace/kernels/opencl/conv_2d_opencl_general.cc
+21
-6
mace/kernels/opencl/depth_to_space_opencl.cc
mace/kernels/opencl/depth_to_space_opencl.cc
+33
-21
mace/kernels/opencl/depthwise_conv_opencl.cc
mace/kernels/opencl/depthwise_conv_opencl.cc
+24
-7
mace/kernels/opencl/eltwise_opencl.cc
mace/kernels/opencl/eltwise_opencl.cc
+17
-4
mace/kernels/opencl/helper.cc
mace/kernels/opencl/helper.cc
+82
-26
mace/kernels/opencl/matmul.cc
mace/kernels/opencl/matmul.cc
+17
-6
mace/kernels/opencl/pooling_opencl.cc
mace/kernels/opencl/pooling_opencl.cc
+41
-14
mace/kernels/opencl/resize_bilinear_opencl.cc
mace/kernels/opencl/resize_bilinear_opencl.cc
+18
-5
mace/kernels/opencl/slice.cc
mace/kernels/opencl/slice.cc
+15
-2
mace/kernels/opencl/softmax_opencl.cc
mace/kernels/opencl/softmax_opencl.cc
+20
-6
mace/kernels/opencl/space_to_batch_opencl.cc
mace/kernels/opencl/space_to_batch_opencl.cc
+19
-6
mace/kernels/opencl/winograd_transform.cc
mace/kernels/opencl/winograd_transform.cc
+34
-10
mace/kernels/pooling.h
mace/kernels/pooling.h
+1
-0
mace/kernels/resize_bilinear.h
mace/kernels/resize_bilinear.h
+1
-0
mace/kernels/slice.h
mace/kernels/slice.h
+1
-0
mace/kernels/softmax.h
mace/kernels/softmax.h
+1
-0
mace/kernels/space_to_batch.h
mace/kernels/space_to_batch.h
+1
-0
mace/kernels/winograd_transform.h
mace/kernels/winograd_transform.h
+2
-0
tools/build_mace_run.sh
tools/build_mace_run.sh
+5
-0
tools/mace_tools.py
tools/mace_tools.py
+2
-1
tools/packaging_lib.sh
tools/packaging_lib.sh
+6
-1
未找到文件。
mace/core/runtime/opencl/opencl_runtime.cc
浏览文件 @
33b049e6
...
@@ -142,17 +142,17 @@ OpenCLRuntime::OpenCLRuntime(GPUPerfHint gpu_perf_hint,
...
@@ -142,17 +142,17 @@ OpenCLRuntime::OpenCLRuntime(GPUPerfHint gpu_perf_hint,
}
}
bool
gpu_detected
=
false
;
bool
gpu_detected
=
false
;
bool
is_adreno_gpu
=
false
;
device_
=
std
::
make_shared
<
cl
::
Device
>
();
device_
=
std
::
make_shared
<
cl
::
Device
>
();
for
(
auto
device
:
all_devices
)
{
for
(
auto
device
:
all_devices
)
{
if
(
device
.
getInfo
<
CL_DEVICE_TYPE
>
()
==
CL_DEVICE_TYPE_GPU
)
{
if
(
device
.
getInfo
<
CL_DEVICE_TYPE
>
()
==
CL_DEVICE_TYPE_GPU
)
{
*
device_
=
device
;
*
device_
=
device
;
gpu_detected
=
true
;
gpu_detected
=
true
;
const
std
::
string
device_name
=
device
.
getInfo
<
CL_DEVICE_NAME
>
();
const
std
::
string
device_name
=
device
.
getInfo
<
CL_DEVICE_NAME
>
();
constexpr
const
char
*
kQualcommAdrenoGPUStr
=
"QUALCOMM Adreno(TM)"
;
gpu_type_
=
ParseGPUTypeFromDeviceName
(
device_name
)
;
if
(
device_name
==
kQualcommAdrenoGPUStr
)
{
is_adreno_gpu
=
true
;
const
std
::
string
device_version
=
device
.
getInfo
<
CL_DEVICE_VERSION
>
()
;
}
opencl_version_
=
device_version
.
substr
(
7
,
3
);
VLOG
(
1
)
<<
"Using device: "
<<
device_name
;
VLOG
(
1
)
<<
"Using device: "
<<
device_name
;
break
;
break
;
...
@@ -171,7 +171,7 @@ OpenCLRuntime::OpenCLRuntime(GPUPerfHint gpu_perf_hint,
...
@@ -171,7 +171,7 @@ OpenCLRuntime::OpenCLRuntime(GPUPerfHint gpu_perf_hint,
}
}
cl_int
err
;
cl_int
err
;
if
(
is_adreno_gpu
)
{
if
(
gpu_type_
==
GPUType
::
QUALCOMM_ADRENO
)
{
std
::
vector
<
cl_context_properties
>
context_properties
;
std
::
vector
<
cl_context_properties
>
context_properties
;
context_properties
.
reserve
(
5
);
context_properties
.
reserve
(
5
);
GetAdrenoContextProperties
(
&
context_properties
,
gpu_perf_hint
,
GetAdrenoContextProperties
(
&
context_properties
,
gpu_perf_hint
,
...
@@ -350,4 +350,30 @@ uint64_t OpenCLRuntime::GetKernelWaveSize(const cl::Kernel &kernel) {
...
@@ -350,4 +350,30 @@ uint64_t OpenCLRuntime::GetKernelWaveSize(const cl::Kernel &kernel) {
return
size
;
return
size
;
}
}
const
bool
OpenCLRuntime
::
IsNonUniformWorkgroupsSupported
()
{
if
(
gpu_type_
==
GPUType
::
QUALCOMM_ADRENO
&&
opencl_version_
==
"2.0"
)
{
return
true
;
}
else
{
return
false
;
}
}
const
GPUType
OpenCLRuntime
::
ParseGPUTypeFromDeviceName
(
const
std
::
string
&
device_name
)
{
constexpr
const
char
*
kQualcommAdrenoGPUStr
=
"QUALCOMM Adreno(TM)"
;
constexpr
const
char
*
kMaliGPUStr
=
"Mali"
;
constexpr
const
char
*
kPowerVRGPUStr
=
"PowerVR"
;
if
(
device_name
==
kQualcommAdrenoGPUStr
)
{
return
GPUType
::
QUALCOMM_ADRENO
;
}
else
if
(
device_name
.
find
(
kMaliGPUStr
)
!=
std
::
string
::
npos
)
{
return
GPUType
::
MALI
;
}
else
if
(
device_name
.
find
(
kPowerVRGPUStr
)
!=
std
::
string
::
npos
)
{
return
GPUType
::
PowerVR
;
}
else
{
return
GPUType
::
UNKNOWN
;
}
}
}
// namespace mace
}
// namespace mace
mace/core/runtime/opencl/opencl_runtime.h
浏览文件 @
33b049e6
...
@@ -19,6 +19,13 @@
...
@@ -19,6 +19,13 @@
namespace
mace
{
namespace
mace
{
enum
GPUType
{
QUALCOMM_ADRENO
,
MALI
,
PowerVR
,
UNKNOWN
,
};
class
OpenCLProfilingTimer
:
public
Timer
{
class
OpenCLProfilingTimer
:
public
Timer
{
public:
public:
explicit
OpenCLProfilingTimer
(
const
cl
::
Event
*
event
)
explicit
OpenCLProfilingTimer
(
const
cl
::
Event
*
event
)
...
@@ -50,6 +57,8 @@ class OpenCLRuntime {
...
@@ -50,6 +57,8 @@ class OpenCLRuntime {
uint64_t
GetDeviceMaxWorkGroupSize
();
uint64_t
GetDeviceMaxWorkGroupSize
();
uint64_t
GetKernelMaxWorkGroupSize
(
const
cl
::
Kernel
&
kernel
);
uint64_t
GetKernelMaxWorkGroupSize
(
const
cl
::
Kernel
&
kernel
);
uint64_t
GetKernelWaveSize
(
const
cl
::
Kernel
&
kernel
);
uint64_t
GetKernelWaveSize
(
const
cl
::
Kernel
&
kernel
);
const
bool
IsNonUniformWorkgroupsSupported
();
const
GPUType
ParseGPUTypeFromDeviceName
(
const
std
::
string
&
device_name
);
cl
::
Kernel
BuildKernel
(
const
std
::
string
&
program_name
,
cl
::
Kernel
BuildKernel
(
const
std
::
string
&
program_name
,
const
std
::
string
&
kernel_name
,
const
std
::
string
&
kernel_name
,
const
std
::
set
<
std
::
string
>
&
build_options
);
const
std
::
set
<
std
::
string
>
&
build_options
);
...
@@ -75,6 +84,8 @@ class OpenCLRuntime {
...
@@ -75,6 +84,8 @@ class OpenCLRuntime {
std
::
map
<
std
::
string
,
cl
::
Program
>
built_program_map_
;
std
::
map
<
std
::
string
,
cl
::
Program
>
built_program_map_
;
std
::
mutex
program_build_mutex_
;
std
::
mutex
program_build_mutex_
;
std
::
string
kernel_path_
;
std
::
string
kernel_path_
;
GPUType
gpu_type_
;
std
::
string
opencl_version_
;
static
GPUPerfHint
gpu_perf_hint_
;
static
GPUPerfHint
gpu_perf_hint_
;
static
GPUPriorityHint
gpu_priority_hint_
;
static
GPUPriorityHint
gpu_priority_hint_
;
...
...
mace/kernels/activation.h
浏览文件 @
33b049e6
...
@@ -155,6 +155,7 @@ class ActivationFunctor<DeviceType::OPENCL, T> {
...
@@ -155,6 +155,7 @@ class ActivationFunctor<DeviceType::OPENCL, T> {
ActivationType
activation_
;
ActivationType
activation_
;
T
relux_max_limit_
;
T
relux_max_limit_
;
cl
::
Kernel
kernel_
;
cl
::
Kernel
kernel_
;
uint32_t
kwg_size_
;
std
::
string
tuning_key_prefix_
;
std
::
string
tuning_key_prefix_
;
std
::
vector
<
index_t
>
input_shape_
;
std
::
vector
<
index_t
>
input_shape_
;
};
};
...
...
mace/kernels/addn.h
浏览文件 @
33b049e6
...
@@ -90,6 +90,7 @@ struct AddNFunctor<DeviceType::OPENCL, T> {
...
@@ -90,6 +90,7 @@ struct AddNFunctor<DeviceType::OPENCL, T> {
StatsFuture
*
future
);
StatsFuture
*
future
);
cl
::
Kernel
kernel_
;
cl
::
Kernel
kernel_
;
uint32_t
kwg_size_
;
std
::
vector
<
index_t
>
input_shape_
;
std
::
vector
<
index_t
>
input_shape_
;
};
};
...
...
mace/kernels/batch_norm.h
浏览文件 @
33b049e6
...
@@ -157,6 +157,7 @@ struct BatchNormFunctor<DeviceType::OPENCL, T> : BatchNormFunctorBase {
...
@@ -157,6 +157,7 @@ struct BatchNormFunctor<DeviceType::OPENCL, T> : BatchNormFunctorBase {
Tensor
*
output
,
Tensor
*
output
,
StatsFuture
*
future
);
StatsFuture
*
future
);
cl
::
Kernel
kernel_
;
cl
::
Kernel
kernel_
;
uint32_t
kwg_size_
;
std
::
vector
<
index_t
>
input_shape_
;
std
::
vector
<
index_t
>
input_shape_
;
};
};
...
...
mace/kernels/bias_add.h
浏览文件 @
33b049e6
...
@@ -64,6 +64,7 @@ struct BiasAddFunctor<DeviceType::OPENCL, T> {
...
@@ -64,6 +64,7 @@ struct BiasAddFunctor<DeviceType::OPENCL, T> {
Tensor
*
output
,
Tensor
*
output
,
StatsFuture
*
future
);
StatsFuture
*
future
);
cl
::
Kernel
kernel_
;
cl
::
Kernel
kernel_
;
uint32_t
kwg_size_
;
std
::
vector
<
index_t
>
input_shape_
;
std
::
vector
<
index_t
>
input_shape_
;
};
};
...
...
mace/kernels/channel_shuffle.h
浏览文件 @
33b049e6
...
@@ -56,6 +56,7 @@ struct ChannelShuffleFunctor<DeviceType::OPENCL, T> {
...
@@ -56,6 +56,7 @@ struct ChannelShuffleFunctor<DeviceType::OPENCL, T> {
void
operator
()(
const
Tensor
*
input
,
Tensor
*
output
,
StatsFuture
*
future
);
void
operator
()(
const
Tensor
*
input
,
Tensor
*
output
,
StatsFuture
*
future
);
cl
::
Kernel
kernel_
;
cl
::
Kernel
kernel_
;
uint32_t
kwg_size_
;
const
int
groups_
;
const
int
groups_
;
std
::
vector
<
index_t
>
input_shape_
;
std
::
vector
<
index_t
>
input_shape_
;
};
};
...
...
mace/kernels/concat.h
浏览文件 @
33b049e6
...
@@ -85,6 +85,7 @@ struct ConcatFunctor<DeviceType::OPENCL, T> : ConcatFunctorBase {
...
@@ -85,6 +85,7 @@ struct ConcatFunctor<DeviceType::OPENCL, T> : ConcatFunctorBase {
Tensor
*
output
,
Tensor
*
output
,
StatsFuture
*
future
);
StatsFuture
*
future
);
cl
::
Kernel
kernel_
;
cl
::
Kernel
kernel_
;
uint32_t
kwg_size_
;
std
::
vector
<
index_t
>
input_shape_
;
std
::
vector
<
index_t
>
input_shape_
;
};
};
...
...
mace/kernels/conv_2d.h
浏览文件 @
33b049e6
...
@@ -401,6 +401,7 @@ struct Conv2dFunctor<DeviceType::OPENCL, T> : Conv2dFunctorBase {
...
@@ -401,6 +401,7 @@ struct Conv2dFunctor<DeviceType::OPENCL, T> : Conv2dFunctorBase {
StatsFuture
*
future
);
StatsFuture
*
future
);
cl
::
Kernel
kernel_
;
cl
::
Kernel
kernel_
;
uint32_t
kwg_size_
;
std
::
vector
<
index_t
>
input_shape_
;
std
::
vector
<
index_t
>
input_shape_
;
};
};
...
...
mace/kernels/depth_to_space.h
浏览文件 @
33b049e6
...
@@ -108,6 +108,7 @@ struct DepthToSpaceOpFunctor<DeviceType::OPENCL, T> {
...
@@ -108,6 +108,7 @@ struct DepthToSpaceOpFunctor<DeviceType::OPENCL, T> {
void
operator
()(
const
Tensor
*
input
,
Tensor
*
output
,
StatsFuture
*
future
);
void
operator
()(
const
Tensor
*
input
,
Tensor
*
output
,
StatsFuture
*
future
);
cl
::
Kernel
kernel_
;
cl
::
Kernel
kernel_
;
uint32_t
kwg_size_
;
const
int
block_size_
;
const
int
block_size_
;
bool
d2s_
;
bool
d2s_
;
std
::
vector
<
index_t
>
input_shape_
;
std
::
vector
<
index_t
>
input_shape_
;
...
...
mace/kernels/depthwise_conv2d.h
浏览文件 @
33b049e6
...
@@ -437,6 +437,7 @@ struct DepthwiseConv2dFunctor<DeviceType::OPENCL, T>
...
@@ -437,6 +437,7 @@ struct DepthwiseConv2dFunctor<DeviceType::OPENCL, T>
StatsFuture
*
future
);
StatsFuture
*
future
);
cl
::
Kernel
kernel_
;
cl
::
Kernel
kernel_
;
uint32_t
kwg_size_
;
std
::
vector
<
index_t
>
input_shape_
;
std
::
vector
<
index_t
>
input_shape_
;
};
};
...
...
mace/kernels/eltwise.h
浏览文件 @
33b049e6
...
@@ -97,6 +97,7 @@ struct EltwiseFunctor<DeviceType::OPENCL, T> : EltwiseFunctorBase {
...
@@ -97,6 +97,7 @@ struct EltwiseFunctor<DeviceType::OPENCL, T> : EltwiseFunctorBase {
StatsFuture
*
future
);
StatsFuture
*
future
);
cl
::
Kernel
kernel_
;
cl
::
Kernel
kernel_
;
uint32_t
kwg_size_
;
std
::
vector
<
index_t
>
input_shape_
;
std
::
vector
<
index_t
>
input_shape_
;
};
};
...
...
mace/kernels/matmul.h
浏览文件 @
33b049e6
...
@@ -241,6 +241,7 @@ struct MatMulFunctor<DeviceType::OPENCL, T> {
...
@@ -241,6 +241,7 @@ struct MatMulFunctor<DeviceType::OPENCL, T> {
StatsFuture
*
future
);
StatsFuture
*
future
);
cl
::
Kernel
kernel_
;
cl
::
Kernel
kernel_
;
uint32_t
kwg_size_
;
};
};
}
// namespace kernels
}
// namespace kernels
...
...
mace/kernels/opencl/activation_opencl.cc
浏览文件 @
33b049e6
...
@@ -24,15 +24,18 @@ void ActivationFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
...
@@ -24,15 +24,18 @@ void ActivationFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
const
index_t
channel_blocks
=
RoundUpDiv4
(
channels
);
const
index_t
channel_blocks
=
RoundUpDiv4
(
channels
);
if
(
kernel_
.
get
()
==
nullptr
)
{
auto
runtime
=
OpenCLRuntime
::
Global
();
auto
runtime
=
OpenCLRuntime
::
Global
();
if
(
kernel_
.
get
()
==
nullptr
)
{
std
::
set
<
std
::
string
>
built_options
;
std
::
set
<
std
::
string
>
built_options
;
std
::
string
kernel_name
=
MACE_OBFUSCATE_SYMBOL
(
"activation"
);
std
::
string
kernel_name
=
MACE_OBFUSCATE_SYMBOL
(
"activation"
);
built_options
.
emplace
(
"-Dactivation="
+
kernel_name
);
built_options
.
emplace
(
"-Dactivation="
+
kernel_name
);
auto
dt
=
DataTypeToEnum
<
T
>::
value
;
auto
dt
=
DataTypeToEnum
<
T
>::
value
;
built_options
.
emplace
(
"-DDATA_TYPE="
+
DtToUpstreamCLDt
(
dt
));
built_options
.
emplace
(
"-DDATA_TYPE="
+
DtToUpstreamCLDt
(
dt
));
built_options
.
emplace
(
"-DCMD_DATA_TYPE="
+
DtToUpstreamCLCMDDt
(
dt
));
built_options
.
emplace
(
"-DCMD_DATA_TYPE="
+
DtToUpstreamCLCMDDt
(
dt
));
if
(
runtime
->
IsNonUniformWorkgroupsSupported
())
{
built_options
.
emplace
(
"-DNON_UNIFORM_WORK_GROUP"
);
}
switch
(
activation_
)
{
switch
(
activation_
)
{
case
RELU
:
case
RELU
:
tuning_key_prefix_
=
"relu_opencl_kernel_"
;
tuning_key_prefix_
=
"relu_opencl_kernel_"
;
...
@@ -58,10 +61,22 @@ void ActivationFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
...
@@ -58,10 +61,22 @@ void ActivationFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
LOG
(
FATAL
)
<<
"Unknown activation type: "
<<
activation_
;
LOG
(
FATAL
)
<<
"Unknown activation type: "
<<
activation_
;
}
}
kernel_
=
runtime
->
BuildKernel
(
"activation"
,
kernel_name
,
built_options
);
kernel_
=
runtime
->
BuildKernel
(
"activation"
,
kernel_name
,
built_options
);
kwg_size_
=
static_cast
<
uint32_t
>
(
runtime
->
GetKernelMaxWorkGroupSize
(
kernel_
));
}
}
const
uint32_t
gws
[
3
]
=
{
static_cast
<
uint32_t
>
(
channel_blocks
),
static_cast
<
uint32_t
>
(
width
),
static_cast
<
uint32_t
>
(
height
*
batch
)};
if
(
!
IsVecEqual
(
input_shape_
,
input
->
shape
()))
{
if
(
!
IsVecEqual
(
input_shape_
,
input
->
shape
()))
{
int
idx
=
0
;
int
idx
=
0
;
if
(
!
runtime
->
IsNonUniformWorkgroupsSupported
())
{
kernel_
.
setArg
(
idx
++
,
gws
[
0
]);
kernel_
.
setArg
(
idx
++
,
gws
[
1
]);
kernel_
.
setArg
(
idx
++
,
gws
[
2
]);
}
kernel_
.
setArg
(
idx
++
,
*
(
input
->
opencl_image
()));
kernel_
.
setArg
(
idx
++
,
*
(
input
->
opencl_image
()));
if
(
activation_
==
PRELU
)
{
if
(
activation_
==
PRELU
)
{
MACE_CHECK_NOTNULL
(
alpha
);
MACE_CHECK_NOTNULL
(
alpha
);
...
@@ -73,10 +88,7 @@ void ActivationFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
...
@@ -73,10 +88,7 @@ void ActivationFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
input_shape_
=
input
->
shape
();
input_shape_
=
input
->
shape
();
}
}
const
uint32_t
gws
[
3
]
=
{
static_cast
<
uint32_t
>
(
channel_blocks
),
const
std
::
vector
<
uint32_t
>
lws
=
{
8
,
kwg_size_
/
64
,
8
,
1
};
static_cast
<
uint32_t
>
(
width
),
static_cast
<
uint32_t
>
(
height
*
batch
)};
const
std
::
vector
<
uint32_t
>
lws
=
{
8
,
16
,
8
,
1
};
std
::
string
tuning_key
=
std
::
string
tuning_key
=
Concat
(
tuning_key_prefix_
,
output
->
dim
(
0
),
output
->
dim
(
1
),
output
->
dim
(
2
),
Concat
(
tuning_key_prefix_
,
output
->
dim
(
0
),
output
->
dim
(
1
),
output
->
dim
(
2
),
output
->
dim
(
3
));
output
->
dim
(
3
));
...
...
mace/kernels/opencl/addn.cc
浏览文件 @
33b049e6
...
@@ -24,6 +24,8 @@ void AddNFunctor<DeviceType::OPENCL, T>::operator()(
...
@@ -24,6 +24,8 @@ void AddNFunctor<DeviceType::OPENCL, T>::operator()(
const
index_t
width
=
input_tensors
[
0
]
->
dim
(
2
);
const
index_t
width
=
input_tensors
[
0
]
->
dim
(
2
);
const
index_t
channels
=
input_tensors
[
0
]
->
dim
(
3
);
const
index_t
channels
=
input_tensors
[
0
]
->
dim
(
3
);
auto
runtime
=
OpenCLRuntime
::
Global
();
for
(
int
i
=
1
;
i
<
size
;
++
i
)
{
for
(
int
i
=
1
;
i
<
size
;
++
i
)
{
MACE_CHECK_NOTNULL
(
input_tensors
[
i
]);
MACE_CHECK_NOTNULL
(
input_tensors
[
i
]);
MACE_CHECK
(
batch
==
input_tensors
[
i
]
->
dim
(
0
));
MACE_CHECK
(
batch
==
input_tensors
[
i
]
->
dim
(
0
));
...
@@ -36,7 +38,6 @@ void AddNFunctor<DeviceType::OPENCL, T>::operator()(
...
@@ -36,7 +38,6 @@ void AddNFunctor<DeviceType::OPENCL, T>::operator()(
if
(
input_tensors
.
size
()
>
4
)
{
if
(
input_tensors
.
size
()
>
4
)
{
MACE_NOT_IMPLEMENTED
;
MACE_NOT_IMPLEMENTED
;
}
}
auto
runtime
=
OpenCLRuntime
::
Global
();
std
::
set
<
std
::
string
>
built_options
;
std
::
set
<
std
::
string
>
built_options
;
auto
dt
=
DataTypeToEnum
<
T
>::
value
;
auto
dt
=
DataTypeToEnum
<
T
>::
value
;
std
::
string
kernel_name
=
MACE_OBFUSCATE_SYMBOL
(
"addn"
);
std
::
string
kernel_name
=
MACE_OBFUSCATE_SYMBOL
(
"addn"
);
...
@@ -44,7 +45,14 @@ void AddNFunctor<DeviceType::OPENCL, T>::operator()(
...
@@ -44,7 +45,14 @@ void AddNFunctor<DeviceType::OPENCL, T>::operator()(
built_options
.
emplace
(
"-DDATA_TYPE="
+
DtToUpstreamCLDt
(
dt
));
built_options
.
emplace
(
"-DDATA_TYPE="
+
DtToUpstreamCLDt
(
dt
));
built_options
.
emplace
(
"-DCMD_DATA_TYPE="
+
DtToUpstreamCLCMDDt
(
dt
));
built_options
.
emplace
(
"-DCMD_DATA_TYPE="
+
DtToUpstreamCLCMDDt
(
dt
));
built_options
.
emplace
(
MakeString
(
"-DINPUT_NUM="
,
input_tensors
.
size
()));
built_options
.
emplace
(
MakeString
(
"-DINPUT_NUM="
,
input_tensors
.
size
()));
if
(
runtime
->
IsNonUniformWorkgroupsSupported
())
{
built_options
.
emplace
(
"-DNON_UNIFORM_WORK_GROUP"
);
}
kernel_
=
runtime
->
BuildKernel
(
"addn"
,
kernel_name
,
built_options
);
kernel_
=
runtime
->
BuildKernel
(
"addn"
,
kernel_name
,
built_options
);
kwg_size_
=
static_cast
<
uint32_t
>
(
runtime
->
GetKernelMaxWorkGroupSize
(
kernel_
));
}
}
std
::
vector
<
index_t
>
output_shape
=
input_tensors
[
0
]
->
shape
();
std
::
vector
<
index_t
>
output_shape
=
input_tensors
[
0
]
->
shape
();
...
@@ -53,6 +61,9 @@ void AddNFunctor<DeviceType::OPENCL, T>::operator()(
...
@@ -53,6 +61,9 @@ void AddNFunctor<DeviceType::OPENCL, T>::operator()(
const
index_t
width_pixels
=
channel_blocks
*
width
;
const
index_t
width_pixels
=
channel_blocks
*
width
;
const
index_t
batch_height_pixels
=
batch
*
height
;
const
index_t
batch_height_pixels
=
batch
*
height
;
const
uint32_t
gws
[
2
]
=
{
static_cast
<
uint32_t
>
(
width_pixels
),
static_cast
<
uint32_t
>
(
batch_height_pixels
)};
if
(
!
IsVecEqual
(
input_shape_
,
input_tensors
[
0
]
->
shape
()))
{
if
(
!
IsVecEqual
(
input_shape_
,
input_tensors
[
0
]
->
shape
()))
{
std
::
vector
<
size_t
>
output_image_shape
;
std
::
vector
<
size_t
>
output_image_shape
;
CalImage2DShape
(
output_shape
,
BufferType
::
IN_OUT_CHANNEL
,
CalImage2DShape
(
output_shape
,
BufferType
::
IN_OUT_CHANNEL
,
...
@@ -60,6 +71,10 @@ void AddNFunctor<DeviceType::OPENCL, T>::operator()(
...
@@ -60,6 +71,10 @@ void AddNFunctor<DeviceType::OPENCL, T>::operator()(
output_tensor
->
ResizeImage
(
output_shape
,
output_image_shape
);
output_tensor
->
ResizeImage
(
output_shape
,
output_image_shape
);
uint32_t
idx
=
0
;
uint32_t
idx
=
0
;
if
(
!
runtime
->
IsNonUniformWorkgroupsSupported
())
{
kernel_
.
setArg
(
idx
++
,
gws
[
0
]);
kernel_
.
setArg
(
idx
++
,
gws
[
1
]);
}
for
(
auto
input
:
input_tensors
)
{
for
(
auto
input
:
input_tensors
)
{
kernel_
.
setArg
(
idx
++
,
*
(
input
->
opencl_image
()));
kernel_
.
setArg
(
idx
++
,
*
(
input
->
opencl_image
()));
}
}
...
@@ -68,9 +83,7 @@ void AddNFunctor<DeviceType::OPENCL, T>::operator()(
...
@@ -68,9 +83,7 @@ void AddNFunctor<DeviceType::OPENCL, T>::operator()(
input_shape_
=
input_tensors
[
0
]
->
shape
();
input_shape_
=
input_tensors
[
0
]
->
shape
();
}
}
const
uint32_t
gws
[
2
]
=
{
static_cast
<
uint32_t
>
(
width_pixels
),
const
std
::
vector
<
uint32_t
>
lws
=
{
kwg_size_
/
16
,
16
,
1
};
static_cast
<
uint32_t
>
(
batch_height_pixels
)};
const
std
::
vector
<
uint32_t
>
lws
=
{
64
,
16
,
1
};
std
::
stringstream
ss
;
std
::
stringstream
ss
;
ss
<<
"addn_opencl_kernel_"
<<
output_shape
[
0
]
<<
"_"
<<
output_shape
[
1
]
ss
<<
"addn_opencl_kernel_"
<<
output_shape
[
0
]
<<
"_"
<<
output_shape
[
1
]
<<
"_"
<<
output_shape
[
2
]
<<
"_"
<<
output_shape
[
3
];
<<
"_"
<<
output_shape
[
2
]
<<
"_"
<<
output_shape
[
3
];
...
...
mace/kernels/opencl/batch_norm_opencl.cc
浏览文件 @
33b049e6
...
@@ -30,14 +30,23 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
...
@@ -30,14 +30,23 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
const
index_t
channel_blocks
=
RoundUpDiv4
(
channels
);
const
index_t
channel_blocks
=
RoundUpDiv4
(
channels
);
const
uint32_t
gws
[
3
]
=
{
static_cast
<
uint32_t
>
(
channel_blocks
),
static_cast
<
uint32_t
>
(
width
),
static_cast
<
uint32_t
>
(
height
*
batch
)};
auto
runtime
=
OpenCLRuntime
::
Global
();
if
(
kernel_
.
get
()
==
nullptr
)
{
if
(
kernel_
.
get
()
==
nullptr
)
{
auto
runtime
=
OpenCLRuntime
::
Global
();
std
::
set
<
std
::
string
>
built_options
;
std
::
set
<
std
::
string
>
built_options
;
auto
dt
=
DataTypeToEnum
<
T
>::
value
;
auto
dt
=
DataTypeToEnum
<
T
>::
value
;
std
::
string
kernel_name
=
MACE_OBFUSCATE_SYMBOL
(
"batch_norm"
);
std
::
string
kernel_name
=
MACE_OBFUSCATE_SYMBOL
(
"batch_norm"
);
built_options
.
emplace
(
"-Dbatch_norm="
+
kernel_name
);
built_options
.
emplace
(
"-Dbatch_norm="
+
kernel_name
);
built_options
.
emplace
(
"-DDATA_TYPE="
+
DtToUpstreamCLDt
(
dt
));
built_options
.
emplace
(
"-DDATA_TYPE="
+
DtToUpstreamCLDt
(
dt
));
built_options
.
emplace
(
"-DCMD_DATA_TYPE="
+
DtToUpstreamCLCMDDt
(
dt
));
built_options
.
emplace
(
"-DCMD_DATA_TYPE="
+
DtToUpstreamCLCMDDt
(
dt
));
if
(
runtime
->
IsNonUniformWorkgroupsSupported
())
{
built_options
.
emplace
(
"-DNON_UNIFORM_WORK_GROUP"
);
}
if
(
folded_constant_
)
{
if
(
folded_constant_
)
{
built_options
.
emplace
(
"-DFOLDED_CONSTANT"
);
built_options
.
emplace
(
"-DFOLDED_CONSTANT"
);
}
}
...
@@ -61,9 +70,17 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
...
@@ -61,9 +70,17 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
}
}
kernel_
=
runtime
->
BuildKernel
(
"batch_norm"
,
kernel_name
,
built_options
);
kernel_
=
runtime
->
BuildKernel
(
"batch_norm"
,
kernel_name
,
built_options
);
kwg_size_
=
static_cast
<
uint32_t
>
(
runtime
->
GetKernelMaxWorkGroupSize
(
kernel_
));
}
}
if
(
!
IsVecEqual
(
input_shape_
,
input
->
shape
()))
{
if
(
!
IsVecEqual
(
input_shape_
,
input
->
shape
()))
{
uint32_t
idx
=
0
;
uint32_t
idx
=
0
;
if
(
!
runtime
->
IsNonUniformWorkgroupsSupported
())
{
kernel_
.
setArg
(
idx
++
,
gws
[
0
]);
kernel_
.
setArg
(
idx
++
,
gws
[
1
]);
kernel_
.
setArg
(
idx
++
,
gws
[
2
]);
}
kernel_
.
setArg
(
idx
++
,
*
(
input
->
opencl_image
()));
kernel_
.
setArg
(
idx
++
,
*
(
input
->
opencl_image
()));
kernel_
.
setArg
(
idx
++
,
*
(
scale
->
opencl_image
()));
kernel_
.
setArg
(
idx
++
,
*
(
scale
->
opencl_image
()));
kernel_
.
setArg
(
idx
++
,
*
(
offset
->
opencl_image
()));
kernel_
.
setArg
(
idx
++
,
*
(
offset
->
opencl_image
()));
...
@@ -78,10 +95,7 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
...
@@ -78,10 +95,7 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
input_shape_
=
input
->
shape
();
input_shape_
=
input
->
shape
();
}
}
const
uint32_t
gws
[
3
]
=
{
static_cast
<
uint32_t
>
(
channel_blocks
),
const
std
::
vector
<
uint32_t
>
lws
=
{
8
,
kwg_size_
/
64
,
8
,
1
};
static_cast
<
uint32_t
>
(
width
),
static_cast
<
uint32_t
>
(
height
*
batch
)};
const
std
::
vector
<
uint32_t
>
lws
=
{
8
,
16
,
8
,
1
};
std
::
string
tuning_key
=
std
::
string
tuning_key
=
Concat
(
"batch_norm_opencl_kernel_"
,
activation_
,
output
->
dim
(
0
),
Concat
(
"batch_norm_opencl_kernel_"
,
activation_
,
output
->
dim
(
0
),
output
->
dim
(
1
),
output
->
dim
(
2
),
output
->
dim
(
3
),
folded_constant_
);
output
->
dim
(
1
),
output
->
dim
(
2
),
output
->
dim
(
3
),
folded_constant_
);
...
...
mace/kernels/opencl/bias_add_opencl.cc
浏览文件 @
33b049e6
...
@@ -23,7 +23,12 @@ void BiasAddFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
...
@@ -23,7 +23,12 @@ void BiasAddFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
const
index_t
channel_blocks
=
RoundUpDiv4
(
channels
);
const
index_t
channel_blocks
=
RoundUpDiv4
(
channels
);
const
uint32_t
gws
[
3
]
=
{
static_cast
<
uint32_t
>
(
channel_blocks
),
static_cast
<
uint32_t
>
(
width
),
static_cast
<
uint32_t
>
(
height
*
batch
)};
auto
runtime
=
OpenCLRuntime
::
Global
();
auto
runtime
=
OpenCLRuntime
::
Global
();
if
(
kernel_
.
get
()
==
nullptr
)
{
if
(
kernel_
.
get
()
==
nullptr
)
{
std
::
set
<
std
::
string
>
built_options
;
std
::
set
<
std
::
string
>
built_options
;
auto
dt
=
DataTypeToEnum
<
T
>::
value
;
auto
dt
=
DataTypeToEnum
<
T
>::
value
;
...
@@ -31,25 +36,46 @@ void BiasAddFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
...
@@ -31,25 +36,46 @@ void BiasAddFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
built_options
.
emplace
(
"-Dbias_add="
+
kernel_name
);
built_options
.
emplace
(
"-Dbias_add="
+
kernel_name
);
built_options
.
emplace
(
"-DDATA_TYPE="
+
DtToUpstreamCLDt
(
dt
));
built_options
.
emplace
(
"-DDATA_TYPE="
+
DtToUpstreamCLDt
(
dt
));
built_options
.
emplace
(
"-DCMD_DATA_TYPE="
+
DtToUpstreamCLCMDDt
(
dt
));
built_options
.
emplace
(
"-DCMD_DATA_TYPE="
+
DtToUpstreamCLCMDDt
(
dt
));
if
(
runtime
->
IsNonUniformWorkgroupsSupported
())
{
built_options
.
emplace
(
"-DNON_UNIFORM_WORK_GROUP"
);
}
kernel_
=
runtime
->
BuildKernel
(
"bias_add"
,
kernel_name
,
built_options
);
kernel_
=
runtime
->
BuildKernel
(
"bias_add"
,
kernel_name
,
built_options
);
kwg_size_
=
static_cast
<
uint32_t
>
(
runtime
->
GetKernelMaxWorkGroupSize
(
kernel_
));
}
}
if
(
!
IsVecEqual
(
input_shape_
,
input
->
shape
()))
{
if
(
!
IsVecEqual
(
input_shape_
,
input
->
shape
()))
{
uint32_t
idx
=
0
;
uint32_t
idx
=
0
;
if
(
!
runtime
->
IsNonUniformWorkgroupsSupported
())
{
kernel_
.
setArg
(
idx
++
,
gws
[
0
]);
kernel_
.
setArg
(
idx
++
,
gws
[
1
]);
kernel_
.
setArg
(
idx
++
,
gws
[
2
]);
}
kernel_
.
setArg
(
idx
++
,
*
(
input
->
opencl_image
()));
kernel_
.
setArg
(
idx
++
,
*
(
input
->
opencl_image
()));
kernel_
.
setArg
(
idx
++
,
*
(
bias
->
opencl_image
()));
kernel_
.
setArg
(
idx
++
,
*
(
bias
->
opencl_image
()));
kernel_
.
setArg
(
idx
++
,
*
(
output
->
opencl_image
()));
kernel_
.
setArg
(
idx
++
,
*
(
output
->
opencl_image
()));
input_shape_
=
input
->
shape
();
input_shape_
=
input
->
shape
();
}
}
const
uint32_t
gws
[
3
]
=
{
static_cast
<
uint32_t
>
(
channel_blocks
),
const
std
::
vector
<
uint32_t
>
lws
=
{
8
,
kwg_size_
/
64
,
8
};
static_cast
<
uint32_t
>
(
width
),
static_cast
<
uint32_t
>
(
height
*
batch
)};
const
std
::
vector
<
uint32_t
>
lws
=
{
8
,
16
,
8
};
cl
::
Event
event
;
cl
::
Event
event
;
cl_int
error
=
runtime
->
command_queue
().
enqueueNDRangeKernel
(
cl_int
error
;
kernel_
,
cl
::
NullRange
,
cl
::
NDRange
(
gws
[
0
],
gws
[
1
],
gws
[
2
]),
if
(
runtime
->
IsNonUniformWorkgroupsSupported
())
{
cl
::
NDRange
(
lws
[
0
],
lws
[
1
],
lws
[
2
]),
nullptr
,
&
event
);
error
=
runtime
->
command_queue
().
enqueueNDRangeKernel
(
kernel_
,
cl
::
NullRange
,
cl
::
NDRange
(
gws
[
0
],
gws
[
1
],
gws
[
2
]),
cl
::
NDRange
(
lws
[
0
],
lws
[
1
],
lws
[
2
]),
nullptr
,
&
event
);
}
else
{
std
::
vector
<
uint32_t
>
roundup_gws
(
lws
.
size
());
for
(
size_t
i
=
0
;
i
<
lws
.
size
();
++
i
)
{
roundup_gws
[
i
]
=
RoundUp
(
gws
[
i
],
lws
[
i
]);
}
error
=
runtime
->
command_queue
().
enqueueNDRangeKernel
(
kernel_
,
cl
::
NullRange
,
cl
::
NDRange
(
roundup_gws
[
0
],
roundup_gws
[
1
],
roundup_gws
[
2
]),
cl
::
NDRange
(
lws
[
0
],
lws
[
1
],
lws
[
2
]),
nullptr
,
&
event
);
}
MACE_CHECK
(
error
==
CL_SUCCESS
);
MACE_CHECK
(
error
==
CL_SUCCESS
);
if
(
future
!=
nullptr
)
{
if
(
future
!=
nullptr
)
{
future
->
wait_fn
=
[
runtime
,
event
](
CallStats
*
stats
)
{
future
->
wait_fn
=
[
runtime
,
event
](
CallStats
*
stats
)
{
...
...
mace/kernels/opencl/buffer_to_image.cc
浏览文件 @
33b049e6
...
@@ -26,7 +26,8 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(
...
@@ -26,7 +26,8 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(
buffer
->
Resize
(
image
->
shape
());
buffer
->
Resize
(
image
->
shape
());
}
}
size_t
gws
[
2
]
=
{
image_shape
[
0
],
image_shape
[
1
]};
uint32_t
gws
[
2
]
=
{
static_cast
<
uint32_t
>
(
image_shape
[
0
]),
static_cast
<
uint32_t
>
(
image_shape
[
1
])};
std
::
string
kernel_name
;
std
::
string
kernel_name
;
switch
(
type
)
{
switch
(
type
)
{
case
CONV2D_FILTER
:
case
CONV2D_FILTER
:
...
@@ -58,11 +59,17 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(
...
@@ -58,11 +59,17 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(
:
"winograd_filter_buffer_to_image"
;
:
"winograd_filter_buffer_to_image"
;
break
;
break
;
}
}
auto
runtime
=
OpenCLRuntime
::
Global
();
std
::
string
obfuscated_kernel_name
=
MACE_OBFUSCATE_SYMBOL
(
kernel_name
);
std
::
string
obfuscated_kernel_name
=
MACE_OBFUSCATE_SYMBOL
(
kernel_name
);
std
::
set
<
std
::
string
>
built_options
;
std
::
set
<
std
::
string
>
built_options
;
std
::
stringstream
kernel_name_ss
;
std
::
stringstream
kernel_name_ss
;
kernel_name_ss
<<
"-D"
<<
kernel_name
<<
"="
<<
obfuscated_kernel_name
;
kernel_name_ss
<<
"-D"
<<
kernel_name
<<
"="
<<
obfuscated_kernel_name
;
built_options
.
emplace
(
kernel_name_ss
.
str
());
built_options
.
emplace
(
kernel_name_ss
.
str
());
if
(
runtime
->
IsNonUniformWorkgroupsSupported
())
{
built_options
.
emplace
(
"-DNON_UNIFORM_WORK_GROUP"
);
}
if
(
buffer
->
dtype
()
==
image
->
dtype
())
{
if
(
buffer
->
dtype
()
==
image
->
dtype
())
{
built_options
.
emplace
(
"-DDATA_TYPE="
+
DtToCLDt
(
DataTypeToEnum
<
T
>::
value
));
built_options
.
emplace
(
"-DDATA_TYPE="
+
DtToCLDt
(
DataTypeToEnum
<
T
>::
value
));
built_options
.
emplace
(
"-DCMD_DATA_TYPE="
+
built_options
.
emplace
(
"-DCMD_DATA_TYPE="
+
...
@@ -73,11 +80,14 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(
...
@@ -73,11 +80,14 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(
built_options
.
emplace
(
"-DCMD_DATA_TYPE="
+
built_options
.
emplace
(
"-DCMD_DATA_TYPE="
+
DtToUpstreamCLCMDDt
(
DataTypeToEnum
<
T
>::
value
));
DtToUpstreamCLCMDDt
(
DataTypeToEnum
<
T
>::
value
));
}
}
auto
runtime
=
OpenCLRuntime
::
Global
();
auto
b2f_kernel
=
runtime
->
BuildKernel
(
"buffer_to_image"
,
auto
b2f_kernel
=
runtime
->
BuildKernel
(
"buffer_to_image"
,
obfuscated_kernel_name
,
built_options
);
obfuscated_kernel_name
,
built_options
);
uint32_t
idx
=
0
;
uint32_t
idx
=
0
;
if
(
!
runtime
->
IsNonUniformWorkgroupsSupported
())
{
b2f_kernel
.
setArg
(
idx
++
,
gws
[
0
]);
b2f_kernel
.
setArg
(
idx
++
,
gws
[
1
]);
}
b2f_kernel
.
setArg
(
idx
++
,
*
(
buffer
->
opencl_buffer
()));
b2f_kernel
.
setArg
(
idx
++
,
*
(
buffer
->
opencl_buffer
()));
if
(
!
i2b_
)
{
if
(
!
i2b_
)
{
MACE_CHECK
(
buffer
->
buffer_offset
()
%
GetEnumTypeSize
(
buffer
->
dtype
())
==
0
,
MACE_CHECK
(
buffer
->
buffer_offset
()
%
GetEnumTypeSize
(
buffer
->
dtype
())
==
0
,
...
@@ -103,13 +113,28 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(
...
@@ -103,13 +113,28 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(
b2f_kernel
.
setArg
(
idx
++
,
static_cast
<
uint32_t
>
(
buffer
->
dim
(
3
)));
b2f_kernel
.
setArg
(
idx
++
,
static_cast
<
uint32_t
>
(
buffer
->
dim
(
3
)));
}
}
b2f_kernel
.
setArg
(
idx
++
,
*
(
image
->
opencl_image
()));
b2f_kernel
.
setArg
(
idx
++
,
*
(
image
->
opencl_image
()));
const
std
::
vector
<
uint32_t
>
lws
=
{
16
,
64
};
const
uint32_t
kwg_size
=
static_cast
<
uint32_t
>
(
runtime
->
GetKernelMaxWorkGroupSize
(
b2f_kernel
));
const
std
::
vector
<
uint32_t
>
lws
=
{
16
,
kwg_size
/
16
};
cl
::
Event
event
;
cl
::
Event
event
;
cl_int
error
=
runtime
->
command_queue
().
enqueueNDRangeKernel
(
cl_int
error
;
b2f_kernel
,
cl
::
NullRange
,
cl
::
NDRange
(
gws
[
0
],
gws
[
1
]),
if
(
runtime
->
IsNonUniformWorkgroupsSupported
())
{
cl
::
NDRange
(
lws
[
0
],
lws
[
1
]),
nullptr
,
&
event
);
error
=
runtime
->
command_queue
().
enqueueNDRangeKernel
(
MACE_CHECK
(
error
==
CL_SUCCESS
)
<<
"Error code: "
<<
error
;
b2f_kernel
,
cl
::
NullRange
,
cl
::
NDRange
(
gws
[
0
],
gws
[
1
]),
cl
::
NDRange
(
lws
[
0
],
lws
[
1
]),
nullptr
,
&
event
);
}
else
{
std
::
vector
<
uint32_t
>
roundup_gws
(
lws
.
size
());
for
(
size_t
i
=
0
;
i
<
lws
.
size
();
++
i
)
{
roundup_gws
[
i
]
=
RoundUp
(
gws
[
i
],
lws
[
i
]);
}
error
=
runtime
->
command_queue
().
enqueueNDRangeKernel
(
b2f_kernel
,
cl
::
NullRange
,
cl
::
NDRange
(
roundup_gws
[
0
],
roundup_gws
[
1
]),
cl
::
NDRange
(
lws
[
0
],
lws
[
1
]),
nullptr
,
&
event
);
}
MACE_CHECK
(
error
==
CL_SUCCESS
)
<<
"Error code: "
<<
error
;
if
(
future
!=
nullptr
)
{
if
(
future
!=
nullptr
)
{
future
->
wait_fn
=
[
runtime
,
event
](
CallStats
*
stats
)
{
future
->
wait_fn
=
[
runtime
,
event
](
CallStats
*
stats
)
{
event
.
wait
();
event
.
wait
();
...
...
mace/kernels/opencl/channel_shuffle.cc
浏览文件 @
33b049e6
...
@@ -30,20 +30,36 @@ void ChannelShuffleFunctor<DeviceType::OPENCL, T>::operator()(
...
@@ -30,20 +30,36 @@ void ChannelShuffleFunctor<DeviceType::OPENCL, T>::operator()(
"groups must be multiple of 4"
);
"groups must be multiple of 4"
);
const
index_t
group_channel_blocks
=
RoundUpDiv4
(
channels_per_group
);
const
index_t
group_channel_blocks
=
RoundUpDiv4
(
channels_per_group
);
if
(
kernel_
.
get
()
==
nullptr
)
{
const
uint32_t
gws
[
3
]
=
{
static_cast
<
uint32_t
>
(
group_channel_blocks
),
auto
runtime
=
OpenCLRuntime
::
Global
();
static_cast
<
uint32_t
>
(
width
),
static_cast
<
uint32_t
>
(
height
*
batch
)};
auto
runtime
=
OpenCLRuntime
::
Global
();
if
(
kernel_
.
get
()
==
nullptr
)
{
std
::
set
<
std
::
string
>
built_options
;
std
::
set
<
std
::
string
>
built_options
;
std
::
string
kernel_name
=
MACE_OBFUSCATE_SYMBOL
(
"channel_shuffle"
);
std
::
string
kernel_name
=
MACE_OBFUSCATE_SYMBOL
(
"channel_shuffle"
);
built_options
.
emplace
(
"-Dchannel_shuffle="
+
kernel_name
);
built_options
.
emplace
(
"-Dchannel_shuffle="
+
kernel_name
);
auto
dt
=
DataTypeToEnum
<
T
>::
value
;
auto
dt
=
DataTypeToEnum
<
T
>::
value
;
built_options
.
emplace
(
"-DDATA_TYPE="
+
DtToUpstreamCLDt
(
dt
));
built_options
.
emplace
(
"-DDATA_TYPE="
+
DtToUpstreamCLDt
(
dt
));
built_options
.
emplace
(
"-DCMD_DATA_TYPE="
+
DtToUpstreamCLCMDDt
(
dt
));
built_options
.
emplace
(
"-DCMD_DATA_TYPE="
+
DtToUpstreamCLCMDDt
(
dt
));
if
(
runtime
->
IsNonUniformWorkgroupsSupported
())
{
built_options
.
emplace
(
"-DNON_UNIFORM_WORK_GROUP"
);
}
kernel_
=
runtime
->
BuildKernel
(
"channel_shuffle"
,
kernel_name
,
kernel_
=
runtime
->
BuildKernel
(
"channel_shuffle"
,
kernel_name
,
built_options
);
built_options
);
kwg_size_
=
static_cast
<
uint32_t
>
(
runtime
->
GetKernelMaxWorkGroupSize
(
kernel_
));
}
}
if
(
!
IsVecEqual
(
input_shape_
,
input
->
shape
()))
{
if
(
!
IsVecEqual
(
input_shape_
,
input
->
shape
()))
{
uint32_t
idx
=
0
;
uint32_t
idx
=
0
;
if
(
!
runtime
->
IsNonUniformWorkgroupsSupported
())
{
kernel_
.
setArg
(
idx
++
,
gws
[
0
]);
kernel_
.
setArg
(
idx
++
,
gws
[
1
]);
kernel_
.
setArg
(
idx
++
,
gws
[
2
]);
}
kernel_
.
setArg
(
idx
++
,
*
(
input
->
opencl_image
()));
kernel_
.
setArg
(
idx
++
,
*
(
input
->
opencl_image
()));
kernel_
.
setArg
(
idx
++
,
groups_
);
kernel_
.
setArg
(
idx
++
,
groups_
);
kernel_
.
setArg
(
idx
++
,
static_cast
<
uint32_t
>
(
channels_per_group
));
kernel_
.
setArg
(
idx
++
,
static_cast
<
uint32_t
>
(
channels_per_group
));
...
@@ -51,10 +67,8 @@ void ChannelShuffleFunctor<DeviceType::OPENCL, T>::operator()(
...
@@ -51,10 +67,8 @@ void ChannelShuffleFunctor<DeviceType::OPENCL, T>::operator()(
input_shape_
=
input
->
shape
();
input_shape_
=
input
->
shape
();
}
}
const
uint32_t
gws
[
3
]
=
{
static_cast
<
uint32_t
>
(
group_channel_blocks
),
static_cast
<
uint32_t
>
(
width
),
const
std
::
vector
<
uint32_t
>
lws
=
{
8
,
kwg_size_
/
64
,
8
,
1
};
static_cast
<
uint32_t
>
(
height
*
batch
)};
const
std
::
vector
<
uint32_t
>
lws
=
{
8
,
16
,
8
,
1
};
std
::
stringstream
ss
;
std
::
stringstream
ss
;
ss
<<
"channel_shuffle_opencl_kernel_"
ss
<<
"channel_shuffle_opencl_kernel_"
<<
output
->
dim
(
0
)
<<
"_"
<<
output
->
dim
(
0
)
<<
"_"
...
...
mace/kernels/opencl/cl/activation.cl
浏览文件 @
33b049e6
#
include
<common.h>
#
include
<common.h>
__kernel
void
activation
(
__read_only
image2d_t
input,
__kernel
void
activation
(
UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
__read_only
image2d_t
input,
#
ifdef
USE_PRELU
#
ifdef
USE_PRELU
__read_only
image2d_t
alpha,
__read_only
image2d_t
alpha,
#
endif
#
endif
...
@@ -9,7 +11,16 @@ __kernel void activation(__read_only image2d_t input,
...
@@ -9,7 +11,16 @@ __kernel void activation(__read_only image2d_t input,
const
int
ch_blk
=
get_global_id
(
0
)
;
const
int
ch_blk
=
get_global_id
(
0
)
;
const
int
w
=
get_global_id
(
1
)
;
const
int
w
=
get_global_id
(
1
)
;
const
int
hb
=
get_global_id
(
2
)
;
const
int
hb
=
get_global_id
(
2
)
;
#
ifndef
NON_UNIFORM_WORK_GROUP
if
(
ch_blk
>=
global_size_dim0
|
| w >= global_size_dim1
|
|
hb
>=
global_size_dim2
)
{
return
;
}
const
int
width
=
global_size_dim1
;
#
else
const
int
width
=
get_global_size
(
1
)
;
const
int
width
=
get_global_size
(
1
)
;
#
endif
const
int
pos
=
mad24
(
ch_blk,
width,
w
)
;
const
int
pos
=
mad24
(
ch_blk,
width,
w
)
;
DATA_TYPE4
in
=
READ_IMAGET
(
input,
SAMPLER,
(
int2
)(
pos,
hb
))
;
DATA_TYPE4
in
=
READ_IMAGET
(
input,
SAMPLER,
(
int2
)(
pos,
hb
))
;
...
...
mace/kernels/opencl/cl/addn.cl
浏览文件 @
33b049e6
#
include
<common.h>
#
include
<common.h>
__kernel
void
addn
(
__read_only
image2d_t
input0,
/*
[c%4
*
w
*
c/4,
h
*
b]
*/
__kernel
void
addn
(
UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
__read_only
image2d_t
input0,
/*
[c%4
*
w
*
c/4,
h
*
b]
*/
__read_only
image2d_t
input1,
__read_only
image2d_t
input1,
#
if
INPUT_NUM
>
2
#
if
INPUT_NUM
>
2
__read_only
image2d_t
input2,
__read_only
image2d_t
input2,
...
@@ -12,6 +14,10 @@ __kernel void addn(__read_only image2d_t input0, /* [c%4 * w * c/4, h * b] */
...
@@ -12,6 +14,10 @@ __kernel void addn(__read_only image2d_t input0, /* [c%4 * w * c/4, h * b] */
const
int
w
=
get_global_id
(
0
)
;
const
int
w
=
get_global_id
(
0
)
;
const
int
hb
=
get_global_id
(
1
)
;
const
int
hb
=
get_global_id
(
1
)
;
#
ifndef
NON_UNIFORM_WORK_GROUP
if
(
w
>=
global_size_dim0
||
hb
>=
global_size_dim1
)
return
;
#
endif
DATA_TYPE4
in0
=
READ_IMAGET
(
input0,
SAMPLER,
(
int2
)(
w,
hb
))
;
DATA_TYPE4
in0
=
READ_IMAGET
(
input0,
SAMPLER,
(
int2
)(
w,
hb
))
;
DATA_TYPE4
in1
=
READ_IMAGET
(
input1,
SAMPLER,
(
int2
)(
w,
hb
))
;
DATA_TYPE4
in1
=
READ_IMAGET
(
input1,
SAMPLER,
(
int2
)(
w,
hb
))
;
DATA_TYPE4
out
=
in0
+
in1
;
DATA_TYPE4
out
=
in0
+
in1
;
...
...
mace/kernels/opencl/cl/batch_norm.cl
浏览文件 @
33b049e6
#
include
<common.h>
#
include
<common.h>
//
Supported
data
types:
half/float
//
Supported
data
types:
half/float
__kernel
void
batch_norm
(
__read_only
image2d_t
input,
__kernel
void
batch_norm
(
UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
__read_only
image2d_t
input,
__read_only
image2d_t
scale,
__read_only
image2d_t
scale,
__read_only
image2d_t
offset,
__read_only
image2d_t
offset,
#
ifndef
FOLDED_CONSTANT
#
ifndef
FOLDED_CONSTANT
...
@@ -13,7 +15,16 @@ __kernel void batch_norm(__read_only image2d_t input,
...
@@ -13,7 +15,16 @@ __kernel void batch_norm(__read_only image2d_t input,
const
int
ch_blk
=
get_global_id
(
0
)
;
const
int
ch_blk
=
get_global_id
(
0
)
;
const
int
w
=
get_global_id
(
1
)
;
const
int
w
=
get_global_id
(
1
)
;
const
int
hb
=
get_global_id
(
2
)
;
const
int
hb
=
get_global_id
(
2
)
;
#
ifndef
NON_UNIFORM_WORK_GROUP
if
(
ch_blk
>=
global_size_dim0
|
| w >= global_size_dim1
|| hb >= global_size_dim2) {
return;
}
const int width = global_size_dim1;
#else
const int width = get_global_size(1);
const int width = get_global_size(1);
#endif
#ifdef FOLDED_CONSTANT
#ifdef FOLDED_CONSTANT
DATA_TYPE4 bn_scale = READ_IMAGET(scale, SAMPLER, (int2)(ch_blk, 0));
DATA_TYPE4 bn_scale = READ_IMAGET(scale, SAMPLER, (int2)(ch_blk, 0));
...
...
mace/kernels/opencl/cl/bias_add.cl
浏览文件 @
33b049e6
#
include
<common.h>
#
include
<common.h>
//
Supported
data
types:
half/float
//
Supported
data
types:
half/float
__kernel
void
bias_add
(
__read_only
image2d_t
input,
__kernel
void
bias_add
(
UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
__read_only
image2d_t
input,
__read_only
image2d_t
bias,
__read_only
image2d_t
bias,
__write_only
image2d_t
output
)
{
__write_only
image2d_t
output
)
{
const
int
ch_blk
=
get_global_id
(
0
)
;
const
int
ch_blk
=
get_global_id
(
0
)
;
const
int
w
=
get_global_id
(
1
)
;
const
int
w
=
get_global_id
(
1
)
;
const
int
hb
=
get_global_id
(
2
)
;
const
int
hb
=
get_global_id
(
2
)
;
#
ifndef
NON_UNIFORM_WORK_GROUP
if
(
ch_blk
>=
global_size_dim0
|
| w >= global_size_dim1
|
|
hb
>=
global_size_dim2
)
{
return
;
}
const
int
width
=
global_size_dim1
;
#
else
const
int
width
=
get_global_size
(
1
)
;
const
int
width
=
get_global_size
(
1
)
;
#
endif
const
int
pos
=
mad24
(
ch_blk,
width,
w
)
;
const
int
pos
=
mad24
(
ch_blk,
width,
w
)
;
DATA_TYPE4
in
=
READ_IMAGET
(
input,
SAMPLER,
(
int2
)(
pos,
hb
))
;
DATA_TYPE4
in
=
READ_IMAGET
(
input,
SAMPLER,
(
int2
)(
pos,
hb
))
;
...
...
mace/kernels/opencl/cl/buffer_to_image.cl
浏览文件 @
33b049e6
#
include
<common.h>
#
include
<common.h>
__kernel
void
filter_buffer_to_image
(
__global
const
DATA_TYPE
*input,
/*
h,
w,
oc,
ic
*/
__kernel
void
filter_buffer_to_image
(
UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
__global
const
DATA_TYPE
*input,
/*
h,
w,
oc,
ic
*/
__private
const
int
input_offset,
__private
const
int
input_offset,
__private
const
int
filter_h,
__private
const
int
filter_h,
__private
const
int
filter_w,
__private
const
int
filter_w,
...
@@ -9,6 +11,13 @@ __kernel void filter_buffer_to_image(__global const DATA_TYPE *input, /* h, w, o
...
@@ -9,6 +11,13 @@ __kernel void filter_buffer_to_image(__global const DATA_TYPE *input, /* h, w, o
__write_only
image2d_t
output
)
{
__write_only
image2d_t
output
)
{
int
w
=
get_global_id
(
0
)
;
int
w
=
get_global_id
(
0
)
;
int
h
=
get_global_id
(
1
)
;
int
h
=
get_global_id
(
1
)
;
#
ifndef
NON_UNIFORM_WORK_GROUP
if
(
w
>=
global_size_dim0
|
| h >= global_size_dim1) {
return;
}
#endif
const int in_channel_idx = w;
const int in_channel_idx = w;
const int hw_size = filter_w * filter_h;
const int hw_size = filter_w * filter_h;
const int out_channel_idx = h / hw_size * 4;
const int out_channel_idx = h / hw_size * 4;
...
@@ -44,7 +53,9 @@ __kernel void filter_buffer_to_image(__global const DATA_TYPE *input, /* h, w, o
...
@@ -44,7 +53,9 @@ __kernel void filter_buffer_to_image(__global const DATA_TYPE *input, /* h, w, o
WRITE_IMAGET(output, coord, values);
WRITE_IMAGET(output, coord, values);
}
}
__kernel
void
filter_image_to_buffer
(
__global
DATA_TYPE
*output,
/*
h,
w,
oc,
ic
*/
__kernel void filter_image_to_buffer(
UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
__global DATA_TYPE *output, /* h, w, oc, ic */
__private const int filter_h,
__private const int filter_h,
__private const int filter_w,
__private const int filter_w,
__private const int out_channel,
__private const int out_channel,
...
@@ -52,6 +63,13 @@ __kernel void filter_image_to_buffer(__global DATA_TYPE *output, /* h, w, oc, ic
...
@@ -52,6 +63,13 @@ __kernel void filter_image_to_buffer(__global DATA_TYPE *output, /* h, w, oc, ic
__read_only image2d_t input) {
__read_only image2d_t input) {
int w = get_global_id(0);
int w = get_global_id(0);
int h = get_global_id(1);
int h = get_global_id(1);
#ifndef NON_UNIFORM_WORK_GROUP
if (w >= global_size_dim0 || h >= global_size_dim1) {
return;
}
#endif
const int in_channel_idx = w;
const int in_channel_idx = w;
const int hw_size = filter_w * filter_h;
const int hw_size = filter_w * filter_h;
const int out_channel_idx = h / hw_size * 4;
const int out_channel_idx = h / hw_size * 4;
...
@@ -84,7 +102,9 @@ __kernel void filter_image_to_buffer(__global DATA_TYPE *output, /* h, w, oc, ic
...
@@ -84,7 +102,9 @@ __kernel void filter_image_to_buffer(__global DATA_TYPE *output, /* h, w, oc, ic
}
}
}
}
__kernel
void
dw_filter_buffer_to_image
(
__global
const
DATA_TYPE
*input,
/*
h,
w,
ic,
m
*/
__kernel void dw_filter_buffer_to_image(
UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
__global const DATA_TYPE *input, /* h, w, ic, m */
__private const int input_offset,
__private const int input_offset,
__private const int filter_w,
__private const int filter_w,
__private const int in_channel,
__private const int in_channel,
...
@@ -93,6 +113,12 @@ __kernel void dw_filter_buffer_to_image(__global const DATA_TYPE *input, /* h, w
...
@@ -93,6 +113,12 @@ __kernel void dw_filter_buffer_to_image(__global const DATA_TYPE *input, /* h, w
const int w = get_global_id(0);
const int w = get_global_id(0);
const int h = get_global_id(1);
const int h = get_global_id(1);
#ifndef NON_UNIFORM_WORK_GROUP
if (w >= global_size_dim0 || h >= global_size_dim1) {
return;
}
#endif
DATA_TYPE4 values = 0;
DATA_TYPE4 values = 0;
if (multiplier == 1) {
if (multiplier == 1) {
const int in_channel_idx = h << 2;
const int in_channel_idx = h << 2;
...
@@ -134,7 +160,9 @@ __kernel void dw_filter_buffer_to_image(__global const DATA_TYPE *input, /* h, w
...
@@ -134,7 +160,9 @@ __kernel void dw_filter_buffer_to_image(__global const DATA_TYPE *input, /* h, w
WRITE_IMAGET(output, coord, values);
WRITE_IMAGET(output, coord, values);
}
}
__kernel
void
in_out_buffer_to_image
(
__global
const
DATA_TYPE
*input,
/*
nhwc
*/
__kernel void in_out_buffer_to_image(
UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
__global const DATA_TYPE *input, /* nhwc */
__private const int input_offset,
__private const int input_offset,
__private const int height,
__private const int height,
__private const int width,
__private const int width,
...
@@ -142,6 +170,13 @@ __kernel void in_out_buffer_to_image(__global const DATA_TYPE *input, /* nhwc */
...
@@ -142,6 +170,13 @@ __kernel void in_out_buffer_to_image(__global const DATA_TYPE *input, /* nhwc */
__write_only image2d_t output) {
__write_only image2d_t output) {
int w = get_global_id(0);
int w = get_global_id(0);
int h = get_global_id(1);
int h = get_global_id(1);
#ifndef NON_UNIFORM_WORK_GROUP
if (w >= global_size_dim0 || h >= global_size_dim1) {
return;
}
#endif
const int batch_idx = h / height;
const int batch_idx = h / height;
const int height_idx = h % height;
const int height_idx = h % height;
const int width_idx = w % width;
const int width_idx = w % width;
...
@@ -167,13 +202,22 @@ __kernel void in_out_buffer_to_image(__global const DATA_TYPE *input, /* nhwc */
...
@@ -167,13 +202,22 @@ __kernel void in_out_buffer_to_image(__global const DATA_TYPE *input, /* nhwc */
WRITE_IMAGET(output, coord, values);
WRITE_IMAGET(output, coord, values);
}
}
__kernel
void
in_out_image_to_buffer
(
__global
DATA_TYPE
*output,
/*
nhwc
*/
__kernel void in_out_image_to_buffer(
UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
__global DATA_TYPE *output, /* nhwc */
__private const int height,
__private const int height,
__private const int width,
__private const int width,
__private const int channels,
__private const int channels,
__read_only image2d_t input) {
__read_only image2d_t input) {
int w = get_global_id(0);
int w = get_global_id(0);
int h = get_global_id(1);
int h = get_global_id(1);
#ifndef NON_UNIFORM_WORK_GROUP
if (w >= global_size_dim0 || h >= global_size_dim1) {
return;
}
#endif
const int batch_idx = h / height;
const int batch_idx = h / height;
const int height_idx = h % height;
const int height_idx = h % height;
const int width_idx = w % width;
const int width_idx = w % width;
...
@@ -198,13 +242,21 @@ __kernel void in_out_image_to_buffer(__global DATA_TYPE *output, /* nhwc */
...
@@ -198,13 +242,21 @@ __kernel void in_out_image_to_buffer(__global DATA_TYPE *output, /* nhwc */
}
}
}
}
__kernel
void
arg_buffer_to_image
(
__global
const
DATA_TYPE
*input,
/*
nhwc
*/
__kernel void arg_buffer_to_image(
UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
__global const DATA_TYPE *input, /* nhwc */
__private const int input_offset,
__private const int input_offset,
__private const int count,
__private const int count,
__write_only image2d_t output) {
__write_only image2d_t output) {
int w = get_global_id(0);
int w = get_global_id(0);
int h = get_global_id(1);
int h = get_global_id(1);
#ifndef NON_UNIFORM_WORK_GROUP
if (w >= global_size_dim0 || h >= global_size_dim1) {
return;
}
#endif
const int offset = input_offset + w * 4;
const int offset = input_offset + w * 4;
const int size = count - w * 4;
const int size = count - w * 4;
...
@@ -226,11 +278,20 @@ __kernel void arg_buffer_to_image(__global const DATA_TYPE *input, /* nhwc */
...
@@ -226,11 +278,20 @@ __kernel void arg_buffer_to_image(__global const DATA_TYPE *input, /* nhwc */
WRITE_IMAGET(output, coord, values);
WRITE_IMAGET(output, coord, values);
}
}
__kernel
void
arg_image_to_buffer
(
__global
DATA_TYPE
*output,
/*
nhwc
*/
__kernel void arg_image_to_buffer(
UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
__global DATA_TYPE *output, /* nhwc */
__private const int count,
__private const int count,
__read_only image2d_t input) {
__read_only image2d_t input) {
int w = get_global_id(0);
int w = get_global_id(0);
int h = get_global_id(1);
int h = get_global_id(1);
#ifndef NON_UNIFORM_WORK_GROUP
if (w >= global_size_dim0 || h >= global_size_dim1) {
return;
}
#endif
const int offset = w * 4;
const int offset = w * 4;
int2 coord = (int2)(w, h);
int2 coord = (int2)(w, h);
...
@@ -251,7 +312,9 @@ __kernel void arg_image_to_buffer(__global DATA_TYPE *output, /* nhwc */
...
@@ -251,7 +312,9 @@ __kernel void arg_image_to_buffer(__global DATA_TYPE *output, /* nhwc */
}
}
__kernel
void
in_out_height_buffer_to_image
(
__global
const
DATA_TYPE
*input,
//nhwc
__kernel void in_out_height_buffer_to_image(
UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
__global const DATA_TYPE *input, //nhwc
__private const int input_offset,
__private const int input_offset,
__private const int height,
__private const int height,
__private const int width,
__private const int width,
...
@@ -259,6 +322,13 @@ __kernel void in_out_height_buffer_to_image(__global const DATA_TYPE *input, //n
...
@@ -259,6 +322,13 @@ __kernel void in_out_height_buffer_to_image(__global const DATA_TYPE *input, //n
__write_only image2d_t output) {
__write_only image2d_t output) {
int w = get_global_id(0);
int w = get_global_id(0);
int h = get_global_id(1);
int h = get_global_id(1);
#ifndef NON_UNIFORM_WORK_GROUP
if (w >= global_size_dim0 || h >= global_size_dim1) {
return;
}
#endif
const int wc = width * channels;
const int wc = width * channels;
const int height_blks = (height + 3) / 4;
const int height_blks = (height + 3) / 4;
const int batch_idx = h / height_blks;
const int batch_idx = h / height_blks;
...
@@ -285,13 +355,22 @@ __kernel void in_out_height_buffer_to_image(__global const DATA_TYPE *input, //n
...
@@ -285,13 +355,22 @@ __kernel void in_out_height_buffer_to_image(__global const DATA_TYPE *input, //n
WRITE_IMAGET(output, coord, values);
WRITE_IMAGET(output, coord, values);
}
}
__kernel
void
in_out_height_image_to_buffer
(
__global
DATA_TYPE
*output,
//nhwc
__kernel void in_out_height_image_to_buffer(
UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
__global DATA_TYPE *output, //nhwc
__private const int height,
__private const int height,
__private const int width,
__private const int width,
__private const int channels,
__private const int channels,
__read_only image2d_t input) {
__read_only image2d_t input) {
int w = get_global_id(0);
int w = get_global_id(0);
int h = get_global_id(1);
int h = get_global_id(1);
#ifndef NON_UNIFORM_WORK_GROUP
if (w >= global_size_dim0 || h >= global_size_dim1) {
return;
}
#endif
const int height_blks = (height + 3) / 4;
const int height_blks = (height + 3) / 4;
const int batch_idx = h / height_blks;
const int batch_idx = h / height_blks;
const int height_idx = (h % height_blks) << 2;
const int height_idx = (h % height_blks) << 2;
...
@@ -315,7 +394,9 @@ __kernel void in_out_height_image_to_buffer(__global DATA_TYPE *output, //nhwc
...
@@ -315,7 +394,9 @@ __kernel void in_out_height_image_to_buffer(__global DATA_TYPE *output, //nhwc
}
}
__kernel
void
in_out_width_buffer_to_image
(
__global
const
DATA_TYPE
*input,
/*
nhwc
*/
__kernel void in_out_width_buffer_to_image(
UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
__global const DATA_TYPE *input, /* nhwc */
__private const int input_offset,
__private const int input_offset,
__private const int height,
__private const int height,
__private const int width,
__private const int width,
...
@@ -323,6 +404,13 @@ __kernel void in_out_width_buffer_to_image(__global const DATA_TYPE *input, /* n
...
@@ -323,6 +404,13 @@ __kernel void in_out_width_buffer_to_image(__global const DATA_TYPE *input, /* n
__write_only image2d_t output) {
__write_only image2d_t output) {
int w = get_global_id(0);
int w = get_global_id(0);
int h = get_global_id(1);
int h = get_global_id(1);
#ifndef NON_UNIFORM_WORK_GROUP
if (w >= global_size_dim0 || h >= global_size_dim1) {
return;
}
#endif
const int width_blks = (width + 3) / 4;
const int width_blks = (width + 3) / 4;
const int batch_idx = h / height;
const int batch_idx = h / height;
const int height_idx = h % height;
const int height_idx = h % height;
...
@@ -349,7 +437,9 @@ __kernel void in_out_width_buffer_to_image(__global const DATA_TYPE *input, /* n
...
@@ -349,7 +437,9 @@ __kernel void in_out_width_buffer_to_image(__global const DATA_TYPE *input, /* n
}
}
// only support 3x3 now
// only support 3x3 now
__kernel
void
winograd_filter_buffer_to_image
(
__global
const
DATA_TYPE
*input,
//Oc,
Ic,
H,
W
__kernel void winograd_filter_buffer_to_image(
UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
__global const DATA_TYPE *input, //Oc, Ic, H, W
__private const int input_offset,
__private const int input_offset,
__private const int in_channels,
__private const int in_channels,
__private const int height,
__private const int height,
...
@@ -357,7 +447,16 @@ __kernel void winograd_filter_buffer_to_image(__global const DATA_TYPE *input, /
...
@@ -357,7 +447,16 @@ __kernel void winograd_filter_buffer_to_image(__global const DATA_TYPE *input, /
__write_only image2d_t output) {
__write_only image2d_t output) {
int w = get_global_id(0);
int w = get_global_id(0);
int h = get_global_id(1);
int h = get_global_id(1);
#ifndef NON_UNIFORM_WORK_GROUP
if (w >= global_size_dim0 || h >= global_size_dim1) {
return;
}
const int out_channels = global_size_dim1;
#else
const int out_channels = get_global_size(1);
const int out_channels = get_global_size(1);
#endif
const int out_channel_idx = h;
const int out_channel_idx = h;
const int in_channel_idx = w << 2;
const int in_channel_idx = w << 2;
const int offset = input_offset + (out_channel_idx * in_channels + in_channel_idx) * height * width;
const int offset = input_offset + (out_channel_idx * in_channels + in_channel_idx) * height * width;
...
@@ -430,13 +529,22 @@ __kernel void winograd_filter_buffer_to_image(__global const DATA_TYPE *input, /
...
@@ -430,13 +529,22 @@ __kernel void winograd_filter_buffer_to_image(__global const DATA_TYPE *input, /
}
}
// only support 3x3 now
// only support 3x3 now
__kernel
void
winograd_filter_image_to_buffer
(
__global
DATA_TYPE
*output,
//Oc,
Ic,
H,
W
__kernel void winograd_filter_image_to_buffer(
UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
__global DATA_TYPE *output, //Oc, Ic, H, W
__private const int height,
__private const int height,
__private const int width,
__private const int width,
__private const int channel,
__private const int channel,
__read_only image2d_t input) {
__read_only image2d_t input) {
const int w = get_global_id(0);
const int w = get_global_id(0);
const int h = get_global_id(1);
const int h = get_global_id(1);
#ifndef NON_UNIFORM_WORK_GROUP
if (w >= global_size_dim0 |
|
h
>=
global_size_dim1
)
{
return
;
}
#
endif
const
int
width_idx
=
w
<<
2
;
const
int
width_idx
=
w
<<
2
;
const
int
size
=
width
-
width_idx
;
const
int
size
=
width
-
width_idx
;
int
offset
=
h
*
width
+
width_idx
;
int
offset
=
h
*
width
+
width_idx
;
...
...
mace/kernels/opencl/cl/channel_shuffle.cl
浏览文件 @
33b049e6
#
include
<common.h>
#
include
<common.h>
//
assume
channes_per_group
mod
4
=
0
&&
groups
mod
4
==
0
//
assume
channes_per_group
mod
4
=
0
&&
groups
mod
4
==
0
__kernel
void
channel_shuffle
(
__read_only
image2d_t
input,
__kernel
void
channel_shuffle
(
__private
const
int
groups,
UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
__private
const
int
channels_per_group,
__read_only
image2d_t
input,
__write_only
image2d_t
output
)
{
__private
const
int
groups,
__private
const
int
channels_per_group,
__write_only
image2d_t
output
)
{
const
int
group_chan_blk_idx
=
get_global_id
(
0
)
;
const
int
group_chan_blk_idx
=
get_global_id
(
0
)
;
const
int
width_idx
=
get_global_id
(
1
)
;
const
int
width_idx
=
get_global_id
(
1
)
;
const
int
width
=
get_global_size
(
1
)
;
const
int
hb_idx
=
get_global_id
(
2
)
;
const
int
hb_idx
=
get_global_id
(
2
)
;
#
ifndef
NON_UNIFORM_WORK_GROUP
if
(
group_chan_blk_idx
>=
global_size_dim0
|
| width_idx >= global_size_dim1
|
|
hb_idx
>=
global_size_dim2
)
{
return
;
}
const
int
width
=
global_size_dim1
;
#
else
const
int
width
=
get_global_size
(
1
)
;
#
endif
const
int
group_blks
=
groups
/
4
;
const
int
group_blks
=
groups
/
4
;
const
int
groups_blks_width
=
group_blks
*
width
;
const
int
groups_blks_width
=
group_blks
*
width
;
const
int
channels_per_group_blks
=
channels_per_group
/
4
;
const
int
channels_per_group_blks
=
channels_per_group
/
4
;
...
...
mace/kernels/opencl/cl/common.h
浏览文件 @
33b049e6
...
@@ -18,6 +18,23 @@
...
@@ -18,6 +18,23 @@
#define READ_IMAGET CMD_TYPE(read_image, CMD_DATA_TYPE)
#define READ_IMAGET CMD_TYPE(read_image, CMD_DATA_TYPE)
#define WRITE_IMAGET CMD_TYPE(write_image, CMD_DATA_TYPE)
#define WRITE_IMAGET CMD_TYPE(write_image, CMD_DATA_TYPE)
#ifndef NON_UNIFORM_WORK_GROUP
#define UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2 \
__private const int global_size_dim0, \
__private const int global_size_dim1,
#define UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3 \
__private const int global_size_dim0, \
__private const int global_size_dim1, \
__private const int global_size_dim2,
#else
#define UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
#define UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
#endif
__constant
sampler_t
SAMPLER
=
__constant
sampler_t
SAMPLER
=
CLK_NORMALIZED_COORDS_FALSE
|
CLK_ADDRESS_CLAMP
|
CLK_FILTER_NEAREST
;
CLK_NORMALIZED_COORDS_FALSE
|
CLK_ADDRESS_CLAMP
|
CLK_FILTER_NEAREST
;
...
...
mace/kernels/opencl/cl/concat.cl
浏览文件 @
33b049e6
...
@@ -22,14 +22,26 @@ DATA_TYPE4 stitch_vector(DATA_TYPE4 left,
...
@@ -22,14 +22,26 @@ DATA_TYPE4 stitch_vector(DATA_TYPE4 left,
}
}
//
Supported
data
type:
half/float
//
Supported
data
type:
half/float
__kernel
void
concat_channel
(
__read_only
image2d_t
input0,
__kernel
void
concat_channel
(
UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
__read_only
image2d_t
input0,
__read_only
image2d_t
input1,
__read_only
image2d_t
input1,
__private
const
int
input0_chan,
__private
const
int
input0_chan,
__write_only
image2d_t
output
)
{
__write_only
image2d_t
output
)
{
const
int
chan_blk_idx
=
get_global_id
(
0
)
;
const
int
chan_blk_idx
=
get_global_id
(
0
)
;
const
int
width_idx
=
get_global_id
(
1
)
;
const
int
width_idx
=
get_global_id
(
1
)
;
const
int
width
=
get_global_size
(
1
)
;
const
int
hb_idx
=
get_global_id
(
2
)
;
const
int
hb_idx
=
get_global_id
(
2
)
;
#
ifndef
NON_UNIFORM_WORK_GROUP
if
(
chan_blk_idx
>=
global_size_dim0
|
| width_idx >= global_size_dim1
|| hb_idx >= global_size_dim2) {
return;
}
const int width = global_size_dim1;
#else
const int width = get_global_size(1);
#endif
const int input0_chan_blk = (input0_chan + 3) >> 2;
const int input0_chan_blk = (input0_chan + 3) >> 2;
DATA_TYPE4 data = 0;
DATA_TYPE4 data = 0;
...
@@ -72,13 +84,25 @@ __kernel void concat_channel(__read_only image2d_t input0,
...
@@ -72,13 +84,25 @@ __kernel void concat_channel(__read_only image2d_t input0,
}
}
// Required: All input channels are divisible by 4
// Required: All input channels are divisible by 4
__kernel
void
concat_channel_multi
(
__read_only
image2d_t
input,
__kernel void concat_channel_multi(
UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
__read_only image2d_t input,
__private const int chan_blk_offset,
__private const int chan_blk_offset,
__write_only image2d_t output) {
__write_only image2d_t output) {
const int chan_blk_idx = get_global_id(0);
const int chan_blk_idx = get_global_id(0);
const int width_idx = get_global_id(1);
const int width_idx = get_global_id(1);
const
int
width
=
get_global_size
(
1
)
;
const int hb_idx = get_global_id(2);
const int hb_idx = get_global_id(2);
#ifndef NON_UNIFORM_WORK_GROUP
if (chan_blk_idx >= global_size_dim0 || width_idx >= global_size_dim1
|| hb_idx >= global_size_dim2) {
return;
}
const int width = global_size_dim1;
#else
const int width = get_global_size(1);
#endif
DATA_TYPE4 data = 0;
DATA_TYPE4 data = 0;
data = READ_IMAGET(input,
data = READ_IMAGET(input,
SAMPLER,
SAMPLER,
...
...
mace/kernels/opencl/cl/conv_2d.cl
浏览文件 @
33b049e6
#
include
<common.h>
#
include
<common.h>
__kernel
void
conv_2d
(
__read_only
image2d_t
input,
/*
[c%4
*
w
*
c/4,
h
*
b]
*/
__kernel
void
conv_2d
(
UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
__read_only
image2d_t
input,
/*
[c%4
*
w
*
c/4,
h
*
b]
*/
__read_only
image2d_t
filter,
/*
cout%4
*
cin,
kh
*
kw
*
cout/4
*/
__read_only
image2d_t
filter,
/*
cout%4
*
cin,
kh
*
kw
*
cout/4
*/
#
ifdef
BIAS
#
ifdef
BIAS
__read_only
image2d_t
bias,
/*
cout%4
*
cout/4
*/
__read_only
image2d_t
bias,
/*
cout%4
*
cout/4
*/
...
@@ -21,9 +23,18 @@ __kernel void conv_2d(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
...
@@ -21,9 +23,18 @@ __kernel void conv_2d(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
__private
const
int
dilation_w
)
{
__private
const
int
dilation_w
)
{
const
int
out_ch_blk
=
get_global_id
(
0
)
;
const
int
out_ch_blk
=
get_global_id
(
0
)
;
const
int
out_w_blk
=
get_global_id
(
1
)
;
const
int
out_w_blk
=
get_global_id
(
1
)
;
const
int
out_w_blks
=
get_global_size
(
1
)
;
const
int
out_hb
=
get_global_id
(
2
)
;
const
int
out_hb
=
get_global_id
(
2
)
;
#
ifndef
NON_UNIFORM_WORK_GROUP
if
(
out_ch_blk
>=
global_size_dim0
|
| out_w_blk >= global_size_dim1
|| out_hb >= global_size_dim2) {
return;
}
const int out_w_blks = global_size_dim1;
#else
const int out_w_blks = get_global_size(1);
#endif
#ifdef BIAS
#ifdef BIAS
DATA_TYPE4 out0 =
DATA_TYPE4 out0 =
READ_IMAGET(bias, SAMPLER, (int2)(out_ch_blk, 0));
READ_IMAGET(bias, SAMPLER, (int2)(out_ch_blk, 0));
...
...
mace/kernels/opencl/cl/conv_2d_1x1.cl
浏览文件 @
33b049e6
#
include
<common.h>
#
include
<common.h>
__kernel
void
conv_2d_1x1
(
__read_only
image2d_t
input,
/*
[c%4
*
w
*
c/4,
h
*
b]
*/
__kernel
void
conv_2d_1x1
(
UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
__read_only
image2d_t
input,
/*
[c%4
*
w
*
c/4,
h
*
b]
*/
__read_only
image2d_t
filter,
/*
cout%4
*
cin,
cout/4
*/
__read_only
image2d_t
filter,
/*
cout%4
*
cin,
cout/4
*/
#
ifdef
BIAS
#
ifdef
BIAS
__read_only
image2d_t
bias,
/*
cout%4
*
cout/4
*/
__read_only
image2d_t
bias,
/*
cout%4
*
cout/4
*/
...
@@ -15,9 +17,18 @@ __kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
...
@@ -15,9 +17,18 @@ __kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
__private
const
int
stride
)
{
__private
const
int
stride
)
{
const
int
out_ch_blk
=
get_global_id
(
0
)
;
const
int
out_ch_blk
=
get_global_id
(
0
)
;
const
int
out_w_blk
=
get_global_id
(
1
)
;
const
int
out_w_blk
=
get_global_id
(
1
)
;
const
int
out_w_blks
=
get_global_size
(
1
)
;
const
int
out_hb
=
get_global_id
(
2
)
;
const
int
out_hb
=
get_global_id
(
2
)
;
#
ifndef
NON_UNIFORM_WORK_GROUP
if
(
out_ch_blk
>=
global_size_dim0
|
| out_w_blk >= global_size_dim1
|| out_hb >= global_size_dim2) {
return;
}
const int out_w_blks = global_size_dim1;
#else
const int out_w_blks = get_global_size(1);
#endif
#ifdef BIAS
#ifdef BIAS
DATA_TYPE4 out0 = READ_IMAGET(bias, SAMPLER, (int2)(out_ch_blk, 0));
DATA_TYPE4 out0 = READ_IMAGET(bias, SAMPLER, (int2)(out_ch_blk, 0));
DATA_TYPE4 out1 = out0;
DATA_TYPE4 out1 = out0;
...
...
mace/kernels/opencl/cl/conv_2d_3x3.cl
浏览文件 @
33b049e6
#
include
<common.h>
#
include
<common.h>
__kernel
void
conv_2d_3x3
(
__read_only
image2d_t
input,
/*
[c%4
*
w
*
c/4,
h
*
b]
*/
__kernel
void
conv_2d_3x3
(
UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
__read_only
image2d_t
input,
/*
[c%4
*
w
*
c/4,
h
*
b]
*/
__read_only
image2d_t
filter,
/*
cout%4
*
cin
,
kh
*
kw
*
cout/4
*/
__read_only
image2d_t
filter,
/*
cout%4
*
cin
,
kh
*
kw
*
cout/4
*/
#
ifdef
BIAS
#
ifdef
BIAS
__read_only
image2d_t
bias,
/*
cout%4
*
cout/4
*/
__read_only
image2d_t
bias,
/*
cout%4
*
cout/4
*/
...
@@ -19,9 +21,18 @@ __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
...
@@ -19,9 +21,18 @@ __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
__private
const
int
dilation_w
)
{
__private
const
int
dilation_w
)
{
const
int
out_ch_blk
=
get_global_id
(
0
)
;
const
int
out_ch_blk
=
get_global_id
(
0
)
;
const
int
out_w_blk
=
get_global_id
(
1
)
;
const
int
out_w_blk
=
get_global_id
(
1
)
;
const
int
out_w_blks
=
get_global_size
(
1
)
;
const
int
out_hb
=
get_global_id
(
2
)
;
const
int
out_hb
=
get_global_id
(
2
)
;
#
ifndef
NON_UNIFORM_WORK_GROUP
if
(
out_ch_blk
>=
global_size_dim0
|
| out_w_blk >= global_size_dim1
|| out_hb >= global_size_dim2) {
return;
}
const int out_w_blks = global_size_dim1;
#else
const int out_w_blks = get_global_size(1);
#endif
#ifdef BIAS
#ifdef BIAS
DATA_TYPE4 out0 =
DATA_TYPE4 out0 =
READ_IMAGET(bias, SAMPLER, (int2)(out_ch_blk, 0));
READ_IMAGET(bias, SAMPLER, (int2)(out_ch_blk, 0));
...
...
mace/kernels/opencl/cl/depth_to_space.cl
浏览文件 @
33b049e6
#
include
<common.h>
#
include
<common.h>
__kernel
void
depth_to_space
(
__read_only
image2d_t
input,
__kernel
void
depth_to_space
(
UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
__read_only
image2d_t
input,
__private
const
int
block_size,
__private
const
int
block_size,
__private
const
int
output_depth,
__private
const
int
output_depth,
__write_only
image2d_t
output
)
{
__write_only
image2d_t
output
)
{
const
int
out_d
=
get_global_id
(
0
)
;
const
int
out_d
=
get_global_id
(
0
)
;
const
int
out_w
=
get_global_id
(
1
)
;
const
int
out_w
=
get_global_id
(
1
)
;
const
int
out_h
=
get_global_id
(
2
)
;
const
int
out_h
=
get_global_id
(
2
)
;
#
ifndef
NON_UNIFORM_WORK_GROUP
if
(
out_d
>=
global_size_dim0
|
| out_w >= global_size_dim1
|| out_h >= global_size_dim2) {
return;
}
const int output_width = global_size_dim1;
#else
const int output_width = get_global_size(1);
const int output_width = get_global_size(1);
#endif
const int out_pos = mad24(out_d, output_width, out_w);
const int out_pos = mad24(out_d, output_width, out_w);
...
@@ -27,14 +38,27 @@ __kernel void depth_to_space(__read_only image2d_t input,
...
@@ -27,14 +38,27 @@ __kernel void depth_to_space(__read_only image2d_t input,
WRITE_IMAGET(output, (int2)(out_pos, out_h), in_data);
WRITE_IMAGET(output, (int2)(out_pos, out_h), in_data);
}
}
__kernel
void
space_to_depth
(
__read_only
image2d_t
input,
__kernel void space_to_depth(
UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
__read_only image2d_t input,
__private const int block_size,
__private const int block_size,
__private const int input_depth,
__private const int input_depth,
__write_only image2d_t output) {
__write_only image2d_t output) {
const int d = get_global_id(0);
const int d = get_global_id(0);
const int w = get_global_id(1);
const int w = get_global_id(1);
const int h = get_global_id(2);
const int h = get_global_id(2);
#ifndef NON_UNIFORM_WORK_GROUP
if (d >= global_size_dim0 || w >= global_size_dim1
|
|
h
>=
global_size_dim2
)
{
return
;
}
const
int
input_width
=
global_size_dim1
;
#
else
const
int
input_width
=
get_global_size
(
1
)
;
const
int
input_width
=
get_global_size
(
1
)
;
#
endif
const
int
in_pos
=
mad24
(
d,
input_width,
w
)
;
const
int
in_pos
=
mad24
(
d,
input_width,
w
)
;
const
int
output_width
=
input_width
/
block_size
;
const
int
output_width
=
input_width
/
block_size
;
...
...
mace/kernels/opencl/cl/depthwise_conv2d.cl
浏览文件 @
33b049e6
#
include
<common.h>
#
include
<common.h>
//
Only
multiplier
=
1
is
supported
//
Only
multiplier
=
1
is
supported
__kernel
void
depthwise_conv2d
(
__read_only
image2d_t
input,
/*
[c%4
*
w
*
c/4,
h
*
b]
*/
__kernel
void
depthwise_conv2d
(
UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
__read_only
image2d_t
input,
/*
[c%4
*
w
*
c/4,
h
*
b]
*/
__read_only
image2d_t
filter,
/*
cout%4
*
kh
*
kw
*
m,
cin/4
*/
__read_only
image2d_t
filter,
/*
cout%4
*
kh
*
kw
*
m,
cin/4
*/
#
ifdef
BIAS
#
ifdef
BIAS
__read_only
image2d_t
bias,
/*
cout%4
*
cout/4
*/
__read_only
image2d_t
bias,
/*
cout%4
*
cout/4
*/
...
@@ -21,8 +23,18 @@ __kernel void depthwise_conv2d(__read_only image2d_t input, /* [c%4 * w * c/4, h
...
@@ -21,8 +23,18 @@ __kernel void depthwise_conv2d(__read_only image2d_t input, /* [c%4 * w * c/4, h
__private
const
short
dilation_w
)
{
__private
const
short
dilation_w
)
{
const
short
out_ch_blk
=
get_global_id
(
0
)
;
const
short
out_ch_blk
=
get_global_id
(
0
)
;
const
short
out_w_blk
=
get_global_id
(
1
)
;
const
short
out_w_blk
=
get_global_id
(
1
)
;
const
short
out_w_blks
=
get_global_size
(
1
)
;
const
short
out_hb
=
get_global_id
(
2
)
;
const
short
out_hb
=
get_global_id
(
2
)
;
#
ifndef
NON_UNIFORM_WORK_GROUP
if
(
out_ch_blk
>=
global_size_dim0
|
| out_w_blk >= global_size_dim1
|| out_hb >= global_size_dim2) {
return;
}
const short out_w_blks = global_size_dim1;
#else
const short out_w_blks = get_global_size(1);
#endif
const short rounded_in_ch = in_ch_blks << 2;
const short rounded_in_ch = in_ch_blks << 2;
const short in_ch_blk = out_ch_blk; // multiplier = 1
const short in_ch_blk = out_ch_blk; // multiplier = 1
...
@@ -126,7 +138,9 @@ __kernel void depthwise_conv2d(__read_only image2d_t input, /* [c%4 * w * c/4, h
...
@@ -126,7 +138,9 @@ __kernel void depthwise_conv2d(__read_only image2d_t input, /* [c%4 * w * c/4, h
WRITE_IMAGET(output, (int2)(out_x_base + w, out_hb), out3);
WRITE_IMAGET(output, (int2)(out_x_base + w, out_hb), out3);
}
}
__kernel void depthwise_conv2d_s1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
__kernel void depthwise_conv2d_s1(
UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
__read_only image2d_t filter, /* cout%4 * kh * kw * m, cin/4 */
__read_only image2d_t filter, /* cout%4 * kh * kw * m, cin/4 */
#ifdef BIAS
#ifdef BIAS
__read_only image2d_t bias, /* cout%4 * cout/4 */
__read_only image2d_t bias, /* cout%4 * cout/4 */
...
@@ -145,6 +159,14 @@ __kernel void depthwise_conv2d_s1(__read_only image2d_t input, /* [c%4 * w * c/4
...
@@ -145,6 +159,14 @@ __kernel void depthwise_conv2d_s1(__read_only image2d_t input, /* [c%4 * w * c/4
const short out_ch_blk = get_global_id(0);
const short out_ch_blk = get_global_id(0);
const short out_w_blk = get_global_id(1) << 2;
const short out_w_blk = get_global_id(1) << 2;
const short out_hb = get_global_id(2);
const short out_hb = get_global_id(2);
#ifndef NON_UNIFORM_WORK_GROUP
if (out_ch_blk >= global_size_dim0 || get_global_id(1) >= global_size_dim1
|| out_hb >= global_size_dim2) {
return;
}
#endif
const short rounded_in_ch = in_ch_blks << 2;
const short rounded_in_ch = in_ch_blks << 2;
const short in_ch_blk = out_ch_blk; // multiplier = 1
const short in_ch_blk = out_ch_blk; // multiplier = 1
...
...
mace/kernels/opencl/cl/eltwise.cl
浏览文件 @
33b049e6
#
include
<common.h>
#
include
<common.h>
__kernel
void
eltwise
(
__read_only
image2d_t
input0,
/*
[c%4
*
w
*
c/4,
h
*
b]
*/
__kernel
void
eltwise
(
UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
__read_only
image2d_t
input0,
/*
[c%4
*
w
*
c/4,
h
*
b]
*/
__read_only
image2d_t
input1,
__read_only
image2d_t
input1,
#
ifdef
COEFF_SUM
#
ifdef
COEFF_SUM
__private
const
float
coeff0,
__private
const
float
coeff0,
...
@@ -10,6 +12,10 @@ __kernel void eltwise(__read_only image2d_t input0, /* [c%4 * w * c/4, h * b] */
...
@@ -10,6 +12,10 @@ __kernel void eltwise(__read_only image2d_t input0, /* [c%4 * w * c/4, h * b] */
const
int
w
=
get_global_id
(
0
)
;
const
int
w
=
get_global_id
(
0
)
;
const
int
hb
=
get_global_id
(
1
)
;
const
int
hb
=
get_global_id
(
1
)
;
#
ifndef
NON_UNIFORM_WORK_GROUP
if
(
w
>=
global_size_dim0
||
hb
>=
global_size_dim1
)
return
;
#
endif
DATA_TYPE4
in0
=
READ_IMAGET
(
input0,
SAMPLER,
(
int2
)(
w,
hb
))
;
DATA_TYPE4
in0
=
READ_IMAGET
(
input0,
SAMPLER,
(
int2
)(
w,
hb
))
;
DATA_TYPE4
in1
=
READ_IMAGET
(
input1,
SAMPLER,
(
int2
)(
w,
hb
))
;
DATA_TYPE4
in1
=
READ_IMAGET
(
input1,
SAMPLER,
(
int2
)(
w,
hb
))
;
DATA_TYPE4
out
;
DATA_TYPE4
out
;
...
...
mace/kernels/opencl/cl/matmul.cl
浏览文件 @
33b049e6
#
include
<common.h>
#
include
<common.h>
//
C
=
A
*
B
//
C
=
A
*
B
__kernel
void
matmul
(
__read_only
image2d_t
A,
__kernel
void
matmul
(
UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
__read_only
image2d_t
A,
__read_only
image2d_t
B,
__read_only
image2d_t
B,
__write_only
image2d_t
C,
__write_only
image2d_t
C,
__private
const
int
M,
__private
const
int
M,
...
@@ -11,6 +13,11 @@ __kernel void matmul(__read_only image2d_t A,
...
@@ -11,6 +13,11 @@ __kernel void matmul(__read_only image2d_t A,
__private
const
int
k_blocks
)
{
__private
const
int
k_blocks
)
{
const
int
gx
=
get_global_id
(
0
)
<<
2
;
const
int
gx
=
get_global_id
(
0
)
<<
2
;
const
int
hb
=
get_global_id
(
1
)
;
const
int
hb
=
get_global_id
(
1
)
;
#
ifndef
NON_UNIFORM_WORK_GROUP
if
(
get_global_id
(
0
)
>=
global_size_dim0
||
hb
>=
global_size_dim1
)
return
;
#
endif
const
int
batch
=
hb
/
height_blocks
;
const
int
batch
=
hb
/
height_blocks
;
const
int
ty
=
(
hb
%
height_blocks
)
;
const
int
ty
=
(
hb
%
height_blocks
)
;
const
int
gy
=
mad24
(
batch,
height_blocks,
ty
)
;
const
int
gy
=
mad24
(
batch,
height_blocks,
ty
)
;
...
...
mace/kernels/opencl/cl/pooling.cl
浏览文件 @
33b049e6
...
@@ -19,7 +19,9 @@ inline int calculate_avg_block_size(const int pool_size,
...
@@ -19,7 +19,9 @@ inline int calculate_avg_block_size(const int pool_size,
}
}
//
Supported
data
type:
half/float
//
Supported
data
type:
half/float
__kernel
void
pooling
(
__read_only
image2d_t
input,
__kernel
void
pooling
(
UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
__read_only
image2d_t
input,
__private
const
int
in_height,
__private
const
int
in_height,
__private
const
int
in_width,
__private
const
int
in_width,
__private
const
int
out_height,
__private
const
int
out_height,
...
@@ -28,11 +30,21 @@ __kernel void pooling(__read_only image2d_t input,
...
@@ -28,11 +30,21 @@ __kernel void pooling(__read_only image2d_t input,
__private
const
int
stride,
__private
const
int
stride,
__private
const
int
pooling_size,
__private
const
int
pooling_size,
__write_only
image2d_t
output
)
{
__write_only
image2d_t
output
)
{
const
int
out_chan_idx
=
get_global_id
(
0
)
;
const
int
out_chan_idx
=
get_global_id
(
0
)
;
const
int
out_width_idx
=
get_global_id
(
1
)
;
const
int
out_width_idx
=
get_global_id
(
1
)
;
const
int
out_width
=
get_global_size
(
1
)
;
const
int
out_hb_idx
=
get_global_id
(
2
)
;
const
int
out_hb_idx
=
get_global_id
(
2
)
;
#
ifndef
NON_UNIFORM_WORK_GROUP
if
(
out_chan_idx
>=
global_size_dim0
|
| out_width_idx >= global_size_dim1
|| out_hb_idx >= global_size_dim2) {
return;
}
const int out_width = global_size_dim1;
#else
const int out_width = get_global_size(1);
#endif
const int batch_idx = mul24((out_hb_idx / out_height), in_height);
const int batch_idx = mul24((out_hb_idx / out_height), in_height);
const int in_height_start = mul24((out_hb_idx % out_height), stride) - pad_top;
const int in_height_start = mul24((out_hb_idx % out_height), stride) - pad_top;
const int in_width_start = mul24(out_width_idx, stride) - pad_left;
const int in_width_start = mul24(out_width_idx, stride) - pad_left;
...
...
mace/kernels/opencl/cl/resize_bilinear.cl
浏览文件 @
33b049e6
#
include
<common.h>
#
include
<common.h>
__kernel
void
resize_bilinear_nocache
(
__read_only
image2d_t
input,
/*
[c%4
*
w
*
c/4,
h
*
b]
*/
__kernel
void
resize_bilinear_nocache
(
UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
__read_only
image2d_t
input,
/*
[c%4
*
w
*
c/4,
h
*
b]
*/
__write_only
image2d_t
output,
__write_only
image2d_t
output,
__private
const
float
height_scale,
__private
const
float
height_scale,
__private
const
float
width_scale,
__private
const
float
width_scale,
__private
const
int
in_height,
__private
const
int
in_height,
__private
const
int
in_width,
__private
const
int
in_width,
__private
const
int
out_height
)
{
__private
const
int
out_height
)
{
const
int
ch_blk
=
get_global_id
(
0
)
;
const
int
ch_blk
=
get_global_id
(
0
)
;
const
int
ch_blks
=
get_global_size
(
0
)
;
const
int
w
=
get_global_id
(
1
)
;
const
int
w
=
get_global_id
(
1
)
;
const
int
out_width
=
get_global_size
(
1
)
;
const
int
hb
=
get_global_id
(
2
)
;
const
int
hb
=
get_global_id
(
2
)
;
#
ifndef
NON_UNIFORM_WORK_GROUP
if
(
ch_blk
>=
global_size_dim0
|
| w >= global_size_dim1
|
|
hb
>=
global_size_dim2
)
{
return
;
}
const
int
ch_blks
=
global_size_dim0
;
const
int
out_width
=
global_size_dim1
;
#
else
const
int
ch_blks
=
get_global_size
(
0
)
;
const
int
out_width
=
get_global_size
(
1
)
;
#
endif
const
int
b
=
hb
/
out_height
;
const
int
b
=
hb
/
out_height
;
const
int
h
=
hb
%
out_height
;
const
int
h
=
hb
%
out_height
;
...
...
mace/kernels/opencl/cl/slice.cl
浏览文件 @
33b049e6
#
include
<common.h>
#
include
<common.h>
__kernel
void
slice
(
__read_only
image2d_t
input,
__kernel
void
slice
(
UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
__read_only
image2d_t
input,
__private
const
int
chan_blk_offset,
__private
const
int
chan_blk_offset,
__write_only
image2d_t
output
)
{
__write_only
image2d_t
output
)
{
const
int
chan_blk_idx
=
get_global_id
(
0
)
;
const
int
chan_blk_idx
=
get_global_id
(
0
)
;
const
int
width_idx
=
get_global_id
(
1
)
;
const
int
width_idx
=
get_global_id
(
1
)
;
const
int
width
=
get_global_size
(
1
)
;
const
int
hb_idx
=
get_global_id
(
2
)
;
const
int
hb_idx
=
get_global_id
(
2
)
;
#
ifndef
NON_UNIFORM_WORK_GROUP
if
(
chan_blk_idx
>=
global_size_dim0
|
| width_idx >= global_size_dim1
|
|
hb_idx
>=
global_size_dim2
)
{
return
;
}
const
int
width
=
global_size_dim1
;
#
else
const
int
width
=
get_global_size
(
1
)
;
#
endif
DATA_TYPE4
data
=
READ_IMAGET
(
input,
SAMPLER,
DATA_TYPE4
data
=
READ_IMAGET
(
input,
SAMPLER,
(
int2
)(
mad24
(
chan_blk_idx
+
chan_blk_offset,
(
int2
)(
mad24
(
chan_blk_idx
+
chan_blk_offset,
width,
width_idx
)
,
hb_idx
))
;
width,
width_idx
)
,
hb_idx
))
;
...
...
mace/kernels/opencl/cl/softmax.cl
浏览文件 @
33b049e6
#
include
<common.h>
#
include
<common.h>
__kernel
void
softmax
(
__read_only
image2d_t
input,
__kernel
void
softmax
(
UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
__read_only
image2d_t
input,
__private
const
int
channels,
__private
const
int
channels,
__private
const
int
remain_channels,
__private
const
int
remain_channels,
__write_only
image2d_t
output
)
{
__write_only
image2d_t
output
)
{
const
int
chan_blk_idx
=
get_global_id
(
0
)
;
const
int
chan_blk_idx
=
get_global_id
(
0
)
;
const
int
width_idx
=
get_global_id
(
1
)
;
const
int
width_idx
=
get_global_id
(
1
)
;
const
int
hb_idx
=
get_global_id
(
2
)
;
const
int
hb_idx
=
get_global_id
(
2
)
;
#
ifndef
NON_UNIFORM_WORK_GROUP
if
(
chan_blk_idx
>=
global_size_dim0
|
| width_idx >= global_size_dim1
|
|
hb_idx
>=
global_size_dim2
)
{
return
;
}
const
int
chan_blks
=
global_size_dim0
-
1
;
const
int
width
=
global_size_dim1
;
#
else
const
int
chan_blks
=
get_global_size
(
0
)
-
1
;
const
int
chan_blks
=
get_global_size
(
0
)
-
1
;
const
int
width
=
get_global_size
(
1
)
;
const
int
width
=
get_global_size
(
1
)
;
#
endif
int
pos
=
width_idx
;
int
pos
=
width_idx
;
DATA_TYPE
max_value
=
-FLT_MAX
;
DATA_TYPE
max_value
=
-FLT_MAX
;
...
...
mace/kernels/opencl/cl/space_to_batch.cl
浏览文件 @
33b049e6
#
include
<common.h>
#
include
<common.h>
__kernel
void
space_to_batch
(
__read_only
image2d_t
space_data,
__kernel
void
space_to_batch
(
UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
__read_only
image2d_t
space_data,
__write_only
image2d_t
batch_data,
__write_only
image2d_t
batch_data,
__private
const
int
block_height,
__private
const
int
block_height,
__private
const
int
block_width,
__private
const
int
block_width,
...
@@ -14,6 +16,13 @@ __kernel void space_to_batch(__read_only image2d_t space_data,
...
@@ -14,6 +16,13 @@ __kernel void space_to_batch(__read_only image2d_t space_data,
const
int
batch_w_idx
=
get_global_id
(
1
)
;
const
int
batch_w_idx
=
get_global_id
(
1
)
;
const
int
batch_hb_idx
=
get_global_id
(
2
)
;
const
int
batch_hb_idx
=
get_global_id
(
2
)
;
#
ifndef
NON_UNIFORM_WORK_GROUP
if
(
chan_idx
>=
global_size_dim0
|
| batch_w_idx >= global_size_dim1
|| batch_hb_idx >= global_size_dim2) {
return;
}
#endif
const int batch_b_idx = batch_hb_idx / batch_height;
const int batch_b_idx = batch_hb_idx / batch_height;
const int batch_h_idx = batch_hb_idx % batch_height;
const int batch_h_idx = batch_hb_idx % batch_height;
...
@@ -39,7 +48,9 @@ __kernel void space_to_batch(__read_only image2d_t space_data,
...
@@ -39,7 +48,9 @@ __kernel void space_to_batch(__read_only image2d_t space_data,
WRITE_IMAGET(batch_data, batch_coord, value);
WRITE_IMAGET(batch_data, batch_coord, value);
}
}
__kernel
void
batch_to_space
(
__read_only
image2d_t
batch_data,
__kernel void batch_to_space(
UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
__read_only image2d_t batch_data,
__write_only image2d_t space_data,
__write_only image2d_t space_data,
__private const int block_height,
__private const int block_height,
__private const int block_width,
__private const int block_width,
...
@@ -53,6 +64,13 @@ __kernel void batch_to_space(__read_only image2d_t batch_data,
...
@@ -53,6 +64,13 @@ __kernel void batch_to_space(__read_only image2d_t batch_data,
const int batch_w_idx = get_global_id(1);
const int batch_w_idx = get_global_id(1);
const int batch_hb_idx = get_global_id(2);
const int batch_hb_idx = get_global_id(2);
#ifndef NON_UNIFORM_WORK_GROUP
if (chan_idx >= global_size_dim0 || batch_w_idx >= global_size_dim1
|
|
batch_hb_idx
>=
global_size_dim2
)
{
return
;
}
#
endif
const
int
batch_b_idx
=
batch_hb_idx
/
batch_height
;
const
int
batch_b_idx
=
batch_hb_idx
/
batch_height
;
const
int
batch_h_idx
=
batch_hb_idx
%
batch_height
;
const
int
batch_h_idx
=
batch_hb_idx
%
batch_height
;
...
...
mace/kernels/opencl/cl/winograd_transform.cl
浏览文件 @
33b049e6
#
include
<common.h>
#
include
<common.h>
__kernel
void
winograd_transform_2x2
(
__read_only
image2d_t
input,
__kernel
void
winograd_transform_2x2
(
UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
__read_only
image2d_t
input,
__write_only
image2d_t
output,
__write_only
image2d_t
output,
__private
const
int
in_height,
__private
const
int
in_height,
__private
const
int
in_width,
__private
const
int
in_width,
...
@@ -11,7 +13,15 @@ __kernel void winograd_transform_2x2(__read_only image2d_t input,
...
@@ -11,7 +13,15 @@ __kernel void winograd_transform_2x2(__read_only image2d_t input,
__private
const
int
padding_left
)
{
__private
const
int
padding_left
)
{
int
out_width_idx
=
get_global_id
(
0
)
;
int
out_width_idx
=
get_global_id
(
0
)
;
int
chan_blk_idx
=
get_global_id
(
1
)
;
int
chan_blk_idx
=
get_global_id
(
1
)
;
#
ifndef
NON_UNIFORM_WORK_GROUP
if
(
out_width_idx
>=
global_size_dim0
|
| chan_blk_idx >= global_size_dim1) {
return;
}
const int chan_blk_size = global_size_dim1;
#else
const int chan_blk_size = get_global_size(1);
const int chan_blk_size = get_global_size(1);
#endif
const int batch_idx = out_width_idx / round_hw;
const int batch_idx = out_width_idx / round_hw;
const int t_idx = out_width_idx % round_hw;
const int t_idx = out_width_idx % round_hw;
...
@@ -106,7 +116,9 @@ __kernel void winograd_transform_2x2(__read_only image2d_t input,
...
@@ -106,7 +116,9 @@ __kernel void winograd_transform_2x2(__read_only image2d_t input,
}
}
}
}
__kernel void winograd_inverse_transform_2x2(__read_only image2d_t input,
__kernel void winograd_inverse_transform_2x2(
UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
__read_only image2d_t input,
#ifdef BIAS
#ifdef BIAS
__read_only image2d_t bias, /* cout%4 * cout/4 */
__read_only image2d_t bias, /* cout%4 * cout/4 */
#endif
#endif
...
@@ -118,7 +130,16 @@ __kernel void winograd_inverse_transform_2x2(__read_only image2d_t input,
...
@@ -118,7 +130,16 @@ __kernel void winograd_inverse_transform_2x2(__read_only image2d_t input,
__private const float relux_max_limit) {
__private const float relux_max_limit) {
const int width_idx = get_global_id(0);
const int width_idx = get_global_id(0);
const int height_idx = get_global_id(1);
const int height_idx = get_global_id(1);
#ifndef NON_UNIFORM_WORK_GROUP
if (width_idx >= global_size_dim0 || height_idx >= global_size_dim1) {
return;
}
const int out_channel = global_size_dim1;
#else
const int out_channel = get_global_size(1);
const int out_channel = get_global_size(1);
#endif
int width = width_idx;
int width = width_idx;
int height = height_idx;
int height = height_idx;
...
...
mace/kernels/opencl/concat.cc
浏览文件 @
33b049e6
...
@@ -17,19 +17,28 @@ static void Concat2(cl::Kernel *kernel,
...
@@ -17,19 +17,28 @@ static void Concat2(cl::Kernel *kernel,
const
DataType
dt
,
const
DataType
dt
,
std
::
vector
<
index_t
>
*
prev_input_shape
,
std
::
vector
<
index_t
>
*
prev_input_shape
,
Tensor
*
output
,
Tensor
*
output
,
StatsFuture
*
future
)
{
StatsFuture
*
future
,
uint32_t
*
kwg_size
)
{
const
index_t
batch
=
output
->
dim
(
0
);
const
index_t
batch
=
output
->
dim
(
0
);
const
index_t
height
=
output
->
dim
(
1
);
const
index_t
height
=
output
->
dim
(
1
);
const
index_t
width
=
output
->
dim
(
2
);
const
index_t
width
=
output
->
dim
(
2
);
const
index_t
channel
=
output
->
dim
(
3
);
const
index_t
channel
=
output
->
dim
(
3
);
const
int
channel_blk
=
RoundUpDiv4
(
channel
);
const
int
channel_blk
=
RoundUpDiv4
(
channel
);
const
uint32_t
gws
[
3
]
=
{
static_cast
<
uint32_t
>
(
channel_blk
),
static_cast
<
uint32_t
>
(
width
),
static_cast
<
uint32_t
>
(
batch
*
height
),
};
auto
runtime
=
OpenCLRuntime
::
Global
();
if
(
kernel
->
get
()
==
nullptr
)
{
if
(
kernel
->
get
()
==
nullptr
)
{
auto
runtime
=
OpenCLRuntime
::
Global
();
std
::
set
<
std
::
string
>
built_options
;
std
::
set
<
std
::
string
>
built_options
;
std
::
string
kernel_name
=
MACE_OBFUSCATE_SYMBOL
(
"concat_channel"
);
std
::
string
kernel_name
=
MACE_OBFUSCATE_SYMBOL
(
"concat_channel"
);
built_options
.
emplace
(
"-Dconcat_channel="
+
kernel_name
);
built_options
.
emplace
(
"-Dconcat_channel="
+
kernel_name
);
if
(
runtime
->
IsNonUniformWorkgroupsSupported
())
{
built_options
.
emplace
(
"-DNON_UNIFORM_WORK_GROUP"
);
}
if
(
input0
->
dtype
()
==
output
->
dtype
())
{
if
(
input0
->
dtype
()
==
output
->
dtype
())
{
built_options
.
emplace
(
"-DDATA_TYPE="
+
DtToCLDt
(
dt
));
built_options
.
emplace
(
"-DDATA_TYPE="
+
DtToCLDt
(
dt
));
built_options
.
emplace
(
"-DCMD_DATA_TYPE="
+
DtToCLCMDDt
(
dt
));
built_options
.
emplace
(
"-DCMD_DATA_TYPE="
+
DtToCLCMDDt
(
dt
));
...
@@ -41,9 +50,17 @@ static void Concat2(cl::Kernel *kernel,
...
@@ -41,9 +50,17 @@ static void Concat2(cl::Kernel *kernel,
built_options
.
emplace
(
"-DDIVISIBLE_FOUR"
);
built_options
.
emplace
(
"-DDIVISIBLE_FOUR"
);
}
}
*
kernel
=
runtime
->
BuildKernel
(
"concat"
,
kernel_name
,
built_options
);
*
kernel
=
runtime
->
BuildKernel
(
"concat"
,
kernel_name
,
built_options
);
*
kwg_size
=
static_cast
<
uint32_t
>
(
runtime
->
GetKernelMaxWorkGroupSize
(
*
kernel
));
}
}
if
(
!
IsVecEqual
(
*
prev_input_shape
,
input0
->
shape
()))
{
if
(
!
IsVecEqual
(
*
prev_input_shape
,
input0
->
shape
()))
{
uint32_t
idx
=
0
;
uint32_t
idx
=
0
;
if
(
!
runtime
->
IsNonUniformWorkgroupsSupported
())
{
kernel
->
setArg
(
idx
++
,
gws
[
0
]);
kernel
->
setArg
(
idx
++
,
gws
[
1
]);
kernel
->
setArg
(
idx
++
,
gws
[
2
]);
}
kernel
->
setArg
(
idx
++
,
kernel
->
setArg
(
idx
++
,
*
(
static_cast
<
const
cl
::
Image2D
*>
(
input0
->
opencl_image
())));
*
(
static_cast
<
const
cl
::
Image2D
*>
(
input0
->
opencl_image
())));
kernel
->
setArg
(
idx
++
,
kernel
->
setArg
(
idx
++
,
...
@@ -51,14 +68,11 @@ static void Concat2(cl::Kernel *kernel,
...
@@ -51,14 +68,11 @@ static void Concat2(cl::Kernel *kernel,
kernel
->
setArg
(
idx
++
,
static_cast
<
int32_t
>
(
input0
->
dim
(
3
)));
kernel
->
setArg
(
idx
++
,
static_cast
<
int32_t
>
(
input0
->
dim
(
3
)));
kernel
->
setArg
(
idx
++
,
kernel
->
setArg
(
idx
++
,
*
(
static_cast
<
cl
::
Image2D
*>
(
output
->
opencl_image
())));
*
(
static_cast
<
cl
::
Image2D
*>
(
output
->
opencl_image
())));
*
prev_input_shape
=
input0
->
shape
();
*
prev_input_shape
=
input0
->
shape
();
}
}
const
uint32_t
gws
[
3
]
=
{
const
std
::
vector
<
uint32_t
>
lws
=
{
8
,
*
kwg_size
/
64
,
8
,
1
};
static_cast
<
uint32_t
>
(
channel_blk
),
static_cast
<
uint32_t
>
(
width
),
static_cast
<
uint32_t
>
(
batch
*
height
),
};
const
std
::
vector
<
uint32_t
>
lws
=
{
8
,
16
,
8
,
1
};
std
::
stringstream
ss
;
std
::
stringstream
ss
;
ss
<<
"concat_opencl_kernel_"
<<
output
->
dim
(
0
)
<<
"_"
<<
output
->
dim
(
1
)
ss
<<
"concat_opencl_kernel_"
<<
output
->
dim
(
0
)
<<
"_"
<<
output
->
dim
(
1
)
<<
"_"
<<
output
->
dim
(
2
)
<<
"_"
<<
output
->
dim
(
3
);
<<
"_"
<<
output
->
dim
(
2
)
<<
"_"
<<
output
->
dim
(
3
);
...
@@ -69,38 +83,51 @@ static void ConcatN(cl::Kernel *kernel,
...
@@ -69,38 +83,51 @@ static void ConcatN(cl::Kernel *kernel,
const
std
::
vector
<
const
Tensor
*>
&
input_list
,
const
std
::
vector
<
const
Tensor
*>
&
input_list
,
const
DataType
dt
,
const
DataType
dt
,
Tensor
*
output
,
Tensor
*
output
,
StatsFuture
*
future
)
{
StatsFuture
*
future
,
uint32_t
*
kwg_size
)
{
const
index_t
batch
=
output
->
dim
(
0
);
const
index_t
batch
=
output
->
dim
(
0
);
const
index_t
height
=
output
->
dim
(
1
);
const
index_t
height
=
output
->
dim
(
1
);
const
index_t
width
=
output
->
dim
(
2
);
const
index_t
width
=
output
->
dim
(
2
);
const
index_t
channel
=
output
->
dim
(
3
);
const
index_t
channel
=
output
->
dim
(
3
);
auto
runtime
=
OpenCLRuntime
::
Global
();
if
(
kernel
->
get
()
==
nullptr
)
{
if
(
kernel
->
get
()
==
nullptr
)
{
auto
runtime
=
OpenCLRuntime
::
Global
();
std
::
set
<
std
::
string
>
built_options
;
std
::
set
<
std
::
string
>
built_options
;
std
::
string
kernel_name
=
MACE_OBFUSCATE_SYMBOL
(
"concat_channel_multi"
);
std
::
string
kernel_name
=
MACE_OBFUSCATE_SYMBOL
(
"concat_channel_multi"
);
built_options
.
emplace
(
"-Dconcat_channel_multi="
+
kernel_name
);
built_options
.
emplace
(
"-Dconcat_channel_multi="
+
kernel_name
);
built_options
.
emplace
(
"-DDATA_TYPE="
+
DtToCLDt
(
dt
));
built_options
.
emplace
(
"-DDATA_TYPE="
+
DtToCLDt
(
dt
));
built_options
.
emplace
(
"-DCMD_DATA_TYPE="
+
DtToCLCMDDt
(
dt
));
built_options
.
emplace
(
"-DCMD_DATA_TYPE="
+
DtToCLCMDDt
(
dt
));
if
(
runtime
->
IsNonUniformWorkgroupsSupported
())
{
built_options
.
emplace
(
"-DNON_UNIFORM_WORK_GROUP"
);
}
*
kernel
=
runtime
->
BuildKernel
(
"concat"
,
kernel_name
,
built_options
);
*
kernel
=
runtime
->
BuildKernel
(
"concat"
,
kernel_name
,
built_options
);
*
kwg_size
=
static_cast
<
uint32_t
>
(
runtime
->
GetKernelMaxWorkGroupSize
(
*
kernel
));
}
}
const
int
inputs_count
=
input_list
.
size
();
const
int
inputs_count
=
input_list
.
size
();
index_t
chan_blk_offset
=
0
;
index_t
chan_blk_offset
=
0
;
for
(
int
i
=
0
;
i
<
inputs_count
;
++
i
)
{
for
(
int
i
=
0
;
i
<
inputs_count
;
++
i
)
{
const
Tensor
*
input
=
input_list
[
i
];
const
Tensor
*
input
=
input_list
[
i
];
index_t
input_channel_blk
=
input
->
dim
(
3
)
/
4
;
const
uint32_t
gws
[
3
]
=
{
static_cast
<
uint32_t
>
(
input_channel_blk
),
static_cast
<
uint32_t
>
(
width
),
static_cast
<
uint32_t
>
(
batch
*
height
),
};
uint32_t
idx
=
0
;
uint32_t
idx
=
0
;
if
(
!
runtime
->
IsNonUniformWorkgroupsSupported
())
{
kernel
->
setArg
(
idx
++
,
gws
[
0
]);
kernel
->
setArg
(
idx
++
,
gws
[
1
]);
kernel
->
setArg
(
idx
++
,
gws
[
2
]);
}
kernel
->
setArg
(
idx
++
,
*
(
input
->
opencl_image
()));
kernel
->
setArg
(
idx
++
,
*
(
input
->
opencl_image
()));
kernel
->
setArg
(
idx
++
,
static_cast
<
int32_t
>
(
chan_blk_offset
));
kernel
->
setArg
(
idx
++
,
static_cast
<
int32_t
>
(
chan_blk_offset
));
kernel
->
setArg
(
idx
++
,
*
(
output
->
opencl_image
()));
kernel
->
setArg
(
idx
++
,
*
(
output
->
opencl_image
()));
index_t
input_channel_blk
=
input
->
dim
(
3
)
/
4
;
chan_blk_offset
+=
input_channel_blk
;
chan_blk_offset
+=
input_channel_blk
;
const
uint32_t
gws
[
3
]
=
{
const
std
::
vector
<
uint32_t
>
lws
=
{
8
,
*
kwg_size
/
64
,
8
,
1
};
static_cast
<
uint32_t
>
(
input_channel_blk
),
static_cast
<
uint32_t
>
(
width
),
static_cast
<
uint32_t
>
(
batch
*
height
),
};
const
std
::
vector
<
uint32_t
>
lws
=
{
8
,
16
,
8
,
1
};
std
::
stringstream
ss
;
std
::
stringstream
ss
;
ss
<<
"concat_n_opencl_kernel_"
<<
input_channel_blk
<<
"_"
<<
width
<<
"_"
ss
<<
"concat_n_opencl_kernel_"
<<
input_channel_blk
<<
"_"
<<
width
<<
"_"
<<
batch
*
height
;
<<
batch
*
height
;
...
@@ -145,11 +172,12 @@ void ConcatFunctor<DeviceType::OPENCL, T>::operator()(
...
@@ -145,11 +172,12 @@ void ConcatFunctor<DeviceType::OPENCL, T>::operator()(
switch
(
inputs_count
)
{
switch
(
inputs_count
)
{
case
2
:
case
2
:
Concat2
(
&
kernel_
,
input_list
[
0
],
input_list
[
1
],
DataTypeToEnum
<
T
>::
value
,
Concat2
(
&
kernel_
,
input_list
[
0
],
input_list
[
1
],
DataTypeToEnum
<
T
>::
value
,
&
input_shape_
,
output
,
future
);
&
input_shape_
,
output
,
future
,
&
kwg_size_
);
break
;
break
;
default:
default:
if
(
divisible_four
)
{
if
(
divisible_four
)
{
ConcatN
(
&
kernel_
,
input_list
,
DataTypeToEnum
<
T
>::
value
,
output
,
future
);
ConcatN
(
&
kernel_
,
input_list
,
DataTypeToEnum
<
T
>::
value
,
output
,
future
,
&
kwg_size_
);
}
else
{
}
else
{
MACE_NOT_IMPLEMENTED
;
MACE_NOT_IMPLEMENTED
;
}
}
...
...
mace/kernels/opencl/conv_2d_opencl.cc
浏览文件 @
33b049e6
...
@@ -20,7 +20,8 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
...
@@ -20,7 +20,8 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
const
DataType
dt
,
const
DataType
dt
,
std
::
vector
<
index_t
>
*
prev_input_shape
,
std
::
vector
<
index_t
>
*
prev_input_shape
,
Tensor
*
output
,
Tensor
*
output
,
StatsFuture
*
future
);
StatsFuture
*
future
,
uint32_t
*
kwg_size
);
extern
void
Conv2dOpenclK3x3
(
cl
::
Kernel
*
kernel
,
extern
void
Conv2dOpenclK3x3
(
cl
::
Kernel
*
kernel
,
const
Tensor
*
input
,
const
Tensor
*
input
,
...
@@ -34,7 +35,8 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
...
@@ -34,7 +35,8 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
const
DataType
dt
,
const
DataType
dt
,
std
::
vector
<
index_t
>
*
prev_input_shape
,
std
::
vector
<
index_t
>
*
prev_input_shape
,
Tensor
*
output
,
Tensor
*
output
,
StatsFuture
*
future
);
StatsFuture
*
future
,
uint32_t
*
kwg_size
);
extern
void
Conv2dOpencl
(
cl
::
Kernel
*
kernel
,
extern
void
Conv2dOpencl
(
cl
::
Kernel
*
kernel
,
const
Tensor
*
input
,
const
Tensor
*
input
,
...
@@ -48,7 +50,8 @@ extern void Conv2dOpencl(cl::Kernel *kernel,
...
@@ -48,7 +50,8 @@ extern void Conv2dOpencl(cl::Kernel *kernel,
const
DataType
dt
,
const
DataType
dt
,
std
::
vector
<
index_t
>
*
prev_input_shape
,
std
::
vector
<
index_t
>
*
prev_input_shape
,
Tensor
*
output
,
Tensor
*
output
,
StatsFuture
*
future
);
StatsFuture
*
future
,
uint32_t
*
kwg_size
);
template
<
typename
T
>
template
<
typename
T
>
void
Conv2dFunctor
<
DeviceType
::
OPENCL
,
T
>::
operator
()(
const
Tensor
*
input
,
void
Conv2dFunctor
<
DeviceType
::
OPENCL
,
T
>::
operator
()(
const
Tensor
*
input
,
...
@@ -61,7 +64,8 @@ void Conv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
...
@@ -61,7 +64,8 @@ void Conv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
const
Tensor
*
bias
,
const
int
stride
,
const
int
*
padding
,
const
Tensor
*
bias
,
const
int
stride
,
const
int
*
padding
,
const
int
*
dilations
,
const
ActivationType
activation
,
const
int
*
dilations
,
const
ActivationType
activation
,
const
float
relux_max_limit
,
const
DataType
dt
,
const
float
relux_max_limit
,
const
DataType
dt
,
std
::
vector
<
index_t
>
*
input_shape
,
Tensor
*
output
,
StatsFuture
*
future
);
std
::
vector
<
index_t
>
*
input_shape
,
Tensor
*
output
,
StatsFuture
*
future
,
uint32_t
*
kwg_size
);
// Selection matrix: kernel_size x stride_size
// Selection matrix: kernel_size x stride_size
static
const
Conv2dOpenclFunction
selector
[
5
]
=
{
static
const
Conv2dOpenclFunction
selector
[
5
]
=
{
Conv2dOpenclK1x1
,
nullptr
,
Conv2dOpenclK3x3
,
nullptr
,
nullptr
};
Conv2dOpenclK1x1
,
nullptr
,
Conv2dOpenclK3x3
,
nullptr
,
nullptr
};
...
@@ -101,11 +105,13 @@ void Conv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
...
@@ -101,11 +105,13 @@ void Conv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
auto
conv2d_func
=
selector
[
kernel_h
-
1
];
auto
conv2d_func
=
selector
[
kernel_h
-
1
];
conv2d_func
(
&
kernel_
,
input
,
filter
,
bias
,
strides_
[
0
],
paddings
.
data
(),
conv2d_func
(
&
kernel_
,
input
,
filter
,
bias
,
strides_
[
0
],
paddings
.
data
(),
dilations_
,
activation_
,
relux_max_limit_
,
dilations_
,
activation_
,
relux_max_limit_
,
DataTypeToEnum
<
T
>::
value
,
&
input_shape_
,
output
,
future
);
DataTypeToEnum
<
T
>::
value
,
&
input_shape_
,
output
,
future
,
&
kwg_size_
);
}
else
{
}
else
{
Conv2dOpencl
(
&
kernel_
,
input
,
filter
,
bias
,
strides_
[
0
],
paddings
.
data
(),
Conv2dOpencl
(
&
kernel_
,
input
,
filter
,
bias
,
strides_
[
0
],
paddings
.
data
(),
dilations_
,
activation_
,
relux_max_limit_
,
dilations_
,
activation_
,
relux_max_limit_
,
DataTypeToEnum
<
T
>::
value
,
&
input_shape_
,
output
,
future
);
DataTypeToEnum
<
T
>::
value
,
&
input_shape_
,
output
,
future
,
&
kwg_size_
);
}
}
}
}
...
...
mace/kernels/opencl/conv_2d_opencl_1x1.cc
浏览文件 @
33b049e6
...
@@ -22,7 +22,8 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
...
@@ -22,7 +22,8 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
const
DataType
dt
,
const
DataType
dt
,
std
::
vector
<
index_t
>
*
prev_input_shape
,
std
::
vector
<
index_t
>
*
prev_input_shape
,
Tensor
*
output
,
Tensor
*
output
,
StatsFuture
*
future
)
{
StatsFuture
*
future
,
uint32_t
*
kwg_size
)
{
const
index_t
batch
=
output
->
dim
(
0
);
const
index_t
batch
=
output
->
dim
(
0
);
const
index_t
height
=
output
->
dim
(
1
);
const
index_t
height
=
output
->
dim
(
1
);
const
index_t
width
=
output
->
dim
(
2
);
const
index_t
width
=
output
->
dim
(
2
);
...
@@ -36,6 +37,8 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
...
@@ -36,6 +37,8 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
const
index_t
width_blocks
=
RoundUpDiv4
(
width
);
const
index_t
width_blocks
=
RoundUpDiv4
(
width
);
const
index_t
input_channel_blocks
=
RoundUpDiv4
(
input_channels
);
const
index_t
input_channel_blocks
=
RoundUpDiv4
(
input_channels
);
auto
runtime
=
OpenCLRuntime
::
Global
();
if
(
kernel
->
get
()
==
nullptr
)
{
if
(
kernel
->
get
()
==
nullptr
)
{
MACE_CHECK
(
input_batch
==
batch
);
MACE_CHECK
(
input_batch
==
batch
);
...
@@ -44,6 +47,9 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
...
@@ -44,6 +47,9 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
built_options
.
emplace
(
"-Dconv_2d_1x1="
+
kernel_name
);
built_options
.
emplace
(
"-Dconv_2d_1x1="
+
kernel_name
);
built_options
.
emplace
(
"-DDATA_TYPE="
+
DtToUpstreamCLDt
(
dt
));
built_options
.
emplace
(
"-DDATA_TYPE="
+
DtToUpstreamCLDt
(
dt
));
built_options
.
emplace
(
"-DCMD_DATA_TYPE="
+
DtToUpstreamCLCMDDt
(
dt
));
built_options
.
emplace
(
"-DCMD_DATA_TYPE="
+
DtToUpstreamCLCMDDt
(
dt
));
if
(
runtime
->
IsNonUniformWorkgroupsSupported
())
{
built_options
.
emplace
(
"-DNON_UNIFORM_WORK_GROUP"
);
}
if
(
bias
!=
nullptr
)
{
if
(
bias
!=
nullptr
)
{
built_options
.
emplace
(
"-DBIAS"
);
built_options
.
emplace
(
"-DBIAS"
);
}
}
...
@@ -66,11 +72,23 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
...
@@ -66,11 +72,23 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
LOG
(
FATAL
)
<<
"Unknown activation type: "
<<
activation
;
LOG
(
FATAL
)
<<
"Unknown activation type: "
<<
activation
;
}
}
auto
runtime
=
OpenCLRuntime
::
Global
();
*
kernel
=
runtime
->
BuildKernel
(
"conv_2d_1x1"
,
kernel_name
,
built_options
);
*
kernel
=
runtime
->
BuildKernel
(
"conv_2d_1x1"
,
kernel_name
,
built_options
);
*
kwg_size
=
static_cast
<
uint32_t
>
(
runtime
->
GetKernelMaxWorkGroupSize
(
*
kernel
));
}
}
const
uint32_t
gws
[
3
]
=
{
static_cast
<
uint32_t
>
(
channel_blocks
),
static_cast
<
uint32_t
>
(
width_blocks
),
static_cast
<
uint32_t
>
(
height
*
batch
)};
if
(
!
IsVecEqual
(
*
prev_input_shape
,
input
->
shape
()))
{
if
(
!
IsVecEqual
(
*
prev_input_shape
,
input
->
shape
()))
{
uint32_t
idx
=
0
;
uint32_t
idx
=
0
;
if
(
!
runtime
->
IsNonUniformWorkgroupsSupported
())
{
kernel
->
setArg
(
idx
++
,
gws
[
0
]);
kernel
->
setArg
(
idx
++
,
gws
[
1
]);
kernel
->
setArg
(
idx
++
,
gws
[
2
]);
}
kernel
->
setArg
(
idx
++
,
*
(
input
->
opencl_image
()));
kernel
->
setArg
(
idx
++
,
*
(
input
->
opencl_image
()));
kernel
->
setArg
(
idx
++
,
*
(
filter
->
opencl_image
()));
kernel
->
setArg
(
idx
++
,
*
(
filter
->
opencl_image
()));
if
(
bias
!=
nullptr
)
{
if
(
bias
!=
nullptr
)
{
...
@@ -89,10 +107,7 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
...
@@ -89,10 +107,7 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
*
prev_input_shape
=
input
->
shape
();
*
prev_input_shape
=
input
->
shape
();
}
}
const
uint32_t
gws
[
3
]
=
{
static_cast
<
uint32_t
>
(
channel_blocks
),
const
std
::
vector
<
uint32_t
>
lws
=
{
8
,
*
kwg_size
/
64
,
8
,
1
};
static_cast
<
uint32_t
>
(
width_blocks
),
static_cast
<
uint32_t
>
(
height
*
batch
)};
const
std
::
vector
<
uint32_t
>
lws
=
{
8
,
15
,
8
,
1
};
std
::
string
tuning_key
=
std
::
string
tuning_key
=
Concat
(
"conv2d_1x1_opencl_kernel_"
,
activation
,
output
->
dim
(
0
),
Concat
(
"conv2d_1x1_opencl_kernel_"
,
activation
,
output
->
dim
(
0
),
output
->
dim
(
1
),
output
->
dim
(
2
),
output
->
dim
(
3
));
output
->
dim
(
1
),
output
->
dim
(
2
),
output
->
dim
(
3
));
...
...
mace/kernels/opencl/conv_2d_opencl_3x3.cc
浏览文件 @
33b049e6
...
@@ -24,7 +24,8 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
...
@@ -24,7 +24,8 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
const
DataType
dt
,
const
DataType
dt
,
std
::
vector
<
index_t
>
*
prev_input_shape
,
std
::
vector
<
index_t
>
*
prev_input_shape
,
Tensor
*
output
,
Tensor
*
output
,
StatsFuture
*
future
)
{
StatsFuture
*
future
,
uint32_t
*
kwg_size
)
{
const
index_t
batch
=
output
->
dim
(
0
);
const
index_t
batch
=
output
->
dim
(
0
);
const
index_t
height
=
output
->
dim
(
1
);
const
index_t
height
=
output
->
dim
(
1
);
const
index_t
width
=
output
->
dim
(
2
);
const
index_t
width
=
output
->
dim
(
2
);
...
@@ -35,12 +36,17 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
...
@@ -35,12 +36,17 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
const
index_t
input_channel_blocks
=
RoundUpDiv4
(
input_channels
);
const
index_t
input_channel_blocks
=
RoundUpDiv4
(
input_channels
);
const
index_t
width_blocks
=
RoundUpDiv
<
index_t
,
5
>
(
width
);
const
index_t
width_blocks
=
RoundUpDiv
<
index_t
,
5
>
(
width
);
auto
runtime
=
OpenCLRuntime
::
Global
();
if
(
kernel
->
get
()
==
nullptr
)
{
if
(
kernel
->
get
()
==
nullptr
)
{
std
::
set
<
std
::
string
>
built_options
;
std
::
set
<
std
::
string
>
built_options
;
std
::
string
kernel_name
=
MACE_OBFUSCATE_SYMBOL
(
"conv_2d_3x3"
);
std
::
string
kernel_name
=
MACE_OBFUSCATE_SYMBOL
(
"conv_2d_3x3"
);
built_options
.
emplace
(
"-Dconv_2d_3x3="
+
kernel_name
);
built_options
.
emplace
(
"-Dconv_2d_3x3="
+
kernel_name
);
built_options
.
emplace
(
"-DDATA_TYPE="
+
DtToUpstreamCLDt
(
dt
));
built_options
.
emplace
(
"-DDATA_TYPE="
+
DtToUpstreamCLDt
(
dt
));
built_options
.
emplace
(
"-DCMD_DATA_TYPE="
+
DtToUpstreamCLCMDDt
(
dt
));
built_options
.
emplace
(
"-DCMD_DATA_TYPE="
+
DtToUpstreamCLCMDDt
(
dt
));
if
(
runtime
->
IsNonUniformWorkgroupsSupported
())
{
built_options
.
emplace
(
"-DNON_UNIFORM_WORK_GROUP"
);
}
built_options
.
emplace
(
bias
!=
nullptr
?
"-DBIAS"
:
""
);
built_options
.
emplace
(
bias
!=
nullptr
?
"-DBIAS"
:
""
);
switch
(
activation
)
{
switch
(
activation
)
{
case
NOOP
:
case
NOOP
:
...
@@ -61,11 +67,23 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
...
@@ -61,11 +67,23 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
LOG
(
FATAL
)
<<
"Unknown activation type: "
<<
activation
;
LOG
(
FATAL
)
<<
"Unknown activation type: "
<<
activation
;
}
}
auto
runtime
=
OpenCLRuntime
::
Global
();
*
kernel
=
runtime
->
BuildKernel
(
"conv_2d_3x3"
,
kernel_name
,
built_options
);
*
kernel
=
runtime
->
BuildKernel
(
"conv_2d_3x3"
,
kernel_name
,
built_options
);
*
kwg_size
=
static_cast
<
uint32_t
>
(
runtime
->
GetKernelMaxWorkGroupSize
(
*
kernel
));
}
}
const
uint32_t
gws
[
3
]
=
{
static_cast
<
uint32_t
>
(
channel_blocks
),
static_cast
<
uint32_t
>
(
width_blocks
),
static_cast
<
uint32_t
>
(
height
*
batch
)};
if
(
!
IsVecEqual
(
*
prev_input_shape
,
input
->
shape
()))
{
if
(
!
IsVecEqual
(
*
prev_input_shape
,
input
->
shape
()))
{
uint32_t
idx
=
0
;
uint32_t
idx
=
0
;
if
(
!
runtime
->
IsNonUniformWorkgroupsSupported
())
{
kernel
->
setArg
(
idx
++
,
gws
[
0
]);
kernel
->
setArg
(
idx
++
,
gws
[
1
]);
kernel
->
setArg
(
idx
++
,
gws
[
2
]);
}
kernel
->
setArg
(
idx
++
,
*
(
input
->
opencl_image
()));
kernel
->
setArg
(
idx
++
,
*
(
input
->
opencl_image
()));
kernel
->
setArg
(
idx
++
,
*
(
filter
->
opencl_image
()));
kernel
->
setArg
(
idx
++
,
*
(
filter
->
opencl_image
()));
if
(
bias
!=
nullptr
)
{
if
(
bias
!=
nullptr
)
{
...
@@ -87,10 +105,7 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
...
@@ -87,10 +105,7 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
*
prev_input_shape
=
input
->
shape
();
*
prev_input_shape
=
input
->
shape
();
}
}
const
uint32_t
gws
[
3
]
=
{
static_cast
<
uint32_t
>
(
channel_blocks
),
const
std
::
vector
<
uint32_t
>
lws
=
{
4
,
*
kwg_size
/
32
,
8
,
1
};
static_cast
<
uint32_t
>
(
width_blocks
),
static_cast
<
uint32_t
>
(
height
*
batch
)};
const
std
::
vector
<
uint32_t
>
lws
=
{
4
,
15
,
8
,
1
};
std
::
string
tuning_key
=
std
::
string
tuning_key
=
Concat
(
"conv2d_3x3_opencl_kernel_"
,
activation
,
output
->
dim
(
0
),
Concat
(
"conv2d_3x3_opencl_kernel_"
,
activation
,
output
->
dim
(
0
),
output
->
dim
(
1
),
output
->
dim
(
2
),
output
->
dim
(
3
));
output
->
dim
(
1
),
output
->
dim
(
2
),
output
->
dim
(
3
));
...
...
mace/kernels/opencl/conv_2d_opencl_general.cc
浏览文件 @
33b049e6
...
@@ -24,7 +24,8 @@ extern void Conv2dOpencl(cl::Kernel *kernel,
...
@@ -24,7 +24,8 @@ extern void Conv2dOpencl(cl::Kernel *kernel,
const
DataType
dt
,
const
DataType
dt
,
std
::
vector
<
index_t
>
*
prev_input_shape
,
std
::
vector
<
index_t
>
*
prev_input_shape
,
Tensor
*
output
,
Tensor
*
output
,
StatsFuture
*
future
)
{
StatsFuture
*
future
,
uint32_t
*
kwg_size
)
{
const
index_t
batch
=
output
->
dim
(
0
);
const
index_t
batch
=
output
->
dim
(
0
);
const
index_t
height
=
output
->
dim
(
1
);
const
index_t
height
=
output
->
dim
(
1
);
const
index_t
width
=
output
->
dim
(
2
);
const
index_t
width
=
output
->
dim
(
2
);
...
@@ -35,12 +36,17 @@ extern void Conv2dOpencl(cl::Kernel *kernel,
...
@@ -35,12 +36,17 @@ extern void Conv2dOpencl(cl::Kernel *kernel,
const
index_t
input_channel_blocks
=
RoundUpDiv4
(
input_channels
);
const
index_t
input_channel_blocks
=
RoundUpDiv4
(
input_channels
);
const
index_t
width_blocks
=
RoundUpDiv4
(
width
);
const
index_t
width_blocks
=
RoundUpDiv4
(
width
);
auto
runtime
=
OpenCLRuntime
::
Global
();
if
(
kernel
->
get
()
==
nullptr
)
{
if
(
kernel
->
get
()
==
nullptr
)
{
std
::
set
<
std
::
string
>
built_options
;
std
::
set
<
std
::
string
>
built_options
;
std
::
string
kernel_name
=
MACE_OBFUSCATE_SYMBOL
(
"conv_2d"
);
std
::
string
kernel_name
=
MACE_OBFUSCATE_SYMBOL
(
"conv_2d"
);
built_options
.
emplace
(
"-Dconv_2d="
+
kernel_name
);
built_options
.
emplace
(
"-Dconv_2d="
+
kernel_name
);
built_options
.
emplace
(
"-DDATA_TYPE="
+
DtToUpstreamCLDt
(
dt
));
built_options
.
emplace
(
"-DDATA_TYPE="
+
DtToUpstreamCLDt
(
dt
));
built_options
.
emplace
(
"-DCMD_DATA_TYPE="
+
DtToUpstreamCLCMDDt
(
dt
));
built_options
.
emplace
(
"-DCMD_DATA_TYPE="
+
DtToUpstreamCLCMDDt
(
dt
));
if
(
runtime
->
IsNonUniformWorkgroupsSupported
())
{
built_options
.
emplace
(
"-DNON_UNIFORM_WORK_GROUP"
);
}
built_options
.
emplace
(
bias
!=
nullptr
?
"-DBIAS"
:
""
);
built_options
.
emplace
(
bias
!=
nullptr
?
"-DBIAS"
:
""
);
switch
(
activation
)
{
switch
(
activation
)
{
case
NOOP
:
case
NOOP
:
...
@@ -61,11 +67,23 @@ extern void Conv2dOpencl(cl::Kernel *kernel,
...
@@ -61,11 +67,23 @@ extern void Conv2dOpencl(cl::Kernel *kernel,
LOG
(
FATAL
)
<<
"Unknown activation type: "
<<
activation
;
LOG
(
FATAL
)
<<
"Unknown activation type: "
<<
activation
;
}
}
auto
runtime
=
OpenCLRuntime
::
Global
();
*
kernel
=
runtime
->
BuildKernel
(
"conv_2d"
,
kernel_name
,
built_options
);
*
kernel
=
runtime
->
BuildKernel
(
"conv_2d"
,
kernel_name
,
built_options
);
*
kwg_size
=
static_cast
<
uint32_t
>
(
runtime
->
GetKernelMaxWorkGroupSize
(
*
kernel
));
}
}
const
uint32_t
gws
[
3
]
=
{
static_cast
<
uint32_t
>
(
channel_blocks
),
static_cast
<
uint32_t
>
(
width_blocks
),
static_cast
<
uint32_t
>
(
height
*
batch
)};
if
(
!
IsVecEqual
(
*
prev_input_shape
,
input
->
shape
()))
{
if
(
!
IsVecEqual
(
*
prev_input_shape
,
input
->
shape
()))
{
uint32_t
idx
=
0
;
uint32_t
idx
=
0
;
if
(
!
runtime
->
IsNonUniformWorkgroupsSupported
())
{
kernel
->
setArg
(
idx
++
,
gws
[
0
]);
kernel
->
setArg
(
idx
++
,
gws
[
1
]);
kernel
->
setArg
(
idx
++
,
gws
[
2
]);
}
kernel
->
setArg
(
idx
++
,
*
(
input
->
opencl_image
()));
kernel
->
setArg
(
idx
++
,
*
(
input
->
opencl_image
()));
kernel
->
setArg
(
idx
++
,
*
(
filter
->
opencl_image
()));
kernel
->
setArg
(
idx
++
,
*
(
filter
->
opencl_image
()));
if
(
bias
!=
nullptr
)
{
if
(
bias
!=
nullptr
)
{
...
@@ -89,10 +107,7 @@ extern void Conv2dOpencl(cl::Kernel *kernel,
...
@@ -89,10 +107,7 @@ extern void Conv2dOpencl(cl::Kernel *kernel,
*
prev_input_shape
=
input
->
shape
();
*
prev_input_shape
=
input
->
shape
();
}
}
const
uint32_t
gws
[
3
]
=
{
static_cast
<
uint32_t
>
(
channel_blocks
),
const
std
::
vector
<
uint32_t
>
lws
=
{
8
,
*
kwg_size
/
64
,
8
,
1
};
static_cast
<
uint32_t
>
(
width_blocks
),
static_cast
<
uint32_t
>
(
height
*
batch
)};
const
std
::
vector
<
uint32_t
>
lws
=
{
8
,
16
,
8
,
1
};
std
::
string
tuning_key
=
std
::
string
tuning_key
=
Concat
(
"conv2d_general_opencl_kernel_"
,
activation
,
output
->
dim
(
0
),
Concat
(
"conv2d_general_opencl_kernel_"
,
activation
,
output
->
dim
(
0
),
output
->
dim
(
1
),
output
->
dim
(
2
),
output
->
dim
(
3
));
output
->
dim
(
1
),
output
->
dim
(
2
),
output
->
dim
(
3
));
...
...
mace/kernels/opencl/depth_to_space_opencl.cc
浏览文件 @
33b049e6
...
@@ -45,8 +45,9 @@ void DepthToSpaceOpFunctor<DeviceType::OPENCL, T>::operator()(
...
@@ -45,8 +45,9 @@ void DepthToSpaceOpFunctor<DeviceType::OPENCL, T>::operator()(
CalImage2DShape
(
output_shape
,
BufferType
::
IN_OUT_CHANNEL
,
&
image_shape
);
CalImage2DShape
(
output_shape
,
BufferType
::
IN_OUT_CHANNEL
,
&
image_shape
);
output
->
ResizeImage
(
output_shape
,
image_shape
);
output
->
ResizeImage
(
output_shape
,
image_shape
);
auto
runtime
=
OpenCLRuntime
::
Global
();
if
(
kernel_
.
get
()
==
nullptr
)
{
if
(
kernel_
.
get
()
==
nullptr
)
{
auto
runtime
=
OpenCLRuntime
::
Global
();
std
::
set
<
std
::
string
>
built_options
;
std
::
set
<
std
::
string
>
built_options
;
std
::
string
obfuscated_kernel_name
=
MACE_OBFUSCATE_SYMBOL
(
kernel_name
);
std
::
string
obfuscated_kernel_name
=
MACE_OBFUSCATE_SYMBOL
(
kernel_name
);
std
::
stringstream
kernel_name_ss
;
std
::
stringstream
kernel_name_ss
;
...
@@ -55,38 +56,49 @@ void DepthToSpaceOpFunctor<DeviceType::OPENCL, T>::operator()(
...
@@ -55,38 +56,49 @@ void DepthToSpaceOpFunctor<DeviceType::OPENCL, T>::operator()(
auto
dt
=
DataTypeToEnum
<
T
>::
value
;
auto
dt
=
DataTypeToEnum
<
T
>::
value
;
built_options
.
emplace
(
"-DDATA_TYPE="
+
DtToUpstreamCLDt
(
dt
));
built_options
.
emplace
(
"-DDATA_TYPE="
+
DtToUpstreamCLDt
(
dt
));
built_options
.
emplace
(
"-DCMD_DATA_TYPE="
+
DtToUpstreamCLCMDDt
(
dt
));
built_options
.
emplace
(
"-DCMD_DATA_TYPE="
+
DtToUpstreamCLCMDDt
(
dt
));
if
(
runtime
->
IsNonUniformWorkgroupsSupported
())
{
built_options
.
emplace
(
"-DNON_UNIFORM_WORK_GROUP"
);
}
kernel_
=
kernel_
=
runtime
->
BuildKernel
(
"depth_to_space"
,
kernel_name
,
built_options
);
runtime
->
BuildKernel
(
"depth_to_space"
,
kernel_name
,
built_options
);
kwg_size_
=
static_cast
<
uint32_t
>
(
runtime
->
GetKernelMaxWorkGroupSize
(
kernel_
));
}
}
uint32_t
gws
[
3
];
std
::
stringstream
ss
;
if
(
!
IsVecEqual
(
input_shape_
,
input
->
shape
()))
{
if
(
!
IsVecEqual
(
input_shape_
,
input
->
shape
()))
{
if
(
d2s_
)
{
gws
[
0
]
=
static_cast
<
uint32_t
>
(
depth_blocks
);
gws
[
1
]
=
static_cast
<
uint32_t
>
(
output_width
);
gws
[
2
]
=
static_cast
<
uint32_t
>
(
output_height
*
batch
);
ss
<<
"depth_to_space_opencl_kernel_"
<<
output
->
dim
(
0
)
<<
"_"
<<
output
->
dim
(
1
)
<<
"_"
<<
output
->
dim
(
2
)
<<
"_"
<<
output
->
dim
(
3
);
}
else
{
gws
[
0
]
=
static_cast
<
uint32_t
>
(
depth_blocks
);
gws
[
1
]
=
static_cast
<
uint32_t
>
(
input_width
);
gws
[
2
]
=
static_cast
<
uint32_t
>
(
input_height
*
batch
);
ss
<<
"space_to_depth_opencl_kernel_"
<<
input
->
dim
(
0
)
<<
"_"
<<
input
->
dim
(
1
)
<<
"_"
<<
input
->
dim
(
2
)
<<
"_"
<<
input
->
dim
(
3
);
}
uint32_t
idx
=
0
;
uint32_t
idx
=
0
;
if
(
!
runtime
->
IsNonUniformWorkgroupsSupported
())
{
kernel_
.
setArg
(
idx
++
,
gws
[
0
]);
kernel_
.
setArg
(
idx
++
,
gws
[
1
]);
kernel_
.
setArg
(
idx
++
,
gws
[
2
]);
}
kernel_
.
setArg
(
idx
++
,
*
(
input
->
opencl_image
()));
kernel_
.
setArg
(
idx
++
,
*
(
input
->
opencl_image
()));
kernel_
.
setArg
(
idx
++
,
block_size_
);
kernel_
.
setArg
(
idx
++
,
block_size_
);
kernel_
.
setArg
(
idx
++
,
depth_blocks
);
kernel_
.
setArg
(
idx
++
,
depth_blocks
);
kernel_
.
setArg
(
idx
++
,
*
(
output
->
opencl_image
()));
kernel_
.
setArg
(
idx
++
,
*
(
output
->
opencl_image
()));
input_shape_
=
input
->
shape
();
input_shape_
=
input
->
shape
();
}
}
if
(
d2s_
)
{
const
std
::
vector
<
uint32_t
>
lws
=
{
8
,
kwg_size_
/
64
,
8
,
1
};
const
uint32_t
gws
[
3
]
=
{
static_cast
<
uint32_t
>
(
depth_blocks
),
TuningOrRun3DKernel
(
kernel_
,
ss
.
str
(),
gws
,
lws
,
future
);
static_cast
<
uint32_t
>
(
output_width
),
static_cast
<
uint32_t
>
(
output_height
*
batch
)};
const
std
::
vector
<
uint32_t
>
lws
=
{
8
,
16
,
8
,
1
};
std
::
stringstream
ss
;
ss
<<
"depth_to_space_opencl_kernel_"
<<
output
->
dim
(
0
)
<<
"_"
<<
output
->
dim
(
1
)
<<
"_"
<<
output
->
dim
(
2
)
<<
"_"
<<
output
->
dim
(
3
);
TuningOrRun3DKernel
(
kernel_
,
ss
.
str
(),
gws
,
lws
,
future
);
}
else
{
const
uint32_t
gws
[
3
]
=
{
static_cast
<
uint32_t
>
(
depth_blocks
),
static_cast
<
uint32_t
>
(
input_width
),
static_cast
<
uint32_t
>
(
input_height
*
batch
)};
const
std
::
vector
<
uint32_t
>
lws
=
{
8
,
16
,
8
,
1
};
std
::
stringstream
ss
;
ss
<<
"space_to_depth_opencl_kernel_"
<<
input
->
dim
(
0
)
<<
"_"
<<
input
->
dim
(
1
)
<<
"_"
<<
input
->
dim
(
2
)
<<
"_"
<<
input
->
dim
(
3
);
TuningOrRun3DKernel
(
kernel_
,
ss
.
str
(),
gws
,
lws
,
future
);
}
}
}
template
struct
DepthToSpaceOpFunctor
<
DeviceType
::
OPENCL
,
float
>;
template
struct
DepthToSpaceOpFunctor
<
DeviceType
::
OPENCL
,
float
>;
...
...
mace/kernels/opencl/depthwise_conv_opencl.cc
浏览文件 @
33b049e6
...
@@ -23,7 +23,8 @@ void DepthwiseConv2d(cl::Kernel *kernel,
...
@@ -23,7 +23,8 @@ void DepthwiseConv2d(cl::Kernel *kernel,
const
DataType
dt
,
const
DataType
dt
,
std
::
vector
<
index_t
>
*
prev_input_shape
,
std
::
vector
<
index_t
>
*
prev_input_shape
,
Tensor
*
output
,
Tensor
*
output
,
StatsFuture
*
future
)
{
StatsFuture
*
future
,
uint32_t
*
kwg_size
)
{
const
index_t
batch
=
output
->
dim
(
0
);
const
index_t
batch
=
output
->
dim
(
0
);
const
index_t
height
=
output
->
dim
(
1
);
const
index_t
height
=
output
->
dim
(
1
);
const
index_t
width
=
output
->
dim
(
2
);
const
index_t
width
=
output
->
dim
(
2
);
...
@@ -35,8 +36,14 @@ void DepthwiseConv2d(cl::Kernel *kernel,
...
@@ -35,8 +36,14 @@ void DepthwiseConv2d(cl::Kernel *kernel,
const
index_t
channel_blocks
=
RoundUpDiv4
(
channels
);
const
index_t
channel_blocks
=
RoundUpDiv4
(
channels
);
const
index_t
input_channel_blocks
=
RoundUpDiv4
(
input_channels
);
const
index_t
input_channel_blocks
=
RoundUpDiv4
(
input_channels
);
const
index_t
width_blocks
=
RoundUpDiv4
(
width
);
const
index_t
width_blocks
=
RoundUpDiv4
(
width
);
const
uint32_t
gws
[
3
]
=
{
static_cast
<
uint32_t
>
(
channel_blocks
),
static_cast
<
uint32_t
>
(
width_blocks
),
static_cast
<
uint32_t
>
(
height
*
batch
)};
auto
runtime
=
OpenCLRuntime
::
Global
();
if
(
kernel
->
get
()
==
nullptr
)
{
if
(
kernel
->
get
()
==
nullptr
)
{
auto
runtime
=
OpenCLRuntime
::
Global
();
std
::
set
<
std
::
string
>
built_options
;
std
::
set
<
std
::
string
>
built_options
;
std
::
string
kernel_name
=
MACE_OBFUSCATE_SYMBOL
(
"depthwise_conv2d"
);
std
::
string
kernel_name
=
MACE_OBFUSCATE_SYMBOL
(
"depthwise_conv2d"
);
if
(
stride
==
1
&&
dilations
[
0
]
==
1
&&
dilations
[
1
]
==
1
)
{
if
(
stride
==
1
&&
dilations
[
0
]
==
1
&&
dilations
[
1
]
==
1
)
{
...
@@ -45,6 +52,9 @@ void DepthwiseConv2d(cl::Kernel *kernel,
...
@@ -45,6 +52,9 @@ void DepthwiseConv2d(cl::Kernel *kernel,
}
else
{
}
else
{
built_options
.
emplace
(
"-Ddepthwise_conv2d="
+
kernel_name
);
built_options
.
emplace
(
"-Ddepthwise_conv2d="
+
kernel_name
);
}
}
if
(
runtime
->
IsNonUniformWorkgroupsSupported
())
{
built_options
.
emplace
(
"-DNON_UNIFORM_WORK_GROUP"
);
}
built_options
.
emplace
(
"-DDATA_TYPE="
+
DtToUpstreamCLDt
(
dt
));
built_options
.
emplace
(
"-DDATA_TYPE="
+
DtToUpstreamCLDt
(
dt
));
built_options
.
emplace
(
"-DCMD_DATA_TYPE="
+
DtToUpstreamCLCMDDt
(
dt
));
built_options
.
emplace
(
"-DCMD_DATA_TYPE="
+
DtToUpstreamCLCMDDt
(
dt
));
built_options
.
emplace
(
bias
!=
nullptr
?
"-DBIAS"
:
""
);
built_options
.
emplace
(
bias
!=
nullptr
?
"-DBIAS"
:
""
);
...
@@ -70,6 +80,9 @@ void DepthwiseConv2d(cl::Kernel *kernel,
...
@@ -70,6 +80,9 @@ void DepthwiseConv2d(cl::Kernel *kernel,
*
kernel
=
*
kernel
=
runtime
->
BuildKernel
(
"depthwise_conv2d"
,
kernel_name
,
built_options
);
runtime
->
BuildKernel
(
"depthwise_conv2d"
,
kernel_name
,
built_options
);
*
kwg_size
=
static_cast
<
uint32_t
>
(
runtime
->
GetKernelMaxWorkGroupSize
(
*
kernel
));
}
}
if
(
!
IsVecEqual
(
*
prev_input_shape
,
input
->
shape
()))
{
if
(
!
IsVecEqual
(
*
prev_input_shape
,
input
->
shape
()))
{
const
index_t
input_batch
=
input
->
dim
(
0
);
const
index_t
input_batch
=
input
->
dim
(
0
);
...
@@ -84,6 +97,11 @@ void DepthwiseConv2d(cl::Kernel *kernel,
...
@@ -84,6 +97,11 @@ void DepthwiseConv2d(cl::Kernel *kernel,
input_channels
);
input_channels
);
uint32_t
idx
=
0
;
uint32_t
idx
=
0
;
if
(
!
runtime
->
IsNonUniformWorkgroupsSupported
())
{
kernel
->
setArg
(
idx
++
,
gws
[
0
]);
kernel
->
setArg
(
idx
++
,
gws
[
1
]);
kernel
->
setArg
(
idx
++
,
gws
[
2
]);
}
kernel
->
setArg
(
idx
++
,
*
(
input
->
opencl_image
()));
kernel
->
setArg
(
idx
++
,
*
(
input
->
opencl_image
()));
kernel
->
setArg
(
idx
++
,
*
(
filter
->
opencl_image
()));
kernel
->
setArg
(
idx
++
,
*
(
filter
->
opencl_image
()));
if
(
bias
!=
nullptr
)
{
if
(
bias
!=
nullptr
)
{
...
@@ -104,13 +122,11 @@ void DepthwiseConv2d(cl::Kernel *kernel,
...
@@ -104,13 +122,11 @@ void DepthwiseConv2d(cl::Kernel *kernel,
kernel
->
setArg
(
idx
++
,
static_cast
<
int16_t
>
(
dilations
[
0
]));
kernel
->
setArg
(
idx
++
,
static_cast
<
int16_t
>
(
dilations
[
0
]));
kernel
->
setArg
(
idx
++
,
static_cast
<
int16_t
>
(
dilations
[
1
]));
kernel
->
setArg
(
idx
++
,
static_cast
<
int16_t
>
(
dilations
[
1
]));
}
}
*
prev_input_shape
=
input
->
shape
();
*
prev_input_shape
=
input
->
shape
();
}
}
const
uint32_t
gws
[
3
]
=
{
static_cast
<
uint32_t
>
(
channel_blocks
),
const
std
::
vector
<
uint32_t
>
lws
=
{
8
,
*
kwg_size
/
64
,
8
,
1
};
static_cast
<
uint32_t
>
(
width_blocks
),
static_cast
<
uint32_t
>
(
height
*
batch
)};
const
std
::
vector
<
uint32_t
>
lws
=
{
8
,
16
,
8
,
1
};
std
::
string
tuning_key
=
Concat
(
"depthwise_conv2d_ocl_kernel_"
,
activation
,
std
::
string
tuning_key
=
Concat
(
"depthwise_conv2d_ocl_kernel_"
,
activation
,
batch
,
height
,
width
,
channels
,
multiplier
);
batch
,
height
,
width
,
channels
,
multiplier
);
TuningOrRun3DKernel
(
*
kernel
,
tuning_key
,
gws
,
lws
,
future
);
TuningOrRun3DKernel
(
*
kernel
,
tuning_key
,
gws
,
lws
,
future
);
...
@@ -165,7 +181,8 @@ void DepthwiseConv2dFunctor<DeviceType::OPENCL, T>::operator()(
...
@@ -165,7 +181,8 @@ void DepthwiseConv2dFunctor<DeviceType::OPENCL, T>::operator()(
DepthwiseConv2d
(
&
kernel_
,
input
,
filter
,
bias
,
strides_
[
0
],
paddings
.
data
(),
DepthwiseConv2d
(
&
kernel_
,
input
,
filter
,
bias
,
strides_
[
0
],
paddings
.
data
(),
dilations_
,
activation_
,
relux_max_limit_
,
dilations_
,
activation_
,
relux_max_limit_
,
DataTypeToEnum
<
T
>::
value
,
&
input_shape_
,
output
,
future
);
DataTypeToEnum
<
T
>::
value
,
&
input_shape_
,
output
,
future
,
&
kwg_size_
);
}
}
template
struct
DepthwiseConv2dFunctor
<
DeviceType
::
OPENCL
,
float
>;
template
struct
DepthwiseConv2dFunctor
<
DeviceType
::
OPENCL
,
float
>;
...
...
mace/kernels/opencl/eltwise_opencl.cc
浏览文件 @
33b049e6
...
@@ -24,8 +24,12 @@ void EltwiseFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input0,
...
@@ -24,8 +24,12 @@ void EltwiseFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input0,
const
index_t
width_pixels
=
channel_blocks
*
width
;
const
index_t
width_pixels
=
channel_blocks
*
width
;
const
index_t
batch_height_pixels
=
batch
*
height
;
const
index_t
batch_height_pixels
=
batch
*
height
;
const
uint32_t
gws
[
2
]
=
{
static_cast
<
uint32_t
>
(
width_pixels
),
static_cast
<
uint32_t
>
(
batch_height_pixels
)};
auto
runtime
=
OpenCLRuntime
::
Global
();
if
(
kernel_
.
get
()
==
nullptr
)
{
if
(
kernel_
.
get
()
==
nullptr
)
{
auto
runtime
=
OpenCLRuntime
::
Global
();
std
::
set
<
std
::
string
>
built_options
;
std
::
set
<
std
::
string
>
built_options
;
auto
dt
=
DataTypeToEnum
<
T
>::
value
;
auto
dt
=
DataTypeToEnum
<
T
>::
value
;
std
::
string
kernel_name
=
MACE_OBFUSCATE_SYMBOL
(
"eltwise"
);
std
::
string
kernel_name
=
MACE_OBFUSCATE_SYMBOL
(
"eltwise"
);
...
@@ -33,11 +37,21 @@ void EltwiseFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input0,
...
@@ -33,11 +37,21 @@ void EltwiseFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input0,
built_options
.
emplace
(
"-DDATA_TYPE="
+
DtToUpstreamCLDt
(
dt
));
built_options
.
emplace
(
"-DDATA_TYPE="
+
DtToUpstreamCLDt
(
dt
));
built_options
.
emplace
(
"-DCMD_DATA_TYPE="
+
DtToUpstreamCLCMDDt
(
dt
));
built_options
.
emplace
(
"-DCMD_DATA_TYPE="
+
DtToUpstreamCLCMDDt
(
dt
));
built_options
.
emplace
(
MakeString
(
"-DELTWISE_TYPE="
,
type_
));
built_options
.
emplace
(
MakeString
(
"-DELTWISE_TYPE="
,
type_
));
if
(
runtime
->
IsNonUniformWorkgroupsSupported
())
{
built_options
.
emplace
(
"-DNON_UNIFORM_WORK_GROUP"
);
}
if
(
!
coeff_
.
empty
())
built_options
.
emplace
(
"-DCOEFF_SUM"
);
if
(
!
coeff_
.
empty
())
built_options
.
emplace
(
"-DCOEFF_SUM"
);
kernel_
=
runtime
->
BuildKernel
(
"eltwise"
,
kernel_name
,
built_options
);
kernel_
=
runtime
->
BuildKernel
(
"eltwise"
,
kernel_name
,
built_options
);
kwg_size_
=
static_cast
<
uint32_t
>
(
runtime
->
GetKernelMaxWorkGroupSize
(
kernel_
));
}
}
if
(
!
IsVecEqual
(
input_shape_
,
input0
->
shape
()))
{
if
(
!
IsVecEqual
(
input_shape_
,
input0
->
shape
()))
{
uint32_t
idx
=
0
;
uint32_t
idx
=
0
;
if
(
!
runtime
->
IsNonUniformWorkgroupsSupported
())
{
kernel_
.
setArg
(
idx
++
,
gws
[
0
]);
kernel_
.
setArg
(
idx
++
,
gws
[
1
]);
}
kernel_
.
setArg
(
idx
++
,
*
(
input0
->
opencl_image
()));
kernel_
.
setArg
(
idx
++
,
*
(
input0
->
opencl_image
()));
kernel_
.
setArg
(
idx
++
,
*
(
input1
->
opencl_image
()));
kernel_
.
setArg
(
idx
++
,
*
(
input1
->
opencl_image
()));
if
(
!
coeff_
.
empty
())
{
if
(
!
coeff_
.
empty
())
{
...
@@ -45,12 +59,11 @@ void EltwiseFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input0,
...
@@ -45,12 +59,11 @@ void EltwiseFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input0,
kernel_
.
setArg
(
idx
++
,
coeff_
[
1
]);
kernel_
.
setArg
(
idx
++
,
coeff_
[
1
]);
}
}
kernel_
.
setArg
(
idx
++
,
*
(
output
->
opencl_image
()));
kernel_
.
setArg
(
idx
++
,
*
(
output
->
opencl_image
()));
input_shape_
=
input0
->
shape
();
input_shape_
=
input0
->
shape
();
}
}
const
uint32_t
gws
[
2
]
=
{
static_cast
<
uint32_t
>
(
width_pixels
),
const
std
::
vector
<
uint32_t
>
lws
=
{
kwg_size_
/
16
,
16
,
1
};
static_cast
<
uint32_t
>
(
batch_height_pixels
)};
const
std
::
vector
<
uint32_t
>
lws
=
{
64
,
16
,
1
};
std
::
stringstream
ss
;
std
::
stringstream
ss
;
ss
<<
"eltwise_opencl_kernel_"
<<
output
->
dim
(
0
)
<<
"_"
<<
output
->
dim
(
1
)
ss
<<
"eltwise_opencl_kernel_"
<<
output
->
dim
(
0
)
<<
"_"
<<
output
->
dim
(
1
)
<<
"_"
<<
output
->
dim
(
2
)
<<
"_"
<<
output
->
dim
(
3
);
<<
"_"
<<
output
->
dim
(
2
)
<<
"_"
<<
output
->
dim
(
3
);
...
...
mace/kernels/opencl/helper.cc
浏览文件 @
33b049e6
...
@@ -200,6 +200,7 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel,
...
@@ -200,6 +200,7 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel,
const
std
::
vector
<
uint32_t
>
&
lws
,
const
std
::
vector
<
uint32_t
>
&
lws
,
StatsFuture
*
future
)
{
StatsFuture
*
future
)
{
auto
runtime
=
OpenCLRuntime
::
Global
();
auto
runtime
=
OpenCLRuntime
::
Global
();
auto
params_generator
=
[
&
]()
->
std
::
vector
<
std
::
vector
<
uint32_t
>>
{
auto
params_generator
=
[
&
]()
->
std
::
vector
<
std
::
vector
<
uint32_t
>>
{
const
uint32_t
kwg_size
=
const
uint32_t
kwg_size
=
static_cast
<
uint32_t
>
(
runtime
->
GetKernelMaxWorkGroupSize
(
kernel
));
static_cast
<
uint32_t
>
(
runtime
->
GetKernelMaxWorkGroupSize
(
kernel
));
...
@@ -226,12 +227,7 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel,
...
@@ -226,12 +227,7 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel,
{
4
,
kwg_size
/
28
,
7
,
1
},
{
4
,
kwg_size
/
28
,
7
,
1
},
{
4
,
kwg_size
/
32
,
8
,
1
},
{
4
,
kwg_size
/
32
,
8
,
1
},
{
4
,
kwg_size
/
56
,
14
,
1
},
{
4
,
kwg_size
/
56
,
14
,
1
},
{
3
,
15
,
9
,
1
},
{
7
,
15
,
9
,
1
},
{
9
,
7
,
15
,
1
},
{
15
,
7
,
9
,
1
},
{
1
,
kwg_size
,
1
,
1
},
{
1
,
kwg_size
,
1
,
1
},
{
4
,
15
,
8
,
1
},
};
};
};
};
cl
::
Event
event
;
cl
::
Event
event
;
...
@@ -240,6 +236,13 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel,
...
@@ -240,6 +236,13 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel,
MACE_CHECK
(
params
.
size
()
==
4
)
MACE_CHECK
(
params
.
size
()
==
4
)
<<
"Tuning parameters of 3D kernel must be 4D"
;
<<
"Tuning parameters of 3D kernel must be 4D"
;
cl_int
error
=
CL_SUCCESS
;
cl_int
error
=
CL_SUCCESS
;
std
::
vector
<
uint32_t
>
roundup_gws
(
3
);
if
(
!
runtime
->
IsNonUniformWorkgroupsSupported
())
{
for
(
size_t
i
=
0
;
i
<
3
;
++
i
)
{
roundup_gws
[
i
]
=
RoundUp
(
gws
[
i
],
params
[
i
]);
}
}
if
(
timer
==
nullptr
)
{
if
(
timer
==
nullptr
)
{
uint32_t
num_blocks
=
params
[
3
];
uint32_t
num_blocks
=
params
[
3
];
const
uint32_t
block_size
=
gws
[
2
]
/
num_blocks
;
const
uint32_t
block_size
=
gws
[
2
]
/
num_blocks
;
...
@@ -247,17 +250,32 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel,
...
@@ -247,17 +250,32 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel,
for
(
uint32_t
i
=
0
;
i
<
num_blocks
;
++
i
)
{
for
(
uint32_t
i
=
0
;
i
<
num_blocks
;
++
i
)
{
uint32_t
gws2
=
uint32_t
gws2
=
(
i
==
num_blocks
-
1
)
?
(
gws
[
2
]
-
(
i
*
block_size
))
:
block_size
;
(
i
==
num_blocks
-
1
)
?
(
gws
[
2
]
-
(
i
*
block_size
))
:
block_size
;
error
=
runtime
->
command_queue
().
enqueueNDRangeKernel
(
if
(
runtime
->
IsNonUniformWorkgroupsSupported
())
{
kernel
,
cl
::
NDRange
(
0
,
0
,
i
*
block_size
),
error
=
runtime
->
command_queue
().
enqueueNDRangeKernel
(
cl
::
NDRange
(
gws
[
0
],
gws
[
1
],
gws2
),
kernel
,
cl
::
NDRange
(
0
,
0
,
i
*
block_size
),
cl
::
NDRange
(
params
[
0
],
params
[
1
],
params
[
2
]),
nullptr
,
&
event
);
cl
::
NDRange
(
gws
[
0
],
gws
[
1
],
gws2
),
cl
::
NDRange
(
params
[
0
],
params
[
1
],
params
[
2
]),
nullptr
,
&
event
);
}
else
{
uint32_t
roundup_gws2
=
RoundUp
(
gws2
,
params
[
2
]);
error
=
runtime
->
command_queue
().
enqueueNDRangeKernel
(
kernel
,
cl
::
NDRange
(
0
,
0
,
i
*
block_size
),
cl
::
NDRange
(
roundup_gws
[
0
],
roundup_gws
[
1
],
roundup_gws2
),
cl
::
NDRange
(
params
[
0
],
params
[
1
],
params
[
2
]),
nullptr
,
&
event
);
}
MACE_CHECK
(
error
==
CL_SUCCESS
)
<<
"Error code: "
<<
error
;
MACE_CHECK
(
error
==
CL_SUCCESS
)
<<
"Error code: "
<<
error
;
}
}
}
else
{
}
else
{
timer
->
ClearTiming
();
timer
->
ClearTiming
();
error
=
runtime
->
command_queue
().
enqueueNDRangeKernel
(
if
(
runtime
->
IsNonUniformWorkgroupsSupported
())
{
kernel
,
cl
::
NullRange
,
cl
::
NDRange
(
gws
[
0
],
gws
[
1
],
gws
[
2
]),
error
=
runtime
->
command_queue
().
enqueueNDRangeKernel
(
cl
::
NDRange
(
params
[
0
],
params
[
1
],
params
[
2
]),
nullptr
,
&
event
);
kernel
,
cl
::
NullRange
,
cl
::
NDRange
(
gws
[
0
],
gws
[
1
],
gws
[
2
]),
cl
::
NDRange
(
params
[
0
],
params
[
1
],
params
[
2
]),
nullptr
,
&
event
);
}
else
{
error
=
runtime
->
command_queue
().
enqueueNDRangeKernel
(
kernel
,
cl
::
NullRange
,
cl
::
NDRange
(
roundup_gws
[
0
],
roundup_gws
[
1
],
roundup_gws
[
2
]),
cl
::
NDRange
(
params
[
0
],
params
[
1
],
params
[
2
]),
nullptr
,
&
event
);
}
MACE_CHECK
(
error
==
CL_SUCCESS
)
<<
"Error code: "
<<
error
;
MACE_CHECK
(
error
==
CL_SUCCESS
)
<<
"Error code: "
<<
error
;
timer
->
AccumulateTiming
();
timer
->
AccumulateTiming
();
tuning_result
->
assign
(
params
.
begin
(),
params
.
end
());
tuning_result
->
assign
(
params
.
begin
(),
params
.
end
());
...
@@ -273,10 +291,18 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel,
...
@@ -273,10 +291,18 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel,
for
(
uint32_t
i
=
0
;
i
<
num_blocks
;
++
i
)
{
for
(
uint32_t
i
=
0
;
i
<
num_blocks
;
++
i
)
{
uint32_t
gws2
=
uint32_t
gws2
=
(
i
==
num_blocks
-
1
)
?
(
gws
[
2
]
-
(
i
*
block_size
))
:
block_size
;
(
i
==
num_blocks
-
1
)
?
(
gws
[
2
]
-
(
i
*
block_size
))
:
block_size
;
error
=
runtime
->
command_queue
().
enqueueNDRangeKernel
(
if
(
runtime
->
IsNonUniformWorkgroupsSupported
())
{
kernel
,
cl
::
NDRange
(
0
,
0
,
i
*
block_size
),
error
=
runtime
->
command_queue
().
enqueueNDRangeKernel
(
cl
::
NDRange
(
gws
[
0
],
gws
[
1
],
gws2
),
kernel
,
cl
::
NDRange
(
0
,
0
,
i
*
block_size
),
cl
::
NDRange
(
params
[
0
],
params
[
1
],
params
[
2
]),
nullptr
,
&
event
);
cl
::
NDRange
(
gws
[
0
],
gws
[
1
],
gws2
),
cl
::
NDRange
(
params
[
0
],
params
[
1
],
params
[
2
]),
nullptr
,
&
event
);
}
else
{
uint32_t
roundup_gws2
=
RoundUp
(
gws2
,
params
[
2
]);
error
=
runtime
->
command_queue
().
enqueueNDRangeKernel
(
kernel
,
cl
::
NDRange
(
0
,
0
,
i
*
block_size
),
cl
::
NDRange
(
roundup_gws
[
0
],
roundup_gws
[
1
],
roundup_gws2
),
cl
::
NDRange
(
params
[
0
],
params
[
1
],
params
[
2
]),
nullptr
,
&
event
);
}
MACE_CHECK
(
error
==
CL_SUCCESS
)
<<
"Error code: "
<<
error
;
MACE_CHECK
(
error
==
CL_SUCCESS
)
<<
"Error code: "
<<
error
;
timer
->
AccumulateTiming
();
timer
->
AccumulateTiming
();
}
}
...
@@ -304,6 +330,7 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel,
...
@@ -304,6 +330,7 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel,
const
std
::
vector
<
uint32_t
>
&
lws
,
const
std
::
vector
<
uint32_t
>
&
lws
,
StatsFuture
*
future
)
{
StatsFuture
*
future
)
{
auto
runtime
=
OpenCLRuntime
::
Global
();
auto
runtime
=
OpenCLRuntime
::
Global
();
auto
params_generator
=
[
&
]()
->
std
::
vector
<
std
::
vector
<
uint32_t
>>
{
auto
params_generator
=
[
&
]()
->
std
::
vector
<
std
::
vector
<
uint32_t
>>
{
const
uint32_t
kwg_size
=
const
uint32_t
kwg_size
=
static_cast
<
uint32_t
>
(
runtime
->
GetKernelMaxWorkGroupSize
(
kernel
));
static_cast
<
uint32_t
>
(
runtime
->
GetKernelMaxWorkGroupSize
(
kernel
));
...
@@ -318,7 +345,6 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel,
...
@@ -318,7 +345,6 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel,
{
kwg_size
/
64
,
64
,
1
},
{
kwg_size
/
64
,
64
,
1
},
{
kwg_size
/
128
,
128
,
1
},
{
kwg_size
/
128
,
128
,
1
},
{
kwg_size
/
256
,
256
,
1
},
{
kwg_size
/
256
,
256
,
1
},
{
kwg_size
/
512
,
512
,
1
},
{
kwg_size
,
1
,
1
},
{
kwg_size
,
1
,
1
},
{
1
,
kwg_size
,
1
}};
{
1
,
kwg_size
,
1
}};
};
};
...
@@ -328,6 +354,13 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel,
...
@@ -328,6 +354,13 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel,
MACE_CHECK
(
params
.
size
()
==
3
)
MACE_CHECK
(
params
.
size
()
==
3
)
<<
"Tuning parameters of 2D kernel must be 3d"
;
<<
"Tuning parameters of 2D kernel must be 3d"
;
cl_int
error
=
CL_SUCCESS
;
cl_int
error
=
CL_SUCCESS
;
std
::
vector
<
uint32_t
>
roundup_gws
(
2
);
if
(
!
runtime
->
IsNonUniformWorkgroupsSupported
())
{
for
(
size_t
i
=
0
;
i
<
2
;
++
i
)
{
roundup_gws
[
i
]
=
RoundUp
(
gws
[
i
],
params
[
i
]);
}
}
if
(
timer
==
nullptr
)
{
if
(
timer
==
nullptr
)
{
uint32_t
num_blocks
=
params
[
2
];
uint32_t
num_blocks
=
params
[
2
];
const
uint32_t
block_size
=
gws
[
1
]
/
num_blocks
;
const
uint32_t
block_size
=
gws
[
1
]
/
num_blocks
;
...
@@ -335,16 +368,30 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel,
...
@@ -335,16 +368,30 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel,
for
(
uint32_t
i
=
0
;
i
<
num_blocks
;
++
i
)
{
for
(
uint32_t
i
=
0
;
i
<
num_blocks
;
++
i
)
{
uint32_t
gws1
=
uint32_t
gws1
=
(
i
==
num_blocks
-
1
)
?
(
gws
[
1
]
-
(
i
*
block_size
))
:
block_size
;
(
i
==
num_blocks
-
1
)
?
(
gws
[
1
]
-
(
i
*
block_size
))
:
block_size
;
error
=
runtime
->
command_queue
().
enqueueNDRangeKernel
(
if
(
runtime
->
IsNonUniformWorkgroupsSupported
())
{
kernel
,
cl
::
NDRange
(
0
,
i
*
block_size
),
cl
::
NDRange
(
gws
[
0
],
gws1
),
error
=
runtime
->
command_queue
().
enqueueNDRangeKernel
(
cl
::
NDRange
(
params
[
0
],
params
[
1
]),
nullptr
,
&
event
);
kernel
,
cl
::
NDRange
(
0
,
i
*
block_size
),
cl
::
NDRange
(
gws
[
0
],
gws1
),
cl
::
NDRange
(
params
[
0
],
params
[
1
]),
nullptr
,
&
event
);
}
else
{
uint32_t
roundup_gws1
=
RoundUp
(
gws1
,
params
[
1
]);
error
=
runtime
->
command_queue
().
enqueueNDRangeKernel
(
kernel
,
cl
::
NDRange
(
0
,
i
*
block_size
),
cl
::
NDRange
(
roundup_gws
[
0
],
roundup_gws1
),
cl
::
NDRange
(
params
[
0
],
params
[
1
]),
nullptr
,
&
event
);
}
MACE_CHECK
(
error
==
CL_SUCCESS
)
<<
"Error code: "
<<
error
;
MACE_CHECK
(
error
==
CL_SUCCESS
)
<<
"Error code: "
<<
error
;
}
}
}
else
{
}
else
{
timer
->
ClearTiming
();
timer
->
ClearTiming
();
error
=
runtime
->
command_queue
().
enqueueNDRangeKernel
(
if
(
runtime
->
IsNonUniformWorkgroupsSupported
())
{
kernel
,
cl
::
NullRange
,
cl
::
NDRange
(
gws
[
0
],
gws
[
1
]),
error
=
runtime
->
command_queue
().
enqueueNDRangeKernel
(
cl
::
NDRange
(
params
[
0
],
params
[
1
]),
nullptr
,
&
event
);
kernel
,
cl
::
NullRange
,
cl
::
NDRange
(
gws
[
0
],
gws
[
1
]),
cl
::
NDRange
(
params
[
0
],
params
[
1
]),
nullptr
,
&
event
);
}
else
{
error
=
runtime
->
command_queue
().
enqueueNDRangeKernel
(
kernel
,
cl
::
NullRange
,
cl
::
NDRange
(
roundup_gws
[
0
],
roundup_gws
[
1
]),
cl
::
NDRange
(
params
[
0
],
params
[
1
]),
nullptr
,
&
event
);
}
MACE_CHECK
(
error
==
CL_SUCCESS
)
<<
"Error code: "
<<
error
;
MACE_CHECK
(
error
==
CL_SUCCESS
)
<<
"Error code: "
<<
error
;
timer
->
AccumulateTiming
();
timer
->
AccumulateTiming
();
tuning_result
->
assign
(
params
.
begin
(),
params
.
end
());
tuning_result
->
assign
(
params
.
begin
(),
params
.
end
());
...
@@ -360,9 +407,18 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel,
...
@@ -360,9 +407,18 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel,
for
(
uint32_t
i
=
0
;
i
<
num_blocks
;
++
i
)
{
for
(
uint32_t
i
=
0
;
i
<
num_blocks
;
++
i
)
{
uint32_t
gws1
=
uint32_t
gws1
=
(
i
==
num_blocks
-
1
)
?
(
gws
[
1
]
-
(
i
*
block_size
))
:
block_size
;
(
i
==
num_blocks
-
1
)
?
(
gws
[
1
]
-
(
i
*
block_size
))
:
block_size
;
error
=
runtime
->
command_queue
().
enqueueNDRangeKernel
(
if
(
runtime
->
IsNonUniformWorkgroupsSupported
())
{
kernel
,
cl
::
NDRange
(
0
,
i
*
block_size
),
cl
::
NDRange
(
gws
[
0
],
gws1
),
error
=
runtime
->
command_queue
().
enqueueNDRangeKernel
(
cl
::
NDRange
(
params
[
0
],
params
[
1
]),
nullptr
,
&
event
);
kernel
,
cl
::
NDRange
(
0
,
i
*
block_size
),
cl
::
NDRange
(
gws
[
0
],
gws1
),
cl
::
NDRange
(
params
[
0
],
params
[
1
]),
nullptr
,
&
event
);
}
else
{
uint32_t
roundup_gws1
=
RoundUp
(
gws1
,
params
[
1
]);
error
=
runtime
->
command_queue
().
enqueueNDRangeKernel
(
kernel
,
cl
::
NDRange
(
0
,
i
*
block_size
),
cl
::
NDRange
(
roundup_gws
[
0
],
roundup_gws1
),
cl
::
NDRange
(
params
[
0
],
params
[
1
]),
nullptr
,
&
event
);
}
MACE_CHECK
(
error
==
CL_SUCCESS
)
<<
"Error code: "
<<
error
;
MACE_CHECK
(
error
==
CL_SUCCESS
)
<<
"Error code: "
<<
error
;
timer
->
AccumulateTiming
();
timer
->
AccumulateTiming
();
}
}
...
...
mace/kernels/opencl/matmul.cc
浏览文件 @
33b049e6
...
@@ -26,18 +26,33 @@ void MatMulFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *A,
...
@@ -26,18 +26,33 @@ void MatMulFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *A,
const
index_t
height_blocks
=
RoundUpDiv4
(
height
);
const
index_t
height_blocks
=
RoundUpDiv4
(
height
);
const
index_t
width_blocks
=
RoundUpDiv4
(
width
);
const
index_t
width_blocks
=
RoundUpDiv4
(
width
);
const
uint32_t
gws
[
2
]
=
{
static_cast
<
uint32_t
>
(
width_blocks
),
static_cast
<
uint32_t
>
(
height_blocks
*
batch
),
};
auto
runtime
=
OpenCLRuntime
::
Global
();
if
(
kernel_
.
get
()
==
nullptr
)
{
if
(
kernel_
.
get
()
==
nullptr
)
{
auto
runtime
=
OpenCLRuntime
::
Global
();
std
::
set
<
std
::
string
>
built_options
;
std
::
set
<
std
::
string
>
built_options
;
auto
dt
=
DataTypeToEnum
<
T
>::
value
;
auto
dt
=
DataTypeToEnum
<
T
>::
value
;
std
::
string
kernel_name
=
MACE_OBFUSCATE_SYMBOL
(
"matmul"
);
std
::
string
kernel_name
=
MACE_OBFUSCATE_SYMBOL
(
"matmul"
);
built_options
.
emplace
(
"-Dmatmul="
+
kernel_name
);
built_options
.
emplace
(
"-Dmatmul="
+
kernel_name
);
built_options
.
emplace
(
"-DDATA_TYPE="
+
DtToUpstreamCLDt
(
dt
));
built_options
.
emplace
(
"-DDATA_TYPE="
+
DtToUpstreamCLDt
(
dt
));
built_options
.
emplace
(
"-DCMD_DATA_TYPE="
+
DtToUpstreamCLCMDDt
(
dt
));
built_options
.
emplace
(
"-DCMD_DATA_TYPE="
+
DtToUpstreamCLCMDDt
(
dt
));
if
(
runtime
->
IsNonUniformWorkgroupsSupported
())
{
built_options
.
emplace
(
"-DNON_UNIFORM_WORK_GROUP"
);
}
kernel_
=
runtime
->
BuildKernel
(
"matmul"
,
kernel_name
,
built_options
);
kernel_
=
runtime
->
BuildKernel
(
"matmul"
,
kernel_name
,
built_options
);
kwg_size_
=
static_cast
<
uint32_t
>
(
runtime
->
GetKernelMaxWorkGroupSize
(
kernel_
));
}
}
uint32_t
idx
=
0
;
uint32_t
idx
=
0
;
if
(
!
runtime
->
IsNonUniformWorkgroupsSupported
())
{
kernel_
.
setArg
(
idx
++
,
gws
[
0
]);
kernel_
.
setArg
(
idx
++
,
gws
[
1
]);
}
kernel_
.
setArg
(
idx
++
,
*
(
A
->
opencl_image
()));
kernel_
.
setArg
(
idx
++
,
*
(
A
->
opencl_image
()));
kernel_
.
setArg
(
idx
++
,
*
(
B
->
opencl_image
()));
kernel_
.
setArg
(
idx
++
,
*
(
B
->
opencl_image
()));
kernel_
.
setArg
(
idx
++
,
*
(
C
->
opencl_image
()));
kernel_
.
setArg
(
idx
++
,
*
(
C
->
opencl_image
()));
...
@@ -47,11 +62,7 @@ void MatMulFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *A,
...
@@ -47,11 +62,7 @@ void MatMulFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *A,
kernel_
.
setArg
(
idx
++
,
static_cast
<
int
>
(
height_blocks
));
kernel_
.
setArg
(
idx
++
,
static_cast
<
int
>
(
height_blocks
));
kernel_
.
setArg
(
idx
++
,
static_cast
<
int
>
(
RoundUpDiv4
(
A
->
dim
(
2
))));
kernel_
.
setArg
(
idx
++
,
static_cast
<
int
>
(
RoundUpDiv4
(
A
->
dim
(
2
))));
const
uint32_t
gws
[
2
]
=
{
const
std
::
vector
<
uint32_t
>
lws
=
{
kwg_size_
/
64
,
64
,
1
};
static_cast
<
uint32_t
>
(
width_blocks
),
static_cast
<
uint32_t
>
(
height_blocks
*
batch
),
};
const
std
::
vector
<
uint32_t
>
lws
=
{
16
,
64
,
1
};
std
::
stringstream
ss
;
std
::
stringstream
ss
;
ss
<<
"matmul_opencl_kernel_"
<<
C
->
dim
(
0
)
<<
"_"
<<
C
->
dim
(
1
)
<<
"_"
ss
<<
"matmul_opencl_kernel_"
<<
C
->
dim
(
0
)
<<
"_"
<<
C
->
dim
(
1
)
<<
"_"
<<
C
->
dim
(
2
)
<<
"_"
<<
C
->
dim
(
3
);
<<
C
->
dim
(
2
)
<<
"_"
<<
C
->
dim
(
3
);
...
...
mace/kernels/opencl/pooling_opencl.cc
浏览文件 @
33b049e6
...
@@ -18,12 +18,14 @@ void PoolingFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
...
@@ -18,12 +18,14 @@ void PoolingFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
MACE_CHECK
(
dilations_
[
0
]
==
1
&&
dilations_
[
1
]
==
1
)
MACE_CHECK
(
dilations_
[
0
]
==
1
&&
dilations_
[
1
]
==
1
)
<<
"Pooling opencl kernel not support dilation yet"
;
<<
"Pooling opencl kernel not support dilation yet"
;
auto
runtime
=
OpenCLRuntime
::
Global
();
if
(
kernel_
.
get
()
==
nullptr
)
{
if
(
kernel_
.
get
()
==
nullptr
)
{
const
DataType
dt
=
DataTypeToEnum
<
T
>::
value
;
const
DataType
dt
=
DataTypeToEnum
<
T
>::
value
;
auto
runtime
=
OpenCLRuntime
::
Global
();
std
::
set
<
std
::
string
>
built_options
;
std
::
set
<
std
::
string
>
built_options
;
std
::
string
kernel_name
=
MACE_OBFUSCATE_SYMBOL
(
"pooling"
);
std
::
string
kernel_name
=
MACE_OBFUSCATE_SYMBOL
(
"pooling"
);
built_options
.
emplace
(
"-Dpooling="
+
kernel_name
);
built_options
.
emplace
(
"-Dpooling="
+
kernel_name
);
if
(
pooling_type_
==
MAX
&&
input
->
dtype
()
==
output
->
dtype
())
{
if
(
pooling_type_
==
MAX
&&
input
->
dtype
()
==
output
->
dtype
())
{
built_options
.
emplace
(
"-DDATA_TYPE="
+
DtToCLDt
(
dt
));
built_options
.
emplace
(
"-DDATA_TYPE="
+
DtToCLDt
(
dt
));
built_options
.
emplace
(
"-DCMD_DATA_TYPE="
+
DtToCLCMDDt
(
dt
));
built_options
.
emplace
(
"-DCMD_DATA_TYPE="
+
DtToCLCMDDt
(
dt
));
...
@@ -35,8 +37,16 @@ void PoolingFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
...
@@ -35,8 +37,16 @@ void PoolingFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
if
(
pooling_type_
==
AVG
)
{
if
(
pooling_type_
==
AVG
)
{
built_options
.
emplace
(
"-DPOOL_AVG"
);
built_options
.
emplace
(
"-DPOOL_AVG"
);
}
}
if
(
runtime
->
IsNonUniformWorkgroupsSupported
())
{
built_options
.
emplace
(
"-DNON_UNIFORM_WORK_GROUP"
);
}
kernel_
=
runtime
->
BuildKernel
(
"pooling"
,
kernel_name
,
built_options
);
kernel_
=
runtime
->
BuildKernel
(
"pooling"
,
kernel_name
,
built_options
);
kwg_size_
=
static_cast
<
uint32_t
>
(
runtime
->
GetKernelMaxWorkGroupSize
(
kernel_
));
}
}
std
::
vector
<
uint32_t
>
gws
;
if
(
!
IsVecEqual
(
input_shape_
,
input
->
shape
()))
{
if
(
!
IsVecEqual
(
input_shape_
,
input
->
shape
()))
{
std
::
vector
<
index_t
>
output_shape
(
4
);
std
::
vector
<
index_t
>
output_shape
(
4
);
std
::
vector
<
index_t
>
filter_shape
=
{
kernels_
[
0
],
kernels_
[
1
],
std
::
vector
<
index_t
>
filter_shape
=
{
kernels_
[
0
],
kernels_
[
1
],
...
@@ -59,7 +69,24 @@ void PoolingFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
...
@@ -59,7 +69,24 @@ void PoolingFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
&
output_image_shape
);
&
output_image_shape
);
output
->
ResizeImage
(
output_shape
,
output_image_shape
);
output
->
ResizeImage
(
output_shape
,
output_image_shape
);
index_t
batch
=
output
->
dim
(
0
);
index_t
out_height
=
output
->
dim
(
1
);
index_t
out_width
=
output
->
dim
(
2
);
index_t
channels
=
output
->
dim
(
3
);
index_t
channel_blocks
=
(
channels
+
3
)
/
4
;
gws
=
{
static_cast
<
uint32_t
>
(
channel_blocks
),
static_cast
<
uint32_t
>
(
out_width
),
static_cast
<
uint32_t
>
(
batch
*
out_height
),
};
uint32_t
idx
=
0
;
uint32_t
idx
=
0
;
if
(
!
runtime
->
IsNonUniformWorkgroupsSupported
())
{
kernel_
.
setArg
(
idx
++
,
gws
[
0
]);
kernel_
.
setArg
(
idx
++
,
gws
[
1
]);
kernel_
.
setArg
(
idx
++
,
gws
[
2
]);
}
kernel_
.
setArg
(
idx
++
,
*
(
input
->
opencl_image
()));
kernel_
.
setArg
(
idx
++
,
*
(
input
->
opencl_image
()));
kernel_
.
setArg
(
idx
++
,
static_cast
<
int32_t
>
(
input
->
dim
(
1
)));
kernel_
.
setArg
(
idx
++
,
static_cast
<
int32_t
>
(
input
->
dim
(
1
)));
kernel_
.
setArg
(
idx
++
,
static_cast
<
int32_t
>
(
input
->
dim
(
2
)));
kernel_
.
setArg
(
idx
++
,
static_cast
<
int32_t
>
(
input
->
dim
(
2
)));
...
@@ -71,25 +98,25 @@ void PoolingFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
...
@@ -71,25 +98,25 @@ void PoolingFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
kernel_
.
setArg
(
idx
++
,
*
(
output
->
opencl_image
()));
kernel_
.
setArg
(
idx
++
,
*
(
output
->
opencl_image
()));
input_shape_
=
input
->
shape
();
input_shape_
=
input
->
shape
();
}
}
else
{
index_t
batch
=
output
->
dim
(
0
);
index_t
batch
=
output
->
dim
(
0
);
index_t
out_height
=
output
->
dim
(
1
);
index_t
out_height
=
output
->
dim
(
1
);
index_t
out_width
=
output
->
dim
(
2
);
index_t
out_width
=
output
->
dim
(
2
);
index_t
channels
=
output
->
dim
(
3
);
index_t
channels
=
output
->
dim
(
3
);
index_t
channel_blocks
=
(
channels
+
3
)
/
4
;
index_t
channel_blocks
=
(
channels
+
3
)
/
4
;
gws
=
{
static_cast
<
uint32_t
>
(
channel_blocks
),
static_cast
<
uint32_t
>
(
out_width
),
static_cast
<
uint32_t
>
(
batch
*
out_height
),
};
}
const
uint32_t
gws
[
3
]
=
{
std
::
vector
<
uint32_t
>
lws
=
{
8
,
kwg_size_
/
64
,
8
,
1
};
static_cast
<
uint32_t
>
(
channel_blocks
),
static_cast
<
uint32_t
>
(
out_width
),
static_cast
<
uint32_t
>
(
batch
*
out_height
),
};
std
::
vector
<
uint32_t
>
lws
=
{
8
,
16
,
8
,
1
};
std
::
stringstream
ss
;
std
::
stringstream
ss
;
ss
<<
"pooling_opencl_kernel_"
<<
output
->
dim
(
0
)
<<
"_"
<<
output
->
dim
(
1
)
ss
<<
"pooling_opencl_kernel_"
<<
output
->
dim
(
0
)
<<
"_"
<<
output
->
dim
(
1
)
<<
"_"
<<
output
->
dim
(
2
)
<<
"_"
<<
output
->
dim
(
3
);
<<
"_"
<<
output
->
dim
(
2
)
<<
"_"
<<
output
->
dim
(
3
);
TuningOrRun3DKernel
(
kernel_
,
ss
.
str
(),
gws
,
lws
,
future
);
TuningOrRun3DKernel
(
kernel_
,
ss
.
str
(),
gws
.
data
()
,
lws
,
future
);
}
}
template
struct
PoolingFunctor
<
DeviceType
::
OPENCL
,
float
>;
template
struct
PoolingFunctor
<
DeviceType
::
OPENCL
,
float
>;
...
...
mace/kernels/opencl/resize_bilinear_opencl.cc
浏览文件 @
33b049e6
...
@@ -24,16 +24,27 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
...
@@ -24,16 +24,27 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
const
index_t
out_height
=
out_height_
;
const
index_t
out_height
=
out_height_
;
const
index_t
out_width
=
out_width_
;
const
index_t
out_width
=
out_width_
;
const
uint32_t
gws
[
3
]
=
{
static_cast
<
uint32_t
>
(
channel_blocks
),
static_cast
<
uint32_t
>
(
out_width
),
static_cast
<
uint32_t
>
(
out_height
*
batch
)};
auto
runtime
=
OpenCLRuntime
::
Global
();
if
(
kernel_
.
get
()
==
nullptr
)
{
if
(
kernel_
.
get
()
==
nullptr
)
{
auto
runtime
=
OpenCLRuntime
::
Global
();
std
::
set
<
std
::
string
>
built_options
;
std
::
set
<
std
::
string
>
built_options
;
std
::
string
kernel_name
=
MACE_OBFUSCATE_SYMBOL
(
"resize_bilinear_nocache"
);
std
::
string
kernel_name
=
MACE_OBFUSCATE_SYMBOL
(
"resize_bilinear_nocache"
);
built_options
.
emplace
(
"-Dresize_bilinear_nocache="
+
kernel_name
);
built_options
.
emplace
(
"-Dresize_bilinear_nocache="
+
kernel_name
);
auto
dt
=
DataTypeToEnum
<
T
>::
value
;
auto
dt
=
DataTypeToEnum
<
T
>::
value
;
built_options
.
emplace
(
"-DDATA_TYPE="
+
DtToUpstreamCLDt
(
dt
));
built_options
.
emplace
(
"-DDATA_TYPE="
+
DtToUpstreamCLDt
(
dt
));
built_options
.
emplace
(
"-DCMD_DATA_TYPE="
+
DtToUpstreamCLCMDDt
(
dt
));
built_options
.
emplace
(
"-DCMD_DATA_TYPE="
+
DtToUpstreamCLCMDDt
(
dt
));
if
(
runtime
->
IsNonUniformWorkgroupsSupported
())
{
built_options
.
emplace
(
"-DNON_UNIFORM_WORK_GROUP"
);
}
kernel_
=
kernel_
=
runtime
->
BuildKernel
(
"resize_bilinear"
,
kernel_name
,
built_options
);
runtime
->
BuildKernel
(
"resize_bilinear"
,
kernel_name
,
built_options
);
kwg_size_
=
static_cast
<
uint32_t
>
(
runtime
->
GetKernelMaxWorkGroupSize
(
kernel_
));
}
}
if
(
!
IsVecEqual
(
input_shape_
,
input
->
shape
()))
{
if
(
!
IsVecEqual
(
input_shape_
,
input
->
shape
()))
{
MACE_CHECK
(
out_height
>
0
&&
out_width
>
0
);
MACE_CHECK
(
out_height
>
0
&&
out_width
>
0
);
...
@@ -50,6 +61,11 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
...
@@ -50,6 +61,11 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
CalculateResizeScale
(
in_width
,
out_width
,
align_corners_
);
CalculateResizeScale
(
in_width
,
out_width
,
align_corners_
);
uint32_t
idx
=
0
;
uint32_t
idx
=
0
;
if
(
!
runtime
->
IsNonUniformWorkgroupsSupported
())
{
kernel_
.
setArg
(
idx
++
,
gws
[
0
]);
kernel_
.
setArg
(
idx
++
,
gws
[
1
]);
kernel_
.
setArg
(
idx
++
,
gws
[
2
]);
}
kernel_
.
setArg
(
idx
++
,
*
(
input
->
opencl_image
()));
kernel_
.
setArg
(
idx
++
,
*
(
input
->
opencl_image
()));
kernel_
.
setArg
(
idx
++
,
*
(
output
->
opencl_image
()));
kernel_
.
setArg
(
idx
++
,
*
(
output
->
opencl_image
()));
kernel_
.
setArg
(
idx
++
,
height_scale
);
kernel_
.
setArg
(
idx
++
,
height_scale
);
...
@@ -61,10 +77,7 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
...
@@ -61,10 +77,7 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
input_shape_
=
input
->
shape
();
input_shape_
=
input
->
shape
();
}
}
const
uint32_t
gws
[
3
]
=
{
static_cast
<
uint32_t
>
(
channel_blocks
),
const
std
::
vector
<
uint32_t
>
lws
=
{
8
,
kwg_size_
/
64
,
8
,
1
};
static_cast
<
uint32_t
>
(
out_width
),
static_cast
<
uint32_t
>
(
out_height
*
batch
)};
const
std
::
vector
<
uint32_t
>
lws
=
{
8
,
16
,
8
,
1
};
std
::
stringstream
ss
;
std
::
stringstream
ss
;
ss
<<
"resize_bilinear_opencl_kernel_"
<<
output
->
dim
(
0
)
<<
"_"
ss
<<
"resize_bilinear_opencl_kernel_"
<<
output
->
dim
(
0
)
<<
"_"
<<
output
->
dim
(
1
)
<<
"_"
<<
output
->
dim
(
2
)
<<
"_"
<<
output
->
dim
(
3
);
<<
output
->
dim
(
1
)
<<
"_"
<<
output
->
dim
(
2
)
<<
"_"
<<
output
->
dim
(
3
);
...
...
mace/kernels/opencl/slice.cc
浏览文件 @
33b049e6
...
@@ -29,15 +29,22 @@ void SliceFunctor<DeviceType::OPENCL, T>::operator()(
...
@@ -29,15 +29,22 @@ void SliceFunctor<DeviceType::OPENCL, T>::operator()(
output_list
[
i
]
->
ResizeImage
(
output_shape
,
image_shape
);
output_list
[
i
]
->
ResizeImage
(
output_shape
,
image_shape
);
}
}
auto
runtime
=
OpenCLRuntime
::
Global
();
if
(
kernel_
.
get
()
==
nullptr
)
{
if
(
kernel_
.
get
()
==
nullptr
)
{
auto
runtime
=
OpenCLRuntime
::
Global
();
std
::
set
<
std
::
string
>
built_options
;
std
::
set
<
std
::
string
>
built_options
;
std
::
string
kernel_name
=
MACE_OBFUSCATE_SYMBOL
(
"slice"
);
std
::
string
kernel_name
=
MACE_OBFUSCATE_SYMBOL
(
"slice"
);
built_options
.
emplace
(
"-Dslice="
+
kernel_name
);
built_options
.
emplace
(
"-Dslice="
+
kernel_name
);
built_options
.
emplace
(
"-DDATA_TYPE="
+
DtToCLDt
(
DataTypeToEnum
<
T
>::
value
));
built_options
.
emplace
(
"-DDATA_TYPE="
+
DtToCLDt
(
DataTypeToEnum
<
T
>::
value
));
built_options
.
emplace
(
"-DCMD_DATA_TYPE="
built_options
.
emplace
(
"-DCMD_DATA_TYPE="
+
DtToCLCMDDt
(
DataTypeToEnum
<
T
>::
value
));
+
DtToCLCMDDt
(
DataTypeToEnum
<
T
>::
value
));
if
(
runtime
->
IsNonUniformWorkgroupsSupported
())
{
built_options
.
emplace
(
"-DNON_UNIFORM_WORK_GROUP"
);
}
kernel_
=
runtime
->
BuildKernel
(
"slice"
,
kernel_name
,
built_options
);
kernel_
=
runtime
->
BuildKernel
(
"slice"
,
kernel_name
,
built_options
);
kwg_size_
=
static_cast
<
uint32_t
>
(
runtime
->
GetKernelMaxWorkGroupSize
(
kernel_
));
}
}
const
index_t
channel_blk
=
RoundUpDiv4
(
output_channels
);
const
index_t
channel_blk
=
RoundUpDiv4
(
output_channels
);
...
@@ -46,7 +53,8 @@ void SliceFunctor<DeviceType::OPENCL, T>::operator()(
...
@@ -46,7 +53,8 @@ void SliceFunctor<DeviceType::OPENCL, T>::operator()(
static_cast
<
uint32_t
>
(
input
->
dim
(
2
)),
static_cast
<
uint32_t
>
(
input
->
dim
(
2
)),
static_cast
<
uint32_t
>
(
input
->
dim
(
0
)
*
input
->
dim
(
1
)),
static_cast
<
uint32_t
>
(
input
->
dim
(
0
)
*
input
->
dim
(
1
)),
};
};
const
std
::
vector
<
uint32_t
>
lws
=
{
8
,
16
,
8
,
1
};
const
std
::
vector
<
uint32_t
>
lws
=
{
8
,
kwg_size_
/
64
,
8
,
1
};
std
::
stringstream
ss
;
std
::
stringstream
ss
;
ss
<<
"slice_opencl_kernel_"
ss
<<
"slice_opencl_kernel_"
<<
input
->
dim
(
0
)
<<
"_"
<<
input
->
dim
(
0
)
<<
"_"
...
@@ -56,6 +64,11 @@ void SliceFunctor<DeviceType::OPENCL, T>::operator()(
...
@@ -56,6 +64,11 @@ void SliceFunctor<DeviceType::OPENCL, T>::operator()(
<<
outputs_count
;
<<
outputs_count
;
for
(
int
i
=
0
;
i
<
outputs_count
;
++
i
)
{
for
(
int
i
=
0
;
i
<
outputs_count
;
++
i
)
{
uint32_t
idx
=
0
;
uint32_t
idx
=
0
;
if
(
!
runtime
->
IsNonUniformWorkgroupsSupported
())
{
kernel_
.
setArg
(
idx
++
,
gws
[
0
]);
kernel_
.
setArg
(
idx
++
,
gws
[
1
]);
kernel_
.
setArg
(
idx
++
,
gws
[
2
]);
}
kernel_
.
setArg
(
idx
++
,
*
(
input
->
opencl_image
()));
kernel_
.
setArg
(
idx
++
,
*
(
input
->
opencl_image
()));
kernel_
.
setArg
(
idx
++
,
static_cast
<
int32_t
>
(
channel_blk
*
i
));
kernel_
.
setArg
(
idx
++
,
static_cast
<
int32_t
>
(
channel_blk
*
i
));
kernel_
.
setArg
(
idx
++
,
*
(
output_list
[
i
]
->
opencl_image
()));
kernel_
.
setArg
(
idx
++
,
*
(
output_list
[
i
]
->
opencl_image
()));
...
...
mace/kernels/opencl/softmax_opencl.cc
浏览文件 @
33b049e6
...
@@ -23,29 +23,43 @@ void SoftmaxFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *logits,
...
@@ -23,29 +23,43 @@ void SoftmaxFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *logits,
const
index_t
channel_blocks
=
RoundUpDiv4
(
channels
);
const
index_t
channel_blocks
=
RoundUpDiv4
(
channels
);
const
int
remain_channels
=
channel_blocks
*
4
-
channels
;
const
int
remain_channels
=
channel_blocks
*
4
-
channels
;
if
(
kernel_
.
get
()
==
nullptr
)
{
const
uint32_t
gws
[
3
]
=
{
static_cast
<
uint32_t
>
(
channel_blocks
),
auto
runtime
=
OpenCLRuntime
::
Global
();
static_cast
<
uint32_t
>
(
width
),
static_cast
<
uint32_t
>
(
height
*
batch
)};
auto
runtime
=
OpenCLRuntime
::
Global
();
if
(
kernel_
.
get
()
==
nullptr
)
{
std
::
set
<
std
::
string
>
built_options
;
std
::
set
<
std
::
string
>
built_options
;
std
::
string
kernel_name
=
MACE_OBFUSCATE_SYMBOL
(
"softmax"
);
std
::
string
kernel_name
=
MACE_OBFUSCATE_SYMBOL
(
"softmax"
);
built_options
.
emplace
(
"-Dsoftmax="
+
kernel_name
);
built_options
.
emplace
(
"-Dsoftmax="
+
kernel_name
);
auto
dt
=
DataTypeToEnum
<
T
>::
value
;
auto
dt
=
DataTypeToEnum
<
T
>::
value
;
built_options
.
emplace
(
"-DDATA_TYPE="
+
DtToUpstreamCLDt
(
dt
));
built_options
.
emplace
(
"-DDATA_TYPE="
+
DtToUpstreamCLDt
(
dt
));
built_options
.
emplace
(
"-DCMD_DATA_TYPE="
+
DtToUpstreamCLCMDDt
(
dt
));
built_options
.
emplace
(
"-DCMD_DATA_TYPE="
+
DtToUpstreamCLCMDDt
(
dt
));
if
(
runtime
->
IsNonUniformWorkgroupsSupported
())
{
built_options
.
emplace
(
"-DNON_UNIFORM_WORK_GROUP"
);
}
kernel_
=
runtime
->
BuildKernel
(
"softmax"
,
kernel_name
,
built_options
);
kernel_
=
runtime
->
BuildKernel
(
"softmax"
,
kernel_name
,
built_options
);
kwg_size_
=
static_cast
<
uint32_t
>
(
runtime
->
GetKernelMaxWorkGroupSize
(
kernel_
));
}
}
if
(
!
IsVecEqual
(
input_shape_
,
logits
->
shape
()))
{
if
(
!
IsVecEqual
(
input_shape_
,
logits
->
shape
()))
{
uint32_t
idx
=
0
;
uint32_t
idx
=
0
;
if
(
!
runtime
->
IsNonUniformWorkgroupsSupported
())
{
kernel_
.
setArg
(
idx
++
,
gws
[
0
]);
kernel_
.
setArg
(
idx
++
,
gws
[
1
]);
kernel_
.
setArg
(
idx
++
,
gws
[
2
]);
}
kernel_
.
setArg
(
idx
++
,
*
(
logits
->
opencl_image
()));
kernel_
.
setArg
(
idx
++
,
*
(
logits
->
opencl_image
()));
kernel_
.
setArg
(
idx
++
,
static_cast
<
int
>
(
channels
));
kernel_
.
setArg
(
idx
++
,
static_cast
<
int
>
(
channels
));
kernel_
.
setArg
(
idx
++
,
remain_channels
);
kernel_
.
setArg
(
idx
++
,
remain_channels
);
kernel_
.
setArg
(
idx
++
,
*
(
output
->
opencl_image
()));
kernel_
.
setArg
(
idx
++
,
*
(
output
->
opencl_image
()));
input_shape_
=
logits
->
shape
();
input_shape_
=
logits
->
shape
();
}
}
const
uint32_t
gws
[
3
]
=
{
static_cast
<
uint32_t
>
(
channel_blocks
),
static_cast
<
uint32_t
>
(
width
),
const
std
::
vector
<
uint32_t
>
lws
=
{
8
,
kwg_size_
/
64
,
8
,
1
};
static_cast
<
uint32_t
>
(
height
*
batch
)};
const
std
::
vector
<
uint32_t
>
lws
=
{
8
,
16
,
8
,
1
};
std
::
stringstream
ss
;
std
::
stringstream
ss
;
ss
<<
"softmax_opencl_kernel_"
<<
output
->
dim
(
0
)
<<
"_"
<<
output
->
dim
(
1
)
ss
<<
"softmax_opencl_kernel_"
<<
output
->
dim
(
0
)
<<
"_"
<<
output
->
dim
(
1
)
<<
"_"
<<
output
->
dim
(
2
)
<<
"_"
<<
output
->
dim
(
3
);
<<
"_"
<<
output
->
dim
(
2
)
<<
"_"
<<
output
->
dim
(
3
);
...
...
mace/kernels/opencl/space_to_batch_opencl.cc
浏览文件 @
33b049e6
...
@@ -31,9 +31,15 @@ void SpaceToBatchFunctor<DeviceType::OPENCL, T>::operator()(
...
@@ -31,9 +31,15 @@ void SpaceToBatchFunctor<DeviceType::OPENCL, T>::operator()(
batch_tensor
->
ResizeImage
(
output_shape
,
output_image_shape
);
batch_tensor
->
ResizeImage
(
output_shape
,
output_image_shape
);
kernel_name
=
"space_to_batch"
;
kernel_name
=
"space_to_batch"
;
}
}
const
uint32_t
chan_blk
=
RoundUpDiv4
<
uint32_t
>
(
batch_tensor
->
dim
(
3
));
const
uint32_t
gws
[
3
]
=
{
chan_blk
,
static_cast
<
uint32_t
>
(
batch_tensor
->
dim
(
2
)),
static_cast
<
uint32_t
>
(
batch_tensor
->
dim
(
0
)
*
batch_tensor
->
dim
(
1
))};
auto
runtime
=
OpenCLRuntime
::
Global
();
if
(
kernel_
.
get
()
==
nullptr
)
{
if
(
kernel_
.
get
()
==
nullptr
)
{
std
::
string
obfuscated_kernel_name
=
MACE_OBFUSCATE_SYMBOL
(
kernel_name
);
std
::
string
obfuscated_kernel_name
=
MACE_OBFUSCATE_SYMBOL
(
kernel_name
);
auto
runtime
=
OpenCLRuntime
::
Global
();
std
::
set
<
std
::
string
>
built_options
;
std
::
set
<
std
::
string
>
built_options
;
std
::
stringstream
kernel_name_ss
;
std
::
stringstream
kernel_name_ss
;
kernel_name_ss
<<
"-D"
<<
kernel_name
<<
"="
<<
obfuscated_kernel_name
;
kernel_name_ss
<<
"-D"
<<
kernel_name
<<
"="
<<
obfuscated_kernel_name
;
...
@@ -41,11 +47,22 @@ void SpaceToBatchFunctor<DeviceType::OPENCL, T>::operator()(
...
@@ -41,11 +47,22 @@ void SpaceToBatchFunctor<DeviceType::OPENCL, T>::operator()(
built_options
.
emplace
(
"-DDATA_TYPE="
+
DtToCLDt
(
DataTypeToEnum
<
T
>::
value
));
built_options
.
emplace
(
"-DDATA_TYPE="
+
DtToCLDt
(
DataTypeToEnum
<
T
>::
value
));
built_options
.
emplace
(
"-DCMD_DATA_TYPE="
+
built_options
.
emplace
(
"-DCMD_DATA_TYPE="
+
DtToCLCMDDt
(
DataTypeToEnum
<
T
>::
value
));
DtToCLCMDDt
(
DataTypeToEnum
<
T
>::
value
));
if
(
runtime
->
IsNonUniformWorkgroupsSupported
())
{
built_options
.
emplace
(
"-DNON_UNIFORM_WORK_GROUP"
);
}
kernel_
=
kernel_
=
runtime
->
BuildKernel
(
"space_to_batch"
,
kernel_name
,
built_options
);
runtime
->
BuildKernel
(
"space_to_batch"
,
kernel_name
,
built_options
);
kwg_size_
=
static_cast
<
uint32_t
>
(
runtime
->
GetKernelMaxWorkGroupSize
(
kernel_
));
}
}
if
(
!
IsVecEqual
(
space_shape_
,
space_tensor
->
shape
()))
{
if
(
!
IsVecEqual
(
space_shape_
,
space_tensor
->
shape
()))
{
uint32_t
idx
=
0
;
uint32_t
idx
=
0
;
if
(
!
runtime
->
IsNonUniformWorkgroupsSupported
())
{
kernel_
.
setArg
(
idx
++
,
gws
[
0
]);
kernel_
.
setArg
(
idx
++
,
gws
[
1
]);
kernel_
.
setArg
(
idx
++
,
gws
[
2
]);
}
if
(
b2s_
)
{
if
(
b2s_
)
{
kernel_
.
setArg
(
idx
++
,
*
(
batch_tensor
->
opencl_image
()));
kernel_
.
setArg
(
idx
++
,
*
(
batch_tensor
->
opencl_image
()));
kernel_
.
setArg
(
idx
++
,
*
(
space_tensor
->
opencl_image
()));
kernel_
.
setArg
(
idx
++
,
*
(
space_tensor
->
opencl_image
()));
...
@@ -65,11 +82,7 @@ void SpaceToBatchFunctor<DeviceType::OPENCL, T>::operator()(
...
@@ -65,11 +82,7 @@ void SpaceToBatchFunctor<DeviceType::OPENCL, T>::operator()(
space_shape_
=
space_tensor
->
shape
();
space_shape_
=
space_tensor
->
shape
();
}
}
const
uint32_t
chan_blk
=
RoundUpDiv4
<
uint32_t
>
(
batch_tensor
->
dim
(
3
));
const
std
::
vector
<
uint32_t
>
lws
=
{
8
,
kwg_size_
/
64
,
8
,
1
};
const
uint32_t
gws
[
3
]
=
{
chan_blk
,
static_cast
<
uint32_t
>
(
batch_tensor
->
dim
(
2
)),
static_cast
<
uint32_t
>
(
batch_tensor
->
dim
(
0
)
*
batch_tensor
->
dim
(
1
))};
const
std
::
vector
<
uint32_t
>
lws
=
{
8
,
16
,
8
,
1
};
std
::
stringstream
ss
;
std
::
stringstream
ss
;
ss
<<
kernel_name
<<
"_"
<<
batch_tensor
->
dim
(
0
)
<<
"_"
ss
<<
kernel_name
<<
"_"
<<
batch_tensor
->
dim
(
0
)
<<
"_"
<<
batch_tensor
->
dim
(
1
)
<<
"_"
<<
batch_tensor
->
dim
(
2
)
<<
"_"
<<
batch_tensor
->
dim
(
1
)
<<
"_"
<<
batch_tensor
->
dim
(
2
)
<<
"_"
...
...
mace/kernels/opencl/winograd_transform.cc
浏览文件 @
33b049e6
...
@@ -15,6 +15,8 @@ template <typename T>
...
@@ -15,6 +15,8 @@ template <typename T>
void
WinogradTransformFunctor
<
DeviceType
::
OPENCL
,
T
>::
operator
()(
void
WinogradTransformFunctor
<
DeviceType
::
OPENCL
,
T
>::
operator
()(
const
Tensor
*
input_tensor
,
Tensor
*
output_tensor
,
StatsFuture
*
future
)
{
const
Tensor
*
input_tensor
,
Tensor
*
output_tensor
,
StatsFuture
*
future
)
{
auto
runtime
=
OpenCLRuntime
::
Global
();
if
(
kernel_
.
get
()
==
nullptr
)
{
if
(
kernel_
.
get
()
==
nullptr
)
{
std
::
string
obfuscated_kernel_name
=
std
::
string
obfuscated_kernel_name
=
MACE_OBFUSCATE_SYMBOL
(
"winograd_transform_2x2"
);
MACE_OBFUSCATE_SYMBOL
(
"winograd_transform_2x2"
);
...
@@ -24,9 +26,14 @@ void WinogradTransformFunctor<DeviceType::OPENCL, T>::operator()(
...
@@ -24,9 +26,14 @@ void WinogradTransformFunctor<DeviceType::OPENCL, T>::operator()(
DtToUpstreamCLDt
(
DataTypeToEnum
<
T
>::
value
));
DtToUpstreamCLDt
(
DataTypeToEnum
<
T
>::
value
));
built_options
.
emplace
(
"-DCMD_DATA_TYPE="
+
built_options
.
emplace
(
"-DCMD_DATA_TYPE="
+
DtToUpstreamCLCMDDt
(
DataTypeToEnum
<
T
>::
value
));
DtToUpstreamCLCMDDt
(
DataTypeToEnum
<
T
>::
value
));
auto
runtime
=
OpenCLRuntime
::
Global
();
if
(
runtime
->
IsNonUniformWorkgroupsSupported
())
{
built_options
.
emplace
(
"-DNON_UNIFORM_WORK_GROUP"
);
}
kernel_
=
runtime
->
BuildKernel
(
"winograd_transform"
,
obfuscated_kernel_name
,
kernel_
=
runtime
->
BuildKernel
(
"winograd_transform"
,
obfuscated_kernel_name
,
built_options
);
built_options
);
kwg_size_
=
static_cast
<
uint32_t
>
(
runtime
->
GetKernelMaxWorkGroupSize
(
kernel_
));
}
}
std
::
vector
<
index_t
>
output_shape
(
4
);
std
::
vector
<
index_t
>
output_shape
(
4
);
std
::
vector
<
index_t
>
filter_shape
=
{
3
,
3
,
input_tensor
->
dim
(
3
),
1
};
std
::
vector
<
index_t
>
filter_shape
=
{
3
,
3
,
input_tensor
->
dim
(
3
),
1
};
...
@@ -44,6 +51,9 @@ void WinogradTransformFunctor<DeviceType::OPENCL, T>::operator()(
...
@@ -44,6 +51,9 @@ void WinogradTransformFunctor<DeviceType::OPENCL, T>::operator()(
const
index_t
round_h
=
(
output_shape
[
1
]
+
1
)
/
2
;
const
index_t
round_h
=
(
output_shape
[
1
]
+
1
)
/
2
;
const
index_t
round_w
=
(
output_shape
[
2
]
+
1
)
/
2
;
const
index_t
round_w
=
(
output_shape
[
2
]
+
1
)
/
2
;
const
index_t
out_width
=
input_tensor
->
dim
(
0
)
*
round_h
*
round_w
;
const
index_t
out_width
=
input_tensor
->
dim
(
0
)
*
round_h
*
round_w
;
const
uint32_t
gws
[
2
]
=
{
static_cast
<
uint32_t
>
(
out_width
),
static_cast
<
uint32_t
>
(
RoundUpDiv4
(
input_tensor
->
dim
(
3
)))};
if
(
!
IsVecEqual
(
input_shape_
,
input_tensor
->
shape
()))
{
if
(
!
IsVecEqual
(
input_shape_
,
input_tensor
->
shape
()))
{
output_shape
=
{
16
,
input_tensor
->
dim
(
3
),
out_width
,
1
};
output_shape
=
{
16
,
input_tensor
->
dim
(
3
),
out_width
,
1
};
...
@@ -52,6 +62,10 @@ void WinogradTransformFunctor<DeviceType::OPENCL, T>::operator()(
...
@@ -52,6 +62,10 @@ void WinogradTransformFunctor<DeviceType::OPENCL, T>::operator()(
output_tensor
->
ResizeImage
(
output_shape
,
image_shape
);
output_tensor
->
ResizeImage
(
output_shape
,
image_shape
);
uint32_t
idx
=
0
;
uint32_t
idx
=
0
;
if
(
!
runtime
->
IsNonUniformWorkgroupsSupported
())
{
kernel_
.
setArg
(
idx
++
,
gws
[
0
]);
kernel_
.
setArg
(
idx
++
,
gws
[
1
]);
}
kernel_
.
setArg
(
idx
++
,
*
(
input_tensor
->
opencl_image
()));
kernel_
.
setArg
(
idx
++
,
*
(
input_tensor
->
opencl_image
()));
kernel_
.
setArg
(
idx
++
,
*
(
output_tensor
->
opencl_image
()));
kernel_
.
setArg
(
idx
++
,
*
(
output_tensor
->
opencl_image
()));
kernel_
.
setArg
(
idx
++
,
static_cast
<
uint32_t
>
(
input_tensor
->
dim
(
1
)));
kernel_
.
setArg
(
idx
++
,
static_cast
<
uint32_t
>
(
input_tensor
->
dim
(
1
)));
...
@@ -65,10 +79,7 @@ void WinogradTransformFunctor<DeviceType::OPENCL, T>::operator()(
...
@@ -65,10 +79,7 @@ void WinogradTransformFunctor<DeviceType::OPENCL, T>::operator()(
input_shape_
=
input_tensor
->
shape
();
input_shape_
=
input_tensor
->
shape
();
}
}
const
uint32_t
gws
[
2
]
=
{
const
std
::
vector
<
uint32_t
>
lws
=
{
kwg_size_
/
8
,
8
,
1
};
static_cast
<
uint32_t
>
(
out_width
),
static_cast
<
uint32_t
>
(
RoundUpDiv4
(
input_tensor
->
dim
(
3
)))};
const
std
::
vector
<
uint32_t
>
lws
=
{
128
,
8
,
1
};
std
::
stringstream
ss
;
std
::
stringstream
ss
;
ss
<<
"winograd_transform_kernel_"
<<
input_tensor
->
dim
(
0
)
<<
"_"
ss
<<
"winograd_transform_kernel_"
<<
input_tensor
->
dim
(
0
)
<<
"_"
<<
input_tensor
->
dim
(
1
)
<<
"_"
<<
input_tensor
->
dim
(
2
)
<<
"_"
<<
input_tensor
->
dim
(
1
)
<<
"_"
<<
input_tensor
->
dim
(
2
)
<<
"_"
...
@@ -82,6 +93,9 @@ void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()(
...
@@ -82,6 +93,9 @@ void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()(
const
Tensor
*
bias
,
const
Tensor
*
bias
,
Tensor
*
output_tensor
,
Tensor
*
output_tensor
,
StatsFuture
*
future
)
{
StatsFuture
*
future
)
{
auto
runtime
=
OpenCLRuntime
::
Global
();
if
(
kernel_
.
get
()
==
nullptr
)
{
if
(
kernel_
.
get
()
==
nullptr
)
{
std
::
string
obfuscated_kernel_name
=
std
::
string
obfuscated_kernel_name
=
MACE_OBFUSCATE_SYMBOL
(
"winograd_inverse_transform_2x2"
);
MACE_OBFUSCATE_SYMBOL
(
"winograd_inverse_transform_2x2"
);
...
@@ -92,6 +106,9 @@ void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()(
...
@@ -92,6 +106,9 @@ void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()(
DtToUpstreamCLDt
(
DataTypeToEnum
<
T
>::
value
));
DtToUpstreamCLDt
(
DataTypeToEnum
<
T
>::
value
));
built_options
.
emplace
(
"-DCMD_DATA_TYPE="
+
built_options
.
emplace
(
"-DCMD_DATA_TYPE="
+
DtToUpstreamCLCMDDt
(
DataTypeToEnum
<
T
>::
value
));
DtToUpstreamCLCMDDt
(
DataTypeToEnum
<
T
>::
value
));
if
(
runtime
->
IsNonUniformWorkgroupsSupported
())
{
built_options
.
emplace
(
"-DNON_UNIFORM_WORK_GROUP"
);
}
built_options
.
emplace
(
bias
!=
nullptr
?
"-DBIAS"
:
""
);
built_options
.
emplace
(
bias
!=
nullptr
?
"-DBIAS"
:
""
);
switch
(
activation_
)
{
switch
(
activation_
)
{
case
NOOP
:
case
NOOP
:
...
@@ -115,10 +132,16 @@ void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()(
...
@@ -115,10 +132,16 @@ void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()(
LOG
(
FATAL
)
<<
"Unknown activation type: "
<<
activation_
;
LOG
(
FATAL
)
<<
"Unknown activation type: "
<<
activation_
;
}
}
auto
runtime
=
OpenCLRuntime
::
Global
();
kernel_
=
runtime
->
BuildKernel
(
"winograd_transform"
,
obfuscated_kernel_name
,
kernel_
=
runtime
->
BuildKernel
(
"winograd_transform"
,
obfuscated_kernel_name
,
built_options
);
built_options
);
kwg_size_
=
static_cast
<
uint32_t
>
(
runtime
->
GetKernelMaxWorkGroupSize
(
kernel_
));
}
}
const
uint32_t
gws
[
2
]
=
{
static_cast
<
uint32_t
>
(
input_tensor
->
dim
(
2
)),
static_cast
<
uint32_t
>
(
RoundUpDiv4
(
input_tensor
->
dim
(
1
)))};
if
(
!
IsVecEqual
(
input_shape_
,
input_tensor
->
shape
()))
{
if
(
!
IsVecEqual
(
input_shape_
,
input_tensor
->
shape
()))
{
std
::
vector
<
index_t
>
output_shape
=
{
batch_
,
height_
,
width_
,
std
::
vector
<
index_t
>
output_shape
=
{
batch_
,
height_
,
width_
,
input_tensor
->
dim
(
1
)};
input_tensor
->
dim
(
1
)};
...
@@ -129,6 +152,10 @@ void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()(
...
@@ -129,6 +152,10 @@ void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()(
const
uint32_t
round_h
=
(
height_
+
1
)
/
2
;
const
uint32_t
round_h
=
(
height_
+
1
)
/
2
;
const
uint32_t
round_w
=
(
width_
+
1
)
/
2
;
const
uint32_t
round_w
=
(
width_
+
1
)
/
2
;
uint32_t
idx
=
0
;
uint32_t
idx
=
0
;
if
(
!
runtime
->
IsNonUniformWorkgroupsSupported
())
{
kernel_
.
setArg
(
idx
++
,
gws
[
0
]);
kernel_
.
setArg
(
idx
++
,
gws
[
1
]);
}
kernel_
.
setArg
(
kernel_
.
setArg
(
idx
++
,
idx
++
,
*
(
static_cast
<
const
cl
::
Image2D
*>
(
input_tensor
->
opencl_image
())));
*
(
static_cast
<
const
cl
::
Image2D
*>
(
input_tensor
->
opencl_image
())));
...
@@ -147,10 +174,7 @@ void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()(
...
@@ -147,10 +174,7 @@ void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()(
input_shape_
=
input_tensor
->
shape
();
input_shape_
=
input_tensor
->
shape
();
}
}
const
uint32_t
gws
[
2
]
=
{
const
std
::
vector
<
uint32_t
>
lws
=
{
kwg_size_
/
8
,
8
,
1
};
static_cast
<
uint32_t
>
(
input_tensor
->
dim
(
2
)),
static_cast
<
uint32_t
>
(
RoundUpDiv4
(
input_tensor
->
dim
(
1
)))};
const
std
::
vector
<
uint32_t
>
lws
=
{
128
,
8
,
1
};
std
::
stringstream
ss
;
std
::
stringstream
ss
;
ss
<<
"winograd_inverse_transform_kernel_"
<<
input_tensor
->
dim
(
0
)
<<
"_"
ss
<<
"winograd_inverse_transform_kernel_"
<<
input_tensor
->
dim
(
0
)
<<
"_"
...
...
mace/kernels/pooling.h
浏览文件 @
33b049e6
...
@@ -185,6 +185,7 @@ struct PoolingFunctor<DeviceType::OPENCL, T> : PoolingFunctorBase {
...
@@ -185,6 +185,7 @@ struct PoolingFunctor<DeviceType::OPENCL, T> : PoolingFunctorBase {
StatsFuture
*
future
);
StatsFuture
*
future
);
cl
::
Kernel
kernel_
;
cl
::
Kernel
kernel_
;
uint32_t
kwg_size_
;
std
::
vector
<
index_t
>
input_shape_
;
std
::
vector
<
index_t
>
input_shape_
;
};
};
...
...
mace/kernels/resize_bilinear.h
浏览文件 @
33b049e6
...
@@ -173,6 +173,7 @@ struct ResizeBilinearFunctor<DeviceType::OPENCL, T>
...
@@ -173,6 +173,7 @@ struct ResizeBilinearFunctor<DeviceType::OPENCL, T>
void
operator
()(
const
Tensor
*
input
,
Tensor
*
output
,
StatsFuture
*
future
);
void
operator
()(
const
Tensor
*
input
,
Tensor
*
output
,
StatsFuture
*
future
);
cl
::
Kernel
kernel_
;
cl
::
Kernel
kernel_
;
uint32_t
kwg_size_
;
std
::
vector
<
index_t
>
input_shape_
;
std
::
vector
<
index_t
>
input_shape_
;
};
};
...
...
mace/kernels/slice.h
浏览文件 @
33b049e6
...
@@ -61,6 +61,7 @@ struct SliceFunctor<DeviceType::OPENCL, T> {
...
@@ -61,6 +61,7 @@ struct SliceFunctor<DeviceType::OPENCL, T> {
const
std
::
vector
<
Tensor
*>
&
output_list
,
const
std
::
vector
<
Tensor
*>
&
output_list
,
StatsFuture
*
future
);
StatsFuture
*
future
);
cl
::
Kernel
kernel_
;
cl
::
Kernel
kernel_
;
uint32_t
kwg_size_
;
};
};
}
// namespace kernels
}
// namespace kernels
...
...
mace/kernels/softmax.h
浏览文件 @
33b049e6
...
@@ -61,6 +61,7 @@ struct SoftmaxFunctor<DeviceType::OPENCL, T> {
...
@@ -61,6 +61,7 @@ struct SoftmaxFunctor<DeviceType::OPENCL, T> {
void
operator
()(
const
Tensor
*
logits
,
Tensor
*
output
,
StatsFuture
*
future
);
void
operator
()(
const
Tensor
*
logits
,
Tensor
*
output
,
StatsFuture
*
future
);
cl
::
Kernel
kernel_
;
cl
::
Kernel
kernel_
;
uint32_t
kwg_size_
;
std
::
vector
<
index_t
>
input_shape_
;
std
::
vector
<
index_t
>
input_shape_
;
};
};
...
...
mace/kernels/space_to_batch.h
浏览文件 @
33b049e6
...
@@ -56,6 +56,7 @@ struct SpaceToBatchFunctor<DeviceType::OPENCL, T> : SpaceToBatchFunctorBase {
...
@@ -56,6 +56,7 @@ struct SpaceToBatchFunctor<DeviceType::OPENCL, T> : SpaceToBatchFunctorBase {
StatsFuture
*
future
);
StatsFuture
*
future
);
cl
::
Kernel
kernel_
;
cl
::
Kernel
kernel_
;
uint32_t
kwg_size_
;
std
::
vector
<
index_t
>
space_shape_
;
std
::
vector
<
index_t
>
space_shape_
;
};
};
...
...
mace/kernels/winograd_transform.h
浏览文件 @
33b049e6
...
@@ -51,6 +51,7 @@ struct WinogradTransformFunctor<DeviceType::OPENCL, T>
...
@@ -51,6 +51,7 @@ struct WinogradTransformFunctor<DeviceType::OPENCL, T>
void
operator
()(
const
Tensor
*
input
,
Tensor
*
output
,
StatsFuture
*
future
);
void
operator
()(
const
Tensor
*
input
,
Tensor
*
output
,
StatsFuture
*
future
);
cl
::
Kernel
kernel_
;
cl
::
Kernel
kernel_
;
uint32_t
kwg_size_
;
std
::
vector
<
index_t
>
input_shape_
;
std
::
vector
<
index_t
>
input_shape_
;
};
};
...
@@ -108,6 +109,7 @@ struct WinogradInverseTransformFunctor<DeviceType::OPENCL, T>
...
@@ -108,6 +109,7 @@ struct WinogradInverseTransformFunctor<DeviceType::OPENCL, T>
StatsFuture
*
future
);
StatsFuture
*
future
);
cl
::
Kernel
kernel_
;
cl
::
Kernel
kernel_
;
uint32_t
kwg_size_
;
std
::
vector
<
index_t
>
input_shape_
;
std
::
vector
<
index_t
>
input_shape_
;
};
};
...
...
tools/build_mace_run.sh
浏览文件 @
33b049e6
...
@@ -43,6 +43,10 @@ else
...
@@ -43,6 +43,10 @@ else
HEXAGON_MODE_BUILD_FLAG
=
"--define hexagon=true"
HEXAGON_MODE_BUILD_FLAG
=
"--define hexagon=true"
fi
fi
if
[
x
"
$TARGET_ABI
"
=
x
"arm64-v8a"
]
;
then
NEON_ENABLE_FLAG
=
"--define neon=true"
fi
bazel build
--verbose_failures
-c
opt
--strip
always //mace/examples:mace_run
\
bazel build
--verbose_failures
-c
opt
--strip
always //mace/examples:mace_run
\
--crosstool_top
=
//external:android/crosstool
\
--crosstool_top
=
//external:android/crosstool
\
--host_crosstool_top
=
@bazel_tools//tools/cpp:toolchain
\
--host_crosstool_top
=
@bazel_tools//tools/cpp:toolchain
\
...
@@ -54,6 +58,7 @@ else
...
@@ -54,6 +58,7 @@ else
--copt
=
"-DMACE_MODEL_TAG=
${
MODEL_TAG
}
"
\
--copt
=
"-DMACE_MODEL_TAG=
${
MODEL_TAG
}
"
\
--define
openmp
=
true
\
--define
openmp
=
true
\
--copt
=
"-O3"
\
--copt
=
"-O3"
\
$NEON_ENABLE_FLAG
\
$PRODUCTION_MODE_BUILD_FLAGS
\
$PRODUCTION_MODE_BUILD_FLAGS
\
$HEXAGON_MODE_BUILD_FLAG
||
exit
1
$HEXAGON_MODE_BUILD_FLAG
||
exit
1
fi
fi
...
...
tools/mace_tools.py
浏览文件 @
33b049e6
...
@@ -376,7 +376,8 @@ def main(unused_args):
...
@@ -376,7 +376,8 @@ def main(unused_args):
build_run_throughput_test
(
target_soc
,
FLAGS
.
run_seconds
,
build_run_throughput_test
(
target_soc
,
FLAGS
.
run_seconds
,
merged_lib_file
,
FLAGS
.
output_dir
)
merged_lib_file
,
FLAGS
.
output_dir
)
packaging_lib_file
(
FLAGS
.
output_dir
)
if
FLAGS
.
mode
==
"build"
or
FLAGS
.
mode
==
"all"
:
packaging_lib_file
(
FLAGS
.
output_dir
)
if
__name__
==
"__main__"
:
if
__name__
==
"__main__"
:
...
...
tools/packaging_lib.sh
浏览文件 @
33b049e6
...
@@ -14,8 +14,13 @@ source ${CURRENT_DIR}/env.sh
...
@@ -14,8 +14,13 @@ source ${CURRENT_DIR}/env.sh
LIBMACE_BUILD_DIR
=
$1
LIBMACE_BUILD_DIR
=
$1
TAR_PACKAGE_NAME
=
libmace_
${
PROJECT_NAME
}
.tar.gz
pushd
$LIBMACE_BUILD_DIR
/
$PROJECT_NAME
pushd
$LIBMACE_BUILD_DIR
/
$PROJECT_NAME
ls
|
grep
-v
build | xargs
tar
cvzf libmace_
${
PROJECT_NAME
}
.tar.gz
if
[
-f
$TAR_PACKAGE_NAME
]
;
then
rm
-f
$TAR_PACKAGE_NAME
fi
ls
|
grep
-v
build | xargs
tar
cvzf
$TAR_PACKAGE_NAME
popd
popd
echo
"Packaging done!"
echo
"Packaging done!"
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录