Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
冰之2023
Mace
提交
ed267833
Mace
项目概览
冰之2023
/
Mace
与 Fork 源项目一致
Fork自
Xiaomi / Mace
通知
1
Star
0
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
Mace
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
前往新版Gitcode,体验更适合开发者的 AI 搜索 >>
提交
ed267833
编写于
3月 28, 2018
作者:
Y
yejianwu
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
format code and reduce get kwg size
上级
44d4903d
变更
43
隐藏空白更改
内联
并排
Showing
43 changed file
with
276 addition
and
197 deletion
+276
-197
mace/core/runtime/opencl/opencl_runtime.cc
mace/core/runtime/opencl/opencl_runtime.cc
+25
-14
mace/core/runtime/opencl/opencl_runtime.h
mace/core/runtime/opencl/opencl_runtime.h
+5
-4
mace/kernels/activation.h
mace/kernels/activation.h
+2
-0
mace/kernels/addn.h
mace/kernels/addn.h
+2
-0
mace/kernels/batch_norm.h
mace/kernels/batch_norm.h
+2
-0
mace/kernels/bias_add.h
mace/kernels/bias_add.h
+2
-0
mace/kernels/channel_shuffle.h
mace/kernels/channel_shuffle.h
+2
-0
mace/kernels/concat.h
mace/kernels/concat.h
+2
-0
mace/kernels/conv_2d.h
mace/kernels/conv_2d.h
+2
-0
mace/kernels/depth_to_space.h
mace/kernels/depth_to_space.h
+2
-0
mace/kernels/depthwise_conv2d.h
mace/kernels/depthwise_conv2d.h
+2
-0
mace/kernels/eltwise.h
mace/kernels/eltwise.h
+2
-0
mace/kernels/matmul.h
mace/kernels/matmul.h
+2
-0
mace/kernels/opencl/activation_opencl.cc
mace/kernels/opencl/activation_opencl.cc
+7
-6
mace/kernels/opencl/addn.cc
mace/kernels/opencl/addn.cc
+7
-6
mace/kernels/opencl/batch_norm_opencl.cc
mace/kernels/opencl/batch_norm_opencl.cc
+7
-5
mace/kernels/opencl/bias_add_opencl.cc
mace/kernels/opencl/bias_add_opencl.cc
+8
-7
mace/kernels/opencl/buffer_to_image.cc
mace/kernels/opencl/buffer_to_image.cc
+4
-3
mace/kernels/opencl/channel_shuffle.cc
mace/kernels/opencl/channel_shuffle.cc
+7
-6
mace/kernels/opencl/concat.cc
mace/kernels/opencl/concat.cc
+22
-15
mace/kernels/opencl/conv_2d_opencl.cc
mace/kernels/opencl/conv_2d_opencl.cc
+15
-6
mace/kernels/opencl/conv_2d_opencl_1x1.cc
mace/kernels/opencl/conv_2d_opencl_1x1.cc
+10
-7
mace/kernels/opencl/conv_2d_opencl_3x3.cc
mace/kernels/opencl/conv_2d_opencl_3x3.cc
+10
-7
mace/kernels/opencl/conv_2d_opencl_general.cc
mace/kernels/opencl/conv_2d_opencl_general.cc
+10
-7
mace/kernels/opencl/depth_to_space_opencl.cc
mace/kernels/opencl/depth_to_space_opencl.cc
+7
-6
mace/kernels/opencl/depthwise_conv_opencl.cc
mace/kernels/opencl/depthwise_conv_opencl.cc
+13
-8
mace/kernels/opencl/eltwise_opencl.cc
mace/kernels/opencl/eltwise_opencl.cc
+8
-6
mace/kernels/opencl/helper.cc
mace/kernels/opencl/helper.cc
+12
-21
mace/kernels/opencl/helper.h
mace/kernels/opencl/helper.h
+0
-2
mace/kernels/opencl/matmul.cc
mace/kernels/opencl/matmul.cc
+5
-5
mace/kernels/opencl/pooling_opencl.cc
mace/kernels/opencl/pooling_opencl.cc
+12
-21
mace/kernels/opencl/resize_bilinear_opencl.cc
mace/kernels/opencl/resize_bilinear_opencl.cc
+7
-6
mace/kernels/opencl/slice.cc
mace/kernels/opencl/slice.cc
+5
-5
mace/kernels/opencl/softmax_opencl.cc
mace/kernels/opencl/softmax_opencl.cc
+8
-6
mace/kernels/opencl/space_to_batch_opencl.cc
mace/kernels/opencl/space_to_batch_opencl.cc
+7
-6
mace/kernels/opencl/winograd_transform.cc
mace/kernels/opencl/winograd_transform.cc
+14
-12
mace/kernels/pooling.h
mace/kernels/pooling.h
+2
-0
mace/kernels/resize_bilinear.h
mace/kernels/resize_bilinear.h
+2
-0
mace/kernels/slice.h
mace/kernels/slice.h
+2
-0
mace/kernels/softmax.h
mace/kernels/softmax.h
+2
-0
mace/kernels/space_to_batch.h
mace/kernels/space_to_batch.h
+2
-0
mace/kernels/winograd_transform.h
mace/kernels/winograd_transform.h
+4
-0
tools/build_mace_run.sh
tools/build_mace_run.sh
+5
-0
未找到文件。
mace/core/runtime/opencl/opencl_runtime.cc
浏览文件 @
ed267833
...
...
@@ -147,16 +147,9 @@ OpenCLRuntime::OpenCLRuntime(GPUPerfHint gpu_perf_hint,
if
(
device
.
getInfo
<
CL_DEVICE_TYPE
>
()
==
CL_DEVICE_TYPE_GPU
)
{
*
device_
=
device
;
gpu_detected
=
true
;
const
std
::
string
device_name
=
device
.
getInfo
<
CL_DEVICE_NAME
>
();
constexpr
const
char
*
kQualcommAdrenoGPUStr
=
"QUALCOMM Adreno(TM)"
;
constexpr
const
char
*
kMaliGPUStr
=
"Mali"
;
if
(
device_name
==
kQualcommAdrenoGPUStr
)
{
gpu_type_
=
GPU_TYPE
::
QUALCOMM_ADRENO
;
}
else
if
(
device_name
.
find
(
kMaliGPUStr
)
!=
std
::
string
::
npos
)
{
gpu_type_
=
GPU_TYPE
::
MALI
;
}
else
{
gpu_type_
=
GPU_TYPE
::
UNKNOWN
;
}
gpu_type_
=
ParseGPUTypeFromDeviceName
(
device_name
);
const
std
::
string
device_version
=
device
.
getInfo
<
CL_DEVICE_VERSION
>
();
opencl_version_
=
device_version
.
substr
(
7
,
3
);
...
...
@@ -178,7 +171,7 @@ OpenCLRuntime::OpenCLRuntime(GPUPerfHint gpu_perf_hint,
}
cl_int
err
;
if
(
gpu_type_
==
GPU
_TYPE
::
QUALCOMM_ADRENO
)
{
if
(
gpu_type_
==
GPU
Type
::
QUALCOMM_ADRENO
)
{
std
::
vector
<
cl_context_properties
>
context_properties
;
context_properties
.
reserve
(
5
);
GetAdrenoContextProperties
(
&
context_properties
,
gpu_perf_hint
,
...
...
@@ -357,12 +350,30 @@ uint64_t OpenCLRuntime::GetKernelWaveSize(const cl::Kernel &kernel) {
return
size
;
}
const
GPU_TYPE
OpenCLRuntime
::
GetGPUType
()
const
{
return
gpu_type_
;
const
bool
OpenCLRuntime
::
IsNonUniformWorkgroupsSupported
()
{
if
(
gpu_type_
==
GPUType
::
QUALCOMM_ADRENO
&&
opencl_version_
==
"2.0"
)
{
return
true
;
}
else
{
return
false
;
}
}
const
std
::
string
&
OpenCLRuntime
::
GetOpenclVersion
()
const
{
return
opencl_version_
;
const
GPUType
OpenCLRuntime
::
ParseGPUTypeFromDeviceName
(
const
std
::
string
&
device_name
)
{
constexpr
const
char
*
kQualcommAdrenoGPUStr
=
"QUALCOMM Adreno(TM)"
;
constexpr
const
char
*
kMaliGPUStr
=
"Mali"
;
constexpr
const
char
*
kPowerVRGPUStr
=
"PowerVR"
;
if
(
device_name
==
kQualcommAdrenoGPUStr
)
{
return
GPUType
::
QUALCOMM_ADRENO
;
}
else
if
(
device_name
.
find
(
kMaliGPUStr
)
!=
std
::
string
::
npos
)
{
return
GPUType
::
MALI
;
}
else
if
(
device_name
.
find
(
kPowerVRGPUStr
)
!=
std
::
string
::
npos
)
{
return
GPUType
::
PowerVR
;
}
else
{
return
GPUType
::
UNKNOWN
;
}
}
}
// namespace mace
mace/core/runtime/opencl/opencl_runtime.h
浏览文件 @
ed267833
...
...
@@ -18,9 +18,10 @@
namespace
mace
{
enum
GPU
_TYPE
{
enum
GPU
Type
{
QUALCOMM_ADRENO
,
MALI
,
PowerVR
,
UNKNOWN
,
};
...
...
@@ -55,8 +56,8 @@ class OpenCLRuntime {
uint64_t
GetDeviceMaxWorkGroupSize
();
uint64_t
GetKernelMaxWorkGroupSize
(
const
cl
::
Kernel
&
kernel
);
uint64_t
GetKernelWaveSize
(
const
cl
::
Kernel
&
kernel
);
const
GPU_TYPE
GetGPUType
()
const
;
const
std
::
string
&
GetOpenclVersion
()
const
;
const
bool
IsNonUniformWorkgroupsSupported
()
;
const
GPUType
ParseGPUTypeFromDeviceName
(
const
std
::
string
&
device_name
)
;
cl
::
Kernel
BuildKernel
(
const
std
::
string
&
program_name
,
const
std
::
string
&
kernel_name
,
const
std
::
set
<
std
::
string
>
&
build_options
);
...
...
@@ -82,7 +83,7 @@ class OpenCLRuntime {
std
::
map
<
std
::
string
,
cl
::
Program
>
built_program_map_
;
std
::
mutex
program_build_mutex_
;
std
::
string
kernel_path_
;
GPU
_TYPE
gpu_type_
;
GPU
Type
gpu_type_
;
std
::
string
opencl_version_
;
static
GPUPerfHint
gpu_perf_hint_
;
...
...
mace/kernels/activation.h
浏览文件 @
ed267833
...
...
@@ -155,6 +155,8 @@ class ActivationFunctor<DeviceType::OPENCL, T> {
ActivationType
activation_
;
T
relux_max_limit_
;
cl
::
Kernel
kernel_
;
uint32_t
kwg_size_
;
bool
is_non_uniform_work_groups_supported_
;
std
::
string
tuning_key_prefix_
;
std
::
vector
<
index_t
>
input_shape_
;
};
...
...
mace/kernels/addn.h
浏览文件 @
ed267833
...
...
@@ -90,6 +90,8 @@ struct AddNFunctor<DeviceType::OPENCL, T> {
StatsFuture
*
future
);
cl
::
Kernel
kernel_
;
uint32_t
kwg_size_
;
bool
is_non_uniform_work_groups_supported_
;
std
::
vector
<
index_t
>
input_shape_
;
};
...
...
mace/kernels/batch_norm.h
浏览文件 @
ed267833
...
...
@@ -157,6 +157,8 @@ struct BatchNormFunctor<DeviceType::OPENCL, T> : BatchNormFunctorBase {
Tensor
*
output
,
StatsFuture
*
future
);
cl
::
Kernel
kernel_
;
uint32_t
kwg_size_
;
bool
is_non_uniform_work_groups_supported_
;
std
::
vector
<
index_t
>
input_shape_
;
};
...
...
mace/kernels/bias_add.h
浏览文件 @
ed267833
...
...
@@ -64,6 +64,8 @@ struct BiasAddFunctor<DeviceType::OPENCL, T> {
Tensor
*
output
,
StatsFuture
*
future
);
cl
::
Kernel
kernel_
;
uint32_t
kwg_size_
;
bool
is_non_uniform_work_groups_supported_
;
std
::
vector
<
index_t
>
input_shape_
;
};
...
...
mace/kernels/channel_shuffle.h
浏览文件 @
ed267833
...
...
@@ -56,6 +56,8 @@ struct ChannelShuffleFunctor<DeviceType::OPENCL, T> {
void
operator
()(
const
Tensor
*
input
,
Tensor
*
output
,
StatsFuture
*
future
);
cl
::
Kernel
kernel_
;
uint32_t
kwg_size_
;
bool
is_non_uniform_work_groups_supported_
;
const
int
groups_
;
std
::
vector
<
index_t
>
input_shape_
;
};
...
...
mace/kernels/concat.h
浏览文件 @
ed267833
...
...
@@ -85,6 +85,8 @@ struct ConcatFunctor<DeviceType::OPENCL, T> : ConcatFunctorBase {
Tensor
*
output
,
StatsFuture
*
future
);
cl
::
Kernel
kernel_
;
uint32_t
kwg_size_
;
bool
is_non_uniform_work_groups_supported_
;
std
::
vector
<
index_t
>
input_shape_
;
};
...
...
mace/kernels/conv_2d.h
浏览文件 @
ed267833
...
...
@@ -401,6 +401,8 @@ struct Conv2dFunctor<DeviceType::OPENCL, T> : Conv2dFunctorBase {
StatsFuture
*
future
);
cl
::
Kernel
kernel_
;
uint32_t
kwg_size_
;
bool
is_non_uniform_work_groups_supported_
;
std
::
vector
<
index_t
>
input_shape_
;
};
...
...
mace/kernels/depth_to_space.h
浏览文件 @
ed267833
...
...
@@ -108,6 +108,8 @@ struct DepthToSpaceOpFunctor<DeviceType::OPENCL, T> {
void
operator
()(
const
Tensor
*
input
,
Tensor
*
output
,
StatsFuture
*
future
);
cl
::
Kernel
kernel_
;
uint32_t
kwg_size_
;
bool
is_non_uniform_work_groups_supported_
;
const
int
block_size_
;
bool
d2s_
;
std
::
vector
<
index_t
>
input_shape_
;
...
...
mace/kernels/depthwise_conv2d.h
浏览文件 @
ed267833
...
...
@@ -437,6 +437,8 @@ struct DepthwiseConv2dFunctor<DeviceType::OPENCL, T>
StatsFuture
*
future
);
cl
::
Kernel
kernel_
;
uint32_t
kwg_size_
;
bool
is_non_uniform_work_groups_supported_
;
std
::
vector
<
index_t
>
input_shape_
;
};
...
...
mace/kernels/eltwise.h
浏览文件 @
ed267833
...
...
@@ -97,6 +97,8 @@ struct EltwiseFunctor<DeviceType::OPENCL, T> : EltwiseFunctorBase {
StatsFuture
*
future
);
cl
::
Kernel
kernel_
;
uint32_t
kwg_size_
;
bool
is_non_uniform_work_groups_supported_
;
std
::
vector
<
index_t
>
input_shape_
;
};
...
...
mace/kernels/matmul.h
浏览文件 @
ed267833
...
...
@@ -241,6 +241,8 @@ struct MatMulFunctor<DeviceType::OPENCL, T> {
StatsFuture
*
future
);
cl
::
Kernel
kernel_
;
uint32_t
kwg_size_
;
bool
is_non_uniform_work_groups_supported_
;
};
}
// namespace kernels
...
...
mace/kernels/opencl/activation_opencl.cc
浏览文件 @
ed267833
...
...
@@ -26,16 +26,16 @@ void ActivationFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
auto
runtime
=
OpenCLRuntime
::
Global
();
const
bool
is_qualcomm_opencl200
=
IsQualcommOpenCL200
();
if
(
kernel_
.
get
()
==
nullptr
)
{
is_non_uniform_work_groups_supported_
=
runtime
->
IsNonUniformWorkgroupsSupported
();
std
::
set
<
std
::
string
>
built_options
;
std
::
string
kernel_name
=
MACE_OBFUSCATE_SYMBOL
(
"activation"
);
built_options
.
emplace
(
"-Dactivation="
+
kernel_name
);
auto
dt
=
DataTypeToEnum
<
T
>::
value
;
built_options
.
emplace
(
"-DDATA_TYPE="
+
DtToUpstreamCLDt
(
dt
));
built_options
.
emplace
(
"-DCMD_DATA_TYPE="
+
DtToUpstreamCLCMDDt
(
dt
));
if
(
is_
qualcomm_opencl200
)
{
if
(
is_
non_uniform_work_groups_supported_
)
{
built_options
.
emplace
(
"-DUSE_QUALCOMM_OPENCL_2_0"
);
}
switch
(
activation_
)
{
...
...
@@ -83,11 +83,12 @@ void ActivationFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
kernel_
.
setArg
(
idx
++
,
gws
[
2
]);
input_shape_
=
input
->
shape
();
kwg_size_
=
static_cast
<
uint32_t
>
(
runtime
->
GetKernelMaxWorkGroupSize
(
kernel_
));
}
const
uint32_t
kwg_size
=
static_cast
<
uint32_t
>
(
runtime
->
GetKernelMaxWorkGroupSize
(
kernel_
));
const
std
::
vector
<
uint32_t
>
lws
=
{
8
,
kwg_size
/
64
,
8
,
1
};
const
std
::
vector
<
uint32_t
>
lws
=
{
8
,
kwg_size_
/
64
,
8
,
1
};
std
::
string
tuning_key
=
Concat
(
tuning_key_prefix_
,
output
->
dim
(
0
),
output
->
dim
(
1
),
output
->
dim
(
2
),
output
->
dim
(
3
));
...
...
mace/kernels/opencl/addn.cc
浏览文件 @
ed267833
...
...
@@ -26,8 +26,6 @@ void AddNFunctor<DeviceType::OPENCL, T>::operator()(
auto
runtime
=
OpenCLRuntime
::
Global
();
const
bool
is_qualcomm_opencl200
=
IsQualcommOpenCL200
();
for
(
int
i
=
1
;
i
<
size
;
++
i
)
{
MACE_CHECK_NOTNULL
(
input_tensors
[
i
]);
MACE_CHECK
(
batch
==
input_tensors
[
i
]
->
dim
(
0
));
...
...
@@ -37,6 +35,8 @@ void AddNFunctor<DeviceType::OPENCL, T>::operator()(
}
if
(
kernel_
.
get
()
==
nullptr
)
{
is_non_uniform_work_groups_supported_
=
runtime
->
IsNonUniformWorkgroupsSupported
();
if
(
input_tensors
.
size
()
>
4
)
{
MACE_NOT_IMPLEMENTED
;
}
...
...
@@ -47,7 +47,7 @@ void AddNFunctor<DeviceType::OPENCL, T>::operator()(
built_options
.
emplace
(
"-DDATA_TYPE="
+
DtToUpstreamCLDt
(
dt
));
built_options
.
emplace
(
"-DCMD_DATA_TYPE="
+
DtToUpstreamCLCMDDt
(
dt
));
built_options
.
emplace
(
MakeString
(
"-DINPUT_NUM="
,
input_tensors
.
size
()));
if
(
is_
qualcomm_opencl200
)
{
if
(
is_
non_uniform_work_groups_supported_
)
{
built_options
.
emplace
(
"-DUSE_QUALCOMM_OPENCL_2_0"
);
}
...
...
@@ -78,11 +78,12 @@ void AddNFunctor<DeviceType::OPENCL, T>::operator()(
kernel_
.
setArg
(
idx
++
,
gws
[
1
]);
input_shape_
=
input_tensors
[
0
]
->
shape
();
}
const
uint32_t
kwg_size
=
kwg_size_
=
static_cast
<
uint32_t
>
(
runtime
->
GetKernelMaxWorkGroupSize
(
kernel_
));
const
std
::
vector
<
uint32_t
>
lws
=
{
kwg_size
/
16
,
16
,
1
};
}
const
std
::
vector
<
uint32_t
>
lws
=
{
kwg_size_
/
16
,
16
,
1
};
std
::
stringstream
ss
;
ss
<<
"addn_opencl_kernel_"
<<
output_shape
[
0
]
<<
"_"
<<
output_shape
[
1
]
<<
"_"
<<
output_shape
[
2
]
<<
"_"
<<
output_shape
[
3
];
...
...
mace/kernels/opencl/batch_norm_opencl.cc
浏览文件 @
ed267833
...
...
@@ -36,16 +36,17 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
auto
runtime
=
OpenCLRuntime
::
Global
();
const
bool
is_qualcomm_opencl200
=
IsQualcommOpenCL200
();
if
(
kernel_
.
get
()
==
nullptr
)
{
is_non_uniform_work_groups_supported_
=
runtime
->
IsNonUniformWorkgroupsSupported
();
std
::
set
<
std
::
string
>
built_options
;
auto
dt
=
DataTypeToEnum
<
T
>::
value
;
std
::
string
kernel_name
=
MACE_OBFUSCATE_SYMBOL
(
"batch_norm"
);
built_options
.
emplace
(
"-Dbatch_norm="
+
kernel_name
);
built_options
.
emplace
(
"-DDATA_TYPE="
+
DtToUpstreamCLDt
(
dt
));
built_options
.
emplace
(
"-DCMD_DATA_TYPE="
+
DtToUpstreamCLCMDDt
(
dt
));
if
(
is_
qualcomm_opencl200
)
{
if
(
is_
non_uniform_work_groups_supported_
)
{
built_options
.
emplace
(
"-DUSE_QUALCOMM_OPENCL_2_0"
);
}
if
(
folded_constant_
)
{
...
...
@@ -89,11 +90,12 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
kernel_
.
setArg
(
idx
++
,
gws
[
2
]);
input_shape_
=
input
->
shape
();
kwg_size_
=
static_cast
<
uint32_t
>
(
runtime
->
GetKernelMaxWorkGroupSize
(
kernel_
));
}
const
uint32_t
kwg_size
=
static_cast
<
uint32_t
>
(
runtime
->
GetKernelMaxWorkGroupSize
(
kernel_
));
const
std
::
vector
<
uint32_t
>
lws
=
{
8
,
kwg_size
/
64
,
8
,
1
};
const
std
::
vector
<
uint32_t
>
lws
=
{
8
,
kwg_size_
/
64
,
8
,
1
};
std
::
string
tuning_key
=
Concat
(
"batch_norm_opencl_kernel_"
,
activation_
,
output
->
dim
(
0
),
output
->
dim
(
1
),
output
->
dim
(
2
),
output
->
dim
(
3
),
folded_constant_
);
...
...
mace/kernels/opencl/bias_add_opencl.cc
浏览文件 @
ed267833
...
...
@@ -29,16 +29,16 @@ void BiasAddFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
auto
runtime
=
OpenCLRuntime
::
Global
();
const
bool
is_qualcomm_opencl200
=
IsQualcommOpenCL200
();
if
(
kernel_
.
get
()
==
nullptr
)
{
is_non_uniform_work_groups_supported_
=
runtime
->
IsNonUniformWorkgroupsSupported
();
std
::
set
<
std
::
string
>
built_options
;
auto
dt
=
DataTypeToEnum
<
T
>::
value
;
std
::
string
kernel_name
=
MACE_OBFUSCATE_SYMBOL
(
"bias_add"
);
built_options
.
emplace
(
"-Dbias_add="
+
kernel_name
);
built_options
.
emplace
(
"-DDATA_TYPE="
+
DtToUpstreamCLDt
(
dt
));
built_options
.
emplace
(
"-DCMD_DATA_TYPE="
+
DtToUpstreamCLCMDDt
(
dt
));
if
(
is_
qualcomm_opencl200
)
{
if
(
is_
non_uniform_work_groups_supported_
)
{
built_options
.
emplace
(
"-DUSE_QUALCOMM_OPENCL_2_0"
);
}
kernel_
=
runtime
->
BuildKernel
(
"bias_add"
,
kernel_name
,
built_options
);
...
...
@@ -52,15 +52,16 @@ void BiasAddFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
kernel_
.
setArg
(
idx
++
,
gws
[
1
]);
kernel_
.
setArg
(
idx
++
,
gws
[
2
]);
input_shape_
=
input
->
shape
();
kwg_size_
=
static_cast
<
uint32_t
>
(
runtime
->
GetKernelMaxWorkGroupSize
(
kernel_
));
}
const
uint32_t
kwg_size
=
static_cast
<
uint32_t
>
(
runtime
->
GetKernelMaxWorkGroupSize
(
kernel_
));
const
std
::
vector
<
uint32_t
>
lws
=
{
8
,
kwg_size
/
64
,
8
};
const
std
::
vector
<
uint32_t
>
lws
=
{
8
,
kwg_size_
/
64
,
8
};
cl
::
Event
event
;
cl_int
error
;
if
(
is_
qualcomm_opencl200
)
{
if
(
is_
non_uniform_work_groups_supported_
)
{
error
=
runtime
->
command_queue
().
enqueueNDRangeKernel
(
kernel_
,
cl
::
NullRange
,
cl
::
NDRange
(
gws
[
0
],
gws
[
1
],
gws
[
2
]),
cl
::
NDRange
(
lws
[
0
],
lws
[
1
],
lws
[
2
]),
nullptr
,
&
event
);
...
...
mace/kernels/opencl/buffer_to_image.cc
浏览文件 @
ed267833
...
...
@@ -62,14 +62,15 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(
auto
runtime
=
OpenCLRuntime
::
Global
();
const
bool
is_qualcomm_opencl200
=
IsQualcommOpenCL200
();
const
bool
is_non_uniform_work_groups_supported
=
runtime
->
IsNonUniformWorkgroupsSupported
();
std
::
string
obfuscated_kernel_name
=
MACE_OBFUSCATE_SYMBOL
(
kernel_name
);
std
::
set
<
std
::
string
>
built_options
;
std
::
stringstream
kernel_name_ss
;
kernel_name_ss
<<
"-D"
<<
kernel_name
<<
"="
<<
obfuscated_kernel_name
;
built_options
.
emplace
(
kernel_name_ss
.
str
());
if
(
is_
qualcomm_opencl200
)
{
if
(
is_
non_uniform_work_groups_supported
)
{
built_options
.
emplace
(
"-DUSE_QUALCOMM_OPENCL_2_0"
);
}
if
(
buffer
->
dtype
()
==
image
->
dtype
())
{
...
...
@@ -115,7 +116,7 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(
cl
::
Event
event
;
cl_int
error
;
if
(
is_
qualcomm_opencl200
)
{
if
(
is_
non_uniform_work_groups_supported
)
{
error
=
runtime
->
command_queue
().
enqueueNDRangeKernel
(
b2f_kernel
,
cl
::
NullRange
,
cl
::
NDRange
(
gws
[
0
],
gws
[
1
]),
cl
::
NDRange
(
lws
[
0
],
lws
[
1
]),
nullptr
,
&
event
);
...
...
mace/kernels/opencl/channel_shuffle.cc
浏览文件 @
ed267833
...
...
@@ -36,16 +36,16 @@ void ChannelShuffleFunctor<DeviceType::OPENCL, T>::operator()(
auto
runtime
=
OpenCLRuntime
::
Global
();
const
bool
is_qualcomm_opencl200
=
IsQualcommOpenCL200
();
if
(
kernel_
.
get
()
==
nullptr
)
{
is_non_uniform_work_groups_supported_
=
runtime
->
IsNonUniformWorkgroupsSupported
();
std
::
set
<
std
::
string
>
built_options
;
std
::
string
kernel_name
=
MACE_OBFUSCATE_SYMBOL
(
"channel_shuffle"
);
built_options
.
emplace
(
"-Dchannel_shuffle="
+
kernel_name
);
auto
dt
=
DataTypeToEnum
<
T
>::
value
;
built_options
.
emplace
(
"-DDATA_TYPE="
+
DtToUpstreamCLDt
(
dt
));
built_options
.
emplace
(
"-DCMD_DATA_TYPE="
+
DtToUpstreamCLCMDDt
(
dt
));
if
(
is_
qualcomm_opencl200
)
{
if
(
is_
non_uniform_work_groups_supported_
)
{
built_options
.
emplace
(
"-DUSE_QUALCOMM_OPENCL_2_0"
);
}
kernel_
=
runtime
->
BuildKernel
(
"channel_shuffle"
,
kernel_name
,
...
...
@@ -63,11 +63,12 @@ void ChannelShuffleFunctor<DeviceType::OPENCL, T>::operator()(
kernel_
.
setArg
(
idx
++
,
gws
[
2
]);
input_shape_
=
input
->
shape
();
kwg_size_
=
static_cast
<
uint32_t
>
(
runtime
->
GetKernelMaxWorkGroupSize
(
kernel_
));
}
const
uint32_t
kwg_size
=
static_cast
<
uint32_t
>
(
runtime
->
GetKernelMaxWorkGroupSize
(
kernel_
));
const
std
::
vector
<
uint32_t
>
lws
=
{
8
,
kwg_size
/
64
,
8
,
1
};
const
std
::
vector
<
uint32_t
>
lws
=
{
8
,
kwg_size_
/
64
,
8
,
1
};
std
::
stringstream
ss
;
ss
<<
"channel_shuffle_opencl_kernel_"
<<
output
->
dim
(
0
)
<<
"_"
...
...
mace/kernels/opencl/concat.cc
浏览文件 @
ed267833
...
...
@@ -17,7 +17,9 @@ static void Concat2(cl::Kernel *kernel,
const
DataType
dt
,
std
::
vector
<
index_t
>
*
prev_input_shape
,
Tensor
*
output
,
StatsFuture
*
future
)
{
StatsFuture
*
future
,
bool
*
is_non_uniform_work_groups_supported
,
uint32_t
*
kwg_size
)
{
const
index_t
batch
=
output
->
dim
(
0
);
const
index_t
height
=
output
->
dim
(
1
);
const
index_t
width
=
output
->
dim
(
2
);
...
...
@@ -31,13 +33,13 @@ static void Concat2(cl::Kernel *kernel,
auto
runtime
=
OpenCLRuntime
::
Global
();
const
bool
is_qualcomm_opencl200
=
IsQualcommOpenCL200
();
if
(
kernel
->
get
()
==
nullptr
)
{
*
is_non_uniform_work_groups_supported
=
runtime
->
IsNonUniformWorkgroupsSupported
();
std
::
set
<
std
::
string
>
built_options
;
std
::
string
kernel_name
=
MACE_OBFUSCATE_SYMBOL
(
"concat_channel"
);
built_options
.
emplace
(
"-Dconcat_channel="
+
kernel_name
);
if
(
is_qualcomm_opencl200
)
{
if
(
*
is_non_uniform_work_groups_supported
)
{
built_options
.
emplace
(
"-DUSE_QUALCOMM_OPENCL_2_0"
);
}
if
(
input0
->
dtype
()
==
output
->
dtype
())
{
...
...
@@ -66,11 +68,12 @@ static void Concat2(cl::Kernel *kernel,
kernel
->
setArg
(
idx
++
,
gws
[
2
]);
*
prev_input_shape
=
input0
->
shape
();
*
kwg_size
=
static_cast
<
uint32_t
>
(
runtime
->
GetKernelMaxWorkGroupSize
(
*
kernel
));
}
const
uint32_t
kwg_size
=
static_cast
<
uint32_t
>
(
runtime
->
GetKernelMaxWorkGroupSize
(
*
kernel
));
const
std
::
vector
<
uint32_t
>
lws
=
{
8
,
kwg_size
/
64
,
8
,
1
};
const
std
::
vector
<
uint32_t
>
lws
=
{
8
,
*
kwg_size
/
64
,
8
,
1
};
std
::
stringstream
ss
;
ss
<<
"concat_opencl_kernel_"
<<
output
->
dim
(
0
)
<<
"_"
<<
output
->
dim
(
1
)
<<
"_"
<<
output
->
dim
(
2
)
<<
"_"
<<
output
->
dim
(
3
);
...
...
@@ -81,7 +84,9 @@ static void ConcatN(cl::Kernel *kernel,
const
std
::
vector
<
const
Tensor
*>
&
input_list
,
const
DataType
dt
,
Tensor
*
output
,
StatsFuture
*
future
)
{
StatsFuture
*
future
,
bool
*
is_non_uniform_work_groups_supported
,
uint32_t
*
kwg_size
)
{
const
index_t
batch
=
output
->
dim
(
0
);
const
index_t
height
=
output
->
dim
(
1
);
const
index_t
width
=
output
->
dim
(
2
);
...
...
@@ -89,15 +94,15 @@ static void ConcatN(cl::Kernel *kernel,
auto
runtime
=
OpenCLRuntime
::
Global
();
const
bool
is_qualcomm_opencl200
=
IsQualcommOpenCL200
();
if
(
kernel
->
get
()
==
nullptr
)
{
*
is_non_uniform_work_groups_supported
=
runtime
->
IsNonUniformWorkgroupsSupported
();
std
::
set
<
std
::
string
>
built_options
;
std
::
string
kernel_name
=
MACE_OBFUSCATE_SYMBOL
(
"concat_channel_multi"
);
built_options
.
emplace
(
"-Dconcat_channel_multi="
+
kernel_name
);
built_options
.
emplace
(
"-DDATA_TYPE="
+
DtToCLDt
(
dt
));
built_options
.
emplace
(
"-DCMD_DATA_TYPE="
+
DtToCLCMDDt
(
dt
));
if
(
is_qualcomm_opencl200
)
{
if
(
*
is_non_uniform_work_groups_supported
)
{
built_options
.
emplace
(
"-DUSE_QUALCOMM_OPENCL_2_0"
);
}
*
kernel
=
runtime
->
BuildKernel
(
"concat"
,
kernel_name
,
built_options
);
...
...
@@ -122,9 +127,9 @@ static void ConcatN(cl::Kernel *kernel,
kernel
->
setArg
(
idx
++
,
gws
[
2
]);
chan_blk_offset
+=
input_channel_blk
;
const
uint32_t
kwg_size
=
*
kwg_size
=
static_cast
<
uint32_t
>
(
runtime
->
GetKernelMaxWorkGroupSize
(
*
kernel
));
const
std
::
vector
<
uint32_t
>
lws
=
{
8
,
kwg_size
/
64
,
8
,
1
};
const
std
::
vector
<
uint32_t
>
lws
=
{
8
,
*
kwg_size
/
64
,
8
,
1
};
std
::
stringstream
ss
;
ss
<<
"concat_n_opencl_kernel_"
<<
input_channel_blk
<<
"_"
<<
width
<<
"_"
<<
batch
*
height
;
...
...
@@ -169,11 +174,13 @@ void ConcatFunctor<DeviceType::OPENCL, T>::operator()(
switch
(
inputs_count
)
{
case
2
:
Concat2
(
&
kernel_
,
input_list
[
0
],
input_list
[
1
],
DataTypeToEnum
<
T
>::
value
,
&
input_shape_
,
output
,
future
);
&
input_shape_
,
output
,
future
,
&
is_non_uniform_work_groups_supported_
,
&
kwg_size_
);
break
;
default:
if
(
divisible_four
)
{
ConcatN
(
&
kernel_
,
input_list
,
DataTypeToEnum
<
T
>::
value
,
output
,
future
);
ConcatN
(
&
kernel_
,
input_list
,
DataTypeToEnum
<
T
>::
value
,
output
,
future
,
&
is_non_uniform_work_groups_supported_
,
&
kwg_size_
);
}
else
{
MACE_NOT_IMPLEMENTED
;
}
...
...
mace/kernels/opencl/conv_2d_opencl.cc
浏览文件 @
ed267833
...
...
@@ -20,7 +20,9 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
const
DataType
dt
,
std
::
vector
<
index_t
>
*
prev_input_shape
,
Tensor
*
output
,
StatsFuture
*
future
);
StatsFuture
*
future
,
bool
*
is_non_uniform_work_groups_supported
,
uint32_t
*
kwg_size
);
extern
void
Conv2dOpenclK3x3
(
cl
::
Kernel
*
kernel
,
const
Tensor
*
input
,
...
...
@@ -34,7 +36,9 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
const
DataType
dt
,
std
::
vector
<
index_t
>
*
prev_input_shape
,
Tensor
*
output
,
StatsFuture
*
future
);
StatsFuture
*
future
,
bool
*
is_non_uniform_work_groups_supported
,
uint32_t
*
kwg_size
);
extern
void
Conv2dOpencl
(
cl
::
Kernel
*
kernel
,
const
Tensor
*
input
,
...
...
@@ -48,7 +52,9 @@ extern void Conv2dOpencl(cl::Kernel *kernel,
const
DataType
dt
,
std
::
vector
<
index_t
>
*
prev_input_shape
,
Tensor
*
output
,
StatsFuture
*
future
);
StatsFuture
*
future
,
bool
*
is_non_uniform_work_groups_supported
,
uint32_t
*
kwg_size
);
template
<
typename
T
>
void
Conv2dFunctor
<
DeviceType
::
OPENCL
,
T
>::
operator
()(
const
Tensor
*
input
,
...
...
@@ -61,7 +67,8 @@ void Conv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
const
Tensor
*
bias
,
const
int
stride
,
const
int
*
padding
,
const
int
*
dilations
,
const
ActivationType
activation
,
const
float
relux_max_limit
,
const
DataType
dt
,
std
::
vector
<
index_t
>
*
input_shape
,
Tensor
*
output
,
StatsFuture
*
future
);
std
::
vector
<
index_t
>
*
input_shape
,
Tensor
*
output
,
StatsFuture
*
future
,
bool
*
is_non_uniform_work_groups_supported
,
uint32_t
*
kwg_size
);
// Selection matrix: kernel_size x stride_size
static
const
Conv2dOpenclFunction
selector
[
5
]
=
{
Conv2dOpenclK1x1
,
nullptr
,
Conv2dOpenclK3x3
,
nullptr
,
nullptr
};
...
...
@@ -101,11 +108,13 @@ void Conv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
auto
conv2d_func
=
selector
[
kernel_h
-
1
];
conv2d_func
(
&
kernel_
,
input
,
filter
,
bias
,
strides_
[
0
],
paddings
.
data
(),
dilations_
,
activation_
,
relux_max_limit_
,
DataTypeToEnum
<
T
>::
value
,
&
input_shape_
,
output
,
future
);
DataTypeToEnum
<
T
>::
value
,
&
input_shape_
,
output
,
future
,
&
is_non_uniform_work_groups_supported_
,
&
kwg_size_
);
}
else
{
Conv2dOpencl
(
&
kernel_
,
input
,
filter
,
bias
,
strides_
[
0
],
paddings
.
data
(),
dilations_
,
activation_
,
relux_max_limit_
,
DataTypeToEnum
<
T
>::
value
,
&
input_shape_
,
output
,
future
);
DataTypeToEnum
<
T
>::
value
,
&
input_shape_
,
output
,
future
,
&
is_non_uniform_work_groups_supported_
,
&
kwg_size_
);
}
}
...
...
mace/kernels/opencl/conv_2d_opencl_1x1.cc
浏览文件 @
ed267833
...
...
@@ -22,7 +22,9 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
const
DataType
dt
,
std
::
vector
<
index_t
>
*
prev_input_shape
,
Tensor
*
output
,
StatsFuture
*
future
)
{
StatsFuture
*
future
,
bool
*
is_non_uniform_work_groups_supported
,
uint32_t
*
kwg_size
)
{
const
index_t
batch
=
output
->
dim
(
0
);
const
index_t
height
=
output
->
dim
(
1
);
const
index_t
width
=
output
->
dim
(
2
);
...
...
@@ -38,9 +40,9 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
auto
runtime
=
OpenCLRuntime
::
Global
();
const
bool
is_qualcomm_opencl200
=
IsQualcommOpenCL200
();
if
(
kernel
->
get
()
==
nullptr
)
{
*
is_non_uniform_work_groups_supported
=
runtime
->
IsNonUniformWorkgroupsSupported
();
MACE_CHECK
(
input_batch
==
batch
);
std
::
set
<
std
::
string
>
built_options
;
...
...
@@ -48,7 +50,7 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
built_options
.
emplace
(
"-Dconv_2d_1x1="
+
kernel_name
);
built_options
.
emplace
(
"-DDATA_TYPE="
+
DtToUpstreamCLDt
(
dt
));
built_options
.
emplace
(
"-DCMD_DATA_TYPE="
+
DtToUpstreamCLCMDDt
(
dt
));
if
(
is_qualcomm_opencl200
)
{
if
(
*
is_non_uniform_work_groups_supported
)
{
built_options
.
emplace
(
"-DUSE_QUALCOMM_OPENCL_2_0"
);
}
if
(
bias
!=
nullptr
)
{
...
...
@@ -101,11 +103,12 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
kernel
->
setArg
(
idx
++
,
gws
[
2
]);
*
prev_input_shape
=
input
->
shape
();
*
kwg_size
=
static_cast
<
uint32_t
>
(
runtime
->
GetKernelMaxWorkGroupSize
(
*
kernel
));
}
const
uint32_t
kwg_size
=
static_cast
<
uint32_t
>
(
runtime
->
GetKernelMaxWorkGroupSize
(
*
kernel
));
const
std
::
vector
<
uint32_t
>
lws
=
{
8
,
kwg_size
/
64
,
8
,
1
};
const
std
::
vector
<
uint32_t
>
lws
=
{
8
,
*
kwg_size
/
64
,
8
,
1
};
std
::
string
tuning_key
=
Concat
(
"conv2d_1x1_opencl_kernel_"
,
activation
,
output
->
dim
(
0
),
output
->
dim
(
1
),
output
->
dim
(
2
),
output
->
dim
(
3
));
...
...
mace/kernels/opencl/conv_2d_opencl_3x3.cc
浏览文件 @
ed267833
...
...
@@ -24,7 +24,9 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
const
DataType
dt
,
std
::
vector
<
index_t
>
*
prev_input_shape
,
Tensor
*
output
,
StatsFuture
*
future
)
{
StatsFuture
*
future
,
bool
*
is_non_uniform_work_groups_supported
,
uint32_t
*
kwg_size
)
{
const
index_t
batch
=
output
->
dim
(
0
);
const
index_t
height
=
output
->
dim
(
1
);
const
index_t
width
=
output
->
dim
(
2
);
...
...
@@ -37,15 +39,15 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
auto
runtime
=
OpenCLRuntime
::
Global
();
const
bool
is_qualcomm_opencl200
=
IsQualcommOpenCL200
();
if
(
kernel
->
get
()
==
nullptr
)
{
*
is_non_uniform_work_groups_supported
=
runtime
->
IsNonUniformWorkgroupsSupported
();
std
::
set
<
std
::
string
>
built_options
;
std
::
string
kernel_name
=
MACE_OBFUSCATE_SYMBOL
(
"conv_2d_3x3"
);
built_options
.
emplace
(
"-Dconv_2d_3x3="
+
kernel_name
);
built_options
.
emplace
(
"-DDATA_TYPE="
+
DtToUpstreamCLDt
(
dt
));
built_options
.
emplace
(
"-DCMD_DATA_TYPE="
+
DtToUpstreamCLCMDDt
(
dt
));
if
(
is_qualcomm_opencl200
)
{
if
(
*
is_non_uniform_work_groups_supported
)
{
built_options
.
emplace
(
"-DUSE_QUALCOMM_OPENCL_2_0"
);
}
built_options
.
emplace
(
bias
!=
nullptr
?
"-DBIAS"
:
""
);
...
...
@@ -99,11 +101,12 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
kernel
->
setArg
(
idx
++
,
gws
[
2
]);
*
prev_input_shape
=
input
->
shape
();
*
kwg_size
=
static_cast
<
uint32_t
>
(
runtime
->
GetKernelMaxWorkGroupSize
(
*
kernel
));
}
const
uint32_t
kwg_size
=
static_cast
<
uint32_t
>
(
runtime
->
GetKernelMaxWorkGroupSize
(
*
kernel
));
const
std
::
vector
<
uint32_t
>
lws
=
{
4
,
kwg_size
/
32
,
8
,
1
};
const
std
::
vector
<
uint32_t
>
lws
=
{
4
,
*
kwg_size
/
32
,
8
,
1
};
std
::
string
tuning_key
=
Concat
(
"conv2d_3x3_opencl_kernel_"
,
activation
,
output
->
dim
(
0
),
output
->
dim
(
1
),
output
->
dim
(
2
),
output
->
dim
(
3
));
...
...
mace/kernels/opencl/conv_2d_opencl_general.cc
浏览文件 @
ed267833
...
...
@@ -24,7 +24,9 @@ extern void Conv2dOpencl(cl::Kernel *kernel,
const
DataType
dt
,
std
::
vector
<
index_t
>
*
prev_input_shape
,
Tensor
*
output
,
StatsFuture
*
future
)
{
StatsFuture
*
future
,
bool
*
is_non_uniform_work_groups_supported
,
uint32_t
*
kwg_size
)
{
const
index_t
batch
=
output
->
dim
(
0
);
const
index_t
height
=
output
->
dim
(
1
);
const
index_t
width
=
output
->
dim
(
2
);
...
...
@@ -37,15 +39,15 @@ extern void Conv2dOpencl(cl::Kernel *kernel,
auto
runtime
=
OpenCLRuntime
::
Global
();
const
bool
is_qualcomm_opencl200
=
IsQualcommOpenCL200
();
if
(
kernel
->
get
()
==
nullptr
)
{
*
is_non_uniform_work_groups_supported
=
runtime
->
IsNonUniformWorkgroupsSupported
();
std
::
set
<
std
::
string
>
built_options
;
std
::
string
kernel_name
=
MACE_OBFUSCATE_SYMBOL
(
"conv_2d"
);
built_options
.
emplace
(
"-Dconv_2d="
+
kernel_name
);
built_options
.
emplace
(
"-DDATA_TYPE="
+
DtToUpstreamCLDt
(
dt
));
built_options
.
emplace
(
"-DCMD_DATA_TYPE="
+
DtToUpstreamCLCMDDt
(
dt
));
if
(
is_qualcomm_opencl200
)
{
if
(
*
is_non_uniform_work_groups_supported
)
{
built_options
.
emplace
(
"-DUSE_QUALCOMM_OPENCL_2_0"
);
}
built_options
.
emplace
(
bias
!=
nullptr
?
"-DBIAS"
:
""
);
...
...
@@ -101,11 +103,12 @@ extern void Conv2dOpencl(cl::Kernel *kernel,
kernel
->
setArg
(
idx
++
,
gws
[
2
]);
*
prev_input_shape
=
input
->
shape
();
*
kwg_size
=
static_cast
<
uint32_t
>
(
runtime
->
GetKernelMaxWorkGroupSize
(
*
kernel
));
}
const
uint32_t
kwg_size
=
static_cast
<
uint32_t
>
(
runtime
->
GetKernelMaxWorkGroupSize
(
*
kernel
));
const
std
::
vector
<
uint32_t
>
lws
=
{
8
,
kwg_size
/
64
,
8
,
1
};
const
std
::
vector
<
uint32_t
>
lws
=
{
8
,
*
kwg_size
/
64
,
8
,
1
};
std
::
string
tuning_key
=
Concat
(
"conv2d_general_opencl_kernel_"
,
activation
,
output
->
dim
(
0
),
output
->
dim
(
1
),
output
->
dim
(
2
),
output
->
dim
(
3
));
...
...
mace/kernels/opencl/depth_to_space_opencl.cc
浏览文件 @
ed267833
...
...
@@ -47,9 +47,9 @@ void DepthToSpaceOpFunctor<DeviceType::OPENCL, T>::operator()(
auto
runtime
=
OpenCLRuntime
::
Global
();
const
bool
is_qualcomm_opencl200
=
IsQualcommOpenCL200
();
if
(
kernel_
.
get
()
==
nullptr
)
{
is_non_uniform_work_groups_supported_
=
runtime
->
IsNonUniformWorkgroupsSupported
();
std
::
set
<
std
::
string
>
built_options
;
std
::
string
obfuscated_kernel_name
=
MACE_OBFUSCATE_SYMBOL
(
kernel_name
);
std
::
stringstream
kernel_name_ss
;
...
...
@@ -58,7 +58,7 @@ void DepthToSpaceOpFunctor<DeviceType::OPENCL, T>::operator()(
auto
dt
=
DataTypeToEnum
<
T
>::
value
;
built_options
.
emplace
(
"-DDATA_TYPE="
+
DtToUpstreamCLDt
(
dt
));
built_options
.
emplace
(
"-DCMD_DATA_TYPE="
+
DtToUpstreamCLCMDDt
(
dt
));
if
(
is_
qualcomm_opencl200
)
{
if
(
is_
non_uniform_work_groups_supported_
)
{
built_options
.
emplace
(
"-DUSE_QUALCOMM_OPENCL_2_0"
);
}
kernel_
=
...
...
@@ -93,11 +93,12 @@ void DepthToSpaceOpFunctor<DeviceType::OPENCL, T>::operator()(
kernel_
.
setArg
(
idx
++
,
gws
[
2
]);
input_shape_
=
input
->
shape
();
kwg_size_
=
static_cast
<
uint32_t
>
(
runtime
->
GetKernelMaxWorkGroupSize
(
kernel_
));
}
const
uint32_t
kwg_size
=
static_cast
<
uint32_t
>
(
runtime
->
GetKernelMaxWorkGroupSize
(
kernel_
));
const
std
::
vector
<
uint32_t
>
lws
=
{
8
,
kwg_size
/
64
,
8
,
1
};
const
std
::
vector
<
uint32_t
>
lws
=
{
8
,
kwg_size_
/
64
,
8
,
1
};
TuningOrRun3DKernel
(
kernel_
,
ss
.
str
(),
gws
,
lws
,
future
);
}
...
...
mace/kernels/opencl/depthwise_conv_opencl.cc
浏览文件 @
ed267833
...
...
@@ -23,7 +23,9 @@ void DepthwiseConv2d(cl::Kernel *kernel,
const
DataType
dt
,
std
::
vector
<
index_t
>
*
prev_input_shape
,
Tensor
*
output
,
StatsFuture
*
future
)
{
StatsFuture
*
future
,
bool
*
is_non_uniform_work_groups_supported
,
uint32_t
*
kwg_size
)
{
const
index_t
batch
=
output
->
dim
(
0
);
const
index_t
height
=
output
->
dim
(
1
);
const
index_t
width
=
output
->
dim
(
2
);
...
...
@@ -42,9 +44,9 @@ void DepthwiseConv2d(cl::Kernel *kernel,
auto
runtime
=
OpenCLRuntime
::
Global
();
const
bool
is_qualcomm_opencl200
=
IsQualcommOpenCL200
();
if
(
kernel
->
get
()
==
nullptr
)
{
*
is_non_uniform_work_groups_supported
=
runtime
->
IsNonUniformWorkgroupsSupported
();
std
::
set
<
std
::
string
>
built_options
;
std
::
string
kernel_name
=
MACE_OBFUSCATE_SYMBOL
(
"depthwise_conv2d"
);
if
(
stride
==
1
&&
dilations
[
0
]
==
1
&&
dilations
[
1
]
==
1
)
{
...
...
@@ -53,7 +55,7 @@ void DepthwiseConv2d(cl::Kernel *kernel,
}
else
{
built_options
.
emplace
(
"-Ddepthwise_conv2d="
+
kernel_name
);
}
if
(
is_qualcomm_opencl200
)
{
if
(
*
is_non_uniform_work_groups_supported
)
{
built_options
.
emplace
(
"-DUSE_QUALCOMM_OPENCL_2_0"
);
}
built_options
.
emplace
(
"-DDATA_TYPE="
+
DtToUpstreamCLDt
(
dt
));
...
...
@@ -118,12 +120,14 @@ void DepthwiseConv2d(cl::Kernel *kernel,
kernel
->
setArg
(
idx
++
,
gws
[
0
]);
kernel
->
setArg
(
idx
++
,
gws
[
1
]);
kernel
->
setArg
(
idx
++
,
gws
[
2
]);
*
prev_input_shape
=
input
->
shape
();
*
kwg_size
=
static_cast
<
uint32_t
>
(
runtime
->
GetKernelMaxWorkGroupSize
(
*
kernel
));
}
const
uint32_t
kwg_size
=
static_cast
<
uint32_t
>
(
runtime
->
GetKernelMaxWorkGroupSize
(
*
kernel
));
const
std
::
vector
<
uint32_t
>
lws
=
{
8
,
kwg_size
/
64
,
8
,
1
};
const
std
::
vector
<
uint32_t
>
lws
=
{
8
,
*
kwg_size
/
64
,
8
,
1
};
std
::
string
tuning_key
=
Concat
(
"depthwise_conv2d_ocl_kernel_"
,
activation
,
batch
,
height
,
width
,
channels
,
multiplier
);
TuningOrRun3DKernel
(
*
kernel
,
tuning_key
,
gws
,
lws
,
future
);
...
...
@@ -178,7 +182,8 @@ void DepthwiseConv2dFunctor<DeviceType::OPENCL, T>::operator()(
DepthwiseConv2d
(
&
kernel_
,
input
,
filter
,
bias
,
strides_
[
0
],
paddings
.
data
(),
dilations_
,
activation_
,
relux_max_limit_
,
DataTypeToEnum
<
T
>::
value
,
&
input_shape_
,
output
,
future
);
DataTypeToEnum
<
T
>::
value
,
&
input_shape_
,
output
,
future
,
&
is_non_uniform_work_groups_supported_
,
&
kwg_size_
);
}
template
struct
DepthwiseConv2dFunctor
<
DeviceType
::
OPENCL
,
float
>;
...
...
mace/kernels/opencl/eltwise_opencl.cc
浏览文件 @
ed267833
...
...
@@ -29,9 +29,9 @@ void EltwiseFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input0,
auto
runtime
=
OpenCLRuntime
::
Global
();
const
bool
is_qualcomm_opencl200
=
IsQualcommOpenCL200
();
if
(
kernel_
.
get
()
==
nullptr
)
{
is_non_uniform_work_groups_supported_
=
runtime
->
IsNonUniformWorkgroupsSupported
();
std
::
set
<
std
::
string
>
built_options
;
auto
dt
=
DataTypeToEnum
<
T
>::
value
;
std
::
string
kernel_name
=
MACE_OBFUSCATE_SYMBOL
(
"eltwise"
);
...
...
@@ -39,7 +39,7 @@ void EltwiseFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input0,
built_options
.
emplace
(
"-DDATA_TYPE="
+
DtToUpstreamCLDt
(
dt
));
built_options
.
emplace
(
"-DCMD_DATA_TYPE="
+
DtToUpstreamCLCMDDt
(
dt
));
built_options
.
emplace
(
MakeString
(
"-DELTWISE_TYPE="
,
type_
));
if
(
is_
qualcomm_opencl200
)
{
if
(
is_
non_uniform_work_groups_supported_
)
{
built_options
.
emplace
(
"-DUSE_QUALCOMM_OPENCL_2_0"
);
}
if
(
!
coeff_
.
empty
())
built_options
.
emplace
(
"-DCOEFF_SUM"
);
...
...
@@ -56,12 +56,14 @@ void EltwiseFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input0,
kernel_
.
setArg
(
idx
++
,
*
(
output
->
opencl_image
()));
kernel_
.
setArg
(
idx
++
,
gws
[
0
]);
kernel_
.
setArg
(
idx
++
,
gws
[
1
]);
input_shape_
=
input0
->
shape
();
kwg_size_
=
static_cast
<
uint32_t
>
(
runtime
->
GetKernelMaxWorkGroupSize
(
kernel_
));
}
const
uint32_t
kwg_size
=
static_cast
<
uint32_t
>
(
runtime
->
GetKernelMaxWorkGroupSize
(
kernel_
));
const
std
::
vector
<
uint32_t
>
lws
=
{
kwg_size
/
16
,
16
,
1
};
const
std
::
vector
<
uint32_t
>
lws
=
{
kwg_size_
/
16
,
16
,
1
};
std
::
stringstream
ss
;
ss
<<
"eltwise_opencl_kernel_"
<<
output
->
dim
(
0
)
<<
"_"
<<
output
->
dim
(
1
)
<<
"_"
<<
output
->
dim
(
2
)
<<
"_"
<<
output
->
dim
(
3
);
...
...
mace/kernels/opencl/helper.cc
浏览文件 @
ed267833
...
...
@@ -194,24 +194,14 @@ std::string DtToUpstreamCLCMDDt(const DataType dt) {
}
}
const
bool
IsQualcommOpenCL200
()
{
auto
runtime
=
OpenCLRuntime
::
Global
();
if
(
runtime
->
GetGPUType
()
==
GPU_TYPE
::
QUALCOMM_ADRENO
&&
runtime
->
GetOpenclVersion
()
==
"2.0"
)
{
return
true
;
}
else
{
return
false
;
}
}
void
TuningOrRun3DKernel
(
const
cl
::
Kernel
&
kernel
,
const
std
::
string
tuning_key
,
const
uint32_t
*
gws
,
const
std
::
vector
<
uint32_t
>
&
lws
,
StatsFuture
*
future
)
{
auto
runtime
=
OpenCLRuntime
::
Global
();
const
bool
is_qualcomm_opencl200
=
IsQualcommOpenCL200
();
const
bool
is_non_uniform_work_groups_supported
=
runtime
->
IsNonUniformWorkgroupsSupported
();
auto
params_generator
=
[
&
]()
->
std
::
vector
<
std
::
vector
<
uint32_t
>>
{
const
uint32_t
kwg_size
=
...
...
@@ -249,7 +239,7 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel,
<<
"Tuning parameters of 3D kernel must be 4D"
;
cl_int
error
=
CL_SUCCESS
;
std
::
vector
<
uint32_t
>
roundup_gws
(
3
);
if
(
!
is_
qualcomm_opencl200
)
{
if
(
!
is_
non_uniform_work_groups_supported
)
{
for
(
size_t
i
=
0
;
i
<
3
;
++
i
)
{
roundup_gws
[
i
]
=
RoundUp
(
gws
[
i
],
params
[
i
]);
}
...
...
@@ -262,7 +252,7 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel,
for
(
uint32_t
i
=
0
;
i
<
num_blocks
;
++
i
)
{
uint32_t
gws2
=
(
i
==
num_blocks
-
1
)
?
(
gws
[
2
]
-
(
i
*
block_size
))
:
block_size
;
if
(
is_
qualcomm_opencl200
)
{
if
(
is_
non_uniform_work_groups_supported
)
{
error
=
runtime
->
command_queue
().
enqueueNDRangeKernel
(
kernel
,
cl
::
NDRange
(
0
,
0
,
i
*
block_size
),
cl
::
NDRange
(
gws
[
0
],
gws
[
1
],
gws2
),
...
...
@@ -278,7 +268,7 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel,
}
}
else
{
timer
->
ClearTiming
();
if
(
is_
qualcomm_opencl200
)
{
if
(
is_
non_uniform_work_groups_supported
)
{
error
=
runtime
->
command_queue
().
enqueueNDRangeKernel
(
kernel
,
cl
::
NullRange
,
cl
::
NDRange
(
gws
[
0
],
gws
[
1
],
gws
[
2
]),
cl
::
NDRange
(
params
[
0
],
params
[
1
],
params
[
2
]),
nullptr
,
&
event
);
...
...
@@ -303,7 +293,7 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel,
for
(
uint32_t
i
=
0
;
i
<
num_blocks
;
++
i
)
{
uint32_t
gws2
=
(
i
==
num_blocks
-
1
)
?
(
gws
[
2
]
-
(
i
*
block_size
))
:
block_size
;
if
(
is_
qualcomm_opencl200
)
{
if
(
is_
non_uniform_work_groups_supported
)
{
error
=
runtime
->
command_queue
().
enqueueNDRangeKernel
(
kernel
,
cl
::
NDRange
(
0
,
0
,
i
*
block_size
),
cl
::
NDRange
(
gws
[
0
],
gws
[
1
],
gws2
),
...
...
@@ -342,7 +332,8 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel,
const
std
::
vector
<
uint32_t
>
&
lws
,
StatsFuture
*
future
)
{
auto
runtime
=
OpenCLRuntime
::
Global
();
const
bool
is_qualcomm_opencl200
=
IsQualcommOpenCL200
();
const
bool
is_non_uniform_work_groups_supported
=
runtime
->
IsNonUniformWorkgroupsSupported
();
auto
params_generator
=
[
&
]()
->
std
::
vector
<
std
::
vector
<
uint32_t
>>
{
const
uint32_t
kwg_size
=
...
...
@@ -368,7 +359,7 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel,
<<
"Tuning parameters of 2D kernel must be 3d"
;
cl_int
error
=
CL_SUCCESS
;
std
::
vector
<
uint32_t
>
roundup_gws
(
2
);
if
(
!
is_
qualcomm_opencl200
)
{
if
(
!
is_
non_uniform_work_groups_supported
)
{
for
(
size_t
i
=
0
;
i
<
2
;
++
i
)
{
roundup_gws
[
i
]
=
RoundUp
(
gws
[
i
],
params
[
i
]);
}
...
...
@@ -381,7 +372,7 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel,
for
(
uint32_t
i
=
0
;
i
<
num_blocks
;
++
i
)
{
uint32_t
gws1
=
(
i
==
num_blocks
-
1
)
?
(
gws
[
1
]
-
(
i
*
block_size
))
:
block_size
;
if
(
is_
qualcomm_opencl200
)
{
if
(
is_
non_uniform_work_groups_supported
)
{
error
=
runtime
->
command_queue
().
enqueueNDRangeKernel
(
kernel
,
cl
::
NDRange
(
0
,
i
*
block_size
),
cl
::
NDRange
(
gws
[
0
],
gws1
),
cl
::
NDRange
(
params
[
0
],
params
[
1
]),
nullptr
,
&
event
);
...
...
@@ -396,7 +387,7 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel,
}
}
else
{
timer
->
ClearTiming
();
if
(
is_
qualcomm_opencl200
)
{
if
(
is_
non_uniform_work_groups_supported
)
{
error
=
runtime
->
command_queue
().
enqueueNDRangeKernel
(
kernel
,
cl
::
NullRange
,
cl
::
NDRange
(
gws
[
0
],
gws
[
1
]),
cl
::
NDRange
(
params
[
0
],
params
[
1
]),
nullptr
,
&
event
);
...
...
@@ -420,7 +411,7 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel,
for
(
uint32_t
i
=
0
;
i
<
num_blocks
;
++
i
)
{
uint32_t
gws1
=
(
i
==
num_blocks
-
1
)
?
(
gws
[
1
]
-
(
i
*
block_size
))
:
block_size
;
if
(
is_
qualcomm_opencl200
)
{
if
(
is_
non_uniform_work_groups_supported
)
{
error
=
runtime
->
command_queue
().
enqueueNDRangeKernel
(
kernel
,
cl
::
NDRange
(
0
,
i
*
block_size
),
cl
::
NDRange
(
gws
[
0
],
gws1
),
cl
::
NDRange
(
params
[
0
],
params
[
1
]),
...
...
mace/kernels/opencl/helper.h
浏览文件 @
ed267833
...
...
@@ -102,8 +102,6 @@ std::string Concat(Args... args) {
return
ss
.
str
();
}
const
bool
IsQualcommOpenCL200
();
}
// namespace kernels
}
// namespace mace
#endif // MACE_KERNELS_OPENCL_HELPER_H_
mace/kernels/opencl/matmul.cc
浏览文件 @
ed267833
...
...
@@ -33,16 +33,16 @@ void MatMulFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *A,
auto
runtime
=
OpenCLRuntime
::
Global
();
const
bool
is_qualcomm_opencl200
=
IsQualcommOpenCL200
();
if
(
kernel_
.
get
()
==
nullptr
)
{
is_non_uniform_work_groups_supported_
=
runtime
->
IsNonUniformWorkgroupsSupported
();
std
::
set
<
std
::
string
>
built_options
;
auto
dt
=
DataTypeToEnum
<
T
>::
value
;
std
::
string
kernel_name
=
MACE_OBFUSCATE_SYMBOL
(
"matmul"
);
built_options
.
emplace
(
"-Dmatmul="
+
kernel_name
);
built_options
.
emplace
(
"-DDATA_TYPE="
+
DtToUpstreamCLDt
(
dt
));
built_options
.
emplace
(
"-DCMD_DATA_TYPE="
+
DtToUpstreamCLCMDDt
(
dt
));
if
(
is_
qualcomm_opencl200
)
{
if
(
is_
non_uniform_work_groups_supported_
)
{
built_options
.
emplace
(
"-DUSE_QUALCOMM_OPENCL_2_0"
);
}
kernel_
=
runtime
->
BuildKernel
(
"matmul"
,
kernel_name
,
built_options
);
...
...
@@ -59,9 +59,9 @@ void MatMulFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *A,
kernel_
.
setArg
(
idx
++
,
gws
[
0
]);
kernel_
.
setArg
(
idx
++
,
gws
[
1
]);
const
uint32_t
kwg_size
=
kwg_size_
=
static_cast
<
uint32_t
>
(
runtime
->
GetKernelMaxWorkGroupSize
(
kernel_
));
const
std
::
vector
<
uint32_t
>
lws
=
{
kwg_size
/
64
,
64
,
1
};
const
std
::
vector
<
uint32_t
>
lws
=
{
kwg_size
_
/
64
,
64
,
1
};
std
::
stringstream
ss
;
ss
<<
"matmul_opencl_kernel_"
<<
C
->
dim
(
0
)
<<
"_"
<<
C
->
dim
(
1
)
<<
"_"
<<
C
->
dim
(
2
)
<<
"_"
<<
C
->
dim
(
3
);
...
...
mace/kernels/opencl/pooling_opencl.cc
浏览文件 @
ed267833
...
...
@@ -20,9 +20,9 @@ void PoolingFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
auto
runtime
=
OpenCLRuntime
::
Global
();
const
bool
is_qualcomm_opencl200
=
IsQualcommOpenCL200
();
if
(
kernel_
.
get
()
==
nullptr
)
{
is_non_uniform_work_groups_supported_
=
runtime
->
IsNonUniformWorkgroupsSupported
();
const
DataType
dt
=
DataTypeToEnum
<
T
>::
value
;
std
::
set
<
std
::
string
>
built_options
;
std
::
string
kernel_name
=
MACE_OBFUSCATE_SYMBOL
(
"pooling"
);
...
...
@@ -39,13 +39,13 @@ void PoolingFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
if
(
pooling_type_
==
AVG
)
{
built_options
.
emplace
(
"-DPOOL_AVG"
);
}
if
(
is_
qualcomm_opencl200
)
{
if
(
is_
non_uniform_work_groups_supported_
)
{
built_options
.
emplace
(
"-DUSE_QUALCOMM_OPENCL_2_0"
);
}
kernel_
=
runtime
->
BuildKernel
(
"pooling"
,
kernel_name
,
built_options
);
}
uint32_t
gws
[
3
]
;
std
::
vector
<
uint32_t
>
gws
;
if
(
!
IsVecEqual
(
input_shape_
,
input
->
shape
()))
{
std
::
vector
<
index_t
>
output_shape
(
4
);
std
::
vector
<
index_t
>
filter_shape
=
{
kernels_
[
0
],
kernels_
[
1
],
...
...
@@ -75,9 +75,10 @@ void PoolingFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
index_t
channel_blocks
=
(
channels
+
3
)
/
4
;
gws
[
0
]
=
static_cast
<
uint32_t
>
(
channel_blocks
);
gws
[
1
]
=
static_cast
<
uint32_t
>
(
out_width
);
gws
[
2
]
=
static_cast
<
uint32_t
>
(
batch
*
out_height
);
gws
=
{
static_cast
<
uint32_t
>
(
channel_blocks
),
static_cast
<
uint32_t
>
(
out_width
),
static_cast
<
uint32_t
>
(
batch
*
out_height
),
};
uint32_t
idx
=
0
;
kernel_
.
setArg
(
idx
++
,
*
(
input
->
opencl_image
()));
...
...
@@ -94,26 +95,16 @@ void PoolingFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
kernel_
.
setArg
(
idx
++
,
gws
[
2
]);
input_shape_
=
input
->
shape
();
}
else
{
index_t
batch
=
output
->
dim
(
0
);
index_t
out_height
=
output
->
dim
(
1
);
index_t
out_width
=
output
->
dim
(
2
);
index_t
channels
=
output
->
dim
(
3
);
index_t
channel_blocks
=
(
channels
+
3
)
/
4
;
gws
[
0
]
=
static_cast
<
uint32_t
>
(
channel_blocks
);
gws
[
1
]
=
static_cast
<
uint32_t
>
(
out_width
);
gws
[
2
]
=
static_cast
<
uint32_t
>
(
batch
*
out_height
);
kwg_size_
=
static_cast
<
uint32_t
>
(
runtime
->
GetKernelMaxWorkGroupSize
(
kernel_
));
}
const
uint32_t
kwg_size
=
static_cast
<
uint32_t
>
(
runtime
->
GetKernelMaxWorkGroupSize
(
kernel_
));
std
::
vector
<
uint32_t
>
lws
=
{
8
,
kwg_size
/
64
,
8
,
1
};
std
::
vector
<
uint32_t
>
lws
=
{
8
,
kwg_size_
/
64
,
8
,
1
};
std
::
stringstream
ss
;
ss
<<
"pooling_opencl_kernel_"
<<
output
->
dim
(
0
)
<<
"_"
<<
output
->
dim
(
1
)
<<
"_"
<<
output
->
dim
(
2
)
<<
"_"
<<
output
->
dim
(
3
);
TuningOrRun3DKernel
(
kernel_
,
ss
.
str
(),
gws
,
lws
,
future
);
TuningOrRun3DKernel
(
kernel_
,
ss
.
str
(),
gws
.
data
()
,
lws
,
future
);
}
template
struct
PoolingFunctor
<
DeviceType
::
OPENCL
,
float
>;
...
...
mace/kernels/opencl/resize_bilinear_opencl.cc
浏览文件 @
ed267833
...
...
@@ -30,16 +30,16 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
auto
runtime
=
OpenCLRuntime
::
Global
();
const
bool
is_qualcomm_opencl200
=
IsQualcommOpenCL200
();
if
(
kernel_
.
get
()
==
nullptr
)
{
is_non_uniform_work_groups_supported_
=
runtime
->
IsNonUniformWorkgroupsSupported
();
std
::
set
<
std
::
string
>
built_options
;
std
::
string
kernel_name
=
MACE_OBFUSCATE_SYMBOL
(
"resize_bilinear_nocache"
);
built_options
.
emplace
(
"-Dresize_bilinear_nocache="
+
kernel_name
);
auto
dt
=
DataTypeToEnum
<
T
>::
value
;
built_options
.
emplace
(
"-DDATA_TYPE="
+
DtToUpstreamCLDt
(
dt
));
built_options
.
emplace
(
"-DCMD_DATA_TYPE="
+
DtToUpstreamCLCMDDt
(
dt
));
if
(
is_
qualcomm_opencl200
)
{
if
(
is_
non_uniform_work_groups_supported_
)
{
built_options
.
emplace
(
"-DUSE_QUALCOMM_OPENCL_2_0"
);
}
kernel_
=
...
...
@@ -72,11 +72,12 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
kernel_
.
setArg
(
idx
++
,
gws
[
2
]);
input_shape_
=
input
->
shape
();
kwg_size_
=
static_cast
<
uint32_t
>
(
runtime
->
GetKernelMaxWorkGroupSize
(
kernel_
));
}
const
uint32_t
kwg_size
=
static_cast
<
uint32_t
>
(
runtime
->
GetKernelMaxWorkGroupSize
(
kernel_
));
const
std
::
vector
<
uint32_t
>
lws
=
{
8
,
kwg_size
/
64
,
8
,
1
};
const
std
::
vector
<
uint32_t
>
lws
=
{
8
,
kwg_size_
/
64
,
8
,
1
};
std
::
stringstream
ss
;
ss
<<
"resize_bilinear_opencl_kernel_"
<<
output
->
dim
(
0
)
<<
"_"
<<
output
->
dim
(
1
)
<<
"_"
<<
output
->
dim
(
2
)
<<
"_"
<<
output
->
dim
(
3
);
...
...
mace/kernels/opencl/slice.cc
浏览文件 @
ed267833
...
...
@@ -31,16 +31,16 @@ void SliceFunctor<DeviceType::OPENCL, T>::operator()(
auto
runtime
=
OpenCLRuntime
::
Global
();
const
bool
is_qualcomm_opencl200
=
IsQualcommOpenCL200
();
if
(
kernel_
.
get
()
==
nullptr
)
{
is_non_uniform_work_groups_supported_
=
runtime
->
IsNonUniformWorkgroupsSupported
();
std
::
set
<
std
::
string
>
built_options
;
std
::
string
kernel_name
=
MACE_OBFUSCATE_SYMBOL
(
"slice"
);
built_options
.
emplace
(
"-Dslice="
+
kernel_name
);
built_options
.
emplace
(
"-DDATA_TYPE="
+
DtToCLDt
(
DataTypeToEnum
<
T
>::
value
));
built_options
.
emplace
(
"-DCMD_DATA_TYPE="
+
DtToCLCMDDt
(
DataTypeToEnum
<
T
>::
value
));
if
(
is_
qualcomm_opencl200
)
{
if
(
is_
non_uniform_work_groups_supported_
)
{
built_options
.
emplace
(
"-DUSE_QUALCOMM_OPENCL_2_0"
);
}
kernel_
=
runtime
->
BuildKernel
(
"slice"
,
kernel_name
,
built_options
);
...
...
@@ -53,9 +53,9 @@ void SliceFunctor<DeviceType::OPENCL, T>::operator()(
static_cast
<
uint32_t
>
(
input
->
dim
(
0
)
*
input
->
dim
(
1
)),
};
const
uint32_t
kwg_size
=
kwg_size_
=
static_cast
<
uint32_t
>
(
runtime
->
GetKernelMaxWorkGroupSize
(
kernel_
));
const
std
::
vector
<
uint32_t
>
lws
=
{
8
,
kwg_size
/
64
,
8
,
1
};
const
std
::
vector
<
uint32_t
>
lws
=
{
8
,
kwg_size
_
/
64
,
8
,
1
};
std
::
stringstream
ss
;
ss
<<
"slice_opencl_kernel_"
<<
input
->
dim
(
0
)
<<
"_"
...
...
mace/kernels/opencl/softmax_opencl.cc
浏览文件 @
ed267833
...
...
@@ -29,16 +29,16 @@ void SoftmaxFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *logits,
auto
runtime
=
OpenCLRuntime
::
Global
();
const
bool
is_qualcomm_opencl200
=
IsQualcommOpenCL200
();
if
(
kernel_
.
get
()
==
nullptr
)
{
is_non_uniform_work_groups_supported_
=
runtime
->
IsNonUniformWorkgroupsSupported
();
std
::
set
<
std
::
string
>
built_options
;
std
::
string
kernel_name
=
MACE_OBFUSCATE_SYMBOL
(
"softmax"
);
built_options
.
emplace
(
"-Dsoftmax="
+
kernel_name
);
auto
dt
=
DataTypeToEnum
<
T
>::
value
;
built_options
.
emplace
(
"-DDATA_TYPE="
+
DtToUpstreamCLDt
(
dt
));
built_options
.
emplace
(
"-DCMD_DATA_TYPE="
+
DtToUpstreamCLCMDDt
(
dt
));
if
(
is_
qualcomm_opencl200
)
{
if
(
is_
non_uniform_work_groups_supported_
)
{
built_options
.
emplace
(
"-DUSE_QUALCOMM_OPENCL_2_0"
);
}
kernel_
=
runtime
->
BuildKernel
(
"softmax"
,
kernel_name
,
built_options
);
...
...
@@ -52,12 +52,14 @@ void SoftmaxFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *logits,
kernel_
.
setArg
(
idx
++
,
gws
[
0
]);
kernel_
.
setArg
(
idx
++
,
gws
[
1
]);
kernel_
.
setArg
(
idx
++
,
gws
[
2
]);
input_shape_
=
logits
->
shape
();
kwg_size_
=
static_cast
<
uint32_t
>
(
runtime
->
GetKernelMaxWorkGroupSize
(
kernel_
));
}
const
uint32_t
kwg_size
=
static_cast
<
uint32_t
>
(
runtime
->
GetKernelMaxWorkGroupSize
(
kernel_
));
const
std
::
vector
<
uint32_t
>
lws
=
{
8
,
kwg_size
/
64
,
8
,
1
};
const
std
::
vector
<
uint32_t
>
lws
=
{
8
,
kwg_size_
/
64
,
8
,
1
};
std
::
stringstream
ss
;
ss
<<
"softmax_opencl_kernel_"
<<
output
->
dim
(
0
)
<<
"_"
<<
output
->
dim
(
1
)
<<
"_"
<<
output
->
dim
(
2
)
<<
"_"
<<
output
->
dim
(
3
);
...
...
mace/kernels/opencl/space_to_batch_opencl.cc
浏览文件 @
ed267833
...
...
@@ -38,9 +38,9 @@ void SpaceToBatchFunctor<DeviceType::OPENCL, T>::operator()(
auto
runtime
=
OpenCLRuntime
::
Global
();
const
bool
is_qualcomm_opencl200
=
IsQualcommOpenCL200
();
if
(
kernel_
.
get
()
==
nullptr
)
{
is_non_uniform_work_groups_supported_
=
runtime
->
IsNonUniformWorkgroupsSupported
();
std
::
string
obfuscated_kernel_name
=
MACE_OBFUSCATE_SYMBOL
(
kernel_name
);
std
::
set
<
std
::
string
>
built_options
;
std
::
stringstream
kernel_name_ss
;
...
...
@@ -49,7 +49,7 @@ void SpaceToBatchFunctor<DeviceType::OPENCL, T>::operator()(
built_options
.
emplace
(
"-DDATA_TYPE="
+
DtToCLDt
(
DataTypeToEnum
<
T
>::
value
));
built_options
.
emplace
(
"-DCMD_DATA_TYPE="
+
DtToCLCMDDt
(
DataTypeToEnum
<
T
>::
value
));
if
(
is_
qualcomm_opencl200
)
{
if
(
is_
non_uniform_work_groups_supported_
)
{
built_options
.
emplace
(
"-DUSE_QUALCOMM_OPENCL_2_0"
);
}
kernel_
=
...
...
@@ -77,11 +77,12 @@ void SpaceToBatchFunctor<DeviceType::OPENCL, T>::operator()(
kernel_
.
setArg
(
idx
++
,
gws
[
2
]);
space_shape_
=
space_tensor
->
shape
();
kwg_size_
=
static_cast
<
uint32_t
>
(
runtime
->
GetKernelMaxWorkGroupSize
(
kernel_
));
}
const
uint32_t
kwg_size
=
static_cast
<
uint32_t
>
(
runtime
->
GetKernelMaxWorkGroupSize
(
kernel_
));
const
std
::
vector
<
uint32_t
>
lws
=
{
8
,
kwg_size
/
64
,
8
,
1
};
const
std
::
vector
<
uint32_t
>
lws
=
{
8
,
kwg_size_
/
64
,
8
,
1
};
std
::
stringstream
ss
;
ss
<<
kernel_name
<<
"_"
<<
batch_tensor
->
dim
(
0
)
<<
"_"
<<
batch_tensor
->
dim
(
1
)
<<
"_"
<<
batch_tensor
->
dim
(
2
)
<<
"_"
...
...
mace/kernels/opencl/winograd_transform.cc
浏览文件 @
ed267833
...
...
@@ -17,9 +17,9 @@ void WinogradTransformFunctor<DeviceType::OPENCL, T>::operator()(
auto
runtime
=
OpenCLRuntime
::
Global
();
const
bool
is_qualcomm_opencl200
=
IsQualcommOpenCL200
();
if
(
kernel_
.
get
()
==
nullptr
)
{
is_non_uniform_work_groups_supported_
=
runtime
->
IsNonUniformWorkgroupsSupported
();
std
::
string
obfuscated_kernel_name
=
MACE_OBFUSCATE_SYMBOL
(
"winograd_transform_2x2"
);
std
::
set
<
std
::
string
>
built_options
;
...
...
@@ -28,7 +28,7 @@ void WinogradTransformFunctor<DeviceType::OPENCL, T>::operator()(
DtToUpstreamCLDt
(
DataTypeToEnum
<
T
>::
value
));
built_options
.
emplace
(
"-DCMD_DATA_TYPE="
+
DtToUpstreamCLCMDDt
(
DataTypeToEnum
<
T
>::
value
));
if
(
is_
qualcomm_opencl200
)
{
if
(
is_
non_uniform_work_groups_supported_
)
{
built_options
.
emplace
(
"-DUSE_QUALCOMM_OPENCL_2_0"
);
}
kernel_
=
runtime
->
BuildKernel
(
"winograd_transform"
,
obfuscated_kernel_name
,
...
...
@@ -74,11 +74,12 @@ void WinogradTransformFunctor<DeviceType::OPENCL, T>::operator()(
kernel_
.
setArg
(
idx
++
,
gws
[
1
]);
input_shape_
=
input_tensor
->
shape
();
kwg_size_
=
static_cast
<
uint32_t
>
(
runtime
->
GetKernelMaxWorkGroupSize
(
kernel_
));
}
const
uint32_t
kwg_size
=
static_cast
<
uint32_t
>
(
runtime
->
GetKernelMaxWorkGroupSize
(
kernel_
));
const
std
::
vector
<
uint32_t
>
lws
=
{
kwg_size
/
8
,
8
,
1
};
const
std
::
vector
<
uint32_t
>
lws
=
{
kwg_size_
/
8
,
8
,
1
};
std
::
stringstream
ss
;
ss
<<
"winograd_transform_kernel_"
<<
input_tensor
->
dim
(
0
)
<<
"_"
<<
input_tensor
->
dim
(
1
)
<<
"_"
<<
input_tensor
->
dim
(
2
)
<<
"_"
...
...
@@ -95,9 +96,9 @@ void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()(
auto
runtime
=
OpenCLRuntime
::
Global
();
const
bool
is_qualcomm_opencl200
=
IsQualcommOpenCL200
();
if
(
kernel_
.
get
()
==
nullptr
)
{
is_non_uniform_work_groups_supported_
=
runtime
->
IsNonUniformWorkgroupsSupported
();
std
::
string
obfuscated_kernel_name
=
MACE_OBFUSCATE_SYMBOL
(
"winograd_inverse_transform_2x2"
);
std
::
set
<
std
::
string
>
built_options
;
...
...
@@ -107,7 +108,7 @@ void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()(
DtToUpstreamCLDt
(
DataTypeToEnum
<
T
>::
value
));
built_options
.
emplace
(
"-DCMD_DATA_TYPE="
+
DtToUpstreamCLCMDDt
(
DataTypeToEnum
<
T
>::
value
));
if
(
is_
qualcomm_opencl200
)
{
if
(
is_
non_uniform_work_groups_supported_
)
{
built_options
.
emplace
(
"-DUSE_QUALCOMM_OPENCL_2_0"
);
}
built_options
.
emplace
(
bias
!=
nullptr
?
"-DBIAS"
:
""
);
...
...
@@ -168,11 +169,12 @@ void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()(
kernel_
.
setArg
(
idx
++
,
gws
[
1
]);
input_shape_
=
input_tensor
->
shape
();
kwg_size_
=
static_cast
<
uint32_t
>
(
runtime
->
GetKernelMaxWorkGroupSize
(
kernel_
));
}
const
uint32_t
kwg_size
=
static_cast
<
uint32_t
>
(
runtime
->
GetKernelMaxWorkGroupSize
(
kernel_
));
const
std
::
vector
<
uint32_t
>
lws
=
{
kwg_size
/
8
,
8
,
1
};
const
std
::
vector
<
uint32_t
>
lws
=
{
kwg_size_
/
8
,
8
,
1
};
std
::
stringstream
ss
;
ss
<<
"winograd_inverse_transform_kernel_"
<<
input_tensor
->
dim
(
0
)
<<
"_"
...
...
mace/kernels/pooling.h
浏览文件 @
ed267833
...
...
@@ -185,6 +185,8 @@ struct PoolingFunctor<DeviceType::OPENCL, T> : PoolingFunctorBase {
StatsFuture
*
future
);
cl
::
Kernel
kernel_
;
uint32_t
kwg_size_
;
bool
is_non_uniform_work_groups_supported_
;
std
::
vector
<
index_t
>
input_shape_
;
};
...
...
mace/kernels/resize_bilinear.h
浏览文件 @
ed267833
...
...
@@ -173,6 +173,8 @@ struct ResizeBilinearFunctor<DeviceType::OPENCL, T>
void
operator
()(
const
Tensor
*
input
,
Tensor
*
output
,
StatsFuture
*
future
);
cl
::
Kernel
kernel_
;
uint32_t
kwg_size_
;
bool
is_non_uniform_work_groups_supported_
;
std
::
vector
<
index_t
>
input_shape_
;
};
...
...
mace/kernels/slice.h
浏览文件 @
ed267833
...
...
@@ -61,6 +61,8 @@ struct SliceFunctor<DeviceType::OPENCL, T> {
const
std
::
vector
<
Tensor
*>
&
output_list
,
StatsFuture
*
future
);
cl
::
Kernel
kernel_
;
uint32_t
kwg_size_
;
bool
is_non_uniform_work_groups_supported_
;
};
}
// namespace kernels
...
...
mace/kernels/softmax.h
浏览文件 @
ed267833
...
...
@@ -61,6 +61,8 @@ struct SoftmaxFunctor<DeviceType::OPENCL, T> {
void
operator
()(
const
Tensor
*
logits
,
Tensor
*
output
,
StatsFuture
*
future
);
cl
::
Kernel
kernel_
;
uint32_t
kwg_size_
;
bool
is_non_uniform_work_groups_supported_
;
std
::
vector
<
index_t
>
input_shape_
;
};
...
...
mace/kernels/space_to_batch.h
浏览文件 @
ed267833
...
...
@@ -56,6 +56,8 @@ struct SpaceToBatchFunctor<DeviceType::OPENCL, T> : SpaceToBatchFunctorBase {
StatsFuture
*
future
);
cl
::
Kernel
kernel_
;
uint32_t
kwg_size_
;
bool
is_non_uniform_work_groups_supported_
;
std
::
vector
<
index_t
>
space_shape_
;
};
...
...
mace/kernels/winograd_transform.h
浏览文件 @
ed267833
...
...
@@ -51,6 +51,8 @@ struct WinogradTransformFunctor<DeviceType::OPENCL, T>
void
operator
()(
const
Tensor
*
input
,
Tensor
*
output
,
StatsFuture
*
future
);
cl
::
Kernel
kernel_
;
uint32_t
kwg_size_
;
bool
is_non_uniform_work_groups_supported_
;
std
::
vector
<
index_t
>
input_shape_
;
};
...
...
@@ -108,6 +110,8 @@ struct WinogradInverseTransformFunctor<DeviceType::OPENCL, T>
StatsFuture
*
future
);
cl
::
Kernel
kernel_
;
uint32_t
kwg_size_
;
bool
is_non_uniform_work_groups_supported_
;
std
::
vector
<
index_t
>
input_shape_
;
};
...
...
tools/build_mace_run.sh
浏览文件 @
ed267833
...
...
@@ -43,6 +43,10 @@ else
HEXAGON_MODE_BUILD_FLAG
=
"--define hexagon=true"
fi
if
[
x
"
$TARGET_ABI
"
=
x
"arm64-v8a"
]
;
then
NEON_ENABLE_FLAG
=
"--define neon=true"
fi
bazel build
--verbose_failures
-c
opt
--strip
always //mace/examples:mace_run
\
--crosstool_top
=
//external:android/crosstool
\
--host_crosstool_top
=
@bazel_tools//tools/cpp:toolchain
\
...
...
@@ -54,6 +58,7 @@ else
--copt
=
"-DMACE_MODEL_TAG=
${
MODEL_TAG
}
"
\
--define
openmp
=
true
\
--copt
=
"-O3"
\
$NEON_ENABLE_FLAG
\
$PRODUCTION_MODE_BUILD_FLAGS
\
$HEXAGON_MODE_BUILD_FLAG
||
exit
1
fi
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录