Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
冰之2023
Mace
提交
9d8505f4
Mace
项目概览
冰之2023
/
Mace
与 Fork 源项目一致
Fork自
Xiaomi / Mace
通知
1
Star
0
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
Mace
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
前往新版Gitcode,体验更适合开发者的 AI 搜索 >>
提交
9d8505f4
编写于
4月 03, 2018
作者:
Y
yejianwu
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
add out_of_range_check for opencl kernel
上级
9b10c71d
变更
71
隐藏空白更改
内联
并排
Showing
71 changed file
with
899 addition
and
99 deletion
+899
-99
mace/core/runtime/opencl/opencl_runtime.cc
mace/core/runtime/opencl/opencl_runtime.cc
+12
-0
mace/core/runtime/opencl/opencl_runtime.h
mace/core/runtime/opencl/opencl_runtime.h
+2
-0
mace/kernels/activation.h
mace/kernels/activation.h
+2
-0
mace/kernels/addn.h
mace/kernels/addn.h
+2
-0
mace/kernels/batch_norm.h
mace/kernels/batch_norm.h
+2
-0
mace/kernels/bias_add.h
mace/kernels/bias_add.h
+2
-0
mace/kernels/buffer_to_image.h
mace/kernels/buffer_to_image.h
+5
-1
mace/kernels/channel_shuffle.h
mace/kernels/channel_shuffle.h
+2
-0
mace/kernels/concat.h
mace/kernels/concat.h
+2
-0
mace/kernels/conv_2d.h
mace/kernels/conv_2d.h
+2
-0
mace/kernels/cwise.h
mace/kernels/cwise.h
+1
-0
mace/kernels/depth_to_space.h
mace/kernels/depth_to_space.h
+1
-0
mace/kernels/depthwise_conv2d.h
mace/kernels/depthwise_conv2d.h
+1
-0
mace/kernels/eltwise.h
mace/kernels/eltwise.h
+1
-0
mace/kernels/fully_connected.h
mace/kernels/fully_connected.h
+1
-0
mace/kernels/matmul.h
mace/kernels/matmul.h
+1
-0
mace/kernels/opencl/activation_opencl.cc
mace/kernels/opencl/activation_opencl.cc
+19
-0
mace/kernels/opencl/addn.cc
mace/kernels/opencl/addn.cc
+19
-0
mace/kernels/opencl/batch_norm_opencl.cc
mace/kernels/opencl/batch_norm_opencl.cc
+19
-1
mace/kernels/opencl/bias_add_opencl.cc
mace/kernels/opencl/bias_add_opencl.cc
+18
-0
mace/kernels/opencl/buffer_to_image.cc
mace/kernels/opencl/buffer_to_image.cc
+22
-0
mace/kernels/opencl/channel_shuffle.cc
mace/kernels/opencl/channel_shuffle.cc
+19
-0
mace/kernels/opencl/cl/activation.cl
mace/kernels/opencl/cl/activation.cl
+6
-2
mace/kernels/opencl/cl/addn.cl
mace/kernels/opencl/cl/addn.cl
+5
-1
mace/kernels/opencl/cl/batch_norm.cl
mace/kernels/opencl/cl/batch_norm.cl
+5
-1
mace/kernels/opencl/cl/bias_add.cl
mace/kernels/opencl/cl/bias_add.cl
+6
-1
mace/kernels/opencl/cl/buffer_to_image.cl
mace/kernels/opencl/cl/buffer_to_image.cl
+47
-12
mace/kernels/opencl/cl/channel_shuffle.cl
mace/kernels/opencl/cl/channel_shuffle.cl
+7
-1
mace/kernels/opencl/cl/common.h
mace/kernels/opencl/cl/common.h
+23
-0
mace/kernels/opencl/cl/concat.cl
mace/kernels/opencl/cl/concat.cl
+16
-4
mace/kernels/opencl/cl/conv_2d.cl
mace/kernels/opencl/cl/conv_2d.cl
+15
-1
mace/kernels/opencl/cl/conv_2d_1x1.cl
mace/kernels/opencl/cl/conv_2d_1x1.cl
+14
-1
mace/kernels/opencl/cl/conv_2d_3x3.cl
mace/kernels/opencl/cl/conv_2d_3x3.cl
+17
-1
mace/kernels/opencl/cl/cwise.cl
mace/kernels/opencl/cl/cwise.cl
+5
-1
mace/kernels/opencl/cl/depth_to_space.cl
mace/kernels/opencl/cl/depth_to_space.cl
+37
-21
mace/kernels/opencl/cl/depthwise_conv2d.cl
mace/kernels/opencl/cl/depthwise_conv2d.cl
+28
-2
mace/kernels/opencl/cl/eltwise.cl
mace/kernels/opencl/cl/eltwise.cl
+5
-1
mace/kernels/opencl/cl/fully_connected.cl
mace/kernels/opencl/cl/fully_connected.cl
+12
-2
mace/kernels/opencl/cl/matmul.cl
mace/kernels/opencl/cl/matmul.cl
+18
-1
mace/kernels/opencl/cl/pooling.cl
mace/kernels/opencl/cl/pooling.cl
+7
-2
mace/kernels/opencl/cl/resize_bilinear.cl
mace/kernels/opencl/cl/resize_bilinear.cl
+6
-1
mace/kernels/opencl/cl/slice.cl
mace/kernels/opencl/cl/slice.cl
+8
-2
mace/kernels/opencl/cl/softmax.cl
mace/kernels/opencl/cl/softmax.cl
+5
-1
mace/kernels/opencl/cl/space_to_batch.cl
mace/kernels/opencl/cl/space_to_batch.cl
+12
-2
mace/kernels/opencl/cl/winograd_transform.cl
mace/kernels/opencl/cl/winograd_transform.cl
+19
-2
mace/kernels/opencl/concat.cc
mace/kernels/opencl/concat.cc
+44
-4
mace/kernels/opencl/conv_2d_opencl.cc
mace/kernels/opencl/conv_2d_opencl.cc
+9
-6
mace/kernels/opencl/conv_2d_opencl_1x1.cc
mace/kernels/opencl/conv_2d_opencl_1x1.cc
+21
-1
mace/kernels/opencl/conv_2d_opencl_3x3.cc
mace/kernels/opencl/conv_2d_opencl_3x3.cc
+21
-1
mace/kernels/opencl/conv_2d_opencl_general.cc
mace/kernels/opencl/conv_2d_opencl_general.cc
+21
-1
mace/kernels/opencl/cwise_opencl.cc
mace/kernels/opencl/cwise_opencl.cc
+19
-0
mace/kernels/opencl/depth_to_space_opencl.cc
mace/kernels/opencl/depth_to_space_opencl.cc
+35
-9
mace/kernels/opencl/depthwise_conv_opencl.cc
mace/kernels/opencl/depthwise_conv_opencl.cc
+22
-2
mace/kernels/opencl/eltwise_opencl.cc
mace/kernels/opencl/eltwise_opencl.cc
+18
-0
mace/kernels/opencl/fully_connected_opencl.cc
mace/kernels/opencl/fully_connected_opencl.cc
+46
-6
mace/kernels/opencl/matmul.cc
mace/kernels/opencl/matmul.cc
+19
-0
mace/kernels/opencl/pooling_opencl.cc
mace/kernels/opencl/pooling_opencl.cc
+19
-0
mace/kernels/opencl/resize_bilinear_opencl.cc
mace/kernels/opencl/resize_bilinear_opencl.cc
+19
-0
mace/kernels/opencl/slice.cc
mace/kernels/opencl/slice.cc
+18
-0
mace/kernels/opencl/softmax_opencl.cc
mace/kernels/opencl/softmax_opencl.cc
+19
-0
mace/kernels/opencl/space_to_batch_opencl.cc
mace/kernels/opencl/space_to_batch_opencl.cc
+19
-0
mace/kernels/opencl/winograd_transform.cc
mace/kernels/opencl/winograd_transform.cc
+38
-0
mace/kernels/pooling.h
mace/kernels/pooling.h
+1
-0
mace/kernels/resize_bilinear.h
mace/kernels/resize_bilinear.h
+1
-0
mace/kernels/slice.h
mace/kernels/slice.h
+1
-0
mace/kernels/softmax.h
mace/kernels/softmax.h
+1
-0
mace/kernels/space_to_batch.h
mace/kernels/space_to_batch.h
+1
-0
mace/kernels/winograd_transform.h
mace/kernels/winograd_transform.h
+2
-0
tools/bazel_adb_run.py
tools/bazel_adb_run.py
+2
-1
tools/sh_commands.py
tools/sh_commands.py
+4
-3
tools/tuning_run.sh
tools/tuning_run.sh
+1
-0
未找到文件。
mace/core/runtime/opencl/opencl_runtime.cc
浏览文件 @
9d8505f4
...
...
@@ -323,6 +323,14 @@ OpenCLRuntime::OpenCLRuntime(GPUPerfHint gpu_perf_hint,
const
char
*
kernel_path
=
getenv
(
"MACE_KERNEL_PATH"
);
this
->
kernel_path_
=
std
::
string
(
kernel_path
==
nullptr
?
""
:
kernel_path
)
+
"/"
;
const
char
*
out_of_range_check
=
getenv
(
"MACE_OUT_OF_RANGE_CHECK"
);
if
(
out_of_range_check
!=
nullptr
&&
strlen
(
out_of_range_check
)
==
1
&&
out_of_range_check
[
0
]
==
'1'
)
{
this
->
out_of_range_check_
=
true
;
}
else
{
this
->
out_of_range_check_
=
false
;
}
}
OpenCLRuntime
::~
OpenCLRuntime
()
{
...
...
@@ -505,4 +513,8 @@ const GPUType OpenCLRuntime::ParseGPUTypeFromDeviceName(
}
}
const
bool
OpenCLRuntime
::
IsOutOfRangeCheckEnabled
()
const
{
return
out_of_range_check_
;
}
}
// namespace mace
mace/core/runtime/opencl/opencl_runtime.h
浏览文件 @
9d8505f4
...
...
@@ -70,6 +70,7 @@ class OpenCLRuntime {
cl
::
Kernel
BuildKernel
(
const
std
::
string
&
program_name
,
const
std
::
string
&
kernel_name
,
const
std
::
set
<
std
::
string
>
&
build_options
);
const
bool
IsOutOfRangeCheckEnabled
()
const
;
private:
OpenCLRuntime
(
GPUPerfHint
,
GPUPriorityHint
);
...
...
@@ -94,6 +95,7 @@ class OpenCLRuntime {
std
::
string
kernel_path_
;
GPUType
gpu_type_
;
std
::
string
opencl_version_
;
bool
out_of_range_check_
;
static
GPUPerfHint
gpu_perf_hint_
;
static
GPUPriorityHint
gpu_priority_hint_
;
...
...
mace/kernels/activation.h
浏览文件 @
9d8505f4
...
...
@@ -6,6 +6,7 @@
#define MACE_KERNELS_ACTIVATION_H_
#include <algorithm>
#include <memory>
#include <string>
#include <vector>
...
...
@@ -165,6 +166,7 @@ class ActivationFunctor<DeviceType::OPENCL, T> {
T
relux_max_limit_
;
cl
::
Kernel
kernel_
;
uint32_t
kwg_size_
;
std
::
unique_ptr
<
BufferBase
>
kernel_error_
;
std
::
string
tuning_key_prefix_
;
std
::
vector
<
index_t
>
input_shape_
;
};
...
...
mace/kernels/addn.h
浏览文件 @
9d8505f4
...
...
@@ -9,6 +9,7 @@
#include <arm_neon.h>
#endif
#include <algorithm>
#include <memory>
#include <vector>
#include "mace/core/future.h"
...
...
@@ -85,6 +86,7 @@ struct AddNFunctor<DeviceType::OPENCL, T> {
cl
::
Kernel
kernel_
;
uint32_t
kwg_size_
;
std
::
unique_ptr
<
BufferBase
>
kernel_error_
;
std
::
vector
<
index_t
>
input_shape_
;
};
...
...
mace/kernels/batch_norm.h
浏览文件 @
9d8505f4
...
...
@@ -8,6 +8,7 @@
#if defined(MACE_ENABLE_NEON) && defined(__aarch64__)
#include <arm_neon.h>
#endif
#include <memory>
#include <vector>
#include "mace/core/future.h"
...
...
@@ -165,6 +166,7 @@ struct BatchNormFunctor<DeviceType::OPENCL, T> : BatchNormFunctorBase {
StatsFuture
*
future
);
cl
::
Kernel
kernel_
;
uint32_t
kwg_size_
;
std
::
unique_ptr
<
BufferBase
>
kernel_error_
;
std
::
vector
<
index_t
>
input_shape_
;
};
...
...
mace/kernels/bias_add.h
浏览文件 @
9d8505f4
...
...
@@ -5,6 +5,7 @@
#ifndef MACE_KERNELS_BIAS_ADD_H_
#define MACE_KERNELS_BIAS_ADD_H_
#include <memory>
#include <vector>
#include "mace/core/future.h"
...
...
@@ -65,6 +66,7 @@ struct BiasAddFunctor<DeviceType::OPENCL, T> {
StatsFuture
*
future
);
cl
::
Kernel
kernel_
;
uint32_t
kwg_size_
;
std
::
unique_ptr
<
BufferBase
>
kernel_error_
;
std
::
vector
<
index_t
>
input_shape_
;
};
...
...
mace/kernels/buffer_to_image.h
浏览文件 @
9d8505f4
...
...
@@ -5,6 +5,8 @@
#ifndef MACE_KERNELS_BUFFER_TO_IMAGE_H_
#define MACE_KERNELS_BUFFER_TO_IMAGE_H_
#include <memory>
#include "mace/core/future.h"
#include "mace/core/tensor.h"
#include "mace/kernels/opencl/helper.h"
...
...
@@ -13,8 +15,10 @@ namespace mace {
namespace
kernels
{
struct
BufferToImageFunctorBase
{
explicit
BufferToImageFunctorBase
(
bool
i2b
)
:
i2b_
(
i2b
)
{}
explicit
BufferToImageFunctorBase
(
bool
i2b
)
:
i2b_
(
i2b
),
kernel_error_
(
nullptr
)
{}
bool
i2b_
;
std
::
unique_ptr
<
BufferBase
>
kernel_error_
;
};
template
<
DeviceType
D
,
typename
T
>
...
...
mace/kernels/channel_shuffle.h
浏览文件 @
9d8505f4
...
...
@@ -5,6 +5,7 @@
#ifndef MACE_KERNELS_CHANNEL_SHUFFLE_H_
#define MACE_KERNELS_CHANNEL_SHUFFLE_H_
#include <memory>
#include <vector>
#include "mace/core/future.h"
...
...
@@ -57,6 +58,7 @@ struct ChannelShuffleFunctor<DeviceType::OPENCL, T> {
cl
::
Kernel
kernel_
;
uint32_t
kwg_size_
;
std
::
unique_ptr
<
BufferBase
>
kernel_error_
;
const
int
groups_
;
std
::
vector
<
index_t
>
input_shape_
;
};
...
...
mace/kernels/concat.h
浏览文件 @
9d8505f4
...
...
@@ -5,6 +5,7 @@
#ifndef MACE_KERNELS_CONCAT_H_
#define MACE_KERNELS_CONCAT_H_
#include <memory>
#include <vector>
#include "mace/core/future.h"
...
...
@@ -86,6 +87,7 @@ struct ConcatFunctor<DeviceType::OPENCL, T> : ConcatFunctorBase {
StatsFuture
*
future
);
cl
::
Kernel
kernel_
;
uint32_t
kwg_size_
;
std
::
unique_ptr
<
BufferBase
>
kernel_error_
;
std
::
vector
<
index_t
>
input_shape_
;
};
...
...
mace/kernels/conv_2d.h
浏览文件 @
9d8505f4
...
...
@@ -9,6 +9,7 @@
#include <arm_neon.h>
#endif
#include <algorithm>
#include <memory>
#include <vector>
#include "mace/core/future.h"
...
...
@@ -468,6 +469,7 @@ struct Conv2dFunctor<DeviceType::OPENCL, T> : Conv2dFunctorBase {
cl
::
Kernel
kernel_
;
uint32_t
kwg_size_
;
std
::
unique_ptr
<
BufferBase
>
kernel_error_
;
std
::
vector
<
index_t
>
input_shape_
;
};
...
...
mace/kernels/cwise.h
浏览文件 @
9d8505f4
...
...
@@ -115,6 +115,7 @@ struct CWiseFunctor<DeviceType::OPENCL, T> : CWiseFunctorBase {
cl
::
Kernel
kernel_
;
uint32_t
kwg_size_
;
std
::
unique_ptr
<
BufferBase
>
kernel_error_
;
std
::
vector
<
index_t
>
input_shape_
;
};
...
...
mace/kernels/depth_to_space.h
浏览文件 @
9d8505f4
...
...
@@ -109,6 +109,7 @@ struct DepthToSpaceOpFunctor<DeviceType::OPENCL, T> {
cl
::
Kernel
kernel_
;
uint32_t
kwg_size_
;
std
::
unique_ptr
<
BufferBase
>
kernel_error_
;
const
int
block_size_
;
bool
d2s_
;
std
::
vector
<
index_t
>
input_shape_
;
...
...
mace/kernels/depthwise_conv2d.h
浏览文件 @
9d8505f4
...
...
@@ -454,6 +454,7 @@ struct DepthwiseConv2dFunctor<DeviceType::OPENCL, T>
cl
::
Kernel
kernel_
;
uint32_t
kwg_size_
;
std
::
unique_ptr
<
BufferBase
>
kernel_error_
;
std
::
vector
<
index_t
>
input_shape_
;
};
...
...
mace/kernels/eltwise.h
浏览文件 @
9d8505f4
...
...
@@ -105,6 +105,7 @@ struct EltwiseFunctor<DeviceType::OPENCL, T> : EltwiseFunctorBase {
cl
::
Kernel
kernel_
;
uint32_t
kwg_size_
;
std
::
unique_ptr
<
BufferBase
>
kernel_error_
;
std
::
vector
<
index_t
>
input_shape_
;
};
...
...
mace/kernels/fully_connected.h
浏览文件 @
9d8505f4
...
...
@@ -107,6 +107,7 @@ struct FullyConnectedFunctor<DeviceType::OPENCL, T> : FullyConnectedBase {
std
::
vector
<
uint32_t
>
gws_
;
std
::
vector
<
uint32_t
>
lws_
;
std
::
vector
<
index_t
>
input_shape_
;
std
::
unique_ptr
<
BufferBase
>
kernel_error_
;
};
}
// namespace kernels
...
...
mace/kernels/matmul.h
浏览文件 @
9d8505f4
...
...
@@ -68,6 +68,7 @@ struct MatMulFunctor<DeviceType::OPENCL, T> {
cl
::
Kernel
kernel_
;
uint32_t
kwg_size_
;
std
::
unique_ptr
<
BufferBase
>
kernel_error_
;
};
}
// namespace kernels
...
...
mace/kernels/opencl/activation_opencl.cc
浏览文件 @
9d8505f4
...
...
@@ -33,6 +33,14 @@ void ActivationFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
auto
dt
=
DataTypeToEnum
<
T
>::
value
;
built_options
.
emplace
(
"-DDATA_TYPE="
+
DtToUpstreamCLDt
(
dt
));
built_options
.
emplace
(
"-DCMD_DATA_TYPE="
+
DtToUpstreamCLCMDDt
(
dt
));
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
built_options
.
emplace
(
"-DOUT_OF_RANGE_CHECK"
);
kernel_error_
=
std
::
move
(
std
::
unique_ptr
<
Buffer
>
(
new
Buffer
(
GetDeviceAllocator
(
DeviceType
::
OPENCL
),
1
)));
kernel_error_
->
Map
(
nullptr
);
*
(
kernel_error_
->
mutable_data
<
char
>
())
=
'0'
;
kernel_error_
->
UnMap
();
}
if
(
runtime
->
IsNonUniformWorkgroupsSupported
())
{
built_options
.
emplace
(
"-DNON_UNIFORM_WORK_GROUP"
);
}
...
...
@@ -72,6 +80,10 @@ void ActivationFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
if
(
!
IsVecEqual
(
input_shape_
,
input
->
shape
()))
{
int
idx
=
0
;
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
kernel_
.
setArg
(
idx
++
,
*
(
static_cast
<
cl
::
Buffer
*>
(
kernel_error_
->
buffer
())));
}
if
(
!
runtime
->
IsNonUniformWorkgroupsSupported
())
{
kernel_
.
setArg
(
idx
++
,
gws
[
0
]);
kernel_
.
setArg
(
idx
++
,
gws
[
1
]);
...
...
@@ -93,6 +105,13 @@ void ActivationFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
Concat
(
tuning_key_prefix_
,
output
->
dim
(
0
),
output
->
dim
(
1
),
output
->
dim
(
2
),
output
->
dim
(
3
));
TuningOrRun3DKernel
(
kernel_
,
tuning_key
,
gws
,
lws
,
future
);
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
kernel_error_
->
Map
(
nullptr
);
char
*
kerror_code
=
kernel_error_
->
mutable_data
<
char
>
();
MACE_CHECK
(
*
kerror_code
==
'0'
)
<<
"Kernel error code: "
<<
*
kerror_code
;
kernel_error_
->
UnMap
();
}
}
template
struct
ActivationFunctor
<
DeviceType
::
OPENCL
,
float
>;
...
...
mace/kernels/opencl/addn.cc
浏览文件 @
9d8505f4
...
...
@@ -45,6 +45,14 @@ void AddNFunctor<DeviceType::OPENCL, T>::operator()(
built_options
.
emplace
(
"-DDATA_TYPE="
+
DtToUpstreamCLDt
(
dt
));
built_options
.
emplace
(
"-DCMD_DATA_TYPE="
+
DtToUpstreamCLCMDDt
(
dt
));
built_options
.
emplace
(
MakeString
(
"-DINPUT_NUM="
,
input_tensors
.
size
()));
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
built_options
.
emplace
(
"-DOUT_OF_RANGE_CHECK"
);
kernel_error_
=
std
::
move
(
std
::
unique_ptr
<
Buffer
>
(
new
Buffer
(
GetDeviceAllocator
(
DeviceType
::
OPENCL
),
1
)));
kernel_error_
->
Map
(
nullptr
);
*
(
kernel_error_
->
mutable_data
<
char
>
())
=
'0'
;
kernel_error_
->
UnMap
();
}
if
(
runtime
->
IsNonUniformWorkgroupsSupported
())
{
built_options
.
emplace
(
"-DNON_UNIFORM_WORK_GROUP"
);
}
...
...
@@ -71,6 +79,10 @@ void AddNFunctor<DeviceType::OPENCL, T>::operator()(
output_tensor
->
ResizeImage
(
output_shape
,
output_image_shape
);
uint32_t
idx
=
0
;
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
kernel_
.
setArg
(
idx
++
,
*
(
static_cast
<
cl
::
Buffer
*>
(
kernel_error_
->
buffer
())));
}
if
(
!
runtime
->
IsNonUniformWorkgroupsSupported
())
{
kernel_
.
setArg
(
idx
++
,
gws
[
0
]);
kernel_
.
setArg
(
idx
++
,
gws
[
1
]);
...
...
@@ -88,6 +100,13 @@ void AddNFunctor<DeviceType::OPENCL, T>::operator()(
ss
<<
"addn_opencl_kernel_"
<<
output_shape
[
0
]
<<
"_"
<<
output_shape
[
1
]
<<
"_"
<<
output_shape
[
2
]
<<
"_"
<<
output_shape
[
3
];
TuningOrRun2DKernel
(
kernel_
,
ss
.
str
(),
gws
,
lws
,
future
);
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
kernel_error_
->
Map
(
nullptr
);
char
*
kerror_code
=
kernel_error_
->
mutable_data
<
char
>
();
MACE_CHECK
(
*
kerror_code
==
'0'
)
<<
"Kernel error code: "
<<
*
kerror_code
;
kernel_error_
->
UnMap
();
}
}
template
struct
AddNFunctor
<
DeviceType
::
OPENCL
,
float
>;
...
...
mace/kernels/opencl/batch_norm_opencl.cc
浏览文件 @
9d8505f4
...
...
@@ -36,7 +36,6 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
auto
runtime
=
OpenCLRuntime
::
Global
();
if
(
kernel_
.
get
()
==
nullptr
)
{
std
::
set
<
std
::
string
>
built_options
;
auto
dt
=
DataTypeToEnum
<
T
>::
value
;
...
...
@@ -44,6 +43,14 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
built_options
.
emplace
(
"-Dbatch_norm="
+
kernel_name
);
built_options
.
emplace
(
"-DDATA_TYPE="
+
DtToUpstreamCLDt
(
dt
));
built_options
.
emplace
(
"-DCMD_DATA_TYPE="
+
DtToUpstreamCLCMDDt
(
dt
));
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
built_options
.
emplace
(
"-DOUT_OF_RANGE_CHECK"
);
kernel_error_
=
std
::
move
(
std
::
unique_ptr
<
Buffer
>
(
new
Buffer
(
GetDeviceAllocator
(
DeviceType
::
OPENCL
),
1
)));
kernel_error_
->
Map
(
nullptr
);
*
(
kernel_error_
->
mutable_data
<
char
>
())
=
'0'
;
kernel_error_
->
UnMap
();
}
if
(
runtime
->
IsNonUniformWorkgroupsSupported
())
{
built_options
.
emplace
(
"-DNON_UNIFORM_WORK_GROUP"
);
}
...
...
@@ -76,6 +83,10 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
}
if
(
!
IsVecEqual
(
input_shape_
,
input
->
shape
()))
{
uint32_t
idx
=
0
;
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
kernel_
.
setArg
(
idx
++
,
*
(
static_cast
<
cl
::
Buffer
*>
(
kernel_error_
->
buffer
())));
}
if
(
!
runtime
->
IsNonUniformWorkgroupsSupported
())
{
kernel_
.
setArg
(
idx
++
,
gws
[
0
]);
kernel_
.
setArg
(
idx
++
,
gws
[
1
]);
...
...
@@ -100,6 +111,13 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
Concat
(
"batch_norm_opencl_kernel_"
,
activation_
,
output
->
dim
(
0
),
output
->
dim
(
1
),
output
->
dim
(
2
),
output
->
dim
(
3
),
folded_constant_
);
TuningOrRun3DKernel
(
kernel_
,
tuning_key
,
gws
,
lws
,
future
);
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
kernel_error_
->
Map
(
nullptr
);
char
*
kerror_code
=
kernel_error_
->
mutable_data
<
char
>
();
MACE_CHECK
(
*
kerror_code
==
'0'
)
<<
"Kernel error code: "
<<
*
kerror_code
;
kernel_error_
->
UnMap
();
}
}
template
struct
BatchNormFunctor
<
DeviceType
::
OPENCL
,
float
>;
...
...
mace/kernels/opencl/bias_add_opencl.cc
浏览文件 @
9d8505f4
...
...
@@ -36,6 +36,14 @@ void BiasAddFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
built_options
.
emplace
(
"-Dbias_add="
+
kernel_name
);
built_options
.
emplace
(
"-DDATA_TYPE="
+
DtToUpstreamCLDt
(
dt
));
built_options
.
emplace
(
"-DCMD_DATA_TYPE="
+
DtToUpstreamCLCMDDt
(
dt
));
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
built_options
.
emplace
(
"-DOUT_OF_RANGE_CHECK"
);
kernel_error_
=
std
::
move
(
std
::
unique_ptr
<
Buffer
>
(
new
Buffer
(
GetDeviceAllocator
(
DeviceType
::
OPENCL
),
1
)));
kernel_error_
->
Map
(
nullptr
);
*
(
kernel_error_
->
mutable_data
<
char
>
())
=
'0'
;
kernel_error_
->
UnMap
();
}
if
(
runtime
->
IsNonUniformWorkgroupsSupported
())
{
built_options
.
emplace
(
"-DNON_UNIFORM_WORK_GROUP"
);
}
...
...
@@ -46,6 +54,10 @@ void BiasAddFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
}
if
(
!
IsVecEqual
(
input_shape_
,
input
->
shape
()))
{
uint32_t
idx
=
0
;
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
kernel_
.
setArg
(
idx
++
,
*
(
static_cast
<
cl
::
Buffer
*>
(
kernel_error_
->
buffer
())));
}
if
(
!
runtime
->
IsNonUniformWorkgroupsSupported
())
{
kernel_
.
setArg
(
idx
++
,
gws
[
0
]);
kernel_
.
setArg
(
idx
++
,
gws
[
1
]);
...
...
@@ -77,6 +89,12 @@ void BiasAddFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
cl
::
NDRange
(
lws
[
0
],
lws
[
1
],
lws
[
2
]),
nullptr
,
&
event
);
}
MACE_CHECK_CL_SUCCESS
(
error
);
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
kernel_error_
->
Map
(
nullptr
);
char
*
kerror_code
=
kernel_error_
->
mutable_data
<
char
>
();
MACE_CHECK
(
*
kerror_code
==
'0'
)
<<
"Kernel error code: "
<<
*
kerror_code
;
kernel_error_
->
UnMap
();
}
if
(
future
!=
nullptr
)
{
future
->
wait_fn
=
[
runtime
,
event
](
CallStats
*
stats
)
{
event
.
wait
();
...
...
mace/kernels/opencl/buffer_to_image.cc
浏览文件 @
9d8505f4
...
...
@@ -13,6 +13,7 @@ template <typename T>
void
BufferToImageFunctor
<
DeviceType
::
OPENCL
,
T
>::
operator
()(
Tensor
*
buffer
,
const
BufferType
type
,
Tensor
*
image
,
StatsFuture
*
future
)
{
std
::
vector
<
size_t
>
image_shape
;
if
(
!
i2b_
)
{
CalImage2DShape
(
buffer
->
shape
(),
type
,
&
image_shape
);
if
(
type
==
WINOGRAD_FILTER
)
{
...
...
@@ -80,10 +81,25 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(
built_options
.
emplace
(
"-DCMD_DATA_TYPE="
+
DtToUpstreamCLCMDDt
(
DataTypeToEnum
<
T
>::
value
));
}
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
built_options
.
emplace
(
"-DOUT_OF_RANGE_CHECK"
);
if
(
!
kernel_error_
)
{
kernel_error_
=
std
::
move
(
std
::
unique_ptr
<
Buffer
>
(
new
Buffer
(
GetDeviceAllocator
(
DeviceType
::
OPENCL
),
1
)));
kernel_error_
->
Map
(
nullptr
);
*
(
kernel_error_
->
mutable_data
<
char
>
())
=
'0'
;
kernel_error_
->
UnMap
();
}
}
auto
b2f_kernel
=
runtime
->
BuildKernel
(
"buffer_to_image"
,
obfuscated_kernel_name
,
built_options
);
uint32_t
idx
=
0
;
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
b2f_kernel
.
setArg
(
idx
++
,
*
(
static_cast
<
cl
::
Buffer
*>
(
kernel_error_
->
buffer
())));
}
if
(
!
runtime
->
IsNonUniformWorkgroupsSupported
())
{
b2f_kernel
.
setArg
(
idx
++
,
gws
[
0
]);
b2f_kernel
.
setArg
(
idx
++
,
gws
[
1
]);
...
...
@@ -135,6 +151,12 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(
cl
::
NDRange
(
lws
[
0
],
lws
[
1
]),
nullptr
,
&
event
);
}
MACE_CHECK_CL_SUCCESS
(
error
);
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
kernel_error_
->
Map
(
nullptr
);
char
*
kerror_code
=
kernel_error_
->
mutable_data
<
char
>
();
MACE_CHECK
(
*
kerror_code
==
'0'
)
<<
"Kernel error code: "
<<
*
kerror_code
;
kernel_error_
->
UnMap
();
}
if
(
future
!=
nullptr
)
{
future
->
wait_fn
=
[
runtime
,
event
](
CallStats
*
stats
)
{
event
.
wait
();
...
...
mace/kernels/opencl/channel_shuffle.cc
浏览文件 @
9d8505f4
...
...
@@ -43,6 +43,14 @@ void ChannelShuffleFunctor<DeviceType::OPENCL, T>::operator()(
auto
dt
=
DataTypeToEnum
<
T
>::
value
;
built_options
.
emplace
(
"-DDATA_TYPE="
+
DtToUpstreamCLDt
(
dt
));
built_options
.
emplace
(
"-DCMD_DATA_TYPE="
+
DtToUpstreamCLCMDDt
(
dt
));
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
built_options
.
emplace
(
"-DOUT_OF_RANGE_CHECK"
);
kernel_error_
=
std
::
move
(
std
::
unique_ptr
<
Buffer
>
(
new
Buffer
(
GetDeviceAllocator
(
DeviceType
::
OPENCL
),
1
)));
kernel_error_
->
Map
(
nullptr
);
*
(
kernel_error_
->
mutable_data
<
char
>
())
=
'0'
;
kernel_error_
->
UnMap
();
}
if
(
runtime
->
IsNonUniformWorkgroupsSupported
())
{
built_options
.
emplace
(
"-DNON_UNIFORM_WORK_GROUP"
);
}
...
...
@@ -55,6 +63,10 @@ void ChannelShuffleFunctor<DeviceType::OPENCL, T>::operator()(
if
(
!
IsVecEqual
(
input_shape_
,
input
->
shape
()))
{
uint32_t
idx
=
0
;
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
kernel_
.
setArg
(
idx
++
,
*
(
static_cast
<
cl
::
Buffer
*>
(
kernel_error_
->
buffer
())));
}
if
(
!
runtime
->
IsNonUniformWorkgroupsSupported
())
{
kernel_
.
setArg
(
idx
++
,
gws
[
0
]);
kernel_
.
setArg
(
idx
++
,
gws
[
1
]);
...
...
@@ -76,6 +88,13 @@ void ChannelShuffleFunctor<DeviceType::OPENCL, T>::operator()(
<<
output
->
dim
(
2
)
<<
"_"
<<
output
->
dim
(
3
);
TuningOrRun3DKernel
(
kernel_
,
ss
.
str
(),
gws
,
lws
,
future
);
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
kernel_error_
->
Map
(
nullptr
);
char
*
kerror_code
=
kernel_error_
->
mutable_data
<
char
>
();
MACE_CHECK
(
*
kerror_code
==
'0'
)
<<
"Kernel error code: "
<<
*
kerror_code
;
kernel_error_
->
UnMap
();
}
}
template
...
...
mace/kernels/opencl/cl/activation.cl
浏览文件 @
9d8505f4
#
include
<common.h>
__kernel
void
activation
(
GLOBAL_WORK_GROUP_SIZE_DIM3
__kernel
void
activation
(
KERNEL_ERROR_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM3
__read_only
image2d_t
input,
#
ifdef
USE_PRELU
__read_only
image2d_t
alpha,
...
...
@@ -29,6 +30,9 @@ __kernel void activation(GLOBAL_WORK_GROUP_SIZE_DIM3
#
else
DATA_TYPE4
out
=
do_activation
(
in,
relux_max_limit
)
;
#
endif
#
ifdef
OUT_OF_RANGE_CHECK
check_out_of_range_for_image2d
(
output,
pos,
hb,
kernel_error
)
;
#
endif
WRITE_IMAGET
(
output,
(
int2
)(
pos,
hb
)
,
out
)
;
}
mace/kernels/opencl/cl/addn.cl
浏览文件 @
9d8505f4
#
include
<common.h>
__kernel
void
addn
(
GLOBAL_WORK_GROUP_SIZE_DIM2
__kernel
void
addn
(
KERNEL_ERROR_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM2
__read_only
image2d_t
input0,
/*
[c%4
*
w
*
c/4,
h
*
b]
*/
__read_only
image2d_t
input1,
#
if
INPUT_NUM
>
2
...
...
@@ -31,6 +32,9 @@ __kernel void addn(GLOBAL_WORK_GROUP_SIZE_DIM2
out
=
out
+
in3
;
#
endif
#
ifdef
OUT_OF_RANGE_CHECK
check_out_of_range_for_image2d
(
output,
w,
hb,
kernel_error
)
;
#
endif
WRITE_IMAGET
(
output,
(
int2
)(
w,
hb
)
,
out
)
;
}
mace/kernels/opencl/cl/batch_norm.cl
浏览文件 @
9d8505f4
#
include
<common.h>
//
Supported
data
types:
half/float
__kernel
void
batch_norm
(
GLOBAL_WORK_GROUP_SIZE_DIM3
__kernel
void
batch_norm
(
KERNEL_ERROR_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM3
__read_only
image2d_t
input,
__read_only
image2d_t
scale,
__read_only
image2d_t
offset,
...
...
@@ -48,5 +49,8 @@ __kernel void batch_norm(GLOBAL_WORK_GROUP_SIZE_DIM3
out
=
do_activation
(
out,
relux_max_limit
)
;
#
endif
#
ifdef
OUT_OF_RANGE_CHECK
check_out_of_range_for_image2d
(
output,
pos,
hb,
kernel_error
)
;
#
endif
WRITE_IMAGET
(
output,
(
int2
)(
pos,
hb
)
,
out
)
;
}
mace/kernels/opencl/cl/bias_add.cl
浏览文件 @
9d8505f4
#
include
<common.h>
//
Supported
data
types:
half/float
__kernel
void
bias_add
(
GLOBAL_WORK_GROUP_SIZE_DIM3
__kernel
void
bias_add
(
KERNEL_ERROR_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM3
__read_only
image2d_t
input,
__read_only
image2d_t
bias,
__write_only
image2d_t
output
)
{
...
...
@@ -22,5 +23,9 @@ __kernel void bias_add(GLOBAL_WORK_GROUP_SIZE_DIM3
DATA_TYPE4
in
=
READ_IMAGET
(
input,
SAMPLER,
(
int2
)(
pos,
hb
))
;
DATA_TYPE4
bias_value
=
READ_IMAGET
(
bias,
SAMPLER,
(
int2
)(
ch_blk,
0
))
;
DATA_TYPE4
out
=
in
+
bias_value
;
#
ifdef
OUT_OF_RANGE_CHECK
check_out_of_range_for_image2d
(
output,
pos,
hb,
kernel_error
)
;
#
endif
WRITE_IMAGET
(
output,
(
int2
)(
pos,
hb
)
,
out
)
;
}
mace/kernels/opencl/cl/buffer_to_image.cl
浏览文件 @
9d8505f4
#
include
<common.h>
__kernel
void
filter_buffer_to_image
(
GLOBAL_WORK_GROUP_SIZE_DIM2
__kernel
void
filter_buffer_to_image
(
KERNEL_ERROR_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM2
__global
const
DATA_TYPE
*input,
/*
h,
w,
oc,
ic
*/
__private
const
int
input_offset,
__private
const
int
filter_h,
...
...
@@ -49,10 +50,14 @@ __kernel void filter_buffer_to_image(GLOBAL_WORK_GROUP_SIZE_DIM2
}
int2 coord = (int2)(w, h);
#ifdef OUT_OF_RANGE_CHECK
check_out_of_range_for_image2d(output, w, h, kernel_error);
#endif
WRITE_IMAGET(output, coord, values);
}
__kernel void filter_image_to_buffer(GLOBAL_WORK_GROUP_SIZE_DIM2
__kernel void filter_image_to_buffer(KERNEL_ERROR_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM2
__global DATA_TYPE *output, /* h, w, oc, ic */
__private const int filter_h,
__private const int filter_w,
...
...
@@ -100,7 +105,8 @@ __kernel void filter_image_to_buffer(GLOBAL_WORK_GROUP_SIZE_DIM2
}
}
__kernel void dw_filter_buffer_to_image(GLOBAL_WORK_GROUP_SIZE_DIM2
__kernel void dw_filter_buffer_to_image(KERNEL_ERROR_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM2
__global const DATA_TYPE *input, /* h, w, ic, m */
__private const int input_offset,
__private const int filter_w,
...
...
@@ -154,10 +160,14 @@ __kernel void dw_filter_buffer_to_image(GLOBAL_WORK_GROUP_SIZE_DIM2
}
int2 coord = (int2)(w, h);
#ifdef OUT_OF_RANGE_CHECK
check_out_of_range_for_image2d(output, w, h, kernel_error);
#endif
WRITE_IMAGET(output, coord, values);
}
__kernel void in_out_buffer_to_image(GLOBAL_WORK_GROUP_SIZE_DIM2
__kernel void in_out_buffer_to_image(KERNEL_ERROR_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM2
__global const DATA_TYPE *input, /* nhwc */
__private const int input_offset,
__private const int height,
...
...
@@ -195,10 +205,14 @@ __kernel void in_out_buffer_to_image(GLOBAL_WORK_GROUP_SIZE_DIM2
values = vload4(0, input + offset);
}
int2 coord = (int2)(w, h);
#ifdef OUT_OF_RANGE_CHECK
check_out_of_range_for_image2d(output, w, h, kernel_error);
#endif
WRITE_IMAGET(output, coord, values);
}
__kernel void in_out_image_to_buffer(GLOBAL_WORK_GROUP_SIZE_DIM2
__kernel void in_out_image_to_buffer(KERNEL_ERROR_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM2
__global DATA_TYPE *output, /* nhwc */
__private const int height,
__private const int width,
...
...
@@ -237,7 +251,8 @@ __kernel void in_out_image_to_buffer(GLOBAL_WORK_GROUP_SIZE_DIM2
}
}
__kernel void arg_buffer_to_image(GLOBAL_WORK_GROUP_SIZE_DIM2
__kernel void arg_buffer_to_image(KERNEL_ERROR_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM2
__global const DATA_TYPE *input, /* nhwc */
__private const int input_offset,
__private const int count,
...
...
@@ -269,10 +284,14 @@ __kernel void arg_buffer_to_image(GLOBAL_WORK_GROUP_SIZE_DIM2
values = vload4(0, input + offset);
}
int2 coord = (int2)(w, h);
#ifdef OUT_OF_RANGE_CHECK
check_out_of_range_for_image2d(output, w, h, kernel_error);
#endif
WRITE_IMAGET(output, coord, values);
}
__kernel void arg_image_to_buffer(GLOBAL_WORK_GROUP_SIZE_DIM2
__kernel void arg_image_to_buffer(KERNEL_ERROR_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM2
__global DATA_TYPE *output, /* nhwc */
__private const int count,
__read_only image2d_t input) {
...
...
@@ -305,7 +324,8 @@ __kernel void arg_image_to_buffer(GLOBAL_WORK_GROUP_SIZE_DIM2
}
__kernel void in_out_height_buffer_to_image(GLOBAL_WORK_GROUP_SIZE_DIM2
__kernel void in_out_height_buffer_to_image(KERNEL_ERROR_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM2
__global const DATA_TYPE *input, //nhwc
__private const int input_offset,
__private const int height,
...
...
@@ -344,10 +364,14 @@ __kernel void in_out_height_buffer_to_image(GLOBAL_WORK_GROUP_SIZE_DIM2
values.x = *(input + offset);
}
int2 coord = (int2)(w, h);
#ifdef OUT_OF_RANGE_CHECK
check_out_of_range_for_image2d(output, w, h, kernel_error);
#endif
WRITE_IMAGET(output, coord, values);
}
__kernel void in_out_height_image_to_buffer(GLOBAL_WORK_GROUP_SIZE_DIM2
__kernel void in_out_height_image_to_buffer(KERNEL_ERROR_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM2
__global DATA_TYPE *output, //nhwc
__private const int height,
__private const int width,
...
...
@@ -385,7 +409,8 @@ __kernel void in_out_height_image_to_buffer(GLOBAL_WORK_GROUP_SIZE_DIM2
}
__kernel void in_out_width_buffer_to_image(GLOBAL_WORK_GROUP_SIZE_DIM2
__kernel void in_out_width_buffer_to_image(KERNEL_ERROR_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM2
__global const DATA_TYPE *input, /* nhwc */
__private const int input_offset,
__private const int height,
...
...
@@ -423,11 +448,15 @@ __kernel void in_out_width_buffer_to_image(GLOBAL_WORK_GROUP_SIZE_DIM2
values.x = *(input + offset);
}
int2 coord = (int2)(w, h);
#ifdef OUT_OF_RANGE_CHECK
check_out_of_range_for_image2d(output, w, h, kernel_error);
#endif
WRITE_IMAGET(output, coord, values);
}
// only support 3x3 now
__kernel void winograd_filter_buffer_to_image(GLOBAL_WORK_GROUP_SIZE_DIM2
__kernel void winograd_filter_buffer_to_image(KERNEL_ERROR_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM2
__global const DATA_TYPE *input, //Oc, Ic, H, W
__private const int input_offset,
__private const int in_channels,
...
...
@@ -495,6 +524,11 @@ __kernel void winograd_filter_buffer_to_image(GLOBAL_WORK_GROUP_SIZE_DIM2
tu3[1] = tt + tu3[1] / 2;
int2 coord = (int2)(w, h);
#ifdef OUT_OF_RANGE_CHECK
check_out_of_range_for_image2d(output, coord.x, coord.y + out_channels * 15, kernel_error);
#endif
#pragma unroll
for (short i = 0; i < 4; ++i) {
WRITE_IMAGET(output, coord, tu0[i]);
...
...
@@ -518,7 +552,8 @@ __kernel void winograd_filter_buffer_to_image(GLOBAL_WORK_GROUP_SIZE_DIM2
}
// only support 3x3 now
__kernel void winograd_filter_image_to_buffer(GLOBAL_WORK_GROUP_SIZE_DIM2
__kernel void winograd_filter_image_to_buffer(KERNEL_ERROR_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM2
__global DATA_TYPE *output, //Oc, Ic, H, W
__private const int height,
__private const int width,
...
...
mace/kernels/opencl/cl/channel_shuffle.cl
浏览文件 @
9d8505f4
#
include
<common.h>
//
assume
channes_per_group
mod
4
=
0
&&
groups
mod
4
==
0
__kernel
void
channel_shuffle
(
GLOBAL_WORK_GROUP_SIZE_DIM3
__kernel
void
channel_shuffle
(
KERNEL_ERROR_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM3
__read_only
image2d_t
input,
__private
const
int
groups,
__private
const
int
channels_per_group,
...
...
@@ -49,6 +50,11 @@ __kernel void channel_shuffle(GLOBAL_WORK_GROUP_SIZE_DIM3
out_chan_data3
=
(
DATA_TYPE4
)(
in_chan_data0.w,
in_chan_data1.w,
in_chan_data2.w,
in_chan_data3.w
)
;
int
out_x
=
mad24
(
mad24
(
group_chan_blk_idx,
groups,
g_blk
)
,
width,
width_idx
)
;
#
ifdef
OUT_OF_RANGE_CHECK
check_out_of_range_for_image2d
(
output,
out_x
+
groups_blks_width
*
3
,
hb_idx,
kernel_error
)
;
#
endif
WRITE_IMAGET
(
output,
(
int2
)(
out_x,
hb_idx
)
,
out_chan_data0
)
;
out_x
+=
groups_blks_width
;
...
...
mace/kernels/opencl/cl/common.h
浏览文件 @
9d8505f4
...
...
@@ -17,6 +17,7 @@
#define READ_IMAGET CMD_TYPE(read_image, CMD_DATA_TYPE)
#define WRITE_IMAGET CMD_TYPE(write_image, CMD_DATA_TYPE)
#ifndef NON_UNIFORM_WORK_GROUP
#define GLOBAL_WORK_GROUP_SIZE_DIM2 \
...
...
@@ -34,6 +35,18 @@
#endif
#ifdef OUT_OF_RANGE_CHECK
#define KERNEL_ERROR_PARAMS \
__global char *kernel_error,
#else
#define KERNEL_ERROR_PARAMS
#endif
__constant
sampler_t
SAMPLER
=
CLK_NORMALIZED_COORDS_FALSE
|
CLK_ADDRESS_CLAMP
|
CLK_FILTER_NEAREST
;
...
...
@@ -61,4 +74,14 @@ inline DATA_TYPE4 do_activation(DATA_TYPE4 in,
return
out
;
}
inline
void
check_out_of_range_for_image2d
(
__write_only
image2d_t
image
,
__private
const
int
x
,
__private
const
int
y
,
global
char
*
kernel_error
)
{
int2
image_dim
=
get_image_dim
(
image
);
if
(
x
>=
image_dim
.
x
||
y
>=
image_dim
.
y
)
{
*
kernel_error
=
'1'
;
}
}
#endif // MACE_KERNELS_OPENCL_CL_COMMON_H_
mace/kernels/opencl/cl/concat.cl
浏览文件 @
9d8505f4
...
...
@@ -22,7 +22,8 @@ DATA_TYPE4 stitch_vector(DATA_TYPE4 left,
}
//
Supported
data
type:
half/float
__kernel
void
concat_channel
(
GLOBAL_WORK_GROUP_SIZE_DIM3
__kernel
void
concat_channel
(
KERNEL_ERROR_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM3
__read_only
image2d_t
input0,
__read_only
image2d_t
input1,
__private
const
int
input0_chan,
...
...
@@ -79,11 +80,17 @@ __kernel void concat_channel(GLOBAL_WORK_GROUP_SIZE_DIM3
}
#endif
WRITE_IMAGET(output, (int2)(mad24(chan_blk_idx, width, width_idx), hb_idx), data);
const int pos = mad24(chan_blk_idx, width, width_idx);
#ifdef OUT_OF_RANGE_CHECK
check_out_of_range_for_image2d(output, pos, hb_idx, kernel_error);
#endif
WRITE_IMAGET(output, (int2)(pos, hb_idx), data);
}
// Required: All input channels are divisible by 4
__kernel void concat_channel_multi(GLOBAL_WORK_GROUP_SIZE_DIM3
__kernel void concat_channel_multi(KERNEL_ERROR_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM3
__read_only image2d_t input,
__private const int chan_blk_offset,
__write_only image2d_t output) {
...
...
@@ -106,7 +113,12 @@ __kernel void concat_channel_multi(GLOBAL_WORK_GROUP_SIZE_DIM3
SAMPLER,
(int2)(mad24(chan_blk_idx, width, width_idx), hb_idx));
WRITE_IMAGET(output, (int2)(mad24(chan_blk_idx + chan_blk_offset, width, width_idx), hb_idx), data);
const int pos = mad24(chan_blk_idx + chan_blk_offset, width, width_idx);
#ifdef OUT_OF_RANGE_CHECK
check_out_of_range_for_image2d(output, pos, hb_idx, kernel_error);
#endif
WRITE_IMAGET(output, (int2)(pos, hb_idx), data);
}
//__kernel void concat_width(__read_only image2d_t input0,
...
...
mace/kernels/opencl/cl/conv_2d.cl
浏览文件 @
9d8505f4
#
include
<common.h>
__kernel
void
conv_2d
(
GLOBAL_WORK_GROUP_SIZE_DIM3
__kernel
void
conv_2d
(
KERNEL_ERROR_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM3
__read_only
image2d_t
input,
/*
[c%4
*
w
*
c/4,
h
*
b]
*/
__read_only
image2d_t
filter,
/*
cout%4
*
cin,
kh
*
kw
*
cout/4
*/
#
ifdef
BIAS
...
...
@@ -126,19 +127,32 @@ __kernel void conv_2d(GLOBAL_WORK_GROUP_SIZE_DIM3
#
endif
const
int
out_x_base
=
mul24
(
out_ch_blk,
out_width
)
;
int
w
=
out_w_blk
;
#
ifdef
OUT_OF_RANGE_CHECK
check_out_of_range_for_image2d
(
output,
out_x_base
+
w,
out_hb,
kernel_error
)
;
#
endif
WRITE_IMAGET
(
output,
(
int2
)(
out_x_base
+
w,
out_hb
)
,
out0
)
;
w
+=
out_w_blks
;
if
(
w
>=
out_width
)
return
;
#
ifdef
OUT_OF_RANGE_CHECK
check_out_of_range_for_image2d
(
output,
out_x_base
+
w,
out_hb,
kernel_error
)
;
#
endif
WRITE_IMAGET
(
output,
(
int2
)(
out_x_base
+
w,
out_hb
)
,
out1
)
;
w
+=
out_w_blks
;
if
(
w
>=
out_width
)
return
;
#
ifdef
OUT_OF_RANGE_CHECK
check_out_of_range_for_image2d
(
output,
out_x_base
+
w,
out_hb,
kernel_error
)
;
#
endif
WRITE_IMAGET
(
output,
(
int2
)(
out_x_base
+
w,
out_hb
)
,
out2
)
;
w
+=
out_w_blks
;
if
(
w
>=
out_width
)
return
;
#
ifdef
OUT_OF_RANGE_CHECK
check_out_of_range_for_image2d
(
output,
out_x_base
+
w,
out_hb,
kernel_error
)
;
#
endif
WRITE_IMAGET
(
output,
(
int2
)(
out_x_base
+
w,
out_hb
)
,
out3
)
;
}
mace/kernels/opencl/cl/conv_2d_1x1.cl
浏览文件 @
9d8505f4
#
include
<common.h>
__kernel
void
conv_2d_1x1
(
GLOBAL_WORK_GROUP_SIZE_DIM3
__kernel
void
conv_2d_1x1
(
KERNEL_ERROR_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM3
__read_only
image2d_t
input,
/*
[c%4
*
w
*
c/4,
h
*
b]
*/
__read_only
image2d_t
filter,
/*
cout%4
*
cin,
cout/4
*/
#
ifdef
BIAS
...
...
@@ -104,17 +105,29 @@ __kernel void conv_2d_1x1(GLOBAL_WORK_GROUP_SIZE_DIM3
const
int
out_x_base
=
mul24
(
out_ch_blk,
width
)
;
int
out_x_idx
=
out_w_blk
;
#
ifdef
OUT_OF_RANGE_CHECK
check_out_of_range_for_image2d
(
output,
out_x_base
+
out_x_idx,
out_hb,
kernel_error
)
;
#
endif
WRITE_IMAGET
(
output,
(
int2
)(
out_x_base
+
out_x_idx,
out_hb
)
,
out0
)
;
out_x_idx
+=
out_w_blks
;
if
(
out_x_idx
>=
width
)
return
;
#
ifdef
OUT_OF_RANGE_CHECK
check_out_of_range_for_image2d
(
output,
out_x_base
+
out_x_idx,
out_hb,
kernel_error
)
;
#
endif
WRITE_IMAGET
(
output,
(
int2
)(
out_x_base
+
out_x_idx,
out_hb
)
,
out1
)
;
out_x_idx
+=
out_w_blks
;
if
(
out_x_idx
>=
width
)
return
;
#
ifdef
OUT_OF_RANGE_CHECK
check_out_of_range_for_image2d
(
output,
out_x_base
+
out_x_idx,
out_hb,
kernel_error
)
;
#
endif
WRITE_IMAGET
(
output,
(
int2
)(
out_x_base
+
out_x_idx,
out_hb
)
,
out2
)
;
out_x_idx
+=
out_w_blks
;
if
(
out_x_idx
>=
width
)
return
;
#
ifdef
OUT_OF_RANGE_CHECK
check_out_of_range_for_image2d
(
output,
out_x_base
+
out_x_idx,
out_hb,
kernel_error
)
;
#
endif
WRITE_IMAGET
(
output,
(
int2
)(
out_x_base
+
out_x_idx,
out_hb
)
,
out3
)
;
}
mace/kernels/opencl/cl/conv_2d_3x3.cl
浏览文件 @
9d8505f4
#
include
<common.h>
__kernel
void
conv_2d_3x3
(
GLOBAL_WORK_GROUP_SIZE_DIM3
__kernel
void
conv_2d_3x3
(
KERNEL_ERROR_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM3
__read_only
image2d_t
input,
/*
[c%4
*
w
*
c/4,
h
*
b]
*/
__read_only
image2d_t
filter,
/*
cout%4
*
cin
,
kh
*
kw
*
cout/4
*/
#
ifdef
BIAS
...
...
@@ -135,30 +136,45 @@ __kernel void conv_2d_3x3(GLOBAL_WORK_GROUP_SIZE_DIM3
const
int
out_x_base
=
mul24
(
out_ch_blk,
out_width
)
;
int
w
=
out_w_blk
;
#
ifdef
OUT_OF_RANGE_CHECK
check_out_of_range_for_image2d
(
output,
out_x_base
+
w,
out_hb,
kernel_error
)
;
#
endif
WRITE_IMAGET
(
output,
(
int2
)(
out_x_base
+
w,
out_hb
)
,
out0
)
;
w
+=
out_w_blks
;
if
(
w
>=
out_width
)
return
;
#
ifdef
OUT_OF_RANGE_CHECK
check_out_of_range_for_image2d
(
output,
out_x_base
+
w,
out_hb,
kernel_error
)
;
#
endif
WRITE_IMAGET
(
output,
(
int2
)(
out_x_base
+
w,
out_hb
)
,
out1
)
;
w
+=
out_w_blks
;
if
(
w
>=
out_width
)
return
;
#
ifdef
OUT_OF_RANGE_CHECK
check_out_of_range_for_image2d
(
output,
out_x_base
+
w,
out_hb,
kernel_error
)
;
#
endif
WRITE_IMAGET
(
output,
(
int2
)(
out_x_base
+
w,
out_hb
)
,
out2
)
;
w
+=
out_w_blks
;
if
(
w
>=
out_width
)
return
;
#
ifdef
OUT_OF_RANGE_CHECK
check_out_of_range_for_image2d
(
output,
out_x_base
+
w,
out_hb,
kernel_error
)
;
#
endif
WRITE_IMAGET
(
output,
(
int2
)(
out_x_base
+
w,
out_hb
)
,
out3
)
;
w
+=
out_w_blks
;
if
(
w
>=
out_width
)
return
;
#
ifdef
OUT_OF_RANGE_CHECK
check_out_of_range_for_image2d
(
output,
out_x_base
+
w,
out_hb,
kernel_error
)
;
#
endif
WRITE_IMAGET
(
output,
(
int2
)(
out_x_base
+
w,
out_hb
)
,
out4
)
;
...
...
mace/kernels/opencl/cl/cwise.cl
浏览文件 @
9d8505f4
#
include
<common.h>
__kernel
void
cwise
(
GLOBAL_WORK_GROUP_SIZE_DIM2
__kernel
void
cwise
(
KERNEL_ERROR_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM2
__read_only
image2d_t
input,
/*
[c%4
*
w
*
c/4,
h
*
b]
*/
__private
const
float
value,
__write_only
image2d_t
output
)
{
...
...
@@ -43,5 +44,8 @@ __kernel void cwise(GLOBAL_WORK_GROUP_SIZE_DIM2
out.w
=
fabs
(
in0.w
)
;
#
endif
#
ifdef
OUT_OF_RANGE_CHECK
check_out_of_range_for_image2d
(
output,
w,
hb,
kernel_error
)
;
#
endif
WRITE_IMAGET
(
output,
(
int2
)(
w,
hb
)
,
out
)
;
}
mace/kernels/opencl/cl/depth_to_space.cl
浏览文件 @
9d8505f4
#
include
<common.h>
__kernel
void
depth_to_space
(
GLOBAL_WORK_GROUP_SIZE_DIM3
__kernel
void
depth_to_space
(
KERNEL_ERROR_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM3
__read_only
image2d_t
input,
__private
const
int
block_size,
__private
const
int
input_h
eight
,
__private
const
int
input_h
b
,
__private
const
int
input_width,
__private
const
int
input_depth_blocks,
__private
const
int
output_height,
__private
const
int
output_width,
__private
const
int
output_depth_blocks,
__write_only
image2d_t
output
)
{
const
int
out_d
=
get_global_id
(
0
)
;
const
int
out_w
=
get_global_id
(
1
)
;
const
int
out_h
=
get_global_id
(
2
)
;
const
int
out_h
b
=
get_global_id
(
2
)
;
if
(
out_d
>=
output_depth_blocks
|
| out_h >= output_height || out_w >= output_width)
#
ifndef
NON_UNIFORM_WORK_GROUP
if
(
out_d
>=
global_size_dim0
|
| out_w >= global_size_dim1
|| out_hb >= global_size_dim2) {
return;
}
#endif
const int out_pos = mad24(out_d, output_width, out_w);
const int in_h
= out_h
/ block_size;
const int offset_h = out_h % block_size;
const int in_h
b = out_hb
/ block_size;
const int offset_h = out_h
b
% block_size;
const int in_w = out_w / block_size;
const int offset_w = out_w % block_size;
const int offset_d = (offset_h * block_size + offset_w) * output_depth_blocks;
const int in_d = out_d + offset_d;
if (in_h
>= input_height || in_w >= input_width || in_d >= input_depth_blocks)
if (in_h
b >= input_hb || in_w >= input_width || in_d >= input_depth_blocks) {
return;
}
const int in_pos = mad24(in_d, input_width, in_w);
DATA_TYPE4 in_data = READ_IMAGET(input, SAMPLER, (int2)(in_pos, in_h));
WRITE_IMAGET(output, (int2)(out_pos, out_h), in_data);
DATA_TYPE4 in_data = READ_IMAGET(input, SAMPLER, (int2)(in_pos, in_hb));
#ifdef OUT_OF_RANGE_CHECK
check_out_of_range_for_image2d(output, out_pos, out_hb, kernel_error);
#endif
WRITE_IMAGET(output, (int2)(out_pos, out_hb), in_data);
}
__kernel void space_to_depth(
__kernel void space_to_depth(
KERNEL_ERROR_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM3
__read_only image2d_t input,
__private const int block_size,
__private const int input_height,
__private const int input_width,
__private const int input_depth_blocks,
__private const int output_h
eight
,
__private const int output_h
b
,
__private const int output_width,
__private const int output_depth_blocks,
__write_only image2d_t output) {
const int d = get_global_id(0);
const int w = get_global_id(1);
const int h = get_global_id(2);
const int h
b
= get_global_id(2);
if (h >= input_height || w >= input_width || d >= input_depth_blocks)
#ifndef NON_UNIFORM_WORK_GROUP
if (d >= global_size_dim0 || w >= global_size_dim1
|| hb >= global_size_dim2) {
return;
}
#endif
const int in_pos = mad24(d, input_width, w);
const int out_h
= h
/ block_size;
const int offset_h = h % block_size;
const int out_h
b = hb
/ block_size;
const int offset_h = h
b
% block_size;
const int out_w = w / block_size;
const int offset_w = w % block_size;
const int offset_d = (offset_h * block_size + offset_w) * input_depth_blocks;
const int out_d = d + offset_d;
if (out_d >= output_depth_blocks || out_h
>= output_height |
|
out_w
>=
output_width
)
if (out_d >= output_depth_blocks || out_h
b >= output_hb |
|
out_w
>=
output_width
)
{
return
;
}
const
int
out_pos
=
mad24
(
out_d,
output_width,
out_w
)
;
DATA_TYPE4
in_data
=
READ_IMAGET
(
input,
SAMPLER,
(
int2
)(
in_pos,
h
))
;
WRITE_IMAGET
(
output,
(
int2
)(
out_pos,
out_h
)
,
in_data
)
;
DATA_TYPE4
in_data
=
READ_IMAGET
(
input,
SAMPLER,
(
int2
)(
in_pos,
hb
))
;
#
ifdef
OUT_OF_RANGE_CHECK
check_out_of_range_for_image2d
(
output,
out_pos,
out_hb,
kernel_error
)
;
#
endif
WRITE_IMAGET
(
output,
(
int2
)(
out_pos,
out_hb
)
,
in_data
)
;
}
mace/kernels/opencl/cl/depthwise_conv2d.cl
浏览文件 @
9d8505f4
#
include
<common.h>
//
Only
multiplier
=
1
is
supported
__kernel
void
depthwise_conv2d
(
GLOBAL_WORK_GROUP_SIZE_DIM3
__kernel
void
depthwise_conv2d
(
KERNEL_ERROR_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM3
__read_only
image2d_t
input,
/*
[c%4
*
w
*
c/4,
h
*
b]
*/
__read_only
image2d_t
filter,
/*
cout%4
*
kh
*
kw
*
m,
cin/4
*/
#
ifdef
BIAS
...
...
@@ -122,22 +123,35 @@ __kernel void depthwise_conv2d(GLOBAL_WORK_GROUP_SIZE_DIM3
const short out_x_base = mul24(out_ch_blk, out_width);
short w = out_w_blk;
#ifdef OUT_OF_RANGE_CHECK
check_out_of_range_for_image2d(output, out_x_base + w, out_hb, kernel_error);
#endif
WRITE_IMAGET(output, (int2)(out_x_base + w, out_hb), out0);
w += out_w_blks;
if (w >= out_width) return;
#ifdef OUT_OF_RANGE_CHECK
check_out_of_range_for_image2d(output, out_x_base + w, out_hb, kernel_error);
#endif
WRITE_IMAGET(output, (int2)(out_x_base + w, out_hb), out1);
w += out_w_blks;
if (w >= out_width) return;
#ifdef OUT_OF_RANGE_CHECK
check_out_of_range_for_image2d(output, out_x_base + w, out_hb, kernel_error);
#endif
WRITE_IMAGET(output, (int2)(out_x_base + w, out_hb), out2);
w += out_w_blks;
if (w >= out_width) return;
#ifdef OUT_OF_RANGE_CHECK
check_out_of_range_for_image2d(output, out_x_base + w, out_hb, kernel_error);
#endif
WRITE_IMAGET(output, (int2)(out_x_base + w, out_hb), out3);
}
__kernel void depthwise_conv2d_s1(GLOBAL_WORK_GROUP_SIZE_DIM3
__kernel void depthwise_conv2d_s1(KERNEL_ERROR_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM3
__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
__read_only image2d_t filter, /* cout%4 * kh * kw * m, cin/4 */
#ifdef BIAS
...
...
@@ -247,17 +261,29 @@ __kernel void depthwise_conv2d_s1(GLOBAL_WORK_GROUP_SIZE_DIM3
const
short
out_x_base
=
mul24
(
out_ch_blk,
out_width
)
;
short
w
=
out_w_blk
;
#
ifdef
OUT_OF_RANGE_CHECK
check_out_of_range_for_image2d
(
output,
out_x_base
+
w,
out_hb,
kernel_error
)
;
#
endif
WRITE_IMAGET
(
output,
(
int2
)(
out_x_base
+
w,
out_hb
)
,
out0
)
;
w
+=
1
;
if
(
w
>=
out_width
)
return
;
#
ifdef
OUT_OF_RANGE_CHECK
check_out_of_range_for_image2d
(
output,
out_x_base
+
w,
out_hb,
kernel_error
)
;
#
endif
WRITE_IMAGET
(
output,
(
int2
)(
out_x_base
+
w,
out_hb
)
,
out1
)
;
w
+=
1
;
if
(
w
>=
out_width
)
return
;
#
ifdef
OUT_OF_RANGE_CHECK
check_out_of_range_for_image2d
(
output,
out_x_base
+
w,
out_hb,
kernel_error
)
;
#
endif
WRITE_IMAGET
(
output,
(
int2
)(
out_x_base
+
w,
out_hb
)
,
out2
)
;
w
+=
1
;
if
(
w
>=
out_width
)
return
;
#
ifdef
OUT_OF_RANGE_CHECK
check_out_of_range_for_image2d
(
output,
out_x_base
+
w,
out_hb,
kernel_error
)
;
#
endif
WRITE_IMAGET
(
output,
(
int2
)(
out_x_base
+
w,
out_hb
)
,
out3
)
;
}
mace/kernels/opencl/cl/eltwise.cl
浏览文件 @
9d8505f4
#
include
<common.h>
__kernel
void
eltwise
(
GLOBAL_WORK_GROUP_SIZE_DIM2
__kernel
void
eltwise
(
KERNEL_ERROR_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM2
__read_only
image2d_t
input0,
/*
[c%4
*
w
*
c/4,
h
*
b]
*/
__read_only
image2d_t
input1,
#
ifdef
COEFF_SUM
...
...
@@ -36,5 +37,8 @@ __kernel void eltwise(GLOBAL_WORK_GROUP_SIZE_DIM2
out
=
in0
-
in1
;
#
endif
#
ifdef
OUT_OF_RANGE_CHECK
check_out_of_range_for_image2d
(
output,
w,
hb,
kernel_error
)
;
#
endif
WRITE_IMAGET
(
output,
(
int2
)(
w,
hb
)
,
out
)
;
}
mace/kernels/opencl/cl/fully_connected.cl
浏览文件 @
9d8505f4
#
include
<common.h>
//
output
=
weight
*
input
+
bias
__kernel
void
fully_connected
(
GLOBAL_WORK_GROUP_SIZE_DIM2
__kernel
void
fully_connected
(
KERNEL_ERROR_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM2
__read_only
image2d_t
input,
__read_only
image2d_t
weight,
#
ifdef
BIAS
...
...
@@ -58,11 +59,16 @@ __kernel void fully_connected(GLOBAL_WORK_GROUP_SIZE_DIM2
#if defined(USE_RELU) || defined(USE_RELUX) || defined(USE_TANH) || defined(USE_SIGMOID)
result = do_activation(result, relux_max_limit);
#endif
#ifdef OUT_OF_RANGE_CHECK
check_out_of_range_for_image2d(output, out_blk_idx, batch_idx, kernel_error);
#endif
WRITE_IMAGET(output, (int2)(out_blk_idx, batch_idx), result);
}
// output = weight * input + bias
__kernel void fully_connected_width(GLOBAL_WORK_GROUP_SIZE_DIM3
__kernel void fully_connected_width(KERNEL_ERROR_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM3
__read_only image2d_t input,
__read_only image2d_t weight,
#ifdef BIAS
...
...
@@ -147,6 +153,10 @@ __kernel void fully_connected_width(GLOBAL_WORK_GROUP_SIZE_DIM3
#if defined(USE_RELU) || defined(USE_RELUX) || defined(USE_TANH) |
|
defined
(
USE_SIGMOID
)
result
=
do_activation
(
result,
relux_max_limit
)
;
#
endif
#
ifdef
OUT_OF_RANGE_CHECK
check_out_of_range_for_image2d
(
output,
out_blk_idx,
batch_idx,
kernel_error
)
;
#
endif
WRITE_IMAGET
(
output,
(
int2
)(
out_blk_idx,
batch_idx
)
,
result
)
;
}
}
mace/kernels/opencl/cl/matmul.cl
浏览文件 @
9d8505f4
#
include
<common.h>
//
C
=
A
*
B
__kernel
void
matmul
(
GLOBAL_WORK_GROUP_SIZE_DIM2
__kernel
void
matmul
(
KERNEL_ERROR_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM2
__read_only
image2d_t
A,
__read_only
image2d_t
B,
__write_only
image2d_t
C,
...
...
@@ -46,11 +47,27 @@ __kernel void matmul(GLOBAL_WORK_GROUP_SIZE_DIM2
c3
+=
(
DATA_TYPE4
)(
dot
(
a0,
b3
)
,
dot
(
a1,
b3
)
,
dot
(
a2,
b3
)
,
dot
(
a3,
b3
))
;
}
#
ifdef
OUT_OF_RANGE_CHECK
check_out_of_range_for_image2d
(
C,
gx,
gy,
kernel_error
)
;
#
endif
WRITE_IMAGET
(
C,
(
int2
)(
gx,
gy
)
,
c0
)
;
if
((
gx
+
1
)
>=
N
)
return
;
#
ifdef
OUT_OF_RANGE_CHECK
check_out_of_range_for_image2d
(
C,
gx
+
1
,
gy,
kernel_error
)
;
#
endif
WRITE_IMAGET
(
C,
(
int2
)(
gx
+
1
,
gy
)
,
c1
)
;
if
((
gx
+
2
)
>=
N
)
return
;
#
ifdef
OUT_OF_RANGE_CHECK
check_out_of_range_for_image2d
(
C,
gx
+
2
,
gy,
kernel_error
)
;
#
endif
WRITE_IMAGET
(
C,
(
int2
)(
gx
+
2
,
gy
)
,
c2
)
;
if
((
gx
+
3
)
>=
N
)
return
;
#
ifdef
OUT_OF_RANGE_CHECK
check_out_of_range_for_image2d
(
C,
gx
+
3
,
gy,
kernel_error
)
;
#
endif
WRITE_IMAGET
(
C,
(
int2
)(
gx
+
3
,
gy
)
,
c3
)
;
}
mace/kernels/opencl/cl/pooling.cl
浏览文件 @
9d8505f4
...
...
@@ -19,7 +19,8 @@ inline int calculate_avg_block_size(const int pool_size,
}
//
Supported
data
type:
half/float
__kernel
void
pooling
(
GLOBAL_WORK_GROUP_SIZE_DIM3
__kernel
void
pooling
(
KERNEL_ERROR_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM3
__read_only
image2d_t
input,
__private
const
int
in_height,
__private
const
int
in_width,
...
...
@@ -94,5 +95,9 @@ __kernel void pooling(GLOBAL_WORK_GROUP_SIZE_DIM3
}
#
endif
WRITE_IMAGET
(
output,
(
int2
)(
mad24
(
out_chan_idx,
out_width,
out_width_idx
)
,
out_hb_idx
)
,
res
)
;
const
int
pos
=
mad24
(
out_chan_idx,
out_width,
out_width_idx
)
;
#
ifdef
OUT_OF_RANGE_CHECK
check_out_of_range_for_image2d
(
output,
pos,
out_hb_idx,
kernel_error
)
;
#
endif
WRITE_IMAGET
(
output,
(
int2
)(
pos,
out_hb_idx
)
,
res
)
;
}
mace/kernels/opencl/cl/resize_bilinear.cl
浏览文件 @
9d8505f4
#
include
<common.h>
__kernel
void
resize_bilinear_nocache
(
GLOBAL_WORK_GROUP_SIZE_DIM3
__kernel
void
resize_bilinear_nocache
(
KERNEL_ERROR_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM3
__read_only
image2d_t
input,
/*
[c%4
*
w
*
c/4,
h
*
b]
*/
__write_only
image2d_t
output,
__private
const
float
height_scale,
...
...
@@ -56,6 +57,10 @@ __kernel void resize_bilinear_nocache(GLOBAL_WORK_GROUP_SIZE_DIM3
const
int
out_w_offset
=
mul24
(
ch_blk,
out_width
)
;
const
int
out_h_offset
=
mul24
(
b,
out_height
)
;
#
ifdef
OUT_OF_RANGE_CHECK
check_out_of_range_for_image2d
(
output,
out_w_offset
+
w,
out_h_offset
+
h,
kernel_error
)
;
#
endif
WRITE_IMAGET
(
output,
(
int2
)(
out_w_offset
+
w,
out_h_offset
+
h
)
,
out
)
;
}
mace/kernels/opencl/cl/slice.cl
浏览文件 @
9d8505f4
#
include
<common.h>
__kernel
void
slice
(
GLOBAL_WORK_GROUP_SIZE_DIM3
__kernel
void
slice
(
KERNEL_ERROR_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM3
__read_only
image2d_t
input,
__private
const
int
chan_blk_offset,
__write_only
image2d_t
output
)
{
...
...
@@ -21,6 +22,11 @@ __kernel void slice(GLOBAL_WORK_GROUP_SIZE_DIM3
DATA_TYPE4
data
=
READ_IMAGET
(
input,
SAMPLER,
(
int2
)(
mad24
(
chan_blk_idx
+
chan_blk_offset,
width,
width_idx
)
,
hb_idx
))
;
const
int
pos
=
mad24
(
chan_blk_idx,
width,
width_idx
)
;
#
ifdef
OUT_OF_RANGE_CHECK
check_out_of_range_for_image2d
(
output,
pos,
hb_idx,
kernel_error
)
;
#
endif
WRITE_IMAGET
(
output,
(
int2
)(
mad24
(
chan_blk_idx,
width,
width_idx
)
,
hb_idx
)
,
data
)
;
(
int2
)(
pos
,
hb_idx
)
,
data
)
;
}
mace/kernels/opencl/cl/softmax.cl
浏览文件 @
9d8505f4
#
include
<common.h>
__kernel
void
softmax
(
GLOBAL_WORK_GROUP_SIZE_DIM3
__kernel
void
softmax
(
KERNEL_ERROR_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM3
__read_only
image2d_t
input,
__private
const
int
channels,
__private
const
int
remain_channels,
...
...
@@ -84,5 +85,8 @@ __kernel void softmax(GLOBAL_WORK_GROUP_SIZE_DIM3
data
=
native_exp
(
data
)
/
sum
;
}
#
ifdef
OUT_OF_RANGE_CHECK
check_out_of_range_for_image2d
(
output,
pos,
hb_idx,
kernel_error
)
;
#
endif
WRITE_IMAGET
(
output,
(
int2
)(
pos,
hb_idx
)
,
data
)
;
}
mace/kernels/opencl/cl/space_to_batch.cl
浏览文件 @
9d8505f4
#
include
<common.h>
__kernel
void
space_to_batch
(
GLOBAL_WORK_GROUP_SIZE_DIM3
__kernel
void
space_to_batch
(
KERNEL_ERROR_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM3
__read_only
image2d_t
space_data,
__write_only
image2d_t
batch_data,
__private
const
int
block_height,
...
...
@@ -44,10 +45,15 @@ __kernel void space_to_batch(GLOBAL_WORK_GROUP_SIZE_DIM3
DATA_TYPE4 value = READ_IMAGET(space_data, SAMPLER, space_coord);
int2 batch_coord = (int2)(mul24(chan_idx, batch_width) + batch_w_idx, batch_hb_idx);
#ifdef OUT_OF_RANGE_CHECK
check_out_of_range_for_image2d(batch_data, batch_coord.x, batch_coord.y, kernel_error);
#endif
WRITE_IMAGET(batch_data, batch_coord, value);
}
__kernel void batch_to_space(GLOBAL_WORK_GROUP_SIZE_DIM3
__kernel void batch_to_space(KERNEL_ERROR_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM3
__read_only image2d_t batch_data,
__write_only image2d_t space_data,
__private const int block_height,
...
...
@@ -87,6 +93,10 @@ __kernel void batch_to_space(GLOBAL_WORK_GROUP_SIZE_DIM3
int2
space_coord
=
(
int2
)(
mul24
(
chan_idx,
space_width
)
+
space_w_idx,
space_b_idx
*
space_height
+
space_h_idx
)
;
#
ifdef
OUT_OF_RANGE_CHECK
check_out_of_range_for_image2d
(
space_data,
space_coord.x,
space_coord.y,
kernel_error
)
;
#
endif
WRITE_IMAGET
(
space_data,
space_coord,
value
)
;
}
}
mace/kernels/opencl/cl/winograd_transform.cl
浏览文件 @
9d8505f4
#
include
<common.h>
__kernel
void
winograd_transform_2x2
(
GLOBAL_WORK_GROUP_SIZE_DIM2
__kernel
void
winograd_transform_2x2
(
KERNEL_ERROR_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM2
__read_only
image2d_t
input,
__write_only
image2d_t
output,
__private
const
int
in_height,
...
...
@@ -93,6 +94,9 @@ __kernel void winograd_transform_2x2(GLOBAL_WORK_GROUP_SIZE_DIM2
input3[2] = tv3[2] - tv3[1];
input3[3] = tv3[1] - tv3[3];
#ifdef OUT_OF_RANGE_CHECK
check_out_of_range_for_image2d(output, out_width_idx, chan_blk_idx + chan_blk_idx * 15, kernel_error);
#endif
#pragma unroll
for (short i = 0; i < 4; ++i) {
WRITE_IMAGET(output, (int2)(out_width_idx, chan_blk_idx), input0[i]);
...
...
@@ -115,7 +119,8 @@ __kernel void winograd_transform_2x2(GLOBAL_WORK_GROUP_SIZE_DIM2
}
}
__kernel void winograd_inverse_transform_2x2(GLOBAL_WORK_GROUP_SIZE_DIM2
__kernel void winograd_inverse_transform_2x2(KERNEL_ERROR_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM2
__read_only image2d_t input,
#ifdef BIAS
__read_only image2d_t bias, /* cout%4 * cout/4 */
...
...
@@ -208,18 +213,30 @@ __kernel void winograd_inverse_transform_2x2(GLOBAL_WORK_GROUP_SIZE_DIM2
in1[1]
=
do_activation
(
in1[1],
relux_max_limit
)
;
#
endif
#
ifdef
OUT_OF_RANGE_CHECK
check_out_of_range_for_image2d
(
output,
coord_x,
coord_y,
kernel_error
)
;
#
endif
WRITE_IMAGET
(
output,
(
int2
)(
coord_x,
coord_y
)
,
in0[0]
)
;
t
=
0
;
if
(
out_width_idx
+
1
<
out_width
)
{
#
ifdef
OUT_OF_RANGE_CHECK
check_out_of_range_for_image2d
(
output,
coord_x
+
1
,
coord_y,
kernel_error
)
;
#
endif
WRITE_IMAGET
(
output,
(
int2
)(
coord_x
+
1
,
coord_y
)
,
in0[1]
)
;
t
+=
1
;
}
if
(
out_height_idx
+
1
<
out_height
)
{
#
ifdef
OUT_OF_RANGE_CHECK
check_out_of_range_for_image2d
(
output,
coord_x,
coord_y
+
1
,
kernel_error
)
;
#
endif
WRITE_IMAGET
(
output,
(
int2
)(
coord_x,
coord_y
+
1
)
,
in1[0]
)
;
t
+=
1
;
}
if
(
t
==
2
)
{
#
ifdef
OUT_OF_RANGE_CHECK
check_out_of_range_for_image2d
(
output,
coord_x
+
1
,
coord_y
+
1
,
kernel_error
)
;
#
endif
WRITE_IMAGET
(
output,
(
int2
)(
coord_x
+
1
,
coord_y
+
1
)
,
in1[1]
)
;
}
...
...
mace/kernels/opencl/concat.cc
浏览文件 @
9d8505f4
...
...
@@ -18,7 +18,8 @@ static void Concat2(cl::Kernel *kernel,
std
::
vector
<
index_t
>
*
prev_input_shape
,
Tensor
*
output
,
StatsFuture
*
future
,
uint32_t
*
kwg_size
)
{
uint32_t
*
kwg_size
,
std
::
unique_ptr
<
BufferBase
>
*
kernel_error
)
{
const
index_t
batch
=
output
->
dim
(
0
);
const
index_t
height
=
output
->
dim
(
1
);
const
index_t
width
=
output
->
dim
(
2
);
...
...
@@ -36,6 +37,14 @@ static void Concat2(cl::Kernel *kernel,
std
::
set
<
std
::
string
>
built_options
;
std
::
string
kernel_name
=
MACE_OBFUSCATE_SYMBOL
(
"concat_channel"
);
built_options
.
emplace
(
"-Dconcat_channel="
+
kernel_name
);
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
built_options
.
emplace
(
"-DOUT_OF_RANGE_CHECK"
);
*
kernel_error
=
std
::
move
(
std
::
unique_ptr
<
Buffer
>
(
new
Buffer
(
GetDeviceAllocator
(
DeviceType
::
OPENCL
),
1
)));
(
*
kernel_error
)
->
Map
(
nullptr
);
*
((
*
kernel_error
)
->
mutable_data
<
char
>
())
=
'0'
;
(
*
kernel_error
)
->
UnMap
();
}
if
(
runtime
->
IsNonUniformWorkgroupsSupported
())
{
built_options
.
emplace
(
"-DNON_UNIFORM_WORK_GROUP"
);
}
...
...
@@ -56,6 +65,10 @@ static void Concat2(cl::Kernel *kernel,
}
if
(
!
IsVecEqual
(
*
prev_input_shape
,
input0
->
shape
()))
{
uint32_t
idx
=
0
;
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
kernel
->
setArg
(
idx
++
,
*
(
static_cast
<
cl
::
Buffer
*>
((
*
kernel_error
)
->
buffer
())));
}
if
(
!
runtime
->
IsNonUniformWorkgroupsSupported
())
{
kernel
->
setArg
(
idx
++
,
gws
[
0
]);
kernel
->
setArg
(
idx
++
,
gws
[
1
]);
...
...
@@ -77,6 +90,13 @@ static void Concat2(cl::Kernel *kernel,
ss
<<
"concat_opencl_kernel_"
<<
output
->
dim
(
0
)
<<
"_"
<<
output
->
dim
(
1
)
<<
"_"
<<
output
->
dim
(
2
)
<<
"_"
<<
output
->
dim
(
3
);
TuningOrRun3DKernel
(
*
kernel
,
ss
.
str
(),
gws
,
lws
,
future
);
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
(
*
kernel_error
)
->
Map
(
nullptr
);
char
*
kerror_code
=
(
*
kernel_error
)
->
mutable_data
<
char
>
();
MACE_CHECK
(
*
kerror_code
==
'0'
)
<<
"Kernel error code: "
<<
*
kerror_code
;
(
*
kernel_error
)
->
UnMap
();
}
}
static
void
ConcatN
(
cl
::
Kernel
*
kernel
,
...
...
@@ -84,7 +104,8 @@ static void ConcatN(cl::Kernel *kernel,
const
DataType
dt
,
Tensor
*
output
,
StatsFuture
*
future
,
uint32_t
*
kwg_size
)
{
uint32_t
*
kwg_size
,
std
::
unique_ptr
<
BufferBase
>
*
kernel_error
)
{
const
index_t
batch
=
output
->
dim
(
0
);
const
index_t
height
=
output
->
dim
(
1
);
const
index_t
width
=
output
->
dim
(
2
);
...
...
@@ -98,6 +119,14 @@ static void ConcatN(cl::Kernel *kernel,
built_options
.
emplace
(
"-Dconcat_channel_multi="
+
kernel_name
);
built_options
.
emplace
(
"-DDATA_TYPE="
+
DtToCLDt
(
dt
));
built_options
.
emplace
(
"-DCMD_DATA_TYPE="
+
DtToCLCMDDt
(
dt
));
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
built_options
.
emplace
(
"-DOUT_OF_RANGE_CHECK"
);
*
kernel_error
=
std
::
move
(
std
::
unique_ptr
<
Buffer
>
(
new
Buffer
(
GetDeviceAllocator
(
DeviceType
::
OPENCL
),
1
)));
(
*
kernel_error
)
->
Map
(
nullptr
);
*
((
*
kernel_error
)
->
mutable_data
<
char
>
())
=
'0'
;
(
*
kernel_error
)
->
UnMap
();
}
if
(
runtime
->
IsNonUniformWorkgroupsSupported
())
{
built_options
.
emplace
(
"-DNON_UNIFORM_WORK_GROUP"
);
}
...
...
@@ -117,6 +146,10 @@ static void ConcatN(cl::Kernel *kernel,
};
uint32_t
idx
=
0
;
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
kernel
->
setArg
(
idx
++
,
*
(
static_cast
<
cl
::
Buffer
*>
((
*
kernel_error
)
->
buffer
())));
}
if
(
!
runtime
->
IsNonUniformWorkgroupsSupported
())
{
kernel
->
setArg
(
idx
++
,
gws
[
0
]);
kernel
->
setArg
(
idx
++
,
gws
[
1
]);
...
...
@@ -132,6 +165,13 @@ static void ConcatN(cl::Kernel *kernel,
ss
<<
"concat_n_opencl_kernel_"
<<
input_channel_blk
<<
"_"
<<
width
<<
"_"
<<
batch
*
height
;
TuningOrRun3DKernel
(
*
kernel
,
ss
.
str
(),
gws
,
lws
,
future
);
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
(
*
kernel_error
)
->
Map
(
nullptr
);
char
*
kerror_code
=
(
*
kernel_error
)
->
mutable_data
<
char
>
();
MACE_CHECK
(
*
kerror_code
==
'0'
)
<<
"Kernel error code: "
<<
*
kerror_code
;
(
*
kernel_error
)
->
UnMap
();
}
}
}
...
...
@@ -172,12 +212,12 @@ void ConcatFunctor<DeviceType::OPENCL, T>::operator()(
switch
(
inputs_count
)
{
case
2
:
Concat2
(
&
kernel_
,
input_list
[
0
],
input_list
[
1
],
DataTypeToEnum
<
T
>::
value
,
&
input_shape_
,
output
,
future
,
&
kwg_size_
);
&
input_shape_
,
output
,
future
,
&
kwg_size_
,
&
kernel_error_
);
break
;
default:
if
(
divisible_four
)
{
ConcatN
(
&
kernel_
,
input_list
,
DataTypeToEnum
<
T
>::
value
,
output
,
future
,
&
kwg_size_
);
&
kwg_size_
,
&
kernel_error_
);
}
else
{
MACE_NOT_IMPLEMENTED
;
}
...
...
mace/kernels/opencl/conv_2d_opencl.cc
浏览文件 @
9d8505f4
...
...
@@ -21,7 +21,8 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
std
::
vector
<
index_t
>
*
prev_input_shape
,
Tensor
*
output
,
StatsFuture
*
future
,
uint32_t
*
kwg_size
);
uint32_t
*
kwg_size
,
std
::
unique_ptr
<
BufferBase
>
*
kernel_error
);
extern
void
Conv2dOpenclK3x3
(
cl
::
Kernel
*
kernel
,
const
Tensor
*
input
,
...
...
@@ -36,7 +37,8 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
std
::
vector
<
index_t
>
*
prev_input_shape
,
Tensor
*
output
,
StatsFuture
*
future
,
uint32_t
*
kwg_size
);
uint32_t
*
kwg_size
,
std
::
unique_ptr
<
BufferBase
>
*
kernel_error
);
extern
void
Conv2dOpencl
(
cl
::
Kernel
*
kernel
,
const
Tensor
*
input
,
...
...
@@ -51,7 +53,8 @@ extern void Conv2dOpencl(cl::Kernel *kernel,
std
::
vector
<
index_t
>
*
prev_input_shape
,
Tensor
*
output
,
StatsFuture
*
future
,
uint32_t
*
kwg_size
);
uint32_t
*
kwg_size
,
std
::
unique_ptr
<
BufferBase
>
*
kernel_error
);
template
<
typename
T
>
void
Conv2dFunctor
<
DeviceType
::
OPENCL
,
T
>::
operator
()(
const
Tensor
*
input
,
...
...
@@ -65,7 +68,7 @@ void Conv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
const
int
*
dilations
,
const
ActivationType
activation
,
const
float
relux_max_limit
,
const
DataType
dt
,
std
::
vector
<
index_t
>
*
input_shape
,
Tensor
*
output
,
StatsFuture
*
future
,
uint32_t
*
kwg_size
);
uint32_t
*
kwg_size
,
std
::
unique_ptr
<
BufferBase
>
*
kernel_error
);
// Selection matrix: kernel_size x stride_size
static
const
Conv2dOpenclFunction
selector
[
5
]
=
{
Conv2dOpenclK1x1
,
nullptr
,
Conv2dOpenclK3x3
,
nullptr
,
nullptr
};
...
...
@@ -106,12 +109,12 @@ void Conv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
conv2d_func
(
&
kernel_
,
input
,
filter
,
bias
,
strides_
[
0
],
paddings
.
data
(),
dilations_
,
activation_
,
relux_max_limit_
,
DataTypeToEnum
<
T
>::
value
,
&
input_shape_
,
output
,
future
,
&
kwg_size_
);
&
kwg_size_
,
&
kernel_error_
);
}
else
{
Conv2dOpencl
(
&
kernel_
,
input
,
filter
,
bias
,
strides_
[
0
],
paddings
.
data
(),
dilations_
,
activation_
,
relux_max_limit_
,
DataTypeToEnum
<
T
>::
value
,
&
input_shape_
,
output
,
future
,
&
kwg_size_
);
&
kwg_size_
,
&
kernel_error_
);
}
}
...
...
mace/kernels/opencl/conv_2d_opencl_1x1.cc
浏览文件 @
9d8505f4
...
...
@@ -23,7 +23,8 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
std
::
vector
<
index_t
>
*
prev_input_shape
,
Tensor
*
output
,
StatsFuture
*
future
,
uint32_t
*
kwg_size
)
{
uint32_t
*
kwg_size
,
std
::
unique_ptr
<
BufferBase
>
*
kernel_error
)
{
const
index_t
batch
=
output
->
dim
(
0
);
const
index_t
height
=
output
->
dim
(
1
);
const
index_t
width
=
output
->
dim
(
2
);
...
...
@@ -47,6 +48,14 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
built_options
.
emplace
(
"-Dconv_2d_1x1="
+
kernel_name
);
built_options
.
emplace
(
"-DDATA_TYPE="
+
DtToUpstreamCLDt
(
dt
));
built_options
.
emplace
(
"-DCMD_DATA_TYPE="
+
DtToUpstreamCLCMDDt
(
dt
));
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
built_options
.
emplace
(
"-DOUT_OF_RANGE_CHECK"
);
*
kernel_error
=
std
::
move
(
std
::
unique_ptr
<
Buffer
>
(
new
Buffer
(
GetDeviceAllocator
(
DeviceType
::
OPENCL
),
1
)));
(
*
kernel_error
)
->
Map
(
nullptr
);
*
((
*
kernel_error
)
->
mutable_data
<
char
>
())
=
'0'
;
(
*
kernel_error
)
->
UnMap
();
}
if
(
runtime
->
IsNonUniformWorkgroupsSupported
())
{
built_options
.
emplace
(
"-DNON_UNIFORM_WORK_GROUP"
);
}
...
...
@@ -84,6 +93,10 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
if
(
!
IsVecEqual
(
*
prev_input_shape
,
input
->
shape
()))
{
uint32_t
idx
=
0
;
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
kernel
->
setArg
(
idx
++
,
*
(
static_cast
<
cl
::
Buffer
*>
((
*
kernel_error
)
->
buffer
())));
}
if
(
!
runtime
->
IsNonUniformWorkgroupsSupported
())
{
kernel
->
setArg
(
idx
++
,
gws
[
0
]);
kernel
->
setArg
(
idx
++
,
gws
[
1
]);
...
...
@@ -112,6 +125,13 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
Concat
(
"conv2d_1x1_opencl_kernel_"
,
activation
,
output
->
dim
(
0
),
output
->
dim
(
1
),
output
->
dim
(
2
),
output
->
dim
(
3
));
TuningOrRun3DKernel
(
*
kernel
,
tuning_key
,
gws
,
lws
,
future
);
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
(
*
kernel_error
)
->
Map
(
nullptr
);
char
*
kerror_code
=
(
*
kernel_error
)
->
mutable_data
<
char
>
();
MACE_CHECK
(
*
kerror_code
==
'0'
)
<<
"Kernel error code: "
<<
*
kerror_code
;
(
*
kernel_error
)
->
UnMap
();
}
}
}
// namespace kernels
...
...
mace/kernels/opencl/conv_2d_opencl_3x3.cc
浏览文件 @
9d8505f4
...
...
@@ -25,7 +25,8 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
std
::
vector
<
index_t
>
*
prev_input_shape
,
Tensor
*
output
,
StatsFuture
*
future
,
uint32_t
*
kwg_size
)
{
uint32_t
*
kwg_size
,
std
::
unique_ptr
<
BufferBase
>
*
kernel_error
)
{
const
index_t
batch
=
output
->
dim
(
0
);
const
index_t
height
=
output
->
dim
(
1
);
const
index_t
width
=
output
->
dim
(
2
);
...
...
@@ -44,6 +45,14 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
built_options
.
emplace
(
"-Dconv_2d_3x3="
+
kernel_name
);
built_options
.
emplace
(
"-DDATA_TYPE="
+
DtToUpstreamCLDt
(
dt
));
built_options
.
emplace
(
"-DCMD_DATA_TYPE="
+
DtToUpstreamCLCMDDt
(
dt
));
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
built_options
.
emplace
(
"-DOUT_OF_RANGE_CHECK"
);
*
kernel_error
=
std
::
move
(
std
::
unique_ptr
<
Buffer
>
(
new
Buffer
(
GetDeviceAllocator
(
DeviceType
::
OPENCL
),
1
)));
(
*
kernel_error
)
->
Map
(
nullptr
);
*
((
*
kernel_error
)
->
mutable_data
<
char
>
())
=
'0'
;
(
*
kernel_error
)
->
UnMap
();
}
if
(
runtime
->
IsNonUniformWorkgroupsSupported
())
{
built_options
.
emplace
(
"-DNON_UNIFORM_WORK_GROUP"
);
}
...
...
@@ -79,6 +88,10 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
if
(
!
IsVecEqual
(
*
prev_input_shape
,
input
->
shape
()))
{
uint32_t
idx
=
0
;
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
kernel
->
setArg
(
idx
++
,
*
(
static_cast
<
cl
::
Buffer
*>
((
*
kernel_error
)
->
buffer
())));
}
if
(
!
runtime
->
IsNonUniformWorkgroupsSupported
())
{
kernel
->
setArg
(
idx
++
,
gws
[
0
]);
kernel
->
setArg
(
idx
++
,
gws
[
1
]);
...
...
@@ -110,6 +123,13 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
Concat
(
"conv2d_3x3_opencl_kernel_"
,
activation
,
output
->
dim
(
0
),
output
->
dim
(
1
),
output
->
dim
(
2
),
output
->
dim
(
3
));
TuningOrRun3DKernel
(
*
kernel
,
tuning_key
,
gws
,
lws
,
future
);
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
(
*
kernel_error
)
->
Map
(
nullptr
);
char
*
kerror_code
=
(
*
kernel_error
)
->
mutable_data
<
char
>
();
MACE_CHECK
(
*
kerror_code
==
'0'
)
<<
"Kernel error code: "
<<
*
kerror_code
;
(
*
kernel_error
)
->
UnMap
();
}
}
}
// namespace kernels
...
...
mace/kernels/opencl/conv_2d_opencl_general.cc
浏览文件 @
9d8505f4
...
...
@@ -25,7 +25,8 @@ extern void Conv2dOpencl(cl::Kernel *kernel,
std
::
vector
<
index_t
>
*
prev_input_shape
,
Tensor
*
output
,
StatsFuture
*
future
,
uint32_t
*
kwg_size
)
{
uint32_t
*
kwg_size
,
std
::
unique_ptr
<
BufferBase
>
*
kernel_error
)
{
const
index_t
batch
=
output
->
dim
(
0
);
const
index_t
height
=
output
->
dim
(
1
);
const
index_t
width
=
output
->
dim
(
2
);
...
...
@@ -44,6 +45,14 @@ extern void Conv2dOpencl(cl::Kernel *kernel,
built_options
.
emplace
(
"-Dconv_2d="
+
kernel_name
);
built_options
.
emplace
(
"-DDATA_TYPE="
+
DtToUpstreamCLDt
(
dt
));
built_options
.
emplace
(
"-DCMD_DATA_TYPE="
+
DtToUpstreamCLCMDDt
(
dt
));
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
built_options
.
emplace
(
"-DOUT_OF_RANGE_CHECK"
);
*
kernel_error
=
std
::
move
(
std
::
unique_ptr
<
Buffer
>
(
new
Buffer
(
GetDeviceAllocator
(
DeviceType
::
OPENCL
),
1
)));
(
*
kernel_error
)
->
Map
(
nullptr
);
*
((
*
kernel_error
)
->
mutable_data
<
char
>
())
=
'0'
;
(
*
kernel_error
)
->
UnMap
();
}
if
(
runtime
->
IsNonUniformWorkgroupsSupported
())
{
built_options
.
emplace
(
"-DNON_UNIFORM_WORK_GROUP"
);
}
...
...
@@ -79,6 +88,10 @@ extern void Conv2dOpencl(cl::Kernel *kernel,
if
(
!
IsVecEqual
(
*
prev_input_shape
,
input
->
shape
()))
{
uint32_t
idx
=
0
;
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
kernel
->
setArg
(
idx
++
,
*
(
static_cast
<
cl
::
Buffer
*>
((
*
kernel_error
)
->
buffer
())));
}
if
(
!
runtime
->
IsNonUniformWorkgroupsSupported
())
{
kernel
->
setArg
(
idx
++
,
gws
[
0
]);
kernel
->
setArg
(
idx
++
,
gws
[
1
]);
...
...
@@ -112,6 +125,13 @@ extern void Conv2dOpencl(cl::Kernel *kernel,
Concat
(
"conv2d_general_opencl_kernel_"
,
activation
,
output
->
dim
(
0
),
output
->
dim
(
1
),
output
->
dim
(
2
),
output
->
dim
(
3
));
TuningOrRun3DKernel
(
*
kernel
,
tuning_key
,
gws
,
lws
,
future
);
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
(
*
kernel_error
)
->
Map
(
nullptr
);
char
*
kerror_code
=
(
*
kernel_error
)
->
mutable_data
<
char
>
();
MACE_CHECK
(
*
kerror_code
==
'0'
)
<<
"Kernel error code: "
<<
*
kerror_code
;
(
*
kernel_error
)
->
UnMap
();
}
}
}
// namespace kernels
...
...
mace/kernels/opencl/cwise_opencl.cc
浏览文件 @
9d8505f4
...
...
@@ -34,6 +34,14 @@ void CWiseFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
built_options
.
emplace
(
"-DDATA_TYPE="
+
DtToUpstreamCLDt
(
dt
));
built_options
.
emplace
(
"-DCMD_DATA_TYPE="
+
DtToUpstreamCLCMDDt
(
dt
));
built_options
.
emplace
(
MakeString
(
"-DCWISE_TYPE="
,
type_
));
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
built_options
.
emplace
(
"-DOUT_OF_RANGE_CHECK"
);
kernel_error_
=
std
::
move
(
std
::
unique_ptr
<
Buffer
>
(
new
Buffer
(
GetDeviceAllocator
(
DeviceType
::
OPENCL
),
1
)));
kernel_error_
->
Map
(
nullptr
);
*
(
kernel_error_
->
mutable_data
<
char
>
())
=
'0'
;
kernel_error_
->
UnMap
();
}
if
(
runtime
->
IsNonUniformWorkgroupsSupported
())
{
built_options
.
emplace
(
"-DNON_UNIFORM_WORK_GROUP"
);
}
...
...
@@ -44,6 +52,10 @@ void CWiseFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
}
if
(
!
IsVecEqual
(
input_shape_
,
input
->
shape
()))
{
uint32_t
idx
=
0
;
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
kernel_
.
setArg
(
idx
++
,
*
(
static_cast
<
cl
::
Buffer
*>
(
kernel_error_
->
buffer
())));
}
if
(
!
runtime
->
IsNonUniformWorkgroupsSupported
())
{
kernel_
.
setArg
(
idx
++
,
gws
[
0
]);
kernel_
.
setArg
(
idx
++
,
gws
[
1
]);
...
...
@@ -59,6 +71,13 @@ void CWiseFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
ss
<<
"cwise_opencl_kernel_"
<<
output
->
dim
(
0
)
<<
"_"
<<
output
->
dim
(
1
)
<<
"_"
<<
output
->
dim
(
2
)
<<
"_"
<<
output
->
dim
(
3
);
TuningOrRun2DKernel
(
kernel_
,
ss
.
str
(),
gws
,
lws
,
future
);
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
kernel_error_
->
Map
(
nullptr
);
char
*
kerror_code
=
kernel_error_
->
mutable_data
<
char
>
();
MACE_CHECK
(
*
kerror_code
==
'0'
)
<<
"Kernel error code: "
<<
*
kerror_code
;
kernel_error_
->
UnMap
();
}
}
template
struct
CWiseFunctor
<
DeviceType
::
OPENCL
,
float
>;
...
...
mace/kernels/opencl/depth_to_space_opencl.cc
浏览文件 @
9d8505f4
...
...
@@ -23,8 +23,7 @@ void DepthToSpaceOpFunctor<DeviceType::OPENCL, T>::operator()(
const
char
*
kernel_name
=
nullptr
;
index_t
output_height
,
output_width
,
output_depth
;
if
(
d2s_
)
{
output_height
=
input_height
*
block_size_
;
if
(
d2s_
)
{
output_height
=
input_height
*
block_size_
;
output_width
=
input_width
*
block_size_
;
output_depth
=
input_depth
/
(
block_size_
*
block_size_
);
kernel_name
=
"depth_to_space"
;
...
...
@@ -55,6 +54,14 @@ void DepthToSpaceOpFunctor<DeviceType::OPENCL, T>::operator()(
auto
dt
=
DataTypeToEnum
<
T
>::
value
;
built_options
.
emplace
(
"-DDATA_TYPE="
+
DtToCLDt
(
dt
));
built_options
.
emplace
(
"-DCMD_DATA_TYPE="
+
DtToCLCMDDt
(
dt
));
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
built_options
.
emplace
(
"-DOUT_OF_RANGE_CHECK"
);
kernel_error_
=
std
::
move
(
std
::
unique_ptr
<
Buffer
>
(
new
Buffer
(
GetDeviceAllocator
(
DeviceType
::
OPENCL
),
1
)));
kernel_error_
->
Map
(
nullptr
);
*
(
kernel_error_
->
mutable_data
<
char
>
())
=
'0'
;
kernel_error_
->
UnMap
();
}
if
(
runtime
->
IsNonUniformWorkgroupsSupported
())
{
built_options
.
emplace
(
"-DNON_UNIFORM_WORK_GROUP"
);
}
...
...
@@ -84,19 +91,31 @@ void DepthToSpaceOpFunctor<DeviceType::OPENCL, T>::operator()(
}
uint32_t
idx
=
0
;
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
kernel_
.
setArg
(
idx
++
,
*
(
static_cast
<
cl
::
Buffer
*>
(
kernel_error_
->
buffer
())));
}
if
(
!
runtime
->
IsNonUniformWorkgroupsSupported
())
{
kernel_
.
setArg
(
idx
++
,
gws
[
0
]);
kernel_
.
setArg
(
idx
++
,
gws
[
1
]);
kernel_
.
setArg
(
idx
++
,
gws
[
2
]);
}
kernel_
.
setArg
(
idx
++
,
*
(
input
->
opencl_image
()));
kernel_
.
setArg
(
idx
++
,
static_cast
<
int32_t
>
(
block_size_
));
kernel_
.
setArg
(
idx
++
,
static_cast
<
int32_t
>
(
input_height
));
kernel_
.
setArg
(
idx
++
,
static_cast
<
int32_t
>
(
input_width
));
kernel_
.
setArg
(
idx
++
,
static_cast
<
int32_t
>
(
input_depth_blocks
));
kernel_
.
setArg
(
idx
++
,
static_cast
<
int32_t
>
(
output_height
));
kernel_
.
setArg
(
idx
++
,
static_cast
<
int32_t
>
(
output_width
));
kernel_
.
setArg
(
idx
++
,
static_cast
<
int32_t
>
(
output_depth_blocks
));
if
(
d2s_
)
{
kernel_
.
setArg
(
idx
++
,
static_cast
<
int32_t
>
(
block_size_
));
kernel_
.
setArg
(
idx
++
,
static_cast
<
int32_t
>
(
input_height
*
batch
));
kernel_
.
setArg
(
idx
++
,
static_cast
<
int32_t
>
(
input_width
));
kernel_
.
setArg
(
idx
++
,
static_cast
<
int32_t
>
(
input_depth_blocks
));
kernel_
.
setArg
(
idx
++
,
static_cast
<
int32_t
>
(
output_width
));
kernel_
.
setArg
(
idx
++
,
static_cast
<
int32_t
>
(
output_depth_blocks
));
}
else
{
kernel_
.
setArg
(
idx
++
,
static_cast
<
int32_t
>
(
block_size_
));
kernel_
.
setArg
(
idx
++
,
static_cast
<
int32_t
>
(
input_width
));
kernel_
.
setArg
(
idx
++
,
static_cast
<
int32_t
>
(
input_depth_blocks
));
kernel_
.
setArg
(
idx
++
,
static_cast
<
int32_t
>
(
output_height
*
batch
));
kernel_
.
setArg
(
idx
++
,
static_cast
<
int32_t
>
(
output_width
));
kernel_
.
setArg
(
idx
++
,
static_cast
<
int32_t
>
(
output_depth_blocks
));
}
kernel_
.
setArg
(
idx
++
,
*
(
output
->
opencl_image
()));
input_shape_
=
input
->
shape
();
...
...
@@ -104,6 +123,13 @@ void DepthToSpaceOpFunctor<DeviceType::OPENCL, T>::operator()(
const
std
::
vector
<
uint32_t
>
lws
=
{
8
,
kwg_size_
/
64
,
8
,
1
};
TuningOrRun3DKernel
(
kernel_
,
ss
.
str
(),
gws
,
lws
,
future
);
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
kernel_error_
->
Map
(
nullptr
);
char
*
kerror_code
=
kernel_error_
->
mutable_data
<
char
>
();
MACE_CHECK
(
*
kerror_code
==
'0'
)
<<
"Kernel error code: "
<<
*
kerror_code
;
kernel_error_
->
UnMap
();
}
}
template
struct
DepthToSpaceOpFunctor
<
DeviceType
::
OPENCL
,
float
>;
...
...
mace/kernels/opencl/depthwise_conv_opencl.cc
浏览文件 @
9d8505f4
...
...
@@ -24,7 +24,8 @@ void DepthwiseConv2d(cl::Kernel *kernel,
std
::
vector
<
index_t
>
*
prev_input_shape
,
Tensor
*
output
,
StatsFuture
*
future
,
uint32_t
*
kwg_size
)
{
uint32_t
*
kwg_size
,
std
::
unique_ptr
<
BufferBase
>
*
kernel_error
)
{
const
index_t
batch
=
output
->
dim
(
0
);
const
index_t
height
=
output
->
dim
(
1
);
const
index_t
width
=
output
->
dim
(
2
);
...
...
@@ -52,6 +53,14 @@ void DepthwiseConv2d(cl::Kernel *kernel,
}
else
{
built_options
.
emplace
(
"-Ddepthwise_conv2d="
+
kernel_name
);
}
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
built_options
.
emplace
(
"-DOUT_OF_RANGE_CHECK"
);
*
kernel_error
=
std
::
move
(
std
::
unique_ptr
<
Buffer
>
(
new
Buffer
(
GetDeviceAllocator
(
DeviceType
::
OPENCL
),
1
)));
(
*
kernel_error
)
->
Map
(
nullptr
);
*
((
*
kernel_error
)
->
mutable_data
<
char
>
())
=
'0'
;
(
*
kernel_error
)
->
UnMap
();
}
if
(
runtime
->
IsNonUniformWorkgroupsSupported
())
{
built_options
.
emplace
(
"-DNON_UNIFORM_WORK_GROUP"
);
}
...
...
@@ -97,6 +106,10 @@ void DepthwiseConv2d(cl::Kernel *kernel,
input_channels
);
uint32_t
idx
=
0
;
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
kernel
->
setArg
(
idx
++
,
*
(
static_cast
<
cl
::
Buffer
*>
((
*
kernel_error
)
->
buffer
())));
}
if
(
!
runtime
->
IsNonUniformWorkgroupsSupported
())
{
kernel
->
setArg
(
idx
++
,
gws
[
0
]);
kernel
->
setArg
(
idx
++
,
gws
[
1
]);
...
...
@@ -130,6 +143,13 @@ void DepthwiseConv2d(cl::Kernel *kernel,
std
::
string
tuning_key
=
Concat
(
"depthwise_conv2d_ocl_kernel_"
,
activation
,
batch
,
height
,
width
,
channels
,
multiplier
);
TuningOrRun3DKernel
(
*
kernel
,
tuning_key
,
gws
,
lws
,
future
);
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
(
*
kernel_error
)
->
Map
(
nullptr
);
char
*
kerror_code
=
(
*
kernel_error
)
->
mutable_data
<
char
>
();
MACE_CHECK
(
*
kerror_code
==
'0'
)
<<
"Kernel error code: "
<<
*
kerror_code
;
(
*
kernel_error
)
->
UnMap
();
}
}
template
<
typename
T
>
...
...
@@ -182,7 +202,7 @@ void DepthwiseConv2dFunctor<DeviceType::OPENCL, T>::operator()(
DepthwiseConv2d
(
&
kernel_
,
input
,
filter
,
bias
,
strides_
[
0
],
paddings
.
data
(),
dilations_
,
activation_
,
relux_max_limit_
,
DataTypeToEnum
<
T
>::
value
,
&
input_shape_
,
output
,
future
,
&
kwg_size_
);
&
kwg_size_
,
&
kernel_error_
);
}
template
struct
DepthwiseConv2dFunctor
<
DeviceType
::
OPENCL
,
float
>;
...
...
mace/kernels/opencl/eltwise_opencl.cc
浏览文件 @
9d8505f4
...
...
@@ -37,6 +37,14 @@ void EltwiseFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input0,
built_options
.
emplace
(
"-DDATA_TYPE="
+
DtToUpstreamCLDt
(
dt
));
built_options
.
emplace
(
"-DCMD_DATA_TYPE="
+
DtToUpstreamCLCMDDt
(
dt
));
built_options
.
emplace
(
MakeString
(
"-DELTWISE_TYPE="
,
type_
));
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
built_options
.
emplace
(
"-DOUT_OF_RANGE_CHECK"
);
kernel_error_
=
std
::
move
(
std
::
unique_ptr
<
Buffer
>
(
new
Buffer
(
GetDeviceAllocator
(
DeviceType
::
OPENCL
),
1
)));
kernel_error_
->
Map
(
nullptr
);
*
(
kernel_error_
->
mutable_data
<
char
>
())
=
'0'
;
kernel_error_
->
UnMap
();
}
if
(
runtime
->
IsNonUniformWorkgroupsSupported
())
{
built_options
.
emplace
(
"-DNON_UNIFORM_WORK_GROUP"
);
}
...
...
@@ -48,6 +56,10 @@ void EltwiseFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input0,
}
if
(
!
IsVecEqual
(
input_shape_
,
input0
->
shape
()))
{
uint32_t
idx
=
0
;
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
kernel_
.
setArg
(
idx
++
,
*
(
static_cast
<
cl
::
Buffer
*>
(
kernel_error_
->
buffer
())));
}
if
(
!
runtime
->
IsNonUniformWorkgroupsSupported
())
{
kernel_
.
setArg
(
idx
++
,
gws
[
0
]);
kernel_
.
setArg
(
idx
++
,
gws
[
1
]);
...
...
@@ -68,6 +80,12 @@ void EltwiseFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input0,
ss
<<
"eltwise_opencl_kernel_"
<<
output
->
dim
(
0
)
<<
"_"
<<
output
->
dim
(
1
)
<<
"_"
<<
output
->
dim
(
2
)
<<
"_"
<<
output
->
dim
(
3
);
TuningOrRun2DKernel
(
kernel_
,
ss
.
str
(),
gws
,
lws
,
future
);
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
kernel_error_
->
Map
(
nullptr
);
char
*
kerror_code
=
kernel_error_
->
mutable_data
<
char
>
();
MACE_CHECK
(
*
kerror_code
==
'0'
)
<<
"Kernel error code: "
<<
*
kerror_code
;
kernel_error_
->
UnMap
();
}
}
template
struct
EltwiseFunctor
<
DeviceType
::
OPENCL
,
float
>;
...
...
mace/kernels/opencl/fully_connected_opencl.cc
浏览文件 @
9d8505f4
...
...
@@ -19,7 +19,8 @@ void FCWXKernel(cl::Kernel *kernel,
std
::
vector
<
uint32_t
>
*
gws
,
std
::
vector
<
uint32_t
>
*
lws
,
const
float
relux_max_limit
,
StatsFuture
*
future
)
{
StatsFuture
*
future
,
std
::
unique_ptr
<
BufferBase
>
*
kernel_error
)
{
MACE_CHECK
(
input
->
dim
(
3
)
%
4
==
0
)
<<
"FC width kernel only support input with 4x channel."
;
MACE_CHECK_NOTNULL
(
gws
);
...
...
@@ -33,8 +34,7 @@ void FCWXKernel(cl::Kernel *kernel,
std
::
set
<
std
::
string
>
built_options
;
auto
dt
=
DataTypeToEnum
<
T
>::
value
;
std
::
string
kernel_name
=
MACE_OBFUSCATE_SYMBOL
(
"fully_connected"
);
kernel_name
=
MACE_OBFUSCATE_SYMBOL
(
"fully_connected_width"
);
std
::
string
kernel_name
=
MACE_OBFUSCATE_SYMBOL
(
"fully_connected_width"
);
built_options
.
emplace
(
"-Dfully_connected_width="
+
kernel_name
);
built_options
.
emplace
(
"-DDATA_TYPE="
+
DtToUpstreamCLDt
(
dt
));
built_options
.
emplace
(
"-DCMD_DATA_TYPE="
+
DtToUpstreamCLCMDDt
(
dt
));
...
...
@@ -62,6 +62,14 @@ void FCWXKernel(cl::Kernel *kernel,
if
(
runtime
->
gpu_type
()
!=
GPUType
::
QUALCOMM_ADRENO
)
{
built_options
.
emplace
(
"-DNON_QUALCOMM_ADRENO"
);
}
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
built_options
.
emplace
(
"-DOUT_OF_RANGE_CHECK"
);
*
kernel_error
=
std
::
move
(
std
::
unique_ptr
<
Buffer
>
(
new
Buffer
(
GetDeviceAllocator
(
DeviceType
::
OPENCL
),
1
)));
(
*
kernel_error
)
->
Map
(
nullptr
);
*
((
*
kernel_error
)
->
mutable_data
<
char
>
())
=
'0'
;
(
*
kernel_error
)
->
UnMap
();
}
if
(
runtime
->
IsNonUniformWorkgroupsSupported
())
{
built_options
.
emplace
(
"-DNON_UNIFORM_WORK_GROUP"
);
}
...
...
@@ -95,6 +103,10 @@ void FCWXKernel(cl::Kernel *kernel,
(
*
gws
)[
2
]
=
static_cast
<
uint32_t
>
(
batch
*
output_blocks
);
uint32_t
idx
=
0
;
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
kernel
->
setArg
(
idx
++
,
*
(
static_cast
<
cl
::
Buffer
*>
((
*
kernel_error
)
->
buffer
())));
}
if
(
!
runtime
->
IsNonUniformWorkgroupsSupported
())
{
kernel
->
setArg
(
idx
++
,
(
*
gws
)[
0
]);
kernel
->
setArg
(
idx
++
,
(
*
gws
)[
1
]);
...
...
@@ -132,6 +144,12 @@ void FCWXKernel(cl::Kernel *kernel,
cl
::
NDRange
(
roundup_gws
[
0
],
roundup_gws
[
1
],
roundup_gws
[
2
]),
cl
::
NDRange
((
*
lws
)[
0
],
(
*
lws
)[
1
],
(
*
lws
)[
2
]),
nullptr
,
&
event
);
}
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
(
*
kernel_error
)
->
Map
(
nullptr
);
char
*
kerror_code
=
(
*
kernel_error
)
->
mutable_data
<
char
>
();
MACE_CHECK
(
*
kerror_code
==
'0'
)
<<
"Kernel error code: "
<<
*
kerror_code
;
(
*
kernel_error
)
->
UnMap
();
}
MACE_CHECK
(
error
==
CL_SUCCESS
)
<<
"Error code: "
<<
error
;
if
(
future
!=
nullptr
)
{
...
...
@@ -155,7 +173,8 @@ void FCWTXKernel(cl::Kernel *kernel,
std
::
vector
<
uint32_t
>
*
gws
,
std
::
vector
<
uint32_t
>
*
lws
,
const
float
relux_max_limit
,
StatsFuture
*
future
)
{
StatsFuture
*
future
,
std
::
unique_ptr
<
BufferBase
>
*
kernel_error
)
{
MACE_CHECK_NOTNULL
(
gws
);
MACE_CHECK_NOTNULL
(
lws
);
auto
runtime
=
OpenCLRuntime
::
Global
();
...
...
@@ -169,6 +188,14 @@ void FCWTXKernel(cl::Kernel *kernel,
if
(
bias
!=
nullptr
)
{
built_options
.
emplace
(
"-DBIAS"
);
}
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
built_options
.
emplace
(
"-DOUT_OF_RANGE_CHECK"
);
*
kernel_error
=
std
::
move
(
std
::
unique_ptr
<
Buffer
>
(
new
Buffer
(
GetDeviceAllocator
(
DeviceType
::
OPENCL
),
1
)));
(
*
kernel_error
)
->
Map
(
nullptr
);
*
((
*
kernel_error
)
->
mutable_data
<
char
>
())
=
'0'
;
(
*
kernel_error
)
->
UnMap
();
}
if
(
runtime
->
IsNonUniformWorkgroupsSupported
())
{
built_options
.
emplace
(
"-DNON_UNIFORM_WORK_GROUP"
);
}
...
...
@@ -206,6 +233,10 @@ void FCWTXKernel(cl::Kernel *kernel,
};
uint32_t
idx
=
0
;
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
kernel
->
setArg
(
idx
++
,
*
(
static_cast
<
cl
::
Buffer
*>
((
*
kernel_error
)
->
buffer
())));
}
if
(
!
runtime
->
IsNonUniformWorkgroupsSupported
())
{
kernel
->
setArg
(
idx
++
,
(
*
gws
)[
0
]);
kernel
->
setArg
(
idx
++
,
(
*
gws
)[
1
]);
...
...
@@ -229,6 +260,13 @@ void FCWTXKernel(cl::Kernel *kernel,
ss
<<
"fc_opencl_kernel_"
<<
output
->
dim
(
0
)
<<
"_"
<<
output
->
dim
(
1
)
<<
"_"
<<
output
->
dim
(
2
)
<<
"_"
<<
output
->
dim
(
3
);
TuningOrRun2DKernel
(
*
kernel
,
ss
.
str
(),
gws
->
data
(),
*
lws
,
future
);
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
(
*
kernel_error
)
->
Map
(
nullptr
);
char
*
kerror_code
=
(
*
kernel_error
)
->
mutable_data
<
char
>
();
MACE_CHECK
(
*
kerror_code
==
'0'
)
<<
"Kernel error code: "
<<
*
kerror_code
;
(
*
kernel_error
)
->
UnMap
();
}
}
template
<
typename
T
>
...
...
@@ -246,10 +284,12 @@ void FullyConnectedFunctor<DeviceType::OPENCL, T>::operator()(
if
(
weight_type_
==
BufferType
::
WEIGHT_HEIGHT
)
{
FCWTXKernel
<
T
>
(
&
kernel_
,
input
,
weight
,
bias
,
&
input_shape_
,
output
,
activation_
,
&
gws_
,
&
lws_
,
relux_max_limit_
,
future
);
activation_
,
&
gws_
,
&
lws_
,
relux_max_limit_
,
future
,
&
kernel_error_
);
}
else
{
FCWXKernel
<
T
>
(
&
kernel_
,
input
,
weight
,
bias
,
&
input_shape_
,
output
,
activation_
,
&
gws_
,
&
lws_
,
relux_max_limit_
,
future
);
activation_
,
&
gws_
,
&
lws_
,
relux_max_limit_
,
future
,
&
kernel_error_
);
}
}
...
...
mace/kernels/opencl/matmul.cc
浏览文件 @
9d8505f4
...
...
@@ -40,6 +40,14 @@ void MatMulFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *A,
built_options
.
emplace
(
"-Dmatmul="
+
kernel_name
);
built_options
.
emplace
(
"-DDATA_TYPE="
+
DtToUpstreamCLDt
(
dt
));
built_options
.
emplace
(
"-DCMD_DATA_TYPE="
+
DtToUpstreamCLCMDDt
(
dt
));
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
built_options
.
emplace
(
"-DOUT_OF_RANGE_CHECK"
);
kernel_error_
=
std
::
move
(
std
::
unique_ptr
<
Buffer
>
(
new
Buffer
(
GetDeviceAllocator
(
DeviceType
::
OPENCL
),
1
)));
kernel_error_
->
Map
(
nullptr
);
*
(
kernel_error_
->
mutable_data
<
char
>
())
=
'0'
;
kernel_error_
->
UnMap
();
}
if
(
runtime
->
IsNonUniformWorkgroupsSupported
())
{
built_options
.
emplace
(
"-DNON_UNIFORM_WORK_GROUP"
);
}
...
...
@@ -49,6 +57,10 @@ void MatMulFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *A,
static_cast
<
uint32_t
>
(
runtime
->
GetKernelMaxWorkGroupSize
(
kernel_
));
}
uint32_t
idx
=
0
;
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
kernel_
.
setArg
(
idx
++
,
*
(
static_cast
<
cl
::
Buffer
*>
(
kernel_error_
->
buffer
())));
}
if
(
!
runtime
->
IsNonUniformWorkgroupsSupported
())
{
kernel_
.
setArg
(
idx
++
,
gws
[
0
]);
kernel_
.
setArg
(
idx
++
,
gws
[
1
]);
...
...
@@ -67,6 +79,13 @@ void MatMulFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *A,
ss
<<
"matmul_opencl_kernel_"
<<
C
->
dim
(
0
)
<<
"_"
<<
C
->
dim
(
1
)
<<
"_"
<<
C
->
dim
(
2
)
<<
"_"
<<
C
->
dim
(
3
);
TuningOrRun2DKernel
(
kernel_
,
ss
.
str
(),
gws
,
lws
,
future
);
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
kernel_error_
->
Map
(
nullptr
);
char
*
kerror_code
=
kernel_error_
->
mutable_data
<
char
>
();
MACE_CHECK
(
*
kerror_code
==
'0'
)
<<
"Kernel error code: "
<<
*
kerror_code
;
kernel_error_
->
UnMap
();
}
}
template
struct
MatMulFunctor
<
DeviceType
::
OPENCL
,
float
>;
...
...
mace/kernels/opencl/pooling_opencl.cc
浏览文件 @
9d8505f4
...
...
@@ -37,6 +37,14 @@ void PoolingFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
if
(
pooling_type_
==
AVG
)
{
built_options
.
emplace
(
"-DPOOL_AVG"
);
}
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
built_options
.
emplace
(
"-DOUT_OF_RANGE_CHECK"
);
kernel_error_
=
std
::
move
(
std
::
unique_ptr
<
Buffer
>
(
new
Buffer
(
GetDeviceAllocator
(
DeviceType
::
OPENCL
),
1
)));
kernel_error_
->
Map
(
nullptr
);
*
(
kernel_error_
->
mutable_data
<
char
>
())
=
'0'
;
kernel_error_
->
UnMap
();
}
if
(
runtime
->
IsNonUniformWorkgroupsSupported
())
{
built_options
.
emplace
(
"-DNON_UNIFORM_WORK_GROUP"
);
}
...
...
@@ -82,6 +90,10 @@ void PoolingFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
};
uint32_t
idx
=
0
;
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
kernel_
.
setArg
(
idx
++
,
*
(
static_cast
<
cl
::
Buffer
*>
(
kernel_error_
->
buffer
())));
}
if
(
!
runtime
->
IsNonUniformWorkgroupsSupported
())
{
kernel_
.
setArg
(
idx
++
,
gws
[
0
]);
kernel_
.
setArg
(
idx
++
,
gws
[
1
]);
...
...
@@ -117,6 +129,13 @@ void PoolingFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
ss
<<
"pooling_opencl_kernel_"
<<
output
->
dim
(
0
)
<<
"_"
<<
output
->
dim
(
1
)
<<
"_"
<<
output
->
dim
(
2
)
<<
"_"
<<
output
->
dim
(
3
);
TuningOrRun3DKernel
(
kernel_
,
ss
.
str
(),
gws
.
data
(),
lws
,
future
);
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
kernel_error_
->
Map
(
nullptr
);
char
*
kerror_code
=
kernel_error_
->
mutable_data
<
char
>
();
MACE_CHECK
(
*
kerror_code
==
'0'
)
<<
"Kernel error code: "
<<
*
kerror_code
;
kernel_error_
->
UnMap
();
}
}
template
struct
PoolingFunctor
<
DeviceType
::
OPENCL
,
float
>;
...
...
mace/kernels/opencl/resize_bilinear_opencl.cc
浏览文件 @
9d8505f4
...
...
@@ -37,6 +37,14 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
auto
dt
=
DataTypeToEnum
<
T
>::
value
;
built_options
.
emplace
(
"-DDATA_TYPE="
+
DtToUpstreamCLDt
(
dt
));
built_options
.
emplace
(
"-DCMD_DATA_TYPE="
+
DtToUpstreamCLCMDDt
(
dt
));
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
built_options
.
emplace
(
"-DOUT_OF_RANGE_CHECK"
);
kernel_error_
=
std
::
move
(
std
::
unique_ptr
<
Buffer
>
(
new
Buffer
(
GetDeviceAllocator
(
DeviceType
::
OPENCL
),
1
)));
kernel_error_
->
Map
(
nullptr
);
*
(
kernel_error_
->
mutable_data
<
char
>
())
=
'0'
;
kernel_error_
->
UnMap
();
}
if
(
runtime
->
IsNonUniformWorkgroupsSupported
())
{
built_options
.
emplace
(
"-DNON_UNIFORM_WORK_GROUP"
);
}
...
...
@@ -61,6 +69,10 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
CalculateResizeScale
(
in_width
,
out_width
,
align_corners_
);
uint32_t
idx
=
0
;
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
kernel_
.
setArg
(
idx
++
,
*
(
static_cast
<
cl
::
Buffer
*>
(
kernel_error_
->
buffer
())));
}
if
(
!
runtime
->
IsNonUniformWorkgroupsSupported
())
{
kernel_
.
setArg
(
idx
++
,
gws
[
0
]);
kernel_
.
setArg
(
idx
++
,
gws
[
1
]);
...
...
@@ -82,6 +94,13 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
ss
<<
"resize_bilinear_opencl_kernel_"
<<
output
->
dim
(
0
)
<<
"_"
<<
output
->
dim
(
1
)
<<
"_"
<<
output
->
dim
(
2
)
<<
"_"
<<
output
->
dim
(
3
);
TuningOrRun3DKernel
(
kernel_
,
ss
.
str
(),
gws
,
lws
,
future
);
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
kernel_error_
->
Map
(
nullptr
);
char
*
kerror_code
=
kernel_error_
->
mutable_data
<
char
>
();
MACE_CHECK
(
*
kerror_code
==
'0'
)
<<
"Kernel error code: "
<<
*
kerror_code
;
kernel_error_
->
UnMap
();
}
}
template
struct
ResizeBilinearFunctor
<
DeviceType
::
OPENCL
,
float
>;
...
...
mace/kernels/opencl/slice.cc
浏览文件 @
9d8505f4
...
...
@@ -38,6 +38,14 @@ void SliceFunctor<DeviceType::OPENCL, T>::operator()(
built_options
.
emplace
(
"-DDATA_TYPE="
+
DtToCLDt
(
DataTypeToEnum
<
T
>::
value
));
built_options
.
emplace
(
"-DCMD_DATA_TYPE="
+
DtToCLCMDDt
(
DataTypeToEnum
<
T
>::
value
));
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
built_options
.
emplace
(
"-DOUT_OF_RANGE_CHECK"
);
kernel_error_
=
std
::
move
(
std
::
unique_ptr
<
Buffer
>
(
new
Buffer
(
GetDeviceAllocator
(
DeviceType
::
OPENCL
),
1
)));
kernel_error_
->
Map
(
nullptr
);
*
(
kernel_error_
->
mutable_data
<
char
>
())
=
'0'
;
kernel_error_
->
UnMap
();
}
if
(
runtime
->
IsNonUniformWorkgroupsSupported
())
{
built_options
.
emplace
(
"-DNON_UNIFORM_WORK_GROUP"
);
}
...
...
@@ -64,6 +72,10 @@ void SliceFunctor<DeviceType::OPENCL, T>::operator()(
<<
outputs_count
;
for
(
int
i
=
0
;
i
<
outputs_count
;
++
i
)
{
uint32_t
idx
=
0
;
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
kernel_
.
setArg
(
idx
++
,
*
(
static_cast
<
cl
::
Buffer
*>
(
kernel_error_
->
buffer
())));
}
if
(
!
runtime
->
IsNonUniformWorkgroupsSupported
())
{
kernel_
.
setArg
(
idx
++
,
gws
[
0
]);
kernel_
.
setArg
(
idx
++
,
gws
[
1
]);
...
...
@@ -74,6 +86,12 @@ void SliceFunctor<DeviceType::OPENCL, T>::operator()(
kernel_
.
setArg
(
idx
++
,
*
(
output_list
[
i
]
->
opencl_image
()));
TuningOrRun3DKernel
(
kernel_
,
ss
.
str
(),
gws
,
lws
,
future
);
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
kernel_error_
->
Map
(
nullptr
);
char
*
kerror_code
=
kernel_error_
->
mutable_data
<
char
>
();
MACE_CHECK
(
*
kerror_code
==
'0'
)
<<
"Kernel error code: "
<<
*
kerror_code
;
kernel_error_
->
UnMap
();
}
}
}
...
...
mace/kernels/opencl/softmax_opencl.cc
浏览文件 @
9d8505f4
...
...
@@ -36,6 +36,14 @@ void SoftmaxFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *logits,
auto
dt
=
DataTypeToEnum
<
T
>::
value
;
built_options
.
emplace
(
"-DDATA_TYPE="
+
DtToUpstreamCLDt
(
dt
));
built_options
.
emplace
(
"-DCMD_DATA_TYPE="
+
DtToUpstreamCLCMDDt
(
dt
));
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
built_options
.
emplace
(
"-DOUT_OF_RANGE_CHECK"
);
kernel_error_
=
std
::
move
(
std
::
unique_ptr
<
Buffer
>
(
new
Buffer
(
GetDeviceAllocator
(
DeviceType
::
OPENCL
),
1
)));
kernel_error_
->
Map
(
nullptr
);
*
(
kernel_error_
->
mutable_data
<
char
>
())
=
'0'
;
kernel_error_
->
UnMap
();
}
if
(
runtime
->
IsNonUniformWorkgroupsSupported
())
{
built_options
.
emplace
(
"-DNON_UNIFORM_WORK_GROUP"
);
}
...
...
@@ -46,6 +54,10 @@ void SoftmaxFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *logits,
}
if
(
!
IsVecEqual
(
input_shape_
,
logits
->
shape
()))
{
uint32_t
idx
=
0
;
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
kernel_
.
setArg
(
idx
++
,
*
(
static_cast
<
cl
::
Buffer
*>
(
kernel_error_
->
buffer
())));
}
if
(
!
runtime
->
IsNonUniformWorkgroupsSupported
())
{
kernel_
.
setArg
(
idx
++
,
gws
[
0
]);
kernel_
.
setArg
(
idx
++
,
gws
[
1
]);
...
...
@@ -64,6 +76,13 @@ void SoftmaxFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *logits,
ss
<<
"softmax_opencl_kernel_"
<<
output
->
dim
(
0
)
<<
"_"
<<
output
->
dim
(
1
)
<<
"_"
<<
output
->
dim
(
2
)
<<
"_"
<<
output
->
dim
(
3
);
TuningOrRun3DKernel
(
kernel_
,
ss
.
str
(),
gws
,
lws
,
future
);
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
kernel_error_
->
Map
(
nullptr
);
char
*
kerror_code
=
kernel_error_
->
mutable_data
<
char
>
();
MACE_CHECK
(
*
kerror_code
==
'0'
)
<<
"Kernel error code: "
<<
*
kerror_code
;
kernel_error_
->
UnMap
();
}
}
template
struct
SoftmaxFunctor
<
DeviceType
::
OPENCL
,
float
>;
...
...
mace/kernels/opencl/space_to_batch_opencl.cc
浏览文件 @
9d8505f4
...
...
@@ -47,6 +47,14 @@ void SpaceToBatchFunctor<DeviceType::OPENCL, T>::operator()(
built_options
.
emplace
(
"-DDATA_TYPE="
+
DtToCLDt
(
DataTypeToEnum
<
T
>::
value
));
built_options
.
emplace
(
"-DCMD_DATA_TYPE="
+
DtToCLCMDDt
(
DataTypeToEnum
<
T
>::
value
));
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
built_options
.
emplace
(
"-DOUT_OF_RANGE_CHECK"
);
kernel_error_
=
std
::
move
(
std
::
unique_ptr
<
Buffer
>
(
new
Buffer
(
GetDeviceAllocator
(
DeviceType
::
OPENCL
),
1
)));
kernel_error_
->
Map
(
nullptr
);
*
(
kernel_error_
->
mutable_data
<
char
>
())
=
'0'
;
kernel_error_
->
UnMap
();
}
if
(
runtime
->
IsNonUniformWorkgroupsSupported
())
{
built_options
.
emplace
(
"-DNON_UNIFORM_WORK_GROUP"
);
}
...
...
@@ -59,6 +67,10 @@ void SpaceToBatchFunctor<DeviceType::OPENCL, T>::operator()(
}
if
(
!
IsVecEqual
(
space_shape_
,
space_tensor
->
shape
()))
{
uint32_t
idx
=
0
;
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
kernel_
.
setArg
(
idx
++
,
*
(
static_cast
<
cl
::
Buffer
*>
(
kernel_error_
->
buffer
())));
}
if
(
!
runtime
->
IsNonUniformWorkgroupsSupported
())
{
kernel_
.
setArg
(
idx
++
,
gws
[
0
]);
kernel_
.
setArg
(
idx
++
,
gws
[
1
]);
...
...
@@ -89,6 +101,13 @@ void SpaceToBatchFunctor<DeviceType::OPENCL, T>::operator()(
<<
batch_tensor
->
dim
(
1
)
<<
"_"
<<
batch_tensor
->
dim
(
2
)
<<
"_"
<<
batch_tensor
->
dim
(
3
);
TuningOrRun3DKernel
(
kernel_
,
ss
.
str
(),
gws
,
lws
,
future
);
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
kernel_error_
->
Map
(
nullptr
);
char
*
kerror_code
=
kernel_error_
->
mutable_data
<
char
>
();
MACE_CHECK
(
*
kerror_code
==
'0'
)
<<
"Kernel error code: "
<<
*
kerror_code
;
kernel_error_
->
UnMap
();
}
}
template
struct
SpaceToBatchFunctor
<
DeviceType
::
OPENCL
,
float
>;
...
...
mace/kernels/opencl/winograd_transform.cc
浏览文件 @
9d8505f4
...
...
@@ -26,6 +26,14 @@ void WinogradTransformFunctor<DeviceType::OPENCL, T>::operator()(
DtToUpstreamCLDt
(
DataTypeToEnum
<
T
>::
value
));
built_options
.
emplace
(
"-DCMD_DATA_TYPE="
+
DtToUpstreamCLCMDDt
(
DataTypeToEnum
<
T
>::
value
));
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
built_options
.
emplace
(
"-DOUT_OF_RANGE_CHECK"
);
kernel_error_
=
std
::
move
(
std
::
unique_ptr
<
Buffer
>
(
new
Buffer
(
GetDeviceAllocator
(
DeviceType
::
OPENCL
),
1
)));
kernel_error_
->
Map
(
nullptr
);
*
(
kernel_error_
->
mutable_data
<
char
>
())
=
'0'
;
kernel_error_
->
UnMap
();
}
if
(
runtime
->
IsNonUniformWorkgroupsSupported
())
{
built_options
.
emplace
(
"-DNON_UNIFORM_WORK_GROUP"
);
}
...
...
@@ -62,6 +70,10 @@ void WinogradTransformFunctor<DeviceType::OPENCL, T>::operator()(
output_tensor
->
ResizeImage
(
output_shape
,
image_shape
);
uint32_t
idx
=
0
;
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
kernel_
.
setArg
(
idx
++
,
*
(
static_cast
<
cl
::
Buffer
*>
(
kernel_error_
->
buffer
())));
}
if
(
!
runtime
->
IsNonUniformWorkgroupsSupported
())
{
kernel_
.
setArg
(
idx
++
,
gws
[
0
]);
kernel_
.
setArg
(
idx
++
,
gws
[
1
]);
...
...
@@ -85,6 +97,13 @@ void WinogradTransformFunctor<DeviceType::OPENCL, T>::operator()(
<<
input_tensor
->
dim
(
1
)
<<
"_"
<<
input_tensor
->
dim
(
2
)
<<
"_"
<<
input_tensor
->
dim
(
3
);
TuningOrRun2DKernel
(
kernel_
,
ss
.
str
(),
gws
,
lws
,
future
);
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
kernel_error_
->
Map
(
nullptr
);
char
*
kerror_code
=
kernel_error_
->
mutable_data
<
char
>
();
MACE_CHECK
(
*
kerror_code
==
'0'
)
<<
"Kernel error code: "
<<
*
kerror_code
;
kernel_error_
->
UnMap
();
}
}
template
<
typename
T
>
...
...
@@ -106,6 +125,14 @@ void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()(
DtToUpstreamCLDt
(
DataTypeToEnum
<
T
>::
value
));
built_options
.
emplace
(
"-DCMD_DATA_TYPE="
+
DtToUpstreamCLCMDDt
(
DataTypeToEnum
<
T
>::
value
));
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
built_options
.
emplace
(
"-DOUT_OF_RANGE_CHECK"
);
kernel_error_
=
std
::
move
(
std
::
unique_ptr
<
Buffer
>
(
new
Buffer
(
GetDeviceAllocator
(
DeviceType
::
OPENCL
),
1
)));
kernel_error_
->
Map
(
nullptr
);
*
(
kernel_error_
->
mutable_data
<
char
>
())
=
'0'
;
kernel_error_
->
UnMap
();
}
if
(
runtime
->
IsNonUniformWorkgroupsSupported
())
{
built_options
.
emplace
(
"-DNON_UNIFORM_WORK_GROUP"
);
}
...
...
@@ -152,6 +179,10 @@ void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()(
const
uint32_t
round_h
=
(
height_
+
1
)
/
2
;
const
uint32_t
round_w
=
(
width_
+
1
)
/
2
;
uint32_t
idx
=
0
;
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
kernel_
.
setArg
(
idx
++
,
*
(
static_cast
<
cl
::
Buffer
*>
(
kernel_error_
->
buffer
())));
}
if
(
!
runtime
->
IsNonUniformWorkgroupsSupported
())
{
kernel_
.
setArg
(
idx
++
,
gws
[
0
]);
kernel_
.
setArg
(
idx
++
,
gws
[
1
]);
...
...
@@ -181,6 +212,13 @@ void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()(
<<
input_tensor
->
dim
(
1
)
<<
"_"
<<
input_tensor
->
dim
(
2
)
<<
"_"
<<
input_tensor
->
dim
(
3
);
TuningOrRun2DKernel
(
kernel_
,
ss
.
str
(),
gws
,
lws
,
future
);
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
kernel_error_
->
Map
(
nullptr
);
char
*
kerror_code
=
kernel_error_
->
mutable_data
<
char
>
();
MACE_CHECK
(
*
kerror_code
==
'0'
)
<<
"Kernel error code: "
<<
*
kerror_code
;
kernel_error_
->
UnMap
();
}
}
template
struct
WinogradTransformFunctor
<
DeviceType
::
OPENCL
,
float
>;
...
...
mace/kernels/pooling.h
浏览文件 @
9d8505f4
...
...
@@ -198,6 +198,7 @@ struct PoolingFunctor<DeviceType::OPENCL, T> : PoolingFunctorBase {
cl
::
Kernel
kernel_
;
uint32_t
kwg_size_
;
std
::
unique_ptr
<
BufferBase
>
kernel_error_
;
std
::
vector
<
index_t
>
input_shape_
;
};
...
...
mace/kernels/resize_bilinear.h
浏览文件 @
9d8505f4
...
...
@@ -174,6 +174,7 @@ struct ResizeBilinearFunctor<DeviceType::OPENCL, T>
cl
::
Kernel
kernel_
;
uint32_t
kwg_size_
;
std
::
unique_ptr
<
BufferBase
>
kernel_error_
;
std
::
vector
<
index_t
>
input_shape_
;
};
...
...
mace/kernels/slice.h
浏览文件 @
9d8505f4
...
...
@@ -62,6 +62,7 @@ struct SliceFunctor<DeviceType::OPENCL, T> {
StatsFuture
*
future
);
cl
::
Kernel
kernel_
;
uint32_t
kwg_size_
;
std
::
unique_ptr
<
BufferBase
>
kernel_error_
;
};
}
// namespace kernels
...
...
mace/kernels/softmax.h
浏览文件 @
9d8505f4
...
...
@@ -67,6 +67,7 @@ struct SoftmaxFunctor<DeviceType::OPENCL, T> {
cl
::
Kernel
kernel_
;
uint32_t
kwg_size_
;
std
::
unique_ptr
<
BufferBase
>
kernel_error_
;
std
::
vector
<
index_t
>
input_shape_
;
};
...
...
mace/kernels/space_to_batch.h
浏览文件 @
9d8505f4
...
...
@@ -57,6 +57,7 @@ struct SpaceToBatchFunctor<DeviceType::OPENCL, T> : SpaceToBatchFunctorBase {
cl
::
Kernel
kernel_
;
uint32_t
kwg_size_
;
std
::
unique_ptr
<
BufferBase
>
kernel_error_
;
std
::
vector
<
index_t
>
space_shape_
;
};
...
...
mace/kernels/winograd_transform.h
浏览文件 @
9d8505f4
...
...
@@ -52,6 +52,7 @@ struct WinogradTransformFunctor<DeviceType::OPENCL, T>
cl
::
Kernel
kernel_
;
uint32_t
kwg_size_
;
std
::
unique_ptr
<
BufferBase
>
kernel_error_
;
std
::
vector
<
index_t
>
input_shape_
;
};
...
...
@@ -110,6 +111,7 @@ struct WinogradInverseTransformFunctor<DeviceType::OPENCL, T>
cl
::
Kernel
kernel_
;
uint32_t
kwg_size_
;
std
::
unique_ptr
<
BufferBase
>
kernel_error_
;
std
::
vector
<
index_t
>
input_shape_
;
};
...
...
tools/bazel_adb_run.py
浏览文件 @
9d8505f4
...
...
@@ -99,7 +99,8 @@ def main(unused_args):
args
=
FLAGS
.
args
,
opencl_profiling
=
1
,
vlog_level
=
0
,
device_bin_path
=
"/data/local/tmp/mace"
)
device_bin_path
=
"/data/local/tmp/mace"
,
out_of_range_check
=
1
)
globals
()[
FLAGS
.
stdout_processor
](
stdouts
,
device_properties
,
target_abi
)
if
__name__
==
"__main__"
:
...
...
tools/sh_commands.py
浏览文件 @
9d8505f4
...
...
@@ -60,7 +60,8 @@ def adb_run(serialno, host_bin_path, bin_name,
args
=
""
,
opencl_profiling
=
1
,
vlog_level
=
0
,
device_bin_path
=
"/data/local/tmp/mace"
):
device_bin_path
=
"/data/local/tmp/mace"
,
out_of_range_check
=
1
):
host_bin_full_path
=
"%s/%s"
%
(
host_bin_path
,
bin_name
)
device_bin_full_path
=
"%s/%s"
%
(
device_bin_path
,
bin_name
)
device_cl_path
=
"%s/cl"
%
device_bin_path
...
...
@@ -77,8 +78,8 @@ def adb_run(serialno, host_bin_path, bin_name,
stdout_buff
=
[]
process_output
=
make_output_processor
(
stdout_buff
)
p
=
sh
.
adb
(
"-s"
,
serialno
,
"shell"
,
"MACE_OPENCL_PROFILING=%d MACE_KERNEL_PATH=%s MACE_CPP_MIN_VLOG_LEVEL=%d %s %s"
%
(
opencl_profiling
,
device_cl_path
,
vlog_level
,
device_bin_full_path
,
args
),
"MACE_O
UT_OF_RANGE_CHECK=%d MACE_O
PENCL_PROFILING=%d MACE_KERNEL_PATH=%s MACE_CPP_MIN_VLOG_LEVEL=%d %s %s"
%
(
o
ut_of_range_check
,
o
pencl_profiling
,
device_cl_path
,
vlog_level
,
device_bin_full_path
,
args
),
_out
=
process_output
,
_bg
=
True
,
_err_to_out
=
True
)
p
.
wait
()
return
""
.
join
(
stdout_buff
)
...
...
tools/tuning_run.sh
浏览文件 @
9d8505f4
...
...
@@ -70,6 +70,7 @@ else
ADB_CMD_STR
=
"LD_LIBRARY_PATH=
${
PHONE_DATA_DIR
}
\
MACE_TUNING=
${
tuning_flag
}
\
MACE_OUT_OF_RANGE_CHECK="
1
"
\
MACE_CPP_MIN_VLOG_LEVEL=
$VLOG_LEVEL
\
MACE_RUN_PARAMETER_PATH=
${
PHONE_DATA_DIR
}
/mace_run.config
\
MACE_KERNEL_PATH=
$KERNEL_DIR
\
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录