Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
Xiaomi
Mace
提交
968fedc9
Mace
项目概览
Xiaomi
/
Mace
通知
106
Star
40
Fork
27
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
Mace
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
提交
968fedc9
编写于
4月 10, 2018
作者:
L
Liangliang He
浏览文件
操作
浏览文件
下载
差异文件
Merge branch 'out_of_range_check' into 'master'
Out of range check. See merge request !353
上级
6656855a
edc7c7ba
变更
72
显示空白变更内容
内联
并排
Showing
72 changed files
with
943 additions
and
124 deletions
+943
-124
mace/core/runtime/opencl/opencl_runtime.cc
mace/core/runtime/opencl/opencl_runtime.cc
+12
-0
mace/core/runtime/opencl/opencl_runtime.h
mace/core/runtime/opencl/opencl_runtime.h
+2
-0
mace/kernels/BUILD
mace/kernels/BUILD
+2
-0
mace/kernels/activation.h
mace/kernels/activation.h
+2
-0
mace/kernels/addn.h
mace/kernels/addn.h
+2
-0
mace/kernels/batch_norm.h
mace/kernels/batch_norm.h
+2
-0
mace/kernels/bias_add.h
mace/kernels/bias_add.h
+2
-0
mace/kernels/buffer_to_image.h
mace/kernels/buffer_to_image.h
+5
-1
mace/kernels/channel_shuffle.h
mace/kernels/channel_shuffle.h
+2
-0
mace/kernels/concat.h
mace/kernels/concat.h
+2
-0
mace/kernels/conv_2d.h
mace/kernels/conv_2d.h
+2
-0
mace/kernels/cwise.h
mace/kernels/cwise.h
+2
-0
mace/kernels/depth_to_space.h
mace/kernels/depth_to_space.h
+2
-0
mace/kernels/depthwise_conv2d.h
mace/kernels/depthwise_conv2d.h
+2
-0
mace/kernels/eltwise.h
mace/kernels/eltwise.h
+2
-0
mace/kernels/fully_connected.h
mace/kernels/fully_connected.h
+2
-0
mace/kernels/matmul.h
mace/kernels/matmul.h
+3
-1
mace/kernels/opencl/activation_opencl.cc
mace/kernels/opencl/activation_opencl.cc
+19
-0
mace/kernels/opencl/addn.cc
mace/kernels/opencl/addn.cc
+19
-0
mace/kernels/opencl/batch_norm_opencl.cc
mace/kernels/opencl/batch_norm_opencl.cc
+19
-1
mace/kernels/opencl/bias_add_opencl.cc
mace/kernels/opencl/bias_add_opencl.cc
+18
-0
mace/kernels/opencl/buffer_to_image.cc
mace/kernels/opencl/buffer_to_image.cc
+22
-0
mace/kernels/opencl/channel_shuffle.cc
mace/kernels/opencl/channel_shuffle.cc
+19
-0
mace/kernels/opencl/cl/activation.cl
mace/kernels/opencl/cl/activation.cl
+3
-2
mace/kernels/opencl/cl/addn.cl
mace/kernels/opencl/cl/addn.cl
+2
-1
mace/kernels/opencl/cl/batch_norm.cl
mace/kernels/opencl/cl/batch_norm.cl
+2
-1
mace/kernels/opencl/cl/bias_add.cl
mace/kernels/opencl/cl/bias_add.cl
+3
-1
mace/kernels/opencl/cl/buffer_to_image.cl
mace/kernels/opencl/cl/buffer_to_image.cl
+59
-32
mace/kernels/opencl/cl/channel_shuffle.cl
mace/kernels/opencl/cl/channel_shuffle.cl
+3
-1
mace/kernels/opencl/cl/common.h
mace/kernels/opencl/cl/common.h
+37
-2
mace/kernels/opencl/cl/concat.cl
mace/kernels/opencl/cl/concat.cl
+10
-4
mace/kernels/opencl/cl/conv_2d.cl
mace/kernels/opencl/cl/conv_2d.cl
+3
-1
mace/kernels/opencl/cl/conv_2d_1x1.cl
mace/kernels/opencl/cl/conv_2d_1x1.cl
+2
-1
mace/kernels/opencl/cl/conv_2d_3x3.cl
mace/kernels/opencl/cl/conv_2d_3x3.cl
+2
-2
mace/kernels/opencl/cl/cwise.cl
mace/kernels/opencl/cl/cwise.cl
+2
-1
mace/kernels/opencl/cl/depth_to_space.cl
mace/kernels/opencl/cl/depth_to_space.cl
+31
-21
mace/kernels/opencl/cl/depthwise_conv2d.cl
mace/kernels/opencl/cl/depthwise_conv2d.cl
+4
-2
mace/kernels/opencl/cl/eltwise.cl
mace/kernels/opencl/cl/eltwise.cl
+2
-1
mace/kernels/opencl/cl/fully_connected.cl
mace/kernels/opencl/cl/fully_connected.cl
+6
-2
mace/kernels/opencl/cl/matmul.cl
mace/kernels/opencl/cl/matmul.cl
+6
-1
mace/kernels/opencl/cl/pooling.cl
mace/kernels/opencl/cl/pooling.cl
+4
-2
mace/kernels/opencl/cl/resize_bilinear.cl
mace/kernels/opencl/cl/resize_bilinear.cl
+3
-1
mace/kernels/opencl/cl/slice.cl
mace/kernels/opencl/cl/slice.cl
+5
-3
mace/kernels/opencl/cl/softmax.cl
mace/kernels/opencl/cl/softmax.cl
+2
-1
mace/kernels/opencl/cl/space_to_batch.cl
mace/kernels/opencl/cl/space_to_batch.cl
+6
-2
mace/kernels/opencl/cl/winograd_transform.cl
mace/kernels/opencl/cl/winograd_transform.cl
+4
-2
mace/kernels/opencl/concat.cc
mace/kernels/opencl/concat.cc
+44
-4
mace/kernels/opencl/conv_2d_opencl.cc
mace/kernels/opencl/conv_2d_opencl.cc
+9
-6
mace/kernels/opencl/conv_2d_opencl_1x1.cc
mace/kernels/opencl/conv_2d_opencl_1x1.cc
+21
-1
mace/kernels/opencl/conv_2d_opencl_3x3.cc
mace/kernels/opencl/conv_2d_opencl_3x3.cc
+21
-1
mace/kernels/opencl/conv_2d_opencl_general.cc
mace/kernels/opencl/conv_2d_opencl_general.cc
+21
-1
mace/kernels/opencl/cwise_opencl.cc
mace/kernels/opencl/cwise_opencl.cc
+19
-0
mace/kernels/opencl/depth_to_space_opencl.cc
mace/kernels/opencl/depth_to_space_opencl.cc
+35
-9
mace/kernels/opencl/depthwise_conv_opencl.cc
mace/kernels/opencl/depthwise_conv_opencl.cc
+22
-2
mace/kernels/opencl/eltwise_opencl.cc
mace/kernels/opencl/eltwise_opencl.cc
+18
-0
mace/kernels/opencl/fully_connected_opencl.cc
mace/kernels/opencl/fully_connected_opencl.cc
+46
-6
mace/kernels/opencl/matmul.cc
mace/kernels/opencl/matmul.cc
+19
-0
mace/kernels/opencl/out_of_range_check_test.cc
mace/kernels/opencl/out_of_range_check_test.cc
+152
-0
mace/kernels/opencl/pooling_opencl.cc
mace/kernels/opencl/pooling_opencl.cc
+19
-0
mace/kernels/opencl/resize_bilinear_opencl.cc
mace/kernels/opencl/resize_bilinear_opencl.cc
+19
-0
mace/kernels/opencl/slice.cc
mace/kernels/opencl/slice.cc
+18
-0
mace/kernels/opencl/softmax_opencl.cc
mace/kernels/opencl/softmax_opencl.cc
+19
-0
mace/kernels/opencl/space_to_batch_opencl.cc
mace/kernels/opencl/space_to_batch_opencl.cc
+19
-0
mace/kernels/opencl/winograd_transform.cc
mace/kernels/opencl/winograd_transform.cc
+38
-0
mace/kernels/pooling.h
mace/kernels/pooling.h
+2
-0
mace/kernels/resize_bilinear.h
mace/kernels/resize_bilinear.h
+2
-0
mace/kernels/slice.h
mace/kernels/slice.h
+2
-0
mace/kernels/softmax.h
mace/kernels/softmax.h
+2
-0
mace/kernels/space_to_batch.h
mace/kernels/space_to_batch.h
+2
-0
mace/kernels/winograd_transform.h
mace/kernels/winograd_transform.h
+3
-0
tools/bazel_adb_run.py
tools/bazel_adb_run.py
+2
-1
tools/sh_commands.py
tools/sh_commands.py
+4
-3
未找到文件。
mace/core/runtime/opencl/opencl_runtime.cc
浏览文件 @
968fedc9
...
...
@@ -322,6 +322,14 @@ OpenCLRuntime::OpenCLRuntime(GPUPerfHint gpu_perf_hint,
}
}
}
const
char
*
out_of_range_check
=
getenv
(
"MACE_OUT_OF_RANGE_CHECK"
);
if
(
out_of_range_check
!=
nullptr
&&
strlen
(
out_of_range_check
)
==
1
&&
out_of_range_check
[
0
]
==
'1'
)
{
this
->
out_of_range_check_
=
true
;
}
else
{
this
->
out_of_range_check_
=
false
;
}
}
OpenCLRuntime
::~
OpenCLRuntime
()
{
...
...
@@ -578,4 +586,8 @@ const std::string OpenCLRuntime::ParseDeviceVersion(
return
words
[
1
];
}
const
bool
OpenCLRuntime
::
IsOutOfRangeCheckEnabled
()
const
{
return
out_of_range_check_
;
}
}
// namespace mace
mace/core/runtime/opencl/opencl_runtime.h
浏览文件 @
968fedc9
...
...
@@ -73,6 +73,7 @@ class OpenCLRuntime {
uint64_t
GetKernelMaxWorkGroupSize
(
const
cl
::
Kernel
&
kernel
);
uint64_t
GetKernelWaveSize
(
const
cl
::
Kernel
&
kernel
);
const
bool
IsNonUniformWorkgroupsSupported
();
const
bool
IsOutOfRangeCheckEnabled
()
const
;
const
GPUType
ParseGPUType
(
const
std
::
string
&
device_name
);
const
std
::
string
ParseDeviceVersion
(
const
std
::
string
&
device_version
);
void
SaveBuiltCLProgram
();
...
...
@@ -111,6 +112,7 @@ class OpenCLRuntime {
std
::
mutex
program_build_mutex_
;
GPUType
gpu_type_
;
std
::
string
opencl_version_
;
bool
out_of_range_check_
;
std
::
string
platform_info_
;
bool
program_map_changed_
;
std
::
unique_ptr
<
KVStorage
>
storage_
;
...
...
mace/kernels/BUILD
浏览文件 @
968fedc9
...
...
@@ -20,6 +20,7 @@ cc_library(
exclude
=
[
"*_test.cc"
,
"arm/*_test.cc"
,
"opencl/*_test.cc"
,
],
),
hdrs
=
glob
([
...
...
@@ -42,6 +43,7 @@ cc_test(
[
"*_test.cc"
,
"arm/*_test.cc"
,
"opencl/*_test.cc"
,
],
),
copts
=
if_openmp_enabled
([
"-fopenmp"
])
+
if_neon_enabled
([
"-DMACE_ENABLE_NEON"
]),
...
...
mace/kernels/activation.h
浏览文件 @
968fedc9
...
...
@@ -6,6 +6,7 @@
#define MACE_KERNELS_ACTIVATION_H_
#include <algorithm>
#include <memory>
#include <string>
#include <vector>
...
...
@@ -171,6 +172,7 @@ class ActivationFunctor<DeviceType::OPENCL, T> {
T
relux_max_limit_
;
cl
::
Kernel
kernel_
;
uint32_t
kwg_size_
;
std
::
unique_ptr
<
BufferBase
>
kernel_error_
;
std
::
string
tuning_key_prefix_
;
std
::
vector
<
index_t
>
input_shape_
;
};
...
...
mace/kernels/addn.h
浏览文件 @
968fedc9
...
...
@@ -9,6 +9,7 @@
#include <arm_neon.h>
#endif
#include <algorithm>
#include <memory>
#include <vector>
#include "mace/core/future.h"
...
...
@@ -85,6 +86,7 @@ struct AddNFunctor<DeviceType::OPENCL, T> {
cl
::
Kernel
kernel_
;
uint32_t
kwg_size_
;
std
::
unique_ptr
<
BufferBase
>
kernel_error_
;
std
::
vector
<
index_t
>
input_shape_
;
};
...
...
mace/kernels/batch_norm.h
浏览文件 @
968fedc9
...
...
@@ -8,6 +8,7 @@
#if defined(MACE_ENABLE_NEON) && defined(__aarch64__)
#include <arm_neon.h>
#endif
#include <memory>
#include <vector>
#include "mace/core/future.h"
...
...
@@ -165,6 +166,7 @@ struct BatchNormFunctor<DeviceType::OPENCL, T> : BatchNormFunctorBase {
StatsFuture
*
future
);
cl
::
Kernel
kernel_
;
uint32_t
kwg_size_
;
std
::
unique_ptr
<
BufferBase
>
kernel_error_
;
std
::
vector
<
index_t
>
input_shape_
;
};
...
...
mace/kernels/bias_add.h
浏览文件 @
968fedc9
...
...
@@ -5,6 +5,7 @@
#ifndef MACE_KERNELS_BIAS_ADD_H_
#define MACE_KERNELS_BIAS_ADD_H_
#include <memory>
#include <vector>
#include "mace/core/future.h"
...
...
@@ -65,6 +66,7 @@ struct BiasAddFunctor<DeviceType::OPENCL, T> {
StatsFuture
*
future
);
cl
::
Kernel
kernel_
;
uint32_t
kwg_size_
;
std
::
unique_ptr
<
BufferBase
>
kernel_error_
;
std
::
vector
<
index_t
>
input_shape_
;
};
...
...
mace/kernels/buffer_to_image.h
浏览文件 @
968fedc9
...
...
@@ -5,6 +5,8 @@
#ifndef MACE_KERNELS_BUFFER_TO_IMAGE_H_
#define MACE_KERNELS_BUFFER_TO_IMAGE_H_
#include <memory>
#include "mace/core/future.h"
#include "mace/core/tensor.h"
#include "mace/kernels/opencl/helper.h"
...
...
@@ -13,8 +15,10 @@ namespace mace {
namespace
kernels
{
struct
BufferToImageFunctorBase
{
explicit
BufferToImageFunctorBase
(
bool
i2b
)
:
i2b_
(
i2b
)
{}
explicit
BufferToImageFunctorBase
(
bool
i2b
)
:
i2b_
(
i2b
),
kernel_error_
(
nullptr
)
{}
bool
i2b_
;
std
::
unique_ptr
<
BufferBase
>
kernel_error_
;
};
template
<
DeviceType
D
,
typename
T
>
...
...
mace/kernels/channel_shuffle.h
浏览文件 @
968fedc9
...
...
@@ -5,6 +5,7 @@
#ifndef MACE_KERNELS_CHANNEL_SHUFFLE_H_
#define MACE_KERNELS_CHANNEL_SHUFFLE_H_
#include <memory>
#include <vector>
#include "mace/core/future.h"
...
...
@@ -57,6 +58,7 @@ struct ChannelShuffleFunctor<DeviceType::OPENCL, T> {
cl
::
Kernel
kernel_
;
uint32_t
kwg_size_
;
std
::
unique_ptr
<
BufferBase
>
kernel_error_
;
const
int
groups_
;
std
::
vector
<
index_t
>
input_shape_
;
};
...
...
mace/kernels/concat.h
浏览文件 @
968fedc9
...
...
@@ -5,6 +5,7 @@
#ifndef MACE_KERNELS_CONCAT_H_
#define MACE_KERNELS_CONCAT_H_
#include <memory>
#include <vector>
#include "mace/core/future.h"
...
...
@@ -86,6 +87,7 @@ struct ConcatFunctor<DeviceType::OPENCL, T> : ConcatFunctorBase {
StatsFuture
*
future
);
cl
::
Kernel
kernel_
;
uint32_t
kwg_size_
;
std
::
unique_ptr
<
BufferBase
>
kernel_error_
;
std
::
vector
<
index_t
>
input_shape_
;
};
...
...
mace/kernels/conv_2d.h
浏览文件 @
968fedc9
...
...
@@ -9,6 +9,7 @@
#include <arm_neon.h>
#endif
#include <algorithm>
#include <memory>
#include <vector>
#include "mace/core/future.h"
...
...
@@ -468,6 +469,7 @@ struct Conv2dFunctor<DeviceType::OPENCL, T> : Conv2dFunctorBase {
cl
::
Kernel
kernel_
;
uint32_t
kwg_size_
;
std
::
unique_ptr
<
BufferBase
>
kernel_error_
;
std
::
vector
<
index_t
>
input_shape_
;
};
...
...
mace/kernels/cwise.h
浏览文件 @
968fedc9
...
...
@@ -5,6 +5,7 @@
#define MACE_KERNELS_CWISE_H_
#include <algorithm>
#include <memory>
#include <vector>
#include "mace/core/future.h"
...
...
@@ -115,6 +116,7 @@ struct CWiseFunctor<DeviceType::OPENCL, T> : CWiseFunctorBase {
cl
::
Kernel
kernel_
;
uint32_t
kwg_size_
;
std
::
unique_ptr
<
BufferBase
>
kernel_error_
;
std
::
vector
<
index_t
>
input_shape_
;
};
...
...
mace/kernels/depth_to_space.h
浏览文件 @
968fedc9
...
...
@@ -4,6 +4,7 @@
#ifndef MACE_KERNELS_DEPTH_TO_SPACE_H_
#define MACE_KERNELS_DEPTH_TO_SPACE_H_
#include <memory>
#include <vector>
#include "mace/core/future.h"
...
...
@@ -109,6 +110,7 @@ struct DepthToSpaceOpFunctor<DeviceType::OPENCL, T> {
cl
::
Kernel
kernel_
;
uint32_t
kwg_size_
;
std
::
unique_ptr
<
BufferBase
>
kernel_error_
;
const
int
block_size_
;
bool
d2s_
;
std
::
vector
<
index_t
>
input_shape_
;
...
...
mace/kernels/depthwise_conv2d.h
浏览文件 @
968fedc9
...
...
@@ -9,6 +9,7 @@
#include <arm_neon.h>
#endif
#include <algorithm>
#include <memory>
#include <vector>
#include "mace/core/future.h"
...
...
@@ -454,6 +455,7 @@ struct DepthwiseConv2dFunctor<DeviceType::OPENCL, T>
cl
::
Kernel
kernel_
;
uint32_t
kwg_size_
;
std
::
unique_ptr
<
BufferBase
>
kernel_error_
;
std
::
vector
<
index_t
>
input_shape_
;
};
...
...
mace/kernels/eltwise.h
浏览文件 @
968fedc9
...
...
@@ -5,6 +5,7 @@
#define MACE_KERNELS_ELTWISE_H_
#include <algorithm>
#include <memory>
#include <vector>
#include "mace/core/future.h"
...
...
@@ -105,6 +106,7 @@ struct EltwiseFunctor<DeviceType::OPENCL, T> : EltwiseFunctorBase {
cl
::
Kernel
kernel_
;
uint32_t
kwg_size_
;
std
::
unique_ptr
<
BufferBase
>
kernel_error_
;
std
::
vector
<
index_t
>
input_shape_
;
};
...
...
mace/kernels/fully_connected.h
浏览文件 @
968fedc9
...
...
@@ -5,6 +5,7 @@
#ifndef MACE_KERNELS_FULLY_CONNECTED_H_
#define MACE_KERNELS_FULLY_CONNECTED_H_
#include <memory>
#include <vector>
#include "mace/core/future.h"
...
...
@@ -107,6 +108,7 @@ struct FullyConnectedFunctor<DeviceType::OPENCL, T> : FullyConnectedBase {
std
::
vector
<
uint32_t
>
gws_
;
std
::
vector
<
uint32_t
>
lws_
;
std
::
vector
<
index_t
>
input_shape_
;
std
::
unique_ptr
<
BufferBase
>
kernel_error_
;
};
}
// namespace kernels
...
...
mace/kernels/matmul.h
浏览文件 @
968fedc9
...
...
@@ -9,9 +9,10 @@
#include <arm_neon.h>
#endif
#include <algorithm>
#include <memory>
#include <string>
#include <vector>
#include <algorithm>
#include "mace/core/future.h"
#include "mace/core/runtime/opencl/cl2_header.h"
...
...
@@ -68,6 +69,7 @@ struct MatMulFunctor<DeviceType::OPENCL, T> {
cl
::
Kernel
kernel_
;
uint32_t
kwg_size_
;
std
::
unique_ptr
<
BufferBase
>
kernel_error_
;
};
}
// namespace kernels
...
...
mace/kernels/opencl/activation_opencl.cc
浏览文件 @
968fedc9
...
...
@@ -33,6 +33,14 @@ void ActivationFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
auto
dt
=
DataTypeToEnum
<
T
>::
value
;
built_options
.
emplace
(
"-DDATA_TYPE="
+
DtToUpstreamCLDt
(
dt
));
built_options
.
emplace
(
"-DCMD_DATA_TYPE="
+
DtToUpstreamCLCMDDt
(
dt
));
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
built_options
.
emplace
(
"-DOUT_OF_RANGE_CHECK"
);
kernel_error_
=
std
::
move
(
std
::
unique_ptr
<
Buffer
>
(
new
Buffer
(
GetDeviceAllocator
(
DeviceType
::
OPENCL
),
1
)));
kernel_error_
->
Map
(
nullptr
);
*
(
kernel_error_
->
mutable_data
<
char
>
())
=
0
;
kernel_error_
->
UnMap
();
}
if
(
runtime
->
IsNonUniformWorkgroupsSupported
())
{
built_options
.
emplace
(
"-DNON_UNIFORM_WORK_GROUP"
);
}
...
...
@@ -72,6 +80,10 @@ void ActivationFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
if
(
!
IsVecEqual
(
input_shape_
,
input
->
shape
()))
{
int
idx
=
0
;
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
kernel_
.
setArg
(
idx
++
,
*
(
static_cast
<
cl
::
Buffer
*>
(
kernel_error_
->
buffer
())));
}
if
(
!
runtime
->
IsNonUniformWorkgroupsSupported
())
{
kernel_
.
setArg
(
idx
++
,
gws
[
0
]);
kernel_
.
setArg
(
idx
++
,
gws
[
1
]);
...
...
@@ -93,6 +105,13 @@ void ActivationFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
Concat
(
tuning_key_prefix_
,
output
->
dim
(
0
),
output
->
dim
(
1
),
output
->
dim
(
2
),
output
->
dim
(
3
));
TuningOrRun3DKernel
(
kernel_
,
tuning_key
,
gws
,
lws
,
future
);
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
kernel_error_
->
Map
(
nullptr
);
char
*
kerror_code
=
kernel_error_
->
mutable_data
<
char
>
();
MACE_CHECK
(
*
kerror_code
==
0
)
<<
"Kernel error code: "
<<
*
kerror_code
;
kernel_error_
->
UnMap
();
}
}
template
struct
ActivationFunctor
<
DeviceType
::
OPENCL
,
float
>;
...
...
mace/kernels/opencl/addn.cc
浏览文件 @
968fedc9
...
...
@@ -45,6 +45,14 @@ void AddNFunctor<DeviceType::OPENCL, T>::operator()(
built_options
.
emplace
(
"-DDATA_TYPE="
+
DtToUpstreamCLDt
(
dt
));
built_options
.
emplace
(
"-DCMD_DATA_TYPE="
+
DtToUpstreamCLCMDDt
(
dt
));
built_options
.
emplace
(
MakeString
(
"-DINPUT_NUM="
,
input_tensors
.
size
()));
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
built_options
.
emplace
(
"-DOUT_OF_RANGE_CHECK"
);
kernel_error_
=
std
::
move
(
std
::
unique_ptr
<
Buffer
>
(
new
Buffer
(
GetDeviceAllocator
(
DeviceType
::
OPENCL
),
1
)));
kernel_error_
->
Map
(
nullptr
);
*
(
kernel_error_
->
mutable_data
<
char
>
())
=
0
;
kernel_error_
->
UnMap
();
}
if
(
runtime
->
IsNonUniformWorkgroupsSupported
())
{
built_options
.
emplace
(
"-DNON_UNIFORM_WORK_GROUP"
);
}
...
...
@@ -71,6 +79,10 @@ void AddNFunctor<DeviceType::OPENCL, T>::operator()(
output_tensor
->
ResizeImage
(
output_shape
,
output_image_shape
);
uint32_t
idx
=
0
;
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
kernel_
.
setArg
(
idx
++
,
*
(
static_cast
<
cl
::
Buffer
*>
(
kernel_error_
->
buffer
())));
}
if
(
!
runtime
->
IsNonUniformWorkgroupsSupported
())
{
kernel_
.
setArg
(
idx
++
,
gws
[
0
]);
kernel_
.
setArg
(
idx
++
,
gws
[
1
]);
...
...
@@ -88,6 +100,13 @@ void AddNFunctor<DeviceType::OPENCL, T>::operator()(
ss
<<
"addn_opencl_kernel_"
<<
output_shape
[
0
]
<<
"_"
<<
output_shape
[
1
]
<<
"_"
<<
output_shape
[
2
]
<<
"_"
<<
output_shape
[
3
];
TuningOrRun2DKernel
(
kernel_
,
ss
.
str
(),
gws
,
lws
,
future
);
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
kernel_error_
->
Map
(
nullptr
);
char
*
kerror_code
=
kernel_error_
->
mutable_data
<
char
>
();
MACE_CHECK
(
*
kerror_code
==
0
)
<<
"Kernel error code: "
<<
*
kerror_code
;
kernel_error_
->
UnMap
();
}
}
template
struct
AddNFunctor
<
DeviceType
::
OPENCL
,
float
>;
...
...
mace/kernels/opencl/batch_norm_opencl.cc
浏览文件 @
968fedc9
...
...
@@ -36,7 +36,6 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
auto
runtime
=
OpenCLRuntime
::
Global
();
if
(
kernel_
.
get
()
==
nullptr
)
{
std
::
set
<
std
::
string
>
built_options
;
auto
dt
=
DataTypeToEnum
<
T
>::
value
;
...
...
@@ -44,6 +43,14 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
built_options
.
emplace
(
"-Dbatch_norm="
+
kernel_name
);
built_options
.
emplace
(
"-DDATA_TYPE="
+
DtToUpstreamCLDt
(
dt
));
built_options
.
emplace
(
"-DCMD_DATA_TYPE="
+
DtToUpstreamCLCMDDt
(
dt
));
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
built_options
.
emplace
(
"-DOUT_OF_RANGE_CHECK"
);
kernel_error_
=
std
::
move
(
std
::
unique_ptr
<
Buffer
>
(
new
Buffer
(
GetDeviceAllocator
(
DeviceType
::
OPENCL
),
1
)));
kernel_error_
->
Map
(
nullptr
);
*
(
kernel_error_
->
mutable_data
<
char
>
())
=
0
;
kernel_error_
->
UnMap
();
}
if
(
runtime
->
IsNonUniformWorkgroupsSupported
())
{
built_options
.
emplace
(
"-DNON_UNIFORM_WORK_GROUP"
);
}
...
...
@@ -76,6 +83,10 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
}
if
(
!
IsVecEqual
(
input_shape_
,
input
->
shape
()))
{
uint32_t
idx
=
0
;
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
kernel_
.
setArg
(
idx
++
,
*
(
static_cast
<
cl
::
Buffer
*>
(
kernel_error_
->
buffer
())));
}
if
(
!
runtime
->
IsNonUniformWorkgroupsSupported
())
{
kernel_
.
setArg
(
idx
++
,
gws
[
0
]);
kernel_
.
setArg
(
idx
++
,
gws
[
1
]);
...
...
@@ -100,6 +111,13 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
Concat
(
"batch_norm_opencl_kernel_"
,
activation_
,
output
->
dim
(
0
),
output
->
dim
(
1
),
output
->
dim
(
2
),
output
->
dim
(
3
),
folded_constant_
);
TuningOrRun3DKernel
(
kernel_
,
tuning_key
,
gws
,
lws
,
future
);
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
kernel_error_
->
Map
(
nullptr
);
char
*
kerror_code
=
kernel_error_
->
mutable_data
<
char
>
();
MACE_CHECK
(
*
kerror_code
==
0
)
<<
"Kernel error code: "
<<
*
kerror_code
;
kernel_error_
->
UnMap
();
}
}
template
struct
BatchNormFunctor
<
DeviceType
::
OPENCL
,
float
>;
...
...
mace/kernels/opencl/bias_add_opencl.cc
浏览文件 @
968fedc9
...
...
@@ -36,6 +36,14 @@ void BiasAddFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
built_options
.
emplace
(
"-Dbias_add="
+
kernel_name
);
built_options
.
emplace
(
"-DDATA_TYPE="
+
DtToUpstreamCLDt
(
dt
));
built_options
.
emplace
(
"-DCMD_DATA_TYPE="
+
DtToUpstreamCLCMDDt
(
dt
));
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
built_options
.
emplace
(
"-DOUT_OF_RANGE_CHECK"
);
kernel_error_
=
std
::
move
(
std
::
unique_ptr
<
Buffer
>
(
new
Buffer
(
GetDeviceAllocator
(
DeviceType
::
OPENCL
),
1
)));
kernel_error_
->
Map
(
nullptr
);
*
(
kernel_error_
->
mutable_data
<
char
>
())
=
0
;
kernel_error_
->
UnMap
();
}
if
(
runtime
->
IsNonUniformWorkgroupsSupported
())
{
built_options
.
emplace
(
"-DNON_UNIFORM_WORK_GROUP"
);
}
...
...
@@ -46,6 +54,10 @@ void BiasAddFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
}
if
(
!
IsVecEqual
(
input_shape_
,
input
->
shape
()))
{
uint32_t
idx
=
0
;
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
kernel_
.
setArg
(
idx
++
,
*
(
static_cast
<
cl
::
Buffer
*>
(
kernel_error_
->
buffer
())));
}
if
(
!
runtime
->
IsNonUniformWorkgroupsSupported
())
{
kernel_
.
setArg
(
idx
++
,
gws
[
0
]);
kernel_
.
setArg
(
idx
++
,
gws
[
1
]);
...
...
@@ -77,6 +89,12 @@ void BiasAddFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
cl
::
NDRange
(
lws
[
0
],
lws
[
1
],
lws
[
2
]),
nullptr
,
&
event
);
}
MACE_CHECK_CL_SUCCESS
(
error
);
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
kernel_error_
->
Map
(
nullptr
);
char
*
kerror_code
=
kernel_error_
->
mutable_data
<
char
>
();
MACE_CHECK
(
*
kerror_code
==
0
)
<<
"Kernel error code: "
<<
*
kerror_code
;
kernel_error_
->
UnMap
();
}
if
(
future
!=
nullptr
)
{
future
->
wait_fn
=
[
runtime
,
event
](
CallStats
*
stats
)
{
event
.
wait
();
...
...
mace/kernels/opencl/buffer_to_image.cc
浏览文件 @
968fedc9
...
...
@@ -13,6 +13,7 @@ template <typename T>
void
BufferToImageFunctor
<
DeviceType
::
OPENCL
,
T
>::
operator
()(
Tensor
*
buffer
,
const
BufferType
type
,
Tensor
*
image
,
StatsFuture
*
future
)
{
std
::
vector
<
size_t
>
image_shape
;
if
(
!
i2b_
)
{
CalImage2DShape
(
buffer
->
shape
(),
type
,
&
image_shape
);
if
(
type
==
WINOGRAD_FILTER
)
{
...
...
@@ -80,10 +81,25 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(
built_options
.
emplace
(
"-DCMD_DATA_TYPE="
+
DtToUpstreamCLCMDDt
(
DataTypeToEnum
<
T
>::
value
));
}
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
built_options
.
emplace
(
"-DOUT_OF_RANGE_CHECK"
);
if
(
!
kernel_error_
)
{
kernel_error_
=
std
::
move
(
std
::
unique_ptr
<
Buffer
>
(
new
Buffer
(
GetDeviceAllocator
(
DeviceType
::
OPENCL
),
1
)));
kernel_error_
->
Map
(
nullptr
);
*
(
kernel_error_
->
mutable_data
<
char
>
())
=
0
;
kernel_error_
->
UnMap
();
}
}
auto
b2f_kernel
=
runtime
->
BuildKernel
(
"buffer_to_image"
,
obfuscated_kernel_name
,
built_options
);
uint32_t
idx
=
0
;
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
b2f_kernel
.
setArg
(
idx
++
,
*
(
static_cast
<
cl
::
Buffer
*>
(
kernel_error_
->
buffer
())));
}
if
(
!
runtime
->
IsNonUniformWorkgroupsSupported
())
{
b2f_kernel
.
setArg
(
idx
++
,
gws
[
0
]);
b2f_kernel
.
setArg
(
idx
++
,
gws
[
1
]);
...
...
@@ -135,6 +151,12 @@ void BufferToImageFunctor<DeviceType::OPENCL, T>::operator()(
cl
::
NDRange
(
lws
[
0
],
lws
[
1
]),
nullptr
,
&
event
);
}
MACE_CHECK_CL_SUCCESS
(
error
);
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
kernel_error_
->
Map
(
nullptr
);
char
*
kerror_code
=
kernel_error_
->
mutable_data
<
char
>
();
MACE_CHECK
(
*
kerror_code
==
0
)
<<
"Kernel error code: "
<<
*
kerror_code
;
kernel_error_
->
UnMap
();
}
if
(
future
!=
nullptr
)
{
future
->
wait_fn
=
[
runtime
,
event
](
CallStats
*
stats
)
{
event
.
wait
();
...
...
mace/kernels/opencl/channel_shuffle.cc
浏览文件 @
968fedc9
...
...
@@ -43,6 +43,14 @@ void ChannelShuffleFunctor<DeviceType::OPENCL, T>::operator()(
auto
dt
=
DataTypeToEnum
<
T
>::
value
;
built_options
.
emplace
(
"-DDATA_TYPE="
+
DtToUpstreamCLDt
(
dt
));
built_options
.
emplace
(
"-DCMD_DATA_TYPE="
+
DtToUpstreamCLCMDDt
(
dt
));
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
built_options
.
emplace
(
"-DOUT_OF_RANGE_CHECK"
);
kernel_error_
=
std
::
move
(
std
::
unique_ptr
<
Buffer
>
(
new
Buffer
(
GetDeviceAllocator
(
DeviceType
::
OPENCL
),
1
)));
kernel_error_
->
Map
(
nullptr
);
*
(
kernel_error_
->
mutable_data
<
char
>
())
=
0
;
kernel_error_
->
UnMap
();
}
if
(
runtime
->
IsNonUniformWorkgroupsSupported
())
{
built_options
.
emplace
(
"-DNON_UNIFORM_WORK_GROUP"
);
}
...
...
@@ -55,6 +63,10 @@ void ChannelShuffleFunctor<DeviceType::OPENCL, T>::operator()(
if
(
!
IsVecEqual
(
input_shape_
,
input
->
shape
()))
{
uint32_t
idx
=
0
;
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
kernel_
.
setArg
(
idx
++
,
*
(
static_cast
<
cl
::
Buffer
*>
(
kernel_error_
->
buffer
())));
}
if
(
!
runtime
->
IsNonUniformWorkgroupsSupported
())
{
kernel_
.
setArg
(
idx
++
,
gws
[
0
]);
kernel_
.
setArg
(
idx
++
,
gws
[
1
]);
...
...
@@ -76,6 +88,13 @@ void ChannelShuffleFunctor<DeviceType::OPENCL, T>::operator()(
<<
output
->
dim
(
2
)
<<
"_"
<<
output
->
dim
(
3
);
TuningOrRun3DKernel
(
kernel_
,
ss
.
str
(),
gws
,
lws
,
future
);
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
kernel_error_
->
Map
(
nullptr
);
char
*
kerror_code
=
kernel_error_
->
mutable_data
<
char
>
();
MACE_CHECK
(
*
kerror_code
==
0
)
<<
"Kernel error code: "
<<
*
kerror_code
;
kernel_error_
->
UnMap
();
}
}
template
...
...
mace/kernels/opencl/cl/activation.cl
浏览文件 @
968fedc9
#
include
<common.h>
__kernel
void
activation
(
GLOBAL_WORK_GROUP_SIZE_DIM3
__kernel
void
activation
(
KERNEL_ERROR_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM3
__read_only
image2d_t
input,
#
ifdef
USE_PRELU
__read_only
image2d_t
alpha,
...
...
@@ -29,6 +30,6 @@ __kernel void activation(GLOBAL_WORK_GROUP_SIZE_DIM3
#
else
DATA_TYPE4
out
=
do_activation
(
in,
relux_max_limit
)
;
#
endif
WRITE_IMAGET
(
output,
(
int2
)(
pos,
hb
)
,
out
)
;
}
mace/kernels/opencl/cl/addn.cl
浏览文件 @
968fedc9
#
include
<common.h>
__kernel
void
addn
(
GLOBAL_WORK_GROUP_SIZE_DIM2
__kernel
void
addn
(
KERNEL_ERROR_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM2
__read_only
image2d_t
input0,
/*
[c%4
*
w
*
c/4,
h
*
b]
*/
__read_only
image2d_t
input1,
#
if
INPUT_NUM
>
2
...
...
mace/kernels/opencl/cl/batch_norm.cl
浏览文件 @
968fedc9
#
include
<common.h>
//
Supported
data
types:
half/float
__kernel
void
batch_norm
(
GLOBAL_WORK_GROUP_SIZE_DIM3
__kernel
void
batch_norm
(
KERNEL_ERROR_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM3
__read_only
image2d_t
input,
__read_only
image2d_t
scale,
__read_only
image2d_t
offset,
...
...
mace/kernels/opencl/cl/bias_add.cl
浏览文件 @
968fedc9
#
include
<common.h>
//
Supported
data
types:
half/float
__kernel
void
bias_add
(
GLOBAL_WORK_GROUP_SIZE_DIM3
__kernel
void
bias_add
(
KERNEL_ERROR_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM3
__read_only
image2d_t
input,
__read_only
image2d_t
bias,
__write_only
image2d_t
output
)
{
...
...
@@ -22,5 +23,6 @@ __kernel void bias_add(GLOBAL_WORK_GROUP_SIZE_DIM3
DATA_TYPE4
in
=
READ_IMAGET
(
input,
SAMPLER,
(
int2
)(
pos,
hb
))
;
DATA_TYPE4
bias_value
=
READ_IMAGET
(
bias,
SAMPLER,
(
int2
)(
ch_blk,
0
))
;
DATA_TYPE4
out
=
in
+
bias_value
;
WRITE_IMAGET
(
output,
(
int2
)(
pos,
hb
)
,
out
)
;
}
mace/kernels/opencl/cl/buffer_to_image.cl
浏览文件 @
968fedc9
#
include
<common.h>
__kernel
void
filter_buffer_to_image
(
GLOBAL_WORK_GROUP_SIZE_DIM2
__kernel
void
filter_buffer_to_image
(
KERNEL_ERROR_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM2
__global
const
DATA_TYPE
*input,
/*
h,
w,
oc,
ic
*/
__private
const
int
input_offset,
__private
const
int
filter_h,
...
...
@@ -52,7 +53,8 @@ __kernel void filter_buffer_to_image(GLOBAL_WORK_GROUP_SIZE_DIM2
WRITE_IMAGET(output, coord, values);
}
__kernel void filter_image_to_buffer(GLOBAL_WORK_GROUP_SIZE_DIM2
__kernel void filter_image_to_buffer(KERNEL_ERROR_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM2
__global DATA_TYPE *output, /* h, w, oc, ic */
__private const int filter_h,
__private const int filter_w,
...
...
@@ -100,7 +102,8 @@ __kernel void filter_image_to_buffer(GLOBAL_WORK_GROUP_SIZE_DIM2
}
}
__kernel void dw_filter_buffer_to_image(GLOBAL_WORK_GROUP_SIZE_DIM2
__kernel void dw_filter_buffer_to_image(KERNEL_ERROR_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM2
__global const DATA_TYPE *input, /* h, w, ic, m */
__private const int input_offset,
__private const int filter_w,
...
...
@@ -157,7 +160,8 @@ __kernel void dw_filter_buffer_to_image(GLOBAL_WORK_GROUP_SIZE_DIM2
WRITE_IMAGET(output, coord, values);
}
__kernel void in_out_buffer_to_image(GLOBAL_WORK_GROUP_SIZE_DIM2
__kernel void in_out_buffer_to_image(KERNEL_ERROR_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM2
__global const DATA_TYPE *input, /* nhwc */
__private const int input_offset,
__private const int height,
...
...
@@ -198,7 +202,8 @@ __kernel void in_out_buffer_to_image(GLOBAL_WORK_GROUP_SIZE_DIM2
WRITE_IMAGET(output, coord, values);
}
__kernel void in_out_image_to_buffer(GLOBAL_WORK_GROUP_SIZE_DIM2
__kernel void in_out_image_to_buffer(KERNEL_ERROR_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM2
__global DATA_TYPE *output, /* nhwc */
__private const int height,
__private const int width,
...
...
@@ -237,7 +242,8 @@ __kernel void in_out_image_to_buffer(GLOBAL_WORK_GROUP_SIZE_DIM2
}
}
__kernel void arg_buffer_to_image(GLOBAL_WORK_GROUP_SIZE_DIM2
__kernel void arg_buffer_to_image(KERNEL_ERROR_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM2
__global const DATA_TYPE *input, /* nhwc */
__private const int input_offset,
__private const int count,
...
...
@@ -272,7 +278,8 @@ __kernel void arg_buffer_to_image(GLOBAL_WORK_GROUP_SIZE_DIM2
WRITE_IMAGET(output, coord, values);
}
__kernel void arg_image_to_buffer(GLOBAL_WORK_GROUP_SIZE_DIM2
__kernel void arg_image_to_buffer(KERNEL_ERROR_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM2
__global DATA_TYPE *output, /* nhwc */
__private const int count,
__read_only image2d_t input) {
...
...
@@ -305,7 +312,8 @@ __kernel void arg_image_to_buffer(GLOBAL_WORK_GROUP_SIZE_DIM2
}
__kernel void in_out_height_buffer_to_image(GLOBAL_WORK_GROUP_SIZE_DIM2
__kernel void in_out_height_buffer_to_image(KERNEL_ERROR_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM2
__global const DATA_TYPE *input, //nhwc
__private const int input_offset,
__private const int height,
...
...
@@ -347,7 +355,8 @@ __kernel void in_out_height_buffer_to_image(GLOBAL_WORK_GROUP_SIZE_DIM2
WRITE_IMAGET(output, coord, values);
}
__kernel void in_out_height_image_to_buffer(GLOBAL_WORK_GROUP_SIZE_DIM2
__kernel void in_out_height_image_to_buffer(KERNEL_ERROR_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM2
__global DATA_TYPE *output, //nhwc
__private const int height,
__private const int width,
...
...
@@ -385,7 +394,8 @@ __kernel void in_out_height_image_to_buffer(GLOBAL_WORK_GROUP_SIZE_DIM2
}
__kernel void in_out_width_buffer_to_image(GLOBAL_WORK_GROUP_SIZE_DIM2
__kernel void in_out_width_buffer_to_image(KERNEL_ERROR_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM2
__global const DATA_TYPE *input, /* nhwc */
__private const int input_offset,
__private const int height,
...
...
@@ -427,7 +437,8 @@ __kernel void in_out_width_buffer_to_image(GLOBAL_WORK_GROUP_SIZE_DIM2
}
// only support 3x3 now
__kernel void winograd_filter_buffer_to_image(GLOBAL_WORK_GROUP_SIZE_DIM2
__kernel void winograd_filter_buffer_to_image(KERNEL_ERROR_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM2
__global const DATA_TYPE *input, //Oc, Ic, H, W
__private const int input_offset,
__private const int in_channels,
...
...
@@ -495,30 +506,46 @@ __kernel void winograd_filter_buffer_to_image(GLOBAL_WORK_GROUP_SIZE_DIM2
tu3[1] = tt + tu3[1] / 2;
int2 coord = (int2)(w, h);
#pragma unroll
for (short i = 0; i < 4; ++i) {
WRITE_IMAGET(output, coord, tu0[i]);
WRITE_IMAGET(output, coord, tu0[0]);
coord.y += out_channels;
}
#pragma unroll
for (short i = 0; i < 4; ++i) {
WRITE_IMAGET(output, coord, tu1[i]);
WRITE_IMAGET(output, coord, tu0[1]);
coord.y += out_channels;
}
#pragma unroll
for (short i = 0; i < 4; ++i) {
WRITE_IMAGET(output, coord, tu2[i]);
WRITE_IMAGET(output, coord, tu0[2]);
coord.y += out_channels;
}
#pragma unroll
for (short i = 0; i < 4; ++i) {
WRITE_IMAGET(output, coord, tu3[i]);
WRITE_IMAGET(output, coord, tu0[3]);
coord.y += out_channels;
}
WRITE_IMAGET(output, coord, tu1[0]);
coord.y += out_channels;
WRITE_IMAGET(output, coord, tu1[1]);
coord.y += out_channels;
WRITE_IMAGET(output, coord, tu1[2]);
coord.y += out_channels;
WRITE_IMAGET(output, coord, tu1[3]);
coord.y += out_channels;
WRITE_IMAGET(output, coord, tu2[0]);
coord.y += out_channels;
WRITE_IMAGET(output, coord, tu2[1]);
coord.y += out_channels;
WRITE_IMAGET(output, coord, tu2[2]);
coord.y += out_channels;
WRITE_IMAGET(output, coord, tu2[3]);
coord.y += out_channels;
WRITE_IMAGET(output, coord, tu3[0]);
coord.y += out_channels;
WRITE_IMAGET(output, coord, tu3[1]);
coord.y += out_channels;
WRITE_IMAGET(output, coord, tu3[2]);
coord.y += out_channels;
WRITE_IMAGET(output, coord, tu3[3]);
}
// only support 3x3 now
__kernel void winograd_filter_image_to_buffer(GLOBAL_WORK_GROUP_SIZE_DIM2
__kernel void winograd_filter_image_to_buffer(KERNEL_ERROR_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM2
__global DATA_TYPE *output, //Oc, Ic, H, W
__private const int height,
__private const int width,
...
...
mace/kernels/opencl/cl/channel_shuffle.cl
浏览文件 @
968fedc9
#
include
<common.h>
//
assume
channes_per_group
mod
4
=
0
&&
groups
mod
4
==
0
__kernel
void
channel_shuffle
(
GLOBAL_WORK_GROUP_SIZE_DIM3
__kernel
void
channel_shuffle
(
KERNEL_ERROR_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM3
__read_only
image2d_t
input,
__private
const
int
groups,
__private
const
int
channels_per_group,
...
...
@@ -49,6 +50,7 @@ __kernel void channel_shuffle(GLOBAL_WORK_GROUP_SIZE_DIM3
out_chan_data3
=
(
DATA_TYPE4
)(
in_chan_data0.w,
in_chan_data1.w,
in_chan_data2.w,
in_chan_data3.w
)
;
int
out_x
=
mad24
(
mad24
(
group_chan_blk_idx,
groups,
g_blk
)
,
width,
width_idx
)
;
WRITE_IMAGET
(
output,
(
int2
)(
out_x,
hb_idx
)
,
out_chan_data0
)
;
out_x
+=
groups_blks_width
;
...
...
mace/kernels/opencl/cl/common.h
浏览文件 @
968fedc9
...
...
@@ -14,8 +14,19 @@
#define CMD_TYPE(cmd, type) CMD_TYPE_STR(cmd, type)
#define DATA_TYPE4 VEC_DATA_TYPE(DATA_TYPE, 4)
#define READ_IMAGET CMD_TYPE(read_image, CMD_DATA_TYPE)
#define WRITE_IMAGET CMD_TYPE(write_image, CMD_DATA_TYPE)
#ifdef OUT_OF_RANGE_CHECK
#define CHECK_OUT_OF_RANGE_FOR_IMAGE2D(image, coord) \
check_out_of_range_for_image2d(image, (coord).x, (coord).y, kernel_error);
#else
#define CHECK_OUT_OF_RANGE_FOR_IMAGE2D(image, coord)
#endif
#define READ_IMAGET(image, coord, value) \
CMD_TYPE(read_image, CMD_DATA_TYPE)(image, coord, value)
#define WRITE_IMAGET(image, coord, value) \
CHECK_OUT_OF_RANGE_FOR_IMAGE2D(image, coord) \
CMD_TYPE(write_image, CMD_DATA_TYPE)(image, coord, value);
#ifndef NON_UNIFORM_WORK_GROUP
...
...
@@ -34,6 +45,18 @@
#endif
#ifdef OUT_OF_RANGE_CHECK
#define KERNEL_ERROR_PARAMS \
__global char *kernel_error,
#else
#define KERNEL_ERROR_PARAMS
#endif
__constant
sampler_t
SAMPLER
=
CLK_NORMALIZED_COORDS_FALSE
|
CLK_ADDRESS_CLAMP
|
CLK_FILTER_NEAREST
;
...
...
@@ -61,4 +84,16 @@ inline DATA_TYPE4 do_activation(DATA_TYPE4 in,
return
out
;
}
inline
void
check_out_of_range_for_image2d
(
__write_only
image2d_t
image
,
__private
const
int
x
,
__private
const
int
y
,
global
char
*
kernel_error
)
{
#ifdef OUT_OF_RANGE_CHECK
int2
image_dim
=
get_image_dim
(
image
);
if
(
x
>=
image_dim
.
x
||
y
>=
image_dim
.
y
)
{
*
kernel_error
=
1
;
}
#endif
}
#endif // MACE_KERNELS_OPENCL_CL_COMMON_H_
mace/kernels/opencl/cl/concat.cl
浏览文件 @
968fedc9
...
...
@@ -22,7 +22,8 @@ DATA_TYPE4 stitch_vector(DATA_TYPE4 left,
}
//
Supported
data
type:
half/float
__kernel
void
concat_channel
(
GLOBAL_WORK_GROUP_SIZE_DIM3
__kernel
void
concat_channel
(
KERNEL_ERROR_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM3
__read_only
image2d_t
input0,
__read_only
image2d_t
input1,
__private
const
int
input0_chan,
...
...
@@ -79,11 +80,14 @@ __kernel void concat_channel(GLOBAL_WORK_GROUP_SIZE_DIM3
}
#endif
WRITE_IMAGET(output, (int2)(mad24(chan_blk_idx, width, width_idx), hb_idx), data);
const int pos = mad24(chan_blk_idx, width, width_idx);
WRITE_IMAGET(output, (int2)(pos, hb_idx), data);
}
// Required: All input channels are divisible by 4
__kernel void concat_channel_multi(GLOBAL_WORK_GROUP_SIZE_DIM3
__kernel void concat_channel_multi(KERNEL_ERROR_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM3
__read_only image2d_t input,
__private const int chan_blk_offset,
__write_only image2d_t output) {
...
...
@@ -106,7 +110,9 @@ __kernel void concat_channel_multi(GLOBAL_WORK_GROUP_SIZE_DIM3
SAMPLER,
(int2)(mad24(chan_blk_idx, width, width_idx), hb_idx));
WRITE_IMAGET(output, (int2)(mad24(chan_blk_idx + chan_blk_offset, width, width_idx), hb_idx), data);
const int pos = mad24(chan_blk_idx + chan_blk_offset, width, width_idx);
WRITE_IMAGET(output, (int2)(pos, hb_idx), data);
}
//__kernel void concat_width(__read_only image2d_t input0,
...
...
mace/kernels/opencl/cl/conv_2d.cl
浏览文件 @
968fedc9
#
include
<common.h>
__kernel
void
conv_2d
(
GLOBAL_WORK_GROUP_SIZE_DIM3
__kernel
void
conv_2d
(
KERNEL_ERROR_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM3
__read_only
image2d_t
input,
/*
[c%4
*
w
*
c/4,
h
*
b]
*/
__read_only
image2d_t
filter,
/*
cout%4
*
cin,
kh
*
kw
*
cout/4
*/
#
ifdef
BIAS
...
...
@@ -126,6 +127,7 @@ __kernel void conv_2d(GLOBAL_WORK_GROUP_SIZE_DIM3
#
endif
const
int
out_x_base
=
mul24
(
out_ch_blk,
out_width
)
;
int
w
=
out_w_blk
;
WRITE_IMAGET
(
output,
(
int2
)(
out_x_base
+
w,
out_hb
)
,
out0
)
;
...
...
mace/kernels/opencl/cl/conv_2d_1x1.cl
浏览文件 @
968fedc9
#
include
<common.h>
__kernel
void
conv_2d_1x1
(
GLOBAL_WORK_GROUP_SIZE_DIM3
__kernel
void
conv_2d_1x1
(
KERNEL_ERROR_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM3
__read_only
image2d_t
input,
/*
[c%4
*
w
*
c/4,
h
*
b]
*/
__read_only
image2d_t
filter,
/*
cout%4
*
cin,
cout/4
*/
#
ifdef
BIAS
...
...
mace/kernels/opencl/cl/conv_2d_3x3.cl
浏览文件 @
968fedc9
#
include
<common.h>
__kernel
void
conv_2d_3x3
(
GLOBAL_WORK_GROUP_SIZE_DIM3
__kernel
void
conv_2d_3x3
(
KERNEL_ERROR_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM3
__read_only
image2d_t
input,
/*
[c%4
*
w
*
c/4,
h
*
b]
*/
__read_only
image2d_t
filter,
/*
cout%4
*
cin
,
kh
*
kw
*
cout/4
*/
#
ifdef
BIAS
...
...
@@ -162,5 +163,4 @@ __kernel void conv_2d_3x3(GLOBAL_WORK_GROUP_SIZE_DIM3
WRITE_IMAGET
(
output,
(
int2
)(
out_x_base
+
w,
out_hb
)
,
out4
)
;
}
mace/kernels/opencl/cl/cwise.cl
浏览文件 @
968fedc9
#
include
<common.h>
__kernel
void
cwise
(
GLOBAL_WORK_GROUP_SIZE_DIM2
__kernel
void
cwise
(
KERNEL_ERROR_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM2
__read_only
image2d_t
input,
/*
[c%4
*
w
*
c/4,
h
*
b]
*/
__private
const
float
value,
__write_only
image2d_t
output
)
{
...
...
mace/kernels/opencl/cl/depth_to_space.cl
浏览文件 @
968fedc9
#
include
<common.h>
__kernel
void
depth_to_space
(
GLOBAL_WORK_GROUP_SIZE_DIM3
__kernel
void
depth_to_space
(
KERNEL_ERROR_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM3
__read_only
image2d_t
input,
__private
const
int
block_size,
__private
const
int
input_h
eight
,
__private
const
int
input_h
b
,
__private
const
int
input_width,
__private
const
int
input_depth_blocks,
__private
const
int
output_height,
__private
const
int
output_width,
__private
const
int
output_depth_blocks,
__write_only
image2d_t
output
)
{
const
int
out_d
=
get_global_id
(
0
)
;
const
int
out_w
=
get_global_id
(
1
)
;
const
int
out_h
=
get_global_id
(
2
)
;
const
int
out_h
b
=
get_global_id
(
2
)
;
if
(
out_d
>=
output_depth_blocks
|
| out_h >= output_height || out_w >= output_width)
#
ifndef
NON_UNIFORM_WORK_GROUP
if
(
out_d
>=
global_size_dim0
|
| out_w >= global_size_dim1
|| out_hb >= global_size_dim2) {
return;
}
#endif
const int out_pos = mad24(out_d, output_width, out_w);
const int in_h
= out_h
/ block_size;
const int offset_h = out_h % block_size;
const int in_h
b = out_hb
/ block_size;
const int offset_h = out_h
b
% block_size;
const int in_w = out_w / block_size;
const int offset_w = out_w % block_size;
const int offset_d = (offset_h * block_size + offset_w) * output_depth_blocks;
const int in_d = out_d + offset_d;
if (in_h
>= input_height || in_w >= input_width || in_d >= input_depth_blocks)
if (in_h
b >= input_hb || in_w >= input_width || in_d >= input_depth_blocks) {
return;
}
const int in_pos = mad24(in_d, input_width, in_w);
DATA_TYPE4 in_data = READ_IMAGET(input, SAMPLER, (int2)(in_pos, in_h));
WRITE_IMAGET(output, (int2)(out_pos, out_h), in_data);
DATA_TYPE4 in_data = READ_IMAGET(input, SAMPLER, (int2)(in_pos, in_hb));
WRITE_IMAGET(output, (int2)(out_pos, out_hb), in_data);
}
__kernel void space_to_depth(
__kernel void space_to_depth(
KERNEL_ERROR_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM3
__read_only image2d_t input,
__private const int block_size,
__private const int input_height,
__private const int input_width,
__private const int input_depth_blocks,
__private const int output_h
eight
,
__private const int output_h
b
,
__private const int output_width,
__private const int output_depth_blocks,
__write_only image2d_t output) {
const int d = get_global_id(0);
const int w = get_global_id(1);
const int h = get_global_id(2);
const int h
b
= get_global_id(2);
if (h >= input_height || w >= input_width || d >= input_depth_blocks)
#ifndef NON_UNIFORM_WORK_GROUP
if (d >= global_size_dim0 || w >= global_size_dim1
|| hb >= global_size_dim2) {
return;
}
#endif
const int in_pos = mad24(d, input_width, w);
const int out_h
= h
/ block_size;
const int offset_h = h % block_size;
const int out_h
b = hb
/ block_size;
const int offset_h = h
b
% block_size;
const int out_w = w / block_size;
const int offset_w = w % block_size;
const int offset_d = (offset_h * block_size + offset_w) * input_depth_blocks;
const int out_d = d + offset_d;
if (out_d >= output_depth_blocks || out_h
>= output_height |
|
out_w
>=
output_width
)
if (out_d >= output_depth_blocks || out_h
b >= output_hb |
|
out_w
>=
output_width
)
{
return
;
}
const
int
out_pos
=
mad24
(
out_d,
output_width,
out_w
)
;
DATA_TYPE4
in_data
=
READ_IMAGET
(
input,
SAMPLER,
(
int2
)(
in_pos,
h
))
;
WRITE_IMAGET
(
output,
(
int2
)(
out_pos,
out_h
)
,
in_data
)
;
DATA_TYPE4
in_data
=
READ_IMAGET
(
input,
SAMPLER,
(
int2
)(
in_pos,
hb
))
;
WRITE_IMAGET
(
output,
(
int2
)(
out_pos,
out_hb
)
,
in_data
)
;
}
mace/kernels/opencl/cl/depthwise_conv2d.cl
浏览文件 @
968fedc9
#
include
<common.h>
//
Only
multiplier
=
1
is
supported
__kernel
void
depthwise_conv2d
(
GLOBAL_WORK_GROUP_SIZE_DIM3
__kernel
void
depthwise_conv2d
(
KERNEL_ERROR_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM3
__read_only
image2d_t
input,
/*
[c%4
*
w
*
c/4,
h
*
b]
*/
__read_only
image2d_t
filter,
/*
cout%4
*
kh
*
kw
*
m,
cin/4
*/
#
ifdef
BIAS
...
...
@@ -137,7 +138,8 @@ __kernel void depthwise_conv2d(GLOBAL_WORK_GROUP_SIZE_DIM3
WRITE_IMAGET(output, (int2)(out_x_base + w, out_hb), out3);
}
__kernel void depthwise_conv2d_s1(GLOBAL_WORK_GROUP_SIZE_DIM3
__kernel void depthwise_conv2d_s1(KERNEL_ERROR_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM3
__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
__read_only image2d_t filter, /* cout%4 * kh * kw * m, cin/4 */
#ifdef BIAS
...
...
mace/kernels/opencl/cl/eltwise.cl
浏览文件 @
968fedc9
#
include
<common.h>
__kernel
void
eltwise
(
GLOBAL_WORK_GROUP_SIZE_DIM2
__kernel
void
eltwise
(
KERNEL_ERROR_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM2
__read_only
image2d_t
input0,
/*
[c%4
*
w
*
c/4,
h
*
b]
*/
__read_only
image2d_t
input1,
#
ifdef
COEFF_SUM
...
...
mace/kernels/opencl/cl/fully_connected.cl
浏览文件 @
968fedc9
#
include
<common.h>
//
output
=
weight
*
input
+
bias
__kernel
void
fully_connected
(
GLOBAL_WORK_GROUP_SIZE_DIM2
__kernel
void
fully_connected
(
KERNEL_ERROR_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM2
__read_only
image2d_t
input,
__read_only
image2d_t
weight,
#
ifdef
BIAS
...
...
@@ -58,11 +59,13 @@ __kernel void fully_connected(GLOBAL_WORK_GROUP_SIZE_DIM2
#if defined(USE_RELU) || defined(USE_RELUX) || defined(USE_TANH) || defined(USE_SIGMOID)
result = do_activation(result, relux_max_limit);
#endif
WRITE_IMAGET(output, (int2)(out_blk_idx, batch_idx), result);
}
// output = weight * input + bias
__kernel void fully_connected_width(GLOBAL_WORK_GROUP_SIZE_DIM3
__kernel void fully_connected_width(KERNEL_ERROR_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM3
__read_only image2d_t input,
__read_only image2d_t weight,
#ifdef BIAS
...
...
@@ -147,6 +150,7 @@ __kernel void fully_connected_width(GLOBAL_WORK_GROUP_SIZE_DIM3
#if defined(USE_RELU) || defined(USE_RELUX) || defined(USE_TANH) |
|
defined
(
USE_SIGMOID
)
result
=
do_activation
(
result,
relux_max_limit
)
;
#
endif
WRITE_IMAGET
(
output,
(
int2
)(
out_blk_idx,
batch_idx
)
,
result
)
;
}
}
mace/kernels/opencl/cl/matmul.cl
浏览文件 @
968fedc9
#
include
<common.h>
//
C
=
A
*
B
__kernel
void
matmul
(
GLOBAL_WORK_GROUP_SIZE_DIM2
__kernel
void
matmul
(
KERNEL_ERROR_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM2
__read_only
image2d_t
A,
__read_only
image2d_t
B,
__write_only
image2d_t
C,
...
...
@@ -46,11 +47,15 @@ __kernel void matmul(GLOBAL_WORK_GROUP_SIZE_DIM2
c3
+=
(
DATA_TYPE4
)(
dot
(
a0,
b3
)
,
dot
(
a1,
b3
)
,
dot
(
a2,
b3
)
,
dot
(
a3,
b3
))
;
}
WRITE_IMAGET
(
C,
(
int2
)(
gx,
gy
)
,
c0
)
;
if
((
gx
+
1
)
>=
N
)
return
;
WRITE_IMAGET
(
C,
(
int2
)(
gx
+
1
,
gy
)
,
c1
)
;
if
((
gx
+
2
)
>=
N
)
return
;
WRITE_IMAGET
(
C,
(
int2
)(
gx
+
2
,
gy
)
,
c2
)
;
if
((
gx
+
3
)
>=
N
)
return
;
WRITE_IMAGET
(
C,
(
int2
)(
gx
+
3
,
gy
)
,
c3
)
;
}
mace/kernels/opencl/cl/pooling.cl
浏览文件 @
968fedc9
...
...
@@ -19,7 +19,8 @@ inline int calculate_avg_block_size(const int pool_size,
}
//
Supported
data
type:
half/float
__kernel
void
pooling
(
GLOBAL_WORK_GROUP_SIZE_DIM3
__kernel
void
pooling
(
KERNEL_ERROR_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM3
__read_only
image2d_t
input,
__private
const
int
in_height,
__private
const
int
in_width,
...
...
@@ -94,5 +95,6 @@ __kernel void pooling(GLOBAL_WORK_GROUP_SIZE_DIM3
}
#
endif
WRITE_IMAGET
(
output,
(
int2
)(
mad24
(
out_chan_idx,
out_width,
out_width_idx
)
,
out_hb_idx
)
,
res
)
;
const
int
pos
=
mad24
(
out_chan_idx,
out_width,
out_width_idx
)
;
WRITE_IMAGET
(
output,
(
int2
)(
pos,
out_hb_idx
)
,
res
)
;
}
mace/kernels/opencl/cl/resize_bilinear.cl
浏览文件 @
968fedc9
#
include
<common.h>
__kernel
void
resize_bilinear_nocache
(
GLOBAL_WORK_GROUP_SIZE_DIM3
__kernel
void
resize_bilinear_nocache
(
KERNEL_ERROR_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM3
__read_only
image2d_t
input,
/*
[c%4
*
w
*
c/4,
h
*
b]
*/
__write_only
image2d_t
output,
__private
const
float
height_scale,
...
...
@@ -56,6 +57,7 @@ __kernel void resize_bilinear_nocache(GLOBAL_WORK_GROUP_SIZE_DIM3
const
int
out_w_offset
=
mul24
(
ch_blk,
out_width
)
;
const
int
out_h_offset
=
mul24
(
b,
out_height
)
;
WRITE_IMAGET
(
output,
(
int2
)(
out_w_offset
+
w,
out_h_offset
+
h
)
,
out
)
;
}
mace/kernels/opencl/cl/slice.cl
浏览文件 @
968fedc9
#
include
<common.h>
__kernel
void
slice
(
GLOBAL_WORK_GROUP_SIZE_DIM3
__kernel
void
slice
(
KERNEL_ERROR_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM3
__read_only
image2d_t
input,
__private
const
int
chan_blk_offset,
__write_only
image2d_t
output
)
{
...
...
@@ -21,6 +22,7 @@ __kernel void slice(GLOBAL_WORK_GROUP_SIZE_DIM3
DATA_TYPE4
data
=
READ_IMAGET
(
input,
SAMPLER,
(
int2
)(
mad24
(
chan_blk_idx
+
chan_blk_offset,
width,
width_idx
)
,
hb_idx
))
;
WRITE_IMAGET
(
output,
(
int2
)(
mad24
(
chan_blk_idx,
width,
width_idx
)
,
hb_idx
)
,
data
)
;
const
int
pos
=
mad24
(
chan_blk_idx,
width,
width_idx
)
;
WRITE_IMAGET
(
output,
(
int2
)(
pos,
hb_idx
)
,
data
)
;
}
mace/kernels/opencl/cl/softmax.cl
浏览文件 @
968fedc9
#
include
<common.h>
__kernel
void
softmax
(
GLOBAL_WORK_GROUP_SIZE_DIM3
__kernel
void
softmax
(
KERNEL_ERROR_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM3
__read_only
image2d_t
input,
__private
const
int
channels,
__private
const
int
remain_channels,
...
...
mace/kernels/opencl/cl/space_to_batch.cl
浏览文件 @
968fedc9
#
include
<common.h>
__kernel
void
space_to_batch
(
GLOBAL_WORK_GROUP_SIZE_DIM3
__kernel
void
space_to_batch
(
KERNEL_ERROR_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM3
__read_only
image2d_t
space_data,
__write_only
image2d_t
batch_data,
__private
const
int
block_height,
...
...
@@ -44,10 +45,12 @@ __kernel void space_to_batch(GLOBAL_WORK_GROUP_SIZE_DIM3
DATA_TYPE4 value = READ_IMAGET(space_data, SAMPLER, space_coord);
int2 batch_coord = (int2)(mul24(chan_idx, batch_width) + batch_w_idx, batch_hb_idx);
WRITE_IMAGET(batch_data, batch_coord, value);
}
__kernel void batch_to_space(GLOBAL_WORK_GROUP_SIZE_DIM3
__kernel void batch_to_space(KERNEL_ERROR_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM3
__read_only image2d_t batch_data,
__write_only image2d_t space_data,
__private const int block_height,
...
...
@@ -87,6 +90,7 @@ __kernel void batch_to_space(GLOBAL_WORK_GROUP_SIZE_DIM3
int2
space_coord
=
(
int2
)(
mul24
(
chan_idx,
space_width
)
+
space_w_idx,
space_b_idx
*
space_height
+
space_h_idx
)
;
WRITE_IMAGET
(
space_data,
space_coord,
value
)
;
}
}
mace/kernels/opencl/cl/winograd_transform.cl
浏览文件 @
968fedc9
#
include
<common.h>
__kernel
void
winograd_transform_2x2
(
GLOBAL_WORK_GROUP_SIZE_DIM2
__kernel
void
winograd_transform_2x2
(
KERNEL_ERROR_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM2
__read_only
image2d_t
input,
__write_only
image2d_t
output,
__private
const
int
in_height,
...
...
@@ -115,7 +116,8 @@ __kernel void winograd_transform_2x2(GLOBAL_WORK_GROUP_SIZE_DIM2
}
}
__kernel void winograd_inverse_transform_2x2(GLOBAL_WORK_GROUP_SIZE_DIM2
__kernel void winograd_inverse_transform_2x2(KERNEL_ERROR_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM2
__read_only image2d_t input,
#ifdef BIAS
__read_only image2d_t bias, /* cout%4 * cout/4 */
...
...
mace/kernels/opencl/concat.cc
浏览文件 @
968fedc9
...
...
@@ -18,7 +18,8 @@ static void Concat2(cl::Kernel *kernel,
std
::
vector
<
index_t
>
*
prev_input_shape
,
Tensor
*
output
,
StatsFuture
*
future
,
uint32_t
*
kwg_size
)
{
uint32_t
*
kwg_size
,
std
::
unique_ptr
<
BufferBase
>
*
kernel_error
)
{
const
index_t
batch
=
output
->
dim
(
0
);
const
index_t
height
=
output
->
dim
(
1
);
const
index_t
width
=
output
->
dim
(
2
);
...
...
@@ -36,6 +37,14 @@ static void Concat2(cl::Kernel *kernel,
std
::
set
<
std
::
string
>
built_options
;
std
::
string
kernel_name
=
MACE_OBFUSCATE_SYMBOL
(
"concat_channel"
);
built_options
.
emplace
(
"-Dconcat_channel="
+
kernel_name
);
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
built_options
.
emplace
(
"-DOUT_OF_RANGE_CHECK"
);
*
kernel_error
=
std
::
move
(
std
::
unique_ptr
<
Buffer
>
(
new
Buffer
(
GetDeviceAllocator
(
DeviceType
::
OPENCL
),
1
)));
(
*
kernel_error
)
->
Map
(
nullptr
);
*
((
*
kernel_error
)
->
mutable_data
<
char
>
())
=
0
;
(
*
kernel_error
)
->
UnMap
();
}
if
(
runtime
->
IsNonUniformWorkgroupsSupported
())
{
built_options
.
emplace
(
"-DNON_UNIFORM_WORK_GROUP"
);
}
...
...
@@ -56,6 +65,10 @@ static void Concat2(cl::Kernel *kernel,
}
if
(
!
IsVecEqual
(
*
prev_input_shape
,
input0
->
shape
()))
{
uint32_t
idx
=
0
;
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
kernel
->
setArg
(
idx
++
,
*
(
static_cast
<
cl
::
Buffer
*>
((
*
kernel_error
)
->
buffer
())));
}
if
(
!
runtime
->
IsNonUniformWorkgroupsSupported
())
{
kernel
->
setArg
(
idx
++
,
gws
[
0
]);
kernel
->
setArg
(
idx
++
,
gws
[
1
]);
...
...
@@ -77,6 +90,13 @@ static void Concat2(cl::Kernel *kernel,
ss
<<
"concat_opencl_kernel_"
<<
output
->
dim
(
0
)
<<
"_"
<<
output
->
dim
(
1
)
<<
"_"
<<
output
->
dim
(
2
)
<<
"_"
<<
output
->
dim
(
3
);
TuningOrRun3DKernel
(
*
kernel
,
ss
.
str
(),
gws
,
lws
,
future
);
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
(
*
kernel_error
)
->
Map
(
nullptr
);
char
*
kerror_code
=
(
*
kernel_error
)
->
mutable_data
<
char
>
();
MACE_CHECK
(
*
kerror_code
==
0
)
<<
"Kernel error code: "
<<
*
kerror_code
;
(
*
kernel_error
)
->
UnMap
();
}
}
static
void
ConcatN
(
cl
::
Kernel
*
kernel
,
...
...
@@ -84,7 +104,8 @@ static void ConcatN(cl::Kernel *kernel,
const
DataType
dt
,
Tensor
*
output
,
StatsFuture
*
future
,
uint32_t
*
kwg_size
)
{
uint32_t
*
kwg_size
,
std
::
unique_ptr
<
BufferBase
>
*
kernel_error
)
{
const
index_t
batch
=
output
->
dim
(
0
);
const
index_t
height
=
output
->
dim
(
1
);
const
index_t
width
=
output
->
dim
(
2
);
...
...
@@ -98,6 +119,14 @@ static void ConcatN(cl::Kernel *kernel,
built_options
.
emplace
(
"-Dconcat_channel_multi="
+
kernel_name
);
built_options
.
emplace
(
"-DDATA_TYPE="
+
DtToCLDt
(
dt
));
built_options
.
emplace
(
"-DCMD_DATA_TYPE="
+
DtToCLCMDDt
(
dt
));
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
built_options
.
emplace
(
"-DOUT_OF_RANGE_CHECK"
);
*
kernel_error
=
std
::
move
(
std
::
unique_ptr
<
Buffer
>
(
new
Buffer
(
GetDeviceAllocator
(
DeviceType
::
OPENCL
),
1
)));
(
*
kernel_error
)
->
Map
(
nullptr
);
*
((
*
kernel_error
)
->
mutable_data
<
char
>
())
=
0
;
(
*
kernel_error
)
->
UnMap
();
}
if
(
runtime
->
IsNonUniformWorkgroupsSupported
())
{
built_options
.
emplace
(
"-DNON_UNIFORM_WORK_GROUP"
);
}
...
...
@@ -117,6 +146,10 @@ static void ConcatN(cl::Kernel *kernel,
};
uint32_t
idx
=
0
;
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
kernel
->
setArg
(
idx
++
,
*
(
static_cast
<
cl
::
Buffer
*>
((
*
kernel_error
)
->
buffer
())));
}
if
(
!
runtime
->
IsNonUniformWorkgroupsSupported
())
{
kernel
->
setArg
(
idx
++
,
gws
[
0
]);
kernel
->
setArg
(
idx
++
,
gws
[
1
]);
...
...
@@ -132,6 +165,13 @@ static void ConcatN(cl::Kernel *kernel,
ss
<<
"concat_n_opencl_kernel_"
<<
input_channel_blk
<<
"_"
<<
width
<<
"_"
<<
batch
*
height
;
TuningOrRun3DKernel
(
*
kernel
,
ss
.
str
(),
gws
,
lws
,
future
);
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
(
*
kernel_error
)
->
Map
(
nullptr
);
char
*
kerror_code
=
(
*
kernel_error
)
->
mutable_data
<
char
>
();
MACE_CHECK
(
*
kerror_code
==
0
)
<<
"Kernel error code: "
<<
*
kerror_code
;
(
*
kernel_error
)
->
UnMap
();
}
}
}
...
...
@@ -172,12 +212,12 @@ void ConcatFunctor<DeviceType::OPENCL, T>::operator()(
switch
(
inputs_count
)
{
case
2
:
Concat2
(
&
kernel_
,
input_list
[
0
],
input_list
[
1
],
DataTypeToEnum
<
T
>::
value
,
&
input_shape_
,
output
,
future
,
&
kwg_size_
);
&
input_shape_
,
output
,
future
,
&
kwg_size_
,
&
kernel_error_
);
break
;
default:
if
(
divisible_four
)
{
ConcatN
(
&
kernel_
,
input_list
,
DataTypeToEnum
<
T
>::
value
,
output
,
future
,
&
kwg_size_
);
&
kwg_size_
,
&
kernel_error_
);
}
else
{
MACE_NOT_IMPLEMENTED
;
}
...
...
mace/kernels/opencl/conv_2d_opencl.cc
浏览文件 @
968fedc9
...
...
@@ -21,7 +21,8 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
std
::
vector
<
index_t
>
*
prev_input_shape
,
Tensor
*
output
,
StatsFuture
*
future
,
uint32_t
*
kwg_size
);
uint32_t
*
kwg_size
,
std
::
unique_ptr
<
BufferBase
>
*
kernel_error
);
extern
void
Conv2dOpenclK3x3
(
cl
::
Kernel
*
kernel
,
const
Tensor
*
input
,
...
...
@@ -36,7 +37,8 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
std
::
vector
<
index_t
>
*
prev_input_shape
,
Tensor
*
output
,
StatsFuture
*
future
,
uint32_t
*
kwg_size
);
uint32_t
*
kwg_size
,
std
::
unique_ptr
<
BufferBase
>
*
kernel_error
);
extern
void
Conv2dOpencl
(
cl
::
Kernel
*
kernel
,
const
Tensor
*
input
,
...
...
@@ -51,7 +53,8 @@ extern void Conv2dOpencl(cl::Kernel *kernel,
std
::
vector
<
index_t
>
*
prev_input_shape
,
Tensor
*
output
,
StatsFuture
*
future
,
uint32_t
*
kwg_size
);
uint32_t
*
kwg_size
,
std
::
unique_ptr
<
BufferBase
>
*
kernel_error
);
template
<
typename
T
>
void
Conv2dFunctor
<
DeviceType
::
OPENCL
,
T
>::
operator
()(
const
Tensor
*
input
,
...
...
@@ -65,7 +68,7 @@ void Conv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
const
int
*
dilations
,
const
ActivationType
activation
,
const
float
relux_max_limit
,
const
DataType
dt
,
std
::
vector
<
index_t
>
*
input_shape
,
Tensor
*
output
,
StatsFuture
*
future
,
uint32_t
*
kwg_size
);
uint32_t
*
kwg_size
,
std
::
unique_ptr
<
BufferBase
>
*
kernel_error
);
// Selection matrix: kernel_size x stride_size
static
const
Conv2dOpenclFunction
selector
[
5
]
=
{
Conv2dOpenclK1x1
,
nullptr
,
Conv2dOpenclK3x3
,
nullptr
,
nullptr
};
...
...
@@ -106,12 +109,12 @@ void Conv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
conv2d_func
(
&
kernel_
,
input
,
filter
,
bias
,
strides_
[
0
],
paddings
.
data
(),
dilations_
,
activation_
,
relux_max_limit_
,
DataTypeToEnum
<
T
>::
value
,
&
input_shape_
,
output
,
future
,
&
kwg_size_
);
&
kwg_size_
,
&
kernel_error_
);
}
else
{
Conv2dOpencl
(
&
kernel_
,
input
,
filter
,
bias
,
strides_
[
0
],
paddings
.
data
(),
dilations_
,
activation_
,
relux_max_limit_
,
DataTypeToEnum
<
T
>::
value
,
&
input_shape_
,
output
,
future
,
&
kwg_size_
);
&
kwg_size_
,
&
kernel_error_
);
}
}
...
...
mace/kernels/opencl/conv_2d_opencl_1x1.cc
浏览文件 @
968fedc9
...
...
@@ -23,7 +23,8 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
std
::
vector
<
index_t
>
*
prev_input_shape
,
Tensor
*
output
,
StatsFuture
*
future
,
uint32_t
*
kwg_size
)
{
uint32_t
*
kwg_size
,
std
::
unique_ptr
<
BufferBase
>
*
kernel_error
)
{
const
index_t
batch
=
output
->
dim
(
0
);
const
index_t
height
=
output
->
dim
(
1
);
const
index_t
width
=
output
->
dim
(
2
);
...
...
@@ -47,6 +48,14 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
built_options
.
emplace
(
"-Dconv_2d_1x1="
+
kernel_name
);
built_options
.
emplace
(
"-DDATA_TYPE="
+
DtToUpstreamCLDt
(
dt
));
built_options
.
emplace
(
"-DCMD_DATA_TYPE="
+
DtToUpstreamCLCMDDt
(
dt
));
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
built_options
.
emplace
(
"-DOUT_OF_RANGE_CHECK"
);
*
kernel_error
=
std
::
move
(
std
::
unique_ptr
<
Buffer
>
(
new
Buffer
(
GetDeviceAllocator
(
DeviceType
::
OPENCL
),
1
)));
(
*
kernel_error
)
->
Map
(
nullptr
);
*
((
*
kernel_error
)
->
mutable_data
<
char
>
())
=
0
;
(
*
kernel_error
)
->
UnMap
();
}
if
(
runtime
->
IsNonUniformWorkgroupsSupported
())
{
built_options
.
emplace
(
"-DNON_UNIFORM_WORK_GROUP"
);
}
...
...
@@ -84,6 +93,10 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
if
(
!
IsVecEqual
(
*
prev_input_shape
,
input
->
shape
()))
{
uint32_t
idx
=
0
;
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
kernel
->
setArg
(
idx
++
,
*
(
static_cast
<
cl
::
Buffer
*>
((
*
kernel_error
)
->
buffer
())));
}
if
(
!
runtime
->
IsNonUniformWorkgroupsSupported
())
{
kernel
->
setArg
(
idx
++
,
gws
[
0
]);
kernel
->
setArg
(
idx
++
,
gws
[
1
]);
...
...
@@ -112,6 +125,13 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
Concat
(
"conv2d_1x1_opencl_kernel_"
,
activation
,
output
->
dim
(
0
),
output
->
dim
(
1
),
output
->
dim
(
2
),
output
->
dim
(
3
));
TuningOrRun3DKernel
(
*
kernel
,
tuning_key
,
gws
,
lws
,
future
);
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
(
*
kernel_error
)
->
Map
(
nullptr
);
char
*
kerror_code
=
(
*
kernel_error
)
->
mutable_data
<
char
>
();
MACE_CHECK
(
*
kerror_code
==
0
)
<<
"Kernel error code: "
<<
*
kerror_code
;
(
*
kernel_error
)
->
UnMap
();
}
}
}
// namespace kernels
...
...
mace/kernels/opencl/conv_2d_opencl_3x3.cc
浏览文件 @
968fedc9
...
...
@@ -25,7 +25,8 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
std
::
vector
<
index_t
>
*
prev_input_shape
,
Tensor
*
output
,
StatsFuture
*
future
,
uint32_t
*
kwg_size
)
{
uint32_t
*
kwg_size
,
std
::
unique_ptr
<
BufferBase
>
*
kernel_error
)
{
const
index_t
batch
=
output
->
dim
(
0
);
const
index_t
height
=
output
->
dim
(
1
);
const
index_t
width
=
output
->
dim
(
2
);
...
...
@@ -44,6 +45,14 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
built_options
.
emplace
(
"-Dconv_2d_3x3="
+
kernel_name
);
built_options
.
emplace
(
"-DDATA_TYPE="
+
DtToUpstreamCLDt
(
dt
));
built_options
.
emplace
(
"-DCMD_DATA_TYPE="
+
DtToUpstreamCLCMDDt
(
dt
));
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
built_options
.
emplace
(
"-DOUT_OF_RANGE_CHECK"
);
*
kernel_error
=
std
::
move
(
std
::
unique_ptr
<
Buffer
>
(
new
Buffer
(
GetDeviceAllocator
(
DeviceType
::
OPENCL
),
1
)));
(
*
kernel_error
)
->
Map
(
nullptr
);
*
((
*
kernel_error
)
->
mutable_data
<
char
>
())
=
0
;
(
*
kernel_error
)
->
UnMap
();
}
if
(
runtime
->
IsNonUniformWorkgroupsSupported
())
{
built_options
.
emplace
(
"-DNON_UNIFORM_WORK_GROUP"
);
}
...
...
@@ -79,6 +88,10 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
if
(
!
IsVecEqual
(
*
prev_input_shape
,
input
->
shape
()))
{
uint32_t
idx
=
0
;
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
kernel
->
setArg
(
idx
++
,
*
(
static_cast
<
cl
::
Buffer
*>
((
*
kernel_error
)
->
buffer
())));
}
if
(
!
runtime
->
IsNonUniformWorkgroupsSupported
())
{
kernel
->
setArg
(
idx
++
,
gws
[
0
]);
kernel
->
setArg
(
idx
++
,
gws
[
1
]);
...
...
@@ -110,6 +123,13 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
Concat
(
"conv2d_3x3_opencl_kernel_"
,
activation
,
output
->
dim
(
0
),
output
->
dim
(
1
),
output
->
dim
(
2
),
output
->
dim
(
3
));
TuningOrRun3DKernel
(
*
kernel
,
tuning_key
,
gws
,
lws
,
future
);
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
(
*
kernel_error
)
->
Map
(
nullptr
);
char
*
kerror_code
=
(
*
kernel_error
)
->
mutable_data
<
char
>
();
MACE_CHECK
(
*
kerror_code
==
0
)
<<
"Kernel error code: "
<<
*
kerror_code
;
(
*
kernel_error
)
->
UnMap
();
}
}
}
// namespace kernels
...
...
mace/kernels/opencl/conv_2d_opencl_general.cc
浏览文件 @
968fedc9
...
...
@@ -25,7 +25,8 @@ extern void Conv2dOpencl(cl::Kernel *kernel,
std
::
vector
<
index_t
>
*
prev_input_shape
,
Tensor
*
output
,
StatsFuture
*
future
,
uint32_t
*
kwg_size
)
{
uint32_t
*
kwg_size
,
std
::
unique_ptr
<
BufferBase
>
*
kernel_error
)
{
const
index_t
batch
=
output
->
dim
(
0
);
const
index_t
height
=
output
->
dim
(
1
);
const
index_t
width
=
output
->
dim
(
2
);
...
...
@@ -44,6 +45,14 @@ extern void Conv2dOpencl(cl::Kernel *kernel,
built_options
.
emplace
(
"-Dconv_2d="
+
kernel_name
);
built_options
.
emplace
(
"-DDATA_TYPE="
+
DtToUpstreamCLDt
(
dt
));
built_options
.
emplace
(
"-DCMD_DATA_TYPE="
+
DtToUpstreamCLCMDDt
(
dt
));
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
built_options
.
emplace
(
"-DOUT_OF_RANGE_CHECK"
);
*
kernel_error
=
std
::
move
(
std
::
unique_ptr
<
Buffer
>
(
new
Buffer
(
GetDeviceAllocator
(
DeviceType
::
OPENCL
),
1
)));
(
*
kernel_error
)
->
Map
(
nullptr
);
*
((
*
kernel_error
)
->
mutable_data
<
char
>
())
=
0
;
(
*
kernel_error
)
->
UnMap
();
}
if
(
runtime
->
IsNonUniformWorkgroupsSupported
())
{
built_options
.
emplace
(
"-DNON_UNIFORM_WORK_GROUP"
);
}
...
...
@@ -79,6 +88,10 @@ extern void Conv2dOpencl(cl::Kernel *kernel,
if
(
!
IsVecEqual
(
*
prev_input_shape
,
input
->
shape
()))
{
uint32_t
idx
=
0
;
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
kernel
->
setArg
(
idx
++
,
*
(
static_cast
<
cl
::
Buffer
*>
((
*
kernel_error
)
->
buffer
())));
}
if
(
!
runtime
->
IsNonUniformWorkgroupsSupported
())
{
kernel
->
setArg
(
idx
++
,
gws
[
0
]);
kernel
->
setArg
(
idx
++
,
gws
[
1
]);
...
...
@@ -112,6 +125,13 @@ extern void Conv2dOpencl(cl::Kernel *kernel,
Concat
(
"conv2d_general_opencl_kernel_"
,
activation
,
output
->
dim
(
0
),
output
->
dim
(
1
),
output
->
dim
(
2
),
output
->
dim
(
3
));
TuningOrRun3DKernel
(
*
kernel
,
tuning_key
,
gws
,
lws
,
future
);
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
(
*
kernel_error
)
->
Map
(
nullptr
);
char
*
kerror_code
=
(
*
kernel_error
)
->
mutable_data
<
char
>
();
MACE_CHECK
(
*
kerror_code
==
0
)
<<
"Kernel error code: "
<<
*
kerror_code
;
(
*
kernel_error
)
->
UnMap
();
}
}
}
// namespace kernels
...
...
mace/kernels/opencl/cwise_opencl.cc
浏览文件 @
968fedc9
...
...
@@ -34,6 +34,14 @@ void CWiseFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
built_options
.
emplace
(
"-DDATA_TYPE="
+
DtToUpstreamCLDt
(
dt
));
built_options
.
emplace
(
"-DCMD_DATA_TYPE="
+
DtToUpstreamCLCMDDt
(
dt
));
built_options
.
emplace
(
MakeString
(
"-DCWISE_TYPE="
,
type_
));
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
built_options
.
emplace
(
"-DOUT_OF_RANGE_CHECK"
);
kernel_error_
=
std
::
move
(
std
::
unique_ptr
<
Buffer
>
(
new
Buffer
(
GetDeviceAllocator
(
DeviceType
::
OPENCL
),
1
)));
kernel_error_
->
Map
(
nullptr
);
*
(
kernel_error_
->
mutable_data
<
char
>
())
=
0
;
kernel_error_
->
UnMap
();
}
if
(
runtime
->
IsNonUniformWorkgroupsSupported
())
{
built_options
.
emplace
(
"-DNON_UNIFORM_WORK_GROUP"
);
}
...
...
@@ -44,6 +52,10 @@ void CWiseFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
}
if
(
!
IsVecEqual
(
input_shape_
,
input
->
shape
()))
{
uint32_t
idx
=
0
;
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
kernel_
.
setArg
(
idx
++
,
*
(
static_cast
<
cl
::
Buffer
*>
(
kernel_error_
->
buffer
())));
}
if
(
!
runtime
->
IsNonUniformWorkgroupsSupported
())
{
kernel_
.
setArg
(
idx
++
,
gws
[
0
]);
kernel_
.
setArg
(
idx
++
,
gws
[
1
]);
...
...
@@ -59,6 +71,13 @@ void CWiseFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
ss
<<
"cwise_opencl_kernel_"
<<
output
->
dim
(
0
)
<<
"_"
<<
output
->
dim
(
1
)
<<
"_"
<<
output
->
dim
(
2
)
<<
"_"
<<
output
->
dim
(
3
);
TuningOrRun2DKernel
(
kernel_
,
ss
.
str
(),
gws
,
lws
,
future
);
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
kernel_error_
->
Map
(
nullptr
);
char
*
kerror_code
=
kernel_error_
->
mutable_data
<
char
>
();
MACE_CHECK
(
*
kerror_code
==
0
)
<<
"Kernel error code: "
<<
*
kerror_code
;
kernel_error_
->
UnMap
();
}
}
template
struct
CWiseFunctor
<
DeviceType
::
OPENCL
,
float
>;
...
...
mace/kernels/opencl/depth_to_space_opencl.cc
浏览文件 @
968fedc9
...
...
@@ -23,8 +23,7 @@ void DepthToSpaceOpFunctor<DeviceType::OPENCL, T>::operator()(
const
char
*
kernel_name
=
nullptr
;
index_t
output_height
,
output_width
,
output_depth
;
if
(
d2s_
)
{
output_height
=
input_height
*
block_size_
;
if
(
d2s_
)
{
output_height
=
input_height
*
block_size_
;
output_width
=
input_width
*
block_size_
;
output_depth
=
input_depth
/
(
block_size_
*
block_size_
);
kernel_name
=
"depth_to_space"
;
...
...
@@ -55,6 +54,14 @@ void DepthToSpaceOpFunctor<DeviceType::OPENCL, T>::operator()(
auto
dt
=
DataTypeToEnum
<
T
>::
value
;
built_options
.
emplace
(
"-DDATA_TYPE="
+
DtToCLDt
(
dt
));
built_options
.
emplace
(
"-DCMD_DATA_TYPE="
+
DtToCLCMDDt
(
dt
));
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
built_options
.
emplace
(
"-DOUT_OF_RANGE_CHECK"
);
kernel_error_
=
std
::
move
(
std
::
unique_ptr
<
Buffer
>
(
new
Buffer
(
GetDeviceAllocator
(
DeviceType
::
OPENCL
),
1
)));
kernel_error_
->
Map
(
nullptr
);
*
(
kernel_error_
->
mutable_data
<
char
>
())
=
0
;
kernel_error_
->
UnMap
();
}
if
(
runtime
->
IsNonUniformWorkgroupsSupported
())
{
built_options
.
emplace
(
"-DNON_UNIFORM_WORK_GROUP"
);
}
...
...
@@ -84,19 +91,31 @@ void DepthToSpaceOpFunctor<DeviceType::OPENCL, T>::operator()(
}
uint32_t
idx
=
0
;
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
kernel_
.
setArg
(
idx
++
,
*
(
static_cast
<
cl
::
Buffer
*>
(
kernel_error_
->
buffer
())));
}
if
(
!
runtime
->
IsNonUniformWorkgroupsSupported
())
{
kernel_
.
setArg
(
idx
++
,
gws
[
0
]);
kernel_
.
setArg
(
idx
++
,
gws
[
1
]);
kernel_
.
setArg
(
idx
++
,
gws
[
2
]);
}
kernel_
.
setArg
(
idx
++
,
*
(
input
->
opencl_image
()));
if
(
d2s_
)
{
kernel_
.
setArg
(
idx
++
,
static_cast
<
int32_t
>
(
block_size_
));
kernel_
.
setArg
(
idx
++
,
static_cast
<
int32_t
>
(
input_height
));
kernel_
.
setArg
(
idx
++
,
static_cast
<
int32_t
>
(
input_height
*
batch
));
kernel_
.
setArg
(
idx
++
,
static_cast
<
int32_t
>
(
input_width
));
kernel_
.
setArg
(
idx
++
,
static_cast
<
int32_t
>
(
input_depth_blocks
));
kernel_
.
setArg
(
idx
++
,
static_cast
<
int32_t
>
(
output_height
));
kernel_
.
setArg
(
idx
++
,
static_cast
<
int32_t
>
(
output_width
));
kernel_
.
setArg
(
idx
++
,
static_cast
<
int32_t
>
(
output_depth_blocks
));
}
else
{
kernel_
.
setArg
(
idx
++
,
static_cast
<
int32_t
>
(
block_size_
));
kernel_
.
setArg
(
idx
++
,
static_cast
<
int32_t
>
(
input_width
));
kernel_
.
setArg
(
idx
++
,
static_cast
<
int32_t
>
(
input_depth_blocks
));
kernel_
.
setArg
(
idx
++
,
static_cast
<
int32_t
>
(
output_height
*
batch
));
kernel_
.
setArg
(
idx
++
,
static_cast
<
int32_t
>
(
output_width
));
kernel_
.
setArg
(
idx
++
,
static_cast
<
int32_t
>
(
output_depth_blocks
));
}
kernel_
.
setArg
(
idx
++
,
*
(
output
->
opencl_image
()));
input_shape_
=
input
->
shape
();
...
...
@@ -104,6 +123,13 @@ void DepthToSpaceOpFunctor<DeviceType::OPENCL, T>::operator()(
const
std
::
vector
<
uint32_t
>
lws
=
{
8
,
kwg_size_
/
64
,
8
,
1
};
TuningOrRun3DKernel
(
kernel_
,
ss
.
str
(),
gws
,
lws
,
future
);
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
kernel_error_
->
Map
(
nullptr
);
char
*
kerror_code
=
kernel_error_
->
mutable_data
<
char
>
();
MACE_CHECK
(
*
kerror_code
==
0
)
<<
"Kernel error code: "
<<
*
kerror_code
;
kernel_error_
->
UnMap
();
}
}
template
struct
DepthToSpaceOpFunctor
<
DeviceType
::
OPENCL
,
float
>;
...
...
mace/kernels/opencl/depthwise_conv_opencl.cc
浏览文件 @
968fedc9
...
...
@@ -24,7 +24,8 @@ void DepthwiseConv2d(cl::Kernel *kernel,
std
::
vector
<
index_t
>
*
prev_input_shape
,
Tensor
*
output
,
StatsFuture
*
future
,
uint32_t
*
kwg_size
)
{
uint32_t
*
kwg_size
,
std
::
unique_ptr
<
BufferBase
>
*
kernel_error
)
{
const
index_t
batch
=
output
->
dim
(
0
);
const
index_t
height
=
output
->
dim
(
1
);
const
index_t
width
=
output
->
dim
(
2
);
...
...
@@ -52,6 +53,14 @@ void DepthwiseConv2d(cl::Kernel *kernel,
}
else
{
built_options
.
emplace
(
"-Ddepthwise_conv2d="
+
kernel_name
);
}
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
built_options
.
emplace
(
"-DOUT_OF_RANGE_CHECK"
);
*
kernel_error
=
std
::
move
(
std
::
unique_ptr
<
Buffer
>
(
new
Buffer
(
GetDeviceAllocator
(
DeviceType
::
OPENCL
),
1
)));
(
*
kernel_error
)
->
Map
(
nullptr
);
*
((
*
kernel_error
)
->
mutable_data
<
char
>
())
=
0
;
(
*
kernel_error
)
->
UnMap
();
}
if
(
runtime
->
IsNonUniformWorkgroupsSupported
())
{
built_options
.
emplace
(
"-DNON_UNIFORM_WORK_GROUP"
);
}
...
...
@@ -97,6 +106,10 @@ void DepthwiseConv2d(cl::Kernel *kernel,
input_channels
);
uint32_t
idx
=
0
;
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
kernel
->
setArg
(
idx
++
,
*
(
static_cast
<
cl
::
Buffer
*>
((
*
kernel_error
)
->
buffer
())));
}
if
(
!
runtime
->
IsNonUniformWorkgroupsSupported
())
{
kernel
->
setArg
(
idx
++
,
gws
[
0
]);
kernel
->
setArg
(
idx
++
,
gws
[
1
]);
...
...
@@ -130,6 +143,13 @@ void DepthwiseConv2d(cl::Kernel *kernel,
std
::
string
tuning_key
=
Concat
(
"depthwise_conv2d_ocl_kernel_"
,
activation
,
batch
,
height
,
width
,
channels
,
multiplier
);
TuningOrRun3DKernel
(
*
kernel
,
tuning_key
,
gws
,
lws
,
future
);
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
(
*
kernel_error
)
->
Map
(
nullptr
);
char
*
kerror_code
=
(
*
kernel_error
)
->
mutable_data
<
char
>
();
MACE_CHECK
(
*
kerror_code
==
0
)
<<
"Kernel error code: "
<<
*
kerror_code
;
(
*
kernel_error
)
->
UnMap
();
}
}
template
<
typename
T
>
...
...
@@ -182,7 +202,7 @@ void DepthwiseConv2dFunctor<DeviceType::OPENCL, T>::operator()(
DepthwiseConv2d
(
&
kernel_
,
input
,
filter
,
bias
,
strides_
[
0
],
paddings
.
data
(),
dilations_
,
activation_
,
relux_max_limit_
,
DataTypeToEnum
<
T
>::
value
,
&
input_shape_
,
output
,
future
,
&
kwg_size_
);
&
kwg_size_
,
&
kernel_error_
);
}
template
struct
DepthwiseConv2dFunctor
<
DeviceType
::
OPENCL
,
float
>;
...
...
mace/kernels/opencl/eltwise_opencl.cc
浏览文件 @
968fedc9
...
...
@@ -37,6 +37,14 @@ void EltwiseFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input0,
built_options
.
emplace
(
"-DDATA_TYPE="
+
DtToUpstreamCLDt
(
dt
));
built_options
.
emplace
(
"-DCMD_DATA_TYPE="
+
DtToUpstreamCLCMDDt
(
dt
));
built_options
.
emplace
(
MakeString
(
"-DELTWISE_TYPE="
,
type_
));
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
built_options
.
emplace
(
"-DOUT_OF_RANGE_CHECK"
);
kernel_error_
=
std
::
move
(
std
::
unique_ptr
<
Buffer
>
(
new
Buffer
(
GetDeviceAllocator
(
DeviceType
::
OPENCL
),
1
)));
kernel_error_
->
Map
(
nullptr
);
*
(
kernel_error_
->
mutable_data
<
char
>
())
=
0
;
kernel_error_
->
UnMap
();
}
if
(
runtime
->
IsNonUniformWorkgroupsSupported
())
{
built_options
.
emplace
(
"-DNON_UNIFORM_WORK_GROUP"
);
}
...
...
@@ -48,6 +56,10 @@ void EltwiseFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input0,
}
if
(
!
IsVecEqual
(
input_shape_
,
input0
->
shape
()))
{
uint32_t
idx
=
0
;
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
kernel_
.
setArg
(
idx
++
,
*
(
static_cast
<
cl
::
Buffer
*>
(
kernel_error_
->
buffer
())));
}
if
(
!
runtime
->
IsNonUniformWorkgroupsSupported
())
{
kernel_
.
setArg
(
idx
++
,
gws
[
0
]);
kernel_
.
setArg
(
idx
++
,
gws
[
1
]);
...
...
@@ -68,6 +80,12 @@ void EltwiseFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input0,
ss
<<
"eltwise_opencl_kernel_"
<<
output
->
dim
(
0
)
<<
"_"
<<
output
->
dim
(
1
)
<<
"_"
<<
output
->
dim
(
2
)
<<
"_"
<<
output
->
dim
(
3
);
TuningOrRun2DKernel
(
kernel_
,
ss
.
str
(),
gws
,
lws
,
future
);
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
kernel_error_
->
Map
(
nullptr
);
char
*
kerror_code
=
kernel_error_
->
mutable_data
<
char
>
();
MACE_CHECK
(
*
kerror_code
==
0
)
<<
"Kernel error code: "
<<
*
kerror_code
;
kernel_error_
->
UnMap
();
}
}
template
struct
EltwiseFunctor
<
DeviceType
::
OPENCL
,
float
>;
...
...
mace/kernels/opencl/fully_connected_opencl.cc
浏览文件 @
968fedc9
...
...
@@ -19,7 +19,8 @@ void FCWXKernel(cl::Kernel *kernel,
std
::
vector
<
uint32_t
>
*
gws
,
std
::
vector
<
uint32_t
>
*
lws
,
const
float
relux_max_limit
,
StatsFuture
*
future
)
{
StatsFuture
*
future
,
std
::
unique_ptr
<
BufferBase
>
*
kernel_error
)
{
MACE_CHECK
(
input
->
dim
(
3
)
%
4
==
0
)
<<
"FC width kernel only support input with 4x channel."
;
MACE_CHECK_NOTNULL
(
gws
);
...
...
@@ -33,8 +34,7 @@ void FCWXKernel(cl::Kernel *kernel,
std
::
set
<
std
::
string
>
built_options
;
auto
dt
=
DataTypeToEnum
<
T
>::
value
;
std
::
string
kernel_name
=
MACE_OBFUSCATE_SYMBOL
(
"fully_connected"
);
kernel_name
=
MACE_OBFUSCATE_SYMBOL
(
"fully_connected_width"
);
std
::
string
kernel_name
=
MACE_OBFUSCATE_SYMBOL
(
"fully_connected_width"
);
built_options
.
emplace
(
"-Dfully_connected_width="
+
kernel_name
);
built_options
.
emplace
(
"-DDATA_TYPE="
+
DtToUpstreamCLDt
(
dt
));
built_options
.
emplace
(
"-DCMD_DATA_TYPE="
+
DtToUpstreamCLCMDDt
(
dt
));
...
...
@@ -62,6 +62,14 @@ void FCWXKernel(cl::Kernel *kernel,
if
(
runtime
->
gpu_type
()
!=
GPUType
::
QUALCOMM_ADRENO
)
{
built_options
.
emplace
(
"-DNON_QUALCOMM_ADRENO"
);
}
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
built_options
.
emplace
(
"-DOUT_OF_RANGE_CHECK"
);
*
kernel_error
=
std
::
move
(
std
::
unique_ptr
<
Buffer
>
(
new
Buffer
(
GetDeviceAllocator
(
DeviceType
::
OPENCL
),
1
)));
(
*
kernel_error
)
->
Map
(
nullptr
);
*
((
*
kernel_error
)
->
mutable_data
<
char
>
())
=
0
;
(
*
kernel_error
)
->
UnMap
();
}
if
(
runtime
->
IsNonUniformWorkgroupsSupported
())
{
built_options
.
emplace
(
"-DNON_UNIFORM_WORK_GROUP"
);
}
...
...
@@ -95,6 +103,10 @@ void FCWXKernel(cl::Kernel *kernel,
(
*
gws
)[
2
]
=
static_cast
<
uint32_t
>
(
batch
*
output_blocks
);
uint32_t
idx
=
0
;
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
kernel
->
setArg
(
idx
++
,
*
(
static_cast
<
cl
::
Buffer
*>
((
*
kernel_error
)
->
buffer
())));
}
if
(
!
runtime
->
IsNonUniformWorkgroupsSupported
())
{
kernel
->
setArg
(
idx
++
,
(
*
gws
)[
0
]);
kernel
->
setArg
(
idx
++
,
(
*
gws
)[
1
]);
...
...
@@ -132,6 +144,12 @@ void FCWXKernel(cl::Kernel *kernel,
cl
::
NDRange
(
roundup_gws
[
0
],
roundup_gws
[
1
],
roundup_gws
[
2
]),
cl
::
NDRange
((
*
lws
)[
0
],
(
*
lws
)[
1
],
(
*
lws
)[
2
]),
nullptr
,
&
event
);
}
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
(
*
kernel_error
)
->
Map
(
nullptr
);
char
*
kerror_code
=
(
*
kernel_error
)
->
mutable_data
<
char
>
();
MACE_CHECK
(
*
kerror_code
==
0
)
<<
"Kernel error code: "
<<
*
kerror_code
;
(
*
kernel_error
)
->
UnMap
();
}
MACE_CHECK
(
error
==
CL_SUCCESS
)
<<
"Error code: "
<<
error
;
if
(
future
!=
nullptr
)
{
...
...
@@ -155,7 +173,8 @@ void FCWTXKernel(cl::Kernel *kernel,
std
::
vector
<
uint32_t
>
*
gws
,
std
::
vector
<
uint32_t
>
*
lws
,
const
float
relux_max_limit
,
StatsFuture
*
future
)
{
StatsFuture
*
future
,
std
::
unique_ptr
<
BufferBase
>
*
kernel_error
)
{
MACE_CHECK_NOTNULL
(
gws
);
MACE_CHECK_NOTNULL
(
lws
);
auto
runtime
=
OpenCLRuntime
::
Global
();
...
...
@@ -169,6 +188,14 @@ void FCWTXKernel(cl::Kernel *kernel,
if
(
bias
!=
nullptr
)
{
built_options
.
emplace
(
"-DBIAS"
);
}
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
built_options
.
emplace
(
"-DOUT_OF_RANGE_CHECK"
);
*
kernel_error
=
std
::
move
(
std
::
unique_ptr
<
Buffer
>
(
new
Buffer
(
GetDeviceAllocator
(
DeviceType
::
OPENCL
),
1
)));
(
*
kernel_error
)
->
Map
(
nullptr
);
*
((
*
kernel_error
)
->
mutable_data
<
char
>
())
=
0
;
(
*
kernel_error
)
->
UnMap
();
}
if
(
runtime
->
IsNonUniformWorkgroupsSupported
())
{
built_options
.
emplace
(
"-DNON_UNIFORM_WORK_GROUP"
);
}
...
...
@@ -206,6 +233,10 @@ void FCWTXKernel(cl::Kernel *kernel,
};
uint32_t
idx
=
0
;
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
kernel
->
setArg
(
idx
++
,
*
(
static_cast
<
cl
::
Buffer
*>
((
*
kernel_error
)
->
buffer
())));
}
if
(
!
runtime
->
IsNonUniformWorkgroupsSupported
())
{
kernel
->
setArg
(
idx
++
,
(
*
gws
)[
0
]);
kernel
->
setArg
(
idx
++
,
(
*
gws
)[
1
]);
...
...
@@ -229,6 +260,13 @@ void FCWTXKernel(cl::Kernel *kernel,
ss
<<
"fc_opencl_kernel_"
<<
output
->
dim
(
0
)
<<
"_"
<<
output
->
dim
(
1
)
<<
"_"
<<
output
->
dim
(
2
)
<<
"_"
<<
output
->
dim
(
3
);
TuningOrRun2DKernel
(
*
kernel
,
ss
.
str
(),
gws
->
data
(),
*
lws
,
future
);
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
(
*
kernel_error
)
->
Map
(
nullptr
);
char
*
kerror_code
=
(
*
kernel_error
)
->
mutable_data
<
char
>
();
MACE_CHECK
(
*
kerror_code
==
0
)
<<
"Kernel error code: "
<<
*
kerror_code
;
(
*
kernel_error
)
->
UnMap
();
}
}
template
<
typename
T
>
...
...
@@ -246,10 +284,12 @@ void FullyConnectedFunctor<DeviceType::OPENCL, T>::operator()(
if
(
weight_type_
==
BufferType
::
WEIGHT_HEIGHT
)
{
FCWTXKernel
<
T
>
(
&
kernel_
,
input
,
weight
,
bias
,
&
input_shape_
,
output
,
activation_
,
&
gws_
,
&
lws_
,
relux_max_limit_
,
future
);
activation_
,
&
gws_
,
&
lws_
,
relux_max_limit_
,
future
,
&
kernel_error_
);
}
else
{
FCWXKernel
<
T
>
(
&
kernel_
,
input
,
weight
,
bias
,
&
input_shape_
,
output
,
activation_
,
&
gws_
,
&
lws_
,
relux_max_limit_
,
future
);
activation_
,
&
gws_
,
&
lws_
,
relux_max_limit_
,
future
,
&
kernel_error_
);
}
}
...
...
mace/kernels/opencl/matmul.cc
浏览文件 @
968fedc9
...
...
@@ -40,6 +40,14 @@ void MatMulFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *A,
built_options
.
emplace
(
"-Dmatmul="
+
kernel_name
);
built_options
.
emplace
(
"-DDATA_TYPE="
+
DtToUpstreamCLDt
(
dt
));
built_options
.
emplace
(
"-DCMD_DATA_TYPE="
+
DtToUpstreamCLCMDDt
(
dt
));
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
built_options
.
emplace
(
"-DOUT_OF_RANGE_CHECK"
);
kernel_error_
=
std
::
move
(
std
::
unique_ptr
<
Buffer
>
(
new
Buffer
(
GetDeviceAllocator
(
DeviceType
::
OPENCL
),
1
)));
kernel_error_
->
Map
(
nullptr
);
*
(
kernel_error_
->
mutable_data
<
char
>
())
=
0
;
kernel_error_
->
UnMap
();
}
if
(
runtime
->
IsNonUniformWorkgroupsSupported
())
{
built_options
.
emplace
(
"-DNON_UNIFORM_WORK_GROUP"
);
}
...
...
@@ -49,6 +57,10 @@ void MatMulFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *A,
static_cast
<
uint32_t
>
(
runtime
->
GetKernelMaxWorkGroupSize
(
kernel_
));
}
uint32_t
idx
=
0
;
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
kernel_
.
setArg
(
idx
++
,
*
(
static_cast
<
cl
::
Buffer
*>
(
kernel_error_
->
buffer
())));
}
if
(
!
runtime
->
IsNonUniformWorkgroupsSupported
())
{
kernel_
.
setArg
(
idx
++
,
gws
[
0
]);
kernel_
.
setArg
(
idx
++
,
gws
[
1
]);
...
...
@@ -67,6 +79,13 @@ void MatMulFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *A,
ss
<<
"matmul_opencl_kernel_"
<<
C
->
dim
(
0
)
<<
"_"
<<
C
->
dim
(
1
)
<<
"_"
<<
C
->
dim
(
2
)
<<
"_"
<<
C
->
dim
(
3
);
TuningOrRun2DKernel
(
kernel_
,
ss
.
str
(),
gws
,
lws
,
future
);
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
kernel_error_
->
Map
(
nullptr
);
char
*
kerror_code
=
kernel_error_
->
mutable_data
<
char
>
();
MACE_CHECK
(
*
kerror_code
==
0
)
<<
"Kernel error code: "
<<
*
kerror_code
;
kernel_error_
->
UnMap
();
}
}
template
struct
MatMulFunctor
<
DeviceType
::
OPENCL
,
float
>;
...
...
mace/kernels/opencl/out_of_range_check_test.cc
0 → 100644
浏览文件 @
968fedc9
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#include <vector>
#include "gtest/gtest.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/core/tensor.h"
#include "mace/core/workspace.h"
#include "mace/kernels/opencl/helper.h"
namespace
mace
{
namespace
kernels
{
namespace
{
// Copies |buffer| into |image| via the "in_out_buffer_to_image" OpenCL kernel
// using the given |image_shape| as the global work size, and reports whether
// the kernel's OUT_OF_RANGE_CHECK instrumentation flagged an out-of-bounds
// write. Returns true iff the check is enabled and an overflow was detected;
// always returns false when MACE_OUT_OF_RANGE_CHECK is not enabled.
//
// Fixes vs. original: plain `bool` return (top-level const on a value return
// is meaningless and trips clang-tidy readability-const-return-type); dropped
// the redundant std::move on a prvalue unique_ptr; simplified `== 1 ? true :
// false` to the comparison itself.
bool BufferToImageOpImpl(Tensor *buffer,
                         Tensor *image,
                         const std::vector<size_t> &image_shape) {
  // Device-side 1-byte flag the kernel sets to non-zero on an OOB access.
  std::unique_ptr<BufferBase> kernel_error;
  uint32_t gws[2] = {static_cast<uint32_t>(image_shape[0]),
                     static_cast<uint32_t>(image_shape[1])};
  auto runtime = OpenCLRuntime::Global();

  std::string kernel_name = "in_out_buffer_to_image";
  std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name);
  std::set<std::string> built_options;
  std::stringstream kernel_name_ss;
  kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name;
  built_options.emplace(kernel_name_ss.str());
  if (runtime->IsNonUniformWorkgroupsSupported()) {
    built_options.emplace("-DNON_UNIFORM_WORK_GROUP");
  }
  // Same dtype on both sides: no conversion; otherwise use the upstream
  // (storage) CL types so the kernel converts on the fly.
  if (buffer->dtype() == image->dtype()) {
    built_options.emplace("-DDATA_TYPE=" +
                          DtToCLDt(DataTypeToEnum<float>::value));
    built_options.emplace("-DCMD_DATA_TYPE=" +
                          DtToCLCMDDt(DataTypeToEnum<float>::value));
  } else {
    built_options.emplace("-DDATA_TYPE=" +
                          DtToUpstreamCLDt(DataTypeToEnum<float>::value));
    built_options.emplace("-DCMD_DATA_TYPE=" +
                          DtToUpstreamCLCMDDt(DataTypeToEnum<float>::value));
  }
  if (runtime->IsOutOfRangeCheckEnabled()) {
    built_options.emplace("-DOUT_OF_RANGE_CHECK");
    // Allocate and zero the device-side error flag before launching.
    kernel_error = std::unique_ptr<Buffer>(
        new Buffer(GetDeviceAllocator(DeviceType::OPENCL), 1));
    kernel_error->Map(nullptr);
    *(kernel_error->mutable_data<char>()) = 0;
    kernel_error->UnMap();
  }

  auto b2f_kernel = runtime->BuildKernel("buffer_to_image",
                                         obfuscated_kernel_name, built_options);

  uint32_t idx = 0;
  // Argument order must mirror the kernel's OUT_OF_RANGE_CHECK /
  // NON_UNIFORM_WORK_GROUP preprocessor layout.
  if (runtime->IsOutOfRangeCheckEnabled()) {
    b2f_kernel.setArg(idx++,
                      *(static_cast<cl::Buffer *>(kernel_error->buffer())));
  }
  if (!runtime->IsNonUniformWorkgroupsSupported()) {
    b2f_kernel.setArg(idx++, gws[0]);
    b2f_kernel.setArg(idx++, gws[1]);
  }
  b2f_kernel.setArg(idx++, *(buffer->opencl_buffer()));
  MACE_CHECK(buffer->buffer_offset() % GetEnumTypeSize(buffer->dtype()) == 0,
             "buffer offset not aligned");
  b2f_kernel.setArg(idx++,
                    static_cast<uint32_t>(buffer->buffer_offset() /
                                          GetEnumTypeSize(buffer->dtype())));
  b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(1)));
  b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(2)));
  b2f_kernel.setArg(idx++, static_cast<uint32_t>(buffer->dim(3)));
  b2f_kernel.setArg(idx++, *(image->opencl_image()));

  const uint32_t kwg_size =
      static_cast<uint32_t>(runtime->GetKernelMaxWorkGroupSize(b2f_kernel));
  const std::vector<uint32_t> lws = {16, kwg_size / 16};

  cl::Event event;
  cl_int error;
  if (runtime->IsNonUniformWorkgroupsSupported()) {
    error = runtime->command_queue().enqueueNDRangeKernel(
        b2f_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1]),
        cl::NDRange(lws[0], lws[1]), nullptr, &event);
  } else {
    // Uniform work-group devices need the global size rounded up to a
    // multiple of the local size; the kernel guards the tail itself.
    std::vector<uint32_t> roundup_gws(lws.size());
    for (size_t i = 0; i < lws.size(); ++i) {
      roundup_gws[i] = RoundUp(gws[i], lws[i]);
    }
    error = runtime->command_queue().enqueueNDRangeKernel(
        b2f_kernel, cl::NullRange,
        cl::NDRange(roundup_gws[0], roundup_gws[1]),
        cl::NDRange(lws[0], lws[1]), nullptr, &event);
  }
  MACE_CHECK_CL_SUCCESS(error);
  // Block until the kernel finishes so the error flag read below is valid.
  runtime->command_queue().finish();

  bool is_out_of_range = false;
  if (runtime->IsOutOfRangeCheckEnabled()) {
    kernel_error->Map(nullptr);
    is_out_of_range = (*(kernel_error->mutable_data<char>()) == 1);
    kernel_error->UnMap();
  }
  return is_out_of_range;
}
}
// namespace
class
OutOfRangeCheckTest
:
public
::
testing
::
Test
{
protected:
virtual
void
SetUp
()
{
setenv
(
"MACE_OUT_OF_RANGE_CHECK"
,
"1"
,
1
);
}
};
TEST
(
OutOfRangeCheckTest
,
RandomTest
)
{
static
unsigned
int
seed
=
time
(
NULL
);
for
(
int
round
=
0
;
round
<
10
;
++
round
)
{
index_t
batch
=
11
+
rand_r
(
&
seed
)
%
10
;
index_t
height
=
12
+
rand_r
(
&
seed
)
%
100
;
index_t
width
=
13
+
rand_r
(
&
seed
)
%
100
;
index_t
channels
=
14
+
rand_r
(
&
seed
)
%
50
;
std
::
vector
<
index_t
>
buffer_shape
=
{
batch
,
height
,
width
,
channels
};
Workspace
ws
;
Tensor
*
buffer
=
ws
.
CreateTensor
(
"Buffer"
,
GetDeviceAllocator
(
DeviceType
::
OPENCL
),
DataTypeToEnum
<
float
>::
v
());
buffer
->
Resize
(
buffer_shape
);
std
::
vector
<
size_t
>
image_shape
;
Tensor
*
image
=
ws
.
CreateTensor
(
"Image"
,
GetDeviceAllocator
(
DeviceType
::
OPENCL
),
DataTypeToEnum
<
float
>::
v
());
CalImage2DShape
(
buffer
->
shape
(),
IN_OUT_CHANNEL
,
&
image_shape
);
image
->
ResizeImage
(
buffer
->
shape
(),
image_shape
);
ASSERT_FALSE
(
BufferToImageOpImpl
(
buffer
,
image
,
image_shape
));
std
::
vector
<
size_t
>
overflow_image_shape
=
image_shape
;
for
(
int
i
=
0
;
i
<
overflow_image_shape
.
size
();
++
i
)
{
overflow_image_shape
[
i
]
+=
1
;
}
ASSERT_TRUE
(
BufferToImageOpImpl
(
buffer
,
image
,
overflow_image_shape
));
}
}
}
// namespace kernels
}
// namespace mace
mace/kernels/opencl/pooling_opencl.cc
浏览文件 @
968fedc9
...
...
@@ -37,6 +37,14 @@ void PoolingFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
if
(
pooling_type_
==
AVG
)
{
built_options
.
emplace
(
"-DPOOL_AVG"
);
}
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
built_options
.
emplace
(
"-DOUT_OF_RANGE_CHECK"
);
kernel_error_
=
std
::
move
(
std
::
unique_ptr
<
Buffer
>
(
new
Buffer
(
GetDeviceAllocator
(
DeviceType
::
OPENCL
),
1
)));
kernel_error_
->
Map
(
nullptr
);
*
(
kernel_error_
->
mutable_data
<
char
>
())
=
0
;
kernel_error_
->
UnMap
();
}
if
(
runtime
->
IsNonUniformWorkgroupsSupported
())
{
built_options
.
emplace
(
"-DNON_UNIFORM_WORK_GROUP"
);
}
...
...
@@ -82,6 +90,10 @@ void PoolingFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
};
uint32_t
idx
=
0
;
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
kernel_
.
setArg
(
idx
++
,
*
(
static_cast
<
cl
::
Buffer
*>
(
kernel_error_
->
buffer
())));
}
if
(
!
runtime
->
IsNonUniformWorkgroupsSupported
())
{
kernel_
.
setArg
(
idx
++
,
gws
[
0
]);
kernel_
.
setArg
(
idx
++
,
gws
[
1
]);
...
...
@@ -117,6 +129,13 @@ void PoolingFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
ss
<<
"pooling_opencl_kernel_"
<<
output
->
dim
(
0
)
<<
"_"
<<
output
->
dim
(
1
)
<<
"_"
<<
output
->
dim
(
2
)
<<
"_"
<<
output
->
dim
(
3
);
TuningOrRun3DKernel
(
kernel_
,
ss
.
str
(),
gws
.
data
(),
lws
,
future
);
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
kernel_error_
->
Map
(
nullptr
);
char
*
kerror_code
=
kernel_error_
->
mutable_data
<
char
>
();
MACE_CHECK
(
*
kerror_code
==
0
)
<<
"Kernel error code: "
<<
*
kerror_code
;
kernel_error_
->
UnMap
();
}
}
template
struct
PoolingFunctor
<
DeviceType
::
OPENCL
,
float
>;
...
...
mace/kernels/opencl/resize_bilinear_opencl.cc
浏览文件 @
968fedc9
...
...
@@ -37,6 +37,14 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
auto
dt
=
DataTypeToEnum
<
T
>::
value
;
built_options
.
emplace
(
"-DDATA_TYPE="
+
DtToUpstreamCLDt
(
dt
));
built_options
.
emplace
(
"-DCMD_DATA_TYPE="
+
DtToUpstreamCLCMDDt
(
dt
));
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
built_options
.
emplace
(
"-DOUT_OF_RANGE_CHECK"
);
kernel_error_
=
std
::
move
(
std
::
unique_ptr
<
Buffer
>
(
new
Buffer
(
GetDeviceAllocator
(
DeviceType
::
OPENCL
),
1
)));
kernel_error_
->
Map
(
nullptr
);
*
(
kernel_error_
->
mutable_data
<
char
>
())
=
0
;
kernel_error_
->
UnMap
();
}
if
(
runtime
->
IsNonUniformWorkgroupsSupported
())
{
built_options
.
emplace
(
"-DNON_UNIFORM_WORK_GROUP"
);
}
...
...
@@ -61,6 +69,10 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
CalculateResizeScale
(
in_width
,
out_width
,
align_corners_
);
uint32_t
idx
=
0
;
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
kernel_
.
setArg
(
idx
++
,
*
(
static_cast
<
cl
::
Buffer
*>
(
kernel_error_
->
buffer
())));
}
if
(
!
runtime
->
IsNonUniformWorkgroupsSupported
())
{
kernel_
.
setArg
(
idx
++
,
gws
[
0
]);
kernel_
.
setArg
(
idx
++
,
gws
[
1
]);
...
...
@@ -82,6 +94,13 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
ss
<<
"resize_bilinear_opencl_kernel_"
<<
output
->
dim
(
0
)
<<
"_"
<<
output
->
dim
(
1
)
<<
"_"
<<
output
->
dim
(
2
)
<<
"_"
<<
output
->
dim
(
3
);
TuningOrRun3DKernel
(
kernel_
,
ss
.
str
(),
gws
,
lws
,
future
);
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
kernel_error_
->
Map
(
nullptr
);
char
*
kerror_code
=
kernel_error_
->
mutable_data
<
char
>
();
MACE_CHECK
(
*
kerror_code
==
0
)
<<
"Kernel error code: "
<<
*
kerror_code
;
kernel_error_
->
UnMap
();
}
}
template
struct
ResizeBilinearFunctor
<
DeviceType
::
OPENCL
,
float
>;
...
...
mace/kernels/opencl/slice.cc
浏览文件 @
968fedc9
...
...
@@ -38,6 +38,14 @@ void SliceFunctor<DeviceType::OPENCL, T>::operator()(
built_options
.
emplace
(
"-DDATA_TYPE="
+
DtToCLDt
(
DataTypeToEnum
<
T
>::
value
));
built_options
.
emplace
(
"-DCMD_DATA_TYPE="
+
DtToCLCMDDt
(
DataTypeToEnum
<
T
>::
value
));
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
built_options
.
emplace
(
"-DOUT_OF_RANGE_CHECK"
);
kernel_error_
=
std
::
move
(
std
::
unique_ptr
<
Buffer
>
(
new
Buffer
(
GetDeviceAllocator
(
DeviceType
::
OPENCL
),
1
)));
kernel_error_
->
Map
(
nullptr
);
*
(
kernel_error_
->
mutable_data
<
char
>
())
=
0
;
kernel_error_
->
UnMap
();
}
if
(
runtime
->
IsNonUniformWorkgroupsSupported
())
{
built_options
.
emplace
(
"-DNON_UNIFORM_WORK_GROUP"
);
}
...
...
@@ -64,6 +72,10 @@ void SliceFunctor<DeviceType::OPENCL, T>::operator()(
<<
outputs_count
;
for
(
int
i
=
0
;
i
<
outputs_count
;
++
i
)
{
uint32_t
idx
=
0
;
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
kernel_
.
setArg
(
idx
++
,
*
(
static_cast
<
cl
::
Buffer
*>
(
kernel_error_
->
buffer
())));
}
if
(
!
runtime
->
IsNonUniformWorkgroupsSupported
())
{
kernel_
.
setArg
(
idx
++
,
gws
[
0
]);
kernel_
.
setArg
(
idx
++
,
gws
[
1
]);
...
...
@@ -74,6 +86,12 @@ void SliceFunctor<DeviceType::OPENCL, T>::operator()(
kernel_
.
setArg
(
idx
++
,
*
(
output_list
[
i
]
->
opencl_image
()));
TuningOrRun3DKernel
(
kernel_
,
ss
.
str
(),
gws
,
lws
,
future
);
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
kernel_error_
->
Map
(
nullptr
);
char
*
kerror_code
=
kernel_error_
->
mutable_data
<
char
>
();
MACE_CHECK
(
*
kerror_code
==
0
)
<<
"Kernel error code: "
<<
*
kerror_code
;
kernel_error_
->
UnMap
();
}
}
}
...
...
mace/kernels/opencl/softmax_opencl.cc
浏览文件 @
968fedc9
...
...
@@ -36,6 +36,14 @@ void SoftmaxFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *logits,
auto
dt
=
DataTypeToEnum
<
T
>::
value
;
built_options
.
emplace
(
"-DDATA_TYPE="
+
DtToUpstreamCLDt
(
dt
));
built_options
.
emplace
(
"-DCMD_DATA_TYPE="
+
DtToUpstreamCLCMDDt
(
dt
));
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
built_options
.
emplace
(
"-DOUT_OF_RANGE_CHECK"
);
kernel_error_
=
std
::
move
(
std
::
unique_ptr
<
Buffer
>
(
new
Buffer
(
GetDeviceAllocator
(
DeviceType
::
OPENCL
),
1
)));
kernel_error_
->
Map
(
nullptr
);
*
(
kernel_error_
->
mutable_data
<
char
>
())
=
0
;
kernel_error_
->
UnMap
();
}
if
(
runtime
->
IsNonUniformWorkgroupsSupported
())
{
built_options
.
emplace
(
"-DNON_UNIFORM_WORK_GROUP"
);
}
...
...
@@ -46,6 +54,10 @@ void SoftmaxFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *logits,
}
if
(
!
IsVecEqual
(
input_shape_
,
logits
->
shape
()))
{
uint32_t
idx
=
0
;
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
kernel_
.
setArg
(
idx
++
,
*
(
static_cast
<
cl
::
Buffer
*>
(
kernel_error_
->
buffer
())));
}
if
(
!
runtime
->
IsNonUniformWorkgroupsSupported
())
{
kernel_
.
setArg
(
idx
++
,
gws
[
0
]);
kernel_
.
setArg
(
idx
++
,
gws
[
1
]);
...
...
@@ -64,6 +76,13 @@ void SoftmaxFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *logits,
ss
<<
"softmax_opencl_kernel_"
<<
output
->
dim
(
0
)
<<
"_"
<<
output
->
dim
(
1
)
<<
"_"
<<
output
->
dim
(
2
)
<<
"_"
<<
output
->
dim
(
3
);
TuningOrRun3DKernel
(
kernel_
,
ss
.
str
(),
gws
,
lws
,
future
);
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
kernel_error_
->
Map
(
nullptr
);
char
*
kerror_code
=
kernel_error_
->
mutable_data
<
char
>
();
MACE_CHECK
(
*
kerror_code
==
0
)
<<
"Kernel error code: "
<<
*
kerror_code
;
kernel_error_
->
UnMap
();
}
}
template
struct
SoftmaxFunctor
<
DeviceType
::
OPENCL
,
float
>;
...
...
mace/kernels/opencl/space_to_batch_opencl.cc
浏览文件 @
968fedc9
...
...
@@ -47,6 +47,14 @@ void SpaceToBatchFunctor<DeviceType::OPENCL, T>::operator()(
built_options
.
emplace
(
"-DDATA_TYPE="
+
DtToCLDt
(
DataTypeToEnum
<
T
>::
value
));
built_options
.
emplace
(
"-DCMD_DATA_TYPE="
+
DtToCLCMDDt
(
DataTypeToEnum
<
T
>::
value
));
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
built_options
.
emplace
(
"-DOUT_OF_RANGE_CHECK"
);
kernel_error_
=
std
::
move
(
std
::
unique_ptr
<
Buffer
>
(
new
Buffer
(
GetDeviceAllocator
(
DeviceType
::
OPENCL
),
1
)));
kernel_error_
->
Map
(
nullptr
);
*
(
kernel_error_
->
mutable_data
<
char
>
())
=
0
;
kernel_error_
->
UnMap
();
}
if
(
runtime
->
IsNonUniformWorkgroupsSupported
())
{
built_options
.
emplace
(
"-DNON_UNIFORM_WORK_GROUP"
);
}
...
...
@@ -59,6 +67,10 @@ void SpaceToBatchFunctor<DeviceType::OPENCL, T>::operator()(
}
if
(
!
IsVecEqual
(
space_shape_
,
space_tensor
->
shape
()))
{
uint32_t
idx
=
0
;
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
kernel_
.
setArg
(
idx
++
,
*
(
static_cast
<
cl
::
Buffer
*>
(
kernel_error_
->
buffer
())));
}
if
(
!
runtime
->
IsNonUniformWorkgroupsSupported
())
{
kernel_
.
setArg
(
idx
++
,
gws
[
0
]);
kernel_
.
setArg
(
idx
++
,
gws
[
1
]);
...
...
@@ -89,6 +101,13 @@ void SpaceToBatchFunctor<DeviceType::OPENCL, T>::operator()(
<<
batch_tensor
->
dim
(
1
)
<<
"_"
<<
batch_tensor
->
dim
(
2
)
<<
"_"
<<
batch_tensor
->
dim
(
3
);
TuningOrRun3DKernel
(
kernel_
,
ss
.
str
(),
gws
,
lws
,
future
);
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
kernel_error_
->
Map
(
nullptr
);
char
*
kerror_code
=
kernel_error_
->
mutable_data
<
char
>
();
MACE_CHECK
(
*
kerror_code
==
0
)
<<
"Kernel error code: "
<<
*
kerror_code
;
kernel_error_
->
UnMap
();
}
}
template
struct
SpaceToBatchFunctor
<
DeviceType
::
OPENCL
,
float
>;
...
...
mace/kernels/opencl/winograd_transform.cc
浏览文件 @
968fedc9
...
...
@@ -26,6 +26,14 @@ void WinogradTransformFunctor<DeviceType::OPENCL, T>::operator()(
DtToUpstreamCLDt
(
DataTypeToEnum
<
T
>::
value
));
built_options
.
emplace
(
"-DCMD_DATA_TYPE="
+
DtToUpstreamCLCMDDt
(
DataTypeToEnum
<
T
>::
value
));
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
built_options
.
emplace
(
"-DOUT_OF_RANGE_CHECK"
);
kernel_error_
=
std
::
move
(
std
::
unique_ptr
<
Buffer
>
(
new
Buffer
(
GetDeviceAllocator
(
DeviceType
::
OPENCL
),
1
)));
kernel_error_
->
Map
(
nullptr
);
*
(
kernel_error_
->
mutable_data
<
char
>
())
=
0
;
kernel_error_
->
UnMap
();
}
if
(
runtime
->
IsNonUniformWorkgroupsSupported
())
{
built_options
.
emplace
(
"-DNON_UNIFORM_WORK_GROUP"
);
}
...
...
@@ -62,6 +70,10 @@ void WinogradTransformFunctor<DeviceType::OPENCL, T>::operator()(
output_tensor
->
ResizeImage
(
output_shape
,
image_shape
);
uint32_t
idx
=
0
;
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
kernel_
.
setArg
(
idx
++
,
*
(
static_cast
<
cl
::
Buffer
*>
(
kernel_error_
->
buffer
())));
}
if
(
!
runtime
->
IsNonUniformWorkgroupsSupported
())
{
kernel_
.
setArg
(
idx
++
,
gws
[
0
]);
kernel_
.
setArg
(
idx
++
,
gws
[
1
]);
...
...
@@ -85,6 +97,13 @@ void WinogradTransformFunctor<DeviceType::OPENCL, T>::operator()(
<<
input_tensor
->
dim
(
1
)
<<
"_"
<<
input_tensor
->
dim
(
2
)
<<
"_"
<<
input_tensor
->
dim
(
3
);
TuningOrRun2DKernel
(
kernel_
,
ss
.
str
(),
gws
,
lws
,
future
);
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
kernel_error_
->
Map
(
nullptr
);
char
*
kerror_code
=
kernel_error_
->
mutable_data
<
char
>
();
MACE_CHECK
(
*
kerror_code
==
0
)
<<
"Kernel error code: "
<<
*
kerror_code
;
kernel_error_
->
UnMap
();
}
}
template
<
typename
T
>
...
...
@@ -106,6 +125,14 @@ void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()(
DtToUpstreamCLDt
(
DataTypeToEnum
<
T
>::
value
));
built_options
.
emplace
(
"-DCMD_DATA_TYPE="
+
DtToUpstreamCLCMDDt
(
DataTypeToEnum
<
T
>::
value
));
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
built_options
.
emplace
(
"-DOUT_OF_RANGE_CHECK"
);
kernel_error_
=
std
::
move
(
std
::
unique_ptr
<
Buffer
>
(
new
Buffer
(
GetDeviceAllocator
(
DeviceType
::
OPENCL
),
1
)));
kernel_error_
->
Map
(
nullptr
);
*
(
kernel_error_
->
mutable_data
<
char
>
())
=
0
;
kernel_error_
->
UnMap
();
}
if
(
runtime
->
IsNonUniformWorkgroupsSupported
())
{
built_options
.
emplace
(
"-DNON_UNIFORM_WORK_GROUP"
);
}
...
...
@@ -152,6 +179,10 @@ void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()(
const
uint32_t
round_h
=
(
height_
+
1
)
/
2
;
const
uint32_t
round_w
=
(
width_
+
1
)
/
2
;
uint32_t
idx
=
0
;
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
kernel_
.
setArg
(
idx
++
,
*
(
static_cast
<
cl
::
Buffer
*>
(
kernel_error_
->
buffer
())));
}
if
(
!
runtime
->
IsNonUniformWorkgroupsSupported
())
{
kernel_
.
setArg
(
idx
++
,
gws
[
0
]);
kernel_
.
setArg
(
idx
++
,
gws
[
1
]);
...
...
@@ -181,6 +212,13 @@ void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()(
<<
input_tensor
->
dim
(
1
)
<<
"_"
<<
input_tensor
->
dim
(
2
)
<<
"_"
<<
input_tensor
->
dim
(
3
);
TuningOrRun2DKernel
(
kernel_
,
ss
.
str
(),
gws
,
lws
,
future
);
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
kernel_error_
->
Map
(
nullptr
);
char
*
kerror_code
=
kernel_error_
->
mutable_data
<
char
>
();
MACE_CHECK
(
*
kerror_code
==
0
)
<<
"Kernel error code: "
<<
*
kerror_code
;
kernel_error_
->
UnMap
();
}
}
template
struct
WinogradTransformFunctor
<
DeviceType
::
OPENCL
,
float
>;
...
...
mace/kernels/pooling.h
浏览文件 @
968fedc9
...
...
@@ -7,6 +7,7 @@
#include <algorithm>
#include <limits>
#include <memory>
#include <vector>
#include "mace/core/future.h"
...
...
@@ -198,6 +199,7 @@ struct PoolingFunctor<DeviceType::OPENCL, T> : PoolingFunctorBase {
cl
::
Kernel
kernel_
;
uint32_t
kwg_size_
;
std
::
unique_ptr
<
BufferBase
>
kernel_error_
;
std
::
vector
<
index_t
>
input_shape_
;
};
...
...
mace/kernels/resize_bilinear.h
浏览文件 @
968fedc9
...
...
@@ -5,6 +5,7 @@
#define MACE_KERNELS_RESIZE_BILINEAR_H_
#include <algorithm>
#include <memory>
#include <vector>
#include "mace/core/future.h"
...
...
@@ -174,6 +175,7 @@ struct ResizeBilinearFunctor<DeviceType::OPENCL, T>
cl
::
Kernel
kernel_
;
uint32_t
kwg_size_
;
std
::
unique_ptr
<
BufferBase
>
kernel_error_
;
std
::
vector
<
index_t
>
input_shape_
;
};
...
...
mace/kernels/slice.h
浏览文件 @
968fedc9
...
...
@@ -5,6 +5,7 @@
#ifndef MACE_KERNELS_SLICE_H_
#define MACE_KERNELS_SLICE_H_
#include <memory>
#include <functional>
#include <vector>
...
...
@@ -79,6 +80,7 @@ struct SliceFunctor<DeviceType::OPENCL, T> : SliceFunctorBase {
StatsFuture
*
future
);
cl
::
Kernel
kernel_
;
uint32_t
kwg_size_
;
std
::
unique_ptr
<
BufferBase
>
kernel_error_
;
};
}
// namespace kernels
...
...
mace/kernels/softmax.h
浏览文件 @
968fedc9
...
...
@@ -7,6 +7,7 @@
#include <algorithm>
#include <functional>
#include <memory>
#include <vector>
#include "mace/core/future.h"
...
...
@@ -67,6 +68,7 @@ struct SoftmaxFunctor<DeviceType::OPENCL, T> {
cl
::
Kernel
kernel_
;
uint32_t
kwg_size_
;
std
::
unique_ptr
<
BufferBase
>
kernel_error_
;
std
::
vector
<
index_t
>
input_shape_
;
};
...
...
mace/kernels/space_to_batch.h
浏览文件 @
968fedc9
...
...
@@ -5,6 +5,7 @@
#ifndef MACE_KERNELS_SPACE_TO_BATCH_H_
#define MACE_KERNELS_SPACE_TO_BATCH_H_
#include <memory>
#include <vector>
#include "mace/core/future.h"
...
...
@@ -57,6 +58,7 @@ struct SpaceToBatchFunctor<DeviceType::OPENCL, T> : SpaceToBatchFunctorBase {
cl
::
Kernel
kernel_
;
uint32_t
kwg_size_
;
std
::
unique_ptr
<
BufferBase
>
kernel_error_
;
std
::
vector
<
index_t
>
space_shape_
;
};
...
...
mace/kernels/winograd_transform.h
浏览文件 @
968fedc9
...
...
@@ -5,6 +5,7 @@
#ifndef MACE_KERNELS_WINOGRAD_TRANSFORM_H_
#define MACE_KERNELS_WINOGRAD_TRANSFORM_H_
#include <memory>
#include <vector>
#include "mace/core/future.h"
...
...
@@ -52,6 +53,7 @@ struct WinogradTransformFunctor<DeviceType::OPENCL, T>
cl
::
Kernel
kernel_
;
uint32_t
kwg_size_
;
std
::
unique_ptr
<
BufferBase
>
kernel_error_
;
std
::
vector
<
index_t
>
input_shape_
;
};
...
...
@@ -110,6 +112,7 @@ struct WinogradInverseTransformFunctor<DeviceType::OPENCL, T>
cl
::
Kernel
kernel_
;
uint32_t
kwg_size_
;
std
::
unique_ptr
<
BufferBase
>
kernel_error_
;
std
::
vector
<
index_t
>
input_shape_
;
};
...
...
tools/bazel_adb_run.py
浏览文件 @
968fedc9
...
...
@@ -108,7 +108,8 @@ def main(unused_args):
args
=
FLAGS
.
args
,
opencl_profiling
=
1
,
vlog_level
=
0
,
device_bin_path
=
"/data/local/tmp/mace"
)
device_bin_path
=
"/data/local/tmp/mace"
,
out_of_range_check
=
1
)
device_properties
=
sh_commands
.
adb_getprop_by_serialno
(
serialno
)
globals
()[
FLAGS
.
stdout_processor
](
stdouts
,
device_properties
,
target_abi
)
...
...
tools/sh_commands.py
浏览文件 @
968fedc9
...
...
@@ -66,7 +66,8 @@ def adb_run(serialno, host_bin_path, bin_name,
args
=
""
,
opencl_profiling
=
1
,
vlog_level
=
0
,
device_bin_path
=
"/data/local/tmp/mace"
):
device_bin_path
=
"/data/local/tmp/mace"
,
out_of_range_check
=
1
):
host_bin_full_path
=
"%s/%s"
%
(
host_bin_path
,
bin_name
)
device_bin_full_path
=
"%s/%s"
%
(
device_bin_path
,
bin_name
)
props
=
adb_getprop_by_serialno
(
serialno
)
...
...
@@ -81,8 +82,8 @@ def adb_run(serialno, host_bin_path, bin_name,
stdout_buff
=
[]
process_output
=
make_output_processor
(
stdout_buff
)
p
=
sh
.
adb
(
"-s"
,
serialno
,
"shell"
,
"MACE_OPENCL_PROFILING=%d MACE_CPP_MIN_VLOG_LEVEL=%d %s %s"
%
(
opencl_profiling
,
vlog_level
,
device_bin_full_path
,
args
),
"MACE_O
UT_OF_RANGE_CHECK=%d MACE_O
PENCL_PROFILING=%d MACE_CPP_MIN_VLOG_LEVEL=%d %s %s"
%
(
o
ut_of_range_check
,
o
pencl_profiling
,
vlog_level
,
device_bin_full_path
,
args
),
_out
=
process_output
,
_bg
=
True
,
_err_to_out
=
True
)
p
.
wait
()
return
""
.
join
(
stdout_buff
)
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录