Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
Xiaomi
Mace
提交
cf5cae14
Mace
项目概览
Xiaomi
/
Mace
通知
106
Star
40
Fork
27
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
Mace
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
提交
cf5cae14
编写于
5月 02, 2018
作者:
L
liuqi
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Refactor opencl default local work group size.
上级
708c90ed
变更
27
隐藏空白更改
内联
并排
Showing
27 changed file
with
420 addition
and
186 deletion
+420
-186
mace/core/runtime/opencl/opencl_runtime.cc
mace/core/runtime/opencl/opencl_runtime.cc
+13
-0
mace/core/runtime/opencl/opencl_runtime.h
mace/core/runtime/opencl/opencl_runtime.h
+5
-0
mace/kernels/opencl/REAEMD.md
mace/kernels/opencl/REAEMD.md
+0
-58
mace/kernels/opencl/activation.cc
mace/kernels/opencl/activation.cc
+6
-7
mace/kernels/opencl/addn.cc
mace/kernels/opencl/addn.cc
+4
-4
mace/kernels/opencl/batch_norm.cc
mace/kernels/opencl/batch_norm.cc
+5
-2
mace/kernels/opencl/bias_add.cc
mace/kernels/opencl/bias_add.cc
+1
-1
mace/kernels/opencl/channel_shuffle.cc
mace/kernels/opencl/channel_shuffle.cc
+5
-8
mace/kernels/opencl/concat.cc
mace/kernels/opencl/concat.cc
+24
-6
mace/kernels/opencl/conv_2d.cc
mace/kernels/opencl/conv_2d.cc
+0
-0
mace/kernels/opencl/conv_2d_1x1.cc
mace/kernels/opencl/conv_2d_1x1.cc
+35
-2
mace/kernels/opencl/conv_2d_3x3.cc
mace/kernels/opencl/conv_2d_3x3.cc
+31
-2
mace/kernels/opencl/conv_2d_general.cc
mace/kernels/opencl/conv_2d_general.cc
+41
-3
mace/kernels/opencl/depth_to_space.cc
mace/kernels/opencl/depth_to_space.cc
+7
-7
mace/kernels/opencl/depthwise_conv.cc
mace/kernels/opencl/depthwise_conv.cc
+34
-3
mace/kernels/opencl/eltwise.cc
mace/kernels/opencl/eltwise.cc
+5
-5
mace/kernels/opencl/fully_connected.cc
mace/kernels/opencl/fully_connected.cc
+4
-4
mace/kernels/opencl/helper.cc
mace/kernels/opencl/helper.cc
+86
-36
mace/kernels/opencl/helper.h
mace/kernels/opencl/helper.h
+7
-0
mace/kernels/opencl/matmul.cc
mace/kernels/opencl/matmul.cc
+4
-4
mace/kernels/opencl/pad.cc
mace/kernels/opencl/pad.cc
+1
-1
mace/kernels/opencl/pooling.cc
mace/kernels/opencl/pooling.cc
+27
-5
mace/kernels/opencl/resize_bilinear.cc
mace/kernels/opencl/resize_bilinear.cc
+33
-5
mace/kernels/opencl/slice.cc
mace/kernels/opencl/slice.cc
+1
-1
mace/kernels/opencl/softmax.cc
mace/kernels/opencl/softmax.cc
+26
-5
mace/kernels/opencl/space_to_batch.cc
mace/kernels/opencl/space_to_batch.cc
+5
-6
mace/kernels/opencl/winograd_transform.cc
mace/kernels/opencl/winograd_transform.cc
+10
-11
未找到文件。
mace/core/runtime/opencl/opencl_runtime.cc
浏览文件 @
cf5cae14
...
@@ -362,6 +362,11 @@ OpenCLRuntime::OpenCLRuntime():
...
@@ -362,6 +362,11 @@ OpenCLRuntime::OpenCLRuntime():
}
}
}
}
device_
->
getInfo
(
CL_DEVICE_GLOBAL_MEM_CACHE_SIZE
,
&
device_gloabl_mem_cache_size_
);
device_
->
getInfo
(
CL_DEVICE_MAX_COMPUTE_UNITS
,
&
device_compute_units_
);
const
char
*
out_of_range_check
=
getenv
(
"MACE_OUT_OF_RANGE_CHECK"
);
const
char
*
out_of_range_check
=
getenv
(
"MACE_OUT_OF_RANGE_CHECK"
);
if
(
out_of_range_check
!=
nullptr
&&
strlen
(
out_of_range_check
)
==
1
if
(
out_of_range_check
!=
nullptr
&&
strlen
(
out_of_range_check
)
==
1
&&
out_of_range_check
[
0
]
==
'1'
)
{
&&
out_of_range_check
[
0
]
==
'1'
)
{
...
@@ -386,6 +391,14 @@ cl::Device &OpenCLRuntime::device() { return *device_; }
...
@@ -386,6 +391,14 @@ cl::Device &OpenCLRuntime::device() { return *device_; }
cl
::
CommandQueue
&
OpenCLRuntime
::
command_queue
()
{
return
*
command_queue_
;
}
cl
::
CommandQueue
&
OpenCLRuntime
::
command_queue
()
{
return
*
command_queue_
;
}
// Returns the value the constructor queried from the device with
// CL_DEVICE_GLOBAL_MEM_CACHE_SIZE. The backing member is spelled "gloabl"
// in its declaration, so the identifier below has to keep that spelling.
const
uint64_t
OpenCLRuntime
::
device_global_mem_cache_size
()
const
{
return
device_gloabl_mem_cache_size_
;
}
// Returns the value the constructor queried from the device with
// CL_DEVICE_MAX_COMPUTE_UNITS.
const
uint32_t
OpenCLRuntime
::
device_compute_units
()
const
{
return
device_compute_units_
;
}
bool
OpenCLRuntime
::
BuildProgramFromBinary
(
bool
OpenCLRuntime
::
BuildProgramFromBinary
(
const
std
::
string
&
built_program_key
,
const
std
::
string
&
built_program_key
,
const
std
::
string
&
build_options_str
,
const
std
::
string
&
build_options_str
,
...
...
mace/core/runtime/opencl/opencl_runtime.h
浏览文件 @
cf5cae14
...
@@ -73,6 +73,8 @@ class OpenCLRuntime {
...
@@ -73,6 +73,8 @@ class OpenCLRuntime {
cl
::
CommandQueue
&
command_queue
();
cl
::
CommandQueue
&
command_queue
();
const
GPUType
gpu_type
()
const
;
const
GPUType
gpu_type
()
const
;
const
std
::
string
platform_info
()
const
;
const
std
::
string
platform_info
()
const
;
const
uint64_t
device_global_mem_cache_size
()
const
;
const
uint32_t
device_compute_units
()
const
;
cl
::
Kernel
BuildKernel
(
const
std
::
string
&
program_name
,
cl
::
Kernel
BuildKernel
(
const
std
::
string
&
program_name
,
const
std
::
string
&
kernel_name
,
const
std
::
string
&
kernel_name
,
...
@@ -128,6 +130,9 @@ class OpenCLRuntime {
...
@@ -128,6 +130,9 @@ class OpenCLRuntime {
bool
program_map_changed_
;
bool
program_map_changed_
;
std
::
unique_ptr
<
KVStorage
>
storage_
;
std
::
unique_ptr
<
KVStorage
>
storage_
;
bool
is_profiling_enabled_
;
bool
is_profiling_enabled_
;
uint64_t
device_gloabl_mem_cache_size_
;
uint32_t
device_compute_units_
;
static
GPUPerfHint
kGPUPerfHint
;
static
GPUPerfHint
kGPUPerfHint
;
static
GPUPriorityHint
kGPUPriorityHint
;
static
GPUPriorityHint
kGPUPriorityHint
;
...
...
mace/kernels/opencl/REAEMD.md
已删除
100644 → 0
浏览文件 @
708c90ed
OpenCL Image Storage Layout
===
Use
**Image**
object to optimize memory access and parallel computing based on OpenCL 2.0.
Design the corresponding
**Image**
format to optimize memory access for different Op algorithms.
Each pixel of
**Image**
object contains 4 elements(e.g. RGBA).
The following are the
**Buffer**
and
**Image**
format for all
**Tensors**
.
Input/Output
---
**Mace**
uses the NHWC format for Input/Output.
| Tensor| Buffer| Image Size [Width, Height]| Explanation|
| --------- | :---------:|:--------:|:----:|
|Channel-Major Input/Output | NHWC | [W
* (C+3)/4, N *
H] | Default Input/Output format|
|Height-Major Input/Output | NHWC | [W
* C, N *
(H+3)/4] | Winograd Convolution format|
|Width-Major Input/Output | NHWC | [(W+3)/4
* C, N *
H] | Winograd Convolution format|
Each Pixel of
**Image**
contains 4 elements. The table below lists the coordinate relation
between
**Image**
and
**Buffer**
.
| Tensor| Pixel Coordinate Relation| Explanation
| --------- | :---------:| :-----: |
|Channel-Major Input/Output | P[i, j] = {E[n, h, w, c]
|
(n=j/H, h=j%H, w=i%W, c=[i/W
*
4 + k])}| k=[0, 4)|
|Height-Major Input/Output | P[i, j] = {E[n, h, w, c]
|
(n=j%N, h=[j/H
*
4 + k], w=i%W, c=i/W)}| k=[0, 4)|
|Width-Major Input/Output | P[i, j] = {E[n, h, w, c]
|
(n=j/H, h=j%H, w=[i%W
*
4 + k], c=i/W)}| k=[0, 4)|
Filter
---
| Tensor| Buffer| Image Size [Width, Height]| Explanation|
| --------- | :---------:|:--------:|:----:|
|Convolution Filter | HWOI | [H
* W *
RoundUp
<
4
>
(I), (O+3)/4]|Convolution filter format. There is no difference compared to [H
* W *
I, (O+3)/4]|
|Depthwise Convolution Filter | HWIM | [H
* W *
M, (I+3)/4]|Depthwise-Convolution filter format|
Each Pixel of
**Image**
contains 4 elements. The table below lists the coordinate relation
between
**Image**
and
**Buffer**
.
| Tensor| Pixel Coordinate Relation| Explanation|
| --------- | :---------:| :-----:|
|Convolution Filter | P[m, n] = {E[h, w, o, i]
|
(h=T/W, w=T%W, o=[n
*4+k], i=m%RI)}| RI=((I + 3) / 4) *
4, T=m/RI, k=[0, 4)|
|Depthwise Convolution Filter | P[m, n] = {E[h, w, i, 0]
|
(h=m/W, w=m%W, i=[n
*
4+k])}| only support multiplier == 1, k=[0, 4)|
1-D Argument
---
| Tensor| Buffer| Image Size [Width, Height]| Explanation|
| --------- | :---------:|:--------:|:----:|
|1-D Argument | W | [(W+3)/4, 1] | 1D argument format, e.g. Bias|
Each Pixel of
**Image**
contains 4 elements. The table below lists the coordinate relation
between
**Image**
and
**Buffer**
.
| Tensor| Pixel Coordinate Relation| Explanation|
| --------- | :---------:| :-----:|
|1-D Argument | P[i, 0] = {E[w]
|
w=i
*
4+k}| k=[0, 4)|
mace/kernels/opencl/activation
_opencl
.cc
→
mace/kernels/opencl/activation.cc
浏览文件 @
cf5cae14
...
@@ -21,7 +21,6 @@
...
@@ -21,7 +21,6 @@
namespace
mace
{
namespace
mace
{
namespace
kernels
{
namespace
kernels
{
template
<
typename
T
>
template
<
typename
T
>
void
ActivationFunctor
<
DeviceType
::
GPU
,
T
>::
operator
()(
const
Tensor
*
input
,
void
ActivationFunctor
<
DeviceType
::
GPU
,
T
>::
operator
()(
const
Tensor
*
input
,
const
Tensor
*
alpha
,
const
Tensor
*
alpha
,
...
@@ -56,23 +55,23 @@ void ActivationFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
...
@@ -56,23 +55,23 @@ void ActivationFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
}
}
switch
(
activation_
)
{
switch
(
activation_
)
{
case
RELU
:
case
RELU
:
tuning_key_prefix_
=
"relu_opencl_kernel
_
"
;
tuning_key_prefix_
=
"relu_opencl_kernel"
;
built_options
.
emplace
(
"-DUSE_RELU"
);
built_options
.
emplace
(
"-DUSE_RELU"
);
break
;
break
;
case
RELUX
:
case
RELUX
:
tuning_key_prefix_
=
"relux_opencl_kernel
_
"
;
tuning_key_prefix_
=
"relux_opencl_kernel"
;
built_options
.
emplace
(
"-DUSE_RELUX"
);
built_options
.
emplace
(
"-DUSE_RELUX"
);
break
;
break
;
case
PRELU
:
case
PRELU
:
tuning_key_prefix_
=
"prelu_opencl_kernel
_
"
;
tuning_key_prefix_
=
"prelu_opencl_kernel"
;
built_options
.
emplace
(
"-DUSE_PRELU"
);
built_options
.
emplace
(
"-DUSE_PRELU"
);
break
;
break
;
case
TANH
:
case
TANH
:
tuning_key_prefix_
=
"tanh_opencl_kernel
_
"
;
tuning_key_prefix_
=
"tanh_opencl_kernel"
;
built_options
.
emplace
(
"-DUSE_TANH"
);
built_options
.
emplace
(
"-DUSE_TANH"
);
break
;
break
;
case
SIGMOID
:
case
SIGMOID
:
tuning_key_prefix_
=
"sigmoid_opencl_kernel
_
"
;
tuning_key_prefix_
=
"sigmoid_opencl_kernel"
;
built_options
.
emplace
(
"-DUSE_SIGMOID"
);
built_options
.
emplace
(
"-DUSE_SIGMOID"
);
break
;
break
;
default:
default:
...
@@ -110,7 +109,7 @@ void ActivationFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
...
@@ -110,7 +109,7 @@ void ActivationFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
input_shape_
=
input
->
shape
();
input_shape_
=
input
->
shape
();
}
}
const
std
::
vector
<
uint32_t
>
lws
=
{
8
,
kwg_size_
/
64
,
8
,
0
}
;
const
std
::
vector
<
uint32_t
>
lws
=
Default3DLocalWS
(
gws
,
kwg_size_
)
;
std
::
string
tuning_key
=
std
::
string
tuning_key
=
Concat
(
tuning_key_prefix_
,
output
->
dim
(
0
),
output
->
dim
(
1
),
output
->
dim
(
2
),
Concat
(
tuning_key_prefix_
,
output
->
dim
(
0
),
output
->
dim
(
1
),
output
->
dim
(
2
),
output
->
dim
(
3
));
output
->
dim
(
3
));
...
...
mace/kernels/opencl/addn.cc
浏览文件 @
cf5cae14
...
@@ -106,10 +106,10 @@ void AddNFunctor<DeviceType::GPU, T>::operator()(
...
@@ -106,10 +106,10 @@ void AddNFunctor<DeviceType::GPU, T>::operator()(
}
}
const
std
::
vector
<
uint32_t
>
lws
=
{
kwg_size_
/
16
,
16
,
0
};
const
std
::
vector
<
uint32_t
>
lws
=
{
kwg_size_
/
16
,
16
,
0
};
std
::
string
stream
ss
;
std
::
string
tuning_key
=
ss
<<
"addn_opencl_kernel_"
<<
output_shape
[
0
]
<<
"_"
<<
output_shape
[
1
]
Concat
(
"addn_opencl_kernel"
,
output_tensor
->
dim
(
0
),
output_tensor
->
dim
(
1
),
<<
"_"
<<
output_shape
[
2
]
<<
"_"
<<
output_shape
[
3
]
;
output_tensor
->
dim
(
2
),
output_tensor
->
dim
(
3
))
;
TuningOrRun2DKernel
(
kernel_
,
ss
.
str
()
,
gws
,
lws
,
future
);
TuningOrRun2DKernel
(
kernel_
,
tuning_key
,
gws
,
lws
,
future
);
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
kernel_error_
->
Map
(
nullptr
);
kernel_error_
->
Map
(
nullptr
);
...
...
mace/kernels/opencl/batch_norm
_opencl
.cc
→
mace/kernels/opencl/batch_norm.cc
浏览文件 @
cf5cae14
...
@@ -116,9 +116,12 @@ void BatchNormFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
...
@@ -116,9 +116,12 @@ void BatchNormFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
input_shape_
=
input
->
shape
();
input_shape_
=
input
->
shape
();
}
}
const
std
::
vector
<
uint32_t
>
lws
=
{
8
,
kwg_size_
/
64
,
8
,
0
};
std
::
vector
<
uint32_t
>
lws
(
4
,
0
);
lws
[
1
]
=
std
::
min
<
uint32_t
>
(
gws
[
1
],
kwg_size_
);
lws
[
0
]
=
std
::
min
<
uint32_t
>
(
4
,
kwg_size_
/
lws
[
1
]);
lws
[
2
]
=
std
::
min
<
uint32_t
>
(
gws
[
2
],
kwg_size_
/
(
lws
[
1
]
*
lws
[
0
]));
std
::
string
tuning_key
=
std
::
string
tuning_key
=
Concat
(
"batch_norm_opencl_kernel
_
"
,
activation_
,
output
->
dim
(
0
),
Concat
(
"batch_norm_opencl_kernel"
,
activation_
,
output
->
dim
(
0
),
output
->
dim
(
1
),
output
->
dim
(
2
),
output
->
dim
(
3
),
folded_constant_
);
output
->
dim
(
1
),
output
->
dim
(
2
),
output
->
dim
(
3
),
folded_constant_
);
TuningOrRun3DKernel
(
kernel_
,
tuning_key
,
gws
,
lws
,
future
);
TuningOrRun3DKernel
(
kernel_
,
tuning_key
,
gws
,
lws
,
future
);
...
...
mace/kernels/opencl/bias_add
_opencl
.cc
→
mace/kernels/opencl/bias_add.cc
浏览文件 @
cf5cae14
...
@@ -79,7 +79,7 @@ void BiasAddFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
...
@@ -79,7 +79,7 @@ void BiasAddFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
input_shape_
=
input
->
shape
();
input_shape_
=
input
->
shape
();
}
}
const
std
::
vector
<
uint32_t
>
lws
=
{
8
,
kwg_size_
/
64
,
8
}
;
const
std
::
vector
<
uint32_t
>
lws
=
Default3DLocalWS
(
gws
,
kwg_size_
)
;
cl
::
Event
event
;
cl
::
Event
event
;
cl_int
error
;
cl_int
error
;
...
...
mace/kernels/opencl/channel_shuffle.cc
浏览文件 @
cf5cae14
...
@@ -90,14 +90,11 @@ void ChannelShuffleFunctor<DeviceType::GPU, T>::operator()(
...
@@ -90,14 +90,11 @@ void ChannelShuffleFunctor<DeviceType::GPU, T>::operator()(
input_shape_
=
input
->
shape
();
input_shape_
=
input
->
shape
();
}
}
const
std
::
vector
<
uint32_t
>
lws
=
{
8
,
kwg_size_
/
64
,
8
,
0
};
const
std
::
vector
<
uint32_t
>
lws
=
Default3DLocalWS
(
gws
,
kwg_size_
);
std
::
stringstream
ss
;
std
::
string
tuning_key
=
ss
<<
"channel_shuffle_opencl_kernel_"
Concat
(
"channel_shuffle_opencl_kernel"
,
output
->
dim
(
0
),
<<
output
->
dim
(
0
)
<<
"_"
output
->
dim
(
1
),
output
->
dim
(
2
),
output
->
dim
(
3
));
<<
output
->
dim
(
1
)
<<
"_"
TuningOrRun3DKernel
(
kernel_
,
tuning_key
,
gws
,
lws
,
future
);
<<
output
->
dim
(
2
)
<<
"_"
<<
output
->
dim
(
3
);
TuningOrRun3DKernel
(
kernel_
,
ss
.
str
(),
gws
,
lws
,
future
);
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
kernel_error_
->
Map
(
nullptr
);
kernel_error_
->
Map
(
nullptr
);
...
...
mace/kernels/opencl/concat.cc
浏览文件 @
cf5cae14
...
@@ -21,6 +21,23 @@
...
@@ -21,6 +21,23 @@
namespace
mace
{
namespace
mace
{
namespace
kernels
{
namespace
kernels
{
namespace {

// Chooses the default local work-group size for the concat kernels.
//
// gws      - 3-D global work size; only gws[1] is read.
// kwg_size - work-group budget for the kernel (presumably the kernel's
//            maximum work-group size -- confirm against the callers).
//
// Returns a 4-element vector {lws0, lws1, lws2, 0}. The fourth entry is left
// at its initial value of 0. For non-degenerate inputs the result is the same
// as before; the clamps below only take effect when a divisor would have
// been zero.
std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {
  std::vector<uint32_t> lws(4, 0);
  if (kwg_size == 0) {
    // Without a budget every quotient below is zero; use a 1x1x1 group
    // instead of dividing by zero.
    lws[0] = lws[1] = lws[2] = 1;
    return lws;
  }

  const uint64_t cache_size =
      OpenCLRuntime::Global()->device_global_mem_cache_size();
  // A reported cache smaller than kBaseGPUMemCacheSize (including 0) used to
  // yield base == 0, which zeroed lws[0] and then lws_size.
  const uint32_t base =
      std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);

  // Keep lws[1] non-zero so it is a safe divisor even when gws[1] is 0.
  lws[1] = std::max<uint32_t>(std::min<uint32_t>(gws[1], kwg_size), 1);
  lws[0] = std::min<uint32_t>(base, kwg_size / lws[1]);
  // lws[0] * lws[1] is within [1, kwg_size] here, so the quotient is >= 1.
  const uint32_t lws_size = lws[0] * lws[1];
  lws[2] = std::min<uint32_t>(base, kwg_size / lws_size);
  return lws;
}

}  // namespace
static
void
Concat2
(
cl
::
Kernel
*
kernel
,
static
void
Concat2
(
cl
::
Kernel
*
kernel
,
const
Tensor
*
input0
,
const
Tensor
*
input0
,
const
Tensor
*
input1
,
const
Tensor
*
input1
,
...
@@ -95,11 +112,11 @@ static void Concat2(cl::Kernel *kernel,
...
@@ -95,11 +112,11 @@ static void Concat2(cl::Kernel *kernel,
*
prev_input_shape
=
input0
->
shape
();
*
prev_input_shape
=
input0
->
shape
();
}
}
const
std
::
vector
<
uint32_t
>
lws
=
{
8
,
*
kwg_size
/
64
,
8
,
0
}
;
const
std
::
vector
<
uint32_t
>
lws
=
LocalWS
(
gws
,
*
kwg_size
)
;
std
::
string
stream
ss
;
std
::
string
tuning_key
=
ss
<<
"concat_opencl_kernel_"
<<
output
->
dim
(
0
)
<<
"_"
<<
output
->
dim
(
1
)
Concat
(
"concat_opencl_kernel"
,
output
->
dim
(
0
),
<<
"_"
<<
output
->
dim
(
2
)
<<
"_"
<<
output
->
dim
(
3
);
output
->
dim
(
1
),
output
->
dim
(
2
),
output
->
dim
(
3
)
);
TuningOrRun3DKernel
(
*
kernel
,
ss
.
str
()
,
gws
,
lws
,
future
);
TuningOrRun3DKernel
(
*
kernel
,
tuning_key
,
gws
,
lws
,
future
);
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
(
*
kernel_error
)
->
Map
(
nullptr
);
(
*
kernel_error
)
->
Map
(
nullptr
);
...
@@ -149,7 +166,6 @@ static void ConcatN(cl::Kernel *kernel,
...
@@ -149,7 +166,6 @@ static void ConcatN(cl::Kernel *kernel,
index_t
chan_blk_offset
=
0
;
index_t
chan_blk_offset
=
0
;
cl
::
Event
event
;
cl
::
Event
event
;
CallStats
call_stats
{
INT64_MAX
,
0
};
CallStats
call_stats
{
INT64_MAX
,
0
};
const
std
::
vector
<
uint32_t
>
lws
=
{
8
,
*
kwg_size
/
64
,
8
,
1
};
for
(
int
i
=
0
;
i
<
inputs_count
;
++
i
)
{
for
(
int
i
=
0
;
i
<
inputs_count
;
++
i
)
{
const
Tensor
*
input
=
input_list
[
i
];
const
Tensor
*
input
=
input_list
[
i
];
index_t
input_channel_blk
=
input
->
dim
(
3
)
/
4
;
index_t
input_channel_blk
=
input
->
dim
(
3
)
/
4
;
...
@@ -157,6 +173,7 @@ static void ConcatN(cl::Kernel *kernel,
...
@@ -157,6 +173,7 @@ static void ConcatN(cl::Kernel *kernel,
static_cast
<
uint32_t
>
(
input_channel_blk
),
static_cast
<
uint32_t
>
(
width
),
static_cast
<
uint32_t
>
(
input_channel_blk
),
static_cast
<
uint32_t
>
(
width
),
static_cast
<
uint32_t
>
(
batch
*
height
),
static_cast
<
uint32_t
>
(
batch
*
height
),
};
};
const
std
::
vector
<
uint32_t
>
lws
=
LocalWS
(
gws
,
*
kwg_size
);
uint32_t
idx
=
0
;
uint32_t
idx
=
0
;
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
...
@@ -183,6 +200,7 @@ static void ConcatN(cl::Kernel *kernel,
...
@@ -183,6 +200,7 @@ static void ConcatN(cl::Kernel *kernel,
for
(
size_t
j
=
0
;
j
<
3
;
++
j
)
{
for
(
size_t
j
=
0
;
j
<
3
;
++
j
)
{
roundup_gws
[
j
]
=
RoundUp
(
gws
[
j
],
lws
[
j
]);
roundup_gws
[
j
]
=
RoundUp
(
gws
[
j
],
lws
[
j
]);
}
}
const
std
::
vector
<
uint32_t
>
lws
=
LocalWS
(
gws
,
*
kwg_size
);
error
=
runtime
->
command_queue
().
enqueueNDRangeKernel
(
error
=
runtime
->
command_queue
().
enqueueNDRangeKernel
(
*
kernel
,
cl
::
NullRange
,
*
kernel
,
cl
::
NullRange
,
...
...
mace/kernels/opencl/conv_2d
_opencl
.cc
→
mace/kernels/opencl/conv_2d.cc
浏览文件 @
cf5cae14
文件已移动
mace/kernels/opencl/conv_2d_
opencl_
1x1.cc
→
mace/kernels/opencl/conv_2d_1x1.cc
浏览文件 @
cf5cae14
...
@@ -20,6 +20,39 @@
...
@@ -20,6 +20,39 @@
namespace
mace
{
namespace
mace
{
namespace
kernels
{
namespace
kernels
{
namespace {
// (inputs + weights + outputs) * array_size * sizeof(float)
const uint32_t kernel_cache_size = (4 + 4 + 4) * 4 * 4;

// Chooses the default local work-group size for the 1x1 convolution kernel.
//
// gws      - 3-D global work size (gws[0], gws[1] and gws[2] are read).
// kwg_size - work-group budget for the kernel (presumably the kernel's
//            maximum work-group size -- confirm against the callers).
//
// Returns a 4-element vector {lws0, lws1, lws2, 0}. The fourth entry is left
// at its initial value of 0. For non-degenerate inputs the result is the same
// as before; the clamps below only take effect when a divisor or a local
// size would have been zero.
std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {
  std::vector<uint32_t> lws(4, 0);
  if (kwg_size == 0) {
    // Without a budget every quotient below is zero; use a 1x1x1 group
    // instead of dividing by zero.
    lws[0] = lws[1] = lws[2] = 1;
    return lws;
  }

  const uint64_t cache_size =
      OpenCLRuntime::Global()->device_global_mem_cache_size();
  // Both values are used as divisors or lower bounds, so neither may be 0
  // (a device may report 0 or a cache smaller than kBaseGPUMemCacheSize).
  const uint32_t compute_units =
      std::max<uint32_t>(OpenCLRuntime::Global()->device_compute_units(), 1);
  const uint32_t base =
      std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);

  lws[1] = std::max<uint32_t>(std::min<uint32_t>(gws[1], kwg_size), 1);
  if (lws[1] >= base) {
    lws[0] = std::min<uint32_t>(gws[0], base);
  } else {
    lws[0] = gws[0] / 8;
    if (lws[0] < base) {
      lws[0] = std::max<uint32_t>(gws[0] / 4, base);
    }
  }
  // Cap by the remaining budget, but never drop to 0 (gws[0] may be 0).
  lws[0] =
      std::max<uint32_t>(std::min<uint32_t>(lws[0], kwg_size / lws[1]), 1);

  // lws[0] * lws[1] is within [1, kwg_size] here.
  const uint32_t lws_size = lws[0] * lws[1];
  lws[2] = std::min<uint32_t>(
      (cache_size / kernel_cache_size / lws_size / compute_units) * 8,
      gws[2]);
  if (lws[2] == 0) {
    lws[2] = std::min<uint32_t>(gws[2], base);
  }
  lws[2] =
      std::max<uint32_t>(std::min<uint32_t>(lws[2], kwg_size / lws_size), 1);
  return lws;
}

}  // namespace
extern
void
Conv2dOpenclK1x1
(
cl
::
Kernel
*
kernel
,
extern
void
Conv2dOpenclK1x1
(
cl
::
Kernel
*
kernel
,
const
Tensor
*
input
,
const
Tensor
*
input
,
const
Tensor
*
filter
,
const
Tensor
*
filter
,
...
@@ -130,9 +163,9 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
...
@@ -130,9 +163,9 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
*
prev_input_shape
=
input
->
shape
();
*
prev_input_shape
=
input
->
shape
();
}
}
const
std
::
vector
<
uint32_t
>
lws
=
{
8
,
*
kwg_size
/
64
,
8
,
0
}
;
std
::
vector
<
uint32_t
>
lws
=
LocalWS
(
gws
,
*
kwg_size
)
;
std
::
string
tuning_key
=
std
::
string
tuning_key
=
Concat
(
"conv2d_1x1_opencl_kernel
_"
,
activation
,
output
->
dim
(
0
),
Concat
(
"conv2d_1x1_opencl_kernel
"
,
output
->
dim
(
0
),
output
->
dim
(
1
),
output
->
dim
(
2
),
output
->
dim
(
3
));
output
->
dim
(
1
),
output
->
dim
(
2
),
output
->
dim
(
3
));
TuningOrRun3DKernel
(
*
kernel
,
tuning_key
,
gws
,
lws
,
future
);
TuningOrRun3DKernel
(
*
kernel
,
tuning_key
,
gws
,
lws
,
future
);
...
...
mace/kernels/opencl/conv_2d_
opencl_
3x3.cc
→
mace/kernels/opencl/conv_2d_3x3.cc
浏览文件 @
cf5cae14
...
@@ -22,6 +22,35 @@
...
@@ -22,6 +22,35 @@
namespace
mace
{
namespace
mace
{
namespace
kernels
{
namespace
kernels
{
namespace {
// (inputs + weights + outputs) * array_size * sizeof(float)
const uint32_t kernel_cache_size = (5 + 4 + 5) * 4 * 4;

// Chooses the default local work-group size for the 3x3 convolution kernel.
//
// gws      - 3-D global work size (gws[0], gws[1] and gws[2] are read).
// kwg_size - work-group budget for the kernel (presumably the kernel's
//            maximum work-group size -- confirm against the callers).
//
// Returns a 4-element vector {lws0, lws1, lws2, 0}. The fourth entry is left
// at its initial value of 0. For non-degenerate inputs the result is the same
// as before; the clamps below only take effect when a divisor or a local
// size would have been zero.
std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {
  std::vector<uint32_t> lws(4, 0);
  if (kwg_size == 0) {
    // Without a budget every quotient below is zero; use a 1x1x1 group
    // instead of dividing by zero.
    lws[0] = lws[1] = lws[2] = 1;
    return lws;
  }

  const uint64_t cache_size =
      OpenCLRuntime::Global()->device_global_mem_cache_size();
  // Half of the reported compute units, but at least one.
  const uint32_t compute_units = std::max<uint32_t>(
      OpenCLRuntime::Global()->device_compute_units() / 2, 1);
  // base is kept within [1, 4]. The lower bound is new: a cache smaller than
  // kBaseGPUMemCacheSize used to give base == 0, which zeroed lws[0] and
  // lws_size and was also handed to RoundUp as its second argument.
  const uint32_t base = std::max<uint32_t>(
      std::min<uint32_t>(cache_size / kBaseGPUMemCacheSize, 4), 1);

  lws[1] = std::max<uint32_t>(std::min<uint32_t>(gws[1], kwg_size), 1);
  // Cap by the remaining budget, but never drop to 0 (gws[0] may be 0).
  lws[0] = std::max<uint32_t>(
      std::min<uint32_t>(std::min<uint32_t>(gws[0], base),
                         kwg_size / lws[1]),
      1);

  // lws[0] * lws[1] is within [1, kwg_size] here.
  const uint32_t lws_size = lws[0] * lws[1];
  lws[2] = std::min<uint32_t>(
      RoundUp<uint32_t>(
          cache_size / kernel_cache_size / lws_size / compute_units, base),
      gws[2]);
  if (lws[2] == 0) {
    lws[2] = std::min<uint32_t>(gws[2], base);
  }
  lws[2] =
      std::max<uint32_t>(std::min<uint32_t>(lws[2], kwg_size / lws_size), 1);
  return lws;
}

}  // namespace
extern
void
Conv2dOpenclK3x3
(
cl
::
Kernel
*
kernel
,
extern
void
Conv2dOpenclK3x3
(
cl
::
Kernel
*
kernel
,
const
Tensor
*
input
,
const
Tensor
*
input
,
const
Tensor
*
filter
,
const
Tensor
*
filter
,
...
@@ -128,9 +157,9 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
...
@@ -128,9 +157,9 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
*
prev_input_shape
=
input
->
shape
();
*
prev_input_shape
=
input
->
shape
();
}
}
const
std
::
vector
<
uint32_t
>
lws
=
{
4
,
*
kwg_size
/
32
,
8
,
0
}
;
const
std
::
vector
<
uint32_t
>
lws
=
LocalWS
(
gws
,
*
kwg_size
)
;
std
::
string
tuning_key
=
std
::
string
tuning_key
=
Concat
(
"conv2d_3x3_opencl_kernel
_"
,
activation
,
output
->
dim
(
0
),
Concat
(
"conv2d_3x3_opencl_kernel
"
,
output
->
dim
(
0
),
output
->
dim
(
1
),
output
->
dim
(
2
),
output
->
dim
(
3
));
output
->
dim
(
1
),
output
->
dim
(
2
),
output
->
dim
(
3
));
TuningOrRun3DKernel
(
*
kernel
,
tuning_key
,
gws
,
lws
,
future
);
TuningOrRun3DKernel
(
*
kernel
,
tuning_key
,
gws
,
lws
,
future
);
...
...
mace/kernels/opencl/conv_2d_
opencl_
general.cc
→
mace/kernels/opencl/conv_2d_general.cc
浏览文件 @
cf5cae14
...
@@ -21,6 +21,42 @@
...
@@ -21,6 +21,42 @@
namespace
mace
{
namespace
mace
{
namespace
kernels
{
namespace
kernels
{
namespace {
// (inputs + weights + outputs) * array_size * sizeof(float)
const uint32_t kernel_cache_size = (4 + 4 + 4) * 4 * 4;
// TODO(liuqi): Fix the specific value.
const uint32_t lws_limit = 20;

// Chooses the default local work-group size for the general convolution
// kernel.
//
// gws         - 3-D global work size (gws[0], gws[1] and gws[2] are read).
// kernel_size - the caller passes filter->dim(0) * filter->dim(1).
// kwg_size    - work-group budget for the kernel (presumably the kernel's
//               maximum work-group size -- confirm against the callers).
//
// Returns a 4-element vector {lws0, lws1, lws2, 0}. The fourth entry is left
// at its initial value of 0. For non-degenerate inputs the result is the same
// as before; the clamps below only take effect when a divisor or a local
// size would have been zero.
std::vector<uint32_t> LocalWS(const uint32_t *gws,
                              const uint32_t kernel_size,
                              const uint32_t kwg_size) {
  std::vector<uint32_t> lws(4, 0);
  if (kwg_size == 0) {
    // Without a budget every quotient below is zero; use a 1x1x1 group
    // instead of dividing by zero.
    lws[0] = lws[1] = lws[2] = 1;
    return lws;
  }

  const uint64_t cache_size =
      OpenCLRuntime::Global()->device_global_mem_cache_size();
  // These three are divisors or fallback sizes and therefore must not be 0.
  const uint32_t compute_units =
      std::max<uint32_t>(OpenCLRuntime::Global()->device_compute_units(), 1);
  const uint32_t base =
      std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
  const uint32_t filter_area = std::max<uint32_t>(kernel_size, 1);

  lws[1] = std::max<uint32_t>(std::min<uint32_t>(gws[1], kwg_size), 1);
  lws[0] = gws[0] / 4;
  if (lws[0] == 0) {
    lws[0] = gws[0];
  }
  // Cap by the remaining budget, but never drop to 0 (gws[0] may be 0).
  lws[0] =
      std::max<uint32_t>(std::min<uint32_t>(lws[0], kwg_size / lws[1]), 1);

  // lws[0] * lws[1] is within [1, kwg_size] here.
  const uint32_t lws_size = lws[0] * lws[1];
  lws[2] = std::min<uint32_t>(
      (cache_size / kernel_cache_size / filter_area / lws_size /
       compute_units) * 8,
      gws[2]);
  if (lws[2] == 0) {
    // Small third dimension: take it whole; otherwise fall back to base.
    lws[2] = (gws[2] < lws_limit) ? gws[2] : base;
  }
  lws[2] =
      std::max<uint32_t>(std::min<uint32_t>(lws[2], kwg_size / lws_size), 1);
  return lws;
}

}  // namespace
extern
void
Conv2dOpencl
(
cl
::
Kernel
*
kernel
,
extern
void
Conv2dOpencl
(
cl
::
Kernel
*
kernel
,
const
Tensor
*
input
,
const
Tensor
*
input
,
...
@@ -130,10 +166,12 @@ extern void Conv2dOpencl(cl::Kernel *kernel,
...
@@ -130,10 +166,12 @@ extern void Conv2dOpencl(cl::Kernel *kernel,
*
prev_input_shape
=
input
->
shape
();
*
prev_input_shape
=
input
->
shape
();
}
}
const
std
::
vector
<
uint32_t
>
lws
=
{
8
,
*
kwg_size
/
64
,
8
,
0
};
std
::
string
tuning_key
=
std
::
string
tuning_key
=
Concat
(
"conv2d_general_opencl_kernel_"
,
activation
,
output
->
dim
(
0
),
Concat
(
"conv2d_general_opencl_kernel"
,
output
->
dim
(
0
),
output
->
dim
(
1
),
output
->
dim
(
2
),
output
->
dim
(
3
));
output
->
dim
(
1
),
output
->
dim
(
2
),
output
->
dim
(
3
),
filter
->
dim
(
0
),
filter
->
dim
(
1
));
std
::
vector
<
uint32_t
>
lws
=
LocalWS
(
gws
,
filter
->
dim
(
0
)
*
filter
->
dim
(
1
),
*
kwg_size
);
TuningOrRun3DKernel
(
*
kernel
,
tuning_key
,
gws
,
lws
,
future
);
TuningOrRun3DKernel
(
*
kernel
,
tuning_key
,
gws
,
lws
,
future
);
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
...
...
mace/kernels/opencl/depth_to_space
_opencl
.cc
→
mace/kernels/opencl/depth_to_space.cc
浏览文件 @
cf5cae14
...
@@ -33,7 +33,7 @@ void DepthToSpaceOpFunctor<DeviceType::GPU, T>::operator()(
...
@@ -33,7 +33,7 @@ void DepthToSpaceOpFunctor<DeviceType::GPU, T>::operator()(
const
char
*
kernel_name
=
nullptr
;
const
char
*
kernel_name
=
nullptr
;
uint32_t
gws
[
3
];
uint32_t
gws
[
3
];
std
::
string
stream
ss
;
std
::
string
tuning_key
;
index_t
output_height
,
output_width
,
output_depth
;
index_t
output_height
,
output_width
,
output_depth
;
if
(
d2s_
)
{
if
(
d2s_
)
{
output_height
=
input_height
*
block_size_
;
output_height
=
input_height
*
block_size_
;
...
@@ -46,8 +46,8 @@ void DepthToSpaceOpFunctor<DeviceType::GPU, T>::operator()(
...
@@ -46,8 +46,8 @@ void DepthToSpaceOpFunctor<DeviceType::GPU, T>::operator()(
gws
[
0
]
=
static_cast
<
uint32_t
>
(
RoundUpDiv4
(
output_depth
));
gws
[
0
]
=
static_cast
<
uint32_t
>
(
RoundUpDiv4
(
output_depth
));
gws
[
1
]
=
static_cast
<
uint32_t
>
(
output_width
);
gws
[
1
]
=
static_cast
<
uint32_t
>
(
output_width
);
gws
[
2
]
=
static_cast
<
uint32_t
>
(
output_height
*
batch
);
gws
[
2
]
=
static_cast
<
uint32_t
>
(
output_height
*
batch
);
ss
<<
"depth_to_space_opencl_kernel_"
<<
batch
<<
"_"
tuning_key
=
Concat
(
"depth_to_space_opencl_kernel"
,
batch
,
output_height
,
<<
output_height
<<
"_"
<<
output_width
<<
"_"
<<
output_depth
;
output_width
,
output_depth
)
;
}
else
{
}
else
{
output_height
=
input_height
/
block_size_
;
output_height
=
input_height
/
block_size_
;
output_width
=
input_width
/
block_size_
;
output_width
=
input_width
/
block_size_
;
...
@@ -59,8 +59,8 @@ void DepthToSpaceOpFunctor<DeviceType::GPU, T>::operator()(
...
@@ -59,8 +59,8 @@ void DepthToSpaceOpFunctor<DeviceType::GPU, T>::operator()(
gws
[
0
]
=
static_cast
<
uint32_t
>
(
RoundUpDiv4
(
input_depth
));
gws
[
0
]
=
static_cast
<
uint32_t
>
(
RoundUpDiv4
(
input_depth
));
gws
[
1
]
=
static_cast
<
uint32_t
>
(
input_width
);
gws
[
1
]
=
static_cast
<
uint32_t
>
(
input_width
);
gws
[
2
]
=
static_cast
<
uint32_t
>
(
input_height
*
batch
);
gws
[
2
]
=
static_cast
<
uint32_t
>
(
input_height
*
batch
);
ss
<<
"space_to_depth_opencl_kernel_"
<<
input
->
dim
(
0
)
<<
"_"
tuning_key
=
Concat
(
"space_to_depth_opencl_kernel"
,
input
->
dim
(
0
),
<<
input
->
dim
(
1
)
<<
"_"
<<
input
->
dim
(
2
)
<<
"_"
<<
input
->
dim
(
3
);
input
->
dim
(
1
),
input
->
dim
(
2
),
input
->
dim
(
3
)
);
}
}
const
index_t
input_depth_blocks
=
RoundUpDiv4
(
input_depth
);
const
index_t
input_depth_blocks
=
RoundUpDiv4
(
input_depth
);
const
index_t
output_depth_blocks
=
RoundUpDiv4
(
output_depth
);
const
index_t
output_depth_blocks
=
RoundUpDiv4
(
output_depth
);
...
@@ -134,8 +134,8 @@ void DepthToSpaceOpFunctor<DeviceType::GPU, T>::operator()(
...
@@ -134,8 +134,8 @@ void DepthToSpaceOpFunctor<DeviceType::GPU, T>::operator()(
input_shape_
=
input
->
shape
();
input_shape_
=
input
->
shape
();
}
}
const
std
::
vector
<
uint32_t
>
lws
=
{
8
,
kwg_size_
/
64
,
8
,
0
}
;
const
std
::
vector
<
uint32_t
>
lws
=
Default3DLocalWS
(
gws
,
kwg_size_
)
;
TuningOrRun3DKernel
(
kernel_
,
ss
.
str
()
,
gws
,
lws
,
future
);
TuningOrRun3DKernel
(
kernel_
,
tuning_key
,
gws
,
lws
,
future
);
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
kernel_error_
->
Map
(
nullptr
);
kernel_error_
->
Map
(
nullptr
);
...
...
mace/kernels/opencl/depthwise_conv
_opencl
.cc
→
mace/kernels/opencl/depthwise_conv.cc
浏览文件 @
cf5cae14
...
@@ -21,6 +21,37 @@
...
@@ -21,6 +21,37 @@
namespace
mace
{
namespace
mace
{
namespace
kernels
{
namespace
kernels
{
namespace {
// (inputs + weights + outputs) * array_size * sizeof(float)
const uint32_t kernel_cache_size = (4 + 4 + 1) * 4 * 4;

// Chooses the default local work-group size for the depthwise convolution
// kernel.
//
// gws      - 3-D global work size (gws[0], gws[1] and gws[2] are read).
// kwg_size - work-group budget for the kernel (presumably the kernel's
//            maximum work-group size -- confirm against the callers).
//
// Returns a 4-element vector {lws0, lws1, lws2, 0}. The fourth entry is left
// at its initial value of 0.
//
// Two things differ from the previous version:
//  * lws[0] is now capped by kwg_size / lws[1] on every path. Before, the
//    `lws[1] >= min_lws0` path skipped that cap, so lws[0] * lws[1] could
//    exceed kwg_size and kwg_size / lws_size then forced lws[2] to 0.
//  * No divisor or returned local size can be 0 any more.
std::vector<uint32_t> LocalWS(const uint32_t *gws, const uint32_t kwg_size) {
  std::vector<uint32_t> lws(4, 0);
  if (kwg_size == 0) {
    // Without a budget every quotient below is zero; use a 1x1x1 group
    // instead of dividing by zero.
    lws[0] = lws[1] = lws[2] = 1;
    return lws;
  }

  const uint64_t cache_size =
      OpenCLRuntime::Global()->device_global_mem_cache_size();
  const uint32_t min_lws0 = cache_size / kBaseGPUMemCacheSize;

  lws[1] = std::max<uint32_t>(std::min<uint32_t>(gws[1], kwg_size), 1);
  // Largest lws[0] that keeps lws[0] * lws[1] within kwg_size; always >= 1.
  const uint32_t max_lws0 = kwg_size / lws[1];
  if (lws[1] >= min_lws0) {
    lws[0] = std::min<uint32_t>(gws[0], min_lws0);
  } else {
    lws[0] = std::min<uint32_t>(gws[0] / 8, max_lws0);
    if (lws[0] < min_lws0) {
      lws[0] = std::max<uint32_t>(gws[0] / 4, min_lws0);
    }
  }
  // Cap by the budget and keep it non-zero (min_lws0 or gws[0] may be 0).
  lws[0] = std::max<uint32_t>(std::min<uint32_t>(lws[0], max_lws0), 1);

  // lws[0] * lws[1] is within [1, kwg_size] here.
  const uint32_t lws_size = lws[0] * lws[1];
  lws[2] = std::min<uint32_t>(
      (cache_size / kernel_cache_size / lws_size) * 4, gws[2]);
  if (lws[2] == 0) {
    lws[2] = gws[2];
  }
  lws[2] =
      std::max<uint32_t>(std::min<uint32_t>(lws[2], kwg_size / lws_size), 1);
  return lws;
}

}  // namespace
static
void
DepthwiseConv2d
(
cl
::
Kernel
*
kernel
,
static
void
DepthwiseConv2d
(
cl
::
Kernel
*
kernel
,
const
Tensor
*
input
,
// NHWC
const
Tensor
*
input
,
// NHWC
const
Tensor
*
filter
,
// HWIM
const
Tensor
*
filter
,
// HWIM
...
@@ -149,9 +180,9 @@ static void DepthwiseConv2d(cl::Kernel *kernel,
...
@@ -149,9 +180,9 @@ static void DepthwiseConv2d(cl::Kernel *kernel,
*
prev_input_shape
=
input
->
shape
();
*
prev_input_shape
=
input
->
shape
();
}
}
const
std
::
vector
<
uint32_t
>
lws
=
{
8
,
*
kwg_size
/
64
,
8
,
0
}
;
const
std
::
vector
<
uint32_t
>
lws
=
LocalWS
(
gws
,
*
kwg_size
)
;
std
::
string
tuning_key
=
Concat
(
"depthwise_conv2d_ocl_kernel
_"
,
activation
,
std
::
string
tuning_key
=
Concat
(
"depthwise_conv2d_ocl_kernel
"
,
batch
,
height
,
width
,
channels
,
multiplier
);
gws
[
0
],
gws
[
1
],
gws
[
2
]
,
multiplier
);
TuningOrRun3DKernel
(
*
kernel
,
tuning_key
,
gws
,
lws
,
future
);
TuningOrRun3DKernel
(
*
kernel
,
tuning_key
,
gws
,
lws
,
future
);
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
...
...
mace/kernels/opencl/eltwise
_opencl
.cc
→
mace/kernels/opencl/eltwise.cc
浏览文件 @
cf5cae14
...
@@ -116,11 +116,11 @@ void EltwiseFunctor<DeviceType::GPU, T>::operator()(const Tensor *input0,
...
@@ -116,11 +116,11 @@ void EltwiseFunctor<DeviceType::GPU, T>::operator()(const Tensor *input0,
input_shape_
=
input0
->
shape
();
input_shape_
=
input0
->
shape
();
}
}
const
std
::
vector
<
uint32_t
>
lws
=
{
8
,
kwg_size_
/
64
,
8
,
0
}
;
const
std
::
vector
<
uint32_t
>
lws
=
Default3DLocalWS
(
gws
,
kwg_size_
)
;
std
::
string
stream
ss
;
std
::
string
tuning_key
=
ss
<<
"eltwise_opencl_kernel_"
<<
output
->
dim
(
0
)
<<
"_"
<<
output
->
dim
(
1
)
Concat
(
"eltwise_opencl_kernel"
,
output
->
dim
(
0
),
<<
"_"
<<
output
->
dim
(
2
)
<<
"_"
<<
output
->
dim
(
3
);
output
->
dim
(
1
),
output
->
dim
(
2
),
output
->
dim
(
3
)
);
TuningOrRun3DKernel
(
kernel_
,
ss
.
str
()
,
gws
,
lws
,
future
);
TuningOrRun3DKernel
(
kernel_
,
tuning_key
,
gws
,
lws
,
future
);
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
kernel_error_
->
Map
(
nullptr
);
kernel_error_
->
Map
(
nullptr
);
char
*
kerror_code
=
kernel_error_
->
mutable_data
<
char
>
();
char
*
kerror_code
=
kernel_error_
->
mutable_data
<
char
>
();
...
...
mace/kernels/opencl/fully_connected
_opencl
.cc
→
mace/kernels/opencl/fully_connected.cc
浏览文件 @
cf5cae14
...
@@ -267,10 +267,10 @@ void FCWTXKernel(cl::Kernel *kernel,
...
@@ -267,10 +267,10 @@ void FCWTXKernel(cl::Kernel *kernel,
*
prev_input_shape
=
input
->
shape
();
*
prev_input_shape
=
input
->
shape
();
}
}
std
::
string
stream
ss
;
std
::
string
tuning_key
=
ss
<<
"fc_opencl_kernel_"
<<
output
->
dim
(
0
)
<<
"_"
<<
output
->
dim
(
1
)
<<
"_"
Concat
(
"fc_opencl_kernel"
,
output
->
dim
(
0
),
<<
output
->
dim
(
2
)
<<
"_"
<<
output
->
dim
(
3
);
output
->
dim
(
1
),
output
->
dim
(
2
),
output
->
dim
(
3
)
);
TuningOrRun2DKernel
(
*
kernel
,
ss
.
str
()
,
gws
->
data
(),
*
lws
,
future
);
TuningOrRun2DKernel
(
*
kernel
,
tuning_key
,
gws
->
data
(),
*
lws
,
future
);
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
(
*
kernel_error
)
->
Map
(
nullptr
);
(
*
kernel_error
)
->
Map
(
nullptr
);
...
...
mace/kernels/opencl/helper.cc
浏览文件 @
cf5cae14
...
@@ -206,6 +206,32 @@ std::string DtToUpstreamCLCMDDt(const DataType dt) {
...
@@ -206,6 +206,32 @@ std::string DtToUpstreamCLCMDDt(const DataType dt) {
}
}
}
}
// Computes a default local work-group size for a 2-D kernel launch.
//
// gws:      global work size. Not read by this heuristic.
// kwg_size: maximum work-group size allowed for the kernel.
//
// Returns a 3-element vector {lws[0], lws[1], 0}. The trailing element is
// left at 0 (presumably a slot consumed by the tuner -- TODO confirm against
// TuningOrRun2DKernel). lws[0] * lws[1] never exceeds kwg_size.
std::vector<uint32_t> Default2DLocalWS(const uint32_t *gws,
                                       const uint32_t kwg_size) {
  std::vector<uint32_t> lws(3, 0);
  if (kwg_size == 0) {
    // No usable limit was reported; fall back to the smallest valid group
    // instead of dividing by zero below.
    lws[0] = lws[1] = 1;
    return lws;
  }
  const uint64_t cache_size =
      OpenCLRuntime::Global()->device_global_mem_cache_size();
  // Clamp to 1: a reported cache smaller than kBaseGPUMemCacheSize (or 0)
  // would otherwise produce base == 0 and a zero divisor.
  const uint32_t base =
      std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
  lws[0] = std::min<uint32_t>(base, kwg_size);
  // Give the second dimension whatever is left of the work-group budget.
  // The original divided by lws[1], which is still 0 at this point.
  lws[1] = kwg_size / lws[0];
  return lws;
}
// Computes a default local work-group size for a 3-D kernel launch.
//
// gws:      global work size; gws[1] and gws[2] are read and are assumed to
//           be non-zero (a zero entry would make a divisor below zero).
// kwg_size: maximum work-group size allowed for the kernel.
//
// Returns a 4-element vector {lws[0], lws[1], lws[2], 0}. The trailing
// element is left at 0 (presumably a slot consumed by the tuner -- TODO
// confirm against TuningOrRun3DKernel). The product of the first three
// entries never exceeds kwg_size.
std::vector<uint32_t> Default3DLocalWS(const uint32_t *gws,
                                       const uint32_t kwg_size) {
  std::vector<uint32_t> lws(4, 0);
  if (kwg_size == 0) {
    // No usable limit was reported; fall back to the smallest valid group
    // instead of dividing by zero below.
    lws[0] = lws[1] = lws[2] = 1;
    return lws;
  }
  const uint64_t cache_size =
      OpenCLRuntime::Global()->device_global_mem_cache_size();
  // Clamp to 1: a reported cache smaller than kBaseGPUMemCacheSize (or 0)
  // would otherwise force lws[2] to 0 and make lws_size a zero divisor.
  const uint32_t base =
      std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
  // Dimension 1 gets first pick of the work-group budget.
  lws[1] = std::min<uint32_t>(gws[1], kwg_size);
  // Dimension 2 is capped by its global size, by base, and by what is left.
  lws[2] = std::min<uint32_t>(std::min<uint32_t>(gws[2], base),
                              kwg_size / lws[1]);
  const uint32_t lws_size = lws[1] * lws[2];
  // Dimension 0 takes the remainder, capped by base.
  lws[0] = std::min<uint32_t>(base, kwg_size / lws_size);
  return lws;
}
void
TuningOrRun3DKernel
(
const
cl
::
Kernel
&
kernel
,
void
TuningOrRun3DKernel
(
const
cl
::
Kernel
&
kernel
,
const
std
::
string
tuning_key
,
const
std
::
string
tuning_key
,
const
uint32_t
*
gws
,
const
uint32_t
*
gws
,
...
@@ -216,31 +242,47 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel,
...
@@ -216,31 +242,47 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel,
auto
params_generator
=
[
&
]()
->
std
::
vector
<
std
::
vector
<
uint32_t
>>
{
auto
params_generator
=
[
&
]()
->
std
::
vector
<
std
::
vector
<
uint32_t
>>
{
const
uint32_t
kwg_size
=
const
uint32_t
kwg_size
=
static_cast
<
uint32_t
>
(
runtime
->
GetKernelMaxWorkGroupSize
(
kernel
));
static_cast
<
uint32_t
>
(
runtime
->
GetKernelMaxWorkGroupSize
(
kernel
));
std
::
vector
<
uint32_t
>
local_ws
(
3
,
0
);
std
::
vector
<
std
::
vector
<
uint32_t
>>
results
;
local_ws
[
0
]
=
std
::
min
<
uint32_t
>
(
gws
[
0
],
kwg_size
);
std
::
vector
<
std
::
vector
<
uint32_t
>>
candidates
=
{
local_ws
[
1
]
=
std
::
min
<
uint32_t
>
(
gws
[
1
],
kwg_size
/
local_ws
[
0
]);
local_ws
[
2
]
=
std
::
min
<
uint32_t
>
(
gws
[
2
],
kwg_size
/
(
local_ws
[
0
]
*
local_ws
[
1
]));
return
{
// TODO(heliangliang): tuning these magic numbers
// TODO(heliangliang): tuning these magic numbers
{
local_ws
[
0
],
local_ws
[
1
],
local_ws
[
2
],
0
},
{
gws
[
0
],
gws
[
1
],
gws
[
2
],
0
},
{
kwg_size
/
16
,
4
,
4
,
0
},
{
gws
[
0
],
gws
[
1
],
gws
[
2
]
/
8
,
0
},
{
kwg_size
/
32
,
4
,
8
,
0
},
{
gws
[
0
],
gws
[
1
],
gws
[
2
]
/
4
,
0
},
{
kwg_size
/
32
,
8
,
4
,
0
},
{
gws
[
0
],
gws
[
1
],
8
,
0
},
{
kwg_size
/
64
,
8
,
8
,
0
},
{
gws
[
0
],
gws
[
1
],
4
,
0
},
{
kwg_size
/
64
,
16
,
4
,
0
},
{
gws
[
0
],
gws
[
1
],
1
,
0
},
{
kwg_size
/
128
,
8
,
16
,
0
},
{
gws
[
0
]
/
4
,
gws
[
1
],
gws
[
2
],
0
},
{
kwg_size
/
128
,
16
,
8
,
0
},
{
gws
[
0
]
/
4
,
gws
[
1
],
gws
[
2
]
/
8
,
0
},
{
kwg_size
/
128
,
32
,
4
,
0
},
{
gws
[
0
]
/
4
,
gws
[
1
],
gws
[
2
]
/
4
,
0
},
{
1
,
kwg_size
/
32
,
32
,
0
},
{
gws
[
0
]
/
4
,
gws
[
1
],
8
,
0
},
{
1
,
kwg_size
/
64
,
64
,
0
},
{
gws
[
0
]
/
4
,
gws
[
1
],
4
,
0
},
{
1
,
kwg_size
/
128
,
128
,
0
},
{
gws
[
0
]
/
4
,
gws
[
1
],
1
,
0
},
{
4
,
kwg_size
/
16
,
4
,
0
},
{
gws
[
0
]
/
8
,
gws
[
1
],
gws
[
2
],
0
},
{
4
,
kwg_size
/
28
,
7
,
0
},
{
gws
[
0
]
/
8
,
gws
[
1
],
gws
[
2
]
/
8
,
0
},
{
4
,
kwg_size
/
32
,
8
,
0
},
{
gws
[
0
]
/
8
,
gws
[
1
],
gws
[
2
]
/
4
,
0
},
{
4
,
kwg_size
/
56
,
14
,
0
},
{
gws
[
0
]
/
8
,
gws
[
1
],
8
,
0
},
{
1
,
kwg_size
,
1
,
0
},
{
gws
[
0
]
/
8
,
gws
[
1
],
4
,
0
},
{
gws
[
0
]
/
8
,
gws
[
1
],
1
,
0
},
{
4
,
gws
[
1
],
gws
[
2
],
0
},
{
4
,
gws
[
1
],
gws
[
2
]
/
8
,
0
},
{
4
,
gws
[
1
],
gws
[
2
]
/
4
,
0
},
{
4
,
gws
[
1
],
8
,
0
},
{
4
,
gws
[
1
],
4
,
0
},
{
4
,
gws
[
1
],
1
,
0
},
{
1
,
gws
[
1
],
gws
[
2
],
0
},
{
1
,
gws
[
1
],
gws
[
2
]
/
8
,
0
},
{
1
,
gws
[
1
],
gws
[
2
]
/
4
,
0
},
{
1
,
gws
[
1
],
8
,
0
},
{
1
,
gws
[
1
],
4
,
0
},
{
1
,
gws
[
1
],
1
,
0
},
};
};
for
(
auto
&
ele
:
candidates
)
{
const
uint32_t
tmp
=
ele
[
0
]
*
ele
[
1
]
*
ele
[
2
];
if
(
0
<
tmp
&&
tmp
<=
kwg_size
)
{
results
.
push_back
(
ele
);
}
}
return
results
;
};
};
cl
::
Event
event
;
cl
::
Event
event
;
auto
func
=
[
&
](
const
std
::
vector
<
uint32_t
>
&
params
,
Timer
*
timer
,
auto
func
=
[
&
](
const
std
::
vector
<
uint32_t
>
&
params
,
Timer
*
timer
,
...
@@ -333,19 +375,26 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel,
...
@@ -333,19 +375,26 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel,
auto
params_generator
=
[
&
]()
->
std
::
vector
<
std
::
vector
<
uint32_t
>>
{
auto
params_generator
=
[
&
]()
->
std
::
vector
<
std
::
vector
<
uint32_t
>>
{
const
uint32_t
kwg_size
=
const
uint32_t
kwg_size
=
static_cast
<
uint32_t
>
(
runtime
->
GetKernelMaxWorkGroupSize
(
kernel
));
static_cast
<
uint32_t
>
(
runtime
->
GetKernelMaxWorkGroupSize
(
kernel
));
uint32_t
local_ws
[
2
];
std
::
vector
<
std
::
vector
<
uint32_t
>>
results
;
local_ws
[
0
]
=
std
::
min
<
uint32_t
>
(
gws
[
0
],
kwg_size
);
std
::
vector
<
std
::
vector
<
uint32_t
>>
candidates
=
{
local_ws
[
1
]
=
std
::
min
<
uint32_t
>
(
gws
[
1
],
kwg_size
/
local_ws
[
0
]);
{
kwg_size
/
2
,
2
,
0
},
return
{{
local_ws
[
0
],
local_ws
[
1
],
0
},
{
kwg_size
/
4
,
4
,
0
},
{
local_ws
[
1
],
local_ws
[
0
],
0
},
{
kwg_size
/
8
,
8
,
0
},
{
kwg_size
/
4
,
4
,
0
},
{
kwg_size
/
16
,
16
,
0
},
{
kwg_size
/
16
,
16
,
0
},
{
kwg_size
/
32
,
32
,
0
},
{
kwg_size
/
32
,
32
,
0
},
{
kwg_size
/
64
,
64
,
0
},
{
kwg_size
/
64
,
64
,
0
},
{
kwg_size
/
128
,
128
,
0
},
{
kwg_size
/
128
,
128
,
0
},
{
kwg_size
/
256
,
256
,
0
},
{
kwg_size
/
256
,
256
,
0
},
{
kwg_size
,
1
,
0
},
{
kwg_size
,
1
,
0
},
{
1
,
kwg_size
,
0
}
{
1
,
kwg_size
,
0
}};
};
for
(
auto
&
ele
:
candidates
)
{
const
uint32_t
tmp
=
ele
[
0
]
*
ele
[
1
]
*
ele
[
2
];
if
(
0
<
tmp
&&
tmp
<=
kwg_size
)
{
results
.
push_back
(
ele
);
}
}
return
results
;
};
};
cl
::
Event
event
;
cl
::
Event
event
;
auto
func
=
[
&
](
const
std
::
vector
<
uint32_t
>
&
params
,
Timer
*
timer
,
auto
func
=
[
&
](
const
std
::
vector
<
uint32_t
>
&
params
,
Timer
*
timer
,
...
@@ -426,5 +475,6 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel,
...
@@ -426,5 +475,6 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel,
}
}
}
}
}
// namespace kernels
}
// namespace kernels
}
// namespace mace
}
// namespace mace
mace/kernels/opencl/helper.h
浏览文件 @
cf5cae14
...
@@ -29,6 +29,8 @@ namespace kernels {
...
@@ -29,6 +29,8 @@ namespace kernels {
const
float
kMaxKernelExeTime
=
1000.0
;
// microseconds
const
float
kMaxKernelExeTime
=
1000.0
;
// microseconds
const
int32_t
kBaseGPUMemCacheSize
=
16384
;
enum
BufferType
{
enum
BufferType
{
CONV2D_FILTER
=
0
,
CONV2D_FILTER
=
0
,
IN_OUT_CHANNEL
=
1
,
IN_OUT_CHANNEL
=
1
,
...
@@ -112,6 +114,11 @@ std::string Concat(Args... args) {
...
@@ -112,6 +114,11 @@ std::string Concat(Args... args) {
return
ss
.
str
();
return
ss
.
str
();
}
}
std
::
vector
<
uint32_t
>
Default2DLocalWS
(
const
uint32_t
*
gws
,
const
uint32_t
kwg_size
);
std
::
vector
<
uint32_t
>
Default3DLocalWS
(
const
uint32_t
*
gws
,
const
uint32_t
kwg_size
);
}
// namespace kernels
}
// namespace kernels
}
// namespace mace
}
// namespace mace
#endif // MACE_KERNELS_OPENCL_HELPER_H_
#endif // MACE_KERNELS_OPENCL_HELPER_H_
mace/kernels/opencl/matmul.cc
浏览文件 @
cf5cae14
...
@@ -85,10 +85,10 @@ void MatMulFunctor<DeviceType::GPU, T>::operator()(const Tensor *A,
...
@@ -85,10 +85,10 @@ void MatMulFunctor<DeviceType::GPU, T>::operator()(const Tensor *A,
kernel_
.
setArg
(
idx
++
,
static_cast
<
int
>
(
RoundUpDiv4
(
A
->
dim
(
2
))));
kernel_
.
setArg
(
idx
++
,
static_cast
<
int
>
(
RoundUpDiv4
(
A
->
dim
(
2
))));
const
std
::
vector
<
uint32_t
>
lws
=
{
kwg_size_
/
64
,
64
,
0
};
const
std
::
vector
<
uint32_t
>
lws
=
{
kwg_size_
/
64
,
64
,
0
};
std
::
string
stream
ss
;
std
::
string
tuning_key
=
ss
<<
"matmul_opencl_kernel_"
<<
C
->
dim
(
0
)
<<
"_"
<<
C
->
dim
(
1
)
<<
"_"
Concat
(
"matmul_opencl_kernel"
,
C
->
dim
(
0
),
<<
C
->
dim
(
2
)
<<
"_"
<<
C
->
dim
(
3
);
C
->
dim
(
1
),
C
->
dim
(
2
),
C
->
dim
(
3
)
);
TuningOrRun2DKernel
(
kernel_
,
ss
.
str
()
,
gws
,
lws
,
future
);
TuningOrRun2DKernel
(
kernel_
,
tuning_key
,
gws
,
lws
,
future
);
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
kernel_error_
->
Map
(
nullptr
);
kernel_error_
->
Map
(
nullptr
);
...
...
mace/kernels/opencl/pad.cc
浏览文件 @
cf5cae14
...
@@ -100,7 +100,7 @@ void PadFunctor<DeviceType::GPU, T>::operator()(
...
@@ -100,7 +100,7 @@ void PadFunctor<DeviceType::GPU, T>::operator()(
input_shape_
=
input
->
shape
();
input_shape_
=
input
->
shape
();
}
}
const
std
::
vector
<
uint32_t
>
lws
=
{
8
,
kwg_size_
/
64
,
8
,
0
}
;
const
std
::
vector
<
uint32_t
>
lws
=
Default3DLocalWS
(
gws
,
kwg_size_
)
;
std
::
string
tuning_key
=
std
::
string
tuning_key
=
Concat
(
"pad"
,
output
->
dim
(
0
),
output
->
dim
(
1
),
output
->
dim
(
2
),
Concat
(
"pad"
,
output
->
dim
(
0
),
output
->
dim
(
1
),
output
->
dim
(
2
),
output
->
dim
(
3
));
output
->
dim
(
3
));
...
...
mace/kernels/opencl/pooling
_opencl
.cc
→
mace/kernels/opencl/pooling.cc
浏览文件 @
cf5cae14
...
@@ -21,6 +21,28 @@
...
@@ -21,6 +21,28 @@
namespace
mace
{
namespace
mace
{
namespace
kernels
{
namespace
kernels
{
namespace
{
// Computes the default local work-group size for the pooling kernel.
//
// gws:      3-D global work size; gws[1] and gws[2] are assumed to be
//           non-zero (a zero entry would make a divisor below zero).
// kwg_size: maximum work-group size allowed for the kernel.
//
// Returns a 4-element vector {lws[0], lws[1], lws[2], 0}. The trailing
// element is left at 0 (presumably a slot consumed by the tuner -- TODO
// confirm against TuningOrRun3DKernel). The product of the first three
// entries never exceeds kwg_size.
std::vector<uint32_t> LocalWS(const uint32_t *gws,
                              const uint32_t kwg_size) {
  std::vector<uint32_t> lws(4, 0);
  if (kwg_size == 0) {
    // No usable limit was reported; fall back to the smallest valid group
    // instead of dividing by zero below.
    lws[0] = lws[1] = lws[2] = 1;
    return lws;
  }
  const uint64_t cache_size =
      OpenCLRuntime::Global()->device_global_mem_cache_size();
  // Clamp to 1: a reported cache smaller than kBaseGPUMemCacheSize (or 0)
  // would otherwise force lws[2] to 0 and make lws_size a zero divisor.
  const uint32_t base =
      std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
  lws[1] = std::min<uint32_t>(gws[1], kwg_size);
  lws[2] = std::min<uint32_t>(std::min<uint32_t>(gws[2], base),
                              kwg_size / lws[1]);
  const uint32_t lws_size = lws[1] * lws[2];
  // Aim for a quarter of dimension 0; if that rounds down to nothing,
  // use the whole dimension.
  lws[0] = gws[0] / 4;
  if (lws[0] == 0) {
    lws[0] = gws[0];
  }
  // Keep the total within the kernel's work-group limit.
  lws[0] = std::min<uint32_t>(lws[0], kwg_size / lws_size);
  return lws;
}
}
// namespace
template
<
typename
T
>
template
<
typename
T
>
void
PoolingFunctor
<
DeviceType
::
GPU
,
T
>::
operator
()(
const
Tensor
*
input
,
void
PoolingFunctor
<
DeviceType
::
GPU
,
T
>::
operator
()(
const
Tensor
*
input
,
Tensor
*
output
,
Tensor
*
output
,
...
@@ -134,11 +156,11 @@ void PoolingFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
...
@@ -134,11 +156,11 @@ void PoolingFunctor<DeviceType::GPU, T>::operator()(const Tensor *input,
};
};
}
}
std
::
vector
<
uint32_t
>
lws
=
{
8
,
kwg_size_
/
64
,
8
,
0
}
;
const
std
::
vector
<
uint32_t
>
lws
=
LocalWS
(
gws
.
data
(),
kwg_size_
)
;
std
::
string
stream
ss
;
std
::
string
tuning_key
=
ss
<<
"pooling_opencl_kernel_"
<<
output
->
dim
(
0
)
<<
"_"
<<
output
->
dim
(
1
)
Concat
(
"pooling_opencl_kernel_"
,
output
->
dim
(
0
),
<<
"_"
<<
output
->
dim
(
2
)
<<
"_"
<<
output
->
dim
(
3
);
output
->
dim
(
1
),
output
->
dim
(
2
),
output
->
dim
(
3
)
);
TuningOrRun3DKernel
(
kernel_
,
ss
.
str
()
,
gws
.
data
(),
lws
,
future
);
TuningOrRun3DKernel
(
kernel_
,
tuning_key
,
gws
.
data
(),
lws
,
future
);
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
kernel_error_
->
Map
(
nullptr
);
kernel_error_
->
Map
(
nullptr
);
...
...
mace/kernels/opencl/resize_bilinear
_opencl
.cc
→
mace/kernels/opencl/resize_bilinear.cc
浏览文件 @
cf5cae14
...
@@ -22,6 +22,34 @@
...
@@ -22,6 +22,34 @@
namespace
mace
{
namespace
mace
{
namespace
kernels
{
namespace
kernels
{
namespace
{
// Computes the default local work-group size for the resize-bilinear kernel.
//
// gws:      3-D global work size; gws[0] and gws[1] are assumed to be
//           non-zero (a zero entry would make a divisor below zero).
// kwg_size: maximum work-group size allowed for the kernel.
//
// Returns a 4-element vector {lws[0], lws[1], lws[2], 0}. The trailing
// element is left at 0 (presumably a slot consumed by the tuner -- TODO
// confirm against TuningOrRun3DKernel). The product of the first three
// entries never exceeds kwg_size.
std::vector<uint32_t> LocalWS(const uint32_t *gws,
                              const uint32_t kwg_size) {
  std::vector<uint32_t> lws(4, 0);
  if (kwg_size == 0) {
    // No usable limit was reported; fall back to the smallest valid group
    // instead of dividing by zero below.
    lws[0] = lws[1] = lws[2] = 1;
    return lws;
  }
  const uint64_t cache_size =
      OpenCLRuntime::Global()->device_global_mem_cache_size();
  // Clamp to 1: a reported cache smaller than kBaseGPUMemCacheSize (or 0)
  // would otherwise force lws[0] to 0 and make lws_size a zero divisor.
  const uint32_t base =
      std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
  lws[1] = std::min<uint32_t>(gws[1], kwg_size);
  if (lws[1] >= base) {
    // Dimension 1 is already wide: cap dimension 0 at base.
    lws[0] = std::min<uint32_t>(gws[0], base);
  } else {
    // Otherwise aim for an eighth of dimension 0; if that rounds down to
    // nothing, use the whole dimension.
    lws[0] = gws[0] / 8;
    if (lws[0] == 0) {
      lws[0] = gws[0];
    }
  }
  // Keep lws[0] * lws[1] within the kernel's work-group limit.
  lws[0] = std::min<uint32_t>(lws[0], kwg_size / lws[1]);
  const uint32_t lws_size = lws[0] * lws[1];
  // Same eighth-or-whole rule for dimension 2, then fit the remaining budget.
  lws[2] = gws[2] / 8;
  if (lws[2] == 0) {
    lws[2] = gws[2];
  }
  lws[2] = std::min<uint32_t>(lws[2], kwg_size / lws_size);
  return lws;
}
}
// namespace
template
<
typename
T
>
template
<
typename
T
>
void
ResizeBilinearFunctor
<
DeviceType
::
GPU
,
T
>::
operator
()(
void
ResizeBilinearFunctor
<
DeviceType
::
GPU
,
T
>::
operator
()(
const
Tensor
*
input
,
Tensor
*
output
,
StatsFuture
*
future
)
{
const
Tensor
*
input
,
Tensor
*
output
,
StatsFuture
*
future
)
{
...
@@ -99,11 +127,11 @@ void ResizeBilinearFunctor<DeviceType::GPU, T>::operator()(
...
@@ -99,11 +127,11 @@ void ResizeBilinearFunctor<DeviceType::GPU, T>::operator()(
input_shape_
=
input
->
shape
();
input_shape_
=
input
->
shape
();
}
}
const
std
::
vector
<
uint32_t
>
lws
=
{
8
,
kwg_size_
/
64
,
8
,
0
}
;
const
std
::
vector
<
uint32_t
>
lws
=
LocalWS
(
gws
,
kwg_size_
)
;
std
::
string
stream
ss
;
std
::
string
tuning_key
=
ss
<<
"resize_bilinear_opencl_kernel_"
<<
output
->
dim
(
0
)
<<
"_"
Concat
(
"resize_bilinear_opencl_kernel"
,
output
->
dim
(
0
),
<<
output
->
dim
(
1
)
<<
"_"
<<
output
->
dim
(
2
)
<<
"_"
<<
output
->
dim
(
3
);
output
->
dim
(
1
),
output
->
dim
(
2
),
output
->
dim
(
3
)
);
TuningOrRun3DKernel
(
kernel_
,
ss
.
str
()
,
gws
,
lws
,
future
);
TuningOrRun3DKernel
(
kernel_
,
tuning_key
,
gws
,
lws
,
future
);
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
kernel_error_
->
Map
(
nullptr
);
kernel_error_
->
Map
(
nullptr
);
...
...
mace/kernels/opencl/slice.cc
浏览文件 @
cf5cae14
...
@@ -72,7 +72,7 @@ void SliceFunctor<DeviceType::GPU, T>::operator()(
...
@@ -72,7 +72,7 @@ void SliceFunctor<DeviceType::GPU, T>::operator()(
static_cast
<
uint32_t
>
(
input
->
dim
(
0
)
*
input
->
dim
(
1
)),
static_cast
<
uint32_t
>
(
input
->
dim
(
0
)
*
input
->
dim
(
1
)),
};
};
const
std
::
vector
<
uint32_t
>
lws
=
{
8
,
kwg_size_
/
64
,
8
,
1
}
;
const
std
::
vector
<
uint32_t
>
lws
=
Default3DLocalWS
(
gws
,
kwg_size_
)
;
cl
::
Event
event
;
cl
::
Event
event
;
CallStats
call_stats
{
INT64_MAX
,
0
};
CallStats
call_stats
{
INT64_MAX
,
0
};
for
(
int
i
=
0
;
i
<
outputs_count
;
++
i
)
{
for
(
int
i
=
0
;
i
<
outputs_count
;
++
i
)
{
...
...
mace/kernels/opencl/softmax
_opencl
.cc
→
mace/kernels/opencl/softmax.cc
浏览文件 @
cf5cae14
...
@@ -22,6 +22,27 @@
...
@@ -22,6 +22,27 @@
namespace
mace
{
namespace
mace
{
namespace
kernels
{
namespace
kernels
{
namespace
{
// Computes the default local work-group size for the softmax kernel.
//
// gws:      3-D global work size; gws[0] and gws[1] are assumed to be
//           non-zero (a zero entry would make a divisor below zero).
// kwg_size: maximum work-group size allowed for the kernel.
//
// Returns a 4-element vector {lws[0], lws[1], lws[2], 0}. The trailing
// element is left at 0 (presumably a slot consumed by the tuner -- TODO
// confirm against TuningOrRun3DKernel). The product of the first three
// entries never exceeds kwg_size.
std::vector<uint32_t> LocalWS(const uint32_t *gws,
                              const uint32_t kwg_size) {
  std::vector<uint32_t> lws(4, 0);
  if (kwg_size == 0) {
    // No usable limit was reported; fall back to the smallest valid group
    // instead of dividing by zero below.
    lws[0] = lws[1] = lws[2] = 1;
    return lws;
  }
  const uint64_t cache_size =
      OpenCLRuntime::Global()->device_global_mem_cache_size();
  // Clamp to 1: a reported cache smaller than kBaseGPUMemCacheSize (or 0)
  // would otherwise make the gws[0] / base division below divide by zero.
  const uint32_t base =
      std::max<uint32_t>(cache_size / kBaseGPUMemCacheSize, 1);
  lws[1] = std::min<uint32_t>(gws[1], kwg_size);
  // A small dimension 0 is taken whole; a larger one is scaled down by base.
  if (gws[0] < base) {
    lws[0] = gws[0];
  } else {
    lws[0] = gws[0] / base;
  }
  // Keep lws[0] * lws[1] within the kernel's work-group limit.
  lws[0] = std::min<uint32_t>(lws[0], kwg_size / lws[1]);
  // Dimension 2 gets whatever budget remains.
  lws[2] = std::min<uint32_t>(gws[2], kwg_size / (lws[0] * lws[1]));
  return lws;
}
}
// namespace
template
<
typename
T
>
template
<
typename
T
>
void
SoftmaxFunctor
<
DeviceType
::
GPU
,
T
>::
operator
()(
const
Tensor
*
logits
,
void
SoftmaxFunctor
<
DeviceType
::
GPU
,
T
>::
operator
()(
const
Tensor
*
logits
,
Tensor
*
output
,
Tensor
*
output
,
...
@@ -81,11 +102,11 @@ void SoftmaxFunctor<DeviceType::GPU, T>::operator()(const Tensor *logits,
...
@@ -81,11 +102,11 @@ void SoftmaxFunctor<DeviceType::GPU, T>::operator()(const Tensor *logits,
input_shape_
=
logits
->
shape
();
input_shape_
=
logits
->
shape
();
}
}
const
std
::
vector
<
uint32_t
>
lws
=
{
8
,
kwg_size_
/
64
,
8
,
0
}
;
std
::
vector
<
uint32_t
>
lws
=
LocalWS
(
gws
,
kwg_size_
)
;
std
::
string
stream
ss
;
std
::
string
tuning_key
=
ss
<<
"softmax_opencl_kernel_"
<<
output
->
dim
(
0
)
<<
"_"
<<
output
->
dim
(
1
)
Concat
(
"softmax_opencl_kernel"
,
output
->
dim
(
0
),
<<
"_"
<<
output
->
dim
(
2
)
<<
"_"
<<
output
->
dim
(
3
);
output
->
dim
(
1
),
output
->
dim
(
2
),
output
->
dim
(
3
)
);
TuningOrRun3DKernel
(
kernel_
,
ss
.
str
()
,
gws
,
lws
,
future
);
TuningOrRun3DKernel
(
kernel_
,
tuning_key
,
gws
,
lws
,
future
);
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
kernel_error_
->
Map
(
nullptr
);
kernel_error_
->
Map
(
nullptr
);
...
...
mace/kernels/opencl/space_to_batch
_opencl
.cc
→
mace/kernels/opencl/space_to_batch.cc
浏览文件 @
cf5cae14
...
@@ -105,12 +105,11 @@ void SpaceToBatchFunctor<DeviceType::GPU, T>::operator()(
...
@@ -105,12 +105,11 @@ void SpaceToBatchFunctor<DeviceType::GPU, T>::operator()(
space_shape_
=
space_tensor
->
shape
();
space_shape_
=
space_tensor
->
shape
();
}
}
const
std
::
vector
<
uint32_t
>
lws
=
{
8
,
kwg_size_
/
64
,
8
,
0
};
const
std
::
vector
<
uint32_t
>
lws
=
Default3DLocalWS
(
gws
,
kwg_size_
);
std
::
stringstream
ss
;
std
::
string
tuning_key
=
ss
<<
kernel_name
<<
"_"
<<
batch_tensor
->
dim
(
0
)
<<
"_"
Concat
(
kernel_name
,
batch_tensor
->
dim
(
0
),
batch_tensor
->
dim
(
1
),
<<
batch_tensor
->
dim
(
1
)
<<
"_"
<<
batch_tensor
->
dim
(
2
)
<<
"_"
batch_tensor
->
dim
(
2
),
batch_tensor
->
dim
(
3
));
<<
batch_tensor
->
dim
(
3
);
TuningOrRun3DKernel
(
kernel_
,
tuning_key
,
gws
,
lws
,
future
);
TuningOrRun3DKernel
(
kernel_
,
ss
.
str
(),
gws
,
lws
,
future
);
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
kernel_error_
->
Map
(
nullptr
);
kernel_error_
->
Map
(
nullptr
);
...
...
mace/kernels/opencl/winograd_transform.cc
浏览文件 @
cf5cae14
...
@@ -102,11 +102,11 @@ void WinogradTransformFunctor<DeviceType::GPU, T>::operator()(
...
@@ -102,11 +102,11 @@ void WinogradTransformFunctor<DeviceType::GPU, T>::operator()(
}
}
const
std
::
vector
<
uint32_t
>
lws
=
{
kwg_size_
/
8
,
8
,
0
};
const
std
::
vector
<
uint32_t
>
lws
=
{
kwg_size_
/
8
,
8
,
0
};
std
::
string
stream
ss
;
std
::
string
tuning_key
=
ss
<<
"winograd_transform_kernel_"
<<
input_tensor
->
dim
(
0
)
<<
"_"
Concat
(
"winograd_transform_kernel"
,
output_tensor
->
dim
(
0
),
<<
input_tensor
->
dim
(
1
)
<<
"_"
<<
input_tensor
->
dim
(
2
)
<<
"_"
output_tensor
->
dim
(
1
),
output_tensor
->
dim
(
2
),
<<
input_tensor
->
dim
(
3
);
output_tensor
->
dim
(
3
)
);
TuningOrRun2DKernel
(
kernel_
,
ss
.
str
()
,
gws
,
lws
,
future
);
TuningOrRun2DKernel
(
kernel_
,
tuning_key
,
gws
,
lws
,
future
);
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
kernel_error_
->
Map
(
nullptr
);
kernel_error_
->
Map
(
nullptr
);
...
@@ -216,12 +216,11 @@ void WinogradInverseTransformFunctor<DeviceType::GPU, T>::operator()(
...
@@ -216,12 +216,11 @@ void WinogradInverseTransformFunctor<DeviceType::GPU, T>::operator()(
}
}
const
std
::
vector
<
uint32_t
>
lws
=
{
kwg_size_
/
8
,
8
,
0
};
const
std
::
vector
<
uint32_t
>
lws
=
{
kwg_size_
/
8
,
8
,
0
};
std
::
string
tuning_key
=
std
::
stringstream
ss
;
Concat
(
"winograd_inverse_transform_kernel"
,
output_tensor
->
dim
(
0
),
ss
<<
"winograd_inverse_transform_kernel_"
<<
input_tensor
->
dim
(
0
)
<<
"_"
output_tensor
->
dim
(
1
),
output_tensor
->
dim
(
2
),
<<
input_tensor
->
dim
(
1
)
<<
"_"
<<
input_tensor
->
dim
(
2
)
<<
"_"
output_tensor
->
dim
(
3
),
input_tensor
->
dim
(
2
));
<<
input_tensor
->
dim
(
3
);
TuningOrRun2DKernel
(
kernel_
,
tuning_key
,
gws
,
lws
,
future
);
TuningOrRun2DKernel
(
kernel_
,
ss
.
str
(),
gws
,
lws
,
future
);
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
kernel_error_
->
Map
(
nullptr
);
kernel_error_
->
Map
(
nullptr
);
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录