Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
Xiaomi
Mace
提交
e9eaa4d4
Mace
项目概览
Xiaomi
/
Mace
通知
106
Star
40
Fork
27
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
Mace
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
提交
e9eaa4d4
编写于
3月 30, 2018
作者:
L
liuqi
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
FC and CWise ops support opencl 1.1/1.2.
上级
6fcf7b6e
变更
30
隐藏空白更改
内联
并排
Showing
30 changed file
with
159 addition
and
118 deletion
+159
-118
mace/core/runtime/opencl/opencl_runtime.cc
mace/core/runtime/opencl/opencl_runtime.cc
+6
-6
mace/core/runtime/opencl/opencl_runtime.h
mace/core/runtime/opencl/opencl_runtime.h
+1
-0
mace/kernels/cwise.h
mace/kernels/cwise.h
+1
-0
mace/kernels/opencl/cl/activation.cl
mace/kernels/opencl/cl/activation.cl
+1
-2
mace/kernels/opencl/cl/addn.cl
mace/kernels/opencl/cl/addn.cl
+1
-2
mace/kernels/opencl/cl/batch_norm.cl
mace/kernels/opencl/cl/batch_norm.cl
+1
-2
mace/kernels/opencl/cl/bias_add.cl
mace/kernels/opencl/cl/bias_add.cl
+1
-2
mace/kernels/opencl/cl/buffer_to_image.cl
mace/kernels/opencl/cl/buffer_to_image.cl
+12
-24
mace/kernels/opencl/cl/channel_shuffle.cl
mace/kernels/opencl/cl/channel_shuffle.cl
+1
-2
mace/kernels/opencl/cl/common.h
mace/kernels/opencl/cl/common.h
+4
-4
mace/kernels/opencl/cl/concat.cl
mace/kernels/opencl/cl/concat.cl
+2
-4
mace/kernels/opencl/cl/conv_2d.cl
mace/kernels/opencl/cl/conv_2d.cl
+1
-2
mace/kernels/opencl/cl/conv_2d_1x1.cl
mace/kernels/opencl/cl/conv_2d_1x1.cl
+1
-2
mace/kernels/opencl/cl/conv_2d_3x3.cl
mace/kernels/opencl/cl/conv_2d_3x3.cl
+1
-2
mace/kernels/opencl/cl/cwise.cl
mace/kernels/opencl/cl/cwise.cl
+8
-3
mace/kernels/opencl/cl/depth_to_space.cl
mace/kernels/opencl/cl/depth_to_space.cl
+2
-3
mace/kernels/opencl/cl/depthwise_conv2d.cl
mace/kernels/opencl/cl/depthwise_conv2d.cl
+2
-4
mace/kernels/opencl/cl/eltwise.cl
mace/kernels/opencl/cl/eltwise.cl
+1
-2
mace/kernels/opencl/cl/fully_connected.cl
mace/kernels/opencl/cl/fully_connected.cl
+20
-3
mace/kernels/opencl/cl/matmul.cl
mace/kernels/opencl/cl/matmul.cl
+1
-2
mace/kernels/opencl/cl/pooling.cl
mace/kernels/opencl/cl/pooling.cl
+1
-2
mace/kernels/opencl/cl/resize_bilinear.cl
mace/kernels/opencl/cl/resize_bilinear.cl
+1
-2
mace/kernels/opencl/cl/slice.cl
mace/kernels/opencl/cl/slice.cl
+1
-2
mace/kernels/opencl/cl/softmax.cl
mace/kernels/opencl/cl/softmax.cl
+1
-2
mace/kernels/opencl/cl/space_to_batch.cl
mace/kernels/opencl/cl/space_to_batch.cl
+2
-4
mace/kernels/opencl/cl/winograd_transform.cl
mace/kernels/opencl/cl/winograd_transform.cl
+2
-4
mace/kernels/opencl/cwise_opencl.cc
mace/kernels/opencl/cwise_opencl.cc
+14
-4
mace/kernels/opencl/fully_connected_opencl.cc
mace/kernels/opencl/fully_connected_opencl.cc
+67
-24
mace/ops/depthwise_conv2d_test.cc
mace/ops/depthwise_conv2d_test.cc
+0
-1
mace/ops/fully_connected_test.cc
mace/ops/fully_connected_test.cc
+2
-2
未找到文件。
mace/core/runtime/opencl/opencl_runtime.cc
浏览文件 @
e9eaa4d4
...
...
@@ -480,12 +480,12 @@ uint64_t OpenCLRuntime::GetKernelWaveSize(const cl::Kernel &kernel) {
}
const
bool
OpenCLRuntime
::
IsNonUniformWorkgroupsSupported
()
{
if
(
gpu_type_
==
GPUType
::
QUALCOMM_ADRENO
&&
opencl_version_
==
"2.0"
)
{
return
true
;
}
else
{
return
false
;
}
return
(
gpu_type_
==
GPUType
::
QUALCOMM_ADRENO
&&
opencl_version_
==
"2.0"
)
;
}
const
GPUType
OpenCLRuntime
::
gpu_type
()
const
{
return
gpu_type_
;
}
const
GPUType
OpenCLRuntime
::
ParseGPUTypeFromDeviceName
(
...
...
mace/core/runtime/opencl/opencl_runtime.h
浏览文件 @
e9eaa4d4
...
...
@@ -66,6 +66,7 @@ class OpenCLRuntime {
uint64_t
GetKernelWaveSize
(
const
cl
::
Kernel
&
kernel
);
const
bool
IsNonUniformWorkgroupsSupported
();
const
GPUType
ParseGPUTypeFromDeviceName
(
const
std
::
string
&
device_name
);
const
GPUType
gpu_type
()
const
;
cl
::
Kernel
BuildKernel
(
const
std
::
string
&
program_name
,
const
std
::
string
&
kernel_name
,
const
std
::
set
<
std
::
string
>
&
build_options
);
...
...
mace/kernels/cwise.h
浏览文件 @
e9eaa4d4
...
...
@@ -114,6 +114,7 @@ struct CWiseFunctor<DeviceType::OPENCL, T> : CWiseFunctorBase {
StatsFuture
*
future
);
cl
::
Kernel
kernel_
;
uint32_t
kwg_size_
;
std
::
vector
<
index_t
>
input_shape_
;
};
...
...
mace/kernels/opencl/cl/activation.cl
浏览文件 @
e9eaa4d4
#
include
<common.h>
__kernel
void
activation
(
UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
__kernel
void
activation
(
GLOBAL_WORK_GROUP_SIZE_DIM3
__read_only
image2d_t
input,
#
ifdef
USE_PRELU
__read_only
image2d_t
alpha,
...
...
mace/kernels/opencl/cl/addn.cl
浏览文件 @
e9eaa4d4
#
include
<common.h>
__kernel
void
addn
(
UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
__kernel
void
addn
(
GLOBAL_WORK_GROUP_SIZE_DIM2
__read_only
image2d_t
input0,
/*
[c%4
*
w
*
c/4,
h
*
b]
*/
__read_only
image2d_t
input1,
#
if
INPUT_NUM
>
2
...
...
mace/kernels/opencl/cl/batch_norm.cl
浏览文件 @
e9eaa4d4
#
include
<common.h>
//
Supported
data
types:
half/float
__kernel
void
batch_norm
(
UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
__kernel
void
batch_norm
(
GLOBAL_WORK_GROUP_SIZE_DIM3
__read_only
image2d_t
input,
__read_only
image2d_t
scale,
__read_only
image2d_t
offset,
...
...
mace/kernels/opencl/cl/bias_add.cl
浏览文件 @
e9eaa4d4
#
include
<common.h>
//
Supported
data
types:
half/float
__kernel
void
bias_add
(
UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
__kernel
void
bias_add
(
GLOBAL_WORK_GROUP_SIZE_DIM3
__read_only
image2d_t
input,
__read_only
image2d_t
bias,
__write_only
image2d_t
output
)
{
...
...
mace/kernels/opencl/cl/buffer_to_image.cl
浏览文件 @
e9eaa4d4
#
include
<common.h>
__kernel
void
filter_buffer_to_image
(
UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
__kernel
void
filter_buffer_to_image
(
GLOBAL_WORK_GROUP_SIZE_DIM2
__global
const
DATA_TYPE
*input,
/*
h,
w,
oc,
ic
*/
__private
const
int
input_offset,
__private
const
int
filter_h,
...
...
@@ -53,8 +52,7 @@ __kernel void filter_buffer_to_image(
WRITE_IMAGET(output, coord, values);
}
__kernel void filter_image_to_buffer(
UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
__kernel void filter_image_to_buffer(GLOBAL_WORK_GROUP_SIZE_DIM2
__global DATA_TYPE *output, /* h, w, oc, ic */
__private const int filter_h,
__private const int filter_w,
...
...
@@ -102,8 +100,7 @@ __kernel void filter_image_to_buffer(
}
}
__kernel void dw_filter_buffer_to_image(
UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
__kernel void dw_filter_buffer_to_image(GLOBAL_WORK_GROUP_SIZE_DIM2
__global const DATA_TYPE *input, /* h, w, ic, m */
__private const int input_offset,
__private const int filter_w,
...
...
@@ -160,8 +157,7 @@ __kernel void dw_filter_buffer_to_image(
WRITE_IMAGET(output, coord, values);
}
__kernel void in_out_buffer_to_image(
UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
__kernel void in_out_buffer_to_image(GLOBAL_WORK_GROUP_SIZE_DIM2
__global const DATA_TYPE *input, /* nhwc */
__private const int input_offset,
__private const int height,
...
...
@@ -202,8 +198,7 @@ __kernel void in_out_buffer_to_image(
WRITE_IMAGET(output, coord, values);
}
__kernel void in_out_image_to_buffer(
UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
__kernel void in_out_image_to_buffer(GLOBAL_WORK_GROUP_SIZE_DIM2
__global DATA_TYPE *output, /* nhwc */
__private const int height,
__private const int width,
...
...
@@ -242,8 +237,7 @@ __kernel void in_out_image_to_buffer(
}
}
__kernel void arg_buffer_to_image(
UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
__kernel void arg_buffer_to_image(GLOBAL_WORK_GROUP_SIZE_DIM2
__global const DATA_TYPE *input, /* nhwc */
__private const int input_offset,
__private const int count,
...
...
@@ -278,8 +272,7 @@ __kernel void arg_buffer_to_image(
WRITE_IMAGET(output, coord, values);
}
__kernel void arg_image_to_buffer(
UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
__kernel void arg_image_to_buffer(GLOBAL_WORK_GROUP_SIZE_DIM2
__global DATA_TYPE *output, /* nhwc */
__private const int count,
__read_only image2d_t input) {
...
...
@@ -312,8 +305,7 @@ __kernel void arg_image_to_buffer(
}
__kernel void in_out_height_buffer_to_image(
UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
__kernel void in_out_height_buffer_to_image(GLOBAL_WORK_GROUP_SIZE_DIM2
__global const DATA_TYPE *input, //nhwc
__private const int input_offset,
__private const int height,
...
...
@@ -355,8 +347,7 @@ __kernel void in_out_height_buffer_to_image(
WRITE_IMAGET(output, coord, values);
}
__kernel void in_out_height_image_to_buffer(
UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
__kernel void in_out_height_image_to_buffer(GLOBAL_WORK_GROUP_SIZE_DIM2
__global DATA_TYPE *output, //nhwc
__private const int height,
__private const int width,
...
...
@@ -394,8 +385,7 @@ __kernel void in_out_height_image_to_buffer(
}
__kernel void in_out_width_buffer_to_image(
UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
__kernel void in_out_width_buffer_to_image(GLOBAL_WORK_GROUP_SIZE_DIM2
__global const DATA_TYPE *input, /* nhwc */
__private const int input_offset,
__private const int height,
...
...
@@ -437,8 +427,7 @@ __kernel void in_out_width_buffer_to_image(
}
// only support 3x3 now
__kernel void winograd_filter_buffer_to_image(
UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
__kernel void winograd_filter_buffer_to_image(GLOBAL_WORK_GROUP_SIZE_DIM2
__global const DATA_TYPE *input, //Oc, Ic, H, W
__private const int input_offset,
__private const int in_channels,
...
...
@@ -529,8 +518,7 @@ __kernel void winograd_filter_buffer_to_image(
}
// only support 3x3 now
__kernel void winograd_filter_image_to_buffer(
UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
__kernel void winograd_filter_image_to_buffer(GLOBAL_WORK_GROUP_SIZE_DIM2
__global DATA_TYPE *output, //Oc, Ic, H, W
__private const int height,
__private const int width,
...
...
mace/kernels/opencl/cl/channel_shuffle.cl
浏览文件 @
e9eaa4d4
#
include
<common.h>
//
assume
channes_per_group
mod
4
=
0
&&
groups
mod
4
==
0
__kernel
void
channel_shuffle
(
UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
__kernel
void
channel_shuffle
(
GLOBAL_WORK_GROUP_SIZE_DIM3
__read_only
image2d_t
input,
__private
const
int
groups,
__private
const
int
channels_per_group,
...
...
mace/kernels/opencl/cl/common.h
浏览文件 @
e9eaa4d4
...
...
@@ -19,18 +19,18 @@
#ifndef NON_UNIFORM_WORK_GROUP
#define
UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_
2 \
#define
GLOBAL_WORK_GROUP_SIZE_DIM
2 \
__private const int global_size_dim0, \
__private const int global_size_dim1,
#define
UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_
3 \
#define
GLOBAL_WORK_GROUP_SIZE_DIM
3 \
__private const int global_size_dim0, \
__private const int global_size_dim1, \
__private const int global_size_dim2,
#else
#define
UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_
2
#define
UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_
3
#define
GLOBAL_WORK_GROUP_SIZE_DIM
2
#define
GLOBAL_WORK_GROUP_SIZE_DIM
3
#endif
...
...
mace/kernels/opencl/cl/concat.cl
浏览文件 @
e9eaa4d4
...
...
@@ -22,8 +22,7 @@ DATA_TYPE4 stitch_vector(DATA_TYPE4 left,
}
//
Supported
data
type:
half/float
__kernel
void
concat_channel
(
UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
__kernel
void
concat_channel
(
GLOBAL_WORK_GROUP_SIZE_DIM3
__read_only
image2d_t
input0,
__read_only
image2d_t
input1,
__private
const
int
input0_chan,
...
...
@@ -84,8 +83,7 @@ __kernel void concat_channel(
}
// Required: All input channels are divisible by 4
__kernel void concat_channel_multi(
UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
__kernel void concat_channel_multi(GLOBAL_WORK_GROUP_SIZE_DIM3
__read_only image2d_t input,
__private const int chan_blk_offset,
__write_only image2d_t output) {
...
...
mace/kernels/opencl/cl/conv_2d.cl
浏览文件 @
e9eaa4d4
#
include
<common.h>
__kernel
void
conv_2d
(
UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
__kernel
void
conv_2d
(
GLOBAL_WORK_GROUP_SIZE_DIM3
__read_only
image2d_t
input,
/*
[c%4
*
w
*
c/4,
h
*
b]
*/
__read_only
image2d_t
filter,
/*
cout%4
*
cin,
kh
*
kw
*
cout/4
*/
#
ifdef
BIAS
...
...
mace/kernels/opencl/cl/conv_2d_1x1.cl
浏览文件 @
e9eaa4d4
#
include
<common.h>
__kernel
void
conv_2d_1x1
(
UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
__kernel
void
conv_2d_1x1
(
GLOBAL_WORK_GROUP_SIZE_DIM3
__read_only
image2d_t
input,
/*
[c%4
*
w
*
c/4,
h
*
b]
*/
__read_only
image2d_t
filter,
/*
cout%4
*
cin,
cout/4
*/
#
ifdef
BIAS
...
...
mace/kernels/opencl/cl/conv_2d_3x3.cl
浏览文件 @
e9eaa4d4
#
include
<common.h>
__kernel
void
conv_2d_3x3
(
UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
__kernel
void
conv_2d_3x3
(
GLOBAL_WORK_GROUP_SIZE_DIM3
__read_only
image2d_t
input,
/*
[c%4
*
w
*
c/4,
h
*
b]
*/
__read_only
image2d_t
filter,
/*
cout%4
*
cin
,
kh
*
kw
*
cout/4
*/
#
ifdef
BIAS
...
...
mace/kernels/opencl/cl/cwise.cl
浏览文件 @
e9eaa4d4
#
include
<common.h>
__kernel
void
cwise
(
__read_only
image2d_t
input,
/*
[c%4
*
w
*
c/4,
h
*
b]
*/
__private
const
float
value,
__write_only
image2d_t
output
)
{
__kernel
void
cwise
(
GLOBAL_WORK_GROUP_SIZE_DIM2
__read_only
image2d_t
input,
/*
[c%4
*
w
*
c/4,
h
*
b]
*/
__private
const
float
value,
__write_only
image2d_t
output
)
{
const
int
w
=
get_global_id
(
0
)
;
const
int
hb
=
get_global_id
(
1
)
;
#
ifndef
NON_UNIFORM_WORK_GROUP
if
(
w
>=
global_size_dim0
||
hb
>=
global_size_dim1
)
return
;
#
endif
DATA_TYPE4
in0
=
READ_IMAGET
(
input,
SAMPLER,
(
int2
)(
w,
hb
))
;
DATA_TYPE4
in1
=
(
DATA_TYPE4
)
{value,
value,
value,
value}
;
DATA_TYPE4
out
;
...
...
mace/kernels/opencl/cl/depth_to_space.cl
浏览文件 @
e9eaa4d4
#
include
<common.h>
__kernel
void
depth_to_space
(
UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
__kernel
void
depth_to_space
(
GLOBAL_WORK_GROUP_SIZE_DIM3
__read_only
image2d_t
input,
__private
const
int
block_size,
__private
const
int
input_height,
...
...
@@ -36,7 +35,7 @@ __kernel void depth_to_space(
}
__kernel void space_to_depth(
UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_
3
GLOBAL_WORK_GROUP_SIZE_DIM
3
__read_only image2d_t input,
__private const int block_size,
__private const int input_height,
...
...
mace/kernels/opencl/cl/depthwise_conv2d.cl
浏览文件 @
e9eaa4d4
#
include
<common.h>
//
Only
multiplier
=
1
is
supported
__kernel
void
depthwise_conv2d
(
UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
__kernel
void
depthwise_conv2d
(
GLOBAL_WORK_GROUP_SIZE_DIM3
__read_only
image2d_t
input,
/*
[c%4
*
w
*
c/4,
h
*
b]
*/
__read_only
image2d_t
filter,
/*
cout%4
*
kh
*
kw
*
m,
cin/4
*/
#
ifdef
BIAS
...
...
@@ -138,8 +137,7 @@ __kernel void depthwise_conv2d(
WRITE_IMAGET(output, (int2)(out_x_base + w, out_hb), out3);
}
__kernel void depthwise_conv2d_s1(
UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
__kernel void depthwise_conv2d_s1(GLOBAL_WORK_GROUP_SIZE_DIM3
__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
__read_only image2d_t filter, /* cout%4 * kh * kw * m, cin/4 */
#ifdef BIAS
...
...
mace/kernels/opencl/cl/eltwise.cl
浏览文件 @
e9eaa4d4
#
include
<common.h>
__kernel
void
eltwise
(
UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
__kernel
void
eltwise
(
GLOBAL_WORK_GROUP_SIZE_DIM2
__read_only
image2d_t
input0,
/*
[c%4
*
w
*
c/4,
h
*
b]
*/
__read_only
image2d_t
input1,
#
ifdef
COEFF_SUM
...
...
mace/kernels/opencl/cl/fully_connected.cl
浏览文件 @
e9eaa4d4
#
include
<common.h>
//
output
=
weight
*
input
+
bias
__kernel
void
fully_connected
(
__read_only
image2d_t
input,
__kernel
void
fully_connected
(
GLOBAL_WORK_GROUP_SIZE_DIM2
__read_only
image2d_t
input,
__read_only
image2d_t
weight,
#
ifdef
BIAS
__read_only
image2d_t
bias,
...
...
@@ -15,6 +16,10 @@ __kernel void fully_connected(__read_only image2d_t input,
const
int
out_blk_idx
=
get_global_id
(
1
)
;
const
int
input_chan_blk
=
(
input_channel
+
3
)
>>
2
;
#
ifndef
NON_UNIFORM_WORK_GROUP
if
(
batch_idx
>=
global_size_dim0
|
| out_blk_idx >= global_size_dim1) return;
#endif
float4 input_value;
float4 w0, w1, w2, w3;
...
...
@@ -57,7 +62,8 @@ __kernel void fully_connected(__read_only image2d_t input,
}
// output = weight * input + bias
__kernel void fully_connected_width(__read_only image2d_t input,
__kernel void fully_connected_width(GLOBAL_WORK_GROUP_SIZE_DIM3
__read_only image2d_t input,
__read_only image2d_t weight,
#ifdef BIAS
__read_only image2d_t bias,
...
...
@@ -73,6 +79,7 @@ __kernel void fully_connected_width(__read_only image2d_t input,
const int width_blk_idx = get_global_id(1);
const int width_blk_count = get_global_size(1);
const int batch_out_blk_idx = get_global_id(2);
const int batch_idx = batch_out_blk_idx / out_blks;
const int out_blk_idx = batch_out_blk_idx % out_blks;
...
...
@@ -115,6 +122,16 @@ __kernel void fully_connected_width(__read_only image2d_t input,
short inter_idx = mad24((short)get_local_id(2), local_size, inter_out_offset);
intermediate_output[inter_idx] = sum;
#ifdef NON_QUALCOMM_ADRENO
barrier(CLK_LOCAL_MEM_FENCE);
#endif
#ifndef NON_UNIFORM_WORK_GROUP
if (batch_out_blk_idx >= global_size_dim2) {
return;
}
#endif
if (inter_out_offset == 0) {
#ifdef BIAS
DATA_TYPE4 result = READ_IMAGET(bias, SAMPLER, (int2)(out_blk_idx, 0));
...
...
@@ -122,7 +139,7 @@ __kernel void fully_connected_width(__read_only image2d_t input,
DATA_TYPE4 result = (DATA_TYPE4)(0, 0, 0, 0);
#endif
for(short i = 0; i < local_width_blk_size; ++i) {
for
(short i = 0; i < local_width_blk_size; ++i) {
result += vload4(0, intermediate_output+inter_idx);
inter_idx += 4;
}
...
...
mace/kernels/opencl/cl/matmul.cl
浏览文件 @
e9eaa4d4
#
include
<common.h>
//
C
=
A
*
B
__kernel
void
matmul
(
UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
__kernel
void
matmul
(
GLOBAL_WORK_GROUP_SIZE_DIM2
__read_only
image2d_t
A,
__read_only
image2d_t
B,
__write_only
image2d_t
C,
...
...
mace/kernels/opencl/cl/pooling.cl
浏览文件 @
e9eaa4d4
...
...
@@ -19,8 +19,7 @@ inline int calculate_avg_block_size(const int pool_size,
}
//
Supported
data
type:
half/float
__kernel
void
pooling
(
UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
__kernel
void
pooling
(
GLOBAL_WORK_GROUP_SIZE_DIM3
__read_only
image2d_t
input,
__private
const
int
in_height,
__private
const
int
in_width,
...
...
mace/kernels/opencl/cl/resize_bilinear.cl
浏览文件 @
e9eaa4d4
#
include
<common.h>
__kernel
void
resize_bilinear_nocache
(
UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
__kernel
void
resize_bilinear_nocache
(
GLOBAL_WORK_GROUP_SIZE_DIM3
__read_only
image2d_t
input,
/*
[c%4
*
w
*
c/4,
h
*
b]
*/
__write_only
image2d_t
output,
__private
const
float
height_scale,
...
...
mace/kernels/opencl/cl/slice.cl
浏览文件 @
e9eaa4d4
#
include
<common.h>
__kernel
void
slice
(
UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
__kernel
void
slice
(
GLOBAL_WORK_GROUP_SIZE_DIM3
__read_only
image2d_t
input,
__private
const
int
chan_blk_offset,
__write_only
image2d_t
output
)
{
...
...
mace/kernels/opencl/cl/softmax.cl
浏览文件 @
e9eaa4d4
#
include
<common.h>
__kernel
void
softmax
(
UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
__kernel
void
softmax
(
GLOBAL_WORK_GROUP_SIZE_DIM3
__read_only
image2d_t
input,
__private
const
int
channels,
__private
const
int
remain_channels,
...
...
mace/kernels/opencl/cl/space_to_batch.cl
浏览文件 @
e9eaa4d4
#
include
<common.h>
__kernel
void
space_to_batch
(
UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
__kernel
void
space_to_batch
(
GLOBAL_WORK_GROUP_SIZE_DIM3
__read_only
image2d_t
space_data,
__write_only
image2d_t
batch_data,
__private
const
int
block_height,
...
...
@@ -48,8 +47,7 @@ __kernel void space_to_batch(
WRITE_IMAGET(batch_data, batch_coord, value);
}
__kernel void batch_to_space(
UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_3
__kernel void batch_to_space(GLOBAL_WORK_GROUP_SIZE_DIM3
__read_only image2d_t batch_data,
__write_only image2d_t space_data,
__private const int block_height,
...
...
mace/kernels/opencl/cl/winograd_transform.cl
浏览文件 @
e9eaa4d4
#
include
<common.h>
__kernel
void
winograd_transform_2x2
(
UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
__kernel
void
winograd_transform_2x2
(
GLOBAL_WORK_GROUP_SIZE_DIM2
__read_only
image2d_t
input,
__write_only
image2d_t
output,
__private
const
int
in_height,
...
...
@@ -116,8 +115,7 @@ __kernel void winograd_transform_2x2(
}
}
__kernel void winograd_inverse_transform_2x2(
UNIFORM_WORK_GROUP_SIZE_PARAMS_IN_DIM_2
__kernel void winograd_inverse_transform_2x2(GLOBAL_WORK_GROUP_SIZE_DIM2
__read_only image2d_t input,
#ifdef BIAS
__read_only image2d_t bias, /* cout%4 * cout/4 */
...
...
mace/kernels/opencl/cwise_opencl.cc
浏览文件 @
e9eaa4d4
...
...
@@ -23,8 +23,10 @@ void CWiseFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
const
index_t
width_pixels
=
channel_blocks
*
width
;
const
index_t
batch_height_pixels
=
batch
*
height
;
auto
runtime
=
OpenCLRuntime
::
Global
();
const
uint32_t
gws
[
2
]
=
{
static_cast
<
uint32_t
>
(
width_pixels
),
static_cast
<
uint32_t
>
(
batch_height_pixels
)};
if
(
kernel_
.
get
()
==
nullptr
)
{
auto
runtime
=
OpenCLRuntime
::
Global
();
std
::
set
<
std
::
string
>
built_options
;
auto
dt
=
DataTypeToEnum
<
T
>::
value
;
std
::
string
kernel_name
=
MACE_OBFUSCATE_SYMBOL
(
"cwise"
);
...
...
@@ -32,19 +34,27 @@ void CWiseFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
built_options
.
emplace
(
"-DDATA_TYPE="
+
DtToUpstreamCLDt
(
dt
));
built_options
.
emplace
(
"-DCMD_DATA_TYPE="
+
DtToUpstreamCLCMDDt
(
dt
));
built_options
.
emplace
(
MakeString
(
"-DCWISE_TYPE="
,
type_
));
if
(
runtime
->
IsNonUniformWorkgroupsSupported
())
{
built_options
.
emplace
(
"-DNON_UNIFORM_WORK_GROUP"
);
}
kernel_
=
runtime
->
BuildKernel
(
"cwise"
,
kernel_name
,
built_options
);
kwg_size_
=
static_cast
<
uint32_t
>
(
runtime
->
GetKernelMaxWorkGroupSize
(
kernel_
));
}
if
(
!
IsVecEqual
(
input_shape_
,
input
->
shape
()))
{
uint32_t
idx
=
0
;
if
(
!
runtime
->
IsNonUniformWorkgroupsSupported
())
{
kernel_
.
setArg
(
idx
++
,
gws
[
0
]);
kernel_
.
setArg
(
idx
++
,
gws
[
1
]);
}
kernel_
.
setArg
(
idx
++
,
*
(
input
->
opencl_image
()));
kernel_
.
setArg
(
idx
++
,
static_cast
<
float
>
(
coeff_
));
kernel_
.
setArg
(
idx
++
,
*
(
output
->
opencl_image
()));
input_shape_
=
input
->
shape
();
}
const
uint32_t
gws
[
2
]
=
{
static_cast
<
uint32_t
>
(
width_pixels
),
static_cast
<
uint32_t
>
(
batch_height_pixels
)};
const
std
::
vector
<
uint32_t
>
lws
=
{
64
,
16
,
1
};
const
std
::
vector
<
uint32_t
>
lws
=
{
kwg_size_
/
16
,
16
,
1
};
std
::
stringstream
ss
;
ss
<<
"cwise_opencl_kernel_"
<<
output
->
dim
(
0
)
<<
"_"
<<
output
->
dim
(
1
)
<<
"_"
<<
output
->
dim
(
2
)
<<
"_"
<<
output
->
dim
(
3
);
...
...
mace/kernels/opencl/fully_connected_opencl.cc
浏览文件 @
e9eaa4d4
...
...
@@ -27,6 +27,10 @@ void FCWXKernel(cl::Kernel *kernel,
auto
runtime
=
OpenCLRuntime
::
Global
();
if
(
kernel
->
get
()
==
nullptr
)
{
const
index_t
batch
=
output
->
dim
(
0
);
const
index_t
output_size
=
output
->
dim
(
3
);
const
index_t
output_blocks
=
RoundUpDiv4
(
output_size
);
std
::
set
<
std
::
string
>
built_options
;
auto
dt
=
DataTypeToEnum
<
T
>::
value
;
std
::
string
kernel_name
=
MACE_OBFUSCATE_SYMBOL
(
"fully_connected"
);
...
...
@@ -55,28 +59,47 @@ void FCWXKernel(cl::Kernel *kernel,
default:
LOG
(
FATAL
)
<<
"Unknown activation type: "
<<
activation
;
}
if
(
runtime
->
gpu_type
()
!=
GPUType
::
QUALCOMM_ADRENO
)
{
built_options
.
emplace
(
"-DNON_QUALCOMM_ADRENO"
);
}
if
(
runtime
->
IsNonUniformWorkgroupsSupported
())
{
built_options
.
emplace
(
"-DNON_UNIFORM_WORK_GROUP"
);
}
*
kernel
=
runtime
->
BuildKernel
(
"fully_connected"
,
kernel_name
,
built_options
);
const
index_t
batch
=
output
->
dim
(
0
);
const
index_t
output_size
=
output
->
dim
(
3
);
const
index_t
output_blocks
=
RoundUpDiv4
(
output_size
);
const
uint32_t
wave_size
=
static_cast
<
uint32_t
>
(
runtime
->
GetKernelWaveSize
(
*
kernel
));
if
(
runtime
->
gpu_type
()
==
GPUType
::
QUALCOMM_ADRENO
)
{
built_options
.
emplace
(
"-DNON_UNIFORM_WORK_GROUP"
);
const
uint32_t
wave_size
=
static_cast
<
uint32_t
>
(
runtime
->
GetKernelWaveSize
(
*
kernel
));
*
gws
=
{
4
,
(
wave_size
/
4
),
static_cast
<
uint32_t
>
(
batch
*
output_blocks
)};
*
gws
=
{
4
,
(
wave_size
/
4
),
static_cast
<
uint32_t
>
(
batch
*
output_blocks
)};
const
uint32_t
kwg_size
=
static_cast
<
uint32_t
>
(
runtime
->
GetKernelMaxWorkGroupSize
(
*
kernel
));
const
uint32_t
inter_local_blks
=
kwg_size
/
((
*
gws
)[
0
]
*
(
*
gws
)[
1
]);
*
lws
=
{(
*
gws
)[
0
],
(
*
gws
)[
1
],
inter_local_blks
};
const
uint32_t
kwg_size
=
static_cast
<
uint32_t
>
(
runtime
->
GetKernelMaxWorkGroupSize
(
*
kernel
));
const
uint32_t
inter_local_blks
=
kwg_size
/
((
*
gws
)[
0
]
*
(
*
gws
)[
1
]);
*
lws
=
{(
*
gws
)[
0
],
(
*
gws
)[
1
],
inter_local_blks
};
}
else
{
*
gws
=
{
4
,
8
,
static_cast
<
uint32_t
>
(
batch
*
output_blocks
)};
const
uint32_t
kwg_size
=
static_cast
<
uint32_t
>
(
runtime
->
GetKernelMaxWorkGroupSize
(
*
kernel
));
const
uint32_t
inter_local_blks
=
kwg_size
/
((
*
gws
)[
0
]
*
(
*
gws
)[
1
]);
*
lws
=
{(
*
gws
)[
0
],
(
*
gws
)[
1
],
inter_local_blks
};
}
}
if
(
!
IsVecEqual
(
*
prev_input_shape
,
input
->
shape
()))
{
const
index_t
batch
=
output
->
dim
(
0
);
const
index_t
output_blocks
=
RoundUpDiv4
(
output
->
dim
(
3
));
(
*
gws
)[
2
]
=
static_cast
<
uint32_t
>
(
batch
*
output_blocks
);
uint32_t
idx
=
0
;
if
(
!
runtime
->
IsNonUniformWorkgroupsSupported
())
{
kernel
->
setArg
(
idx
++
,
(
*
gws
)[
0
]);
kernel
->
setArg
(
idx
++
,
(
*
gws
)[
1
]);
kernel
->
setArg
(
idx
++
,
(
*
gws
)[
2
]);
}
kernel
->
setArg
(
idx
++
,
*
(
input
->
opencl_image
()));
kernel
->
setArg
(
idx
++
,
*
(
weight
->
opencl_image
()));
if
(
bias
!=
nullptr
)
{
...
...
@@ -91,15 +114,25 @@ void FCWXKernel(cl::Kernel *kernel,
kernel
->
setArg
(
idx
++
,
static_cast
<
int
>
(
output_blocks
));
kernel
->
setArg
(
idx
++
,
relux_max_limit
);
(
*
gws
)[
2
]
=
static_cast
<
uint32_t
>
(
batch
*
output_blocks
);
*
prev_input_shape
=
input
->
shape
();
}
cl
::
Event
event
;
cl_int
error
=
runtime
->
command_queue
().
enqueueNDRangeKernel
(
*
kernel
,
cl
::
NullRange
,
cl
::
NDRange
((
*
gws
)[
0
],
(
*
gws
)[
1
],
(
*
gws
)[
2
]),
cl
::
NDRange
((
*
lws
)[
0
],
(
*
lws
)[
1
],
(
*
lws
)[
2
]),
nullptr
,
&
event
);
MACE_CHECK_CL_SUCCESS
(
error
);
cl_int
error
;
if
(
runtime
->
IsNonUniformWorkgroupsSupported
())
{
error
=
runtime
->
command_queue
().
enqueueNDRangeKernel
(
*
kernel
,
cl
::
NullRange
,
cl
::
NDRange
((
*
gws
)[
0
],
(
*
gws
)[
1
],
(
*
gws
)[
2
]),
cl
::
NDRange
((
*
lws
)[
0
],
(
*
lws
)[
1
],
(
*
lws
)[
2
]),
nullptr
,
&
event
);
}
else
{
std
::
vector
<
uint32_t
>
roundup_gws
(
lws
->
size
());
for
(
size_t
i
=
0
;
i
<
lws
->
size
();
++
i
)
{
roundup_gws
[
i
]
=
RoundUp
((
*
gws
)[
i
],
(
*
lws
)[
i
]);
}
error
=
runtime
->
command_queue
().
enqueueNDRangeKernel
(
*
kernel
,
cl
::
NullRange
,
cl
::
NDRange
(
roundup_gws
[
0
],
roundup_gws
[
1
],
roundup_gws
[
2
]),
cl
::
NDRange
((
*
lws
)[
0
],
(
*
lws
)[
1
],
(
*
lws
)[
2
]),
nullptr
,
&
event
);
}
MACE_CHECK
(
error
==
CL_SUCCESS
)
<<
"Error code: "
<<
error
;
if
(
future
!=
nullptr
)
{
future
->
wait_fn
=
[
runtime
,
event
](
CallStats
*
stats
)
{
...
...
@@ -125,8 +158,8 @@ void FCWTXKernel(cl::Kernel *kernel,
StatsFuture
*
future
)
{
MACE_CHECK_NOTNULL
(
gws
);
MACE_CHECK_NOTNULL
(
lws
);
auto
runtime
=
OpenCLRuntime
::
Global
();
if
(
kernel
->
get
()
==
nullptr
)
{
auto
runtime
=
OpenCLRuntime
::
Global
();
std
::
set
<
std
::
string
>
built_options
;
auto
dt
=
DataTypeToEnum
<
T
>::
value
;
std
::
string
kernel_name
=
MACE_OBFUSCATE_SYMBOL
(
"fully_connected"
);
...
...
@@ -136,6 +169,9 @@ void FCWTXKernel(cl::Kernel *kernel,
if
(
bias
!=
nullptr
)
{
built_options
.
emplace
(
"-DBIAS"
);
}
if
(
runtime
->
IsNonUniformWorkgroupsSupported
())
{
built_options
.
emplace
(
"-DNON_UNIFORM_WORK_GROUP"
);
}
switch
(
activation
)
{
case
NOOP
:
break
;
...
...
@@ -157,10 +193,23 @@ void FCWTXKernel(cl::Kernel *kernel,
*
kernel
=
runtime
->
BuildKernel
(
"fully_connected"
,
kernel_name
,
built_options
);
*
lws
=
{
16
,
64
,
1
};
uint32_t
kwg_size
=
static_cast
<
uint32_t
>
(
runtime
->
GetKernelMaxWorkGroupSize
(
*
kernel
));
*
lws
=
{
16
,
kwg_size
/
16
,
1
};
}
if
(
!
IsVecEqual
(
*
prev_input_shape
,
input
->
shape
()))
{
const
index_t
batch
=
output
->
dim
(
0
);
const
index_t
output_blocks
=
RoundUpDiv4
(
output
->
dim
(
3
));
*
gws
=
{
static_cast
<
uint32_t
>
(
batch
),
static_cast
<
uint32_t
>
(
output_blocks
),
};
uint32_t
idx
=
0
;
if
(
!
runtime
->
IsNonUniformWorkgroupsSupported
())
{
kernel
->
setArg
(
idx
++
,
(
*
gws
)[
0
]);
kernel
->
setArg
(
idx
++
,
(
*
gws
)[
1
]);
}
kernel
->
setArg
(
idx
++
,
*
(
input
->
opencl_image
()));
kernel
->
setArg
(
idx
++
,
*
(
weight
->
opencl_image
()));
if
(
bias
!=
nullptr
)
{
...
...
@@ -173,12 +222,6 @@ void FCWTXKernel(cl::Kernel *kernel,
// FIXME handle flexable data type: half not supported
kernel
->
setArg
(
idx
++
,
relux_max_limit
);
const
index_t
batch
=
output
->
dim
(
0
);
const
index_t
output_blocks
=
RoundUpDiv4
(
output
->
dim
(
3
));
*
gws
=
{
static_cast
<
uint32_t
>
(
batch
),
static_cast
<
uint32_t
>
(
output_blocks
),
};
*
prev_input_shape
=
input
->
shape
();
}
...
...
mace/ops/depthwise_conv2d_test.cc
浏览文件 @
e9eaa4d4
...
...
@@ -57,7 +57,6 @@ void SimpleValidTest() {
.
AddIntsArg
(
"strides"
,
{
1
,
1
})
.
AddIntArg
(
"padding"
,
Padding
::
VALID
)
.
AddIntsArg
(
"dilations"
,
{
1
,
1
})
.
AddIntArg
(
"T"
,
static_cast
<
int
>
(
DataTypeToEnum
<
T
>::
value
))
.
Finalize
(
net
.
NewOperatorDef
());
// Run
net
.
RunOp
(
D
);
...
...
mace/ops/fully_connected_test.cc
浏览文件 @
e9eaa4d4
...
...
@@ -225,7 +225,7 @@ void TestWXFormat(const index_t batch,
kernels
::
BufferType
::
IN_OUT_CHANNEL
);
BufferToImage
<
DeviceType
::
OPENCL
,
T
>
(
&
net
,
"Weight"
,
"WeightImage"
,
kernels
::
BufferType
::
WEIGHT_WIDTH
);
BufferToImage
<
DeviceType
::
OPENCL
,
float
>
(
&
net
,
"Bias"
,
"BiasImage"
,
BufferToImage
<
DeviceType
::
OPENCL
,
T
>
(
&
net
,
"Bias"
,
"BiasImage"
,
kernels
::
BufferType
::
ARGUMENT
);
OpDefBuilder
(
"FC"
,
"FullyConnectedTest"
)
...
...
@@ -236,7 +236,7 @@ void TestWXFormat(const index_t batch,
.
AddIntArg
(
"T"
,
static_cast
<
int
>
(
DataTypeToEnum
<
T
>::
value
))
.
Finalize
(
net
.
NewOperatorDef
());
// Run
on opencl
// Run
net
.
RunOp
(
DeviceType
::
OPENCL
);
ImageToBuffer
<
DeviceType
::
OPENCL
,
float
>
(
&
net
,
"OutputImage"
,
"OPENCLOutput"
,
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录