Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
Xiaomi
Mace
提交
c4a0b84c
Mace
项目概览
Xiaomi
/
Mace
通知
106
Star
40
Fork
27
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
Mace
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
提交
c4a0b84c
编写于
6月 11, 2018
作者:
L
liutuo
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
fix build wino transform kernel error
上级
4aed901d
变更
5
显示空白变更内容
内联
并排
Showing
5 changed files
with
25 additions
and
452 deletions
+25
-452
mace/kernels/opencl/cl/winograd_transform.cl
mace/kernels/opencl/cl/winograd_transform.cl
+0
-310
mace/kernels/opencl/winograd_transform.cc
mace/kernels/opencl/winograd_transform.cc
+25
-73
mace/ops/winograd_convolution_benchmark.cc
mace/ops/winograd_convolution_benchmark.cc
+0
-11
mace/ops/winograd_convolution_test.cc
mace/ops/winograd_convolution_test.cc
+0
-42
mace/ops/winograd_transform_benchmark.cc
mace/ops/winograd_transform_benchmark.cc
+0
-16
未找到文件。
mace/kernels/opencl/cl/winograd_transform.cl
浏览文件 @
c4a0b84c
...
...
@@ -235,316 +235,6 @@ __kernel void winograd_inverse_transform_2x2(KERNEL_ERROR_PARAMS
}
// Winograd F(6x6, 3x3) input transform.
// One 8x8 work-group handles one input tile: the group cooperatively stages
// an 8x8 patch in local memory, applies the 1-D input-transform matrix B^T
// first down each column, then across each row, and writes the 64
// transformed values to `output` with the (j, i) position encoded into the
// row coordinate.
//
// Parameters:
//   in_height/in_width/in_channel - input tensor dimensions
//   round_hw   - number of 6x6 tiles per batch (tile count of rounded H*W)
//   round_hw_r - reciprocal of round_hw (unused in this kernel; presumably
//                kept for signature parity with the 2x2/4x4 kernels - confirm)
//   round_w    - number of tiles per output row
//   round_w_r  - reciprocal of round_w (unused here, see above)
//   padding_top/padding_left - top-left convolution padding
__kernel void winograd_transform_6x6(KERNEL_ERROR_PARAMS
    GLOBAL_WORK_GROUP_SIZE_DIM2
    __read_only image2d_t input,
    __write_only image2d_t output,
    __private const int in_height,
    __private const int in_width,
    __private const int in_channel,
    __private const int round_hw,
    __private const float round_hw_r,
    __private const int round_w,
    __private const float round_w_r,
    __private const int padding_top,
    __private const int padding_left) {
  int out_width_idx_i = get_global_id(0);
  int chan_blk_idx_i = get_global_id(1);
#ifndef NON_UNIFORM_WORK_GROUP
  // NOTE(review): this early return precedes the barrier() calls below; if a
  // work-group straddles the rounded-up global range, only part of the group
  // reaches the barriers, which is undefined behavior in OpenCL. Presumably
  // the host always launches exact multiples of 8 here - confirm.
  if (out_width_idx_i >= global_size_dim0 || chan_blk_idx_i >= global_size_dim1) {
    return;
  }
  const int chan_blk_size = global_size_dim1 >> 3;
#else
  const int chan_blk_size = get_global_size(1) >> 3;
#endif
  // 8x8 tile shared by the work-group; each work-item owns element [j][i].
  __local DATA_TYPE4 in[8][8];
  // Global ids are packed as (index * 8 + lane); unpack tile index and lane.
  int out_width_idx = out_width_idx_i >> 3;
  int chan_blk_idx = chan_blk_idx_i >> 3;
  int i = mad24(out_width_idx, -8, out_width_idx_i);  // lane 0..7 (column)
  int j = mad24(chan_blk_idx, -8, chan_blk_idx_i);    // lane 0..7 (row)
  // Decompose the flat tile index into (batch, tile row, tile column).
  const int batch_idx = out_width_idx / round_hw;
  const int t_idx = mad24(batch_idx, -round_hw, out_width_idx);
  const int n_round_w = t_idx / round_w;
  const int mod_round_w = mad24(n_round_w, -round_w, t_idx);
  // Top-left corner of this tile in the (padded) input plane; tiles advance
  // in steps of 6 because F(6x6, 3x3) consumes 8x8 input per 6x6 output.
  const int height_idx = mad24(n_round_w, 6, -padding_top);
  const int width_idx = mad24(mod_round_w, 6, -padding_left);
  const int nh_idx = mad24(batch_idx, in_height, height_idx);
  const int wc_idx = mad24(chan_blk_idx, in_width, width_idx);
  // Out-of-range coordinates are forced to -1 so the read lands outside the
  // image (presumably SAMPLER clamps to a zero border, implementing the
  // padding - confirm against the SAMPLER definition).
  int y = select(nh_idx + j, -1, height_idx + j < 0 || height_idx + j >= in_height);
  int x = width_idx + i;
  x = select(wc_idx + i, -1, x < 0 || x >= in_width);
  in[j][i] = READ_IMAGET(input, SAMPLER, (int2)(x, y));
  barrier(CLK_LOCAL_MEM_FENCE);
  DATA_TYPE4 tt0, tt1, tt2, tt3, tt4, tt5, tt6, tt7;
  DATA_TYPE4 tmp;
  // Pass 1: work-items in row 0 apply B^T down column i (all 8 columns are
  // covered because there are 8 work-items with j == 0).
  if (j == 0) {
    tmp = 0.5f * (in[1][i] + in[5][i]) - 2.5f * in[3][i];
    tt2 = 1.5f * in[5][i] + tmp;
    tt4 = 1.5f * in[1][i] + tmp;
    tt0 = in[1][i] - 4.25f * in[3][i] + in[5][i];
    tt1 = in[2][i] - 4.25f * in[4][i] + in[6][i];
    tmp = in[2][i] - 5 * in[4][i];
    tt3 = in[6][i] + 0.25f * tmp;
    tt5 = in[6][i] + 3 * in[2][i] + tmp;
    tt6 = 5.25f * (in[4][i] - in[2][i]) + in[0][i] - in[6][i];
    tt7 = 5.25f * (in[3][i] - in[5][i]) + in[7][i] - in[1][i];
    in[0][i] = tt6;
    in[1][i] = tt1 + tt0;
    in[2][i] = tt1 - tt0;
    in[3][i] = tt3 + tt2;
    in[4][i] = tt3 - tt2;
    in[5][i] = tt5 + tt4;
    in[6][i] = tt5 - tt4;
    in[7][i] = tt7;
  }
  barrier(CLK_LOCAL_MEM_FENCE);
  // Pass 2: same transform across row i (transposed indexing).
  if(j == 0) {
    tmp = 0.5f * (in[i][1] + in[i][5]) - 2.5f * in[i][3];
    tt2 = 1.5f * in[i][5] + tmp;
    tt4 = 1.5f * in[i][1] + tmp;
    tt0 = in[i][1] - 4.25f * in[i][3] + in[i][5];
    tt1 = in[i][2] - 4.25f * in[i][4] + in[i][6];
    tmp = in[i][2] - 5 * in[i][4];
    tt3 = in[i][6] + 0.25f * tmp;
    tt5 = in[i][6] + 3 * in[i][2] + tmp;
    tt6 = 5.25f * (in[i][4] - in[i][2]) + in[i][0] - in[i][6];
    tt7 = 5.25f * (in[i][3] - in[i][5]) + in[i][7] - in[i][1];
    in[i][0] = tt6;
    in[i][1] = tt1 + tt0;
    in[i][2] = tt1 - tt0;
    in[i][3] = tt3 + tt2;
    in[i][4] = tt3 - tt2;
    in[i][5] = tt5 + tt4;
    in[i][6] = tt5 - tt4;
    in[i][7] = tt7;
  }
  barrier(CLK_LOCAL_MEM_FENCE);
  // Scatter: each of the 64 transformed elements goes to its own image row,
  // offset by (j * 8 + i) channel-block strides.
  chan_blk_idx += mul24(mad24(j, 8, i), chan_blk_size);
  WRITE_IMAGET(output, (int2)(out_width_idx, chan_blk_idx), in[j][i]);
}
// Winograd F(6x6, 3x3) inverse (output) transform.
// Each work-item gathers the 8x8 transformed values of one tile/channel-block
// from `input`, applies the 1-D inverse-transform matrix A^T along both axes,
// optionally adds bias and activation, and writes up to a 6x6 patch to
// `output`, clipping at the right/bottom edges.
//
// Fix: the original PROCESS_IN macro's last line ended with a
// line-continuation backslash, which absorbed the following
// "PROCESS_IN(0);" statement into the macro definition. Because a macro is
// not re-expanded inside its own replacement list, every later use pasted a
// literal, unexpanded "PROCESS_IN(0);" call into the kernel and broke the
// build. The macro is now properly terminated.
__kernel void winograd_inverse_transform_6x6(KERNEL_ERROR_PARAMS
    GLOBAL_WORK_GROUP_SIZE_DIM2
    __read_only image2d_t input,
#ifdef BIAS
    __read_only image2d_t bias, /* cout%4 * cout/4 */
#endif
    __write_only image2d_t output,
    __private const int out_height,
    __private const int out_width,
    __private const int round_hw,
    __private const float round_hw_r,
    __private const int round_w,
    __private const float round_w_r,
    __private const float relux_max_limit) {
  const int width_idx = get_global_id(0);
  const int height_idx = get_global_id(1);
#ifndef NON_UNIFORM_WORK_GROUP
  if (width_idx >= global_size_dim0 || height_idx >= global_size_dim1) {
    return;
  }
  const int out_channel = global_size_dim1;
#else
  const int out_channel = get_global_size(1);
#endif
  // in0..in7: the 8 rows of the 8x8 transformed tile (4 channels per vector).
  // tv0..tv5: intermediate result after the first (row-wise) inverse pass.
  DATA_TYPE4 in0[8], in1[8], in2[8], in3[8], in4[8], in5[8], in6[8], in7[8];
  DATA_TYPE4 tv0[8], tv1[8], tv2[8], tv3[8], tv4[8], tv5[8];
  const int width = width_idx;
  const int height = height_idx;
  // Decompose the flat tile index into (batch, tile row, tile column);
  // each tile produces a 6x6 output patch.
  const int batch = width / round_hw;
  const int t = mad24(batch, -round_hw, width);
  const int n_round_w = t / round_w;
  const int mod_round_w = mad24(n_round_w, -round_w, t);
  const int out_height_idx = mul24(n_round_w, 6);
  const int out_width_idx = mul24(mod_round_w, 6);
  const int out_chan_idx = height;
  const int coord_x = mad24(out_chan_idx, out_width, out_width_idx);
  const int coord_y = mad24(batch, out_height, out_height_idx);
  // The 64 transformed values for this tile sit in 64 consecutive
  // channel-block strides of the input image; walk them with h.
  int h = height_idx;
#pragma unroll
  for (short i = 0; i < 8; ++i) {
    in0[i] = READ_IMAGET(input, SAMPLER, (int2)(width_idx, h));
    h += out_channel;
  }
#pragma unroll
  for (short i = 0; i < 8; ++i) {
    in1[i] = READ_IMAGET(input, SAMPLER, (int2)(width_idx, h));
    h += out_channel;
  }
#pragma unroll
  for (short i = 0; i < 8; ++i) {
    in2[i] = READ_IMAGET(input, SAMPLER, (int2)(width_idx, h));
    h += out_channel;
  }
#pragma unroll
  for (short i = 0; i < 8; ++i) {
    in3[i] = READ_IMAGET(input, SAMPLER, (int2)(width_idx, h));
    h += out_channel;
  }
#pragma unroll
  for (short i = 0; i < 8; ++i) {
    in4[i] = READ_IMAGET(input, SAMPLER, (int2)(width_idx, h));
    h += out_channel;
  }
#pragma unroll
  for (short i = 0; i < 8; ++i) {
    in5[i] = READ_IMAGET(input, SAMPLER, (int2)(width_idx, h));
    h += out_channel;
  }
#pragma unroll
  for (short i = 0; i < 8; ++i) {
    in6[i] = READ_IMAGET(input, SAMPLER, (int2)(width_idx, h));
    h += out_channel;
  }
#pragma unroll
  for (short i = 0; i < 8; ++i) {
    in7[i] = READ_IMAGET(input, SAMPLER, (int2)(width_idx, h));
    h += out_channel;
  }
  DATA_TYPE4 tt0, tt1, tt2, tt3, tt4, tt5, d0, d7;
// First inverse pass (A^T applied down column i): 8 inputs -> 6 outputs.
// NOTE: the last line must NOT end with a backslash, otherwise the next
// statement is swallowed into the macro (this was the build breakage).
#define PROCESS_IN(i)            \
  d0 = in0[i];                   \
  d7 = in7[i];                   \
  tt0 = in1[i] + in2[i];         \
  tt1 = in1[i] - in2[i];         \
  tt2 = in3[i] + in4[i];         \
  tt3 = in3[i] - in4[i];         \
  tt3 = tt3 + tt3;               \
  tt4 = in5[i] + in6[i];         \
  tt4 = tt4 + tt4;               \
  tt5 = in5[i] - in6[i];         \
  tt0 = tt0 + tt2 + tt4;         \
  tt1 = tt1 + tt3 + tt5;         \
  tv0[i] = tt0 + tt4 * 15 + d0;  \
  tv1[i] = tt1 + tt5 * 15;       \
  tv2[i] = tt0 + 3 * (tt2 + tt4);\
  tv3[i] = tt1 + 3 * (tt3 + tt5);\
  tv4[i] = tt0 + tt2 * 15;       \
  tv5[i] = tt1 + tt3 * 15 + d7;
  PROCESS_IN(0);
  PROCESS_IN(1);
  PROCESS_IN(2);
  PROCESS_IN(3);
  PROCESS_IN(4);
  PROCESS_IN(5);
  PROCESS_IN(6);
  PROCESS_IN(7);
#undef PROCESS_IN
// Second inverse pass (A^T applied across row i of the tv intermediates);
// results are written back into in0..in5, now holding the 6x6 output patch.
#define PROCESS_SND(i)              \
  d0 = tv##i[0];                    \
  d7 = tv##i[7];                    \
  tt0 = tv##i[1] + tv##i[2];        \
  tt1 = tv##i[1] - tv##i[2];        \
  tt2 = tv##i[3] + tv##i[4];        \
  tt3 = tv##i[3] - tv##i[4];        \
  tt3 = tt3 + tt3;                  \
  tt4 = tv##i[5] + tv##i[6];        \
  tt4 = tt4 + tt4;                  \
  tt5 = tv##i[5] - tv##i[6];        \
  tt0 = tt0 + tt2 + tt4;            \
  tt1 = tt1 + tt3 + tt5;            \
  in##i[0] = tt0 + tt4 * 15 + d0;   \
  in##i[1] = tt1 + tt5 * 15;        \
  in##i[2] = tt0 + (tt2 + tt4) * 3; \
  in##i[3] = tt1 + (tt3 + tt5) * 3; \
  in##i[4] = tt0 + tt2 * 15;        \
  in##i[5] = tt1 + tt3 * 15 + d7;
  PROCESS_SND(0);
  PROCESS_SND(1);
  PROCESS_SND(2);
  PROCESS_SND(3);
  PROCESS_SND(4);
  PROCESS_SND(5);
#undef PROCESS_SND
#ifdef BIAS
  const DATA_TYPE4 bias_value = READ_IMAGET(bias, SAMPLER, (int2)(out_chan_idx, 0));
#pragma unroll
  for (short i = 0; i < 6; ++i) {
    in0[i] += bias_value;
  }
#pragma unroll
  for (short i = 0; i < 6; ++i) {
    in1[i] += bias_value;
  }
#pragma unroll
  for (short i = 0; i < 6; ++i) {
    in2[i] += bias_value;
  }
#pragma unroll
  for (short i = 0; i < 6; ++i) {
    in3[i] += bias_value;
  }
#pragma unroll
  for (short i = 0; i < 6; ++i) {
    in4[i] += bias_value;
  }
#pragma unroll
  for (short i = 0; i < 6; ++i) {
    in5[i] += bias_value;
  }
#endif
#if defined(USE_RELU) || defined(USE_RELUX) || defined(USE_TANH) || defined(USE_SIGMOID)
#pragma unroll
  for (short i = 0; i < 6; ++i) {
    in0[i] = do_activation(in0[i], relux_max_limit);
    in1[i] = do_activation(in1[i], relux_max_limit);
    in2[i] = do_activation(in2[i], relux_max_limit);
    in3[i] = do_activation(in3[i], relux_max_limit);
    in4[i] = do_activation(in4[i], relux_max_limit);
    in5[i] = do_activation(in5[i], relux_max_limit);
  }
#endif
  // Clip the 6x6 patch against the real output extent.
  const int num = min(6, out_width - out_width_idx);
  const int h_num = out_height - out_height_idx;
  if(h_num < 1) return;
#pragma unroll
  for (int i = 0; i < num; ++i) {
    WRITE_IMAGET(output, (int2)(coord_x + i, coord_y), in0[i]);
  }
  if(h_num < 2) return;
#pragma unroll
  for (int i = 0; i < num; ++i) {
    WRITE_IMAGET(output, (int2)(coord_x + i, coord_y + 1), in1[i]);
  }
  if(h_num < 3) return;
#pragma unroll
  for (int i = 0; i < num; ++i) {
    WRITE_IMAGET(output, (int2)(coord_x + i, coord_y + 2), in2[i]);
  }
  if(h_num < 4) return;
#pragma unroll
  for (int i = 0; i < num; ++i) {
    WRITE_IMAGET(output, (int2)(coord_x + i, coord_y + 3), in3[i]);
  }
  if(h_num < 5) return;
#pragma unroll
  for (int i = 0; i < num; ++i) {
    WRITE_IMAGET(output, (int2)(coord_x + i, coord_y + 4), in4[i]);
  }
  if(h_num < 6) return;
#pragma unroll
  for (int i = 0; i < num; ++i) {
    WRITE_IMAGET(output, (int2)(coord_x + i, coord_y + 5), in5[i]);
  }
}
__kernel void winograd_transform_4x4(KERNEL_ERROR_PARAMS
GLOBAL_WORK_GROUP_SIZE_DIM2
__read_only image2d_t input,
...
...
mace/kernels/opencl/winograd_transform.cc
浏览文件 @
c4a0b84c
...
...
@@ -29,21 +29,19 @@ MaceStatus WinogradTransformFunctor<DeviceType::GPU, T>::operator()(
if
(
kernel_
.
get
()
==
nullptr
)
{
std
::
string
obfuscated_kernel_name
;
std
::
set
<
std
::
string
>
built_options
;
if
(
wino_blk_size_
==
6
)
{
obfuscated_kernel_name
=
MACE_OBFUSCATE_SYMBOL
(
"winograd_transform_6x6"
);
built_options
.
emplace
(
"-Dwinograd_transform_6x6="
+
obfuscated_kernel_name
);
}
else
if
(
wino_blk_size_
==
4
)
{
if
(
wino_blk_size_
==
4
)
{
obfuscated_kernel_name
=
MACE_OBFUSCATE_SYMBOL
(
"winograd_transform_4x4"
);
built_options
.
emplace
(
"-Dwinograd_transform_4x4="
+
obfuscated_kernel_name
);
}
else
{
}
else
if
(
wino_blk_size_
==
2
)
{
obfuscated_kernel_name
=
MACE_OBFUSCATE_SYMBOL
(
"winograd_transform_2x2"
);
built_options
.
emplace
(
"-Dwinograd_transform_2x2="
+
obfuscated_kernel_name
);
}
else
{
MACE_CHECK
(
false
,
"mace only supports 4x4 and 2x2 gpu winograd."
);
return
MACE_SUCCESS
;
}
built_options
.
emplace
(
"-DDATA_TYPE="
+
DtToUpstreamCLDt
(
DataTypeToEnum
<
T
>::
value
));
...
...
@@ -90,16 +88,10 @@ MaceStatus WinogradTransformFunctor<DeviceType::GPU, T>::operator()(
const
float
round_w_r
=
1.
f
/
static_cast
<
float
>
(
round_w
);
const
index_t
blk_sqr
=
(
wino_blk_size_
+
2
)
*
(
wino_blk_size_
+
2
);
uint32_t
gws
[
2
];
if
(
wino_blk_size_
==
6
)
{
gws
[
0
]
=
static_cast
<
uint32_t
>
(
out_width
)
*
(
wino_blk_size_
+
2
);
gws
[
1
]
=
static_cast
<
uint32_t
>
(
RoundUpDiv4
(
input_tensor
->
dim
(
3
)))
*
(
wino_blk_size_
+
2
);
}
else
{
gws
[
0
]
=
static_cast
<
uint32_t
>
(
out_width
);
gws
[
1
]
=
static_cast
<
uint32_t
>
(
RoundUpDiv4
(
input_tensor
->
dim
(
3
)));
}
const
uint32_t
gws
[
2
]
=
{
static_cast
<
uint32_t
>
(
out_width
),
static_cast
<
uint32_t
>
(
RoundUpDiv4
(
input_tensor
->
dim
(
3
)))
};
if
(
!
IsVecEqual
(
input_shape_
,
input_tensor
->
shape
()))
{
output_shape
=
{
blk_sqr
,
input_tensor
->
dim
(
3
),
out_width
};
std
::
vector
<
size_t
>
image_shape
;
...
...
@@ -130,44 +122,7 @@ MaceStatus WinogradTransformFunctor<DeviceType::GPU, T>::operator()(
input_shape_
=
input_tensor
->
shape
();
}
if
(
wino_blk_size_
==
6
)
{
const
std
::
vector
<
uint32_t
>
lws
=
{
static_cast
<
uint32_t
>
(
wino_blk_size_
+
2
),
static_cast
<
uint32_t
>
(
wino_blk_size_
+
2
),
0
};
cl
::
Event
event
;
cl_int
error
;
if
(
runtime
->
IsNonUniformWorkgroupsSupported
())
{
error
=
runtime
->
command_queue
().
enqueueNDRangeKernel
(
kernel_
,
cl
::
NullRange
,
cl
::
NDRange
(
gws
[
0
],
gws
[
1
]),
cl
::
NDRange
(
lws
[
0
],
lws
[
1
]),
nullptr
,
&
event
);
}
else
{
std
::
vector
<
uint32_t
>
roundup_gws
(
2
,
0
);
roundup_gws
[
0
]
=
RoundUp
(
gws
[
0
],
lws
[
0
]);
roundup_gws
[
1
]
=
RoundUp
(
gws
[
1
],
lws
[
1
]);
error
=
runtime
->
command_queue
().
enqueueNDRangeKernel
(
kernel_
,
cl
::
NullRange
,
cl
::
NDRange
(
roundup_gws
[
0
],
roundup_gws
[
1
]),
cl
::
NDRange
(
lws
[
0
],
lws
[
1
]),
nullptr
,
&
event
);
}
if
(
runtime
->
IsOutOfRangeCheckEnabled
())
{
kernel_error_
->
Map
(
nullptr
);
char
*
kerror_code
=
kernel_error_
->
mutable_data
<
char
>
();
MACE_CHECK
(
*
kerror_code
==
0
)
<<
"Kernel error code: "
<<
*
kerror_code
;
kernel_error_
->
UnMap
();
}
MACE_CHECK
(
error
==
CL_SUCCESS
)
<<
"Error code: "
<<
error
;
if
(
future
!=
nullptr
)
{
future
->
wait_fn
=
[
runtime
,
event
](
CallStats
*
stats
)
{
event
.
wait
();
if
(
stats
!=
nullptr
)
{
runtime
->
GetCallStats
(
event
,
stats
);
}
};
}
}
else
{
const
std
::
vector
<
uint32_t
>
lws
=
{
kwg_size_
/
8
,
8
,
0
};
std
::
string
tuning_key
=
Concat
(
"winograd_transform_kernel"
,
output_tensor
->
dim
(
0
),
...
...
@@ -181,7 +136,6 @@ MaceStatus WinogradTransformFunctor<DeviceType::GPU, T>::operator()(
MACE_CHECK
(
*
kerror_code
==
0
)
<<
"Kernel error code: "
<<
*
kerror_code
;
kernel_error_
->
UnMap
();
}
}
return
MACE_SUCCESS
;
}
...
...
@@ -197,21 +151,19 @@ MaceStatus WinogradInverseTransformFunctor<DeviceType::GPU, T>::operator()(
if
(
kernel_
.
get
()
==
nullptr
)
{
std
::
string
obfuscated_kernel_name
;
std
::
set
<
std
::
string
>
built_options
;
if
(
wino_blk_size_
==
6
)
{
obfuscated_kernel_name
=
MACE_OBFUSCATE_SYMBOL
(
"winograd_inverse_transform_6x6"
);
built_options
.
emplace
(
"-Dwinograd_inverse_transform_6x6="
+
obfuscated_kernel_name
);
}
else
if
(
wino_blk_size_
==
4
)
{
if
(
wino_blk_size_
==
4
)
{
obfuscated_kernel_name
=
MACE_OBFUSCATE_SYMBOL
(
"winograd_inverse_transform_4x4"
);
built_options
.
emplace
(
"-Dwinograd_inverse_transform_4x4="
+
obfuscated_kernel_name
);
}
else
{
}
else
if
(
wino_blk_size_
==
2
)
{
obfuscated_kernel_name
=
MACE_OBFUSCATE_SYMBOL
(
"winograd_inverse_transform_2x2"
);
built_options
.
emplace
(
"-Dwinograd_inverse_transform_2x2="
+
obfuscated_kernel_name
);
}
else
{
MACE_CHECK
(
false
,
"mace only supports 4x4 and 2x2 gpu winograd."
);
return
MACE_SUCCESS
;
}
built_options
.
emplace
(
"-DDATA_TYPE="
+
...
...
mace/ops/winograd_convolution_benchmark.cc
浏览文件 @
c4a0b84c
...
...
@@ -109,25 +109,14 @@ MACE_BM_WINOGRAD_CONV(1, 256, 256, 3, 16, 2);
// Winograd convolution benchmarks: (batch, height, width, in_ch, out_ch, block).
MACE_BM_WINOGRAD_CONV(1, 64, 64, 3, 16, 4);
MACE_BM_WINOGRAD_CONV(1, 128, 128, 3, 16, 4);
MACE_BM_WINOGRAD_CONV(1, 256, 256, 3, 16, 4);
MACE_BM_WINOGRAD_CONV(1, 64, 64, 3, 16, 6);
MACE_BM_WINOGRAD_CONV(1, 128, 128, 3, 16, 6);
MACE_BM_WINOGRAD_CONV(1, 256, 256, 3, 16, 6);
MACE_BM_WINOGRAD_CONV(1, 28, 28, 256, 256, 2);
MACE_BM_WINOGRAD_CONV(1, 28, 28, 256, 256, 4);
MACE_BM_WINOGRAD_CONV(1, 28, 28, 256, 256, 6);
MACE_BM_WINOGRAD_CONV(1, 56, 56, 256, 256, 2);
MACE_BM_WINOGRAD_CONV(1, 56, 56, 256, 256, 4);
MACE_BM_WINOGRAD_CONV(1, 56, 56, 256, 256, 6);
MACE_BM_WINOGRAD_CONV(1, 128, 128, 128, 256, 2);
MACE_BM_WINOGRAD_CONV(1, 128, 128, 128, 256, 4);
MACE_BM_WINOGRAD_CONV(1, 128, 128, 128, 256, 6);
MACE_BM_WINOGRAD_CONV(1, 256, 256, 256, 256, 2);
MACE_BM_WINOGRAD_CONV(1, 256, 256, 256, 256, 4);
MACE_BM_WINOGRAD_CONV(1, 256, 256, 256, 256, 6);
}
// namespace test
}
// namespace ops
...
...
mace/ops/winograd_convolution_test.cc
浏览文件 @
c4a0b84c
...
...
@@ -141,27 +141,6 @@ TEST_F(WinogradConvlutionTest, BatchConvolutionM2) {
Padding
::
SAME
,
2
);
}
// 3x3 Winograd convolution with block size 6 on tile-aligned shapes.
TEST_F(WinogradConvlutionTest, AlignedConvolutionM6) {
  WinogradConvolution<DeviceType::GPU, float>(1, 32, 32, 3, 3,
                                              Padding::VALID, 6);
  WinogradConvolution<DeviceType::GPU, float>(1, 32, 32, 3, 3,
                                              Padding::SAME, 6);
}
// Block-size-6 Winograd convolution on shapes that do not align to tiles.
TEST_F(WinogradConvlutionTest, UnAlignedConvolutionM6) {
  WinogradConvolution<DeviceType::GPU, float>(1, 61, 67, 31, 37,
                                              Padding::VALID, 6);
  WinogradConvolution<DeviceType::GPU, float>(1, 61, 67, 37, 31,
                                              Padding::SAME, 6);
}
// Block-size-6 Winograd convolution with batch > 1.
TEST_F(WinogradConvlutionTest, BatchConvolutionM6) {
  WinogradConvolution<DeviceType::GPU, float>(3, 64, 64, 32, 32,
                                              Padding::VALID, 6);
  WinogradConvolution<DeviceType::GPU, float>(5, 61, 67, 37, 31,
                                              Padding::SAME, 6);
}
TEST_F
(
WinogradConvlutionTest
,
AlignedConvolutionM4
)
{
WinogradConvolution
<
DeviceType
::
GPU
,
float
>
(
1
,
32
,
32
,
3
,
3
,
Padding
::
VALID
,
4
);
...
...
@@ -299,27 +278,6 @@ TEST_F(WinogradConvlutionTest, BatchConvolutionWithM2Pad) {
2
,
2
);
}
// Block-size-6 Winograd convolution with explicit padding, aligned shapes.
TEST_F(WinogradConvlutionTest, AlignedConvolutionM6WithPad) {
  WinogradConvolutionWithPad<DeviceType::GPU, float>(1, 32, 32, 32, 16, 1, 6);
  WinogradConvolutionWithPad<DeviceType::GPU, float>(1, 32, 32, 32, 16, 2, 6);
}
// Block-size-6 Winograd convolution with explicit padding, unaligned shapes.
TEST_F(WinogradConvlutionTest, UnAlignedConvolutionM6WithPad) {
  WinogradConvolutionWithPad<DeviceType::GPU, float>(1, 61, 67, 31, 37, 1, 6);
  WinogradConvolutionWithPad<DeviceType::GPU, float>(1, 61, 67, 37, 31, 2, 6);
}
// Block-size-6 Winograd convolution with explicit padding and batch > 1.
TEST_F(WinogradConvlutionTest, BatchConvolutionWithM6Pad) {
  WinogradConvolutionWithPad<DeviceType::GPU, float>(3, 64, 64, 32, 32, 1, 6);
  WinogradConvolutionWithPad<DeviceType::GPU, float>(5, 61, 67, 37, 31, 2, 6);
}
TEST_F
(
WinogradConvlutionTest
,
AlignedConvolutionM4WithPad
)
{
WinogradConvolutionWithPad
<
DeviceType
::
GPU
,
float
>
(
1
,
32
,
32
,
32
,
16
,
1
,
4
);
...
...
mace/ops/winograd_transform_benchmark.cc
浏览文件 @
c4a0b84c
...
...
@@ -74,9 +74,6 @@ MACE_BM_WINO_TRANSFORM(1, 64, 64, 3, 2);
// Winograd input-transform benchmarks: (batch, height, width, channels, block).
MACE_BM_WINO_TRANSFORM(1, 128, 128, 3, 4);
MACE_BM_WINO_TRANSFORM(1, 256, 256, 3, 4);
MACE_BM_WINO_TRANSFORM(1, 64, 64, 3, 4);
MACE_BM_WINO_TRANSFORM(1, 128, 128, 3, 6);
MACE_BM_WINO_TRANSFORM(1, 256, 256, 3, 6);
MACE_BM_WINO_TRANSFORM(1, 64, 64, 3, 6);
namespace
{
template
<
DeviceType
D
,
typename
T
>
...
...
@@ -142,11 +139,6 @@ MACE_BM_WINO_INVERSE_TRANSFORM(1, 126, 126, 16, 4);
// Winograd inverse-transform benchmarks: (batch, height, width, channels, block).
MACE_BM_WINO_INVERSE_TRANSFORM(1, 62, 62, 16, 4);
MACE_BM_WINO_INVERSE_TRANSFORM(1, 254, 254, 16, 4);
MACE_BM_WINO_INVERSE_TRANSFORM(1, 126, 126, 16, 6);
MACE_BM_WINO_INVERSE_TRANSFORM(1, 62, 62, 16, 6);
MACE_BM_WINO_INVERSE_TRANSFORM(1, 254, 254, 16, 6);
namespace
{
template
<
DeviceType
D
,
typename
T
>
void
WinoFilterBufferToImage
(
int
iters
,
...
...
@@ -199,18 +191,12 @@ void WinoFilterBufferToImage(int iters,
// Winograd filter buffer-to-image benchmarks:
// (out_channels, in_channels, kernel_h, kernel_w, block).
MACE_BM_WINO_B2I(16, 3, 3, 3, 2);
MACE_BM_WINO_B2I(16, 3, 3, 3, 4);
MACE_BM_WINO_B2I(16, 3, 3, 3, 6);
MACE_BM_WINO_B2I(32, 3, 3, 3, 2);
MACE_BM_WINO_B2I(32, 3, 3, 3, 4);
MACE_BM_WINO_B2I(32, 3, 3, 3, 6);
MACE_BM_WINO_B2I(128, 3, 3, 3, 2);
MACE_BM_WINO_B2I(128, 3, 3, 3, 4);
MACE_BM_WINO_B2I(128, 3, 3, 3, 6);
MACE_BM_WINO_B2I(256, 3, 3, 3, 2);
MACE_BM_WINO_B2I(256, 3, 3, 3, 4);
MACE_BM_WINO_B2I(256, 3, 3, 3, 6);
namespace
{
template
<
DeviceType
D
,
typename
T
>
...
...
@@ -278,10 +264,8 @@ void WinoMatMulBenchmark(
// Winograd element-wise matmul benchmarks: (out_ch, in_ch, height, width, block).
MACE_BM_WINO_MATMUL(16, 3, 128, 128, 2);
MACE_BM_WINO_MATMUL(16, 3, 128, 128, 4);
MACE_BM_WINO_MATMUL(16, 3, 128, 128, 6);
MACE_BM_WINO_MATMUL(32, 3, 256, 256, 2);
MACE_BM_WINO_MATMUL(32, 3, 256, 256, 4);
MACE_BM_WINO_MATMUL(32, 3, 256, 256, 6);
}
// namespace test
}
// namespace ops
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录