Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
Xiaomi
Mace
提交
52cc0540
Mace
项目概览
Xiaomi
/
Mace
通知
106
Star
40
Fork
27
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
Mace
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
提交
52cc0540
编写于
11月 02, 2017
作者:
L
liuqi
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Finish conv 3x3 with stride 1 and 2.
上级
0461beb5
变更
8
隐藏空白更改
内联
并排
Showing
8 changed file
with
307 addition
and
102 deletion
+307
-102
mace/kernels/opencl/cl/conv_2d_3x3.cl
mace/kernels/opencl/cl/conv_2d_3x3.cl
+76
-0
mace/kernels/opencl/cl/conv_helper.cl
mace/kernels/opencl/cl/conv_helper.cl
+41
-0
mace/kernels/opencl/cl/depthwise_conv_3x3.cl
mace/kernels/opencl/cl/depthwise_conv_3x3.cl
+8
-35
mace/kernels/opencl/conv_2d_opencl.cc
mace/kernels/opencl/conv_2d_opencl.cc
+15
-6
mace/kernels/opencl/conv_2d_opencl_3x3.cc
mace/kernels/opencl/conv_2d_opencl_3x3.cc
+64
-0
mace/ops/conv_2d_benchmark.cc
mace/ops/conv_2d_benchmark.cc
+11
-18
mace/ops/conv_2d_test.cc
mace/ops/conv_2d_test.cc
+90
-41
mace/ops/depthwise_conv_2d_benchmark.cc
mace/ops/depthwise_conv_2d_benchmark.cc
+2
-2
未找到文件。
mace/kernels/opencl/cl/conv_2d_3x3.cl
0 → 100644
浏览文件 @
52cc0540
// Forward declarations — implemented in conv_helper.cl.
float4 conv1x3_s1(const float *input_ptr, const float *filter_ptr);
float4 conv1x3_s2(const float *input_ptr, const float *filter_ptr);
float conv3x3(const float *input_ptr, const float *filter_ptr,
              const int row_width);

// Direct 3x3 convolution (NCHW, valid padding assumed by the host side).
// Work-item mapping:
//   gws[0] = batch, gws[1] = output-channel block (4 channels each),
//   gws[2] = output-pixel block (4 horizontally adjacent pixels each).
// stride_h/stride_w select the stride-1 or stride-2 row helper.
void kernel conv_2d_3x3(global const float *input,
                        global const float *filter,
                        global const float *bias,
                        global float *output,
                        private const uint in_chan_num,
                        private const uint out_chan_num,
                        private const uint in_height,
                        private const uint in_width,
                        private const uint out_height,
                        private const uint out_width,
                        private const uint stride_h,
                        private const uint stride_w) {
  int batch = get_global_id(0);
  int out_chan_blk = get_global_id(1);
  int out_pixel_blk = get_global_id(2);

  const uint in_pixel = in_height * in_width;
  const uint out_pixel = out_height * out_width;

  // Decompose the pixel-block id into an (output row, 4-wide column block).
  const uint round_out_width = (out_width + 3) / 4;
  const uint out_pixel_height = out_pixel_blk / round_out_width;
  const uint out_pixel_width = out_pixel_blk % round_out_width;

  // Channel and pixel ranges handled by this work item; both are clamped so
  // the tail blocks (width or channel count not divisible by 4) stay in range.
  const uint out_chan_begin = out_chan_blk * 4;
  const uint out_chan_end = min(out_chan_begin + 4, out_chan_num);
  const uint out_pixel_begin =
      out_pixel_height * out_width + out_pixel_width * 4;
  const uint out_pixel_end =
      min(out_pixel_begin + 4, (out_pixel_height + 1) * out_width);
  // Top-left input element of the receptive field for this pixel block.
  const uint in_pixel_begin =
      out_pixel_height * stride_h * in_width + out_pixel_width * stride_w * 4;

  const uint in_offset = batch * in_chan_num * in_pixel;
  const uint out_offset = batch * out_chan_num * out_pixel;
  const float *input_base = input + in_offset + in_pixel_begin;
  float *output_base = output + out_offset + out_pixel_begin;

  uint pixels = out_pixel_end - out_pixel_begin;

  for (uint i = out_chan_begin; i < out_chan_end; ++i) {
    float4 res = (float4)bias[i];
    float *output_ptr = output_base + i * out_pixel;
    const float *filter_base = filter + i * in_chan_num * 9;
    if (pixels == 4) {
      // Fast path: a full block of 4 output pixels, computed vectorized.
      for (uint in_chan_idx = 0; in_chan_idx < in_chan_num; ++in_chan_idx) {
        const float *input_ptr = input_base + in_chan_idx * in_pixel;
        const float *filter_ptr = filter_base + in_chan_idx * 9;
        if (stride_w == 1) {
          res += conv1x3_s1(input_ptr + 0 * in_width, filter_ptr + 0 * 3);
          res += conv1x3_s1(input_ptr + 1 * in_width, filter_ptr + 1 * 3);
          res += conv1x3_s1(input_ptr + 2 * in_width, filter_ptr + 2 * 3);
        } else {
          res += conv1x3_s2(input_ptr + 0 * in_width, filter_ptr + 0 * 3);
          res += conv1x3_s2(input_ptr + 1 * in_width, filter_ptr + 1 * 3);
          res += conv1x3_s2(input_ptr + 2 * in_width, filter_ptr + 2 * 3);
        }
      }
      vstore4(res, 0, output_ptr);
    } else {
      // Tail path: fewer than 4 pixels left on this row; compute them one
      // scalar output at a time.
      for (uint p = 0; p < pixels; ++p) {
        float pix_res = bias[i];
        for (uint in_chan_idx = 0; in_chan_idx < in_chan_num; ++in_chan_idx) {
          const float *input_ptr =
              input_base + in_chan_idx * in_pixel + p * stride_w;
          const float *filter_ptr = filter_base + in_chan_idx * 9;
          pix_res += conv3x3(input_ptr, filter_ptr, in_width);
        }
        output_ptr[p] = pix_res;
      }
    }
  }
}
mace/kernels/opencl/cl/conv_helper.cl
0 → 100644
浏览文件 @
52cc0540
// 1x3 convolution producing 4 horizontally adjacent outputs at stride 1.
// Reads 6 consecutive input floats and 3 filter taps; output j is
// f0*in[j] + f1*in[j+1] + f2*in[j+2] for j in [0, 4).
float4 conv1x3_s1(const float *input_ptr, const float *filter_ptr) {
  float4 row0 = vload4(0, input_ptr);
  float2 input1 = vload2(0, input_ptr + 4);
  // Shifted views of the same 6-element window, built by swizzling.
  float4 row1 = (float4)(row0.s123, input1.s0);
  float4 row2 = (float4)(row0.s23, input1.s01);
  float3 filter_values = vload3(0, filter_ptr);
  return (float4)filter_values.s0 * row0 +
         (float4)filter_values.s1 * row1 +
         (float4)filter_values.s2 * row2;
}
// 1x3 convolution producing 4 outputs at stride 2.
// Reads 9 consecutive input floats; output j uses elements 2j, 2j+1, 2j+2.
float4 conv1x3_s2(const float *input_ptr, const float *filter_ptr) {
  float8 input = vload8(0, input_ptr);
  float4 row0 = input.even;  // elements 0, 2, 4, 6
  float4 row1 = input.odd;   // elements 1, 3, 5, 7
  // Elements 2, 4, 6, 8 — the ninth element is loaded separately.
  float4 row2 = (float4)(row0.s123, input_ptr[8]);
  float3 filter_values = vload3(0, filter_ptr);
  return (float4)filter_values.s0 * row0 +
         (float4)filter_values.s1 * row1 +
         (float4)filter_values.s2 * row2;
}
// Scalar 3x3 convolution for a single output element.
// input_ptr points at the top-left of the 3x3 receptive field; row_width is
// the input row pitch; filter_ptr holds 9 contiguous taps (row-major).
float conv3x3(const float *input_ptr, const float *filter_ptr,
              const int row_width) {
  float3 input_value = vload3(0, input_ptr);
  float3 filter_value = vload3(0, filter_ptr);
  float3 res = input_value * filter_value;

  input_ptr += row_width;
  input_value = vload3(0, input_ptr);
  filter_value = vload3(1, filter_ptr);  // taps 3..5
  res += input_value * filter_value;

  input_ptr += row_width;
  input_value = vload3(0, input_ptr);
  filter_value = vload3(2, filter_ptr);  // taps 6..8
  res += input_value * filter_value;

  // Horizontal reduction of the three per-column partial sums.
  return res.s0 + res.s1 + res.s2;
}
mace/kernels/opencl/cl/depthwise_conv_3x3.cl
浏览文件 @
52cc0540
inline
float4
conv1x3
(
const
float
*input_ptr,
float4
conv1x3_s1
(
const
float
*input_ptr,
const
float
*filter_ptr
)
{
const
float
*filter_ptr
)
;
float8
input
=
vload8
(
0
,
input_ptr
)
;
float
conv3x3
(
const
float
*input_ptr,
float4
row0
=
convert_float4
(
input.s0123
)
;
float4
row1
=
convert_float4
(
input.s1234
)
;
float4
row2
=
convert_float4
(
input.s2345
)
;
return
(
float4
)
filter_ptr[0]
*
row0
+
(
float4
)
filter_ptr[1]
*
row1
+
(
float4
)
filter_ptr[2]
*
row2
;
}
inline
float4
conv3x3x4
(
const
float
*input_ptr,
const
float
*filter_ptr,
const
int
row_width
)
{
float4
res
;
res
=
conv1x3
(
input_ptr
+
0
*
row_width,
filter_ptr
+
0
*
3
)
;
res
+=
conv1x3
(
input_ptr
+
1
*
row_width,
filter_ptr
+
1
*
3
)
;
res
+=
conv1x3
(
input_ptr
+
2
*
row_width,
filter_ptr
+
2
*
3
)
;
return
res
;
}
inline
float
conv3x3
(
const
float
*input_ptr,
const
float
*filter_ptr,
const
float
*filter_ptr,
const
int
row_width
)
{
const
int
row_width
)
;
float
res
=
input_ptr[0]
*
filter_ptr[0]
+
input_ptr[1]
*
filter_ptr[1]
+
input_ptr[2]
*
filter_ptr[2]
;
input_ptr
+=
row_width
;
filter_ptr
+=
3
;
res
+=
input_ptr[0]
*
filter_ptr[0]
+
input_ptr[1]
*
filter_ptr[1]
+
input_ptr[2]
*
filter_ptr[2]
;
input_ptr
+=
row_width
;
filter_ptr
+=
3
;
res
+=
input_ptr[0]
*
filter_ptr[0]
+
input_ptr[1]
*
filter_ptr[1]
+
input_ptr[2]
*
filter_ptr[2]
;
return
res
;
}
void
kernel
depthwise_conv_3x3_s1
(
global
const
float
*input,
/*
n,
c,
h,
w
*/
void
kernel
depthwise_conv_3x3_s1
(
global
const
float
*input,
/*
n,
c,
h,
w
*/
global
const
float
*filter,
/*
m,
i,
kh,
kw
*/
global
const
float
*filter,
/*
m,
i,
kh,
kw
*/
...
@@ -80,8 +51,10 @@ void kernel depthwise_conv_3x3_s1(global const float *input, /* n, c, h, w */
...
@@ -80,8 +51,10 @@ void kernel depthwise_conv_3x3_s1(global const float *input, /* n, c, h, w */
input_ptr
+=
1
;
input_ptr
+=
1
;
}
}
}
else
{
}
else
{
float4
res
=
conv3x3x4
(
input_ptr,
filter_ptr,
in_width
)
;
float4
res
=
(
float4
)
bias_value
;
res
+=
(
float4
)
bias_value
;
res
+=
conv1x3_s1
(
input_ptr
+
0
*
in_width,
filter_ptr
+
0
*
3
)
;
res
+=
conv1x3_s1
(
input_ptr
+
1
*
in_width,
filter_ptr
+
1
*
3
)
;
res
+=
conv1x3_s1
(
input_ptr
+
2
*
in_width,
filter_ptr
+
2
*
3
)
;
vstore4
(
res,
0
,
output_ptr
)
;
vstore4
(
res,
0
,
output_ptr
)
;
}
}
}
}
...
...
mace/kernels/opencl/conv_2d_opencl.cc
浏览文件 @
52cc0540
...
@@ -3,7 +3,6 @@
...
@@ -3,7 +3,6 @@
//
//
#include "mace/kernels/conv_2d.h"
#include "mace/kernels/conv_2d.h"
#include "mace/kernels/conv_pool_2d_util.h"
namespace
mace
{
namespace
mace
{
namespace
kernels
{
namespace
kernels
{
...
@@ -11,6 +10,11 @@ namespace kernels {
...
@@ -11,6 +10,11 @@ namespace kernels {
extern
void
Conv2dOpenclK1x1S1
(
const
Tensor
*
input
,
const
Tensor
*
filter
,
extern
void
Conv2dOpenclK1x1S1
(
const
Tensor
*
input
,
const
Tensor
*
filter
,
const
Tensor
*
bias
,
Tensor
*
output
);
const
Tensor
*
bias
,
Tensor
*
output
);
extern
void
Conv2dOpenclK3x3S1
(
const
Tensor
*
input
,
const
Tensor
*
filter
,
const
Tensor
*
bias
,
Tensor
*
output
);
extern
void
Conv2dOpenclK3x3S2
(
const
Tensor
*
input
,
const
Tensor
*
filter
,
const
Tensor
*
bias
,
Tensor
*
output
);
template
<
>
template
<
>
void
Conv2dFunctor
<
DeviceType
::
OPENCL
,
float
>::
operator
()(
const
Tensor
*
input
,
void
Conv2dFunctor
<
DeviceType
::
OPENCL
,
float
>::
operator
()(
const
Tensor
*
input
,
const
Tensor
*
filter
,
const
Tensor
*
filter
,
...
@@ -22,7 +26,7 @@ void Conv2dFunctor<DeviceType::OPENCL, float>::operator()(const Tensor *input,
...
@@ -22,7 +26,7 @@ void Conv2dFunctor<DeviceType::OPENCL, float>::operator()(const Tensor *input,
static
const
Conv2dOpenclFunction
selector
[
5
][
2
]
=
{
static
const
Conv2dOpenclFunction
selector
[
5
][
2
]
=
{
{
Conv2dOpenclK1x1S1
,
nullptr
},
{
Conv2dOpenclK1x1S1
,
nullptr
},
{
nullptr
,
nullptr
},
{
nullptr
,
nullptr
},
{
nullptr
,
nullptr
},
{
Conv2dOpenclK3x3S1
,
Conv2dOpenclK3x3S2
},
{
nullptr
,
nullptr
},
{
nullptr
,
nullptr
},
{
nullptr
,
nullptr
}};
{
nullptr
,
nullptr
}};
...
@@ -40,11 +44,16 @@ void Conv2dFunctor<DeviceType::OPENCL, float>::operator()(const Tensor *input,
...
@@ -40,11 +44,16 @@ void Conv2dFunctor<DeviceType::OPENCL, float>::operator()(const Tensor *input,
input
,
filter
,
bias
,
output
);
input
,
filter
,
bias
,
output
);
return
;
return
;
}
}
MACE_CHECK
(
paddings_
[
0
]
==
0
&&
paddings_
[
1
]
==
0
,
"Padding not supported"
);
auto
conv2d_func
=
selector
[
kernel_h
-
1
][
strides_
[
0
]
-
1
];
auto
conv2d_func
=
selector
[
kernel_h
-
1
][
strides_
[
0
]
-
1
];
conv2d_func
(
input
,
filter
,
bias
,
output
);
if
(
paddings_
[
0
]
>
0
||
paddings_
[
1
]
>
0
)
{
Tensor
padded_input
(
GetDeviceAllocator
(
DeviceType
::
OPENCL
),
DataTypeToEnum
<
float
>::
v
());
Tensor
::
MappingGuard
input_mapper
(
input
);
ConstructInputWithPadding
(
input
->
data
<
float
>
(),
input
->
shape
().
data
(),
paddings_
.
data
(),
&
padded_input
);
conv2d_func
(
&
padded_input
,
filter
,
bias
,
output
);
}
else
{
conv2d_func
(
input
,
filter
,
bias
,
output
);
}
}
}
}
// namespace kernels
}
// namespace kernels
...
...
mace/kernels/opencl/conv_2d_opencl_3x3.cc
0 → 100644
浏览文件 @
52cc0540
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#include "mace/core/common.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/kernels/conv_2d.h"
namespace
mace
{
namespace
kernels
{
// Host-side launcher shared by the stride-1 and stride-2 3x3 conv paths.
// Binds the "conv_2d_3x3" kernel arguments from the tensor shapes and
// enqueues it with one work item per (batch, 4-channel block, 4-pixel block).
static void InnerConv2dK3x3S12(const Tensor *input,
                               const Tensor *filter,
                               const Tensor *bias,
                               const uint32_t stride,
                               Tensor *output) {
  const index_t channels = output->shape()[1];
  const index_t height = output->shape()[2];
  const index_t width = output->shape()[3];

  MACE_CHECK(input->dim(0) == output->dim(0));

  // Each work item covers 4 output channels and 4 horizontal output pixels.
  const index_t channel_blocks = (channels + 3) / 4;
  const index_t pixel_blocks = (width + 3) / 4 * height;

  auto runtime = OpenCLRuntime::Get();
  auto program = runtime->program();
  auto bm_kernel = cl::Kernel(program, "conv_2d_3x3");

  // Argument order must match the kernel signature in conv_2d_3x3.cl.
  uint32_t idx = 0;
  bm_kernel.setArg(idx++, *(static_cast<const cl::Buffer *>(input->buffer())));
  bm_kernel.setArg(idx++, *(static_cast<const cl::Buffer *>(filter->buffer())));
  bm_kernel.setArg(idx++, *(static_cast<const cl::Buffer *>(bias->buffer())));
  bm_kernel.setArg(idx++, *(static_cast<cl::Buffer *>(output->buffer())));
  bm_kernel.setArg(idx++, static_cast<uint32_t>(input->dim(1)));  // in channels
  bm_kernel.setArg(idx++, static_cast<uint32_t>(channels));
  bm_kernel.setArg(idx++, static_cast<uint32_t>(input->dim(2)));  // in height
  bm_kernel.setArg(idx++, static_cast<uint32_t>(input->dim(3)));  // in width
  bm_kernel.setArg(idx++, static_cast<uint32_t>(height));
  bm_kernel.setArg(idx++, static_cast<uint32_t>(width));
  bm_kernel.setArg(idx++, stride);  // stride_h
  bm_kernel.setArg(idx++, stride);  // stride_w

  const uint32_t gws[3] = {static_cast<uint32_t>(output->dim(0)),
                           static_cast<uint32_t>(channel_blocks),
                           static_cast<uint32_t>(pixel_blocks)};
  const uint32_t lws[3] = {static_cast<uint32_t>(1),
                           static_cast<uint32_t>(1),
                           static_cast<uint32_t>(256)};
  cl_int error = runtime->command_queue().enqueueNDRangeKernel(
      bm_kernel, cl::NullRange,
      cl::NDRange(gws[0], gws[1], gws[2]),
      cl::NDRange(lws[0], lws[1], lws[2]));
  MACE_CHECK(error == CL_SUCCESS);
}
// 3x3 stride-1 OpenCL convolution entry point; delegates to the shared
// launcher with stride = 1.
void Conv2dOpenclK3x3S1(const Tensor *input,
                        const Tensor *filter,
                        const Tensor *bias,
                        Tensor *output) {
  InnerConv2dK3x3S12(input, filter, bias, 1, output);
}  // note: removed stray ';' after the function body
// 3x3 stride-2 OpenCL convolution entry point; delegates to the shared
// launcher with stride = 2.
void Conv2dOpenclK3x3S2(const Tensor *input,
                        const Tensor *filter,
                        const Tensor *bias,
                        Tensor *output) {
  InnerConv2dK3x3S12(input, filter, bias, 2, output);
}  // note: removed stray ';' after the function body
}
// namespace kernels
}
// namespace mace
mace/ops/conv_2d_benchmark.cc
浏览文件 @
52cc0540
...
@@ -3,7 +3,6 @@
...
@@ -3,7 +3,6 @@
//
//
#include <algorithm>
#include <algorithm>
#include <sstream>
#include "mace/core/operator.h"
#include "mace/core/operator.h"
#include "mace/core/testing/test_benchmark.h"
#include "mace/core/testing/test_benchmark.h"
...
@@ -14,7 +13,6 @@ namespace mace {
...
@@ -14,7 +13,6 @@ namespace mace {
template
<
DeviceType
D
,
typename
T
>
template
<
DeviceType
D
,
typename
T
>
static
void
Conv2d
(
int
iters
,
static
void
Conv2d
(
int
iters
,
int
iters_to_sync
,
int
batch
,
int
batch
,
int
channels
,
int
channels
,
int
height
,
int
height
,
...
@@ -32,37 +30,32 @@ static void Conv2d(int iters,
...
@@ -32,37 +30,32 @@ static void Conv2d(int iters,
.
Input
(
"Filter"
)
.
Input
(
"Filter"
)
.
Input
(
"Bias"
)
.
Input
(
"Bias"
)
.
Output
(
"Output"
)
.
Output
(
"Output"
)
.
AddIntsArg
(
"strides"
,
{
stride
,
stride
})
.
Finalize
(
net
.
operator_def
());
.
AddIntArg
(
"padding"
,
padding
)
.
AddIntsArg
(
"dilations"
,
{
1
,
1
})
// Add args
.
Finalize
(
net
.
NewOperatorDef
());
net
.
AddIntsArg
(
"strides"
,
{
stride
,
stride
});
net
.
AddIntArg
(
"padding"
,
padding
);
net
.
AddIntsArg
(
"dilations"
,
{
1
,
1
});
// Add input data
// Add input data
net
.
AddRandomInput
<
D
,
float
>
(
"Input"
,
{
batch
,
channels
,
height
,
width
});
net
.
AddRandomInput
<
D
,
float
>
(
"Input"
,
{
batch
,
channels
,
height
,
width
});
net
.
AddRandomInput
<
D
,
float
>
(
"Filter"
,
net
.
AddRandomInput
<
D
,
float
>
(
"Filter"
,
{
output_channels
,
channels
,
kernel_h
,
kernel_w
});
{
output_channels
,
channels
,
kernel_h
,
kernel_w
});
net
.
AddRandomInput
<
D
,
float
>
(
"Bias"
,
{
output_channels
});
net
.
AddRandomInput
<
D
,
float
>
(
"Bias"
,
{
output_channels
});
// Warm-up
// Warm-up
for
(
int
i
=
0
;
i
<
5
;
++
i
)
{
for
(
int
i
=
0
;
i
<
5
;
++
i
)
{
net
.
RunOp
(
D
);
net
.
RunOp
(
D
);
net
.
Sync
();
}
}
net
.
Sync
();
mace
::
testing
::
StartTiming
();
mace
::
testing
::
StartTiming
();
while
(
iters
--
)
{
while
(
iters
--
)
{
net
.
RunOp
(
D
);
net
.
RunOp
(
D
);
if
(
iters
%
iters_to_sync
==
0
)
{
net
.
Sync
();
}
}
}
net
.
Sync
();
}
}
// In common network, there are usually more than 1 layers, this is used to
// approximate the amortized latency. The OpenCL runtime for Mali/Adreno is
// in-order.
constexpr
int
kItersToSync
=
10
;
#define BM_CONV_2D_MACRO(N, C, H, W, KH, KW, STRIDE, P, OC, TYPE, DEVICE) \
#define BM_CONV_2D_MACRO(N, C, H, W, KH, KW, STRIDE, P, OC, TYPE, DEVICE) \
static void \
static void \
BM_CONV_2D_##N##_##C##_##H##_##W##_K##KH##x##KW##S##STRIDE##_##P##_##OC##_##TYPE##_##DEVICE( \
BM_CONV_2D_##N##_##C##_##H##_##W##_K##KH##x##KW##S##STRIDE##_##P##_##OC##_##TYPE##_##DEVICE( \
...
@@ -70,8 +63,8 @@ constexpr int kItersToSync = 10;
...
@@ -70,8 +63,8 @@ constexpr int kItersToSync = 10;
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
mace::testing::ItemsProcessed(tot); \
mace::testing::ItemsProcessed(tot); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
mace::testing::BytesProcessed(tot *(sizeof(TYPE))); \
Conv2d<DEVICE, TYPE>(iters,
kItersToSync, N, C, H, W, KH, KW, STRIDE,
\
Conv2d<DEVICE, TYPE>(iters,
N, C, H, W, KH, KW, STRIDE, mace::Padding::P,
\
mace::Padding::P, OC);
\
OC);
\
} \
} \
BENCHMARK( \
BENCHMARK( \
BM_CONV_2D_##N##_##C##_##H##_##W##_K##KH##x##KW##S##STRIDE##_##P##_##OC##_##TYPE##_##DEVICE)
BM_CONV_2D_##N##_##C##_##H##_##W##_K##KH##x##KW##S##STRIDE##_##P##_##OC##_##TYPE##_##DEVICE)
...
...
mace/ops/conv_2d_test.cc
浏览文件 @
52cc0540
...
@@ -3,16 +3,15 @@
...
@@ -3,16 +3,15 @@
//
//
#include "mace/ops/conv_2d.h"
#include "mace/ops/conv_2d.h"
#include "mace/core/operator.h"
#include "mace/ops/ops_test_util.h"
#include "mace/ops/ops_test_util.h"
using
namespace
mace
;
using
namespace
mace
;
class
Conv2dOpTest
:
public
OpsTestBase
{};
class
Conv2dOpTest
:
public
OpsTestBase
{};
TEST_F
(
Conv2dOpTest
,
Simple_VALID
)
{
template
<
DeviceType
D
>
// Construct graph
void
TestSimple3x3VALID
()
{
auto
&
net
=
test_net
()
;
OpsTestNet
net
;
OpDefBuilder
(
"Conv2D"
,
"Conv2dTest"
)
OpDefBuilder
(
"Conv2D"
,
"Conv2dTest"
)
.
Input
(
"Input"
)
.
Input
(
"Input"
)
.
Input
(
"Filter"
)
.
Input
(
"Filter"
)
...
@@ -26,27 +25,28 @@ TEST_F(Conv2dOpTest, Simple_VALID) {
...
@@ -26,27 +25,28 @@ TEST_F(Conv2dOpTest, Simple_VALID) {
// Add args
// Add args
// Add input data
// Add input data
net
.
AddInputFromArray
<
D
eviceType
::
CPU
,
float
>
(
net
.
AddInputFromArray
<
D
,
float
>
(
"Input"
,
{
1
,
2
,
3
,
3
},
"Input"
,
{
1
,
2
,
3
,
3
},
{
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
});
{
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
});
net
.
AddInputFromArray
<
D
eviceType
::
CPU
,
float
>
(
net
.
AddInputFromArray
<
D
,
float
>
(
"Filter"
,
{
1
,
2
,
3
,
3
},
"Filter"
,
{
1
,
2
,
3
,
3
},
{
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
{
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
});
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
});
net
.
AddInputFromArray
<
D
eviceType
::
CPU
,
float
>
(
"Bias"
,
{
1
},
{
0.1
f
});
net
.
AddInputFromArray
<
D
,
float
>
(
"Bias"
,
{
1
},
{
0.1
f
});
// Run
// Run
net
.
RunOp
();
net
.
RunOp
(
D
);
// Check
// Check
auto
expected
=
CreateTensor
<
float
>
({
1
,
1
,
1
,
1
},
{
18.1
f
});
auto
expected
=
CreateTensor
<
float
>
({
1
,
1
,
1
,
1
},
{
18.1
f
});
ExpectTensorNear
<
float
>
(
*
expected
,
*
net
.
GetOutput
(
"Output"
),
0.001
);
ExpectTensorNear
<
float
>
(
*
expected
,
*
net
.
GetOutput
(
"Output"
),
0.001
);
}
}
TEST_F
(
Conv2dOpTest
,
Simple_SAME
)
{
template
<
DeviceType
D
>
// Construct graph
void
TestSimple3x3SAME
()
{
auto
&
net
=
test_net
()
;
OpsTestNet
net
;
OpDefBuilder
(
"Conv2D"
,
"Conv2dTest"
)
OpDefBuilder
(
"Conv2D"
,
"Conv2dTest"
)
.
Input
(
"Input"
)
.
Input
(
"Input"
)
.
Input
(
"Filter"
)
.
Input
(
"Filter"
)
...
@@ -58,17 +58,17 @@ TEST_F(Conv2dOpTest, Simple_SAME) {
...
@@ -58,17 +58,17 @@ TEST_F(Conv2dOpTest, Simple_SAME) {
.
Finalize
(
net
.
NewOperatorDef
());
.
Finalize
(
net
.
NewOperatorDef
());
// Add input data
// Add input data
net
.
AddInputFromArray
<
D
eviceType
::
CPU
,
float
>
(
net
.
AddInputFromArray
<
D
,
float
>
(
"Input"
,
{
1
,
2
,
3
,
3
},
"Input"
,
{
1
,
2
,
3
,
3
},
{
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
});
{
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
});
net
.
AddInputFromArray
<
D
eviceType
::
CPU
,
float
>
(
net
.
AddInputFromArray
<
D
,
float
>
(
"Filter"
,
{
1
,
2
,
3
,
3
},
"Filter"
,
{
1
,
2
,
3
,
3
},
{
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
{
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
});
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
});
net
.
AddInputFromArray
<
D
eviceType
::
CPU
,
float
>
(
"Bias"
,
{
1
},
{
0.1
f
});
net
.
AddInputFromArray
<
D
,
float
>
(
"Bias"
,
{
1
},
{
0.1
f
});
// Run
// Run
net
.
RunOp
();
net
.
RunOp
(
D
);
// Check
// Check
auto
expected
=
CreateTensor
<
float
>
(
auto
expected
=
CreateTensor
<
float
>
(
...
@@ -78,9 +78,25 @@ TEST_F(Conv2dOpTest, Simple_SAME) {
...
@@ -78,9 +78,25 @@ TEST_F(Conv2dOpTest, Simple_SAME) {
ExpectTensorNear
<
float
>
(
*
expected
,
*
net
.
GetOutput
(
"Output"
),
0.001
);
ExpectTensorNear
<
float
>
(
*
expected
,
*
net
.
GetOutput
(
"Output"
),
0.001
);
}
}
TEST_F
(
Conv2dOpTest
,
Combined
)
{
TEST_F
(
Conv2dOpTest
,
CPUSimple
)
{
TestSimple3x3VALID
<
DeviceType
::
CPU
>
();
TestSimple3x3SAME
<
DeviceType
::
CPU
>
();
}
TEST_F
(
Conv2dOpTest
,
NEONSimple
)
{
TestSimple3x3VALID
<
DeviceType
::
NEON
>
();
TestSimple3x3SAME
<
DeviceType
::
NEON
>
();
}
TEST_F
(
Conv2dOpTest
,
OPENCLSimple
)
{
TestSimple3x3VALID
<
DeviceType
::
OPENCL
>
();
TestSimple3x3SAME
<
DeviceType
::
OPENCL
>
();
}
template
<
DeviceType
D
>
static
void
TestCombined3x3
()
{
// Construct graph
// Construct graph
auto
&
net
=
test_net
()
;
OpsTestNet
net
;
OpDefBuilder
(
"Conv2D"
,
"Conv2DTest"
)
OpDefBuilder
(
"Conv2D"
,
"Conv2DTest"
)
.
Input
(
"Input"
)
.
Input
(
"Input"
)
.
Input
(
"Filter"
)
.
Input
(
"Filter"
)
...
@@ -92,19 +108,19 @@ TEST_F(Conv2dOpTest, Combined) {
...
@@ -92,19 +108,19 @@ TEST_F(Conv2dOpTest, Combined) {
.
Finalize
(
net
.
NewOperatorDef
());
.
Finalize
(
net
.
NewOperatorDef
());
// Add input data
// Add input data
net
.
AddInputFromArray
<
D
eviceType
::
CPU
,
float
>
(
net
.
AddInputFromArray
<
D
,
float
>
(
"Input"
,
{
1
,
2
,
5
,
5
},
{
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
"Input"
,
{
1
,
2
,
5
,
5
},
{
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
});
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
,
1
});
net
.
AddInputFromArray
<
D
eviceType
::
CPU
,
float
>
(
net
.
AddInputFromArray
<
D
,
float
>
(
"Filter"
,
{
2
,
2
,
3
,
3
},
"Filter"
,
{
2
,
2
,
3
,
3
},
{
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
{
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
0.5
f
,
0.5
f
,
0.5
f
,
0.5
f
,
0.5
f
,
0.5
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
1.0
f
,
0.5
f
,
0.5
f
,
0.5
f
,
0.5
f
,
0.5
f
,
0.5
f
,
0.5
f
,
0.5
f
,
0.5
f
,
0.5
f
,
0.5
f
,
0.5
f
,
0.5
f
,
0.5
f
,
0.5
f
,
0.5
f
,
0.5
f
,
0.5
f
});
0.5
f
,
0.5
f
,
0.5
f
,
0.5
f
,
0.5
f
,
0.5
f
,
0.5
f
,
0.5
f
,
0.5
f
,
0.5
f
,
0.5
f
,
0.5
f
});
net
.
AddInputFromArray
<
D
eviceType
::
CPU
,
float
>
(
"Bias"
,
{
2
},
{
0.1
f
,
0.2
f
});
net
.
AddInputFromArray
<
D
,
float
>
(
"Bias"
,
{
2
},
{
0.1
f
,
0.2
f
});
// Run
// Run
net
.
RunOp
();
net
.
RunOp
(
D
);
// Check
// Check
auto
expected
=
CreateTensor
<
float
>
(
auto
expected
=
CreateTensor
<
float
>
(
...
@@ -112,6 +128,19 @@ TEST_F(Conv2dOpTest, Combined) {
...
@@ -112,6 +128,19 @@ TEST_F(Conv2dOpTest, Combined) {
4.2
f
,
6.2
f
,
4.2
f
,
6.2
f
,
9.2
f
,
6.2
f
,
4.2
f
,
6.2
f
,
4.2
f
});
4.2
f
,
6.2
f
,
4.2
f
,
6.2
f
,
9.2
f
,
6.2
f
,
4.2
f
,
6.2
f
,
4.2
f
});
ExpectTensorNear
<
float
>
(
*
expected
,
*
net
.
GetOutput
(
"Output"
),
0.001
);
ExpectTensorNear
<
float
>
(
*
expected
,
*
net
.
GetOutput
(
"Output"
),
0.001
);
}
TEST_F
(
Conv2dOpTest
,
CPUCombined
)
{
TestCombined3x3
<
DeviceType
::
CPU
>
();
}
TEST_F
(
Conv2dOpTest
,
NEONCombined
)
{
TestCombined3x3
<
DeviceType
::
NEON
>
();
}
TEST_F
(
Conv2dOpTest
,
OPENCLCombined
)
{
TestCombined3x3
<
DeviceType
::
OPENCL
>
();
}
}
template
<
DeviceType
D
>
template
<
DeviceType
D
>
...
@@ -159,13 +188,16 @@ void TestConv1x1() {
...
@@ -159,13 +188,16 @@ void TestConv1x1() {
ExpectTensorNear
<
float
>
(
*
expected
,
*
net
.
GetOutput
(
"Output"
),
0.001
);
ExpectTensorNear
<
float
>
(
*
expected
,
*
net
.
GetOutput
(
"Output"
),
0.001
);
}
}
TEST_F
(
Conv2dOpTest
,
Conv1x1
)
{
TEST_F
(
Conv2dOpTest
,
C
PUC
onv1x1
)
{
TestConv1x1
<
DeviceType
::
CPU
>
();
TestConv1x1
<
DeviceType
::
CPU
>
();
}
TEST_F
(
Conv2dOpTest
,
OPENCLConv1x1
)
{
TestConv1x1
<
DeviceType
::
OPENCL
>
();
TestConv1x1
<
DeviceType
::
OPENCL
>
();
}
}
// TODO we need more tests
template
<
DeviceType
D
>
TEST_F
(
Conv2dOpTest
,
AlignedConvNxNS12
)
{
static
void
TestAlignedConvNxNS12
(
)
{
testing
::
internal
::
LogToStderr
();
testing
::
internal
::
LogToStderr
();
auto
func
=
[
&
](
int
kernel_h
,
int
kernel_w
,
int
stride_h
,
int
stride_w
,
auto
func
=
[
&
](
int
kernel_h
,
int
kernel_w
,
int
stride_h
,
int
stride_w
,
Padding
type
)
{
Padding
type
)
{
...
@@ -178,7 +210,7 @@ TEST_F(Conv2dOpTest, AlignedConvNxNS12) {
...
@@ -178,7 +210,7 @@ TEST_F(Conv2dOpTest, AlignedConvNxNS12) {
index_t
width
=
32
;
index_t
width
=
32
;
index_t
output_channels
=
128
;
index_t
output_channels
=
128
;
// Construct graph
// Construct graph
auto
&
net
=
test_net
()
;
OpsTestNet
net
;
OpDefBuilder
(
"Conv2D"
,
"Conv2dTest"
)
OpDefBuilder
(
"Conv2D"
,
"Conv2dTest"
)
.
Input
(
"Input"
)
.
Input
(
"Input"
)
.
Input
(
"Filter"
)
.
Input
(
"Filter"
)
...
@@ -190,19 +222,19 @@ TEST_F(Conv2dOpTest, AlignedConvNxNS12) {
...
@@ -190,19 +222,19 @@ TEST_F(Conv2dOpTest, AlignedConvNxNS12) {
.
Finalize
(
net
.
NewOperatorDef
());
.
Finalize
(
net
.
NewOperatorDef
());
// Add input data
// Add input data
net
.
AddRandomInput
<
D
eviceType
::
CPU
,
float
>
(
"Input"
,
{
batch
,
input_channels
,
height
,
width
});
net
.
AddRandomInput
<
D
,
float
>
(
"Input"
,
{
batch
,
input_channels
,
height
,
width
});
net
.
AddRandomInput
<
D
eviceType
::
CPU
,
float
>
(
net
.
AddRandomInput
<
D
,
float
>
(
"Filter"
,
{
output_channels
,
input_channels
,
kernel_h
,
kernel_w
});
"Filter"
,
{
output_channels
,
input_channels
,
kernel_h
,
kernel_w
});
net
.
AddRandomInput
<
D
eviceType
::
CPU
,
float
>
(
"Bias"
,
{
output_channels
});
net
.
AddRandomInput
<
D
,
float
>
(
"Bias"
,
{
output_channels
});
//
run cpu
//
Run on device
net
.
RunOp
();
net
.
RunOp
(
D
);
// Check
// Check
Tensor
expected
;
Tensor
expected
;
expected
.
Copy
(
*
net
.
GetOutput
(
"Output"
));
expected
.
Copy
(
*
net
.
GetOutput
(
"Output"
));
//
Run NEON
//
run cpu
net
.
RunOp
(
DeviceType
::
NEON
);
net
.
RunOp
();
ExpectTensorNear
<
float
>
(
expected
,
*
net
.
GetOutput
(
"Output"
),
0.001
);
ExpectTensorNear
<
float
>
(
expected
,
*
net
.
GetOutput
(
"Output"
),
0.001
);
};
};
...
@@ -214,7 +246,16 @@ TEST_F(Conv2dOpTest, AlignedConvNxNS12) {
...
@@ -214,7 +246,16 @@ TEST_F(Conv2dOpTest, AlignedConvNxNS12) {
}
}
}
}
TEST_F
(
Conv2dOpTest
,
UnalignedConvNxNS12
)
{
TEST_F
(
Conv2dOpTest
,
NEONAlignedConvNxNS12
)
{
TestAlignedConvNxNS12
<
DeviceType
::
NEON
>
();
}
TEST_F
(
Conv2dOpTest
,
OPENCLAlignedConvNxNS12
)
{
TestAlignedConvNxNS12
<
DeviceType
::
OPENCL
>
();
}
template
<
DeviceType
D
>
static
void
TestUnalignedConvNxNS12
()
{
testing
::
internal
::
LogToStderr
();
testing
::
internal
::
LogToStderr
();
auto
func
=
[
&
](
int
kernel_h
,
int
kernel_w
,
int
stride_h
,
int
stride_w
,
auto
func
=
[
&
](
int
kernel_h
,
int
kernel_w
,
int
stride_h
,
int
stride_w
,
Padding
type
)
{
Padding
type
)
{
...
@@ -227,7 +268,7 @@ TEST_F(Conv2dOpTest, UnalignedConvNxNS12) {
...
@@ -227,7 +268,7 @@ TEST_F(Conv2dOpTest, UnalignedConvNxNS12) {
index_t
width
=
113
;
index_t
width
=
113
;
index_t
output_channels
=
3
+
rand
()
%
10
;
index_t
output_channels
=
3
+
rand
()
%
10
;
// Construct graph
// Construct graph
auto
&
net
=
test_net
()
;
OpsTestNet
net
;
OpDefBuilder
(
"Conv2D"
,
"Conv2dTest"
)
OpDefBuilder
(
"Conv2D"
,
"Conv2dTest"
)
.
Input
(
"Input"
)
.
Input
(
"Input"
)
.
Input
(
"Filter"
)
.
Input
(
"Filter"
)
...
@@ -239,19 +280,19 @@ TEST_F(Conv2dOpTest, UnalignedConvNxNS12) {
...
@@ -239,19 +280,19 @@ TEST_F(Conv2dOpTest, UnalignedConvNxNS12) {
.
Finalize
(
net
.
NewOperatorDef
());
.
Finalize
(
net
.
NewOperatorDef
());
// Add input data
// Add input data
net
.
AddRandomInput
<
D
eviceType
::
CPU
,
float
>
(
"Input"
,
{
batch
,
input_channels
,
height
,
width
});
net
.
AddRandomInput
<
D
,
float
>
(
"Input"
,
{
batch
,
input_channels
,
height
,
width
});
net
.
AddRandomInput
<
D
eviceType
::
CPU
,
float
>
(
net
.
AddRandomInput
<
D
,
float
>
(
"Filter"
,
{
output_channels
,
input_channels
,
kernel_h
,
kernel_w
});
"Filter"
,
{
output_channels
,
input_channels
,
kernel_h
,
kernel_w
});
net
.
AddRandomInput
<
D
eviceType
::
CPU
,
float
>
(
"Bias"
,
{
output_channels
});
net
.
AddRandomInput
<
D
,
float
>
(
"Bias"
,
{
output_channels
});
//
run cpu
//
Run on device
net
.
RunOp
();
net
.
RunOp
(
D
);
// Check
// Check
Tensor
expected
;
Tensor
expected
;
expected
.
Copy
(
*
net
.
GetOutput
(
"Output"
));
expected
.
Copy
(
*
net
.
GetOutput
(
"Output"
));
//
Run NEON
//
run cpu
net
.
RunOp
(
DeviceType
::
NEON
);
net
.
RunOp
();
ExpectTensorNear
<
float
>
(
expected
,
*
net
.
GetOutput
(
"Output"
),
0.001
);
ExpectTensorNear
<
float
>
(
expected
,
*
net
.
GetOutput
(
"Output"
),
0.001
);
};
};
...
@@ -262,3 +303,11 @@ TEST_F(Conv2dOpTest, UnalignedConvNxNS12) {
...
@@ -262,3 +303,11 @@ TEST_F(Conv2dOpTest, UnalignedConvNxNS12) {
}
}
}
}
}
}
TEST_F
(
Conv2dOpTest
,
NEONUnalignedConvNxNS12
)
{
TestUnalignedConvNxNS12
<
DeviceType
::
NEON
>
();
}
TEST_F
(
Conv2dOpTest
,
OPENCLUnalignedConvNxNS12
)
{
TestUnalignedConvNxNS12
<
DeviceType
::
OPENCL
>
();
}
mace/ops/depthwise_conv_2d_benchmark.cc
浏览文件 @
52cc0540
...
@@ -57,7 +57,7 @@ static void DepthwiseConv2d(int iters,
...
@@ -57,7 +57,7 @@ static void DepthwiseConv2d(int iters,
#define BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, STRIDE, P, OC, TYPE, \
#define BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, STRIDE, P, OC, TYPE, \
DEVICE) \
DEVICE) \
static void \
static void \
BM_DEPTHWISE_
CONV_
2D_##N##_##C##_##H##_##W##_K##KH##x##KW##S##STRIDE##_##P##_##OC##_##TYPE##_##DEVICE( \
BM_DEPTHWISE_2D_##N##_##C##_##H##_##W##_K##KH##x##KW##S##STRIDE##_##P##_##OC##_##TYPE##_##DEVICE( \
int iters) { \
int iters) { \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W; \
mace::testing::ItemsProcessed(tot); \
mace::testing::ItemsProcessed(tot); \
...
@@ -66,7 +66,7 @@ static void DepthwiseConv2d(int iters,
...
@@ -66,7 +66,7 @@ static void DepthwiseConv2d(int iters,
mace::Padding::P, OC); \
mace::Padding::P, OC); \
} \
} \
BENCHMARK( \
BENCHMARK( \
BM_DEPTHWISE_
CONV_
2D_##N##_##C##_##H##_##W##_K##KH##x##KW##S##STRIDE##_##P##_##OC##_##TYPE##_##DEVICE)
BM_DEPTHWISE_2D_##N##_##C##_##H##_##W##_K##KH##x##KW##S##STRIDE##_##P##_##OC##_##TYPE##_##DEVICE)
#define BM_DEPTHWISE_CONV_2D(N, C, H, W, KH, KW, S, P, OC, TYPE) \
#define BM_DEPTHWISE_CONV_2D(N, C, H, W, KH, KW, S, P, OC, TYPE) \
BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, OC, TYPE, CPU); \
BM_DEPTHWISE_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, OC, TYPE, CPU); \
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录