Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
MegEngine 天元
MegEngine
提交
fa488338
MegEngine
项目概览
MegEngine 天元
/
MegEngine
大约 1 年 前同步成功
通知
396
Star
4705
Fork
582
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
MegEngine
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
提交
fa488338
编写于
10月 26, 2022
作者:
M
Megvii Engine Team
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
feat(dnn,imperative): remove the restriction of tensor shape when using uint8 region mask
GitOrigin-RevId: 37d99006978d756d22111347be2ab2d895f71661
上级
cdf7baa2
变更
4
隐藏空白更改
内联
并排
Showing
4 changed file
with
258 addition
and
142 deletion
+258
-142
dnn/src/cuda/region_restricted_convolution/chanwise/depthwise_large_filter_algo.cuh
...cted_convolution/chanwise/depthwise_large_filter_algo.cuh
+28
-21
dnn/src/cuda/region_restricted_convolution/opr_impl.cpp
dnn/src/cuda/region_restricted_convolution/opr_impl.cpp
+1
-10
dnn/test/cuda/region_restricted_convolution.cpp
dnn/test/cuda/region_restricted_convolution.cpp
+227
-109
imperative/python/test/unit/functional/test_functional.py
imperative/python/test/unit/functional/test_functional.py
+2
-2
未找到文件。
dnn/src/cuda/region_restricted_convolution/chanwise/depthwise_large_filter_algo.cuh
浏览文件 @
fa488338
...
...
@@ -784,20 +784,24 @@ __global__ void DepthwiseConv2dGPUKernelNCHW(
static_assert
((
OutTileConfig
::
unroll_w
&
3
)
==
0
,
"output tile unroll_w & 3 != 0"
);
static_assert
((
OutTileConfig
::
block_w
&
3
)
==
0
,
"output block_w & 3 != 0"
);
int
reg_rout
[
OutTileConfig
::
unroll_size
]
=
{
0
};
int
relative_offset
=
sizeof
(
dt_int32
)
/
sizeof
(
dt_uint8
);
#pragma unroll
for
(
int
i
=
0
;
i
<
OutTileConfig
::
unroll_h
;
++
i
)
{
int
out_h_idx
=
out_base_h_idx
+
i
;
if
(
out_h_idx
<
param
.
out_h
)
{
#pragma unroll
for
(
int
j
=
0
;
j
<
OutTileConfig
::
unroll_w
;
j
+=
4
)
{
for
(
int
j
=
0
;
j
<
OutTileConfig
::
unroll_w
;
j
+=
relative_offset
)
{
int
out_w_idx
=
out_start_w
+
j
;
if
(
out_w_idx
<
param
.
out_w
)
{
uint32_t
val
=
*
(
reinterpret_cast
<
const
uint32_t
*>
(
&
rout_base_ptr
[
out_h_idx
*
param
.
out_w
+
out_w_idx
]));
reg_rout
[
i
*
OutTileConfig
::
unroll_w
+
j
]
=
val
&
0xff
;
reg_rout
[
i
*
OutTileConfig
::
unroll_w
+
j
+
1
]
=
(
val
>>
8
)
&
0xff
;
reg_rout
[
i
*
OutTileConfig
::
unroll_w
+
j
+
2
]
=
(
val
>>
16
)
&
0xff
;
reg_rout
[
i
*
OutTileConfig
::
unroll_w
+
j
+
3
]
=
(
val
>>
24
)
&
0xff
;
int
valid_offset
=
relative_offset
+
out_w_idx
>
param
.
out_w
?
param
.
out_w
-
out_w_idx
:
relative_offset
;
#pragma unroll
for
(
int
t
=
0
;
t
<
valid_offset
;
t
+=
1
)
{
uint8_t
val
=
rout_base_ptr
[
out_h_idx
*
param
.
out_w
+
out_w_idx
+
t
];
reg_rout
[
i
*
OutTileConfig
::
unroll_w
+
j
+
t
]
=
val
&
0xff
;
}
}
}
}
...
...
@@ -855,21 +859,23 @@ __global__ void DepthwiseConv2dGPUKernelNCHW(
int
s_idx
=
(
off_oh
*
stride_h
+
s_h
)
%
SrcTileCount
::
smem_h
*
SrcTileCount
::
smem_w
+
(
off_oh
*
stride_h
+
s_h
)
/
SrcTileCount
::
bank_offset_line
;
int
r_idx
=
(
off_oh
*
stride_h
+
s_h
)
%
RinTileCount
::
smem_h
*
RinTileCount
::
smem_w
+
(
off_oh
*
stride_h
+
s_h
)
/
RinTileCount
::
bank_offset_line
;
#pragma unroll
for
(
int
s_w
=
0
;
s_w
<
irin_unroll_w
;
s_w
+=
4
)
{
uint32_t
val
=
smem_rin_ptr
[(
off_oh
*
stride_h
+
s_h
)
%
RinTileCount
::
smem_h
*
RinTileCount
::
smem_w
+
(
s_w
>>
2
)
+
(
off_oh
*
stride_h
+
s_h
)
/
RinTileCount
::
bank_offset_line
];
reg_src
[
0
][
s_h
*
irin_unroll_w
+
s_w
]
=
smem_src_ptr
[
s_idx
+
s_w
];
reg_src
[
0
][
s_h
*
irin_unroll_w
+
s_w
+
1
]
=
smem_src_ptr
[
s_idx
+
s_w
+
1
];
reg_src
[
0
][
s_h
*
irin_unroll_w
+
s_w
+
2
]
=
smem_src_ptr
[
s_idx
+
s_w
+
2
];
reg_src
[
0
][
s_h
*
irin_unroll_w
+
s_w
+
3
]
=
smem_src_ptr
[
s_idx
+
s_w
+
3
];
reg_rin
[
0
][
s_h
*
irin_unroll_w
+
s_w
]
=
val
&
0xff
;
reg_rin
[
0
][
s_h
*
irin_unroll_w
+
s_w
+
1
]
=
(
val
>>
8
)
&
0xff
;
reg_rin
[
0
][
s_h
*
irin_unroll_w
+
s_w
+
2
]
=
(
val
>>
16
)
&
0xff
;
reg_rin
[
0
][
s_h
*
irin_unroll_w
+
s_w
+
3
]
=
(
val
>>
24
)
&
0xff
;
for
(
int
s_w
=
0
;
s_w
<
SrcTileConfig
::
unroll_w
;
++
s_w
)
{
reg_src
[
0
][
s_h
*
SrcTileConfig
::
unroll_w
+
s_w
]
=
smem_src_ptr
[
s_idx
+
s_w
];
}
#pragma unroll
for
(
int
s_w
=
0
;
s_w
<
irin_unroll_w
;
s_w
+=
relative_offset
)
{
reg_rin
[
0
][
s_h
*
irin_unroll_w
+
s_w
]
=
(
smem_rin_ptr
[
r_idx
+
(
s_w
>>
2
)])
&
0xff
;
reg_rin
[
0
][
s_h
*
irin_unroll_w
+
s_w
+
1
]
=
(
smem_rin_ptr
[
r_idx
+
(
s_w
>>
2
)]
>>
8
)
&
0xff
;
reg_rin
[
0
][
s_h
*
irin_unroll_w
+
s_w
+
2
]
=
(
smem_rin_ptr
[
r_idx
+
(
s_w
>>
2
)]
>>
16
)
&
0xff
;
reg_rin
[
0
][
s_h
*
irin_unroll_w
+
s_w
+
3
]
=
(
smem_rin_ptr
[
r_idx
+
(
s_w
>>
2
)]
>>
24
)
&
0xff
;
}
}
...
...
@@ -1108,6 +1114,7 @@ void LaunchDepthwiseConv2dGPU(
if
(
param
.
is_compute_deafult
)
{
kernel
=
DepthwiseConv2dGPUKernelNCHW
<
IConvTrait
,
kDirection
,
stride
>
;
}
else
{
printf
(
"expected dnn param compute default mode
\n
"
);
megdnn_assert_internal
(
0
);
}
if
(
is_fwd
)
{
...
...
dnn/src/cuda/region_restricted_convolution/opr_impl.cpp
浏览文件 @
fa488338
...
...
@@ -25,9 +25,6 @@ void RegionRestrictedConvolutionForwardImpl::exec(
fm
.
spatial_ndim
==
2
&&
fm
.
icpg
==
1
&&
fm
.
ocpg
==
1
&&
fm
.
dilation
[
0
]
==
1
&&
fm
.
dilation
[
1
]
==
1
&&
!
fm
.
should_flip
&&
param
().
stride_h
==
1
&&
param
().
stride_w
==
1
);
if
(
rin
.
layout
.
dtype
==
dtype
::
Uint8
())
{
megdnn_assert
((
src
.
layout
.
shape
[
3
]
&
3
)
==
0
&&
(
dst
.
layout
.
shape
[
3
]
&
3
)
==
0
);
}
auto
stream
=
cuda_stream
(
handle
());
...
...
@@ -43,6 +40,7 @@ void RegionRestrictedConvolutionForwardImpl::exec(
dst
.
ptr
<
float
>
(),
src
.
ptr
<
float
>
(),
filter
.
ptr
<
float
>
(),
rin
.
ptr
<
uint8_t
>
(),
rout
.
ptr
<
uint8_t
>
(),
kparam
,
stream
);
}
else
{
printf
(
"unexpected region restricted conv mode
\n
"
);
megdnn_assert_internal
(
0
);
}
}
...
...
@@ -81,11 +79,6 @@ void RegionRestrictedConvolutionBackwardDataImpl::exec(
fm
.
spatial_ndim
==
2
&&
fm
.
icpg
==
1
&&
fm
.
ocpg
==
1
&&
fm
.
dilation
[
0
]
==
1
&&
fm
.
dilation
[
1
]
==
1
&&
!
fm
.
should_flip
&&
param
().
stride_h
==
1
&&
param
().
stride_w
==
1
);
// NOTE: uint8 dtype region mask requires the spatial size of src&dst is 4*N
if
(
rin
.
layout
.
dtype
==
dtype
::
Uint8
())
{
megdnn_assert
(
(
grad
.
layout
.
shape
[
3
]
&
3
)
==
0
&&
(
diff
.
layout
.
shape
[
3
]
&
3
)
==
0
);
}
auto
stream
=
cuda_stream
(
handle
());
if
(
filter
.
layout
.
dtype
==
dtype
::
Float32
()
&&
rin
.
layout
.
dtype
==
dtype
::
Int32
()
&&
rout
.
layout
.
dtype
==
dtype
::
Int32
())
{
...
...
@@ -135,8 +128,6 @@ void RegionRestrictedConvolutionBackwardFilterImpl::exec(
int
ph
=
fm
.
padding
[
0
],
pw
=
fm
.
padding
[
1
];
int
dh
=
0
,
dw
=
0
;
// check if channelwise convolution
megdnn_assert
(
fm
.
icpg
==
1
&&
fm
.
ocpg
==
1
);
auto
stream
=
cuda_stream
(
handle
());
float
alpha
=
1.
f
;
...
...
dnn/test/cuda/region_restricted_convolution.cpp
浏览文件 @
fa488338
#include "megdnn/dtype.h"
#include "megdnn/opr_param_defs.h"
#include "megdnn/oprs.h"
#include "megdnn/oprs/nn.h"
#include "test/common/checker.h"
#include "test/common/conv_bias.h"
#include "test/common/rng.h"
...
...
@@ -11,6 +12,7 @@
#include "test/cuda/utils.h"
#include <cudnn.h>
#include <gtest/gtest.h>
#define V1(x) #x
#define V(x) V1(x)
...
...
@@ -37,9 +39,6 @@ TEST_F(CUDA, REGION_RESTRICTED_CONV_FORWARD_LARGE_FILTER) {
UniformIntRNG
r_rng
{
0
,
2
};
checker
.
set_rng
(
0
,
&
rng
).
set_rng
(
1
,
&
rng
).
set_rng
(
2
,
&
r_rng
).
set_rng
(
3
,
&
r_rng
);
if
(
dt
.
enumv
()
==
DTypeEnum
::
Float16
)
{
checker
.
set_epsilon
(
1e-1
);
}
cur_param
.
pad_h
=
cur_param
.
pad_w
=
padding
;
cur_param
.
stride_h
=
cur_param
.
stride_w
=
stride
;
...
...
@@ -49,11 +48,32 @@ TEST_F(CUDA, REGION_RESTRICTED_CONV_FORWARD_LARGE_FILTER) {
checker
.
set_param
(
cur_param
).
execs
(
{{
n
,
g
,
h
,
h
},
{
g
,
1
,
1
,
fh
,
fh
},
{
n
,
h
,
h
},
{
n
,
ho
,
ho
},
{}});
};
run
(
1
,
1
,
3
,
2
,
1
,
1
);
run
(
1
,
1
,
5
,
2
,
1
,
1
);
run
(
1
,
1
,
6
,
2
,
1
,
1
);
run
(
1
,
1
,
7
,
2
,
1
,
1
);
run
(
1
,
1
,
9
,
2
,
1
,
1
);
run
(
1
,
1
,
10
,
2
,
1
,
1
);
run
(
1
,
1
,
11
,
2
,
1
,
1
);
run
(
1
,
1
,
13
,
2
,
1
,
1
);
run
(
1
,
1
,
14
,
2
,
1
,
1
);
run
(
1
,
1
,
15
,
2
,
1
,
1
);
run
(
1
,
1
,
17
,
2
,
1
,
1
);
run
(
1
,
1
,
18
,
2
,
1
,
1
);
run
(
1
,
1
,
19
,
2
,
1
,
1
);
run
(
1
,
1
,
21
,
2
,
1
,
1
);
run
(
1
,
1
,
22
,
2
,
1
,
1
);
run
(
1
,
1
,
23
,
2
,
1
,
1
);
run
(
1
,
1
,
25
,
2
,
1
,
1
);
run
(
1
,
1
,
26
,
2
,
1
,
1
);
run
(
1
,
1
,
27
,
2
,
1
,
1
);
run
(
1
,
1
,
29
,
2
,
1
,
1
);
run
(
1
,
1
,
30
,
2
,
1
,
1
);
run
(
1
,
1
,
31
,
2
,
1
,
1
);
run
(
4
,
8
,
32
,
3
,
3
/
2
,
1
);
run
(
4
,
8
,
32
,
5
,
5
/
2
,
1
);
run
(
4
,
8
,
32
,
7
,
7
/
2
,
1
);
run
(
1
,
2
,
32
,
9
,
9
/
2
,
1
);
run
(
4
,
1
,
32
,
9
,
9
/
2
,
1
);
run
(
4
,
8
,
32
,
9
,
9
/
2
,
1
);
run
(
4
,
8
,
32
,
11
,
11
/
2
,
1
);
run
(
4
,
8
,
32
,
13
,
13
/
2
,
1
);
run
(
4
,
8
,
32
,
15
,
15
/
2
,
1
);
...
...
@@ -65,12 +85,27 @@ TEST_F(CUDA, REGION_RESTRICTED_CONV_FORWARD_LARGE_FILTER) {
run
(
4
,
8
,
32
,
27
,
27
/
2
,
1
);
run
(
4
,
8
,
32
,
29
,
29
/
2
,
1
);
run
(
4
,
8
,
32
,
31
,
31
/
2
,
1
);
run
(
4
,
8
,
31
,
3
,
3
/
2
,
1
);
run
(
4
,
8
,
31
,
5
,
5
/
2
,
1
);
run
(
4
,
8
,
31
,
7
,
7
/
2
,
1
);
run
(
4
,
8
,
31
,
9
,
9
/
2
,
1
);
run
(
4
,
8
,
31
,
11
,
11
/
2
,
1
);
run
(
4
,
8
,
31
,
13
,
13
/
2
,
1
);
run
(
4
,
8
,
31
,
15
,
15
/
2
,
1
);
run
(
4
,
8
,
31
,
17
,
17
/
2
,
1
);
run
(
4
,
8
,
31
,
19
,
19
/
2
,
1
);
run
(
4
,
8
,
31
,
21
,
21
/
2
,
1
);
run
(
4
,
8
,
31
,
23
,
23
/
2
,
1
);
run
(
4
,
8
,
31
,
25
,
25
/
2
,
1
);
run
(
4
,
8
,
31
,
27
,
27
/
2
,
1
);
run
(
4
,
8
,
31
,
29
,
29
/
2
,
1
);
run
(
4
,
8
,
31
,
31
,
31
/
2
,
1
);
}
}
#if MEGDNN_WITH_BENCHMARK
TEST_F
(
CUDA
,
BENCHMARK_REGION_RESTRICTED_CONV_FORWARD_LARGE_FILTER_FP32
)
{
TEST_F
(
CUDA
,
BENCHMARK_REGION_RESTRICTED_CONV_FORWARD_LARGE_FILTER_FP32
_INT32
)
{
require_compute_capability
(
7
,
5
);
Benchmarker
<
ConvBiasForward
>
bencher
(
handle_cuda
());
bencher
.
set_display
(
false
);
...
...
@@ -153,24 +188,24 @@ TEST_F(CUDA, BENCHMARK_REGION_RESTRICTED_CONV_FORWARD_LARGE_FILTER_FP32) {
time_in_ms
/
rr_time_in_ms
);
};
run_bench
(
64
,
384
,
32
,
32
,
3
,
3
,
1
,
1
,
10
);
run_bench
(
64
,
384
,
32
,
32
,
5
,
5
,
1
,
1
,
10
);
run_bench
(
64
,
384
,
32
,
32
,
7
,
7
,
1
,
1
,
10
);
run_bench
(
64
,
384
,
32
,
32
,
9
,
9
,
1
,
1
,
10
);
run_bench
(
64
,
384
,
32
,
32
,
11
,
11
,
1
,
1
,
10
);
run_bench
(
64
,
384
,
32
,
32
,
13
,
13
,
1
,
1
,
10
);
run_bench
(
64
,
384
,
32
,
32
,
15
,
15
,
1
,
1
,
10
);
run_bench
(
64
,
384
,
32
,
32
,
17
,
17
,
1
,
1
,
10
);
run_bench
(
64
,
384
,
32
,
32
,
19
,
19
,
1
,
1
,
10
);
run_bench
(
64
,
384
,
32
,
32
,
21
,
21
,
1
,
1
,
10
);
run_bench
(
64
,
384
,
32
,
32
,
23
,
23
,
1
,
1
,
10
);
run_bench
(
64
,
384
,
32
,
32
,
25
,
25
,
1
,
1
,
10
);
run_bench
(
64
,
384
,
32
,
32
,
27
,
27
,
1
,
1
,
10
);
run_bench
(
64
,
384
,
32
,
32
,
29
,
29
,
1
,
1
,
10
);
run_bench
(
64
,
384
,
32
,
32
,
31
,
31
,
1
,
1
,
10
);
run_bench
(
64
,
384
,
32
,
32
,
3
,
3
,
1
,
1
,
10
00
);
run_bench
(
64
,
384
,
32
,
32
,
5
,
5
,
1
,
1
,
10
00
);
run_bench
(
64
,
384
,
32
,
32
,
7
,
7
,
1
,
1
,
10
00
);
run_bench
(
64
,
384
,
32
,
32
,
9
,
9
,
1
,
1
,
10
00
);
run_bench
(
64
,
384
,
32
,
32
,
11
,
11
,
1
,
1
,
10
00
);
run_bench
(
64
,
384
,
32
,
32
,
13
,
13
,
1
,
1
,
10
00
);
run_bench
(
64
,
384
,
32
,
32
,
15
,
15
,
1
,
1
,
10
00
);
run_bench
(
64
,
384
,
32
,
32
,
17
,
17
,
1
,
1
,
10
00
);
run_bench
(
64
,
384
,
32
,
32
,
19
,
19
,
1
,
1
,
10
00
);
run_bench
(
64
,
384
,
32
,
32
,
21
,
21
,
1
,
1
,
10
00
);
run_bench
(
64
,
384
,
32
,
32
,
23
,
23
,
1
,
1
,
10
00
);
run_bench
(
64
,
384
,
32
,
32
,
25
,
25
,
1
,
1
,
10
00
);
run_bench
(
64
,
384
,
32
,
32
,
27
,
27
,
1
,
1
,
10
00
);
run_bench
(
64
,
384
,
32
,
32
,
29
,
29
,
1
,
1
,
10
00
);
run_bench
(
64
,
384
,
32
,
32
,
31
,
31
,
1
,
1
,
10
00
);
}
TEST_F
(
CUDA
,
BENCHMARK_REGION_RESTRICTED_CONV_BACKWARD_
LARGE_FILTER_FP
32
)
{
TEST_F
(
CUDA
,
BENCHMARK_REGION_RESTRICTED_CONV_BACKWARD_
DATA_FP32_INT
32
)
{
require_compute_capability
(
7
,
5
);
Benchmarker
<
ConvolutionBackwardData
>
bencher
(
handle_cuda
());
bencher
.
set_display
(
false
);
...
...
@@ -251,24 +286,24 @@ TEST_F(CUDA, BENCHMARK_REGION_RESTRICTED_CONV_BACKWARD_LARGE_FILTER_FP32) {
time_in_ms
/
rr_time_in_ms
);
};
run_bench
(
64
,
384
,
32
,
32
,
3
,
3
,
1
,
1
,
10
);
run_bench
(
64
,
384
,
32
,
32
,
5
,
5
,
1
,
1
,
10
);
run_bench
(
64
,
384
,
32
,
32
,
7
,
7
,
1
,
1
,
10
);
run_bench
(
64
,
384
,
32
,
32
,
9
,
9
,
1
,
1
,
10
);
run_bench
(
64
,
384
,
32
,
32
,
11
,
11
,
1
,
1
,
10
);
run_bench
(
64
,
384
,
32
,
32
,
13
,
13
,
1
,
1
,
10
);
run_bench
(
64
,
384
,
32
,
32
,
15
,
15
,
1
,
1
,
10
);
run_bench
(
64
,
384
,
32
,
32
,
17
,
17
,
1
,
1
,
10
);
run_bench
(
64
,
384
,
32
,
32
,
19
,
19
,
1
,
1
,
10
);
run_bench
(
64
,
384
,
32
,
32
,
21
,
21
,
1
,
1
,
10
);
run_bench
(
64
,
384
,
32
,
32
,
23
,
23
,
1
,
1
,
10
);
run_bench
(
64
,
384
,
32
,
32
,
25
,
25
,
1
,
1
,
10
);
run_bench
(
64
,
384
,
32
,
32
,
27
,
27
,
1
,
1
,
10
);
run_bench
(
64
,
384
,
32
,
32
,
29
,
29
,
1
,
1
,
10
);
run_bench
(
64
,
384
,
32
,
32
,
31
,
31
,
1
,
1
,
10
);
run_bench
(
64
,
384
,
32
,
32
,
3
,
3
,
1
,
1
,
10
00
);
run_bench
(
64
,
384
,
32
,
32
,
5
,
5
,
1
,
1
,
10
00
);
run_bench
(
64
,
384
,
32
,
32
,
7
,
7
,
1
,
1
,
10
00
);
run_bench
(
64
,
384
,
32
,
32
,
9
,
9
,
1
,
1
,
10
00
);
run_bench
(
64
,
384
,
32
,
32
,
11
,
11
,
1
,
1
,
10
00
);
run_bench
(
64
,
384
,
32
,
32
,
13
,
13
,
1
,
1
,
10
00
);
run_bench
(
64
,
384
,
32
,
32
,
15
,
15
,
1
,
1
,
10
00
);
run_bench
(
64
,
384
,
32
,
32
,
17
,
17
,
1
,
1
,
10
00
);
run_bench
(
64
,
384
,
32
,
32
,
19
,
19
,
1
,
1
,
10
00
);
run_bench
(
64
,
384
,
32
,
32
,
21
,
21
,
1
,
1
,
10
00
);
run_bench
(
64
,
384
,
32
,
32
,
23
,
23
,
1
,
1
,
10
00
);
run_bench
(
64
,
384
,
32
,
32
,
25
,
25
,
1
,
1
,
10
00
);
run_bench
(
64
,
384
,
32
,
32
,
27
,
27
,
1
,
1
,
10
00
);
run_bench
(
64
,
384
,
32
,
32
,
29
,
29
,
1
,
1
,
10
00
);
run_bench
(
64
,
384
,
32
,
32
,
31
,
31
,
1
,
1
,
10
00
);
}
TEST_F
(
CUDA
,
BENCHMARK_REGION_RESTRICTED_CONV_BACKWARD_
LARGE_FILTER
_FP32_UINT8
)
{
TEST_F
(
CUDA
,
BENCHMARK_REGION_RESTRICTED_CONV_BACKWARD_
DATA
_FP32_UINT8
)
{
require_compute_capability
(
7
,
5
);
Benchmarker
<
ConvolutionBackwardData
>
bencher
(
handle_cuda
());
bencher
.
set_display
(
false
);
...
...
@@ -349,21 +384,36 @@ TEST_F(CUDA, BENCHMARK_REGION_RESTRICTED_CONV_BACKWARD_LARGE_FILTER_FP32_UINT8)
time_in_ms
/
rr_time_in_ms
);
};
run_bench
(
64
,
384
,
32
,
32
,
3
,
3
,
1
,
1
,
10
);
run_bench
(
64
,
384
,
32
,
32
,
5
,
5
,
1
,
1
,
10
);
run_bench
(
64
,
384
,
32
,
32
,
7
,
7
,
1
,
1
,
10
);
run_bench
(
64
,
384
,
32
,
32
,
9
,
9
,
1
,
1
,
10
);
run_bench
(
64
,
384
,
32
,
32
,
11
,
11
,
1
,
1
,
10
);
run_bench
(
64
,
384
,
32
,
32
,
13
,
13
,
1
,
1
,
10
);
run_bench
(
64
,
384
,
32
,
32
,
15
,
15
,
1
,
1
,
10
);
run_bench
(
64
,
384
,
32
,
32
,
17
,
17
,
1
,
1
,
10
);
run_bench
(
64
,
384
,
32
,
32
,
19
,
19
,
1
,
1
,
10
);
run_bench
(
64
,
384
,
32
,
32
,
21
,
21
,
1
,
1
,
10
);
run_bench
(
64
,
384
,
32
,
32
,
23
,
23
,
1
,
1
,
10
);
run_bench
(
64
,
384
,
32
,
32
,
25
,
25
,
1
,
1
,
10
);
run_bench
(
64
,
384
,
32
,
32
,
27
,
27
,
1
,
1
,
10
);
run_bench
(
64
,
384
,
32
,
32
,
29
,
29
,
1
,
1
,
10
);
run_bench
(
64
,
384
,
32
,
32
,
31
,
31
,
1
,
1
,
10
);
run_bench
(
64
,
384
,
32
,
32
,
3
,
3
,
1
,
1
,
1000
);
run_bench
(
64
,
384
,
32
,
32
,
5
,
5
,
1
,
1
,
1000
);
run_bench
(
64
,
384
,
32
,
32
,
7
,
7
,
1
,
1
,
1000
);
run_bench
(
64
,
384
,
32
,
32
,
9
,
9
,
1
,
1
,
1000
);
run_bench
(
64
,
384
,
32
,
32
,
11
,
11
,
1
,
1
,
1000
);
run_bench
(
64
,
384
,
32
,
32
,
13
,
13
,
1
,
1
,
1000
);
run_bench
(
64
,
384
,
32
,
32
,
15
,
15
,
1
,
1
,
1000
);
run_bench
(
64
,
384
,
32
,
32
,
17
,
17
,
1
,
1
,
1000
);
run_bench
(
64
,
384
,
32
,
32
,
19
,
19
,
1
,
1
,
1000
);
run_bench
(
64
,
384
,
32
,
32
,
21
,
21
,
1
,
1
,
1000
);
run_bench
(
64
,
384
,
32
,
32
,
23
,
23
,
1
,
1
,
1000
);
run_bench
(
64
,
384
,
32
,
32
,
25
,
25
,
1
,
1
,
1000
);
run_bench
(
64
,
384
,
32
,
32
,
27
,
27
,
1
,
1
,
1000
);
run_bench
(
64
,
384
,
32
,
32
,
29
,
29
,
1
,
1
,
1000
);
run_bench
(
64
,
384
,
32
,
32
,
31
,
31
,
1
,
1
,
1000
);
run_bench
(
64
,
384
,
31
,
31
,
3
,
3
,
1
,
1
,
1000
);
run_bench
(
64
,
384
,
31
,
31
,
5
,
5
,
1
,
1
,
1000
);
run_bench
(
64
,
384
,
31
,
31
,
7
,
7
,
1
,
1
,
1000
);
run_bench
(
64
,
384
,
31
,
31
,
9
,
9
,
1
,
1
,
1000
);
run_bench
(
64
,
384
,
31
,
31
,
11
,
11
,
1
,
1
,
1000
);
run_bench
(
64
,
384
,
31
,
31
,
13
,
13
,
1
,
1
,
1000
);
run_bench
(
64
,
384
,
31
,
31
,
15
,
15
,
1
,
1
,
1000
);
run_bench
(
64
,
384
,
31
,
31
,
17
,
17
,
1
,
1
,
1000
);
run_bench
(
64
,
384
,
31
,
31
,
19
,
19
,
1
,
1
,
1000
);
run_bench
(
64
,
384
,
31
,
31
,
21
,
21
,
1
,
1
,
1000
);
run_bench
(
64
,
384
,
31
,
31
,
23
,
23
,
1
,
1
,
1000
);
run_bench
(
64
,
384
,
31
,
31
,
25
,
25
,
1
,
1
,
1000
);
run_bench
(
64
,
384
,
31
,
31
,
27
,
27
,
1
,
1
,
1000
);
run_bench
(
64
,
384
,
31
,
31
,
29
,
29
,
1
,
1
,
1000
);
run_bench
(
64
,
384
,
31
,
31
,
31
,
31
,
1
,
1
,
1000
);
}
TEST_F
(
CUDA
,
BENCHMARK_REGION_RESTRICTED_CONV_FORWARD_LARGE_FILTER_UINT8
)
{
...
...
@@ -449,21 +499,36 @@ TEST_F(CUDA, BENCHMARK_REGION_RESTRICTED_CONV_FORWARD_LARGE_FILTER_UINT8) {
time_in_ms
/
rr_time_in_ms
);
};
run_bench
(
64
,
384
,
32
,
32
,
3
,
3
,
1
,
1
,
10
);
run_bench
(
64
,
384
,
32
,
32
,
5
,
5
,
1
,
1
,
10
);
run_bench
(
64
,
384
,
32
,
32
,
7
,
7
,
1
,
1
,
10
);
run_bench
(
64
,
384
,
32
,
32
,
9
,
9
,
1
,
1
,
10
);
run_bench
(
64
,
384
,
32
,
32
,
11
,
11
,
1
,
1
,
10
);
run_bench
(
64
,
384
,
32
,
32
,
13
,
13
,
1
,
1
,
10
);
run_bench
(
64
,
384
,
32
,
32
,
15
,
15
,
1
,
1
,
10
);
run_bench
(
64
,
384
,
32
,
32
,
17
,
17
,
1
,
1
,
10
);
run_bench
(
64
,
384
,
32
,
32
,
19
,
19
,
1
,
1
,
10
);
run_bench
(
64
,
384
,
32
,
32
,
21
,
21
,
1
,
1
,
10
);
run_bench
(
64
,
384
,
32
,
32
,
23
,
23
,
1
,
1
,
10
);
run_bench
(
64
,
384
,
32
,
32
,
25
,
25
,
1
,
1
,
10
);
run_bench
(
64
,
384
,
32
,
32
,
27
,
27
,
1
,
1
,
10
);
run_bench
(
64
,
384
,
32
,
32
,
29
,
29
,
1
,
1
,
10
);
run_bench
(
64
,
384
,
32
,
32
,
31
,
31
,
1
,
1
,
10
);
run_bench
(
64
,
384
,
32
,
32
,
3
,
3
,
1
,
1
,
1000
);
run_bench
(
64
,
384
,
32
,
32
,
5
,
5
,
1
,
1
,
1000
);
run_bench
(
64
,
384
,
32
,
32
,
7
,
7
,
1
,
1
,
1000
);
run_bench
(
64
,
384
,
32
,
32
,
9
,
9
,
1
,
1
,
1000
);
run_bench
(
64
,
384
,
32
,
32
,
11
,
11
,
1
,
1
,
1000
);
run_bench
(
64
,
384
,
32
,
32
,
13
,
13
,
1
,
1
,
1000
);
run_bench
(
64
,
384
,
32
,
32
,
15
,
15
,
1
,
1
,
1000
);
run_bench
(
64
,
384
,
32
,
32
,
17
,
17
,
1
,
1
,
1000
);
run_bench
(
64
,
384
,
32
,
32
,
19
,
19
,
1
,
1
,
1000
);
run_bench
(
64
,
384
,
32
,
32
,
21
,
21
,
1
,
1
,
1000
);
run_bench
(
64
,
384
,
32
,
32
,
23
,
23
,
1
,
1
,
1000
);
run_bench
(
64
,
384
,
32
,
32
,
25
,
25
,
1
,
1
,
1000
);
run_bench
(
64
,
384
,
32
,
32
,
27
,
27
,
1
,
1
,
1000
);
run_bench
(
64
,
384
,
32
,
32
,
29
,
29
,
1
,
1
,
1000
);
run_bench
(
64
,
384
,
32
,
32
,
31
,
31
,
1
,
1
,
1000
);
run_bench
(
64
,
384
,
31
,
31
,
3
,
3
,
1
,
1
,
1000
);
run_bench
(
64
,
384
,
31
,
31
,
5
,
5
,
1
,
1
,
1000
);
run_bench
(
64
,
384
,
31
,
31
,
7
,
7
,
1
,
1
,
1000
);
run_bench
(
64
,
384
,
31
,
31
,
9
,
9
,
1
,
1
,
1000
);
run_bench
(
64
,
384
,
31
,
31
,
11
,
11
,
1
,
1
,
1000
);
run_bench
(
64
,
384
,
31
,
31
,
13
,
13
,
1
,
1
,
1000
);
run_bench
(
64
,
384
,
31
,
31
,
15
,
15
,
1
,
1
,
1000
);
run_bench
(
64
,
384
,
31
,
31
,
17
,
17
,
1
,
1
,
1000
);
run_bench
(
64
,
384
,
31
,
31
,
19
,
19
,
1
,
1
,
1000
);
run_bench
(
64
,
384
,
31
,
31
,
21
,
21
,
1
,
1
,
1000
);
run_bench
(
64
,
384
,
31
,
31
,
23
,
23
,
1
,
1
,
1000
);
run_bench
(
64
,
384
,
31
,
31
,
25
,
25
,
1
,
1
,
1000
);
run_bench
(
64
,
384
,
31
,
31
,
27
,
27
,
1
,
1
,
1000
);
run_bench
(
64
,
384
,
31
,
31
,
29
,
29
,
1
,
1
,
1000
);
run_bench
(
64
,
384
,
31
,
31
,
31
,
31
,
1
,
1
,
1000
);
}
TEST_F
(
CUDA
,
BENCHMARK_REGION_RESTRICTED_CONV_BACKWARD_FILTER_FP32
)
{
...
...
@@ -538,7 +603,7 @@ TEST_F(CUDA, BENCHMARK_REGION_RESTRICTED_CONV_BACKWARD_FILTER_FP32) {
auto
rr_time_in_ms
=
rr_bencher
.
execs
({
src
,
diff
,
rin
,
rout
,
grad
})
/
nr_times
;
auto
rr_ops
=
2.0
*
batch
*
g
*
hi
*
wi
*
fh
*
fw
/
(
rr_time_in_ms
*
1e-3
)
*
1e-12
;
printf
(
"[
D
GRAD]RegionRestrictedDepthwiseLargeFilter vs DepthwiseLargeFilter: "
printf
(
"[
W
GRAD]RegionRestrictedDepthwiseLargeFilter vs DepthwiseLargeFilter: "
"src=%s, "
"diff=%s, grad=%s
\n
"
"time: %.2f ms, time(rr): %.2f ms, perf: %.2fTops, perf(rr): %.2f Tops
\n
"
...
...
@@ -638,7 +703,7 @@ TEST_F(CUDA, BENCHMARK_REGION_RESTRICTED_CONV_BACKWARD_FILTER_FP32_RINT8) {
auto
rr_time_in_ms
=
rr_bencher
.
execs
({
src
,
diff
,
rin
,
rout
,
grad
})
/
nr_times
;
auto
rr_ops
=
2.0
*
batch
*
g
*
hi
*
wi
*
fh
*
fw
/
(
rr_time_in_ms
*
1e-3
)
*
1e-12
;
printf
(
"[
D
GRAD]RegionRestrictedDepthwiseLargeFilter vs DepthwiseLargeFilter: "
printf
(
"[
W
GRAD]RegionRestrictedDepthwiseLargeFilter vs DepthwiseLargeFilter: "
"src=%s, "
"diff=%s, grad=%s
\n
"
"time: %.2f ms, time(rr): %.2f ms, perf: %.2fTops, perf(rr): %.2f Tops
\n
"
...
...
@@ -703,15 +768,30 @@ TEST_F(CUDA, REGION_RESTRICTED_CONV_BWD_DATA_FP32) {
{
n
,
g
*
1
,
ih
,
ih
}
// grad
});
};
if
(
dt
==
dtype
::
Int32
())
{
run
(
4
,
8
,
32
,
5
,
5
/
2
,
1
);
run
(
1
,
2
,
2
,
2
,
0
,
1
);
run
(
1
,
2
,
3
,
3
,
0
,
1
);
run
(
1
,
2
,
4
,
4
,
0
,
1
);
run
(
1
,
2
,
5
,
5
,
0
,
1
);
run
(
1
,
2
,
6
,
6
,
0
,
1
);
run
(
1
,
2
,
7
,
7
,
0
,
1
);
}
run
(
1
,
1
,
3
,
2
,
1
,
1
);
run
(
1
,
1
,
5
,
2
,
1
,
1
);
run
(
1
,
1
,
6
,
2
,
1
,
1
);
run
(
1
,
1
,
7
,
2
,
1
,
1
);
run
(
1
,
1
,
9
,
2
,
1
,
1
);
run
(
1
,
1
,
10
,
2
,
1
,
1
);
run
(
1
,
1
,
11
,
2
,
1
,
1
);
run
(
1
,
1
,
13
,
2
,
1
,
1
);
run
(
1
,
1
,
14
,
2
,
1
,
1
);
run
(
1
,
1
,
15
,
2
,
1
,
1
);
run
(
1
,
1
,
17
,
2
,
1
,
1
);
run
(
1
,
1
,
18
,
2
,
1
,
1
);
run
(
1
,
1
,
19
,
2
,
1
,
1
);
run
(
1
,
1
,
21
,
2
,
1
,
1
);
run
(
1
,
1
,
22
,
2
,
1
,
1
);
run
(
1
,
1
,
23
,
2
,
1
,
1
);
run
(
1
,
1
,
25
,
2
,
1
,
1
);
run
(
1
,
1
,
26
,
2
,
1
,
1
);
run
(
1
,
1
,
27
,
2
,
1
,
1
);
run
(
1
,
1
,
29
,
2
,
1
,
1
);
run
(
1
,
1
,
30
,
2
,
1
,
1
);
run
(
1
,
1
,
31
,
2
,
1
,
1
);
run
(
4
,
8
,
32
,
3
,
3
/
2
,
1
);
run
(
4
,
8
,
32
,
5
,
5
/
2
,
1
);
run
(
4
,
8
,
32
,
7
,
7
/
2
,
1
);
run
(
4
,
8
,
32
,
9
,
9
/
2
,
1
);
run
(
4
,
8
,
32
,
11
,
11
/
2
,
1
);
...
...
@@ -724,8 +804,22 @@ TEST_F(CUDA, REGION_RESTRICTED_CONV_BWD_DATA_FP32) {
run
(
4
,
8
,
32
,
25
,
25
/
2
,
1
);
run
(
4
,
8
,
32
,
27
,
27
/
2
,
1
);
run
(
4
,
8
,
32
,
29
,
29
/
2
,
1
);
run
(
4
,
1
,
32
,
29
,
29
/
2
,
1
);
run
(
4
,
8
,
32
,
31
,
31
/
2
,
1
);
run
(
4
,
8
,
31
,
3
,
3
/
2
,
1
);
run
(
4
,
8
,
31
,
5
,
5
/
2
,
1
);
run
(
4
,
8
,
31
,
7
,
7
/
2
,
1
);
run
(
4
,
8
,
31
,
9
,
9
/
2
,
1
);
run
(
4
,
8
,
31
,
11
,
11
/
2
,
1
);
run
(
4
,
8
,
31
,
13
,
13
/
2
,
1
);
run
(
4
,
8
,
31
,
15
,
15
/
2
,
1
);
run
(
4
,
8
,
31
,
17
,
17
/
2
,
1
);
run
(
4
,
8
,
31
,
19
,
19
/
2
,
1
);
run
(
4
,
8
,
31
,
21
,
21
/
2
,
1
);
run
(
4
,
8
,
31
,
23
,
23
/
2
,
1
);
run
(
4
,
8
,
31
,
25
,
25
/
2
,
1
);
run
(
4
,
8
,
31
,
27
,
27
/
2
,
1
);
run
(
4
,
8
,
31
,
29
,
29
/
2
,
1
);
run
(
4
,
8
,
31
,
31
,
31
/
2
,
1
);
}
}
...
...
@@ -761,16 +855,30 @@ TEST_F(CUDA, REGION_RESTRICTED_CONV_BWD_DATA_FP32_RIN_EQ_ROUT) {
/*rout*/
{
n
,
oh
,
oh
},
/*grad*/
{
n
,
g
*
1
,
ih
,
ih
}});
};
if
(
dt
==
dtype
::
Int32
())
{
// NOTE: UINT8 assert the spatial size of src&dst is 4*N
run
(
4
,
8
,
32
,
5
,
5
/
2
,
1
);
run
(
1
,
2
,
2
,
2
,
0
,
1
);
run
(
1
,
2
,
3
,
3
,
0
,
1
);
run
(
1
,
2
,
4
,
4
,
0
,
1
);
run
(
1
,
2
,
5
,
5
,
0
,
1
);
run
(
1
,
2
,
6
,
6
,
0
,
1
);
run
(
1
,
2
,
7
,
7
,
0
,
1
);
}
run
(
1
,
1
,
3
,
2
,
1
,
1
);
run
(
1
,
1
,
5
,
2
,
1
,
1
);
run
(
1
,
1
,
6
,
2
,
1
,
1
);
run
(
1
,
1
,
7
,
2
,
1
,
1
);
run
(
1
,
1
,
9
,
2
,
1
,
1
);
run
(
1
,
1
,
10
,
2
,
1
,
1
);
run
(
1
,
1
,
11
,
2
,
1
,
1
);
run
(
1
,
1
,
13
,
2
,
1
,
1
);
run
(
1
,
1
,
14
,
2
,
1
,
1
);
run
(
1
,
1
,
15
,
2
,
1
,
1
);
run
(
1
,
1
,
17
,
2
,
1
,
1
);
run
(
1
,
1
,
18
,
2
,
1
,
1
);
run
(
1
,
1
,
19
,
2
,
1
,
1
);
run
(
1
,
1
,
21
,
2
,
1
,
1
);
run
(
1
,
1
,
22
,
2
,
1
,
1
);
run
(
1
,
1
,
23
,
2
,
1
,
1
);
run
(
1
,
1
,
25
,
2
,
1
,
1
);
run
(
1
,
1
,
26
,
2
,
1
,
1
);
run
(
1
,
1
,
27
,
2
,
1
,
1
);
run
(
1
,
1
,
29
,
2
,
1
,
1
);
run
(
1
,
1
,
30
,
2
,
1
,
1
);
run
(
1
,
1
,
31
,
2
,
1
,
1
);
run
(
4
,
8
,
32
,
3
,
3
/
2
,
1
);
run
(
4
,
8
,
32
,
5
,
5
/
2
,
1
);
run
(
4
,
8
,
32
,
7
,
7
/
2
,
1
);
run
(
4
,
8
,
32
,
9
,
9
/
2
,
1
);
run
(
4
,
8
,
32
,
11
,
11
/
2
,
1
);
...
...
@@ -781,10 +889,24 @@ TEST_F(CUDA, REGION_RESTRICTED_CONV_BWD_DATA_FP32_RIN_EQ_ROUT) {
run
(
4
,
8
,
32
,
21
,
21
/
2
,
1
);
run
(
4
,
8
,
32
,
23
,
23
/
2
,
1
);
run
(
4
,
8
,
32
,
25
,
25
/
2
,
1
);
run
(
4
,
1
,
32
,
25
,
25
/
2
,
1
);
run
(
4
,
8
,
32
,
27
,
27
/
2
,
1
);
run
(
4
,
8
,
32
,
29
,
29
/
2
,
1
);
run
(
4
,
8
,
32
,
31
,
31
/
2
,
1
);
run
(
4
,
8
,
31
,
3
,
3
/
2
,
1
);
run
(
4
,
8
,
31
,
5
,
5
/
2
,
1
);
run
(
4
,
8
,
31
,
7
,
7
/
2
,
1
);
run
(
4
,
8
,
31
,
9
,
9
/
2
,
1
);
run
(
4
,
8
,
31
,
11
,
11
/
2
,
1
);
run
(
4
,
8
,
31
,
13
,
13
/
2
,
1
);
run
(
4
,
8
,
31
,
15
,
15
/
2
,
1
);
run
(
4
,
8
,
31
,
17
,
17
/
2
,
1
);
run
(
4
,
8
,
31
,
19
,
19
/
2
,
1
);
run
(
4
,
8
,
31
,
21
,
21
/
2
,
1
);
run
(
4
,
8
,
31
,
23
,
23
/
2
,
1
);
run
(
4
,
8
,
31
,
25
,
25
/
2
,
1
);
run
(
4
,
8
,
31
,
27
,
27
/
2
,
1
);
run
(
4
,
8
,
31
,
29
,
29
/
2
,
1
);
run
(
4
,
8
,
31
,
31
,
31
/
2
,
1
);
}
}
...
...
@@ -824,15 +946,13 @@ TEST_F(CUDA, REGION_RESTRICTED_CONV_BWD_FILTER_FP32) {
{
g
,
1
,
1
,
fh
,
fh
}
// grad
});
};
if
(
dt
==
dtype
::
Int32
())
{
run
(
4
,
8
,
32
,
5
,
5
/
2
,
1
);
run
(
1
,
2
,
2
,
2
,
0
,
1
);
run
(
1
,
2
,
3
,
3
,
0
,
1
);
run
(
1
,
2
,
4
,
4
,
0
,
1
);
run
(
1
,
2
,
5
,
5
,
0
,
1
);
run
(
1
,
2
,
6
,
6
,
0
,
1
);
run
(
1
,
2
,
7
,
7
,
0
,
1
);
}
run
(
4
,
8
,
32
,
5
,
5
/
2
,
1
);
run
(
1
,
2
,
2
,
2
,
0
,
1
);
run
(
1
,
2
,
3
,
3
,
0
,
1
);
run
(
1
,
2
,
4
,
4
,
0
,
1
);
run
(
1
,
2
,
5
,
5
,
0
,
1
);
run
(
1
,
2
,
6
,
6
,
0
,
1
);
run
(
1
,
2
,
7
,
7
,
0
,
1
);
run
(
4
,
8
,
32
,
7
,
7
/
2
,
1
);
run
(
4
,
8
,
32
,
9
,
9
/
2
,
1
);
run
(
4
,
8
,
32
,
11
,
11
/
2
,
1
);
...
...
@@ -886,15 +1006,13 @@ TEST_F(CUDA, REGION_RESTRICTED_CONV_BWD_FILTER_FP32_RIN_EQ_ROUT) {
{
g
,
1
,
1
,
fh
,
fh
}
// grad
});
};
if
(
dt
==
dtype
::
Int32
())
{
run
(
4
,
8
,
32
,
5
,
5
/
2
,
1
);
run
(
1
,
2
,
2
,
2
,
0
,
1
);
run
(
1
,
2
,
3
,
3
,
0
,
1
);
run
(
1
,
2
,
4
,
4
,
0
,
1
);
run
(
1
,
2
,
5
,
5
,
0
,
1
);
run
(
1
,
2
,
6
,
6
,
0
,
1
);
run
(
1
,
2
,
7
,
7
,
0
,
1
);
}
run
(
4
,
8
,
32
,
5
,
5
/
2
,
1
);
run
(
1
,
2
,
2
,
2
,
0
,
1
);
run
(
1
,
2
,
3
,
3
,
0
,
1
);
run
(
1
,
2
,
4
,
4
,
0
,
1
);
run
(
1
,
2
,
5
,
5
,
0
,
1
);
run
(
1
,
2
,
6
,
6
,
0
,
1
);
run
(
1
,
2
,
7
,
7
,
0
,
1
);
run
(
4
,
8
,
32
,
7
,
7
/
2
,
1
);
run
(
4
,
8
,
32
,
9
,
9
/
2
,
1
);
run
(
4
,
8
,
32
,
11
,
11
/
2
,
1
);
...
...
imperative/python/test/unit/functional/test_functional.py
浏览文件 @
fa488338
...
...
@@ -1061,8 +1061,8 @@ def test_region_restricted_conv_forward_backward_uint8(bias, groups):
N
=
1
GROUP
=
groups
FH
=
FW
=
1
IH
=
IW
=
4
OH
=
OW
=
4
IH
=
IW
=
3
OH
=
OW
=
3
ICPG
=
OCPG
=
1
grad_shape
=
(
N
,
GROUP
*
ICPG
,
IH
,
IW
)
src_shape
=
grad_shape
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录