Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
MegEngine 天元
MegEngine
提交
869a0327
MegEngine
项目概览
MegEngine 天元
/
MegEngine
1 年多 前同步成功
通知
403
Star
4705
Fork
582
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
MegEngine
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
提交
869a0327
编写于
7月 13, 2021
作者:
M
Megvii Engine Team
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
perf(mgb): disable FoldingConvBiasDimshufflePass in cuda10 for performance
GitOrigin-RevId: d1b95a6f01ba73f98c0094e00fee3e61e9139628
上级
0baf6b0d
变更
5
显示空白变更内容
内联
并排
Showing
5 changed files
with
54 additions
and
5 deletions
+54
-5
dnn/test/cuda/conv_bias_int8.cpp
dnn/test/cuda/conv_bias_int8.cpp
+40
-0
src/gopt/impl/framework.cpp
src/gopt/impl/framework.cpp
+4
-0
src/gopt/impl/tensor_reformat.cpp
src/gopt/impl/tensor_reformat.cpp
+3
-4
src/gopt/include/megbrain/gopt/inference.h
src/gopt/include/megbrain/gopt/inference.h
+6
-0
src/gopt/test/inference.cpp
src/gopt/test/inference.cpp
+1
-1
未找到文件。
dnn/test/cuda/conv_bias_int8.cpp
浏览文件 @
869a0327
...
...
@@ -1060,6 +1060,46 @@ TEST_F(CUDA, BENCHMARK_CONV_BIAS_INT8_CHWN4_SMALL_CHANNEL) {
param
::
ConvBias
::
Format
::
CHWN4
);
}
// Benchmarks ConvBiasForward on CUDA with the NCHW4_NCHW format and the
// identity nonlinearity. src and filter are QuantizedS8 filled from a small
// integer range; the remaining tensors are Float32. Each case prints the
// operand shapes and the per-run time.
TEST_F(CUDA, BENCHMARK_CONV_BIAS_INT8_NCHW4_NCHW) {
    CUBenchmarker<ConvBiasForward> benchmarker(handle_cuda());
    const size_t RUNS = 1000;
    benchmarker.set_display(false).set_times(RUNS);

    using namespace conv_bias;
    UniformIntRNG int_rng{-3, 3};

    ConvBias::Param param;
    param.format = ConvBias::Param::Format::NCHW4_NCHW;
    param.nonlineMode = ConvBias::Param::NonlineMode::IDENTITY;

    // NOTE(review): presumably this checker verifies that the named algorithm
    // is the one actually dispatched -- confirm against ConvBiasAlgoChecker.
    benchmarker.set_before_exec_callback(
            conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
                    "INT8_NCHW4_DOTPROD_IMPLICIT_GEMM"));

    benchmarker.set_dtype(0, dtype::QuantizedS8(1.9980618f))
            .set_dtype(1, dtype::QuantizedS8(1.9980927f))
            .set_dtype(2, dtype::Float32())
            .set_dtype(3, dtype::Float32())
            .set_dtype(4, dtype::Float32())
            .set_rng(0, &int_rng)
            .set_rng(1, &int_rng)
            .set_param(param);

    // shapes = {src, filter, bias}; the last two operands are passed as empty
    // shapes, as in the original call.
    auto run = [&](const TensorShapeArray& shapes) {
        // The result of execs() is divided by RUNS to report a per-run time
        // (assumes execs() returns the total over set_times(RUNS) -- TODO
        // confirm).
        auto time_in_ms =
                benchmarker.execs({shapes[0], shapes[1], shapes[2], {}, {}}) /
                RUNS;
        // shapes[2] is the bias operand, so it is labelled "bias" rather than
        // "dst".
        printf("src=%s, filter=%s, bias=%s, time=%.2f\n",
               shapes[0].to_string().c_str(), shapes[1].to_string().c_str(),
               shapes[2].to_string().c_str(), time_in_ms);
    };

    // All cases share the same filter and bias shape; only the spatial size
    // of the input varies.
    const size_t spatial_sizes[][2] = {{224, 224}, {92, 160}, {46, 80}};
    for (const auto& hw : spatial_sizes) {
        run({{16, 16, hw[0], hw[1], 4}, {32, 16, 3, 3, 4}, {1, 32, 1, 1}});
    }
}
#if CUDA_VERSION >= 10020
TEST_F
(
CUDA
,
BENCHMARK_CUTLASS_CONV_BIAS_INT8_NCHW32
)
{
...
...
src/gopt/impl/framework.cpp
浏览文件 @
869a0327
...
...
@@ -772,7 +772,9 @@ const GraphOptimizer& GraphOptimizer::add_passes_for_optimize_options(
add_pass
<
RemoveRedundantTypeCvtPass
>
();
add_pass
(
FuseNCHW4Int8Preprocess
::
make
());
add_pass
<
FuseWarpPerspectiveDimshufflePass
>
();
#if CUDA_VERSION >= 10020
add_pass
<
FoldingConvBiasDimshufflePass
>
();
#endif
});
cb
(
chwn4
,
{
add_pass
<
FuseConvBiasNonlinPass
>
();
...
...
@@ -791,7 +793,9 @@ const GraphOptimizer& GraphOptimizer::add_passes_for_optimize_options(
add_pass
<
RemoveRedundantTypeCvtPass
>
();
add_pass
(
FuseNCHW4Int8Preprocess
::
make
());
add_pass
<
FuseWarpPerspectiveDimshufflePass
>
();
#if CUDA_VERSION >= 10020
add_pass
<
FoldingConvBiasDimshufflePass
>
();
#endif
});
cb
(
fuse_conv_bias_nonlinearity
,
{
add_pass
<
FuseConvBiasNonlinPass
>
();
});
...
...
src/gopt/impl/tensor_reformat.cpp
浏览文件 @
869a0327
...
...
@@ -3638,6 +3638,7 @@ void ShuffleShuffleRemovePass::apply(OptState& opt) const {
MIDOUT_E
}
#if CUDA_VERSION >= 10020
/* ==================== FoldingConvBiasDimshufflePass ================= */
const
char
*
FoldingConvBiasDimshufflePass
::
name
()
const
{
return
mgb_cstr_log
(
"folding conv bias dimshuffle pass"
);
...
...
@@ -4068,20 +4069,17 @@ void FoldingConvBiasDimshufflePass::apply(OptState& opt) const {
return
true
;
};
MGB_MARK_USED_VAR
(
try_conv_reformat_nchw322nchw4
);
MGB_MARK_USED_VAR
(
try_conv_reformat_nchw42nchw32
);
auto
on_opr
=
[
&
try_conv_dimshuffle_reshape_typecvt
,
&
try_conv_reformat_nchw42nchw32
,
&
try_conv_reformat_nchw42nhwc
,
#if CUDA_VERSION >= 10020
&
try_conv_reformat_nchw322nchw4
,
#endif
&
rewriter
](
OperatorNodeBase
*
opr
)
{
if
(
!
try_conv_dimshuffle_reshape_typecvt
(
opr
)
&&
!
try_conv_reformat_nchw42nchw32
(
opr
)
&&
!
try_conv_reformat_nchw42nhwc
(
opr
)
#if CUDA_VERSION >= 10020
&&
!
try_conv_reformat_nchw322nchw4
(
opr
)
#endif
)
{
rewriter
.
auto_replace_outputs
(
opr
);
}
...
...
@@ -4091,6 +4089,7 @@ void FoldingConvBiasDimshufflePass::apply(OptState& opt) const {
MIDOUT_E
}
#endif
/* ==================== PaddingChannelPass ================= */
const
char
*
PaddingChannelPass
::
name
()
const
{
...
...
src/gopt/include/megbrain/gopt/inference.h
浏览文件 @
869a0327
...
...
@@ -16,6 +16,10 @@
#include "megbrain/opr/dnn/convolution.h"
#include "megbrain/opr/search_policy/algo_chooser_helper.h"
#if MGB_CUDA
#include <cuda.h>
#endif
namespace
mgb
{
namespace
gopt
{
...
...
@@ -427,11 +431,13 @@ namespace gopt {
void
apply
(
OptState
&
opt
)
const
override
;
};
#if CUDA_VERSION >= 10020
/*!
 * \brief graph optimization pass operating on conv-bias operators
 *
 * This declaration sits inside an `#if CUDA_VERSION >= 10020` guard, so the
 * class only exists when building against a new enough CUDA toolkit; code
 * that registers or tests the pass needs the same guard.
 *
 * NOTE(review): judging by the class name, the pass folds dimshuffle /
 * layout-reformat operators around conv-bias -- confirm against the
 * implementation of apply().
 */
class FoldingConvBiasDimshufflePass final : public Pass {
public:
    //! name of this pass
    const char* name() const override;

    //! apply the pass to the optimizer state \p opt
    void apply(OptState& opt) const override;
};
#endif
/*!
* \brief padding channel to enable fast int8/int4 support
...
...
src/gopt/test/inference.cpp
浏览文件 @
869a0327
...
...
@@ -4155,6 +4155,7 @@ TEST(TestGoptInference, WarpAndPreProcessCase1) {
MGB_ASSERT_TENSOR_NEAR
(
host_y
,
host_y_opt
,
1e-5
);
}
#if CUDA_VERSION >= 10020
TEST
(
TestGoptInference
,
FoldingConvDimshuffle
)
{
REQUIRE_GPU
(
1
);
auto
cn
=
CompNode
::
load
(
"gpu0"
);
...
...
@@ -4307,7 +4308,6 @@ TEST(TestGoptInference, FoldingConvDimshuffleNCHW4NCHW32) {
MGB_ASSERT_TENSOR_EQ
(
host_y_fuse
,
host_y_non_fuse
);
}
#if CUDA_VERSION >= 10020
TEST
(
TestGoptInference
,
FoldingConvDimshuffleNCHW32NCHW4
)
{
REQUIRE_GPU
(
1
);
auto
cn
=
CompNode
::
load
(
"gpu0"
);
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录