Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
MegEngine 天元
MegEngine
提交
ae6ff2c5
MegEngine
项目概览
MegEngine 天元
/
MegEngine
1 年多 前同步成功
通知
404
Star
4705
Fork
582
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
MegEngine
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
提交
ae6ff2c5
编写于
4月 02, 2021
作者:
M
Megvii Engine Team
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
feat(mgb/gopt): add opt pass for nchw64 layout transform
GitOrigin-RevId: adede7cef6218b1a4871278bf45973ca599ad594
上级
63a9bd30
变更
12
展开全部
显示空白变更内容
内联
并排
Showing
12 changed files
with
1392 additions
and
118 deletions
+1392
-118
dnn/src/fallback/conv_bias/opr_impl.cpp
dnn/src/fallback/conv_bias/opr_impl.cpp
+2
-1
dnn/src/fallback/convolution/opr_impl.cpp
dnn/src/fallback/convolution/opr_impl.cpp
+12
-4
dnn/src/fallback/relayout/opr_impl.cpp
dnn/src/fallback/relayout/opr_impl.cpp
+7
-0
sdk/load-and-run/src/mgblar.cpp
sdk/load-and-run/src/mgblar.cpp
+6
-0
src/core/impl/graph/operator_node.cpp
src/core/impl/graph/operator_node.cpp
+0
-1
src/core/impl/tensor.cpp
src/core/impl/tensor.cpp
+2
-2
src/core/include/megbrain/graph/cg.h
src/core/include/megbrain/graph/cg.h
+3
-0
src/gopt/impl/framework.cpp
src/gopt/impl/framework.cpp
+11
-0
src/gopt/impl/tensor_reformat.cpp
src/gopt/impl/tensor_reformat.cpp
+1009
-96
src/gopt/include/megbrain/gopt/inference.h
src/gopt/include/megbrain/gopt/inference.h
+21
-0
src/gopt/test/inference.cpp
src/gopt/test/inference.cpp
+317
-12
src/opr/test/dnn/convolution.cpp
src/opr/test/dnn/convolution.cpp
+2
-2
未找到文件。
dnn/src/fallback/conv_bias/opr_impl.cpp
浏览文件 @
ae6ff2c5
...
...
@@ -343,6 +343,7 @@ ConvBiasImpl::NCBKernSizeParam ConvBiasImpl::make_ncb_kern_size_param(
param
().
format
==
Param
::
Format
::
NCHW44
||
param
().
format
==
Param
::
Format
::
NCHW44_DOT
||
param
().
format
==
Param
::
Format
::
NCHW
||
param
().
format
==
Param
::
Format
::
NCHW32
||
param
().
format
==
Param
::
Format
::
NCHW64
)
{
spatial_pos
=
2
;
}
else
if
(
param
().
format
==
Param
::
Format
::
NHWC
)
{
...
...
dnn/src/fallback/convolution/opr_impl.cpp
浏览文件 @
ae6ff2c5
...
...
@@ -481,7 +481,9 @@ void ConvolutionBackwardDataImpl::exec(_megdnn_tensor_in filter,
_megdnn_tensor_out
grad
,
_megdnn_workspace
workspace
)
{
if
(
param
().
format
==
param
::
Convolution
::
Format
::
NHWCD4
||
param
().
format
==
param
::
Convolution
::
Format
::
NCHW4
)
{
param
().
format
==
param
::
Convolution
::
Format
::
NCHW4
||
(
param
().
format
==
param
::
Convolution
::
Format
::
NCHW
&&
grad
.
layout
.
dtype
.
enumv
()
==
DTypeEnum
::
QuantizedS8
))
{
return
naive
::
ConvolutionBackwardDataImpl
::
exec
(
filter
,
diff
,
grad
,
workspace
);
}
...
...
@@ -493,7 +495,9 @@ size_t ConvolutionBackwardDataImpl::get_workspace_in_bytes(
const
TensorLayout
&
filter
,
const
TensorLayout
&
diff
,
const
TensorLayout
&
grad
)
{
if
(
param
().
format
==
param
::
Convolution
::
Format
::
NHWCD4
||
param
().
format
==
param
::
Convolution
::
Format
::
NCHW4
)
{
param
().
format
==
param
::
Convolution
::
Format
::
NCHW4
||
(
param
().
format
==
param
::
Convolution
::
Format
::
NCHW
&&
grad
.
dtype
.
enumv
()
==
DTypeEnum
::
QuantizedS8
))
{
return
naive
::
ConvolutionBackwardDataImpl
::
get_workspace_in_bytes
(
filter
,
diff
,
grad
);
}
...
...
@@ -506,7 +510,9 @@ ConvolutionBackwardDataImpl::get_all_algorithms(const TensorLayout& filter,
const
TensorLayout
&
diff
,
const
TensorLayout
&
grad
)
{
if
(
param
().
format
==
param
::
Convolution
::
Format
::
NHWCD4
||
param
().
format
==
param
::
Convolution
::
Format
::
NCHW4
)
{
param
().
format
==
param
::
Convolution
::
Format
::
NCHW4
||
(
param
().
format
==
param
::
Convolution
::
Format
::
NCHW
&&
grad
.
dtype
.
enumv
()
==
DTypeEnum
::
QuantizedS8
))
{
return
naive
::
ConvolutionBackwardDataImpl
::
get_all_algorithms
(
filter
,
diff
,
grad
);
}
...
...
@@ -523,7 +529,9 @@ ConvolutionBackwardDataImpl::get_algorithm_heuristic(
const
AlgoAttribute
&
positive_attr
,
const
AlgoAttribute
&
negative_attr
)
{
if
(
param
().
format
==
param
::
Convolution
::
Format
::
NHWCD4
||
param
().
format
==
param
::
Convolution
::
Format
::
NCHW4
)
{
param
().
format
==
param
::
Convolution
::
Format
::
NCHW4
||
(
param
().
format
==
param
::
Convolution
::
Format
::
NCHW
&&
grad
.
dtype
.
enumv
()
==
DTypeEnum
::
QuantizedS8
))
{
return
naive
::
ConvolutionBackwardDataImpl
::
get_algorithm_heuristic
(
filter
,
diff
,
grad
,
workspace_limit_in_bytes
,
positive_attr
,
negative_attr
);
...
...
dnn/src/fallback/relayout/opr_impl.cpp
浏览文件 @
ae6ff2c5
...
...
@@ -229,6 +229,13 @@ void RelayoutForwardImpl::exec(
return
;
}
// FIXME: optimize for lowbit cases
if
(
src
.
layout
.
dtype
.
enumv
()
==
DTypeEnum
::
QuantizedS4
||
src
.
layout
.
dtype
.
enumv
()
==
DTypeEnum
::
Quantized4Asymm
)
{
NaiveRelayoutForwardImpl
::
do_exec
(
src
,
dst
);
return
;
}
relayout
::
TransposeParam
trans_param
;
bool
trans
=
relayout
::
is_transpose
(
src
.
layout
,
dst
.
layout
,
trans_param
);
exec_after_preprocess
(
src
,
dst
,
trans
?
&
trans_param
:
nullptr
);
...
...
sdk/load-and-run/src/mgblar.cpp
浏览文件 @
ae6ff2c5
...
...
@@ -230,6 +230,11 @@ R"__usage__(
--enable-fuse-preprocess
Fusion astype\pad_channel\dimshuffle and etc opr from h2d op
)__usage__"
R"__usage__(
--enable-nchw64
Execute operators with kernels implemented in MegDNN with NCHW64 tensor format. Can only be used
on Nvidia GPUs, which natively support fast int4 tensorcore inference.
)__usage__"
;
struct
DataParser
{
...
...
@@ -1150,6 +1155,7 @@ Args Args::from_argv(int argc, char **argv) {
cb
(
nchw88
);
cb
(
nchw32
);
cb
(
nhwcd4
);
cb
(
nchw64
);
#undef cb
if
(
!
strcmp
(
argv
[
i
],
"--enable-nchw44-dot"
))
{
mgb_log_warn
(
"enable-nchw44-dot optimization"
);
...
...
src/core/impl/graph/operator_node.cpp
浏览文件 @
ae6ff2c5
...
...
@@ -351,7 +351,6 @@ void OperatorNodeBase::init_output_format() {
}
for
(
auto
i
:
output
())
{
if
(
i
->
contain_flag
(
VarNode
::
Flag
::
VOLATILE_CONTENT
))
{
mgb_assert
(
format
.
is_default
());
i
->
format
(
TensorFormat
(
i
->
dtype
()));
}
else
{
if
(
!
format
.
is_default
())
...
...
src/core/impl/tensor.cpp
浏览文件 @
ae6ff2c5
...
...
@@ -643,8 +643,8 @@ TensorND<TensorStorage>::copy_from_fixlayout(
(
m_layout
.
format
.
is_lowbit_aligned
()
&&
m_layout
.
is_contiguous
()),
src_contig
=
src
.
layout
().
is_physical_contiguous
()
||
(
m_layout
.
format
.
is_lowbit_aligned
()
&&
m_layout
.
is_contiguous
());
(
src
.
layout
()
.
format
.
is_lowbit_aligned
()
&&
src
.
layout
()
.
is_contiguous
());
if
(
self_contig
&&
src_contig
)
{
if
((
m_layout
.
format
.
is_default
()
&&
src
.
layout
().
format
.
is_default
())
||
...
...
src/core/include/megbrain/graph/cg.h
浏览文件 @
ae6ff2c5
...
...
@@ -112,6 +112,8 @@ struct GraphCommonOptimizeOptions {
///< tensorcore
CHWN4
,
///< compute using CHWN4 tensor format, transformed mainly
///< used for cuda
NCHW64
,
///< compute using NCHW64 tensor format, used for fast int4
///< support on Nvidia GPU
};
LayoutTransform
layout_transform
=
LayoutTransform
::
DEFAULT
;
...
...
@@ -154,6 +156,7 @@ struct GraphCommonOptimizeOptions {
SET
(
nchw44_dot
,
NCHW44_DOT
);
SET
(
nchw32
,
NCHW32
);
SET
(
chwn4
,
CHWN4
);
SET
(
nchw64
,
NCHW64
);
#undef SET
};
...
...
src/gopt/impl/framework.cpp
浏览文件 @
ae6ff2c5
...
...
@@ -774,6 +774,17 @@ const GraphOptimizer& GraphOptimizer::add_passes_for_optimize_options(
add_pass
<
ShuffleShuffleRemovePass
>
();
add_pass
<
RemoveRedundantTypeCvtPass
>
();
});
cb
(
nchw64
,
{
add_pass
<
FuseConvBiasNonlinPass
>
();
add_pass
<
PaddingChannelPass
>
();
add_pass
<
FuseConvBiasZPass
>
();
add_pass
(
EnableNCHW64Pass
::
make_nchw64_converter
());
add_pass
<
ShuffleShuffleRemovePass
>
();
add_pass
<
RemoveRedundantTypeCvtPass
>
();
add_pass
(
FuseNCHW4Int8Preprocess
::
make
());
add_pass
<
FuseWarpPerspectiveDimshufflePass
>
();
add_pass
<
FoldingConvBiasDimshufflePass
>
();
});
cb
(
fuse_conv_bias_nonlinearity
,
{
add_pass
<
FuseConvBiasNonlinPass
>
();
});
cb
(
fuse_conv_bias_with_z
,
{
...
...
src/gopt/impl/tensor_reformat.cpp
浏览文件 @
ae6ff2c5
此差异已折叠。
点击以展开。
src/gopt/include/megbrain/gopt/inference.h
浏览文件 @
ae6ff2c5
...
...
@@ -419,6 +419,27 @@ namespace gopt {
void
apply
(
OptState
&
opt
)
const
override
;
};
/*!
 * \brief graph optimization pass that converts tensor format to NCHW64 so
 *      that int4 tensorcore kernels can be used on CUDA
 *
 * The input network is assumed to be in NCHW layout.
 */
class EnableNCHW64Pass final : public TensorReformatPass {
public:
    //! tensor format enum shared with the ConvBias operator param
    using Format = opr::ConvBias::Param::Format;

    //! name of this pass, as a log string
    const char* name() const override {
        return mgb_cstr_log("tensor_format_nchw64");
    }

    //! make nchw -> nchw64 converter opt pass
    static std::unique_ptr<EnableNCHW64Pass> make_nchw64_converter();

private:
    //! format associated with each operator
    //! NOTE(review): presumably the format chosen for the converted
    //! operator; confirm against the implementation
    ThinHashMap<OperatorNodeBase*, Format> m_opr_format_map;

    //! override of the TensorReformatPass hook, called with a replaced var
    //! and the var it originates from
    //! NOTE(review): presumably restores the original layout at graph
    //! endpoints; confirm against the implementation
    VarNode* on_graph_endpoint_var(VarNode* new_var,
                                   VarNode* orig_var) const override;
};
}
// namespace gopt
}
// namespace mgb
...
...
src/gopt/test/inference.cpp
浏览文件 @
ae6ff2c5
...
...
@@ -4248,7 +4248,7 @@ TEST(TestGoptInference, PaddingChannels) {
};
cg
::
DepOprIter
{
cb
}.
add
(
y3_pad
.
node
()
->
owner_opr
());
ASSERT_EQ
(
oprs
.
size
(),
3
);
ASSERT_EQ
(
oprs
[
0
]
->
output
(
0
)
->
shape
()[
1
],
20
);
ASSERT_EQ
(
oprs
[
0
]
->
output
(
0
)
->
shape
()[
1
],
32
);
ASSERT_EQ
(
oprs
[
1
]
->
output
(
0
)
->
shape
()[
1
],
32
);
ASSERT_EQ
(
oprs
[
2
]
->
output
(
0
)
->
shape
()[
1
],
32
);
HostTensorND
t1
,
t2
;
...
...
@@ -4322,7 +4322,7 @@ TEST(TestGoptInference, ConcatAfterPaddingChannels) {
};
cg
::
DepOprIter
{
cb
}.
add
(
y2_pad
.
node
()
->
owner_opr
());
ASSERT_EQ
(
oprs
.
size
(),
2
);
ASSERT_EQ
(
oprs
[
0
]
->
output
(
0
)
->
shape
()[
1
],
20
);
ASSERT_EQ
(
oprs
[
0
]
->
output
(
0
)
->
shape
()[
1
],
32
);
ASSERT_EQ
(
oprs
[
1
]
->
output
(
0
)
->
shape
()[
1
],
32
);
HostTensorND
t1
,
t2
;
auto
func1
=
graph
->
compile
({
make_callback_copy
(
y2
,
t1
)});
...
...
@@ -4335,16 +4335,16 @@ TEST(TestGoptInference, ConcatAfterPaddingChannels) {
// FIXME replace cpu with gpu to enable gpu validation
TEST
(
TestGoptInference
,
PaddingChannelsWithPooling
)
{
REQUIRE_GPU
(
1
);
auto
cn
=
CompNode
::
load
(
"
c
pu0"
);
//
cn.activate();
//
auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop;
//
auto sm_ver = prop.major * 10 + prop.minor;
//
if (sm_ver < 61) {
//
printf("This testcast ignored due to insufficient cuda cap(got: %d, "
//
"expected: %d)\n",
//
sm_ver, 61);
//
return;
//
}
auto
cn
=
CompNode
::
load
(
"
g
pu0"
);
cn
.
activate
();
auto
&&
prop
=
CompNodeEnv
::
from_comp_node
(
cn
).
cuda_env
().
device_prop
;
auto
sm_ver
=
prop
.
major
*
10
+
prop
.
minor
;
if
(
sm_ver
<
61
)
{
printf
(
"This testcast ignored due to insufficient cuda cap(got: %d, "
"expected: %d)
\n
"
,
sm_ver
,
61
);
return
;
}
HostTensorGenerator
<
dtype
::
Int8
>
gen
;
auto
graph
=
ComputingGraph
::
make
();
...
...
@@ -4485,6 +4485,311 @@ TEST(TestGoptInference, PaddingChannelsWithWarpPerspective) {
func2
->
execute
();
MGB_ASSERT_TENSOR_EQ
(
t1
,
t2
);
}
//! Apply EnableNCHW64Pass alone to a chain of five quantized ConvBias
//! operators (int8 -> int8 -> int8 -> int4 -> int8) followed by a fused
//! add-relu, then check the format picked for every ConvBias and that the
//! converted graph yields the same result as the original one.
TEST(TestGoptInference, EnableNCHW64Basic) {
    // NOTE(review): a GPU is still required although the graph is built on
    // cpu0; the CUDA capability check below is disabled accordingly.
    REQUIRE_GPU(1);
    auto cn = CompNode::load("cpu0");
    // cn.activate();
    // auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop;
    // auto sm_ver = prop.major * 10 + prop.minor;
    // if (sm_ver < 61) {
    //     printf("This testcast ignored due to insufficient cuda cap(got: %d, "
    //            "expected: %d)\n",
    //            sm_ver, 61);
    //     return;
    // }

    HostTensorGenerator<dtype::Int8> gen;
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;

    // graph input: host tensor copied to device, then converted to dt
    auto make_input = [&](const char* name, const TensorShape& shp,
                          const DType& dt) {
        auto copied =
                opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name);
        return opr::TypeCvt::make(copied, dt);
    };
    // constant (weight / bias): shared device tensor converted to dt
    auto make_weight = [&](const char* name, const TensorShape& shp,
                           const DType& dt) {
        auto shared = opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
                              .rename(name);
        return opr::TypeCvt::make(shared, dt);
    };

    // NOTE: tensors are generated in exactly this order; gen is stateful
    auto x = make_input("x", {16, 4, 14, 14}, dtype::QuantizedS8(2.5f));
    auto w = make_weight("w", {32, 4, 3, 3}, dtype::QuantizedS8(2.5f));
    auto b = make_weight("b", {1, 32, 1, 1}, dtype::QuantizedS32(6.25f));

    opr::ConvBias::Param conv_param;
    conv_param.format = opr::ConvBias::Param::Format::NCHW;
    conv_param.nonlineMode = opr::ConvBias::Param::NonlineMode::IDENTITY;
    conv_param.stride_h = conv_param.stride_w = 1;
    conv_param.pad_h = conv_param.pad_w = 1;

    // conv0: 4 -> 32 channels, int8
    auto y = opr::ConvBias::make(
            x, w, b, conv_param, {},
            OperatorNodeConfig{dtype::QuantizedS8(2.5f)});

    // conv1: 32 -> 32 channels, int8
    auto w1 = make_weight("w1", {32, 32, 3, 3}, dtype::QuantizedS8(2.5f));
    auto b1 = make_weight("b1", {1, 32, 1, 1}, dtype::QuantizedS32(6.25f));
    auto y1 = opr::ConvBias::make(
            y, w1, b1, conv_param, {},
            OperatorNodeConfig{dtype::QuantizedS8(2.5f)});

    // conv2: 32 -> 64 channels, int8; output converted to int4
    auto w2 = make_weight("w2", {64, 32, 3, 3}, dtype::QuantizedS8(2.5f));
    auto b2 = make_weight("b2", {1, 64, 1, 1}, dtype::QuantizedS32(6.25f));
    auto y2 = opr::ConvBias::make(
            y1, w2, b2, conv_param, {},
            OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
    y2 = opr::TypeCvt::make(y2, dtype::QuantizedS4{40.f});

    // conv3: 64 -> 64 channels, int4; output converted back to int8
    auto w3 = make_weight("w3", {64, 64, 3, 3}, dtype::QuantizedS4(2.5f));
    auto b3 = make_weight("b3", {1, 64, 1, 1}, dtype::QuantizedS32(100.f));
    auto y3 = opr::ConvBias::make(
            y2, w3, b3, conv_param, {},
            OperatorNodeConfig{dtype::QuantizedS4{40.f}});
    y3 = opr::TypeCvt::make(y3, dtype::QuantizedS8{2.5f});

    // conv4: 64 -> 32 channels, int8
    auto w4 = make_weight("w4", {32, 64, 3, 3}, dtype::QuantizedS8(2.5f));
    auto b4 = make_weight("b4", {1, 32, 1, 1}, dtype::QuantizedS32(6.25f));
    auto y4 = opr::ConvBias::make(
            y3, w4, b4, conv_param, {},
            OperatorNodeConfig{dtype::QuantizedS8(2.5f)});

    // residual connection: fused add + relu of conv0 and conv4 outputs
    using ElemMultiMode = opr::ElemwiseMultiType::Param::Mode;
    auto y5 = opr::ElemwiseMultiType::make(
            {y, y4}, {ElemMultiMode::QFUSE_ADD_RELU},
            OperatorNodeConfig{dtype::QuantizedS8{1.2f}});
    y5 = opr::TypeCvt::make(y5, dtype::Float32());

    // run only the nchw64 converter pass
    SymbolVar y5_pad;
    unpack_vector(
            gopt::GraphOptimizer{}
                    .add_pass(gopt::EnableNCHW64Pass::make_nchw64_converter())
                    .apply({{y5}})
                    .endpoint_vars(),
            y5_pad);
    EXPECT_TRUE(y5.node()->shape().eq_shape(y5_pad.node()->shape()));

    // collect every ConvBias reachable from the converted endpoint
    SmallVector<cg::OperatorNodeBase*> oprs;
    auto collect_conv_bias = [&oprs](cg::OperatorNodeBase* node) {
        if (node->same_type<opr::ConvBias>()) {
            oprs.push_back(node);
        }
    };
    cg::DepOprIter{collect_conv_bias}.add(y5_pad.node()->owner_opr());
    ASSERT_EQ(oprs.size(), 5);

    // expected format of each ConvBias, in the order they were collected
    using Format = opr::ConvBiasForward::Param::Format;
#define CHECK(_i, _fmt)                                        \
    {                                                          \
        const auto& o = oprs[_i]->cast_final<opr::ConvBias>(); \
        ASSERT_EQ(o.param().format, Format::_fmt);             \
    }
    CHECK(0, NCHW4);
    CHECK(1, NCHW32);
    CHECK(2, NCHW32);
    CHECK(3, NCHW64);
    CHECK(4, NCHW32);
#undef CHECK

    // original and converted graphs must produce identical outputs
    HostTensorND t1, t2;
    auto func1 = graph->compile({make_callback_copy(y5, t1)});
    func1->execute();
    auto func2 = graph->compile({make_callback_copy(y5_pad, t2)});
    func2->execute();
    MGB_ASSERT_TENSOR_EQ(t1, t2);
}
//! Run optimize_for_inference with enable_nchw64 on a network whose channel
//! counts (3, 20, 24) are not multiples of the packed channel sizes and
//! which also contains Pooling, ConvolutionBackwardData and WarpPerspective.
//! Checks the format of each converted operator and that the optimized
//! graph produces the same output as the original one.
TEST(TestGoptInference, EnableNCHW64PaddingChannel) {
    // NOTE(review): a GPU is still required although the graph is built on
    // cpu0; the CUDA capability check below is disabled accordingly.
    REQUIRE_GPU(1);
    auto cn = CompNode::load("cpu0");
    // cn.activate();
    // auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop;
    // auto sm_ver = prop.major * 10 + prop.minor;
    // if (sm_ver < 61) {
    //     printf("This testcast ignored due to insufficient cuda cap(got: %d, "
    //            "expected: %d)\n",
    //            sm_ver, 61);
    //     return;
    // }

    HostTensorGenerator<dtype::Int8> gen;
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;

    // graph input: host tensor copied to device, then converted to dt
    auto make_input = [&](const char* name, const TensorShape& shp,
                          const DType& dt) {
        auto copied =
                opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name);
        return opr::TypeCvt::make(copied, dt);
    };
    // constant (weight / bias): shared device tensor converted to dt
    auto make_weight = [&](const char* name, const TensorShape& shp,
                           const DType& dt) {
        auto shared = opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
                              .rename(name);
        return opr::TypeCvt::make(shared, dt);
    };

    // NOTE: tensors are generated in exactly this order; gen is stateful
    auto x = make_input("x", {16, 3, 14, 14}, dtype::QuantizedS8(2.5f));
    auto w = make_weight("w", {20, 3, 3, 3}, dtype::QuantizedS8(2.5f));
    auto b = make_weight("b", {1, 20, 1, 1}, dtype::QuantizedS32(6.25f));

    opr::ConvBias::Param conv_param;
    conv_param.format = opr::ConvBias::Param::Format::NCHW;
    conv_param.nonlineMode = opr::ConvBias::Param::NonlineMode::IDENTITY;
    conv_param.stride_h = conv_param.stride_w = 1;
    conv_param.pad_h = conv_param.pad_w = 1;

    // conv0: 3 -> 20 channels, int8, followed by pooling
    auto y = opr::ConvBias::make(
            x, w, b, conv_param, {},
            OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
    opr::Pooling::Param pool_param;
    pool_param.format = opr::Pooling::Param::Format::NCHW;
    y = opr::Pooling::make(y, pool_param);

    // conv1: 20 -> 24 channels, int8
    auto w1 = make_weight("w1", {24, 20, 3, 3}, dtype::QuantizedS8(2.5f));
    auto b1 = make_weight("b1", {1, 24, 1, 1}, dtype::QuantizedS32(6.25f));
    auto y1 = opr::ConvBias::make(
            y, w1, b1, conv_param, {},
            OperatorNodeConfig{dtype::QuantizedS8(2.5f)});

    // conv2: 24 -> 20 channels, int8; output converted to int4
    auto w2 = make_weight("w2", {20, 24, 3, 3}, dtype::QuantizedS8(2.5f));
    auto b2 = make_weight("b2", {1, 20, 1, 1}, dtype::QuantizedS32(6.25f));
    auto y2 = opr::ConvBias::make(
            y1, w2, b2, conv_param, {},
            OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
    y2 = opr::TypeCvt::make(y2, dtype::QuantizedS4{40.f});

    // conv3: 20 -> 64 channels, int4
    auto w3 = make_weight("w3", {64, 20, 3, 3}, dtype::QuantizedS4(2.5f));
    auto b3 = make_weight("b3", {1, 64, 1, 1}, dtype::QuantizedS32(100.f));
    auto y3 = opr::ConvBias::make(
            y2, w3, b3, conv_param, {},
            OperatorNodeConfig{dtype::QuantizedS4{40.f}});

    // conv4: 64 -> 20 channels, int4; output converted back to int8
    auto w4 = make_weight("w4", {20, 64, 3, 3}, dtype::QuantizedS4(2.5f));
    auto b4 = make_weight("b4", {1, 20, 1, 1}, dtype::QuantizedS32(100.f));
    auto y4 = opr::ConvBias::make(
            y3, w4, b4, conv_param, {},
            OperatorNodeConfig{dtype::QuantizedS4{40.f}});
    y4 = opr::TypeCvt::make(y4, dtype::QuantizedS8{2.5f});

    // residual connection: fused add + relu of pooled conv0 and conv4
    using ElemMultiMode = opr::ElemwiseMultiType::Param::Mode;
    auto y5 = opr::ElemwiseMultiType::make(
            {y, y4}, {ElemMultiMode::QFUSE_ADD_RELU},
            OperatorNodeConfig{dtype::QuantizedS8{1.2f}});

    // deconvolution with stride 2 on the 20-channel feature map
    opr::ConvolutionBackwardData::Param deconv_param;
    deconv_param.format = opr::ConvolutionBackwardData::Param::Format::NCHW;
    deconv_param.stride_h = deconv_param.stride_w = 2;
    deconv_param.pad_h = deconv_param.pad_w = 1;
    auto w6 = make_weight("w6", {20, 20, 4, 4}, dtype::QuantizedS8{2.5f});
    auto y6 = opr::ConvolutionBackwardData::make(
            w6, y5, deconv_param, {},
            OperatorNodeConfig{dtype::QuantizedS8(2.0f)});

    // warp perspective back to a 14x14 output
    auto mat = std::make_shared<HostTensorND>(cn, TensorShape{16, 3, 3},
                                              dtype::Float32());
    warp_perspective_mat_gen(*mat, 16, 14, 14);
    auto mat_var = opr::Host2DeviceCopy::make(*graph, mat).rename("mat");
    opr::WarpPerspective::Param warp_param;
    warp_param.format = opr::WarpPerspective::Param::Format::NCHW;
    auto y7 = opr::WarpPerspective::make(y6, mat_var, TensorShape{14, 14},
                                         warp_param);
    y7 = opr::TypeCvt::make(y7, dtype::Float32());

    // run the full inference optimization with the nchw64 option enabled
    SymbolVar y7_pad;
    gopt::OptimizeForInferenceOptions opt{};
    opt.enable_nchw64();
    unpack_vector(gopt::optimize_for_inference({y7}, opt), y7_pad);
    EXPECT_TRUE(y7.node()->shape().eq_shape(y7_pad.node()->shape()));

    // collect every ConvBias reachable from the optimized endpoint
    SmallVector<cg::OperatorNodeBase*> oprs;
    auto collect_conv_bias = [&oprs](cg::OperatorNodeBase* node) {
        if (node->same_type<opr::ConvBias>()) {
            oprs.push_back(node);
        }
    };
    cg::DepOprIter{collect_conv_bias}.add(y7_pad.node()->owner_opr());
    ASSERT_EQ(oprs.size(), 5);

    // expected format of each ConvBias, in the order they were collected
    using Format = opr::ConvBiasForward::Param::Format;
#define CHECK(_i, _fmt)                                        \
    {                                                          \
        const auto& o = oprs[_i]->cast_final<opr::ConvBias>(); \
        ASSERT_EQ(o.param().format, Format::_fmt);             \
    }
    CHECK(0, NCHW4);
    CHECK(1, NCHW32);
    CHECK(2, NCHW32);
    CHECK(3, NCHW64);
    CHECK(4, NCHW64);
#undef CHECK

    // the non-ConvBias operators are all expected to end up in NCHW4
    {
        const auto& deconv = find_opr<opr::ConvolutionBackwardData>(y7_pad);
        ASSERT_EQ(deconv.param().format, Format::NCHW4);
        const auto& pool = find_opr<opr::PoolingForward>(y7_pad);
        ASSERT_EQ(pool.param().format, Format::NCHW4);
        const auto& warp = find_opr<opr::WarpPerspectiveForward>(y7_pad);
        ASSERT_EQ(warp.param().format, Format::NCHW4);
    }

    // NOTE(review): the dimshuffle count is computed but never asserted;
    // either add an expectation or drop it once the count is settled.
    size_t nr_dimshuffle = find_opr_num<opr::Dimshuffle>(y7_pad);

    // original and optimized graphs must produce identical outputs
    HostTensorND t1, t2;
    auto func1 = graph->compile({make_callback_copy(y7, t1)});
    func1->execute();
    auto func2 = graph->compile({make_callback_copy(y7_pad, t2)});
    func2->execute();
    MGB_ASSERT_TENSOR_EQ(t1, t2);
}
//! Run optimize_for_inference with enable_nchw64 on a network where an int4
//! ConvBias output is combined with an earlier int4 tensor through
//! QFUSE_ADD_RELU, and check that no ElemwiseMultiType operator is left in
//! the optimized graph. Numerical comparison is currently disabled.
TEST(TestGoptInference, EnableNCHW64FuseConvBiasZ) {
    // NOTE(review): a GPU is still required although the graph is built on
    // cpu0; the CUDA capability check below is disabled accordingly.
    REQUIRE_GPU(1);
    auto cn = CompNode::load("cpu0");
    // cn.activate();
    // auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop;
    // auto sm_ver = prop.major * 10 + prop.minor;
    // if (sm_ver < 61) {
    //     printf("This testcast ignored due to insufficient cuda cap(got: %d, "
    //            "expected: %d)\n",
    //            sm_ver, 61);
    //     return;
    // }

    HostTensorGenerator<dtype::Int8> gen;
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;

    // graph input: host tensor copied to device, then converted to dt
    auto make_input = [&](const char* name, const TensorShape& shp,
                          const DType& dt) {
        auto copied =
                opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name);
        return opr::TypeCvt::make(copied, dt);
    };
    // constant (weight / bias): shared device tensor converted to dt
    auto make_weight = [&](const char* name, const TensorShape& shp,
                           const DType& dt) {
        auto shared = opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
                              .rename(name);
        return opr::TypeCvt::make(shared, dt);
    };

    // NOTE: tensors are generated in exactly this order; gen is stateful
    auto x = make_input("x", {16, 4, 14, 14}, dtype::QuantizedS8(2.5f));
    auto w = make_weight("w", {32, 4, 3, 3}, dtype::QuantizedS8(2.5f));
    auto b = make_weight("b", {1, 32, 1, 1}, dtype::QuantizedS32(6.25f));

    opr::ConvBias::Param conv_param;
    conv_param.format = opr::ConvBias::Param::Format::NCHW;
    conv_param.nonlineMode = opr::ConvBias::Param::NonlineMode::IDENTITY;
    conv_param.stride_h = conv_param.stride_w = 1;
    conv_param.pad_h = conv_param.pad_w = 1;

    // conv0: 4 -> 32 channels, int8
    auto y = opr::ConvBias::make(
            x, w, b, conv_param, {},
            OperatorNodeConfig{dtype::QuantizedS8(2.5f)});

    // conv1: 32 -> 64 channels, int8; output converted to int4
    auto w1 = make_weight("w1", {64, 32, 3, 3}, dtype::QuantizedS8(2.5f));
    auto b1 = make_weight("b1", {1, 64, 1, 1}, dtype::QuantizedS32(6.25f));
    auto y1 = opr::ConvBias::make(
            y, w1, b1, conv_param, {},
            OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
    y1 = opr::TypeCvt::make(y1, dtype::QuantizedS4{40.f});

    // conv2: 64 -> 64 channels, int4
    auto w2 = make_weight("w2", {64, 64, 3, 3}, dtype::QuantizedS4(2.5f));
    auto b2 = make_weight("b2", {1, 64, 1, 1}, dtype::QuantizedS32(100.f));
    auto y2 = opr::ConvBias::make(
            y1, w2, b2, conv_param, {},
            OperatorNodeConfig{dtype::QuantizedS4{40.f}});

    // conv3: 64 -> 64 channels, int4
    auto w3 = make_weight("w3", {64, 64, 3, 3}, dtype::QuantizedS4(2.5f));
    auto b3 = make_weight("b3", {1, 64, 1, 1}, dtype::QuantizedS32(100.f));
    auto y3 = opr::ConvBias::make(
            y2, w3, b3, conv_param, {},
            OperatorNodeConfig{dtype::QuantizedS4(40.f)});

    // residual connection in int4: fused add + relu of y1 and conv3 output
    using ElemMultiMode = opr::ElemwiseMultiType::Param::Mode;
    auto y4 = opr::ElemwiseMultiType::make(
            {y1, y3}, {ElemMultiMode::QFUSE_ADD_RELU},
            OperatorNodeConfig{dtype::QuantizedS4{40.f}});
    y4 = opr::TypeCvt::make(y4, dtype::Float32());

    // run the full inference optimization with the nchw64 option enabled
    SymbolVar y4_pad;
    gopt::OptimizeForInferenceOptions opt{};
    opt.enable_nchw64();
    unpack_vector(gopt::optimize_for_inference({y4}, opt), y4_pad);
    EXPECT_TRUE(y4.node()->shape().eq_shape(y4_pad.node()->shape()));

    // no ElemwiseMultiType operator may remain after optimization
    size_t nr_elem_mult_type = find_opr_num<opr::ElemwiseMultiType>(y4_pad);
    ASSERT_EQ(nr_elem_mult_type, 0);
    // FIXME need impl of elemwise/elemwise_multi_type on CUDA
#if 0
    HostTensorND t1, t2;
    auto func1 = graph->compile({make_callback_copy(y4, t1)});
    func1->execute();
    auto func2 = graph->compile({make_callback_copy(y4_pad, t2)});
    func2->execute();
    MGB_ASSERT_TENSOR_EQ(t1, t2);
#endif
}
#endif
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
src/opr/test/dnn/convolution.cpp
浏览文件 @
ae6ff2c5
...
...
@@ -2642,7 +2642,7 @@ TEST(TestOprDNN, ConvBiasInt4NCHW) {
cn
);
opr
::
ConvBias
::
Param
param
;
param
.
format
=
opr
::
ConvBias
::
Param
::
Format
::
NCHW
;
param
.
nonlineMode
=
opr
::
ConvBias
::
Param
::
NonlineMode
::
IDENTITY
;
param
.
nonlineMode
=
opr
::
ConvBias
::
Param
::
NonlineMode
::
RELU
;
param
.
stride_h
=
param
.
stride_w
=
S
;
param
.
pad_h
=
param
.
pad_w
=
P
;
Policy
policy
;
...
...
@@ -2719,7 +2719,7 @@ TEST(TestOprDNN, ConvBiasInt4NCHW64) {
cn
);
opr
::
ConvBias
::
Param
param
;
param
.
format
=
opr
::
ConvBias
::
Param
::
Format
::
NCHW64
;
param
.
nonlineMode
=
opr
::
ConvBias
::
Param
::
NonlineMode
::
IDENTITY
;
param
.
nonlineMode
=
opr
::
ConvBias
::
Param
::
NonlineMode
::
RELU
;
param
.
stride_h
=
param
.
stride_w
=
S
;
param
.
pad_h
=
param
.
pad_w
=
P
;
Policy
policy
;
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录