Commit 32c86211

Authored on Jun 04, 2020 by Megvii Engine Team; committed by Xu Xinran on Jun 19, 2020.
Parent: b8d8886e

fix(dnn/cuda): enable cuda algos for nchw quantized

GitOrigin-RevId: 4d1e167b86764ea18a0ea45e58491428b778aa74
Showing 2 changed files with 202 additions and 3 deletions.

  dnn/src/cuda/conv_bias/cudnn_conv_bias_activation.cpp   +15   -3
  dnn/test/cuda/conv_bias.cpp                             +187  -0
dnn/src/cuda/conv_bias/cudnn_conv_bias_activation.cpp

@@ -79,9 +79,11 @@ bool ConvBiasForwardImpl::AlgoCUDNNConvBiasActivation::is_available(
             if (args.src_layout->dtype.category() == DTypeCategory::QUANTIZED)
                 return false;
             MEGDNN_FALLTHRU  // XXX: why?
         case param::ConvBias::NonlineMode::IDENTITY:
-            if (m_cudnn_enum !=
-                CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM) {
+            if (args.src_layout->dtype.category() == DTypeCategory::QUANTIZED)
+                break;
+            if (m_cudnn_enum !=
+                CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM) {
                 // cudnn require algo to
                 // CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM
                 // when activation if IDENTITY
@@ -89,6 +91,8 @@ bool ConvBiasForwardImpl::AlgoCUDNNConvBiasActivation::is_available(
             }
             break;
         case param::ConvBias::NonlineMode::H_SWISH:
+            if (args.src_layout->dtype.category() == DTypeCategory::QUANTIZED)
+                break;
             return false;
         default:
             megdnn_throw(megdnn_mangle("unsupported NonlineMode"));
@@ -226,6 +230,14 @@ void ConvBiasForwardImpl::AlgoCUDNNConvBiasActivation::exec(
         }
         case param::ConvBias::NonlineMode::IDENTITY:
             break;
+        case param::ConvBias::NonlineMode::H_SWISH: {
+            megdnn_assert(args.dst_layout->dtype.category() ==
+                          DTypeCategory::QUANTIZED);
+            auto&& elem_opr = args.handle->create_operator<ElemwiseMultiType>();
+            elem_opr->param().mode = ElemwiseMultiType::Param::Mode::QH_SWISH;
+            elem_opr->exec({*(args.dst_tensor)}, *(args.dst_tensor));
+            break;
+        }
         default:
             megdnn_throw(megdnn_mangle("unsupported NonlineMode"));
     }
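In the exec hunk above, h-swish is applied as a separate elementwise pass over the conv-bias output via ElemwiseMultiType in QH_SWISH mode. For reference, h_swish(x) = x * min(max(x + 3, 0), 6) / 6; on quantized tensors this is conceptually evaluated on dequantized values and the result is requantized. Below is a minimal standalone sketch of that arithmetic, assuming hypothetical per-tensor scales scale_in and scale_out; it shows the reference math only, not the MegDNN kernel:

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    // Reference quantized h-swish for one int8 element: dequantize,
    // apply h_swish(x) = x * relu6(x + 3) / 6, requantize with saturation.
    // scale_in/scale_out are hypothetical quantization scales; the diff
    // applies the op in place on dst_tensor, so both views share a dtype.
    int8_t qh_swish_ref(int8_t q, float scale_in, float scale_out) {
        float x = q * scale_in;                                     // dequantize
        float y = x * std::min(std::max(x + 3.f, 0.f), 6.f) / 6.f;  // h-swish
        long r = std::lround(y / scale_out);                        // requantize
        return static_cast<int8_t>(std::min(127L, std::max(-128L, r)));
    }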
dnn/test/cuda/conv_bias.cpp

@@ -189,6 +189,193 @@ TEST_F(CUDA, CONV_BIAS_FORWARD_QS8) {
     }
 }
+
+TEST_F(CUDA, CONV_BIAS_NCHW_QS8) {
+    //! not support NonlineMode::SIGMOID and NonlineMode::H_SWISH
+    require_compute_capability(6, 1);
+    Checker<ConvBiasForward> checker(handle_cuda());
+    UniformIntRNG int_rng{-128, 127};
+    using NonlineMode = ConvBias::Param::NonlineMode;
+    ConvBias::Param param;
+    param.format = ConvBias::Param::Format::NCHW;
+    checker.set_dtype(0, dtype::QuantizedS8(2.5f))
+            .set_dtype(1, dtype::QuantizedS8(2.5f))
+            .set_dtype(2, dtype::QuantizedS32(6.25f))
+            .set_dtype(3, dtype::QuantizedS8(0.25f))
+            .set_dtype(4, dtype::QuantizedS8(0.25f))
+            .set_rng(0, &int_rng)
+            .set_rng(1, &int_rng)
+            .set_rng(2, &int_rng)
+            .set_rng(3, &int_rng);
+    for (NonlineMode mode : {NonlineMode::RELU, NonlineMode::IDENTITY,
+                             NonlineMode::H_SWISH}) {
+        for (size_t g : {1, 2}) {
+            for (size_t b : {2}) {
+                for (size_t ic : {6, 16}) {
+                    for (size_t oc : {4}) {
+                        for (size_t fh : {1, 3}) {
+                            for (int ph : {static_cast<int>(fh / 2)}) {
+                                for (int sh : {1, 2}) {
+                                    size_t ih = 16, iw = 16;
+                                    param.nonlineMode = mode;
+                                    param.stride_h = param.stride_w = sh;
+                                    param.pad_h = param.pad_w = ph;
+                                    param.sparse =
+                                            ConvBias::Param::Sparse::DENSE;
+                                    checker.set_param(param).execs(
+                                            {{b, ic / 2, ih, iw},
+                                             {oc, ic / 2, fh, fh},
+                                             {1, oc, 1, 1},
+                                             {},
+                                             {}});
+                                    param.sparse =
+                                            ConvBias::Param::Sparse::GROUP;
+                                    checker.set_param(param).execs(
+                                            {{b, ic, ih, iw},
+                                             {g, oc / g, ic / g, fh, fh},
+                                             {1, oc, 1, 1},
+                                             {},
+                                             {}});
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+}
+
+#if MEGDNN_WITH_BENCHMARK
+TEST_F(CUDA, BENCHMARK_CONV_BIAS_NCHW4_INT8) {
+    require_compute_capability(6, 1);
+    Benchmarker<ConvBiasForward> bencher(handle_cuda());
+    bencher.set_display(false);
+    ConvBias::Param param_nchw;
+    param_nchw.format = ConvBias::Param::Format::NCHW;
+    ConvBias::Param param_nchw4;
+    param_nchw4.format = ConvBias::Param::Format::NCHW4;
+    auto i8_min = std::numeric_limits<int8_t>().min();
+    auto i8_max = std::numeric_limits<int8_t>().max();
+    UniformIntRNG int_rng{i8_min, i8_max};
+    param_nchw.nonlineMode = ConvBias::Param::NonlineMode::IDENTITY;
+    auto run_bench = [&](size_t b, size_t ci, size_t hi, size_t wi,
+                         size_t co, size_t fh, size_t fw, size_t sh,
+                         size_t sw, size_t nr_times) {
+        param_nchw.pad_h = fh / 2;
+        param_nchw.pad_w = fw / 2;
+        param_nchw.stride_h = sh;
+        param_nchw.stride_w = sw;
+        param_nchw4.pad_h = fh / 2;
+        param_nchw4.pad_w = fh / 2;
+        param_nchw4.stride_h = sh;
+        param_nchw4.stride_w = sw;
+        bencher.set_times(nr_times)
+                .set_dtype(0, dtype::QuantizedS8(2.5f))
+                .set_dtype(1, dtype::QuantizedS8(2.5f))
+                .set_dtype(2, dtype::QuantizedS32(6.25f))
+                .set_dtype(4, dtype::QuantizedS8(0.35f))
+                .set_rng(0, &int_rng)
+                .set_rng(1, &int_rng)
+                .set_rng(2, &int_rng);
+        bencher.set_param(param_nchw);
+        size_t ho = infer_conv_shape(hi, fh, sh, param_nchw.pad_h);
+        size_t wo = infer_conv_shape(wi, fw, sw, param_nchw.pad_w);
+        TensorShape inp{b, ci, hi, wi}, kern{co, ci, fh, fw},
+                out{b, co, ho, wo};
+        auto time_in_ms =
+                bencher.execs({inp, kern, {1, co, 1, 1}, {}, out}) /
+                nr_times;
+        auto ops_nchw = 2.0 * b * co * ho * wo * ci * fh * fw /
+                        (time_in_ms * 1e-3) * 1e-12;
+        printf("inp=%s, kern=%s, out=%s, time: %.2fms, perf: %.2f Tops "
+               "(NCHW)\n",
+               inp.to_string().c_str(), kern.to_string().c_str(),
+               out.to_string().c_str(), time_in_ms, ops_nchw);
+        bencher.set_param(param_nchw4);
+        decltype(ops_nchw) ops_nchw4;
+        {
+            TensorShape inp{b, ci / 4, hi, wi, 4},
+                    kern{co, ci / 4, fh, fw, 4}, out{b, co / 4, ho, wo, 4};
+            auto time_in_ms =
+                    bencher.execs(
+                            {inp, kern, {1, co / 4, 1, 1, 4}, {}, out}) /
+                    nr_times;
+            ops_nchw4 = 2.0 * b * co * ho * wo * ci * fh * fw /
+                        (time_in_ms * 1e-3) * 1e-12;
+            printf("inp=%s, kern=%s, out=%s, time: %.2fms, perf: %.2f Tops "
+                   "(NCHW4)\n",
+                   inp.to_string().c_str(), kern.to_string().c_str(),
+                   out.to_string().c_str(), time_in_ms, ops_nchw4);
+        }
+        printf("speedup: %.2fx\n", ops_nchw4 / ops_nchw);
+    };
+    // resnet-50
+    // bottleneck-1
+    // proj
+    run_bench(1, 64, 56, 56, 256, 1, 1, 1, 1, 1000);
+    run_bench(1, 64, 56, 56, 64, 1, 1, 1, 1, 1000);
+    run_bench(1, 64, 56, 56, 64, 3, 3, 1, 1, 1000);
+    run_bench(1, 64, 56, 56, 256, 1, 1, 1, 1, 1000);
+    // bottleneck-2
+    // proj
+    run_bench(1, 256, 56, 56, 512, 1, 1, 2, 2, 1000);
+    run_bench(1, 256, 56, 56, 128, 1, 1, 2, 2, 1000);
+    run_bench(1, 128, 28, 28, 128, 3, 3, 1, 1, 1000);
+    run_bench(1, 128, 28, 28, 512, 1, 1, 1, 1, 1000);
+    // bottleneck-3
+    // proj
+    run_bench(1, 512, 28, 28, 1024, 1, 1, 2, 2, 1000);
+    run_bench(1, 512, 28, 28, 256, 1, 1, 2, 2, 1000);
+    run_bench(1, 256, 14, 14, 256, 3, 3, 1, 1, 1000);
+    run_bench(1, 256, 14, 14, 1024, 1, 1, 1, 1, 1000);
+    // bottleneck-4
+    // proj
+    run_bench(1, 1024, 14, 14, 2048, 1, 1, 2, 2, 1000);
+    run_bench(1, 1024, 14, 14, 512, 1, 1, 2, 2, 1000);
+    run_bench(1, 512, 7, 7, 512, 3, 3, 1, 1, 1000);
+    run_bench(1, 512, 7, 7, 2048, 1, 1, 1, 1, 1000);
+    run_bench(32, 64, 56, 56, 256, 1, 1, 1, 1, 1000);
+    run_bench(32, 64, 56, 56, 64, 1, 1, 1, 1, 1000);
+    run_bench(32, 64, 56, 56, 64, 3, 3, 1, 1, 1000);
+    run_bench(32, 64, 56, 56, 256, 1, 1, 1, 1, 1000);
+    run_bench(32, 256, 56, 56, 512, 1, 1, 2, 2, 1000);
+    run_bench(32, 256, 56, 56, 128, 1, 1, 2, 2, 1000);
+    run_bench(32, 128, 28, 28, 128, 3, 3, 1, 1, 1000);
+    run_bench(32, 128, 28, 28, 512, 1, 1, 1, 1, 1000);
+    run_bench(32, 512, 28, 28, 1024, 1, 1, 2, 2, 1000);
+    run_bench(32, 512, 28, 28, 256, 1, 1, 2, 2, 1000);
+    run_bench(32, 256, 14, 14, 256, 3, 3, 1, 1, 1000);
+    run_bench(32, 256, 14, 14, 1024, 1, 1, 1, 1, 1000);
+    run_bench(32, 1024, 14, 14, 2048, 1, 1, 2, 2, 1000);
+    run_bench(32, 1024, 14, 14, 512, 1, 1, 2, 2, 1000);
+    run_bench(32, 512, 7, 7, 512, 3, 3, 1, 1, 1000);
+    run_bench(32, 512, 7, 7, 2048, 1, 1, 1, 1, 1000);
+    run_bench(256, 64, 56, 56, 256, 1, 1, 1, 1, 1000);
+    run_bench(256, 64, 56, 56, 64, 1, 1, 1, 1, 1000);
+    run_bench(256, 64, 56, 56, 64, 3, 3, 1, 1, 1000);
+    run_bench(256, 64, 56, 56, 256, 1, 1, 1, 1, 1000);
+    run_bench(256, 256, 56, 56, 512, 1, 1, 2, 2, 1000);
+    run_bench(256, 256, 56, 56, 128, 1, 1, 2, 2, 1000);
+    run_bench(256, 128, 28, 28, 128, 3, 3, 1, 1, 1000);
+    run_bench(256, 128, 28, 28, 512, 1, 1, 1, 1, 1000);
+    run_bench(256, 512, 28, 28, 1024, 1, 1, 2, 2, 1000);
+    run_bench(256, 512, 28, 28, 256, 1, 1, 2, 2, 1000);
+    run_bench(256, 256, 14, 14, 256, 3, 3, 1, 1, 1000);
+    run_bench(256, 256, 14, 14, 1024, 1, 1, 1, 1, 1000);
+    run_bench(256, 1024, 14, 14, 2048, 1, 1, 2, 2, 1000);
+    run_bench(256, 1024, 14, 14, 512, 1, 1, 2, 2, 1000);
+    run_bench(256, 512, 7, 7, 512, 3, 3, 1, 1, 1000);
+    run_bench(256, 512, 7, 7, 2048, 1, 1, 1, 1, 1000);
+}
+#endif
+
 TEST_F(CUDA, CONV_BIAS_FORWARD_NCHW4) {
     require_compute_capability(6, 1);
     using namespace conv_bias;
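A note on the benchmark arithmetic above: throughput is computed as 2 * b * co * ho * wo * ci * fh * fw operations (one multiply and one add per MAC) divided by the measured time, scaled to Tops, and infer_conv_shape presumably implements the usual output-size formula (i + 2p - f) / s + 1. A self-contained sketch of the same bookkeeping for the first ResNet-50 case, with a placeholder timing (out_shape is a hypothetical helper, not the MegDNN test utility):

    #include <cstddef>
    #include <cstdio>

    // Usual convolution output-size formula: (i + 2p - f) / s + 1.
    static size_t out_shape(size_t i, size_t f, size_t s, size_t p) {
        return (i + 2 * p - f) / s + 1;
    }

    int main() {
        // First benchmarked case: run_bench(1, 64, 56, 56, 256, 1, 1, 1, 1, 1000).
        size_t b = 1, ci = 64, hi = 56, wi = 56, co = 256;
        size_t fh = 1, fw = 1, sh = 1, sw = 1;
        size_t ho = out_shape(hi, fh, sh, fh / 2);
        size_t wo = out_shape(wi, fw, sw, fw / 2);
        double ops = 2.0 * b * co * ho * wo * ci * fh * fw;  // 2 ops per MAC
        double time_in_ms = 0.01;  // placeholder; the benchmark measures this
        printf("ho=%zu wo=%zu perf=%.3f Tops\n", ho, wo,
               ops / (time_in_ms * 1e-3) * 1e-12);
        return 0;
    }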