Commit 38067472 — MegEngine 天元 / MegEngine

Authored Feb 11, 2022 by Megvii Engine Team
Committed Feb 27, 2022 by 王彪

fix(dnn/cuda): fix ci

GitOrigin-RevId: 8267e5f9ddd5c6813fcfccf8df197f3c8112fa98
Parent: 1da58ae1
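Taken together, the changes tighten the availability checks of the CUDA conv_bias and convolution-backward-data algorithms (compute-capability guards and an exclusion of the SIGMOID epilogue for the NCHW implicit batched GEMM kernels), derive the filter alignment of the int8 NCHW4 dgrad kernel from the tile configuration instead of hardcoding it, and update two tests to match.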
Showing 7 changed files with 29 additions and 6 deletions (+29 −6):

- dnn/src/cuda/conv_bias/cutlass_convolution_base.cpp (+2 −3)
- dnn/src/cuda/conv_bias/implicit_batched_gemm_float16_nchw_hmma.cpp (+2 −0)
- dnn/src/cuda/conv_bias/implicit_batched_gemm_float32_nchw_fma.cpp (+3 −0)
- dnn/src/cuda/convolution/backward_data/implicit_batched_gemm_float32_nchw_fma.cpp (+1 −0)
- dnn/src/cuda/convolution/backward_data/implicit_gemm_int8_nchw4_dp4a.cpp (+14 −1)
- dnn/test/cuda/chanwise_convolution.cpp (+6 −1)
- dnn/test/cuda/conv_bias.cpp (+1 −1)
dnn/src/cuda/conv_bias/cutlass_convolution_base.cpp

```diff
@@ -245,9 +245,8 @@ std::pair<int, int> get_tensor_alignment(
     int threads = warp_size * algo_param.threadblock_m * algo_param.threadblock_n *
                   algo_param.threadblock_k /
                   (algo_param.warp_m * algo_param.warp_n * algo_param.warp_k);
-    int threadblock_loads = filter.dtype.size(
-            algo_param.threadblock_m * algo_param.threadblock_n *
-            algo_param.threadblock_k);
+    int threadblock_loads = filter.dtype.size(
+            algo_param.threadblock_m * algo_param.threadblock_k);
     int load_per_thread = threadblock_loads / threads;
     if (load_per_thread >= 16)
         alignment_filter = 16;
```
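The fix shrinks the byte count fed to the alignment heuristic from the full threadblock_m × threadblock_n × threadblock_k product down to the threadblock_m × threadblock_k filter tile that is actually staged per main-loop iteration. The heuristic is easiest to see with concrete numbers; below is a self-contained C++ sketch, where the AlgoParam struct and the sample tile sizes are illustrative assumptions rather than values from the source (MegDNN's dtype.size(n) is modeled as dtype_bytes * n, which holds for plain types like float32 and int8):

```cpp
#include <cassert>
#include <cstdio>

// Illustrative stand-in for the tile configuration used in the hunk above.
struct AlgoParam {
    int threadblock_m, threadblock_n, threadblock_k;
    int warp_m, warp_n, warp_k;
};

// Mirrors the heuristic: pick the widest vectorized load (16/8/4 bytes)
// each thread can use when staging the filter tile.
int select_filter_alignment(const AlgoParam& p, int dtype_bytes) {
    constexpr int warp_size = 32;
    // Threads per threadblock = warp_size * (number of warps in the block).
    int threads = warp_size * p.threadblock_m * p.threadblock_n * p.threadblock_k /
                  (p.warp_m * p.warp_n * p.warp_k);
    // Bytes of filter data loaded per main-loop iteration: the tb_m x tb_k
    // tile, as in the fixed code above.
    int threadblock_loads = dtype_bytes * p.threadblock_m * p.threadblock_k;
    int load_per_thread = threadblock_loads / threads;
    if (load_per_thread >= 16)
        return 16;
    if (load_per_thread >= 8)
        return 8;
    return 4;
}

int main() {
    // Hypothetical 128x128x32 threadblock with 64x64x32 warps, float32 filter:
    // threads = 32 * 4 = 128, loads = 4 * 128 * 32 = 16384 bytes.
    AlgoParam p{128, 128, 32, 64, 64, 32};
    int align = select_filter_alignment(p, 4);
    assert(align == 16);
    printf("alignment_filter = %d\n", align);
    return 0;
}
```

With these sample numbers each thread covers 128 bytes of the filter tile, comfortably above the 16-byte threshold, so the widest vectorized load is selected.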
dnn/src/cuda/conv_bias/implicit_batched_gemm_float16_nchw_hmma.cpp

```diff
@@ -30,6 +30,7 @@ bool ConvBiasForwardImpl::AlgoFloat16NCHWHMMAImplicitBatchedGemm::is_available(
     using Format = Param::Format;
     using Sparse = Param::Sparse;
     using Mode = Param::Mode;
+    using NonlineMode = Param::NonlineMode;
     auto&& param = args.opr->param();
     auto&& fm = args.filter_meta;
     RETURN_IF_FALSE(
@@ -37,6 +38,7 @@ bool ConvBiasForwardImpl::AlgoFloat16NCHWHMMAImplicitBatchedGemm::is_available(
             args.src_layout->dtype.enumv() == DTypeEnum::Float16 &&
             args.filter_layout->dtype.enumv() == DTypeEnum::Float16 &&
             args.dst_layout->dtype.enumv() == DTypeEnum::Float16);
+    RETURN_IF_FALSE(param.nonlineMode != NonlineMode::SIGMOID);
     RETURN_IF_FALSE(
             args.bias_layout->ndim <= 0 ||
             (args.bias_layout->dtype.enumv() == DTypeEnum::Float16 &&
```
dnn/src/cuda/conv_bias/implicit_batched_gemm_float32_nchw_fma.cpp

```diff
@@ -23,12 +23,14 @@ bool ConvBiasForwardImpl::AlgoFloat32NCHWFMAImplicitBatchedGemm::is_available(
 #define RETURN_IF_FALSE(stmt_) \
     if (!(stmt_))              \
         return false;
+    RETURN_IF_FALSE(is_compute_capability_required(6, 1));
     RETURN_IF_FALSE(
             args.src_layout->is_contiguous() && args.dst_layout->is_contiguous());
     using Param = param::ConvBias;
     using Format = Param::Format;
     using Sparse = Param::Sparse;
     using Mode = Param::Mode;
+    using NonlineMode = Param::NonlineMode;
     auto&& param = args.opr->param();
     auto&& fm = args.filter_meta;
     RETURN_IF_FALSE(
@@ -36,6 +38,7 @@ bool ConvBiasForwardImpl::AlgoFloat32NCHWFMAImplicitBatchedGemm::is_available(
             args.src_layout->dtype.enumv() == DTypeEnum::Float32 &&
             args.filter_layout->dtype.enumv() == DTypeEnum::Float32 &&
             args.dst_layout->dtype.enumv() == DTypeEnum::Float32);
+    RETURN_IF_FALSE(param.nonlineMode != NonlineMode::SIGMOID);
     RETURN_IF_FALSE(
             args.bias_layout->ndim <= 0 ||
             (args.bias_layout->dtype.enumv() == DTypeEnum::Float32 &&
```
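The RETURN_IF_FALSE helper defined at the top of these hunks turns each availability condition into an early return, so the checks read top-down and the first failure rejects the algorithm. A minimal, self-contained illustration of the pattern (the function and parameter names here are hypothetical, not from the source):

```cpp
// The early-return macro copied from the diff above.
#define RETURN_IF_FALSE(stmt_) \
    if (!(stmt_))              \
        return false;

// Hypothetical availability check built from the macro: the first failing
// condition short-circuits the whole function to false.
bool algo_available(int sm_major, bool layouts_contiguous, bool sigmoid_epilogue) {
    RETURN_IF_FALSE(sm_major >= 6);       // stands in for is_compute_capability_required(6, 1)
    RETURN_IF_FALSE(layouts_contiguous);  // stands in for the is_contiguous() checks
    RETURN_IF_FALSE(!sigmoid_epilogue);   // stands in for nonlineMode != NonlineMode::SIGMOID
    return true;
}
```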
dnn/src/cuda/convolution/backward_data/implicit_batched_gemm_float32_nchw_fma.cpp

```diff
@@ -63,6 +63,7 @@ bool ConvolutionBackwardDataImpl::AlgoFloat32NCHWFMAImplicitBatchedGemm::is_avai
 #define RETURN_IF_FALSE(stmt_) \
     if (!(stmt_))              \
         return false;
+    RETURN_IF_FALSE(is_compute_capability_required(6, 1));
     RETURN_IF_FALSE(
             args.diff_layout->is_contiguous() && args.grad_layout->is_contiguous());
     using Param = param::Convolution;
```
dnn/src/cuda/convolution/backward_data/implicit_gemm_int8_nchw4_dp4a.cpp

```diff
@@ -29,6 +29,19 @@ const void* ConvolutionBackwardDataImpl::AlgoInt8NCHW4DotProdImplicitGemm::
             (sh == 2 && sw == 2)
                     ? cutlass::conv::SpecialOptimizeDesc::DECONV_DOUBLE_UPSAMPLING
                     : cutlass::conv::SpecialOptimizeDesc::NONE;
+    int alignment_filter = 4;
+    constexpr int warp_size = 32;
+    int threads = warp_size * m_algo_param.threadblock_m * m_algo_param.threadblock_n *
+                  m_algo_param.threadblock_k /
+                  (m_algo_param.warp_m * m_algo_param.warp_n * m_algo_param.warp_k);
+    int threadblock_loads = args.filter_layout->dtype.size(
+            m_algo_param.threadblock_m * m_algo_param.threadblock_k);
+    int load_per_thread = threadblock_loads / threads;
+    if (load_per_thread >= 16)
+        alignment_filter = 16;
+    else if (load_per_thread >= 8)
+        alignment_filter = 8;
+    megdnn_assert(load_per_thread >= 4);
     ConvolutionKey key{
             cutlass::conv::Operator::kDgrad,
             NumericTypeID::kS8,
@@ -54,7 +67,7 @@ const void* ConvolutionBackwardDataImpl::AlgoInt8NCHW4DotProdImplicitGemm::
             m_algo_param.stage,
             special_optimization,
             4,
-            4,
+            alignment_filter,
             false};
     return (void*)Singleton::get().operation_table.find_op(key);
 }
```
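Here the same heuristic replaces a previously hardcoded filter alignment in the ConvolutionKey. With one-byte int8 elements, a hypothetical 64×128×32 threadblock with 32×64×32 warps gives 32 · (64·128·32)/(32·64·32) = 128 threads, and the 64×32 filter tile is 2048 bytes, i.e. 16 bytes per thread, so alignment_filter is raised to 16. The new megdnn_assert(load_per_thread >= 4) guarantees the default alignment of 4 is always attainable for any tile configuration the kernel accepts.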
dnn/test/cuda/chanwise_convolution.cpp

```diff
@@ -20,6 +20,7 @@
 #include "test/common/workspace_wrapper.h"
 #include "test/cuda/benchmark.h"
 #include "test/cuda/fixture.h"
+#include "test/cuda/utils.h"
 #include <cuda_profiler_api.h>
 #include <cuda_runtime_api.h>
@@ -510,6 +511,7 @@ void check_chanwise(DType io_type, DType comp_type, Handle* handle, const char*
 #define cb(tag, tbm, tbn, tbk, wm, wn, wk)                                \
     TEST_F(CUDA, CHANWISE_CONVOLUTION_FORWARD_CUTLASS_FMA_##tag) {        \
+        require_compute_capability(6, 1);                                 \
         check_chanwise<ConvolutionForward>(                               \
                 dtype::Float32(), dtype::Float32(), handle_cuda(),        \
                 "FLOAT32_NCHW_FMA_IMPLICIT_BATCHED_GEMM_" #tbm "X" #tbn "X" #tbk \
@@ -522,6 +524,7 @@ MEGDNN_FOREACH_CUTLASS_CHANWISE_CONV_FMA_KERNEL(cb)
 #define cb(tag, tbm, tbn, tbk, wm, wn, wk)                                \
     TEST_F(CUDA, CHANWISE_CONVOLUTION_BACKWARD_DATA_CUTLASS_FMA_##tag) {  \
+        require_compute_capability(6, 1);                                 \
         check_chanwise<ConvolutionBackwardData>(                          \
                 dtype::Float32(), dtype::Float32(), handle_cuda(),        \
                 "FLOAT32_NCHW_FMA_IMPLICIT_BATCHED_GEMM_" #tbm "X" #tbn "X" #tbk \
@@ -544,6 +547,7 @@ MEGDNN_FOREACH_CUTLASS_CHANWISE_CONV_FMA_KERNEL(cb)
 // check both ioc16 and io16xc32
 #define cb(tag, tbm, tbn, tbk, wm, wn, wk)                                \
     TEST_F(CUDA, CHANWISE_CONVOLUTION_FORWARD_CUTLASS_HMMA_##tag) {       \
+        require_compute_capability(7, 0);                                 \
         check_chanwise<ConvolutionForward>(                               \
                 dtype::Float16(), dtype::Float16(), handle_cuda(),        \
                 "FLOAT16_NCHW_HMMA_IMPLICIT_BATCHED_GEMM_" #tbm "X" #tbn "X" #tbk \
@@ -560,6 +564,7 @@ MEGDNN_FOREACH_CUTLASS_CHANWISE_CONV_HMMA_KERNEL(cb)
 #define cb(tag, tbm, tbn, tbk, wm, wn, wk)                                \
     TEST_F(CUDA, CHANWISE_CONVOLUTION_BACKWARD_DATA_CUTLASS_HMMA_##tag) { \
+        require_compute_capability(7, 0);                                 \
         check_chanwise<ConvolutionBackwardData>(                          \
                 dtype::Float16(), dtype::Float16(), handle_cuda(),        \
                 "FLOAT16_NCHW_HMMA_IMPLICIT_BATCHED_GEMM_" #tbm "X" #tbn "X" #tbk \
@@ -1407,7 +1412,7 @@ TEST_F(CUDA, BENCHMARK_CHANWISE_CONV_BACKWARD_DATA_LARGE_KERNEL) {
     bencher.proxy()->target_execution_policy.algo.reset();
     param.compute_mode = param::Convolution::ComputeMode::FLOAT32;
     bencher.set_param(param);
-    auto time_in_ms_pseudo_fp16 = bencher.execs({src, filter, {}}) / RUNS;
+    auto time_in_ms_pseudo_fp16 = bencher.execs({filter, src, src}) / RUNS;
     printf("stride=%zu src=%s, filter=%s, float32: %.2fms %.2fGB/s "
            "float16: %.2fms %.2fGB/s "
```
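The new include "test/cuda/utils.h" supplies the require_compute_capability guard that each test macro now calls before running, so the CUTLASS chanwise tests no longer execute on devices that cannot run the kernels (sm_61 for FMA, sm_70 for HMMA). A minimal sketch of what such a guard checks, assuming it gates on the device's SM version; the actual helper's exact semantics (skip vs. early return) are not shown in this diff:

```cpp
#include <cuda_runtime_api.h>

// Returns true when the current device is at least sm_{major}{minor},
// e.g. 6.1 for the FMA kernels and 7.0 (tensor cores) for the HMMA kernels.
bool meets_compute_capability(int major, int minor) {
    int dev = 0;
    cudaDeviceProp prop;
    cudaGetDevice(&dev);
    cudaGetDeviceProperties(&prop, dev);
    return prop.major > major || (prop.major == major && prop.minor >= minor);
}
```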
dnn/test/cuda/conv_bias.cpp

```diff
@@ -1033,7 +1033,7 @@ TEST_F(CUDA, CONV_BIAS_FORWARD_GROUP) {
             ConvBiasForward::algo_name<ConvBiasForward::DirectParam>(
                     "CUDA:GROUP_CONV", {})
                     .c_str(),
-            {{"CUDNN", {}}}}));
+            {{"DEFAULT:CUDNN", {}}}}));
     ConvBias::Param param;
     param.sparse = ConvBias::Param::Sparse::GROUP;
     param.nonlineMode = mode;
```
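The only change here swaps the requested sub-algorithm name from "CUDNN" to "DEFAULT:CUDNN"; presumably the bare name no longer resolves in the execution-policy lookup, which is consistent with this commit being a CI fix.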