MegEngine · Commit 25089e52

refactor(megdnn): refactor matmul algo in conv backward data

GitOrigin-RevId: 8de601df6e7ddcdb48cd38d6ff05d132c1a014fe

Authored on Jan 26, 2021 by Megvii Engine Team
Parent commit: 3620a940
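In short, per the diff below: the MATMUL algorithm of the CUDA convolution backward-data operator previously built and configured its MatrixMulForward operator ad hoc inside exec_internal and asserted that the matmul needed no extra workspace. It is now exposed as a proper sub-operator: a new sub_opr_config helper derives the matmul layouts and param once, get_subopr_list reports them to the algorithm-search machinery, and both get_workspace_in_bytes and exec_internal append the chosen matmul algorithm's own workspace requirement to the workspace bundle. To support this, matmul_get_workspace_bundle now returns a raw SmallVector<size_t> of sizes rather than a WorkspaceBundle, and a new test exercises MATMUL with CUBLAS pinned as its matmul sub-algorithm.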
Changes: 6 files, +159 −72

    dnn/src/cuda/convolution/backward_data/algo.h         +4    -0
    dnn/src/cuda/convolution/backward_data/matmul.cpp     +114  -65
    dnn/src/cuda/convolution/backward_filter/matmul.cpp   +5    -3
    dnn/src/cuda/convolution/helper.cpp                   +2    -2
    dnn/src/cuda/convolution/helper.h                     +2    -1
    dnn/test/cuda/convolution.cpp                         +32   -1
dnn/src/cuda/convolution/backward_data/algo.h

@@ -141,6 +141,10 @@ public:
     size_t get_workspace_in_bytes(const SizeArgs& args) const override;
     void exec(const ExecArgs& args) const override;
+
+    std::vector<SearchItem> get_subopr_list(
+            const TensorLayoutArray& layouts,
+            const OperatorBase* opr) const override;
     const char* name() const override { return "MATMUL"; }
     bool is_reproducible() const override { return true; }
     MEGDNN_DECL_ALGO_TYPE(CUDA_MATMUL)
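The four added lines declare get_subopr_list for AlgoMatmul. Its definition in matmul.cpp below reports the MatrixMulForward sub-operator this algo relies on, as tensor layouts plus a serialized matmul param, so the algorithm-selection machinery can search and record a concrete matmul implementation for it.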
dnn/src/cuda/convolution/backward_data/matmul.cpp

@@ -6,42 +6,101 @@
  *
  * Unless required by applicable law or agreed to in writing,
  * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ * implied.
  */
 #include "./algo.h"
-#include "src/cuda/utils.h"
 #include "src/cuda/convolution/helper.h"
 #include "src/cuda/convolution/im2col.cuh"
 #include "src/cuda/matrix_mul/opr_impl.h"
+#include "src/cuda/utils.h"
 
 using namespace megdnn;
 using namespace cuda;
 
+namespace {
+std::pair<TensorLayoutArray, MatrixMulForward::Param> sub_opr_config(
+        const ConvolutionBackwardDataImpl::CanonizedFilterMeta& fm,
+        const TensorLayout& filter_layout, const TensorLayout& diff_layout,
+        const TensorLayout& grad_layout, const ConvolutionBackwardDataImpl* opr) {
+    size_t N = grad_layout.shape[0], IC = fm.icpg, OC = fm.ocpg,
+           OH = diff_layout.shape[2], OW = diff_layout.shape[3],
+           FH = fm.spatial[0], FW = fm.spatial[1];
+    megdnn_assert(filter_layout.dtype.enumv() == diff_layout.dtype.enumv());
+    TensorLayout Al({OC, IC * FH * FW}, filter_layout.dtype),
+            Bl({IC * FH * FW, OH * OW * N}, filter_layout.dtype),
+            Cl({OC, OH * OW * N}, filter_layout.dtype);
+
+    MatrixMulForward::Param param;
+    if (opr->param().compute_mode == param::Convolution::ComputeMode::FLOAT32) {
+        param.compute_mode = param::MatrixMul::ComputeMode::FLOAT32;
+    }
+    param.transposeA = true;
+    return {{Al, Cl, Bl}, param};
+}
+}  // namespace
+
+std::vector<Algorithm::SearchItem>
+ConvolutionBackwardDataImpl::AlgoMatmul::get_subopr_list(
+        const TensorLayoutArray& layouts, const OperatorBase* opr) const {
+    const ConvolutionBackwardDataImpl* conv_backward_data_opr =
+            static_cast<const ConvolutionBackwardDataImpl*>(opr);
+    CanonizedFilterMeta fm = conv_backward_data_opr->check_layout_fwd(
+            layouts[2], layouts[0], layouts[1]);
+    auto&& config = sub_opr_config(fm, layouts[0], layouts[1], layouts[2],
+                                   conv_backward_data_opr);
+
+    std::string param_str;
+    Algorithm::serialize_write_pod(config.second, param_str);
+    return {{Algorithm::OprType::MATRIX_MUL_FORWARD, param_str, config.first}};
+}
+
 bool ConvolutionBackwardDataImpl::AlgoMatmul::is_available(
         const SizeArgs& args) const {
     if (args.diff_layout->dtype == args.filter_layout->dtype &&
         args.diff_layout->dtype == dtype::BFloat16()) {
         return false;
     }
     auto&& fm = args.filter_meta;
     return args.filter_meta.format == Param::Format::NCHW &&
            args.diff_layout->dtype.category() == DTypeCategory::FLOAT &&
            fm.group == 1 && fm.spatial_ndim == 2;
 }
 
 size_t ConvolutionBackwardDataImpl::AlgoMatmul::get_workspace_in_bytes(
         const SizeArgs& args) const {
-    return matmul_get_workspace_bundle(args.as_fwd_args()).total_size_in_bytes();
+    auto matmul_opr = args.handle->create_operator<MatrixMulForward>();
+    if (args.opr->execution_policy().algo.valid() &&
+        !args.opr->execution_policy().sub_policy.empty()) {
+        megdnn_assert(args.opr->execution_policy().sub_policy.size() == 1);
+        matmul_opr->execution_policy() =
+                args.opr->execution_policy().sub_policy[0];
+    }
+
+    auto&& config = sub_opr_config(args.filter_meta, *args.filter_layout,
+                                   *args.diff_layout, *args.grad_layout, args.opr);
+    matmul_opr->param() = config.second;
+
+    auto&& sizes = matmul_get_workspace_bundle(args.as_fwd_args());
+    sizes.push_back(matmul_opr->get_workspace_in_bytes(
+            config.first[0], config.first[1], config.first[2]));
+    return WorkspaceBundle(nullptr, sizes).total_size_in_bytes();
 }
 
 void ConvolutionBackwardDataImpl::AlgoMatmul::exec(const ExecArgs& args) const {
 #define cb(DType)                                        \
     if (args.diff_layout->dtype == DType()) {            \
         using ctype = typename DTypeTrait<DType>::ctype; \
         exec_internal<ctype>(args);                      \
         return;                                          \
     }
     MEGDNN_FOREACH_COMPUTING_DTYPE_FLOAT(cb)
 #undef cb
@@ -49,77 +108,67 @@ void ConvolutionBackwardDataImpl::AlgoMatmul::exec(const ExecArgs &args) const {
     megdnn_assert_internal(0);
 }
 
 template <typename T>
 void ConvolutionBackwardDataImpl::AlgoMatmul::exec_internal(
         const ExecArgs& args) {
     auto&& fm = args.filter_meta;
     size_t N = args.grad_layout->shape[0], IC = fm.icpg,
            IH = args.grad_layout->shape[2], IW = args.grad_layout->shape[3],
            OC = fm.ocpg, OH = args.diff_layout->shape[2],
            OW = args.diff_layout->shape[3], FH = fm.spatial[0],
            FW = fm.spatial[1], PH = fm.padding[0], PW = fm.padding[1],
            SH = fm.stride[0], SW = fm.stride[1], DH = fm.dilation[0],
            DW = fm.dilation[1];
     auto stream = cuda_stream(args.handle);
-    auto wbundle = matmul_get_workspace_bundle(args.as_fwd_args());
-    wbundle.set(args.workspace.raw_ptr);
+
+    auto matmul_opr = args.handle->create_operator<MatrixMulForward>();
+    if (args.opr->execution_policy().algo.valid()) {
+        megdnn_assert(args.opr->execution_policy().sub_policy.size() == 1);
+        matmul_opr->execution_policy() =
+                args.opr->execution_policy().sub_policy[0];
+    }
+    auto&& config = sub_opr_config(args.filter_meta, *args.filter_layout,
+                                   *args.diff_layout, *args.grad_layout, args.opr);
+    matmul_opr->param() = config.second;
+
+    auto&& sizes = matmul_get_workspace_bundle(args.as_fwd_args());
+    sizes.push_back(matmul_opr->get_workspace_in_bytes(
+            config.first[0], config.first[1], config.first[2]));
+    auto wbundle = WorkspaceBundle(args.workspace.raw_ptr, sizes);
+
     T* diff_t = static_cast<T*>(wbundle.get(0));
     T* col = static_cast<T*>(wbundle.get(1));
     {
         // transpose diff
         TensorLayout froml({N, OC * OH * OW}, typename DTypeTrait<T>::dtype()),
                 tol(froml);
         froml.stride[0] = args.diff_layout->stride[0];
         tol.stride[0] = 1;
         tol.stride[1] = N;
         TensorND from(args.diff_tensor->ptr<T>(), froml), to(diff_t, tol);
         args.handle->relayout_opr()->exec(from, to);
     }
     {
         // take gemm grad
         TensorLayout Al({OC, IC * FH * FW}, typename DTypeTrait<T>::dtype()),
                 Bl({IC * FH * FW, OH * OW * N}, typename DTypeTrait<T>::dtype()),
                 Cl({OC, OH * OW * N}, typename DTypeTrait<T>::dtype());
         TensorND A(args.filter_tensor->ptr<T>(), Al), B(col, Bl), C(diff_t, Cl);
         if (fm.should_flip) {
             convolution::flip_filter(args.as_fwd_args(),
                                      wbundle.get_workspace(2), A.raw_ptr);
-        }
-        auto&& matmul_opr = args.handle->create_operator<MatrixMulForward>();
-        if (args.opr->param().compute_mode ==
-            param::Convolution::ComputeMode::FLOAT32) {
-            matmul_opr->param().compute_mode =
-                    param::MatrixMul::ComputeMode::FLOAT32;
-        }
-        matmul_opr->param().transposeA = true;
-        megdnn_assert(matmul_opr->get_workspace_in_bytes(A.layout, C.layout,
-                                                         B.layout) == 0_z,
-                      "Assume matmul opr in algo MATMUL doesn't need extra "
-                      "workspace");
-        matmul_opr->exec(A, C, B, Workspace());
+            matmul_opr->exec(A, C, B, wbundle.get_workspace(3));
+        } else {
+            matmul_opr->exec(A, C, B, wbundle.get_workspace(2));
+        }
     }
     {
         // col2im
         convolution::col2im<T>(col, args.grad_tensor->ptr<T>(), N,
                                args.grad_layout->stride[0], IC, IH, IW, FH, FW,
                                OH, OW, PH, PW, SH, SW, DH, DW, stream);
     }
 }
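For orientation, this algorithm is the classic im2col/col2im formulation of backward data: with transposeA set, the matmul sub-opr computes the column matrix B = A^T * C, where A is the filter viewed as (OC, IC*FH*FW), C is the relayouted diff (OC, OH*OW*N), and col2im folds B = (IC*FH*FW, OH*OW*N) back into the input gradient. The following standalone sketch of that shape bookkeeping is illustrative only (plain C++, independent of megdnn; the struct name and sample values are made up):

    #include <cstddef>
    #include <cstdio>

    // Illustrative stand-in mirroring the shape arithmetic in sub_opr_config().
    struct ConvShapes {
        size_t N, IC, IH, IW, OC, FH, FW, PH, PW, SH, SW, DH, DW;
    };

    int main() {
        ConvShapes s{8, 16, 32, 32, 32, 3, 3, 1, 1, 1, 1, 1, 1};
        // Standard dilated-convolution output-size formula.
        size_t OH = (s.IH + 2 * s.PH - (s.DH * (s.FH - 1) + 1)) / s.SH + 1;
        size_t OW = (s.IW + 2 * s.PW - (s.DW * (s.FW - 1) + 1)) / s.SW + 1;

        // With param.transposeA = true, the sub-opr computes B = A^T * C:
        //   A (filter)  : OC x (IC*FH*FW)
        //   C (diff^T)  : OC x (OH*OW*N)
        //   B (columns) : (IC*FH*FW) x (OH*OW*N), consumed by col2im.
        std::printf("A: %zu x %zu\n", s.OC, s.IC * s.FH * s.FW);
        std::printf("C: %zu x %zu\n", s.OC, OH * OW * s.N);
        std::printf("B: %zu x %zu\n", s.IC * s.FH * s.FW, OH * OW * s.N);
        return 0;
    }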
dnn/src/cuda/convolution/backward_filter/matmul.cpp

@@ -31,8 +31,9 @@ bool ConvolutionBackwardFilterImpl::AlgoMatmul::is_available(
 size_t ConvolutionBackwardFilterImpl::AlgoMatmul::get_workspace_in_bytes(
         const SizeArgs& args) const {
-    return matmul_get_workspace_bundle(args.as_fwd_args()).total_size_in_bytes();
+    return WorkspaceBundle(nullptr, matmul_get_workspace_bundle(args.as_fwd_args()))
+            .total_size_in_bytes();
 }
 
 void ConvolutionBackwardFilterImpl::AlgoMatmul::exec(

@@ -69,7 +70,8 @@ void ConvolutionBackwardFilterImpl::AlgoMatmul::exec_internal(
            DH = fm.dilation[0], DW = fm.dilation[1];
     auto stream = cuda_stream(args.handle);
-    auto wbundle = matmul_get_workspace_bundle(args.as_fwd_args());
+    auto wbundle = WorkspaceBundle(
+            nullptr, matmul_get_workspace_bundle(args.as_fwd_args()));
     wbundle.set(args.workspace.raw_ptr);
     T* diff_t = static_cast<T*>(wbundle.get(0));
     T* col = static_cast<T*>(wbundle.get(1));
dnn/src/cuda/convolution/helper.cpp

@@ -48,7 +48,7 @@ bool convolution::is_cudnn_supported(const ForwardSizeArgs &args) {
     return supported;
 }
 
-WorkspaceBundle convolution::matmul_get_workspace_bundle(
+SmallVector<size_t> convolution::matmul_get_workspace_bundle(
         const ForwardSizeArgs& args) {
     auto dtype = args.src_layout->dtype;
     auto&& fm = args.filter_meta;

@@ -67,7 +67,7 @@ WorkspaceBundle convolution::matmul_get_workspace_bundle(
     if (args.filter_meta.should_flip) {
         sizes.push_back(dtype.size() * OC * IC * FH * FW);
     }
-    return {nullptr, std::move(sizes)};
+    return sizes;
 }
 
 void convolution::flip_filter(const ForwardSizeArgs& args,
dnn/src/cuda/convolution/helper.h

@@ -34,7 +34,8 @@ namespace convolution {
 bool is_cudnn_supported(const ForwardSizeArgs& args);
 
 //! get workspace bundle for matmul algo
-WorkspaceBundle matmul_get_workspace_bundle(const ForwardSizeArgs& args);
+SmallVector<size_t> matmul_get_workspace_bundle(
+        const ForwardSizeArgs& args);
 
 struct CUDNNForwardDescs {
     TensorDesc src_desc, dst_desc;
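This signature change is the pivot of the workspace refactor: matmul_get_workspace_bundle now reports only raw chunk sizes, and each caller binds them to memory when appropriate, with a null base pointer when merely querying the total (as in get_workspace_in_bytes) or with the real workspace pointer at exec time. A simplified standalone sketch of the pattern follows (SimpleBundle is a hypothetical stand-in, not megdnn's actual WorkspaceBundle, which additionally aligns each chunk):

    #include <cstddef>
    #include <cstdint>
    #include <vector>

    // Holds per-chunk sizes; a base pointer can be bound later.
    class SimpleBundle {
        void* m_base;
        std::vector<size_t> m_sizes;

    public:
        SimpleBundle(void* base, std::vector<size_t> sizes)
                : m_base(base), m_sizes(std::move(sizes)) {}

        // Total bytes needed; valid even when constructed with nullptr.
        size_t total_size_in_bytes() const {
            size_t total = 0;
            for (size_t s : m_sizes) total += s;
            return total;
        }

        void set(void* base) { m_base = base; }

        // Pointer to the i-th chunk, relative to the bound base pointer.
        void* get(size_t i) const {
            uintptr_t p = reinterpret_cast<uintptr_t>(m_base);
            for (size_t j = 0; j < i; ++j) p += m_sizes[j];
            return reinterpret_cast<void*>(p);
        }
    };

    int main() {
        std::vector<size_t> sizes{256, 1024};  // e.g. the diff_t and col chunks
        sizes.push_back(512);                  // e.g. the matmul sub-opr's workspace

        // Query phase: no memory yet, just the total.
        size_t need = SimpleBundle(nullptr, sizes).total_size_in_bytes();

        // Exec phase: bind the caller-provided buffer.
        std::vector<unsigned char> buf(need);
        SimpleBundle wbundle(buf.data(), sizes);
        void* diff_t = wbundle.get(0);
        void* col = wbundle.get(1);
        (void)diff_t;
        (void)col;
        return 0;
    }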
dnn/test/cuda/convolution.cpp

@@ -230,7 +230,7 @@ TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA)
     }
     checker.set_before_exec_callback(AlgoChecker<ConvolutionBackwardData>(
             ExecutionPolicyAlgoName{"CONVOLUTION_BACKWARD_DATD_BFLOAT16",
-                                    {{"MATMUL", {}}}}));
+                                    {{"MATMUL", {{"CUBLAS", {}}}}}}));
     src.dtype = dst.dtype = filter.dtype = dtype::BFloat16();
     arg.param.compute_mode = param::Convolution::ComputeMode::FLOAT32;
     checker.set_rng(0, &rng)

@@ -243,6 +243,37 @@ TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA)
     }
 }
 
+TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA_MATMUL) {
+    using namespace convolution;
+    std::vector<TestArg> args = get_args_cuda_conv_bwd_data();
+    Checker<ConvolutionBackwardData> checker(handle_cuda());
+    checker.set_before_exec_callback(AlgoChecker<ConvolutionBackwardData>(
+            ExecutionPolicyAlgoName{"MATMUL", {{"CUBLAS", {}}}}));
+    NormalRNG default_rng;
+    for (auto&& arg : args) {
+        float scale = 64.f / sqrt(arg.filter[0] * arg.filter[2] * arg.filter[3]);
+        UniformFloatRNG rng(scale, 2 * scale);
+        auto src = TensorLayout(arg.src, dtype::Float32());
+        auto filter = TensorLayout(arg.filter, dtype::Float32());
+        TensorLayout dst;
+        {
+            auto opr = handle_cuda()->create_operator<Convolution>();
+            opr->param() = arg.param;
+            opr->deduce_layout(src, filter, dst);
+        }
+        src.dtype = dst.dtype = filter.dtype = dtype::Float32();
+        checker.set_rng(0, &default_rng)
+                .set_rng(1, &default_rng)
+                .set_epsilon(1e-3)
+                .set_param(arg.param)
+                .exec(TensorLayoutArray{filter, dst, src});
+    }
+}
+
 TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA_FAILED_CUDNN7_5) {
     // BRAIN-481 failed on architectures 7.0, remove the following if statement,
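The new CONVOLUTION_BACKWARD_DATA_MATMUL test deduces the forward output layout for each test arg, then verifies the backward-data operator in Float32 with the MATMUL algorithm pinned and CUBLAS forced as its matmul sub-algorithm; the existing BFloat16 test is updated to name that sub-algorithm explicitly, since MATMUL now dispatches through a matmul sub-opr.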