MegEngine 天元 / MegEngine
Commit 756c1eb7
Authored Oct 13, 2020 by Megvii Engine Team
fix(mgb/dnn): add cuda float naive matmul algo
GitOrigin-RevId: db7f7fc05712d09cb918931da31233faf60d5896
Parent: 4f0e6eae
Showing 4 changed files with 121 additions and 18 deletions (+121 -18):
dnn/src/cuda/matrix_mul/naive.cpp  +52  -7
dnn/src/cuda/matrix_mul/naive.cu   +26  -9
dnn/src/cuda/matrix_mul/naive.cuh  +3   -2
dnn/test/cuda/matrix_mul.cpp       +40  -0
dnn/src/cuda/matrix_mul/naive.cpp
@@ -17,8 +17,21 @@
 using namespace megdnn;
 using namespace cuda;
 
+#include "midout.h"
+
+MIDOUT_DECL(megdnn_naive_matmul)
+
 bool MatrixMulForwardImpl::AlgoNaive::is_available(const SizeArgs& args) const {
-    return args.can_be_treated_as_int8x8x32();
+    if (args.can_be_treated_as_int8x8x32())
+        return true;
+    auto&& layout_a = args.layout_a;
+    auto&& layout_b = args.layout_b;
+    auto&& layout_c = args.layout_c;
+    return layout_a.dtype.enumv() == layout_b.dtype.enumv() &&
+           (layout_a.dtype.enumv() == DTypeEnum::Float32 ||
+            layout_a.dtype.enumv() == DTypeEnum::Float16) &&
+           (layout_c.dtype.enumv() == DTypeEnum::Float32 ||
+            layout_c.dtype.enumv() == DTypeEnum::Float16) &&
+           args.opr->param().format == param::MatrixMul::Format::DEFAULT;
 }
 
 void MatrixMulForwardImpl::AlgoNaive::exec(const ExecArgs& args) const {
     auto&& param = args.opr->param();
@@ -28,13 +41,45 @@ void MatrixMulForwardImpl::AlgoNaive::exec(const ExecArgs& args) const {
            LDB = args.tensor_b.layout.stride[0],
            LDC = args.tensor_c.layout.stride[0];
-    int8_t* A = args.tensor_a.compatible_ptr<dt_int8>();
-    int8_t* B = args.tensor_b.compatible_ptr<dt_int8>();
-    int32_t* C = args.tensor_c.compatible_ptr<dt_int32>();
     auto&& handle = concrete_handle(args.opr->handle());
-    exec_gemm_int8_naive(A, B, C, m, n, k, LDA, LDB, LDC, param.transposeA,
-                         param.transposeB, cuda_stream(handle));
+    using ComputeMode = Param::ComputeMode;
+#define DISPATCH_CMODE(in_dt, out_dt, in_ct, out_ct, comp_ct, cmode)          \
+    MIDOUT_BEGIN(megdnn_naive_matmul, midout_iv(#in_dt #out_dt #in_ct,        \
+                                                #out_ct, #comp_ct, #cmode)) { \
+        do {                                                                  \
+            using namespace dtype;                                            \
+            if (args.tensor_a.layout.dtype.enumv() ==                         \
+                        DTypeTrait<in_dt>::enumv &&                           \
+                args.tensor_c.layout.dtype.enumv() ==                         \
+                        DTypeTrait<out_dt>::enumv &&                          \
+                param.compute_mode == cmode) {                                \
+                in_ct* A = args.tensor_a.compatible_ptr<in_ct>();             \
+                in_ct* B = args.tensor_b.compatible_ptr<in_ct>();             \
+                out_ct* C = args.tensor_c.compatible_ptr<out_ct>();           \
+                exec_gemm_naive<in_ct, in_ct, out_ct, comp_ct>(               \
+                        A, B, C, m, n, k, LDA, LDB, LDC, param.transposeA,    \
+                        param.transposeB, cuda_stream(handle));               \
+                return;                                                       \
+            }                                                                 \
+        } while (0);                                                          \
+    }                                                                         \
+    MIDOUT_END();
+#define DISPATCH(in_dt, out_dt, in_ct, out_ct, comp_ct) \
+    DISPATCH_CMODE(in_dt, out_dt, in_ct, out_ct, comp_ct, ComputeMode::DEFAULT)
+    DISPATCH(Float32, Float32, dt_float32, dt_float32, dt_float32);
+    DISPATCH(Float16, Float16, dt_float16, dt_float16, dt_float16);
+    DISPATCH(Int8, Int32, dt_int8, dt_int32, dt_int32);
+    DISPATCH(QuantizedS8, QuantizedS32, dt_int8, dt_int32, dt_int32);
+    DNN_INC_FLOAT16(DISPATCH_CMODE(Float16, Float16, dt_float16, dt_float16,
+                                   dt_float32, ComputeMode::FLOAT32));
+#undef DISPATCH_CMODE
+#undef DISPATCH
+    megdnn_throw(ssprintf("unsupported Matmul(%s, %s) -> %s with cmode = %d",
+                          args.layout_a.dtype.name(),
+                          args.layout_b.dtype.name(),
+                          args.layout_c.dtype.name(),
+                          static_cast<int>(param.compute_mode)));
 }
 
 // vim: syntax=cpp.doxygen
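
Note (editor's sketch, not part of the commit): each DISPATCH line above expands, once the midout instrumentation is stripped, into a runtime dtype check followed by a typed kernel launch. For example, DISPATCH(Float32, Float32, dt_float32, dt_float32, dt_float32) reduces to roughly the following, where `args`, `param`, `m`, `n`, `k`, LDA/LDB/LDC and `handle` are the locals of exec() above:

    // Hand-expanded sketch of one DISPATCH case inside exec().
    if (args.tensor_a.layout.dtype.enumv() == DTypeTrait<dtype::Float32>::enumv &&
        args.tensor_c.layout.dtype.enumv() == DTypeTrait<dtype::Float32>::enumv &&
        param.compute_mode == Param::ComputeMode::DEFAULT) {
        dt_float32* A = args.tensor_a.compatible_ptr<dt_float32>();
        dt_float32* B = args.tensor_b.compatible_ptr<dt_float32>();
        dt_float32* C = args.tensor_c.compatible_ptr<dt_float32>();
        exec_gemm_naive<dt_float32, dt_float32, dt_float32, dt_float32>(
                A, B, C, m, n, k, LDA, LDB, LDC, param.transposeA,
                param.transposeB, cuda_stream(handle));
        return;  // matched; otherwise fall through to the next DISPATCH
    }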
dnn/src/cuda/matrix_mul/naive.cu
@@ -14,16 +14,18 @@
 #include "src/cuda/utils.cuh"
 
 namespace {
 
-__global__ void do_exec(const int8_t* A, const int8_t* B, int32_t* C, size_t M,
+template <typename AType, typename BType, typename CType, typename CompType>
+__global__ void do_exec(const AType* A, const BType* B, CType* C, size_t M,
                         size_t N, size_t K, size_t LDA, size_t LDB, size_t LDC,
                         bool transA, bool transB) {
     size_t m = blockIdx.x;
     for (; m < M; m += gridDim.x) {
         size_t n = threadIdx.x;
         for (; n < N; n += blockDim.x) {
-            int32_t res = 0;
+            CompType res = static_cast<CompType>(0);
             for (size_t k = 0; k < K; ++k) {
-                int8_t av = transA ? A[k * LDA + m] : A[m * LDA + k],
+                AType av = transA ? A[k * LDA + m] : A[m * LDA + k],
                       bv = transB ? B[n * LDB + k] : B[k * LDB + n];
                 res += av * bv;
             }
@@ -36,14 +38,29 @@ __global__ void do_exec(const int8_t* A, const int8_t* B, int32_t* C, size_t M,
 namespace megdnn {
 namespace cuda {
 
-void exec_gemm_int8_naive(const int8_t* A, const int8_t* B, int32_t* C,
-                          size_t M, size_t N, size_t K, size_t LDA, size_t LDB,
-                          size_t LDC, bool transA, bool transB,
-                          cudaStream_t stream) {
-    do_exec<<<128, 128, 0, stream>>>(A, B, C, M, N, K, LDA, LDB, LDC, transA,
-                                     transB);
+template <typename AType, typename BType, typename CType, typename CompType>
+void exec_gemm_naive(const AType* A, const BType* B, CType* C, size_t M,
+                     size_t N, size_t K, size_t LDA, size_t LDB, size_t LDC,
+                     bool transA, bool transB, cudaStream_t stream) {
+    do_exec<AType, BType, CType, CompType><<<128, 128, 0, stream>>>(
+            A, B, C, M, N, K, LDA, LDB, LDC, transA, transB);
 }
 
+#define INST(in_ct, out_ct, comp_ct)                                        \
+    template void exec_gemm_naive<typename in_ct, typename in_ct,           \
+                                  typename out_ct, typename comp_ct>(       \
+            const in_ct* A, const in_ct* B, out_ct* C, size_t M, size_t N,  \
+            size_t K, size_t LDA, size_t LDB, size_t LDC, bool transA,      \
+            bool transB, cudaStream_t stream);
+
+INST(megdnn::dt_float32, megdnn::dt_float32, megdnn::dt_float32)
+INST(megdnn::dt_float16, megdnn::dt_float16, megdnn::dt_float16)
+INST(megdnn::dt_int8, megdnn::dt_int32, megdnn::dt_int32)
+INST(megdnn::dt_float16, megdnn::dt_float16, megdnn::dt_float32)
+
+#undef cb
+#undef INST
+
 }  // namespace cuda
 }  // namespace megdnn
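
Note (editor's sketch, not part of the commit): the kernel above is a grid-stride triple loop — gridDim.x strides over rows, blockDim.x over columns, and the products are accumulated in CompType. A self-contained host-side reference of the same computation (the final store to C sits in the collapsed context of the diff, so its exact form here is assumed) would be:

    #include <cstddef>

    // Illustrative CPU reference of the GEMM computed by do_exec:
    // C[m * LDC + n] = sum_k op(A)[m, k] * op(B)[k, n], where the trans
    // flags switch the indexing; accumulation happens in CompType and is
    // cast to CType on the way out.
    template <typename AType, typename BType, typename CType, typename CompType>
    void gemm_naive_ref(const AType* A, const BType* B, CType* C, size_t M,
                        size_t N, size_t K, size_t LDA, size_t LDB, size_t LDC,
                        bool transA, bool transB) {
        for (size_t m = 0; m < M; ++m) {
            for (size_t n = 0; n < N; ++n) {
                CompType res = static_cast<CompType>(0);
                for (size_t k = 0; k < K; ++k) {
                    AType av = transA ? A[k * LDA + m] : A[m * LDA + k];
                    BType bv = transB ? B[n * LDB + k] : B[k * LDB + n];
                    res += av * bv;
                }
                C[m * LDC + n] = static_cast<CType>(res);
            }
        }
    }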
dnn/src/cuda/matrix_mul/naive.cuh
@@ -15,8 +15,9 @@
 namespace megdnn {
 namespace cuda {
 
-void exec_gemm_int8_naive(const int8_t* A, const int8_t* B, int32_t* C,
-                          size_t m, size_t n, size_t k, size_t ldA, size_t ldB,
+template <typename AType, typename BType, typename CType, typename CompType>
+void exec_gemm_naive(const AType* A, const BType* B, CType* C, size_t m,
+                     size_t n, size_t k, size_t ldA, size_t ldB,
                      size_t ldC, bool transA, bool transB,
                      cudaStream_t stream);
 
 }  // namespace cuda
dnn/test/cuda/matrix_mul.cpp
@@ -185,6 +185,46 @@ TEST_F(CUDA, MATRIX_MUL_INT8x8x32_NAIVE) {
     }
 }
 
+TEST_F(CUDA, MATRIX_MUL_FLOAT_NAIVE) {
+    Checker<MatrixMul> checker(handle_cuda());
+    checker.set_before_exec_callback(AlgoChecker<MatrixMulForward>("NAIVE"));
+    using Param = MatrixMul::Param;
+    size_t m = 12, n = 16, k = 20;
+    std::vector<DType> dtype_array;
+    dtype_array.push_back(dtype::Float32());
+    dtype_array.push_back(dtype::Float16());
+    for (DType dtype : dtype_array) {
+        for (unsigned mask = 0; mask < 4; ++mask) {
+            Param param;
+            param.transposeA = mask & 1;
+            param.transposeB = mask & 2;
+            DType stype = dtype;
+            TensorShape A, B;
+            if (param.transposeA)
+                A = TensorShape{k, m};
+            else
+                A = TensorShape{m, k};
+            if (param.transposeB)
+                B = TensorShape{n, k};
+            else
+                B = TensorShape{k, n};
+            if (dtype == dtype::Float16()) {
+                param.compute_mode = param::MatrixMul::ComputeMode::FLOAT32;
+            }
+            checker.set_param(param)
+                    .set_dtype(0, stype)
+                    .set_dtype(1, stype)
+                    .set_dtype(2, dtype)
+                    .set_epsilon(dtype == dtype::Float16() ? 5e-2 : 5e-3)
+                    .execs({A, B, {}});
+        }
+    }
+}
+
 TEST_F(CUDA, MATRIX_MUL) {
     if (cuda::current_device_prop().major < 6) {
         printf("Skip CUDA.MATRIX_MUL test as current device doesn't support\n");
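
Note (editor's sketch, not part of the commit): the test's mask loop is a compact way to cover all four transpose combinations — bit 0 of mask drives transposeA and bit 1 drives transposeB. A standalone illustration:

    #include <cstdio>

    // Enumerates the four (transposeA, transposeB) pairs exactly as the
    // MATRIX_MUL_FLOAT_NAIVE test above does.
    int main() {
        for (unsigned mask = 0; mask < 4; ++mask) {
            bool transposeA = (mask & 1) != 0;
            bool transposeB = (mask & 2) != 0;
            std::printf("mask=%u -> transposeA=%d transposeB=%d\n", mask,
                        (int)transposeA, (int)transposeB);
        }
        return 0;
    }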