Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
MegEngine 天元
MegEngine
提交
58b682ca
MegEngine
项目概览
MegEngine 天元
/
MegEngine
1 年多 前同步成功
通知
410
Star
4707
Fork
583
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
MegEngine
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
提交
58b682ca
编写于
4月 25, 2022
作者:
M
Megvii Engine Team
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
feat(dnn/cuda): add naive bmm
GitOrigin-RevId: 4ba4b22e40368dd0918d02f9d01c24dd0a815640
上级
eddb0aba
变更
8
隐藏空白更改
内联
并排
Showing
8 changed files
with
268 additions
and
13 deletions
+268
-13
dnn/src/cuda/batched_matrix_mul/algo.cpp
dnn/src/cuda/batched_matrix_mul/algo.cpp
+1
-0
dnn/src/cuda/batched_matrix_mul/algo.h
dnn/src/cuda/batched_matrix_mul/algo.h
+23
-0
dnn/src/cuda/batched_matrix_mul/naive.cpp
dnn/src/cuda/batched_matrix_mul/naive.cpp
+81
-0
dnn/src/cuda/batched_matrix_mul/naive.cu
dnn/src/cuda/batched_matrix_mul/naive.cu
+72
-0
dnn/src/cuda/batched_matrix_mul/naive.cuh
dnn/src/cuda/batched_matrix_mul/naive.cuh
+26
-0
dnn/src/cuda/batched_matrix_mul/opr_impl.h
dnn/src/cuda/batched_matrix_mul/opr_impl.h
+1
-0
dnn/test/common/matrix_mul.cpp
dnn/test/common/matrix_mul.cpp
+8
-13
dnn/test/cuda/batched_matrix_mul.cpp
dnn/test/cuda/batched_matrix_mul.cpp
+56
-0
未找到文件。
dnn/src/cuda/batched_matrix_mul/algo.cpp
浏览文件 @
58b682ca
...
...
@@ -45,6 +45,7 @@ BatchedMatrixMulForwardImpl::AlgoPack::AlgoPack() {
#endif
all_algos
.
push_back
(
&
int8x8x32
);
all_algos
.
push_back
(
&
brute_force
);
all_algos
.
push_back
(
&
naive_bmm
);
for
(
auto
&&
algo
:
all_algos
)
{
m_all_algos_map
.
emplace
(
algo
->
info
().
desc
,
algo
);
...
...
dnn/src/cuda/batched_matrix_mul/algo.h
浏览文件 @
58b682ca
...
...
@@ -24,6 +24,7 @@ public:
CUDA_CUBLAS
,
CUDA_CUBLASLT
,
CUDA_INT8X8X32
,
CUDA_NAIVE_BMM
,
};
using
Mapper
=
std
::
unordered_map
<
AlgorithmDesc
,
AlgoBase
*>
;
...
...
@@ -94,6 +95,27 @@ public:
std
::
vector
<
SearchItem
>
get_subopr_list
(
const
TensorLayoutArray
&
layouts
,
const
OperatorBase
*
opr
)
const
override
;
};
// Reference (unoptimized) batched-matmul algorithm. Dispatches a naive CUDA
// kernel; useful as a correctness baseline for the cuBLAS-based algorithms.
class BatchedMatrixMulForwardImpl::AlgoNaive final
        : public BatchedMatrixMulForwardImpl::AlgoBase {
    using Param = MatrixMulForward::Param;

private:
    // NOTE(review): declared but no definition is visible in this commit —
    // confirm it is defined elsewhere or remove the declaration.
    WorkspaceBundle get_workspace_bundle();

public:
    // Accepts only dtype/format combinations the naive kernel supports.
    bool is_available(const SizeArgs& args) const override;
    // The naive kernel works fully in-place on A/B/C; no workspace needed.
    size_t get_workspace_in_bytes(const SizeArgs& /*args*/) const override {
        return 0;
    };
    void exec(const ExecArgs& args) const final;
    // Deterministic reference implementation, hence REPRODUCIBLE | NAIVE.
    AlgoAttribute attribute() const override {
        return AlgoAttribute::REPRODUCIBLE | AlgoAttribute::NAIVE;
    }
    const char* name() const override { return "NAIVE_BMM"; }
    MEGDNN_DECL_ALGO_TYPE(CUDA_NAIVE_BMM)
};
class
BatchedMatrixMulForwardImpl
::
AlgoCublas
final
:
public
BatchedMatrixMulForwardImpl
::
AlgoBase
{
public:
...
...
@@ -148,6 +170,7 @@ public:
AlgoInt8x8x32
int8x8x32
;
std
::
vector
<
AlgoBase
*>
all_algos
;
AlgoBruteForce
brute_force
;
AlgoNaive
naive_bmm
;
const
AlgoBase
::
Mapper
&
all_algos_map
()
const
{
return
m_all_algos_map
;
}
};
...
...
dnn/src/cuda/batched_matrix_mul/naive.cpp
0 → 100644
浏览文件 @
58b682ca
/**
* \file dnn/src/cuda/batched_matrix_mul/naive.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*/
#include "src/cuda/batched_matrix_mul/naive.cuh"
#include <cuda.h>
#include "src/cuda/batched_matrix_mul/algo.h"
#include "src/cuda/utils.h"
using
namespace
megdnn
;
using
namespace
cuda
;
#include "midout.h"
MIDOUT_DECL
(
megdnn_naive_matmul
)
/**
 * \brief Whether the naive batched-matmul kernel can handle \p args.
 *
 * The kernel supports only Float32 and Float16 with the DEFAULT matrix
 * format, and exec() only dispatches combinations where A, B and C share the
 * same dtype ((f32,f32)->f32 and (f16,f16)->f16, the latter optionally
 * computed in f32). The original check allowed mixed in/out dtypes (e.g.
 * A/B f32 with C f16), which would report the algorithm as available and
 * then hit megdnn_throw inside exec(); we now require layout_c's dtype to
 * match layout_a's as well.
 */
bool BatchedMatrixMulForwardImpl::AlgoNaive::is_available(
        const SizeArgs& args) const {
    auto&& layout_a = args.layout_a;
    auto&& layout_b = args.layout_b;
    auto&& layout_c = args.layout_c;
    return layout_a.dtype.enumv() == layout_b.dtype.enumv() &&
           layout_a.dtype.enumv() == layout_c.dtype.enumv() &&
           (layout_a.dtype.enumv() == DTypeEnum::Float32 ||
            layout_a.dtype.enumv() == DTypeEnum::Float16) &&
           args.opr->param().format == param::MatrixMul::Format::DEFAULT;
}
// Run the naive batched matmul: extract geometry from the tensor layouts,
// then macro-dispatch on (input dtype, output dtype, compute mode) to the
// templated CUDA kernel launcher exec_bgemm_naive<>.
void BatchedMatrixMulForwardImpl::AlgoNaive::exec(const ExecArgs& args) const {
    auto&& param = args.opr->param();
    // C is (Batch, m, n); k comes from A's non-batch dims, picking the axis
    // that survives transposition.
    auto Batch = args.tensor_c.layout.shape[0];
    auto m = args.tensor_c.layout.shape[1], n = args.tensor_c.layout.shape[2],
         k = args.tensor_a.layout.shape[param.transposeA ? 1 : 2];
    // Leading dimensions = row strides of each per-batch matrix. The kernel
    // presumably assumes the batch stride equals rows * LD for each operand
    // (it computes per-batch base pointers that way) — confirm for layouts
    // with a padded/broadcast batch stride.
    auto LDA = args.tensor_a.layout.stride[1], LDB = args.tensor_b.layout.stride[1],
         LDC = args.tensor_c.layout.stride[1];
    auto&& handle = concrete_handle(args.opr->handle());
    using ComputeMode = Param::ComputeMode;
// Try one (in_dt, out_dt, cmode) combination; returns from exec() on a match.
// NOTE(review): the commas inside midout_iv(#in_dt #out_dt #in_ct, #out_ct,
// #comp_ct, #cmode) look like they were meant to be string concatenation —
// confirm against the midout API.
#define DISPATCH_CMODE(in_dt, out_dt, in_ct, out_ct, comp_ct, cmode)               \
    MIDOUT_BEGIN(                                                                  \
            megdnn_naive_matmul,                                                   \
            midout_iv(#in_dt #out_dt #in_ct, #out_ct, #comp_ct, #cmode)) {         \
        do {                                                                       \
            using namespace dtype;                                                 \
            if (args.tensor_a.layout.dtype.enumv() == DTypeTrait<in_dt>::enumv &&  \
                args.tensor_c.layout.dtype.enumv() == DTypeTrait<out_dt>::enumv && \
                param.compute_mode == cmode) {                                     \
                in_ct* A = args.tensor_a.compatible_ptr<in_ct>();                  \
                in_ct* B = args.tensor_b.compatible_ptr<in_ct>();                  \
                out_ct* C = args.tensor_c.compatible_ptr<out_ct>();                \
                exec_bgemm_naive<in_ct, in_ct, out_ct, comp_ct>(                   \
                        A, B, C, Batch, m, n, k, LDA, LDB, LDC, param.transposeA,  \
                        param.transposeB, cuda_stream(handle));                    \
                return;                                                           \
            }                                                                      \
        } while (0);                                                               \
    }                                                                              \
    MIDOUT_END();
// Shorthand for the default compute mode.
#define DISPATCH(in_dt, out_dt, in_ct, out_ct, comp_ct) \
    DISPATCH_CMODE(in_dt, out_dt, in_ct, out_ct, comp_ct, ComputeMode::DEFAULT)
    DISPATCH(Float32, Float32, dt_float32, dt_float32, dt_float32);
    DISPATCH(Float16, Float16, dt_float16, dt_float16, dt_float16);
    // f16 in/out with accumulation in f32; compiled only when fp16 is enabled.
    DNN_INC_FLOAT16(DISPATCH_CMODE(
            Float16, Float16, dt_float16, dt_float16, dt_float32,
            ComputeMode::FLOAT32));
#undef DISPATCH_CMODE
#undef DISPATCH
    // No dispatch matched: report the unsupported dtype/cmode combination.
    megdnn_throw(ssprintf(
            "unsupported Matmul(%s, %s) -> %s with cmode = %d",
            args.layout_a.dtype.name(), args.layout_b.dtype.name(),
            args.layout_c.dtype.name(), static_cast<int>(param.compute_mode)));
}
// vim: syntax=cpp.doxygen
dnn/src/cuda/batched_matrix_mul/naive.cu
0 → 100644
浏览文件 @
58b682ca
/**
* \file dnn/src/cuda/batched_matrix_mul/naive.cu
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*/
#include <cuda.h>
#include "src/cuda/matrix_mul/naive.cuh"
#include "src/cuda/utils.cuh"
namespace {
/**
 * \brief Naive batched GEMM kernel: C[b] = A[b] * B[b] for each batch b.
 *
 * One thread block handles one batch at a time (blocks stride over batches
 * via gridDim.x); within a block, threads stride over the columns of each
 * output row via blockDim.x. Each (m, n) output element is an inner product
 * over K accumulated in CompType.
 *
 * LDA/LDB/LDC are the row strides (leading dimensions) of the per-batch
 * matrices; the per-batch base offset is rows * LD, i.e. batches are assumed
 * to be densely packed row-blocks.
 *
 * Fix vs. original: the batch index was `int`, producing a signed/unsigned
 * comparison against `size_t Batch` (and truncation for batches beyond
 * INT_MAX); it is now `size_t`.
 */
template <typename AType, typename BType, typename CType, typename CompType>
__global__ void do_exec(
        const AType* A, const BType* B, CType* C, size_t Batch, size_t M, size_t N,
        size_t K, size_t LDA, size_t LDB, size_t LDC, bool transA, bool transB) {
    for (size_t bid = blockIdx.x; bid < Batch; bid += gridDim.x) {
        // Base pointers of this batch's matrices; a transposed operand has
        // its logical dims swapped, so the packed row count differs.
        const AType* A_r = A + (transA ? bid * K * LDA : bid * M * LDA);
        const BType* B_r = B + (transB ? bid * N * LDB : bid * K * LDB);
        CType* C_r = C + bid * M * LDC;
        for (size_t m = 0; m < M; ++m) {
            size_t n = threadIdx.x;
            for (; n < N; n += blockDim.x) {
                CompType res = static_cast<CompType>(0);
                for (size_t k = 0; k < K; ++k) {
                    AType av = transA ? A_r[k * LDA + m] : A_r[m * LDA + k];
                    BType bv = transB ? B_r[n * LDB + k] : B_r[k * LDB + n];
                    res += av * bv;
                }
                C_r[m * LDC + n] = res;
            }
        }
    }
}
}  // namespace
namespace
megdnn
{
namespace
cuda
{
template
<
typename
AType
,
typename
BType
,
typename
CType
,
typename
CompType
>
void
exec_bgemm_naive
(
const
AType
*
A
,
const
BType
*
B
,
CType
*
C
,
size_t
Batch
,
size_t
M
,
size_t
N
,
size_t
K
,
size_t
LDA
,
size_t
LDB
,
size_t
LDC
,
bool
transA
,
bool
transB
,
cudaStream_t
stream
)
{
do_exec
<
AType
,
BType
,
CType
,
CompType
><<<
Batch
,
128
,
0
,
stream
>>>
(
A
,
B
,
C
,
Batch
,
M
,
N
,
K
,
LDA
,
LDB
,
LDC
,
transA
,
transB
);
}
#define INST(in_ct, out_ct, comp_ct) \
template void exec_bgemm_naive< \
typename in_ct, typename in_ct, typename out_ct, typename comp_ct>( \
const in_ct* A, const in_ct* B, out_ct* C, size_t Batch, size_t M, \
size_t N, size_t K, size_t LDA, size_t LDB, size_t LDC, bool transA, \
bool transB, cudaStream_t stream);
INST
(
megdnn
::
dt_float32
,
megdnn
::
dt_float32
,
megdnn
::
dt_float32
)
INST
(
megdnn
::
dt_float16
,
megdnn
::
dt_float16
,
megdnn
::
dt_float16
)
INST
(
megdnn
::
dt_float16
,
megdnn
::
dt_float16
,
megdnn
::
dt_float32
)
#undef cb
#undef INST
}
// namespace cuda
}
// namespace megdnn
// vim: syntax=cpp.doxygen
dnn/src/cuda/batched_matrix_mul/naive.cuh
0 → 100644
浏览文件 @
58b682ca
/**
* \file dnn/src/cuda/batched_matrix_mul/naive.cuh
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*/
#pragma once
#include "src/cuda/utils.cuh"
namespace megdnn {
namespace cuda {
/**
 * \brief Launch the naive batched GEMM kernel: C = A * B for each of
 *        \p Batch independent (m, n, k) GEMMs, asynchronously on \p stream.
 *
 * \param A,B,C   device pointers to the batched operands
 * \param Batch   number of batches
 * \param m,n,k   GEMM dimensions of each batch
 * \param ldA,ldB,ldC  leading dimensions (row strides) of A, B and C
 * \param transA,transB  whether A / B are stored transposed
 *
 * Defined in naive.cu; explicitly instantiated for f32 and f16 (with f16 or
 * f32 accumulation).
 */
template <typename AType, typename BType, typename CType, typename CompType>
void exec_bgemm_naive(
        const AType* A, const BType* B, CType* C, size_t Batch, size_t m, size_t n,
        size_t k, size_t ldA, size_t ldB, size_t ldC, bool transA, bool transB,
        cudaStream_t stream);
}  // namespace cuda
}  // namespace megdnn
// vim: syntax=cpp.doxygen
dnn/src/cuda/batched_matrix_mul/opr_impl.h
浏览文件 @
58b682ca
...
...
@@ -10,6 +10,7 @@ public:
BatchedMatrixMulForwardImpl
(
Handle
*
handle
)
:
BatchedMatrixMul
(
handle
)
{}
class
AlgoBase
;
class
AlgoNaive
;
class
AlgoBruteForce
;
class
AlgoCublas
;
#if CUDA_VERSION >= 10010
...
...
dnn/test/common/matrix_mul.cpp
浏览文件 @
58b682ca
...
...
@@ -266,19 +266,14 @@ void matrix_mul::check_matrix_mul(
checker
.
set_param
(
param
);
if
(
format
==
param
::
MatrixMul
::
Format
::
DEFAULT
)
{
if
(
batched
)
{
checker
.
execl
(
{
TensorLayout
{
{
arg
.
b
,
A0
,
A1
},
{
A_batch_stride
,
A_stride
,
1
},
A_dtype
},
TensorLayout
{
{
arg
.
b
,
B0
,
B1
},
{
B_batch_stride
,
B_stride
,
1
},
B_dtype
},
TensorLayout
{
{
arg
.
b
,
m
,
n
},
{
C_batch_stride
,
C_stride
,
1
},
C_dtype
}});
auto
a_layout
=
TensorLayout
{
{
arg
.
b
,
A0
,
A1
},
{
A_batch_stride
,
A_stride
,
1
},
A_dtype
};
auto
b_layout
=
TensorLayout
{
{
arg
.
b
,
B0
,
B1
},
{
B_batch_stride
,
B_stride
,
1
},
B_dtype
};
auto
c_layout
=
TensorLayout
{
{
arg
.
b
,
m
,
n
},
{
C_batch_stride
,
C_stride
,
1
},
C_dtype
};
checker
.
execl
({
a_layout
,
b_layout
,
c_layout
});
}
else
{
checker
.
execl
(
{
TensorLayout
{{
A0
,
A1
},
{
A_stride
,
1
},
A_dtype
},
...
...
dnn/test/cuda/batched_matrix_mul.cpp
浏览文件 @
58b682ca
...
...
@@ -105,6 +105,62 @@ TEST_F(CUDA, BATCHED_MATRIX_MUL_F32_BRUTE_FORCE_PART4) {
matrix_mul
::
get_batched_matmul_args_mask
(
3
));
}
// Correctness tests for the "NAIVE_BMM" algorithm on Float32 inputs.
// The generated testcase list is split into four parts via
// get_batched_matmul_args_mask(0..3) — presumably to bound each test's
// runtime; confirm the mask semantics against matrix_mul test helpers.
TEST_F(CUDA, BATCHED_MATRIX_MUL_F32_NAIVE_PART0) {
    matrix_mul::check_batched_matrix_mul(
            // input dtypes; {} leaves the output dtype to the default
            dtype::Float32{}, dtype::Float32{}, {}, handle_cuda(),
            // pin the algorithm so the cuBLAS paths are not selected
            ExecutionPolicyAlgoName{"NAIVE_BMM"}, 1e-5,
            matrix_mul::get_batched_matmul_args_mask(0));
}
TEST_F(CUDA, BATCHED_MATRIX_MUL_F32_NAIVE_PART1) {
    matrix_mul::check_batched_matrix_mul(
            dtype::Float32{}, dtype::Float32{}, {}, handle_cuda(),
            ExecutionPolicyAlgoName{"NAIVE_BMM"}, 1e-5,
            matrix_mul::get_batched_matmul_args_mask(1));
}
TEST_F(CUDA, BATCHED_MATRIX_MUL_F32_NAIVE_PART2) {
    matrix_mul::check_batched_matrix_mul(
            dtype::Float32{}, dtype::Float32{}, {}, handle_cuda(),
            ExecutionPolicyAlgoName{"NAIVE_BMM"}, 1e-5,
            matrix_mul::get_batched_matmul_args_mask(2));
}
TEST_F(CUDA, BATCHED_MATRIX_MUL_F32_NAIVE_PART3) {
    matrix_mul::check_batched_matrix_mul(
            dtype::Float32{}, dtype::Float32{}, {}, handle_cuda(),
            ExecutionPolicyAlgoName{"NAIVE_BMM"}, 1e-5,
            matrix_mul::get_batched_matmul_args_mask(3));
}
// Correctness tests for the "NAIVE_BMM" algorithm on Float16 inputs,
// split into four parts like the F32 variants.
// NOTE(review): eps = 1e-5 is very tight for fp16 arithmetic (the f16
// DEFAULT compute mode accumulates in half precision); other F16 matmul
// tests typically use a much larger tolerance — confirm these pass reliably.
TEST_F(CUDA, BATCHED_MATRIX_MUL_F16_NAIVE_PART0) {
    matrix_mul::check_batched_matrix_mul(
            dtype::Float16{}, dtype::Float16{}, {}, handle_cuda(),
            ExecutionPolicyAlgoName{"NAIVE_BMM"}, 1e-5,
            matrix_mul::get_batched_matmul_args_mask(0));
}
TEST_F(CUDA, BATCHED_MATRIX_MUL_F16_NAIVE_PART1) {
    matrix_mul::check_batched_matrix_mul(
            dtype::Float16{}, dtype::Float16{}, {}, handle_cuda(),
            ExecutionPolicyAlgoName{"NAIVE_BMM"}, 1e-5,
            matrix_mul::get_batched_matmul_args_mask(1));
}
TEST_F(CUDA, BATCHED_MATRIX_MUL_F16_NAIVE_PART2) {
    matrix_mul::check_batched_matrix_mul(
            dtype::Float16{}, dtype::Float16{}, {}, handle_cuda(),
            ExecutionPolicyAlgoName{"NAIVE_BMM"}, 1e-5,
            matrix_mul::get_batched_matmul_args_mask(2));
}
TEST_F(CUDA, BATCHED_MATRIX_MUL_F16_NAIVE_PART3) {
    matrix_mul::check_batched_matrix_mul(
            dtype::Float16{}, dtype::Float16{}, {}, handle_cuda(),
            ExecutionPolicyAlgoName{"NAIVE_BMM"}, 1e-5,
            matrix_mul::get_batched_matmul_args_mask(3));
}
TEST_F
(
CUDA
,
BATCHED_MATRIX_MUL_F16_PART1
)
{
require_compute_capability
(
6
,
0
);
matrix_mul
::
check_batched_matrix_mul
(
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录