Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
MegEngine 天元
MegEngine
提交
4c0bff1d
MegEngine
项目概览
MegEngine 天元
/
MegEngine
1 年多 前同步成功
通知
403
Star
4705
Fork
582
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
MegEngine
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
提交
4c0bff1d
编写于
3月 26, 2022
作者:
M
Megvii Engine Team
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
refactor(megdnn): refactor TEGRA_X1/X2 macro
GitOrigin-RevId: 1aa78712c6413ed770996893bc2409524da13758
上级
758549b9
变更
10
隐藏空白更改
内联
并排
Showing
10 changed file
with
78 addition
and
83 deletion
+78
-83
dnn/src/cuda/utils.cuh
dnn/src/cuda/utils.cuh
+7
-0
dnn/test/cuda/convolution.cpp
dnn/test/cuda/convolution.cpp
+19
-17
dnn/test/cuda/flip.cpp
dnn/test/cuda/flip.cpp
+2
-0
dnn/test/cuda/images2neibs.cpp
dnn/test/cuda/images2neibs.cpp
+2
-0
dnn/test/cuda/indexing_one_hot.cpp
dnn/test/cuda/indexing_one_hot.cpp
+6
-4
dnn/test/cuda/matrix_mul.cpp
dnn/test/cuda/matrix_mul.cpp
+7
-27
dnn/test/cuda/padding.cpp
dnn/test/cuda/padding.cpp
+2
-4
dnn/test/cuda/rotate.cpp
dnn/test/cuda/rotate.cpp
+2
-0
dnn/test/cuda/sliding_window_transpose.cpp
dnn/test/cuda/sliding_window_transpose.cpp
+12
-12
dnn/test/cuda/type_cvt.cpp
dnn/test/cuda/type_cvt.cpp
+19
-19
未找到文件。
dnn/src/cuda/utils.cuh
浏览文件 @
4c0bff1d
...
...
@@ -83,6 +83,12 @@
cuda_check(cudaGetLastError()); \
} while (0)
#if MEGDNN_TEGRA_X2
//! TX2 only has 256 CUDA cores
#define NR_THREADS 256
#define NR_THREADS_X 32
#define NR_THREADS_Y 8
#else
#if MEGDNN_THREADS_512
#define NR_THREADS 512
#define NR_THREADS_X 32
...
...
@@ -92,6 +98,7 @@
#define NR_THREADS_X 32
#define NR_THREADS_Y 32
#endif
#endif
//! Evaluates (x + y - 1) / y, i.e. x / y rounded up when both operands are
//! positive integers. Each argument is parenthesised, so expressions may be
//! passed; note that y is expanded twice.
#define DIVUP(x, y) (((x) + (y)-1) / (y))
//! Scales DIVUP(x, y) back by y, giving x rounded up to a multiple of y for
//! positive integers. y is expanded more than once, so avoid arguments with
//! side effects.
#define ROUNDUP(x, y) (DIVUP(x, y) * (y))
...
...
dnn/test/cuda/convolution.cpp
浏览文件 @
4c0bff1d
...
...
@@ -22,6 +22,8 @@
#include "test/cuda/fixture.h"
#include "test/cuda/utils.h"
#include <cudnn.h>
//! Turns the argument tokens into a string literal exactly as written
//! (the # operator suppresses macro expansion of x).
#define V1(x) #x
//! Stringifies x after macro expansion: the extra level of indirection lets
//! x be expanded before V1 applies the # operator.
#define V(x) V1(x)
#define CUDNN_VERSION_STRING \
...
...
@@ -161,23 +163,6 @@ TEST_F(CUDA, CONVOLUTION_1X1_FORWARD) {
}
}
// Benchmarks ConvolutionForward over every case returned by get_1x1_args().
// Tensors 0 and 1 are set to Float32 and are both filled from one shared
// NormalRNG; each case is executed with its own param, src shape and filter
// shape, with an empty third shape.
TEST_F(CUDA, BENCHMARK_CONVOLUTION_1X1_FORWARD) {
    using namespace convolution;

    std::vector<TestArg> cases = get_1x1_args();
    Benchmarker<ConvolutionForward> bencher(handle_cuda());
    NormalRNG normal_rng;

    for (auto&& cur : cases) {
        // NOTE(review): inv_sqrt and scaled_rng are built for every case but are
        // never passed to the benchmarker (normal_rng is used for both inputs).
        // Kept as-is to preserve behaviour -- confirm whether they can be dropped.
        float inv_sqrt = 1.0f / sqrt(cur.filter[1] * cur.filter[2] * cur.filter[3]);
        UniformFloatRNG scaled_rng(inv_sqrt, 2 * inv_sqrt);

        bencher.set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_rng(0, &normal_rng)
                .set_rng(1, &normal_rng)
                .set_param(cur.param)
                .execs({cur.src, cur.filter, {}});
    }
}
TEST_F
(
CUDA
,
CONVOLUTION_BACKWARD_DATA
)
{
using
namespace
convolution
;
std
::
vector
<
TestArg
>
args
=
get_args_cuda_conv_bwd_data
();
...
...
@@ -767,6 +752,23 @@ TEST_F(CUDA, CONVOLUTION_BACKWARD_DEPTHWISE_LARGE_FILTER) {
}
#if MEGDNN_WITH_BENCHMARK
// Benchmarks ConvolutionForward over every case returned by get_1x1_args().
// Tensors 0 and 1 are set to Float32 and are both filled from one shared
// NormalRNG; each case is executed with its own param, src shape and filter
// shape, with an empty third shape.
TEST_F(CUDA, BENCHMARK_CONVOLUTION_1X1_FORWARD) {
    using namespace convolution;

    std::vector<TestArg> cases = get_1x1_args();
    Benchmarker<ConvolutionForward> bencher(handle_cuda());
    NormalRNG normal_rng;

    for (auto&& cur : cases) {
        // NOTE(review): inv_sqrt and scaled_rng are built for every case but are
        // never passed to the benchmarker (normal_rng is used for both inputs).
        // Kept as-is to preserve behaviour -- confirm whether they can be dropped.
        float inv_sqrt = 1.0f / sqrt(cur.filter[1] * cur.filter[2] * cur.filter[3]);
        UniformFloatRNG scaled_rng(inv_sqrt, 2 * inv_sqrt);

        bencher.set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_rng(0, &normal_rng)
                .set_rng(1, &normal_rng)
                .set_param(cur.param)
                .execs({cur.src, cur.filter, {}});
    }
}
TEST_F
(
CUDA
,
CONV_FWD_BENCHMARK
)
{
auto
run
=
[
&
](
size_t
N
,
size_t
OC
,
size_t
IC
,
size_t
IH
,
size_t
IW
,
size_t
SH
=
1
,
size_t
SW
=
1
,
size_t
FH
=
1
,
size_t
FW
=
1
,
size_t
PH
=
0
,
...
...
dnn/test/cuda/flip.cpp
浏览文件 @
4c0bff1d
...
...
@@ -44,6 +44,7 @@ TEST_F(CUDA, FLIP) {
}
}
#if MEGDNN_WITH_BENCHMARK
TEST_F
(
CUDA
,
FLIP_BENCHMARK
)
{
auto
run
=
[
&
](
const
TensorShapeArray
&
shapes
)
{
Benchmarker
<
Flip
>
benchmarker
(
handle_cuda
());
...
...
@@ -75,6 +76,7 @@ TEST_F(CUDA, FLIP_BENCHMARK) {
run
(
shapes
);
}
#endif
}
// namespace test
}
// namespace megdnn
...
...
dnn/test/cuda/images2neibs.cpp
浏览文件 @
4c0bff1d
...
...
@@ -14,6 +14,7 @@
#include "test/common/images2neibs.h"
#include "test/common/rng.h"
#include "test/cuda/benchmark.h"
#include "test/cuda/utils.h"
namespace
megdnn
{
namespace
test
{
...
...
@@ -44,6 +45,7 @@ TEST_F(CUDA, BENCHMARK_IMAGES2NEIBS_FORWARD) {
#endif
TEST_F
(
CUDA
,
IMAGES2NEIBS_BACKWARD
)
{
require_compute_capability
(
6
,
1
);
UniformFloatRNG
rng
(
0
,
1
);
auto
args
=
images2neibs
::
get_args
();
for
(
auto
&&
arg
:
args
)
{
...
...
dnn/test/cuda/indexing_one_hot.cpp
浏览文件 @
4c0bff1d
...
...
@@ -39,6 +39,11 @@ TEST_F(CUDA_ERROR_INFO, INDEXING_ONE_HOT) {
ASSERT_TRUE
(
failed
);
}
// Delegates to run_indexing_set_one_hot_test, passing the handle obtained
// from handle_cuda().
TEST_F(CUDA, INDEXING_SET_ONE_HOT) {
    run_indexing_set_one_hot_test(handle_cuda());
}
#if MEGDNN_WITH_BENCHMARK
TEST_F
(
CUDA
,
BENCHMARK_INDEXING_ONE_HOT
)
{
Benchmarker
<
IndexingOneHot
>
bench
{
handle_cuda
()};
bench
.
set_times
(
1
);
...
...
@@ -53,9 +58,6 @@ TEST_F(CUDA, BENCHMARK_INDEXING_ONE_HOT) {
printf
(
"bandwidth: %.2fGiB/s
\n
"
,
A
*
B
*
D
*
sizeof
(
float
)
/
1024.0
/
1024
/
1024
/
time
);
}
// Delegates to run_indexing_set_one_hot_test, passing the handle obtained
// from handle_cuda().
TEST_F(CUDA, INDEXING_SET_ONE_HOT) {
    run_indexing_set_one_hot_test(handle_cuda());
}
#endif
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
dnn/test/cuda/matrix_mul.cpp
浏览文件 @
4c0bff1d
...
...
@@ -14,13 +14,12 @@
#include "test/common/benchmarker.h"
#include "test/common/checker.h"
#include "test/common/matrix_mul.h"
#include "test/cuda/utils.h"
#if defined(cuda_check)
#undef cuda_check
#endif
#include "test/cuda/utils.h"
#include <cuda.h>
#include "src/cuda/utils.h"
namespace
megdnn
{
namespace
test
{
...
...
@@ -47,13 +46,7 @@ TEST_F(CUDA, MATRIX_MUL_QUANTIZED4x4x32_EXCEPTION) {
}
TEST_F
(
CUDA
,
MATRIX_MUL_QUANTIZED4x4x32
)
{
if
(
cuda
::
current_device_prop
().
major
<
7
||
(
cuda
::
current_device_prop
().
major
==
7
&&
cuda
::
current_device_prop
().
minor
<
5
))
{
printf
(
"Skip CUDA.MATRIX_MUL_QUANTIZED4x4x32 test as current device "
"doesn't support
\n
"
);
return
;
}
require_compute_capability
(
7
,
5
);
Checker
<
MatrixMul
>
checker
(
handle_cuda
(),
false
);
using
Param
=
MatrixMul
::
Param
;
Param
param
;
...
...
@@ -65,21 +58,15 @@ TEST_F(CUDA, MATRIX_MUL_QUANTIZED4x4x32) {
checker
.
exec
({{
256
,
256
},
{
256
,
256
},
{
256
,
256
}});
auto
args
=
matrix_mul
::
get_matmul_args
();
for
(
auto
arg
:
args
)
{
size_t
m
=
DIVUP
(
arg
.
m
,
8
)
*
8
,
n
=
DIVUP
(
arg
.
n
,
8
)
*
8
,
k
=
DIVUP
(
arg
.
k
,
32
)
*
32
;
size_t
m
=
(
arg
.
m
+
7
)
/
8
*
8
,
n
=
(
arg
.
n
+
7
)
/
8
*
8
,
k
=
(
arg
.
k
+
31
)
/
32
*
32
;
checker
.
exec
({{
m
,
k
},
{
n
,
k
},
{
m
,
n
}});
}
}
#if MEGDNN_WITH_BENCHMARK
TEST_F
(
CUDA
,
BENCHMARK_MATRIX_MUL_QUANTIZED4x4x32
)
{
if
(
cuda
::
current_device_prop
().
major
<
7
||
(
cuda
::
current_device_prop
().
major
==
7
&&
cuda
::
current_device_prop
().
minor
<
5
))
{
printf
(
"Skip CUDA.BENCHMARK_MATRIX_MUL_QUANTIZED4x4x32 test as current "
"device doesn't support
\n
"
);
return
;
}
require_compute_capability
(
7
,
5
);
Benchmarker
<
MatrixMul
>
bencher
(
handle_cuda
());
using
Param
=
MatrixMul
::
Param
;
Param
param
;
...
...
@@ -102,14 +89,7 @@ TEST_F(CUDA, BENCHMARK_MATRIX_MUL_QUANTIZED4x4x32) {
}
TEST_F
(
CUDA
,
PEAK_BENCHMARK_MATRIX_MUL_QUANTIZED4x4x32
)
{
if
(
cuda
::
current_device_prop
().
major
<
7
||
(
cuda
::
current_device_prop
().
major
==
7
&&
cuda
::
current_device_prop
().
minor
<
5
))
{
printf
(
"Skip CUDA.PEAK_BENCHMARK_MATRIX_MUL_QUANTIZED4x4x32 test as "
"current "
"device doesn't support
\n
"
);
return
;
}
require_compute_capability
(
7
,
5
);
Benchmarker
<
MatrixMul
>
bencher
(
handle_cuda
());
using
Param
=
MatrixMul
::
Param
;
Param
param
;
...
...
dnn/test/cuda/padding.cpp
浏览文件 @
4c0bff1d
...
...
@@ -188,8 +188,7 @@ TEST_F(CUDA, PADDING_REPLICATE2) {
6
,
7
,
7
,
8
,
9
,
9
,
9
,
9
})});
}
// #if MEGDNN_WITH_BENCHMARK
#if MEGDNN_WITH_BENCHMARK
TEST_F
(
CUDA
,
BENCHMARK_PADDING_CONSTANT
)
{
using
Param
=
Padding
::
Param
;
...
...
@@ -240,5 +239,4 @@ TEST_F(CUDA, BENCHMARK_PADDING_CONSTANT) {
run
(
shapes
,
param
);
}
}
// #endif
\ No newline at end of file
#endif
dnn/test/cuda/rotate.cpp
浏览文件 @
4c0bff1d
...
...
@@ -40,6 +40,7 @@ TEST_F(CUDA, ROTATE) {
}
}
#if MEGDNN_WITH_BENCHMARK
TEST_F
(
CUDA
,
BENCHMARK_ROTATE
)
{
auto
run
=
[
&
](
const
TensorShapeArray
&
shapes
)
{
Benchmarker
<
Rotate
>
benchmarker
(
handle_cuda
());
...
...
@@ -74,6 +75,7 @@ TEST_F(CUDA, BENCHMARK_ROTATE) {
run
(
shapes
);
}
#endif
}
// namespace rotate
}
// namespace test
...
...
dnn/test/cuda/sliding_window_transpose.cpp
浏览文件 @
4c0bff1d
...
...
@@ -42,18 +42,6 @@ TEST_F(CUDA, SLIDINGWINDOWTRANSPOSE_FORWARD) {
}
}
#if MEGDNN_WITH_BENCHMARK
// Benchmarks SlidingWindowTransposeForward for every entry returned by
// sliding_window_transpose::get_benchmark_args(). A fresh CUBenchmarker is
// constructed per entry, configured with that entry's param and a Float32
// dtype for tensor 0, then executed on {ishape, <empty shape>}.
TEST_F(CUDA, BENCHMARK_SLIDINGWINDOWTRANSPOSE_FORWARD) {
    auto bench_cases = sliding_window_transpose::get_benchmark_args();

    for (auto&& bench_case : bench_cases) {
        CUBenchmarker<SlidingWindowTransposeForward> cu_bench(handle_cuda());
        cu_bench.set_param(bench_case.param)
                .set_dtype(0, dtype::Float32())
                .exec(TensorShapeArray{bench_case.ishape, {}});
    }
}
#endif
TEST_F
(
CUDA
,
SLIDINGWINDOWTRANSPOSE_BACKWARD
)
{
UniformFloatRNG
rng
(
0
,
1
);
auto
args
=
sliding_window_transpose
::
get_args
();
...
...
@@ -78,6 +66,18 @@ TEST_F(CUDA, SLIDINGWINDOWTRANSPOSE_BACKWARD) {
}
}
#if MEGDNN_WITH_BENCHMARK
// Benchmarks SlidingWindowTransposeForward for every entry returned by
// sliding_window_transpose::get_benchmark_args(). A fresh CUBenchmarker is
// constructed per entry, configured with that entry's param and a Float32
// dtype for tensor 0, then executed on {ishape, <empty shape>}.
TEST_F(CUDA, BENCHMARK_SLIDINGWINDOWTRANSPOSE_FORWARD) {
    auto bench_cases = sliding_window_transpose::get_benchmark_args();

    for (auto&& bench_case : bench_cases) {
        CUBenchmarker<SlidingWindowTransposeForward> cu_bench(handle_cuda());
        cu_bench.set_param(bench_case.param)
                .set_dtype(0, dtype::Float32())
                .exec(TensorShapeArray{bench_case.ishape, {}});
    }
}
#endif
}
// namespace test
}
// namespace megdnn
...
...
dnn/test/cuda/type_cvt.cpp
浏览文件 @
4c0bff1d
...
...
@@ -33,25 +33,6 @@ TEST_F(CUDA, TYPE_CVT) {
}
}
// Benchmarks TypeCvt from a Float32 source of shape {16, 128, 128} whose
// innermost stride is 3 (strides {49152, 384, 3}) into a Float32 destination
// of the same shape, and prints a throughput figure for the source layout.
TEST_F(CUDA, BENCHMARK_TYPE_CVT_LAST_NOT_CONTIG) {
    const size_t kRuns = 3;

    // Layouts are taken by value: the destination copy has its strides reset
    // to contiguous before the run without touching the caller's object.
    auto bench_one = [&](TensorLayout in_layout, TensorLayout out_layout) {
        Benchmarker<TypeCvt> benchmarker(handle_cuda());
        benchmarker.set_times(kRuns);
        out_layout.init_contiguous_stride();
        auto elapsed = benchmarker.execl({in_layout, out_layout});
        // The numerator is 2 * element count * element size * runs; the result
        // is scaled by 1000 and divided by 1024^3.
        // NOTE(review): presumably `elapsed` is in milliseconds -- confirm.
        // The label text (including its spelling) is reproduced unchanged.
        printf("layout: %s bandwith: %f gbps/s\n", in_layout.to_string().c_str(),
               2 * in_layout.total_nr_elems() * in_layout.dtype.size() * kRuns /
                       elapsed * 1000 / (1024 * 1024 * 1024));
    };

    TensorLayout src({16, 128, 128}, {49152, 384, 3}, dtype::Float32());
    TensorLayout dst({16, 128, 128}, {16384, 128, 1}, dtype::Float32());
    bench_one(src, dst);
}
TEST_F
(
CUDA
,
QUANTIZED_TYPECVT
)
{
UniformIntRNG
int_rng
{
-
66
,
66
};
Checker
<
TypeCvt
>
checker
(
handle_cuda
());
...
...
@@ -162,6 +143,25 @@ TEST_F(CUDA, TYPE_CVT_BFLOAT16) {
}
#if MEGDNN_WITH_BENCHMARK
// Benchmarks TypeCvt from a Float32 source of shape {16, 128, 128} whose
// innermost stride is 3 (strides {49152, 384, 3}) into a Float32 destination
// of the same shape, and prints a throughput figure for the source layout.
TEST_F(CUDA, BENCHMARK_TYPE_CVT_LAST_NOT_CONTIG) {
    const size_t kRuns = 3;

    // Layouts are taken by value: the destination copy has its strides reset
    // to contiguous before the run without touching the caller's object.
    auto bench_one = [&](TensorLayout in_layout, TensorLayout out_layout) {
        Benchmarker<TypeCvt> benchmarker(handle_cuda());
        benchmarker.set_times(kRuns);
        out_layout.init_contiguous_stride();
        auto elapsed = benchmarker.execl({in_layout, out_layout});
        // The numerator is 2 * element count * element size * runs; the result
        // is scaled by 1000 and divided by 1024^3.
        // NOTE(review): presumably `elapsed` is in milliseconds -- confirm.
        // The label text (including its spelling) is reproduced unchanged.
        printf("layout: %s bandwith: %f gbps/s\n", in_layout.to_string().c_str(),
               2 * in_layout.total_nr_elems() * in_layout.dtype.size() * kRuns /
                       elapsed * 1000 / (1024 * 1024 * 1024));
    };

    TensorLayout src({16, 128, 128}, {49152, 384, 3}, dtype::Float32());
    TensorLayout dst({16, 128, 128}, {16384, 128, 1}, dtype::Float32());
    bench_one(src, dst);
}
TEST_F
(
CUDA
,
BENCHMARK_TYPE_CVT
)
{
UniformIntRNG
rng
{
-
128
,
127
};
auto
run
=
[
&
](
TensorLayout
src
,
TensorLayout
dst
)
{
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录