提交 4c0bff1d 编写于 作者: M Megvii Engine Team

refactor(megdnn): refactor TEGRA_X1/X2 macro

GitOrigin-RevId: 1aa78712c6413ed770996893bc2409524da13758
上级 758549b9
...@@ -83,6 +83,12 @@ ...@@ -83,6 +83,12 @@
cuda_check(cudaGetLastError()); \ cuda_check(cudaGetLastError()); \
} while (0) } while (0)
#if MEGDNN_TEGRA_X2
//! TX2 only has 256 CUDA cores
#define NR_THREADS 256
#define NR_THREADS_X 32
#define NR_THREADS_Y 8
#else
#if MEGDNN_THREADS_512 #if MEGDNN_THREADS_512
#define NR_THREADS 512 #define NR_THREADS 512
#define NR_THREADS_X 32 #define NR_THREADS_X 32
...@@ -92,6 +98,7 @@ ...@@ -92,6 +98,7 @@
#define NR_THREADS_X 32 #define NR_THREADS_X 32
#define NR_THREADS_Y 32 #define NR_THREADS_Y 32
#endif #endif
#endif
#define DIVUP(x, y) (((x) + (y)-1) / (y)) #define DIVUP(x, y) (((x) + (y)-1) / (y))
#define ROUNDUP(x, y) (DIVUP(x, y) * (y)) #define ROUNDUP(x, y) (DIVUP(x, y) * (y))
......
...@@ -22,6 +22,8 @@ ...@@ -22,6 +22,8 @@
#include "test/cuda/fixture.h" #include "test/cuda/fixture.h"
#include "test/cuda/utils.h" #include "test/cuda/utils.h"
#include <cudnn.h>
#define V1(x) #x #define V1(x) #x
#define V(x) V1(x) #define V(x) V1(x)
#define CUDNN_VERSION_STRING \ #define CUDNN_VERSION_STRING \
...@@ -161,23 +163,6 @@ TEST_F(CUDA, CONVOLUTION_1X1_FORWARD) { ...@@ -161,23 +163,6 @@ TEST_F(CUDA, CONVOLUTION_1X1_FORWARD) {
} }
} }
TEST_F(CUDA, BENCHMARK_CONVOLUTION_1X1_FORWARD) {
using namespace convolution;
std::vector<TestArg> args = get_1x1_args();
Benchmarker<ConvolutionForward> marker(handle_cuda());
NormalRNG default_rng;
for (auto&& arg : args) {
float scale = 1.0f / sqrt(arg.filter[1] * arg.filter[2] * arg.filter[3]);
UniformFloatRNG rng(scale, 2 * scale);
marker.set_dtype(0, dtype::Float32())
.set_dtype(1, dtype::Float32())
.set_rng(0, &default_rng)
.set_rng(1, &default_rng)
.set_param(arg.param)
.execs({arg.src, arg.filter, {}});
}
}
TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA) { TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA) {
using namespace convolution; using namespace convolution;
std::vector<TestArg> args = get_args_cuda_conv_bwd_data(); std::vector<TestArg> args = get_args_cuda_conv_bwd_data();
...@@ -767,6 +752,23 @@ TEST_F(CUDA, CONVOLUTION_BACKWARD_DEPTHWISE_LARGE_FILTER) { ...@@ -767,6 +752,23 @@ TEST_F(CUDA, CONVOLUTION_BACKWARD_DEPTHWISE_LARGE_FILTER) {
} }
#if MEGDNN_WITH_BENCHMARK #if MEGDNN_WITH_BENCHMARK
//! Benchmarks float32 ConvolutionForward on CUDA for every case returned by
//! get_1x1_args(). Inputs 0 and 1 (arg.src and arg.filter) are both filled
//! from a single NormalRNG; the third shape is passed to execs() as {}.
TEST_F(CUDA, BENCHMARK_CONVOLUTION_1X1_FORWARD) {
    using namespace convolution;
    std::vector<TestArg> args = get_1x1_args();
    Benchmarker<ConvolutionForward> marker(handle_cuda());
    NormalRNG default_rng;
    for (auto&& arg : args) {
        // Every case reuses default_rng for both inputs, so no per-case
        // generator is constructed here.
        marker.set_dtype(0, dtype::Float32())
                .set_dtype(1, dtype::Float32())
                .set_rng(0, &default_rng)
                .set_rng(1, &default_rng)
                .set_param(arg.param)
                .execs({arg.src, arg.filter, {}});
    }
}
TEST_F(CUDA, CONV_FWD_BENCHMARK) { TEST_F(CUDA, CONV_FWD_BENCHMARK) {
auto run = [&](size_t N, size_t OC, size_t IC, size_t IH, size_t IW, size_t SH = 1, auto run = [&](size_t N, size_t OC, size_t IC, size_t IH, size_t IW, size_t SH = 1,
size_t SW = 1, size_t FH = 1, size_t FW = 1, size_t PH = 0, size_t SW = 1, size_t FH = 1, size_t FW = 1, size_t PH = 0,
......
...@@ -44,6 +44,7 @@ TEST_F(CUDA, FLIP) { ...@@ -44,6 +44,7 @@ TEST_F(CUDA, FLIP) {
} }
} }
#if MEGDNN_WITH_BENCHMARK
TEST_F(CUDA, FLIP_BENCHMARK) { TEST_F(CUDA, FLIP_BENCHMARK) {
auto run = [&](const TensorShapeArray& shapes) { auto run = [&](const TensorShapeArray& shapes) {
Benchmarker<Flip> benchmarker(handle_cuda()); Benchmarker<Flip> benchmarker(handle_cuda());
...@@ -75,6 +76,7 @@ TEST_F(CUDA, FLIP_BENCHMARK) { ...@@ -75,6 +76,7 @@ TEST_F(CUDA, FLIP_BENCHMARK) {
run(shapes); run(shapes);
} }
#endif
} // namespace test } // namespace test
} // namespace megdnn } // namespace megdnn
......
...@@ -14,6 +14,7 @@ ...@@ -14,6 +14,7 @@
#include "test/common/images2neibs.h" #include "test/common/images2neibs.h"
#include "test/common/rng.h" #include "test/common/rng.h"
#include "test/cuda/benchmark.h" #include "test/cuda/benchmark.h"
#include "test/cuda/utils.h"
namespace megdnn { namespace megdnn {
namespace test { namespace test {
...@@ -44,6 +45,7 @@ TEST_F(CUDA, BENCHMARK_IMAGES2NEIBS_FORWARD) { ...@@ -44,6 +45,7 @@ TEST_F(CUDA, BENCHMARK_IMAGES2NEIBS_FORWARD) {
#endif #endif
TEST_F(CUDA, IMAGES2NEIBS_BACKWARD) { TEST_F(CUDA, IMAGES2NEIBS_BACKWARD) {
require_compute_capability(6, 1);
UniformFloatRNG rng(0, 1); UniformFloatRNG rng(0, 1);
auto args = images2neibs::get_args(); auto args = images2neibs::get_args();
for (auto&& arg : args) { for (auto&& arg : args) {
......
...@@ -39,6 +39,11 @@ TEST_F(CUDA_ERROR_INFO, INDEXING_ONE_HOT) { ...@@ -39,6 +39,11 @@ TEST_F(CUDA_ERROR_INFO, INDEXING_ONE_HOT) {
ASSERT_TRUE(failed); ASSERT_TRUE(failed);
} }
//! Calls run_indexing_set_one_hot_test() with the handle returned by
//! handle_cuda(); all checks live inside that helper.
TEST_F(CUDA, INDEXING_SET_ONE_HOT) {
run_indexing_set_one_hot_test(handle_cuda());
}
#if MEGDNN_WITH_BENCHMARK
TEST_F(CUDA, BENCHMARK_INDEXING_ONE_HOT) { TEST_F(CUDA, BENCHMARK_INDEXING_ONE_HOT) {
Benchmarker<IndexingOneHot> bench{handle_cuda()}; Benchmarker<IndexingOneHot> bench{handle_cuda()};
bench.set_times(1); bench.set_times(1);
...@@ -53,9 +58,6 @@ TEST_F(CUDA, BENCHMARK_INDEXING_ONE_HOT) { ...@@ -53,9 +58,6 @@ TEST_F(CUDA, BENCHMARK_INDEXING_ONE_HOT) {
printf("bandwidth: %.2fGiB/s\n", printf("bandwidth: %.2fGiB/s\n",
A * B * D * sizeof(float) / 1024.0 / 1024 / 1024 / time); A * B * D * sizeof(float) / 1024.0 / 1024 / 1024 / time);
} }
#endif
TEST_F(CUDA, INDEXING_SET_ONE_HOT) {
run_indexing_set_one_hot_test(handle_cuda());
}
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
...@@ -14,13 +14,12 @@ ...@@ -14,13 +14,12 @@
#include "test/common/benchmarker.h" #include "test/common/benchmarker.h"
#include "test/common/checker.h" #include "test/common/checker.h"
#include "test/common/matrix_mul.h" #include "test/common/matrix_mul.h"
#include "test/cuda/utils.h"
#if defined(cuda_check) #if defined(cuda_check)
#undef cuda_check #undef cuda_check
#endif #endif
#include "test/cuda/utils.h" #include "src/cuda/utils.h"
#include <cuda.h>
namespace megdnn { namespace megdnn {
namespace test { namespace test {
...@@ -47,13 +46,7 @@ TEST_F(CUDA, MATRIX_MUL_QUANTIZED4x4x32_EXCEPTION) { ...@@ -47,13 +46,7 @@ TEST_F(CUDA, MATRIX_MUL_QUANTIZED4x4x32_EXCEPTION) {
} }
TEST_F(CUDA, MATRIX_MUL_QUANTIZED4x4x32) { TEST_F(CUDA, MATRIX_MUL_QUANTIZED4x4x32) {
if (cuda::current_device_prop().major < 7 || require_compute_capability(7, 5);
(cuda::current_device_prop().major == 7 &&
cuda::current_device_prop().minor < 5)) {
printf("Skip CUDA.MATRIX_MUL_QUANTIZED4x4x32 test as current device "
"doesn't support\n");
return;
}
Checker<MatrixMul> checker(handle_cuda(), false); Checker<MatrixMul> checker(handle_cuda(), false);
using Param = MatrixMul::Param; using Param = MatrixMul::Param;
Param param; Param param;
...@@ -65,21 +58,15 @@ TEST_F(CUDA, MATRIX_MUL_QUANTIZED4x4x32) { ...@@ -65,21 +58,15 @@ TEST_F(CUDA, MATRIX_MUL_QUANTIZED4x4x32) {
checker.exec({{256, 256}, {256, 256}, {256, 256}}); checker.exec({{256, 256}, {256, 256}, {256, 256}});
auto args = matrix_mul::get_matmul_args(); auto args = matrix_mul::get_matmul_args();
for (auto arg : args) { for (auto arg : args) {
size_t m = DIVUP(arg.m, 8) * 8, n = DIVUP(arg.n, 8) * 8, size_t m = (arg.m + 7) / 8 * 8, n = (arg.n + 7) / 8 * 8,
k = DIVUP(arg.k, 32) * 32; k = (arg.k + 31) / 32 * 32;
checker.exec({{m, k}, {n, k}, {m, n}}); checker.exec({{m, k}, {n, k}, {m, n}});
} }
} }
#if MEGDNN_WITH_BENCHMARK #if MEGDNN_WITH_BENCHMARK
TEST_F(CUDA, BENCHMARK_MATRIX_MUL_QUANTIZED4x4x32) { TEST_F(CUDA, BENCHMARK_MATRIX_MUL_QUANTIZED4x4x32) {
if (cuda::current_device_prop().major < 7 || require_compute_capability(7, 5);
(cuda::current_device_prop().major == 7 &&
cuda::current_device_prop().minor < 5)) {
printf("Skip CUDA.BENCHMARK_MATRIX_MUL_QUANTIZED4x4x32 test as current "
"device doesn't support\n");
return;
}
Benchmarker<MatrixMul> bencher(handle_cuda()); Benchmarker<MatrixMul> bencher(handle_cuda());
using Param = MatrixMul::Param; using Param = MatrixMul::Param;
Param param; Param param;
...@@ -102,14 +89,7 @@ TEST_F(CUDA, BENCHMARK_MATRIX_MUL_QUANTIZED4x4x32) { ...@@ -102,14 +89,7 @@ TEST_F(CUDA, BENCHMARK_MATRIX_MUL_QUANTIZED4x4x32) {
} }
TEST_F(CUDA, PEAK_BENCHMARK_MATRIX_MUL_QUANTIZED4x4x32) { TEST_F(CUDA, PEAK_BENCHMARK_MATRIX_MUL_QUANTIZED4x4x32) {
if (cuda::current_device_prop().major < 7 || require_compute_capability(7, 5);
(cuda::current_device_prop().major == 7 &&
cuda::current_device_prop().minor < 5)) {
printf("Skip CUDA.PEAK_BENCHMARK_MATRIX_MUL_QUANTIZED4x4x32 test as "
"current "
"device doesn't support\n");
return;
}
Benchmarker<MatrixMul> bencher(handle_cuda()); Benchmarker<MatrixMul> bencher(handle_cuda());
using Param = MatrixMul::Param; using Param = MatrixMul::Param;
Param param; Param param;
......
...@@ -188,8 +188,7 @@ TEST_F(CUDA, PADDING_REPLICATE2) { ...@@ -188,8 +188,7 @@ TEST_F(CUDA, PADDING_REPLICATE2) {
6, 7, 7, 8, 9, 9, 9, 9})}); 6, 7, 7, 8, 9, 9, 9, 9})});
} }
// #if MEGDNN_WITH_BENCHMARK #if MEGDNN_WITH_BENCHMARK
TEST_F(CUDA, BENCHMARK_PADDING_CONSTANT) { TEST_F(CUDA, BENCHMARK_PADDING_CONSTANT) {
using Param = Padding::Param; using Param = Padding::Param;
...@@ -240,5 +239,4 @@ TEST_F(CUDA, BENCHMARK_PADDING_CONSTANT) { ...@@ -240,5 +239,4 @@ TEST_F(CUDA, BENCHMARK_PADDING_CONSTANT) {
run(shapes, param); run(shapes, param);
} }
} }
#endif
// #endif
\ No newline at end of file
...@@ -40,6 +40,7 @@ TEST_F(CUDA, ROTATE) { ...@@ -40,6 +40,7 @@ TEST_F(CUDA, ROTATE) {
} }
} }
#if MEGDNN_WITH_BENCHMARK
TEST_F(CUDA, BENCHMARK_ROTATE) { TEST_F(CUDA, BENCHMARK_ROTATE) {
auto run = [&](const TensorShapeArray& shapes) { auto run = [&](const TensorShapeArray& shapes) {
Benchmarker<Rotate> benchmarker(handle_cuda()); Benchmarker<Rotate> benchmarker(handle_cuda());
...@@ -74,6 +75,7 @@ TEST_F(CUDA, BENCHMARK_ROTATE) { ...@@ -74,6 +75,7 @@ TEST_F(CUDA, BENCHMARK_ROTATE) {
run(shapes); run(shapes);
} }
#endif
} // namespace rotate } // namespace rotate
} // namespace test } // namespace test
......
...@@ -42,18 +42,6 @@ TEST_F(CUDA, SLIDINGWINDOWTRANSPOSE_FORWARD) { ...@@ -42,18 +42,6 @@ TEST_F(CUDA, SLIDINGWINDOWTRANSPOSE_FORWARD) {
} }
} }
#if MEGDNN_WITH_BENCHMARK
TEST_F(CUDA, BENCHMARK_SLIDINGWINDOWTRANSPOSE_FORWARD) {
auto args = sliding_window_transpose::get_benchmark_args();
for (auto&& arg : args) {
CUBenchmarker<SlidingWindowTransposeForward> bencher(handle_cuda());
bencher.set_param(arg.param)
.set_dtype(0, dtype::Float32())
.exec(TensorShapeArray{arg.ishape, {}});
}
}
#endif
TEST_F(CUDA, SLIDINGWINDOWTRANSPOSE_BACKWARD) { TEST_F(CUDA, SLIDINGWINDOWTRANSPOSE_BACKWARD) {
UniformFloatRNG rng(0, 1); UniformFloatRNG rng(0, 1);
auto args = sliding_window_transpose::get_args(); auto args = sliding_window_transpose::get_args();
...@@ -78,6 +66,18 @@ TEST_F(CUDA, SLIDINGWINDOWTRANSPOSE_BACKWARD) { ...@@ -78,6 +66,18 @@ TEST_F(CUDA, SLIDINGWINDOWTRANSPOSE_BACKWARD) {
} }
} }
#if MEGDNN_WITH_BENCHMARK
//! Benchmarks SlidingWindowTransposeForward on CUDA: one float32 run for each
//! case returned by sliding_window_transpose::get_benchmark_args().
TEST_F(CUDA, BENCHMARK_SLIDINGWINDOWTRANSPOSE_FORWARD) {
    using Bench = CUBenchmarker<SlidingWindowTransposeForward>;
    auto cases = sliding_window_transpose::get_benchmark_args();
    for (auto&& bench_case : cases) {
        // A fresh benchmarker is built for every case.
        Bench runner(handle_cuda());
        runner.set_param(bench_case.param)
                .set_dtype(0, dtype::Float32())
                .exec(TensorShapeArray{bench_case.ishape, {}});
    }
}
#endif
} // namespace test } // namespace test
} // namespace megdnn } // namespace megdnn
......
...@@ -33,25 +33,6 @@ TEST_F(CUDA, TYPE_CVT) { ...@@ -33,25 +33,6 @@ TEST_F(CUDA, TYPE_CVT) {
} }
} }
TEST_F(CUDA, BENCHMARK_TYPE_CVT_LAST_NOT_CONTIG) {
const size_t RUNS = 3;
auto run = [&](TensorLayout src, TensorLayout dst) {
Benchmarker<TypeCvt> benchmarker(handle_cuda());
auto&& layout = src;
benchmarker.set_times(RUNS);
dst.init_contiguous_stride();
auto used = benchmarker.execl({src, dst});
printf("layout: %s bandwith: %f gbps/s\n", layout.to_string().c_str(),
2 * layout.total_nr_elems() * layout.dtype.size() * RUNS / used * 1000 /
(1024 * 1024 * 1024));
};
TensorLayout src({16, 128, 128}, {49152, 384, 3}, dtype::Float32()),
dst({16, 128, 128}, {16384, 128, 1}, dtype::Float32());
run(src, dst);
}
TEST_F(CUDA, QUANTIZED_TYPECVT) { TEST_F(CUDA, QUANTIZED_TYPECVT) {
UniformIntRNG int_rng{-66, 66}; UniformIntRNG int_rng{-66, 66};
Checker<TypeCvt> checker(handle_cuda()); Checker<TypeCvt> checker(handle_cuda());
...@@ -162,6 +143,25 @@ TEST_F(CUDA, TYPE_CVT_BFLOAT16) { ...@@ -162,6 +143,25 @@ TEST_F(CUDA, TYPE_CVT_BFLOAT16) {
} }
#if MEGDNN_WITH_BENCHMARK #if MEGDNN_WITH_BENCHMARK
//! Benchmarks a float32 -> float32 TypeCvt on CUDA for a {16, 128, 128}
//! source whose second constructor argument is {49152, 384, 3} (presumably
//! the strides, i.e. a last dimension that is not contiguous -- confirm
//! against TensorLayout). Prints the layout and a bandwidth figure.
TEST_F(CUDA, BENCHMARK_TYPE_CVT_LAST_NOT_CONTIG) {
    const size_t RUNS = 3;

    auto run = [&](TensorLayout src, TensorLayout dst) {
        Benchmarker<TypeCvt> benchmarker(handle_cuda());
        benchmarker.set_times(RUNS);
        // The destination is always benchmarked with contiguous strides,
        // whatever strides the caller passed in.
        dst.init_contiguous_stride();
        auto used = benchmarker.execl({src, dst});
        // The factor 2 presumably counts one read plus one write per element.
        // NOTE(review): the "gbps/s" label looks off for a value divided by
        // 1024^3; confirm the unit of `used` before renaming it.
        printf("layout: %s bandwidth: %f gbps/s\n", src.to_string().c_str(),
               2 * src.total_nr_elems() * src.dtype.size() * RUNS / used * 1000 /
                       (1024 * 1024 * 1024));
    };

    TensorLayout src({16, 128, 128}, {49152, 384, 3}, dtype::Float32()),
            dst({16, 128, 128}, {16384, 128, 1}, dtype::Float32());
    run(src, dst);
}
TEST_F(CUDA, BENCHMARK_TYPE_CVT) { TEST_F(CUDA, BENCHMARK_TYPE_CVT) {
UniformIntRNG rng{-128, 127}; UniformIntRNG rng{-128, 127};
auto run = [&](TensorLayout src, TensorLayout dst) { auto run = [&](TensorLayout src, TensorLayout dst) {
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册