#include "test/common/elemwise.h" #include "./utils.h" #include "megdnn/oprs.h" #include "test/common/benchmarker.h" #include "test/common/checker.h" #include "test/common/rng.h" #include "test/common/tensor.h" #include "test/cuda/fixture.h" #include #include using namespace megdnn; using namespace test; #define cudnn_check(e) megdnn_assert((e) == CUDNN_STATUS_SUCCESS) namespace { __attribute__((unused)) cudnnTensorDescriptor_t make_cudnn_tensor_desc( const TensorLayout& ly) { megdnn_assert(ly.ndim && ly.ndim <= 4 && ly.is_contiguous()); int dim[4] = {1, 1, 1, 1}, stride[4] = {1, 1, 1, 1}; for (size_t i = 0; i < ly.ndim; ++i) { dim[i] = ly.shape[i]; stride[i] = ly.stride[i]; } cudnnTensorDescriptor_t ret; cudnn_check(cudnnCreateTensorDescriptor(&ret)); // cudnn requires tensors to be at-least 4D cudnn_check(cudnnSetTensor4dDescriptorEx( ret, CUDNN_DATA_FLOAT, dim[0], dim[1], dim[2], dim[3], stride[0], stride[1], stride[2], stride[3])); return ret; } void run_tensor_add( Handle* handle_cuda, const TensorND& a, const TensorND& b, const TensorND& c) { #if 1 cudnnHandle_t cudnn_handle; cudnn_check(cudnnCreate(&cudnn_handle)); cuda_check(cudaDeviceSynchronize()); cuda_check(cudaMemcpy( c.raw_ptr(), a.raw_ptr(), a.layout.span().dist_byte(), cudaMemcpyDeviceToDevice)); auto bdesc = make_cudnn_tensor_desc(b.layout), cdesc = make_cudnn_tensor_desc(c.layout); float alpha = 1, beta = 1; cudaProfilerStart(); cudnn_check(cudnnAddTensor( cudnn_handle, &alpha, bdesc, b.raw_ptr(), &beta, cdesc, c.raw_ptr())); cudaProfilerStop(); cudnn_check(cudnnDestroyTensorDescriptor(cdesc)); cudnn_check(cudnnDestroyTensorDescriptor(bdesc)); cudnn_check(cudnnDestroy(cudnn_handle)); cuda_check(cudaMemset(c.raw_ptr(), 0, c.layout.span().dist_byte())); cuda_check(cudaDeviceSynchronize()); #endif auto opr = handle_cuda->create_operator(); opr->param().mode = ElemwiseForward::Mode::ADD; cudaProfilerStart(); opr->exec({a, b}, c); cudaProfilerStop(); } } // anonymous namespace template class CUDA_ELEMWISE : public CUDA {}; TYPED_TEST_SUITE(CUDA_ELEMWISE, elemwise::test_types); TYPED_TEST(CUDA_ELEMWISE, run) { elemwise::run_test(this->handle_cuda()); } TEST_F(CUDA, ELEMWISE_IBYTE) { Checker checker(handle_cuda()); using Mode = ElemwiseForward::Param::Mode; UniformIntRNG i_rng{-128, 127}; UniformIntRNG ui_rng{0, 255}; checker.set_rng(0, &i_rng); auto run_unary = [&](size_t N, Mode mode, DType dtype) { checker.set_param(mode).set_dtype(0, dtype); checker.execs({{N}, {}}); }; #define RUN_UNARY_IBYTE(_dt) \ run_unary(100, Mode::RELU, _dt); \ run_unary(100, Mode::ABS, _dt); RUN_UNARY_IBYTE(dtype::Int8()); checker.set_rng(0, &i_rng); RUN_UNARY_IBYTE(dtype::Uint8()); #undef RUN_UNARY_IBYTE auto run_binary = [&](size_t N, size_t C, size_t H, size_t W, Mode mode, DType dtype) { checker.set_param(mode).set_dtype(0, dtype).set_dtype(1, dtype); checker.execs({{5}, {5}, {}}); checker.execs({{4}, {4}, {}}); checker.execs({{4}, {1}, {}}); checker.execs({{N, C / 4, H, W, 4}, {N, C / 4, H, W, 4}, {}}); checker.execs({{N, C / 4, H, W, 4}, {1, C / 4, 1, 1, 4}, {}}); checker.execs({{N, C / 32, H, W, 32}, {N, C / 32, H, W, 32}, {}}); checker.execs({{N, C / 32, H, W, 32}, {1, C / 32, 1, 1, 32}, {}}); checker.execs({{3, 5, 7}, {3, 5, 7}, {}}); checker.execs({{3, 5, 7}, {3, 5, 1}, {}}); checker.execs({{3, 5, 1}, {3, 5, 7}, {}}); checker.execs({{1}, {3, 5, 7}, {}}); checker.execs({{3, 5, 7}, {1}, {}}); }; #define RUN_BINARY_IBYTE(_dt) \ run_binary(4, 32, 10, 10, Mode::ADD, _dt); \ run_binary(4, 32, 10, 10, Mode::MUL, _dt); \ run_binary(4, 32, 10, 10, Mode::MAX, _dt); \ run_binary(4, 32, 10, 10, Mode::MIN, _dt); \ run_binary(4, 32, 10, 10, Mode::SUB, _dt); checker.set_rng(0, &i_rng).set_rng(1, &i_rng); RUN_BINARY_IBYTE(dtype::Int8()); checker.set_rng(0, &ui_rng).set_rng(1, &ui_rng); RUN_BINARY_IBYTE(dtype::Uint8()); #undef RUN_BINARY_IBYTE auto run_ternary = [&](size_t N, size_t C, size_t H, size_t W, Mode mode, DType dtype) { checker.set_param(mode).set_dtype(0, dtype).set_dtype(1, dtype).set_dtype( 2, dtype); checker.execs({{5}, {5}, {5}, {}}); checker.execs({{4}, {4}, {1}, {}}); checker.execs( {{N, C / 4, H, W, 4}, {N, C / 4, H, W, 4}, {N, C / 4, H, W, 4}, {}}); checker.execs( {{N, C / 4, H, W, 4}, {1, C / 4, 1, 1, 4}, {1, C / 4, 1, 1, 4}, {}}); checker.execs( {{N, C / 32, H, W, 32}, {N, C / 32, H, W, 32}, {N, C / 32, H, W, 32}, {}}); checker.execs( {{N, C / 32, H, W, 32}, {1, C / 32, 1, 1, 32}, {1, C / 32, 1, 1, 32}, {}}); checker.execs({{1}, {3, 5, 7}, {3, 5, 7}, {}}); checker.execs({{3, 5, 7}, {3, 5, 1}, {3, 5, 1}, {}}); checker.execs({{3, 5, 1}, {3, 5, 7}, {3, 5, 1}, {}}); checker.execs({{1}, {3, 5, 7}, {1}, {}}); checker.execs({{3, 5, 7}, {1}, {3, 5, 7}, {}}); }; #define RUN_TERNARY_IBYTE(_dt) run_ternary(4, 32, 10, 10, Mode::FUSE_MUL_ADD3, _dt); checker.set_rng(0, &i_rng).set_rng(1, &i_rng); RUN_TERNARY_IBYTE(dtype::Int8()); checker.set_rng(0, &ui_rng).set_rng(1, &ui_rng); RUN_TERNARY_IBYTE(dtype::Uint8()); #undef RUN_TERNARY_IBYTE } // from common/elemwise.cpp TEST_F(CUDA, ELEMWISE_BFLOAT16) { using Mode = ElemwiseForward::Param::Mode; Checker checker(handle_cuda()); // unary #define UNARY_TEST_CASE(_optr) \ checker.set_param(Mode::_optr).execs({{1, 127}, {}}); \ checker.set_param(Mode::_optr).execs({{1, 7}, {}}); #define BUILD_UNARY_TEST_CASE_FLOAT \ UNARY_TEST_CASE(ABS) \ UNARY_TEST_CASE(LOG) \ UNARY_TEST_CASE(COS) \ UNARY_TEST_CASE(SIN) \ UNARY_TEST_CASE(FLOOR) \ UNARY_TEST_CASE(CEIL) \ UNARY_TEST_CASE(SIGMOID) \ UNARY_TEST_CASE(EXP) \ UNARY_TEST_CASE(TANH) \ UNARY_TEST_CASE(FAST_TANH) \ UNARY_TEST_CASE(RELU) \ UNARY_TEST_CASE(ROUND) checker.set_dtype(0, dtype::BFloat16()); checker.set_dtype(1, dtype::BFloat16()); UniformFloatRNG rng0(1e-2, 6e1); checker.set_rng(0, &rng0); checker.set_epsilon(1e-2); BUILD_UNARY_TEST_CASE_FLOAT #undef UNARY_TEST_CASE #undef BUILD_UNARY_TEST_CASE_FLOAT // binary #define BINARY_COMPLATE_TEST_CASE(_optr) \ checker.set_param(Mode::_optr).execs({{3, 4, 7}, {3, 4, 7}, {}}); \ checker.set_param(Mode::_optr).execs({{3, 4, 5, 7}, {1, 4, 1, 1}, {}}); \ checker.set_param(Mode::_optr).execs({{1, 4, 1, 1}, {3, 4, 5, 7}, {}}); \ checker.set_param(Mode::_optr).execs({{3, 4, 7}, {1, 4, 1}, {}}); \ checker.set_param(Mode::_optr).execs({{1, 4, 1}, {3, 4, 7}, {}}); \ checker.set_param(Mode::_optr).execs({{3, 4, 5, 7}, {1, 1, 1, 1}, {}}); \ checker.set_param(Mode::_optr).execs({{1, 1, 1, 1}, {3, 4, 5, 7}, {}}); \ checker.set_param(Mode::_optr).execs({{1, 7}, {1, 7}, {}}); \ checker.set_param(Mode::_optr).execs({{1, 2, 2}, {1, 2, 1}, {}}); \ checker.set_param(Mode::_optr).execs({{1, 2, 1}, {1, 2, 2}, {}}); \ checker.set_param(Mode::_optr).execs({{1, 2, 2}, {1, 1, 1}, {}}); \ checker.set_param(Mode::_optr).execs({{1, 1, 1}, {1, 2, 2}, {}}); \ checker.set_param(Mode::_optr).execs({{3, 4, 1}, {3, 4, 1}, {}}); #define BUILD_BINARY_COMPLATE_TEST_CASE \ BINARY_COMPLATE_TEST_CASE(ADD) \ BINARY_COMPLATE_TEST_CASE(MUL) \ BINARY_COMPLATE_TEST_CASE(MAX) \ BINARY_COMPLATE_TEST_CASE(MIN) \ BINARY_COMPLATE_TEST_CASE(SUB) UniformFloatRNG rng1(1e-5, 7e1); checker.set_rng(0, &rng1); checker.set_epsilon(1e-2); checker.set_dtype(0, dtype::BFloat16()); checker.set_dtype(1, dtype::BFloat16()); BUILD_BINARY_COMPLATE_TEST_CASE #undef BINARY_COMPLATE_TEST_CASE #undef BUILD_BINARY_COMPLATE_TEST_CASE // ternary #define TERNARY_COMPLATE_TEST_CASE(_optr) \ checker.set_param(Mode::_optr).execs({{3, 4, 7}, {3, 4, 7}, {3, 4, 7}, {}}); \ checker.set_param(Mode::_optr) \ .execs({{1, 4, 1, 1}, {3, 4, 5, 7}, {1, 4, 1, 1}, {}}); \ checker.set_param(Mode::_optr).execs({{1, 4, 1}, {3, 4, 7}, {1, 4, 1}, {}}); \ checker.set_param(Mode::_optr) \ .execs({{3, 4, 5, 7}, {3, 4, 5, 7}, {1, 1, 1, 1}, {}}); \ checker.set_param(Mode::_optr).execs({{1, 7}, {1, 7}, {1, 7}, {}}); \ checker.set_param(Mode::_optr).execs({{1, 2, 1}, {1, 2, 2}, {1, 2, 1}, {}}); \ checker.set_param(Mode::_optr).execs({{1, 2, 2}, {1, 2, 2}, {1, 1, 1}, {}}); \ checker.set_param(Mode::_optr).execs({{3, 4, 1}, {3, 4, 1}, {3, 4, 1}, {}}); #define BUILD_TERNARY_COMPLATE_TEST_CASE TERNARY_COMPLATE_TEST_CASE(FUSE_MUL_ADD3) UniformFloatRNG rng2(1e-5, 7e1); checker.set_rng(0, &rng2); checker.set_epsilon(1e-2); checker.set_dtype(0, dtype::BFloat16()); checker.set_dtype(1, dtype::BFloat16()); checker.set_dtype(2, dtype::BFloat16()); BUILD_TERNARY_COMPLATE_TEST_CASE #undef TERNARY_COMPLATE_TEST_CASE #undef BUILD_TERNARY_COMPLATE_TEST_CASE } TEST_F(CUDA, ELEMWISE_ADD_BCAST_10_INT8_INPLACE) { constexpr size_t A = 2, B = 48, C0 = 14, C1 = 14, C = C0 * C1; SyncedTensor t0(handle_cuda(), {TensorShape{A, B, C0, C1}, dtype::Int8()}), t1(handle_cuda(), {TensorShape{1, B, C0, C1}, dtype::Int8()}), t2(handle_cuda(), {TensorShape{A, B, C0, C1}, dtype::Int8()}); UniformIntRNG rng{-128, 127}; rng.gen(t0.tensornd_host()); rng.gen(t1.tensornd_host()); auto p0 = t0.ptr_host(), p1 = t1.ptr_host(); auto p2 = t2.ptr_mutable_host(); for (size_t i = 0; i < A; ++i) { for (size_t j = 0; j < B; ++j) { for (size_t k = 0; k < C; ++k) { auto off0 = j * C + k; auto off1 = i * B * C + j * C + k; p2[off1] = p0[off1] + p1[off0]; } } } auto opr = handle_cuda()->create_operator(); opr->param().mode = ElemwiseForward::Mode::ADD; opr->exec({t0.tensornd_dev(), t1.tensornd_dev()}, t0.tensornd_dev()); auto pt = t0.ptr_host(); for (size_t i = 0; i < A; ++i) { for (size_t j = 0; j < B; ++j) { for (size_t k = 0; k < C; ++k) { auto off = i * B * C + j * C + k; ASSERT_EQ(pt[off], p2[off]); } } } } //! the memory of this test case is too large, sometimes will fail on tx1 TEST_F(CUDA, ELEMWISE_BENCHMARK_DENSE) { constexpr size_t A = 256 * 1024 * 64, S0 = 16, S1 = 256, S2 = 64, S3 = 64; static_assert(A == S0 * S1 * S2 * S3, "bad value"); SyncedTensor<> t0(handle_cuda(), {TensorShape{S0, S1, S2, S3}, dtype::Float32()}), t1(handle_cuda(), {TensorShape{S0, S1, S2, S3}, dtype::Float32()}); UniformFloatRNG rng{-2.f, 2.f}; rng.gen(t0.tensornd_host()); run_tensor_add( handle_cuda(), t0.tensornd_dev(), t0.tensornd_dev(), t1.tensornd_dev()); auto p0 = t0.ptr_host(), p1 = t1.ptr_host(); for (size_t i = 0; i < A; ++i) { ASSERT_EQ(p0[i] + p0[i], p1[i]) << "at index " << i << "/" << A; } } #if MEGDNN_WITH_BENCHMARK TEST_F(CUDA, ELEMWISE_BENCHMARK_BCAST_101) { constexpr size_t A = 511, B = 509, C0 = 23, C1 = 23, C = C0 * C1; SyncedTensor<> t0(handle_cuda(), {TensorShape{A, B, C0, C1}, dtype::Float32()}), t1(handle_cuda(), {TensorShape{1, B, 1, 1}, dtype::Float32()}), t2(handle_cuda(), {TensorShape{A, B, C0, C1}, dtype::Float32()}); UniformFloatRNG rng{-2.f, 2.f}; rng.gen(t0.tensornd_host()); rng.gen(t1.tensornd_host()); run_tensor_add( handle_cuda(), t0.tensornd_dev(), t1.tensornd_dev(), t2.tensornd_dev()); auto p0 = t0.ptr_host(), p1 = t1.ptr_host(), p2 = t2.ptr_host(); for (size_t i = 0; i < A; ++i) { for (size_t j = 0; j < B; ++j) { for (size_t k = 0; k < C; ++k) { auto off = i * B * C + j * C + k; ASSERT_EQ(p0[off] + p1[j], p2[off]); } } } } TEST_F(CUDA, ELEMWISE_BENCHMARK_BCAST_10) { constexpr size_t A = 11583, B = 11587; SyncedTensor<> t0(handle_cuda(), {TensorShape{A, B}, dtype::Float32()}), t1(handle_cuda(), {TensorShape{1, B}, dtype::Float32()}), t2(handle_cuda(), {TensorShape{A, B}, dtype::Float32()}); UniformFloatRNG rng{-2.f, 2.f}; rng.gen(t0.tensornd_host()); rng.gen(t1.tensornd_host()); run_tensor_add( handle_cuda(), t0.tensornd_dev(), t1.tensornd_dev(), t2.tensornd_dev()); auto p0 = t0.ptr_host(), p1 = t1.ptr_host(), p2 = t2.ptr_host(); for (size_t i = 0; i < A; ++i) { for (size_t j = 0; j < B; ++j) { auto off = i * B + j; ASSERT_EQ(p0[off] + p1[j], p2[off]); } } } TEST_F(CUDA, ELEMWISE_BENCHMARK_BCAST_01) { constexpr size_t A = 11583, B = 11587; SyncedTensor<> t0(handle_cuda(), {TensorShape{1, A, B}, dtype::Float32()}), t1(handle_cuda(), {TensorShape{1, A, 1}, dtype::Float32()}), t2(handle_cuda(), {TensorShape{1, A, B}, dtype::Float32()}); UniformFloatRNG rng{-2.f, 2.f}; rng.gen(t0.tensornd_host()); rng.gen(t1.tensornd_host()); run_tensor_add( handle_cuda(), t0.tensornd_dev(), t1.tensornd_dev(), t2.tensornd_dev()); auto p0 = t0.ptr_host(), p1 = t1.ptr_host(), p2 = t2.ptr_host(); for (size_t i = 0; i < A; ++i) { for (size_t j = 0; j < B; ++j) { auto off = i * B + j; ASSERT_EQ(p0[off] + p1[i], p2[off]); } } } TEST_F(CUDA, BENCHMARK_ELEMWISE_IBYTE) { Benchmarker bencher(handle_cuda()); using Mode = ElemwiseForward::Param::Mode; auto run_bench = [&](size_t N, size_t C, size_t H, size_t W) { size_t nr_times = 100; bencher.set_times(nr_times) .set_param(Mode::FUSE_ADD_RELU) .set_dtype(0, dtype::Int8()) .set_dtype(1, dtype::Int8()); auto time = bencher.execs({{N * C * H * W + 1}, {N * C * H * W + 1}, {}}) / nr_times; printf("time = %.2fms, bandwidth = %.2fGB/s\n", time, (3.0 * (N * C * H * W + 1)) / (time * 1e6)); time = bencher.execs({{N, C / 4, H, W, 4}, {N, C / 4, H, W, 4}, {}}) / nr_times; printf("time = %.2fms, bandwidth = %.2fGB/s\n", time, (3.0 * N * C * H * W) / (time * 1e6)); time = bencher.execs({{N, C / 4, H, W, 4}, {1, C / 4, 1, 1, 4}, {}}) / nr_times; printf("time = %.2fms, bandwidth = %.2fGB/s\n", time, (C + 2.0 * N * C * H * W) / (time * 1e6)); time = bencher.execs({{N, C / 4, H, W, 4}, {1}, {}}) / nr_times; printf("time = %.2fms, bandwidth = %.2fGB/s\n", time, (2.0 * N * C * H * W + 1) / (time * 1e6)); time = bencher.execs({{N, C / 32, H, W, 32}, {N, C / 32, H, W, 32}, {}}) / nr_times; printf("time = %.2fms, bandwidth = %.2fGB/s\n", time, (3.0 * N * C * H * W) / (time * 1e6)); time = bencher.execs({{N, C / 32, H, W, 32}, {1, C / 32, 1, 1, 32}, {}}) / nr_times; printf("time = %.2fms, bandwidth = %.2fGB/s\n", time, (C + 2.0 * N * C * H * W) / (time * 1e6)); bencher.set_dtype(0, dtype::Float32()).set_dtype(1, dtype::Float32()); time = bencher.execs({{N, C / 4, H, W}, {N, C / 4, H, W}, {}}) / nr_times; printf("time = %.2fms, bandwidth = %.2fGB/s\n", time, (3.0 * N * C * H * W) / (time * 1e6)); time = bencher.execs({{N, C / 4, H, W}, {1, C / 4, 1, 1}, {}}) / nr_times; printf("time = %.2fms, bandwidth = %.2fGB/s\n", time, (C + 2.0 * N * C * H * W) / (time * 1e6)); }; run_bench(256, 256, 56, 56); } TEST_F(CUDA, BENCHMARK_ELEMWISE_MIN_MAX) { Benchmarker bencher(handle_cuda()); using Mode = ElemwiseForward::Param::Mode; UniformIntRNG const_1{1, 1}, rng{-128, 127}; auto run_bench = [&](size_t N, size_t C, size_t H, size_t W, DType dtype) { size_t nr_times = 1000; bencher.set_times(nr_times) .set_param(Mode::MIN) .set_rng(0, &rng) .set_rng(1, &rng) .set_dtype(0, dtype) .set_dtype(1, dtype); auto time = bencher.execs({{N, C / 4, H, W, 4}, {N, C / 4, H, W, 4}, {}}) / nr_times; printf("time = %.2fms, bandwidth = %.2fGB/s\n", time, (3.0 * N * C * H * W) / (time * 1e6)); bencher.set_param(Mode::MAX).set_rng(0, &const_1).set_rng(1, &const_1); time = bencher.execs({{N, C / 4, H, W, 4}, {N, C / 4, H, W, 4}, {}}) / nr_times; printf("time = %.2fms, bandwidth = %.2fGB/s\n", time, (3.0 * N * C * H * W) / (time * 1e6)); }; run_bench(256, 256, 56, 56, dtype::Int8()); } #endif // vim: syntax=cpp.doxygen