#include "hcc_detail/hcc_defs_prologue.h" #include "megdnn/oprs.h" #include "test/common/benchmarker.h" #include "test/common/checker.h" #include "test/common/relayout.h" #include "test/rocm/benchmarker.h" #include "test/rocm/fixture.h" using namespace megdnn; using namespace test; namespace { template class ROCM_RELAYOUT : public ROCM {}; TYPED_TEST_CASE(ROCM_RELAYOUT, relayout::test_types); TYPED_TEST(ROCM_RELAYOUT, run) { relayout::run_test(this->handle_rocm()); } } // namespace TEST_F(ROCM, RELAYOUT_MEMCPY_ASYNC) { Checker checker(handle_rocm()); checker.set_epsilon(1e-3); struct Arg { TensorLayout src, dst; Arg(TensorLayout src, TensorLayout dst) : src(src), dst(dst) {} }; std::vector args; // test for contig args.emplace_back( Arg{{{51200}, {1}, dtype::Float32()}, {{51200}, {1}, dtype::Float32()}}); // test for copy_2d args.emplace_back( Arg{{{51200}, {9}, dtype::Float32()}, {{51200}, {1}, dtype::Float32()}}); for (auto&& arg : args) { checker.set_dtype(0, dtype::Float32()) .set_dtype(1, dtype::Float32()) .execl({arg.src, arg.dst}); } } #if MEGDNN_WITH_BENCHMARK TEST_F(ROCM, RELAYOUT_BENCHMARK) { //! benchmark contious layout, such as (a, b, c, d) -> (b, a, c,d) //! just change the first two axis megdnn::rocm::enable_miopen_algo_search(handle_rocm(), true); auto benchmarker = ROCMBenchmarker(handle_rocm(), handle_naive(false)); benchmarker.set_display(true); auto run = [&](const TensorLayoutArray& layouts) { for (auto&& layout : layouts) { TensorLayout src = layout.dimshuffle({1, 0, 2}); TensorLayout dst = layout; std::swap(dst.shape[0], dst.shape[1]); dst.init_contiguous_stride(); benchmarker.execl({src, dst}); auto used = benchmarker.execl({src, dst}); used = benchmarker.execl({src, dst}); printf("layout: %s bandwith: %f gbps/s\n", layout.to_string().c_str(), 2 * layout.total_nr_elems() * layout.dtype.size() / used * 1000 / (1024 * 1024 * 1024)); } }; TensorLayoutArray layouts = { {{12, 23, 2}, dtype::Int32()}, {{12, 23, 8}, dtype::Int32()}, {{12, 23, 17}, dtype::Int32()}, {{12, 23, 64}, dtype::Int32()}, {{12, 23, 129}, dtype::Int32()}, {{12, 23, 256}, dtype::Int32()}, {{12, 23, 1029}, dtype::Int32()}, {{12, 23, 4096}, dtype::Int32()}, {{12, 23, 9143}, dtype::Int32()}, {{12, 23, 18284}, dtype::Int32()}, {{2, 2, 1000000}, dtype::Int32()}, }; run(layouts); auto run2 = [&](const TensorLayoutArray& layouts) { for (auto&& layout : layouts) { TensorLayout src = layout.dimshuffle({0, 2, 1, 3}); TensorLayout dst = layout; std::swap(dst.shape[0], dst.shape[1]); dst.init_contiguous_stride(); benchmarker.execl({src, dst}); auto used = benchmarker.execl({src, dst}); used = benchmarker.execl({src, dst}); printf("layout: %s bandwith: %f gbps/s\n", layout.to_string().c_str(), 2 * layout.total_nr_elems() * layout.dtype.size() / used * 1000 / (1024 * 1024 * 1024)); } }; layouts = { {{3, 12, 24, 100}, dtype::Int32()}, {{3, 12, 24, 1029}, dtype::Int32()}, {{3, 4, 24, 9143}, dtype::Int32()}, {{3, 4, 24, 18284}, dtype::Int32()}, }; run2(layouts); } TEST_F(ROCM, RELAYOUT_LAST_CONTIG_BENCHMARK) { megdnn::rocm::enable_miopen_algo_search(handle_rocm(), true); auto benchmarker = ROCMBenchmarker(handle_rocm(), handle_naive(false)); benchmarker.set_display(true); TensorLayout src = TensorLayout({5, 5, 100000}, {800000, 100000, 1}, dtype::Float32()); TensorLayout dst = TensorLayout({5, 5, 100000}, {700000, 100000, 1}, dtype::Float32()); benchmarker.execl({src, dst}); auto used = benchmarker.execl({src, dst}); used = benchmarker.execl({src, dst}); printf("src: %s dst: %s bandwith: %f gbps/s\n", src.to_string().c_str(), dst.to_string().c_str(), 2 * src.total_nr_elems() * src.dtype.size() / used * 1000 / (1024 * 1024 * 1024)); } #endif TEST_F(ROCM, RELAYOUT) { struct Arg { TensorLayout src, dst; Arg(TensorLayout src, TensorLayout dst) : src(src), dst(dst) {} }; std::vector args; #if !MEGDNN_DISABLE_FLOAT16 { // contiguous stride args.emplace_back( TensorLayout({4, 3, 2}, {2, 8, 1}, dtype::Float16()), TensorLayout({4, 3, 2}, {6, 2, 1}, dtype::Float16())); args.emplace_back( TensorLayout({4, 3, 2}, {6, 2, 1}, dtype::Float16()), TensorLayout({4, 3, 2}, {2, 8, 1}, dtype::Float16())); args.emplace_back( TensorLayout({2, 4, 3, 5}, {60, 5, 20, 1}, dtype::Float16()), TensorLayout({2, 4, 3, 5}, {60, 15, 5, 1}, dtype::Float16())); } args.emplace_back( TensorLayout({2, 3, 4, 5}, {60, 20, 5, 1}, dtype::Float16()), TensorLayout({2, 3, 4, 5}, {120, 40, 10, 2}, dtype::Float16())); args.emplace_back( TensorLayout({2, 3, 4, 5}, {120, 40, 10, 2}, dtype::Float16()), TensorLayout({2, 3, 4, 5}, {60, 20, 5, 1}, dtype::Float16())); args.emplace_back( TensorLayout({2, 3, 4, 5}, {120, 40, 10, 2}, dtype::Float16()), TensorLayout({2, 3, 4, 5}, {180, 60, 15, 3}, dtype::Float16())); #endif args.emplace_back( TensorLayout({2, 3, 4, 5}, {60, 20, 5, 1}, dtype::Int32()), TensorLayout({2, 3, 4, 5}, {120, 40, 10, 2}, dtype::Int32())); args.emplace_back( TensorLayout({2, 3, 4, 5}, {120, 40, 10, 2}, dtype::Int32()), TensorLayout({2, 3, 4, 5}, {60, 20, 5, 1}, dtype::Int32())); args.emplace_back( TensorLayout({2, 3, 4, 5}, {120, 40, 10, 2}, dtype::Int32()), TensorLayout({2, 3, 4, 5}, {180, 60, 15, 3}, dtype::Int32())); { // 1d size_t n = 10000; args.emplace_back( TensorLayout({n}, {1}, dtype::Int32()), TensorLayout({n}, {1}, dtype::Int32())); args.emplace_back( TensorLayout({n}, {1}, dtype::Int32()), TensorLayout({n}, {2}, dtype::Int32())); args.emplace_back( TensorLayout({n}, {2}, dtype::Int32()), TensorLayout({n}, {1}, dtype::Int32())); args.emplace_back( TensorLayout({n}, {2}, dtype::Int32()), TensorLayout({n}, {2}, dtype::Int32())); } { // 2d size_t m = 200, n = 300, k = 400; ptrdiff_t k2 = k * 2; args.emplace_back( TensorLayout({m, n}, {k2, 2}, dtype::Int32()), TensorLayout({m, n}, {k2 + 1, 2}, dtype::Int32())); args.emplace_back( TensorLayout({m, n}, {2, k2}, dtype::Int32()), TensorLayout({m, n}, {2, k2 + 1}, dtype::Int32())); args.emplace_back( TensorLayout({m, n}, {2, k2}, dtype::Int32()), TensorLayout({m, n}, {k2 + 1, 2}, dtype::Int32())); args.emplace_back( TensorLayout({m, n}, {k2, 2}, dtype::Int32()), TensorLayout({m, n}, {2, k2 + 1}, dtype::Int32())); args.emplace_back( TensorLayout({m, n}, {k2, 1}, dtype::Int32()), TensorLayout({m, n}, {k2 + 1, 1}, dtype::Int32())); args.emplace_back( TensorLayout({m, n}, {1, k2}, dtype::Int32()), TensorLayout({m, n}, {1, k2 + 1}, dtype::Int32())); args.emplace_back( TensorLayout({m, n}, {1, k2}, dtype::Int32()), TensorLayout({m, n}, {k2 + 1, 1}, dtype::Int32())); args.emplace_back( TensorLayout({m, n}, {k2, 1}, dtype::Int32()), TensorLayout({m, n}, {1, k2 + 1}, dtype::Int32())); } { // 3d size_t m = 20, n = 30, k = 40; ptrdiff_t k2 = k; args.emplace_back( TensorLayout({m, n, k}, {k2 * k2 * 4, k2 * 3, 2}, dtype::Int32()), TensorLayout( {m, n, k}, {2 * k2 * k2 * k2 * 4, k2 * 3, 2}, dtype::Int32())); } { // simplify_layout // 234..56 // 2..3456 args.emplace_back( TensorLayout( {2, 3, 4, 5, 6}, {2 * 3 * 4 * 5 * 6, 2 * 4 * 5 * 6, 2 * 5 * 6, 6, 1}, dtype::Int32()), TensorLayout( {2, 3, 4, 5, 6}, {4 * 3 * 4 * 5 * 6, 4 * 5 * 6, 5 * 6, 6, 1}, dtype::Int32())); } Checker checker(handle_rocm()); for (auto&& arg : args) { checker.exec(TensorLayoutArray{arg.src, arg.dst}); } } // vim: syntax=cpp.doxygen