#include "hcc_detail/hcc_defs_prologue.h" #include "megdnn/oprs.h" #include "test/common/elemwise.h" #include "test/common/rng.h" #include "test/common/tensor.h" #include "test/rocm/fixture.h" #include "hip_header.h" #include "src/rocm/miopen_with_check.h" #include "test/rocm/benchmarker.h" using namespace megdnn; using namespace test; namespace { void run_tensor_add( Handle* handle_rocm, const TensorND& a, const TensorND& b, const TensorND& c) { auto opr = handle_rocm->create_operator(); opr->param().mode = ElemwiseForward::Mode::ADD; hipProfilerStart(); opr->exec({a, b}, c); hipProfilerStop(); } using Mode = ElemwiseForward::Mode; template void run_elemwise_benchmark( Handle* handle_rocm, Handle* handle_naive, TensorShapeArray shapes, DType dtype) { auto benchmarker = ROCMBenchmarker(handle_rocm, handle_naive); benchmarker.set_display(true); ElemwiseForward::Param param; param.mode = mode; benchmarker.set_param(param); TensorShape dst_shp; ElemwiseForward::deduce_shape(shapes, dst_shp); shapes.push_back(dst_shp); for (size_t i = 0; i < shapes.size(); i++) { benchmarker.set_dtype(i, dtype); } float io = 0.f; for (auto&& shp : shapes) { io += 1.f * shp.total_nr_elems() * dtype.size(); } auto time_ms = benchmarker.execs(shapes); printf("io = %.3f GB, bandwidth = %.3f GB/s\n", io / 1e9, io / (1e6 * time_ms)); } } // anonymous namespace template class ROCM_ELEMWISE : public ROCM {}; TYPED_TEST_SUITE(ROCM_ELEMWISE, elemwise::test_types); TYPED_TEST(ROCM_ELEMWISE, run) { elemwise::run_test(this->handle_rocm()); } //! the memory of this test case is too large, sometimes will fail on tx1 TEST_F(ROCM, ELEMWISE_BENCHMARK_DENSE) { constexpr size_t A = 1024 * 1024 * 64, S0 = 64, S1 = 256, S2 = 64, S3 = 64; static_assert(A == S0 * S1 * S2 * S3, "bad value"); SyncedTensor<> t0(handle_rocm(), {TensorShape{S0, S1, S2, S3}, dtype::Float32()}), t1(handle_rocm(), {TensorShape{S0, S1, S2, S3}, dtype::Float32()}); UniformFloatRNG rng{-2.f, 2.f}; rng.gen(t0.tensornd_host()); run_tensor_add( handle_rocm(), t0.tensornd_dev(), t0.tensornd_dev(), t1.tensornd_dev()); auto p0 = t0.ptr_host(), p1 = t1.ptr_host(); for (size_t i = 0; i < A; ++i) { ASSERT_EQ(p0[i] + p0[i], p1[i]) << "at index " << i << "/" << A; } } #if MEGDNN_WITH_BENCHMARK TEST_F(ROCM, ELEMWISE_BENCHMARK_BCAST_101) { constexpr size_t A = 511, B = 509, C0 = 23, C1 = 23, C = C0 * C1; SyncedTensor<> t0(handle_rocm(), {TensorShape{A, B, C0, C1}, dtype::Float32()}), t1(handle_rocm(), {TensorShape{1, B, 1, 1}, dtype::Float32()}), t2(handle_rocm(), {TensorShape{A, B, C0, C1}, dtype::Float32()}); UniformFloatRNG rng{-2.f, 2.f}; rng.gen(t0.tensornd_host()); rng.gen(t1.tensornd_host()); run_tensor_add( handle_rocm(), t0.tensornd_dev(), t1.tensornd_dev(), t2.tensornd_dev()); auto p0 = t0.ptr_host(), p1 = t1.ptr_host(), p2 = t2.ptr_host(); for (size_t i = 0; i < A; ++i) { for (size_t j = 0; j < B; ++j) { for (size_t k = 0; k < C; ++k) { auto off = i * B * C + j * C + k; ASSERT_EQ(p0[off] + p1[j], p2[off]); } } } } TEST_F(ROCM, ELEMWISE_BENCHMARK_BCAST_10) { constexpr size_t A = 11583, B = 11587; SyncedTensor<> t0(handle_rocm(), {TensorShape{A, B}, dtype::Float32()}), t1(handle_rocm(), {TensorShape{1, B}, dtype::Float32()}), t2(handle_rocm(), {TensorShape{A, B}, dtype::Float32()}); UniformFloatRNG rng{-2.f, 2.f}; rng.gen(t0.tensornd_host()); rng.gen(t1.tensornd_host()); run_tensor_add( handle_rocm(), t0.tensornd_dev(), t1.tensornd_dev(), t2.tensornd_dev()); auto p0 = t0.ptr_host(), p1 = t1.ptr_host(), p2 = t2.ptr_host(); for (size_t i = 0; i < A; ++i) { for (size_t j = 0; j < B; ++j) { auto off = i * B + j; ASSERT_EQ(p0[off] + p1[j], p2[off]); } } } TEST_F(ROCM, ELEMWISE_BENCHMARK_BCAST_01) { constexpr size_t A = 11583, B = 11587; SyncedTensor<> t0(handle_rocm(), {TensorShape{1, A, B}, dtype::Float32()}), t1(handle_rocm(), {TensorShape{1, A, 1}, dtype::Float32()}), t2(handle_rocm(), {TensorShape{1, A, B}, dtype::Float32()}); UniformFloatRNG rng{-2.f, 2.f}; rng.gen(t0.tensornd_host()); rng.gen(t1.tensornd_host()); run_tensor_add( handle_rocm(), t0.tensornd_dev(), t1.tensornd_dev(), t2.tensornd_dev()); auto p0 = t0.ptr_host(), p1 = t1.ptr_host(), p2 = t2.ptr_host(); for (size_t i = 0; i < A; ++i) { for (size_t j = 0; j < B; ++j) { auto off = i * B + j; ASSERT_EQ(p0[off] + p1[i], p2[off]); } } } TEST_F(ROCM, ELEMWISE_BENCHMARK) { using Mode = ElemwiseForward::Mode; run_elemwise_benchmark( handle_rocm(), handle_naive(false), {{32, 128, 56, 56}, {32, 128, 56, 56}}, dtype::Float32()); run_elemwise_benchmark( handle_rocm(), handle_naive(false), {{32, 128, 56, 56}, {1, 128, 1, 1}}, dtype::Float32()); run_elemwise_benchmark( handle_rocm(), handle_naive(false), {{32, 128, 56, 56}, {1, 128, 1, 1}}, dtype::Float32()); run_elemwise_benchmark( handle_rocm(), handle_naive(false), {{32, 128, 56, 56}, {1, 128, 1, 1}, {32, 128, 56, 56}}, dtype::Float32()); } TEST_F(ROCM, ELEMWISE_BENCHMARK_PEAK_BANDWIDTH) { using Mode = ElemwiseForward::Mode; run_elemwise_benchmark( handle_rocm(), handle_naive(false), {{10000, 10000}, {10000, 10000}, {10000, 10000}, {10000, 10000}}, dtype::Float32()); } #endif // vim: syntax=cpp.doxygen