elemwise.cpp

#include "hcc_detail/hcc_defs_prologue.h"

#include "megdnn/oprs.h"
#include "test/common/elemwise.h"
#include "test/common/rng.h"
#include "test/common/tensor.h"
#include "test/rocm/fixture.h"

#include "hip_header.h"
#include "src/rocm/miopen_with_check.h"

#include "test/rocm/benchmarker.h"

using namespace megdnn;
using namespace test;

namespace {
void run_tensor_add(
        Handle* handle_rocm, const TensorND& a, const TensorND& b, const TensorND& c) {
    auto opr = handle_rocm->create_operator<ElemwiseForward>();
    opr->param().mode = ElemwiseForward::Mode::ADD;
    hipProfilerStart();
    opr->exec({a, b}, c);
    hipProfilerStop();
}

using Mode = ElemwiseForward::Mode;
template <Mode mode>
void run_elemwise_benchmark(
        Handle* handle_rocm, Handle* handle_naive, TensorShapeArray shapes,
        DType dtype) {
    auto benchmarker = ROCMBenchmarker<ElemwiseForward>(handle_rocm, handle_naive);
    benchmarker.set_display(true);
    ElemwiseForward::Param param;
    param.mode = mode;
    benchmarker.set_param(param);
    TensorShape dst_shp;
    ElemwiseForward::deduce_shape(shapes, dst_shp);
    shapes.push_back(dst_shp);
    for (size_t i = 0; i < shapes.size(); i++) {
        benchmarker.set_dtype(i, dtype);
    }
    float io = 0.f;
    for (auto&& shp : shapes) {
        io += 1.f * shp.total_nr_elems() * dtype.size();
    }
    auto time_ms = benchmarker.execs(shapes);
    printf("io = %.3f GB, bandwidth = %.3f GB/s\n", io / 1e9, io / (1e6 * time_ms));
}

}  // anonymous namespace

template <typename tag>
class ROCM_ELEMWISE : public ROCM {};
TYPED_TEST_SUITE(ROCM_ELEMWISE, elemwise::test_types);
TYPED_TEST(ROCM_ELEMWISE, run) {
    elemwise::run_test<TypeParam>(this->handle_rocm());
}

//! the memory of this test case is too large, sometimes will fail on tx1
TEST_F(ROCM, ELEMWISE_BENCHMARK_DENSE) {
    constexpr size_t A = 1024 * 1024 * 64, S0 = 64, S1 = 256, S2 = 64, S3 = 64;
    static_assert(A == S0 * S1 * S2 * S3, "bad value");
    SyncedTensor<> t0(handle_rocm(), {TensorShape{S0, S1, S2, S3}, dtype::Float32()}),
            t1(handle_rocm(), {TensorShape{S0, S1, S2, S3}, dtype::Float32()});
    UniformFloatRNG rng{-2.f, 2.f};
    rng.gen(t0.tensornd_host());
    run_tensor_add(
            handle_rocm(), t0.tensornd_dev(), t0.tensornd_dev(), t1.tensornd_dev());
    auto p0 = t0.ptr_host(), p1 = t1.ptr_host();
    for (size_t i = 0; i < A; ++i) {
        ASSERT_EQ(p0[i] + p0[i], p1[i]) << "at index " << i << "/" << A;
    }
}

#if MEGDNN_WITH_BENCHMARK
TEST_F(ROCM, ELEMWISE_BENCHMARK_BCAST_101) {
    constexpr size_t A = 511, B = 509, C0 = 23, C1 = 23, C = C0 * C1;
    SyncedTensor<> t0(handle_rocm(), {TensorShape{A, B, C0, C1}, dtype::Float32()}),
            t1(handle_rocm(), {TensorShape{1, B, 1, 1}, dtype::Float32()}),
            t2(handle_rocm(), {TensorShape{A, B, C0, C1}, dtype::Float32()});
    UniformFloatRNG rng{-2.f, 2.f};
    rng.gen(t0.tensornd_host());
    rng.gen(t1.tensornd_host());
    run_tensor_add(
            handle_rocm(), t0.tensornd_dev(), t1.tensornd_dev(), t2.tensornd_dev());
    auto p0 = t0.ptr_host(), p1 = t1.ptr_host(), p2 = t2.ptr_host();
    for (size_t i = 0; i < A; ++i) {
        for (size_t j = 0; j < B; ++j) {
            for (size_t k = 0; k < C; ++k) {
                auto off = i * B * C + j * C + k;
                ASSERT_EQ(p0[off] + p1[j], p2[off]);
            }
        }
    }
}

TEST_F(ROCM, ELEMWISE_BENCHMARK_BCAST_10) {
    constexpr size_t A = 11583, B = 11587;
    SyncedTensor<> t0(handle_rocm(), {TensorShape{A, B}, dtype::Float32()}),
            t1(handle_rocm(), {TensorShape{1, B}, dtype::Float32()}),
            t2(handle_rocm(), {TensorShape{A, B}, dtype::Float32()});
    UniformFloatRNG rng{-2.f, 2.f};
    rng.gen(t0.tensornd_host());
    rng.gen(t1.tensornd_host());
    run_tensor_add(
            handle_rocm(), t0.tensornd_dev(), t1.tensornd_dev(), t2.tensornd_dev());
    auto p0 = t0.ptr_host(), p1 = t1.ptr_host(), p2 = t2.ptr_host();
    for (size_t i = 0; i < A; ++i) {
        for (size_t j = 0; j < B; ++j) {
            auto off = i * B + j;
            ASSERT_EQ(p0[off] + p1[j], p2[off]);
        }
    }
}

TEST_F(ROCM, ELEMWISE_BENCHMARK_BCAST_01) {
    constexpr size_t A = 11583, B = 11587;
    SyncedTensor<> t0(handle_rocm(), {TensorShape{1, A, B}, dtype::Float32()}),
            t1(handle_rocm(), {TensorShape{1, A, 1}, dtype::Float32()}),
            t2(handle_rocm(), {TensorShape{1, A, B}, dtype::Float32()});
    UniformFloatRNG rng{-2.f, 2.f};
    rng.gen(t0.tensornd_host());
    rng.gen(t1.tensornd_host());
    run_tensor_add(
            handle_rocm(), t0.tensornd_dev(), t1.tensornd_dev(), t2.tensornd_dev());
    auto p0 = t0.ptr_host(), p1 = t1.ptr_host(), p2 = t2.ptr_host();
    for (size_t i = 0; i < A; ++i) {
        for (size_t j = 0; j < B; ++j) {
            auto off = i * B + j;
            ASSERT_EQ(p0[off] + p1[i], p2[off]);
        }
    }
}

TEST_F(ROCM, ELEMWISE_BENCHMARK) {
    using Mode = ElemwiseForward::Mode;
    run_elemwise_benchmark<Mode::ADD>(
            handle_rocm(), handle_naive(false), {{32, 128, 56, 56}, {32, 128, 56, 56}},
            dtype::Float32());
    run_elemwise_benchmark<Mode::ADD>(
            handle_rocm(), handle_naive(false), {{32, 128, 56, 56}, {1, 128, 1, 1}},
            dtype::Float32());
    run_elemwise_benchmark<Mode::FUSE_ADD_RELU>(
            handle_rocm(), handle_naive(false), {{32, 128, 56, 56}, {1, 128, 1, 1}},
            dtype::Float32());
    run_elemwise_benchmark<Mode::FUSE_MUL_ADD3>(
            handle_rocm(), handle_naive(false),
            {{32, 128, 56, 56}, {1, 128, 1, 1}, {32, 128, 56, 56}}, dtype::Float32());
}

TEST_F(ROCM, ELEMWISE_BENCHMARK_PEAK_BANDWIDTH) {
    using Mode = ElemwiseForward::Mode;
    run_elemwise_benchmark<Mode::FUSE_MUL_ADD4>(
            handle_rocm(), handle_naive(false),
            {{10000, 10000}, {10000, 10000}, {10000, 10000}, {10000, 10000}},
            dtype::Float32());
}
#endif

// vim: syntax=cpp.doxygen