elemwise.cpp 5.9 KB
Newer Older
1 2 3
#include "hcc_detail/hcc_defs_prologue.h"

#include "megdnn/oprs.h"
M
Megvii Engine Team 已提交
4
#include "test/common/elemwise.h"
5
#include "test/common/rng.h"
M
Megvii Engine Team 已提交
6 7
#include "test/common/tensor.h"
#include "test/rocm/fixture.h"
8 9 10 11 12 13 14 15 16 17

#include "hip_header.h"
#include "src/rocm/miopen_with_check.h"

#include "test/rocm/benchmarker.h"

using namespace megdnn;
using namespace test;

namespace {
M
Megvii Engine Team 已提交
18 19 20 21 22 23 24 25
/*!
 * \brief Execute an element-wise ADD with inputs \p a and \p b, writing to \p c.
 *
 * An ElemwiseForward operator is created on \p handle_rocm, switched to ADD
 * mode and executed once. The exec call is bracketed by hipProfilerStart() and
 * hipProfilerStop().
 */
void run_tensor_add(
        Handle* handle_rocm, const TensorND& a, const TensorND& b, const TensorND& c) {
    using Opr = ElemwiseForward;

    auto adder = handle_rocm->create_operator<Opr>();
    adder->param().mode = Opr::Mode::ADD;

    hipProfilerStart();
    adder->exec({a, b}, c);
    hipProfilerStop();
}
26

M
Megvii Engine Team 已提交
27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45
using Mode = ElemwiseForward::Mode;
/*!
 * \brief Benchmark ElemwiseForward in the given \p mode and print the amount
 *      of data touched together with the resulting bandwidth.
 *
 * \tparam mode elemwise mode to benchmark
 * \param handle_rocm handle forwarded to ROCMBenchmarker
 * \param handle_naive handle forwarded to ROCMBenchmarker
 * \param shapes input shapes; the deduced output shape is appended to this
 *      by-value copy before execution
 * \param dtype data type applied to every input and to the output
 */
template <Mode mode>
void run_elemwise_benchmark(
        Handle* handle_rocm, Handle* handle_naive, TensorShapeArray shapes,
        DType dtype) {
    ROCMBenchmarker<ElemwiseForward> bench(handle_rocm, handle_naive);
    bench.set_display(true);

    ElemwiseForward::Param opr_param;
    opr_param.mode = mode;
    bench.set_param(opr_param);

    // the output shape is deduced from the inputs and becomes the last entry
    TensorShape out_shape;
    ElemwiseForward::deduce_shape(shapes, out_shape);
    shapes.push_back(out_shape);

    // one pass over all tensors: assign the dtype and accumulate the byte count
    float io_bytes = 0.f;
    size_t arg_idx = 0;
    for (const TensorShape& shape : shapes) {
        bench.set_dtype(arg_idx++, dtype);
        io_bytes += 1.f * shape.total_nr_elems() * dtype.size();
    }

    // NOTE(review): the bandwidth formula assumes execs() returns milliseconds
    // -- confirm against ROCMBenchmarker
    const auto elapsed_ms = bench.execs(shapes);
    printf("io = %.3f GB, bandwidth = %.3f GB/s\n", io_bytes / 1e9,
           io_bytes / (1e6 * elapsed_ms));
}
50 51 52 53 54

}  // anonymous namespace

template <typename tag>
class ROCM_ELEMWISE : public ROCM {};
55
TYPED_TEST_SUITE(ROCM_ELEMWISE, elemwise::test_types);
56 57 58 59 60 61
TYPED_TEST(ROCM_ELEMWISE, run) {
    elemwise::run_test<TypeParam>(this->handle_rocm());
}

//! Adds a 64x256x64x64 float32 tensor to itself through run_tensor_add and
//! compares every output element with the host-side sum.
//! This test case needs a large amount of memory and may occasionally fail on
//! tx1.
TEST_F(ROCM, ELEMWISE_BENCHMARK_DENSE) {
    constexpr size_t S0 = 64, S1 = 256, S2 = 64, S3 = 64;
    constexpr size_t A = 1024 * 1024 * 64;
    static_assert(A == S0 * S1 * S2 * S3, "bad value");

    SyncedTensor<> src(
            handle_rocm(), {TensorShape{S0, S1, S2, S3}, dtype::Float32()});
    SyncedTensor<> dst(
            handle_rocm(), {TensorShape{S0, S1, S2, S3}, dtype::Float32()});

    // only the source is filled with random data; dst is produced by the add
    UniformFloatRNG rng{-2.f, 2.f};
    rng.gen(src.tensornd_host());

    // dst = src + src
    run_tensor_add(
            handle_rocm(), src.tensornd_dev(), src.tensornd_dev(),
            dst.tensornd_dev());

    auto p0 = src.ptr_host();
    auto p1 = dst.ptr_host();
    for (size_t i = 0; i != A; ++i) {
        ASSERT_EQ(p0[i] + p0[i], p1[i]) << "at index " << i << "/" << A;
    }
}

#if MEGDNN_WITH_BENCHMARK
//! Adds a {1, B, 1, 1} tensor to an {A, B, C0, C1} tensor and verifies every
//! output element on the host.
TEST_F(ROCM, ELEMWISE_BENCHMARK_BCAST_101) {
    constexpr size_t A = 511, B = 509, C0 = 23, C1 = 23, C = C0 * C1;

    SyncedTensor<> full(
            handle_rocm(), {TensorShape{A, B, C0, C1}, dtype::Float32()});
    SyncedTensor<> bcast(handle_rocm(), {TensorShape{1, B, 1, 1}, dtype::Float32()});
    SyncedTensor<> result(
            handle_rocm(), {TensorShape{A, B, C0, C1}, dtype::Float32()});

    UniformFloatRNG rng{-2.f, 2.f};
    rng.gen(full.tensornd_host());
    rng.gen(bcast.tensornd_host());

    run_tensor_add(
            handle_rocm(), full.tensornd_dev(), bcast.tensornd_dev(),
            result.tensornd_dev());

    auto p0 = full.ptr_host();
    auto p1 = bcast.ptr_host();
    auto p2 = result.ptr_host();

    // off walks the flat index in (i, j, k) order, i.e. it always equals
    // i * B * C + j * C + k; the two trailing axes are folded into k
    size_t off = 0;
    for (size_t i = 0; i < A; ++i) {
        for (size_t j = 0; j < B; ++j) {
            for (size_t k = 0; k < C; ++k, ++off) {
                ASSERT_EQ(p0[off] + p1[j], p2[off]);
            }
        }
    }
}

//! Adds a {1, B} tensor to an {A, B} tensor and verifies every output element
//! on the host.
TEST_F(ROCM, ELEMWISE_BENCHMARK_BCAST_10) {
    constexpr size_t A = 11583, B = 11587;

    SyncedTensor<> full(handle_rocm(), {TensorShape{A, B}, dtype::Float32()});
    SyncedTensor<> bcast(handle_rocm(), {TensorShape{1, B}, dtype::Float32()});
    SyncedTensor<> result(handle_rocm(), {TensorShape{A, B}, dtype::Float32()});

    UniformFloatRNG rng{-2.f, 2.f};
    rng.gen(full.tensornd_host());
    rng.gen(bcast.tensornd_host());

    run_tensor_add(
            handle_rocm(), full.tensornd_dev(), bcast.tensornd_dev(),
            result.tensornd_dev());

    auto p0 = full.ptr_host();
    auto p1 = bcast.ptr_host();
    auto p2 = result.ptr_host();

    // off walks the flat index in (i, j) order, i.e. it always equals i * B + j
    size_t off = 0;
    for (size_t i = 0; i < A; ++i) {
        for (size_t j = 0; j < B; ++j, ++off) {
            ASSERT_EQ(p0[off] + p1[j], p2[off]);
        }
    }
}

//! Adds a {1, A, 1} tensor to a {1, A, B} tensor and verifies every output
//! element on the host.
TEST_F(ROCM, ELEMWISE_BENCHMARK_BCAST_01) {
    constexpr size_t A = 11583, B = 11587;

    SyncedTensor<> full(handle_rocm(), {TensorShape{1, A, B}, dtype::Float32()});
    SyncedTensor<> bcast(handle_rocm(), {TensorShape{1, A, 1}, dtype::Float32()});
    SyncedTensor<> result(handle_rocm(), {TensorShape{1, A, B}, dtype::Float32()});

    UniformFloatRNG rng{-2.f, 2.f};
    rng.gen(full.tensornd_host());
    rng.gen(bcast.tensornd_host());

    run_tensor_add(
            handle_rocm(), full.tensornd_dev(), bcast.tensornd_dev(),
            result.tensornd_dev());

    auto p0 = full.ptr_host();
    auto p1 = bcast.ptr_host();
    auto p2 = result.ptr_host();

    // off walks the flat index in (i, j) order, i.e. it always equals i * B + j;
    // the broadcast operand is indexed by i
    size_t off = 0;
    for (size_t i = 0; i < A; ++i) {
        for (size_t j = 0; j < B; ++j, ++off) {
            ASSERT_EQ(p0[off] + p1[i], p2[off]);
        }
    }
}

//! Benchmarks ADD, FUSE_ADD_RELU and FUSE_MUL_ADD3 on float32 inputs, using a
//! {32, 128, 56, 56} operand combined with either an operand of the same shape
//! or a {1, 128, 1, 1} operand.
TEST_F(ROCM, ELEMWISE_BENCHMARK) {
    const TensorShape full{32, 128, 56, 56};
    const TensorShape bcast{1, 128, 1, 1};

    run_elemwise_benchmark<ElemwiseForward::Mode::ADD>(
            handle_rocm(), handle_naive(false), {full, full}, dtype::Float32());
    run_elemwise_benchmark<ElemwiseForward::Mode::ADD>(
            handle_rocm(), handle_naive(false), {full, bcast}, dtype::Float32());
    run_elemwise_benchmark<ElemwiseForward::Mode::FUSE_ADD_RELU>(
            handle_rocm(), handle_naive(false), {full, bcast}, dtype::Float32());
    run_elemwise_benchmark<ElemwiseForward::Mode::FUSE_MUL_ADD3>(
            handle_rocm(), handle_naive(false), {full, bcast, full},
            dtype::Float32());
}

//! Benchmarks FUSE_MUL_ADD4 on four float32 inputs of shape {10000, 10000}.
TEST_F(ROCM, ELEMWISE_BENCHMARK_PEAK_BANDWIDTH) {
    const TensorShape square{10000, 10000};
    run_elemwise_benchmark<ElemwiseForward::Mode::FUSE_MUL_ADD4>(
            handle_rocm(), handle_naive(false), {square, square, square, square},
            dtype::Float32());
}
#endif

// vim: syntax=cpp.doxygen