matrix_mul.cpp

/**
 * \file dnn/test/cuda/matrix_mul.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 */
#include "test/cuda/fixture.h"

#include "test/common/checker.h"
#include "test/common/matrix_mul.h"
#include "test/common/benchmarker.h"

#include "src/cuda/utils.h"
#if defined(cuda_check)
#undef cuda_check
#endif
#include "test/cuda/utils.h"

#include <cuda.h>

namespace megdnn {
namespace test {

#if CUDA_VERSION >= 10000
// Negative test: a 256x256x256 MatrixMul with Quantized4Asymm inputs,
// QuantizedS32 output and transposeB set must raise MegDNNError.
// The test returns early (after printing a skip notice) when the current
// device reports compute capability major.minor >= 7.5.
TEST_F(CUDA, MATRIX_MUL_QUANTIZED4x4x32_EXCEPTION) {
    const auto& prop = cuda::current_device_prop();
    const bool at_least_sm75 =
            prop.major > 7 || (prop.major == 7 && prop.minor >= 5);
    if (at_least_sm75) {
        printf("Skip CUDA.MATRIX_MUL_QUANTIZED4x4x32_EXCEPTION test as current "
               "device support wmma intrinsics\n");
        return;
    }

    using Param = MatrixMul::Param;
    Param param;
    param.transposeB = true;

    Checker<MatrixMul> checker(handle_cuda(), false);
    checker.set_param(param)
            .set_dtype(0, dtype::Quantized4Asymm(1.3f, (uint8_t)3))
            .set_dtype(1, dtype::Quantized4Asymm(1.3f, (uint8_t)3))
            .set_dtype(2, dtype::QuantizedS32(1.3f * 1.3f));

    // The execution itself is the assertion target: it has to throw.
    ASSERT_THROW(checker.exec({{256, 256}, {256, 256}, {256, 256}}),
                 MegDNNError);
}

// Runs MatrixMul with Quantized4Asymm inputs, QuantizedS32 output and
// transposeB set: first on a fixed 256x256x256 problem, then on every entry
// of matrix_mul::get_matmul_args() with m and n rounded up to a multiple of 8
// and k rounded up to a multiple of 32.
// The test returns early when the device reports compute capability < 7.5.
TEST_F(CUDA, MATRIX_MUL_QUANTIZED4x4x32) {
    const auto& prop = cuda::current_device_prop();
    const bool below_sm75 =
            prop.major < 7 || (prop.major == 7 && prop.minor < 5);
    if (below_sm75) {
        printf("Skip CUDA.MATRIX_MUL_QUANTIZED4x4x32 test as current device "
               "doesn't support\n");
        return;
    }

    using Param = MatrixMul::Param;
    Param param;
    param.transposeB = true;

    Checker<MatrixMul> checker(handle_cuda(), false);
    checker.set_param(param)
            .set_dtype(0, dtype::Quantized4Asymm(1.3f, (uint8_t)3))
            .set_dtype(1, dtype::Quantized4Asymm(1.3f, (uint8_t)3))
            .set_dtype(2, dtype::QuantizedS32(1.3f * 1.3f));

    checker.exec({{256, 256}, {256, 256}, {256, 256}});

    for (const auto& arg : matrix_mul::get_matmul_args()) {
        // Round each dimension up to the granularity used by this test.
        const size_t rows = DIVUP(arg.m, 8) * 8;
        const size_t cols = DIVUP(arg.n, 8) * 8;
        const size_t depth = DIVUP(arg.k, 32) * 32;
        // B is given as {n, k} because transposeB is set.
        checker.exec({{rows, depth}, {cols, depth}, {rows, cols}});
    }
}

#if MEGDNN_WITH_BENCHMARK
// Benchmarks MatrixMul with Quantized4Asymm inputs, QuantizedS32 output and
// transposeB set over a grid of (m, n, k) sizes, printing the average time
// per run and the derived throughput in TFlops for each size.
// The test returns early when the device reports compute capability < 7.5.
TEST_F(CUDA, BENCHMARK_MATRIX_MUL_QUANTIZED4x4x32) {
    if (cuda::current_device_prop().major < 7 ||
        (cuda::current_device_prop().major == 7 &&
         cuda::current_device_prop().minor < 5)) {
        printf("Skip CUDA.BENCHMARK_MATRIX_MUL_QUANTIZED4x4x32 test as current "
               "device doesn't support\n");
        return;
    }
    Benchmarker<MatrixMul> bencher(handle_cuda());
    using Param = MatrixMul::Param;
    Param param;
    param.transposeB = true;
    bencher.set_param(param);
    bencher.set_dtype(0, dtype::Quantized4Asymm(1.0f, (uint8_t)3));
    bencher.set_dtype(1, dtype::Quantized4Asymm(1.0f, (uint8_t)3));
    bencher.set_dtype(2, dtype::QuantizedS32(1.0f));

    // Number of timed runs per problem size; also the divisor used to turn
    // the value returned by exec() into a per-run average.
    constexpr int kRuns = 400;
    // The run count does not depend on the problem size, so set it once.
    bencher.set_times(kRuns);

    for (size_t m : {256, 1024, 4096, 10240, 40960}) {
        for (size_t n : {256, 1024, 4096}) {
            for (size_t k : {512, 1024, 2048}) {
                // B is given as {n, k} because transposeB is set.
                auto time_in_ms =
                        bencher.exec({{m, k}, {n, k}, {m, n}}) / kRuns;
                // 2*m*k*n operations / seconds, scaled by 1e-12 -> TFlops.
                auto tflops = 2.0 * m * k * n / (time_in_ms * 1e-3) * 1e-12;
                printf("m=%zu, k=%zu, n=%zu, time: %fms, perf: %f TFlops\n",
                       m, k, n, time_in_ms, tflops);
            }
        }
    }
}

// Benchmarks a single large problem (m = n = 4096, k = 81920) of MatrixMul
// with Quantized4Asymm inputs, QuantizedS32 output and transposeB set, then
// prints the average time per run and the derived throughput in TFlops.
// The test returns early when the device reports compute capability < 7.5.
TEST_F(CUDA, PEAK_BENCHMARK_MATRIX_MUL_QUANTIZED4x4x32) {
    const auto& prop = cuda::current_device_prop();
    const bool below_sm75 =
            prop.major < 7 || (prop.major == 7 && prop.minor < 5);
    if (below_sm75) {
        printf("Skip CUDA.PEAK_BENCHMARK_MATRIX_MUL_QUANTIZED4x4x32 test as "
               "current device doesn't support\n");
        return;
    }

    using Param = MatrixMul::Param;
    Param param;
    param.transposeB = true;

    Benchmarker<MatrixMul> bencher(handle_cuda());
    bencher.set_param(param);
    bencher.set_dtype(0, dtype::Quantized4Asymm(1.0f, (uint8_t)3));
    bencher.set_dtype(1, dtype::Quantized4Asymm(1.0f, (uint8_t)3));
    bencher.set_dtype(2, dtype::QuantizedS32(1.0f));
    bencher.set_times(400);

    const size_t m = 4096, n = 4096, k = 81920;
    // Divide by the run count set above to get the per-run average.
    auto avg_ms = bencher.exec({{m, k}, {n, k}, {m, n}}) / 400;
    // 2*m*k*n operations / seconds, scaled by 1e-12 -> TFlops.
    auto tflops = 2.0 * m * k * n / (avg_ms * 1e-3) * 1e-12;
    printf("m=%zu, k=%zu, n=%zu, time: %fms, perf: %f TFlops\n", m, k, n,
           avg_ms, tflops);
}
#endif
#endif

// Runs an Int8 x Int8 -> Int32 MatrixMul whose 1024x1024 input layouts use a
// row stride of 2048 elements (i.e. larger than the row length).
// The test returns early when is_compute_capability_required(6, 1) is false.
// NOTE: "SPETIAL" in the test name is kept as-is; renaming would change the
// test id that filters and CI configurations may refer to.
TEST_F(CUDA, MATRIX_MUL_INT8x8x32_WITH_SPETIAL_STRIDES) {
    if (!cuda::is_compute_capability_required(6, 1)) {
        // Report the test that is actually being skipped.
        printf("Skip CUDA.MATRIX_MUL_INT8x8x32_WITH_SPETIAL_STRIDES test as "
               "current device doesn't support\n");
        return;
    }
    Checker<MatrixMul> checker(handle_cuda());
    using Param = MatrixMul::Param;
    Param param;
    DType stype = dtype::Int8();
    checker.set_param(param)
            .set_dtype(0, stype)
            .set_dtype(1, stype)
            .set_dtype(2, dtype::Int32())
            .set_epsilon(5e-3);
    size_t m = 1024, n = 1024, k = 1024;
    {
        // Only the inputs carry explicit strides; the output layout is left
        // empty in the execl() call below.
        TensorLayout A{{m, k}, {2048, 1}, dtype::Int8()},
                B{{k, n}, {2048, 1}, dtype::Int8()};
        checker.execl({A, B, {}});
    }
}

// Runs Int8 x Int8 -> Int32 MatrixMul with m = 1007, n = 1003, k = 129 for
// all four transposeA / transposeB combinations, using inputs drawn from
// UniformIntRNG{-128, 127} and an epsilon of 0 (exact comparison).
// The test returns early when is_compute_capability_required(6, 1) is false.
TEST_F(CUDA, MATRIX_MUL_INT8x8x32_NAIVE) {
    if (!cuda::is_compute_capability_required(6, 1)) {
        // Report the test that is actually being skipped.
        printf("Skip CUDA.MATRIX_MUL_INT8x8x32_NAIVE test as current device "
               "doesn't support\n");
        return;
    }

    using Param = MatrixMul::Param;
    UniformIntRNG rng{-128, 127};
    Checker<MatrixMul> checker(handle_cuda());
    checker.set_rng(0, &rng).set_rng(1, &rng);
    // Dtypes and tolerance are the same for every transpose combination.
    checker.set_dtype(0, dtype::Int8())
            .set_dtype(1, dtype::Int8())
            .set_dtype(2, dtype::Int32())
            .set_epsilon(0);

    size_t m = 1007, n = 1003, k = 129;
    for (unsigned mask = 0; mask < 4; ++mask) {
        Param param;
        // bit 0 selects transposeA, bit 1 selects transposeB
        param.transposeA = mask & 1;
        param.transposeB = mask & 2;
        // A transposed operand is supplied with its two dimensions swapped.
        TensorShape A =
                param.transposeA ? TensorShape{k, m} : TensorShape{m, k};
        TensorShape B =
                param.transposeB ? TensorShape{n, k} : TensorShape{k, n};
        checker.set_param(param).execs({A, B, {}});
    }
}

// Runs MatrixMul with an AlgoChecker for "NAIVE" installed as before-exec
// callback, for Float32 and Float16 and all four transposeA / transposeB
// combinations (m = 12, n = 16, k = 20). Float16 runs use
// ComputeMode::FLOAT32 and a looser epsilon (5e-2 instead of 5e-3).
TEST_F(CUDA, MATRIX_MUL_FLOAT_NAIVE) {
    using Param = MatrixMul::Param;
    Checker<MatrixMul> checker(handle_cuda());
    checker.set_before_exec_callback(AlgoChecker<MatrixMulForward>("NAIVE"));

    const size_t m = 12, n = 16, k = 20;
    const std::vector<DType> tested_dtypes{dtype::Float32(), dtype::Float16()};

    for (DType cur : tested_dtypes) {
        const bool is_fp16 = (cur == dtype::Float16());
        for (unsigned mask = 0; mask < 4; ++mask) {
            Param param;
            // bit 0 selects transposeA, bit 1 selects transposeB
            param.transposeA = mask & 1;
            param.transposeB = mask & 2;
            if (is_fp16) {
                param.compute_mode = param::MatrixMul::ComputeMode::FLOAT32;
            }

            // A transposed operand is supplied with its dimensions swapped.
            TensorShape A =
                    param.transposeA ? TensorShape{k, m} : TensorShape{m, k};
            TensorShape B =
                    param.transposeB ? TensorShape{n, k} : TensorShape{k, n};

            // Inputs and output all use the same dtype in this test.
            checker.set_param(param)
                    .set_dtype(0, cur)
                    .set_dtype(1, cur)
                    .set_dtype(2, cur)
                    .set_epsilon(is_fp16 ? 5e-2 : 5e-3)
                    .execs({A, B, {}});
        }
    }
}

// General MatrixMul test, in two parts:
//  1. For each output dtype (Float32, Float16, BFloat16 and, when
//     is_compute_capability_required(6, 1) holds, Int32 with Int8 inputs) and
//     each transposeA / transposeB combination, run m = 12, n = 16, k = 20.
//     BFloat16 runs use ComputeMode::FLOAT32 and temporarily install an
//     AlgoChecker for "MATMUL_BFLOAT16" with sub-algorithm "CUBLAS".
//  2. For every entry of matrix_mul::get_matmul_args(), run with Float32
//     layouts, honouring the per-tensor strides when they are set.
// The test returns early when the device reports a major version below 6.
TEST_F(CUDA, MATRIX_MUL) {
    if (cuda::current_device_prop().major < 6) {
        printf("Skip CUDA.MATRIX_MUL test as current device doesn't support\n");
        return;
    }

    using Param = MatrixMul::Param;
    Checker<MatrixMul> checker(handle_cuda());
    const size_t m = 12, n = 16, k = 20;

    std::vector<DType> out_dtypes{dtype::Float32(), dtype::Float16(),
                                  dtype::BFloat16()};
    if (cuda::is_compute_capability_required(6, 1)) {
        out_dtypes.push_back(dtype::Int32());
    }

    for (DType out_dtype : out_dtypes) {
        const bool is_bf16 = (out_dtype == dtype::BFloat16());
        const bool loose_eps = (out_dtype == dtype::Float16()) || is_bf16;

        for (unsigned mask = 0; mask < 4; ++mask) {
            Param param;
            // bit 0 selects transposeA, bit 1 selects transposeB
            param.transposeA = mask & 1;
            param.transposeB = mask & 2;

            // Int32 output is produced from Int8 inputs; otherwise inputs
            // share the output dtype.
            DType in_dtype =
                    out_dtype == dtype::Int32() ? dtype::Int8() : out_dtype;

            // A transposed operand is supplied with its dimensions swapped.
            TensorShape A =
                    param.transposeA ? TensorShape{k, m} : TensorShape{m, k};
            TensorShape B =
                    param.transposeB ? TensorShape{n, k} : TensorShape{k, n};

            if (is_bf16) {
                param.compute_mode = param::MatrixMul::ComputeMode::FLOAT32;
                checker.set_before_exec_callback(
                        AlgoChecker<MatrixMulForward>(ExecutionPolicyAlgoName{
                                "MATMUL_BFLOAT16", {{"CUBLAS", {}}}}));
            }

            checker.set_param(param)
                    .set_dtype(0, in_dtype)
                    .set_dtype(1, in_dtype)
                    .set_dtype(2, out_dtype)
                    .set_epsilon(loose_eps ? 5e-2 : 5e-3)
                    .execs({A, B, {}});

            if (is_bf16) {
                // Undo the BFloat16-specific callback and execution policy
                // so that later runs are not affected by them.
                checker.reset_before_exec_callback();
                checker.opr()->execution_policy() = {};
            }
        }
    }

    // general tests
    const auto unset_stride = matrix_mul::TestArg::UNSET_STRIDE_VAL;
    for (const auto& arg : matrix_mul::get_matmul_args()) {
        Param param;
        param.transposeA = arg.mask & 1;
        param.transposeB = arg.mask & 2;

        const TensorShape AS = param.transposeA ? TensorShape{arg.k, arg.m}
                                                : TensorShape{arg.m, arg.k};
        const TensorShape BS = param.transposeB ? TensorShape{arg.n, arg.k}
                                                : TensorShape{arg.k, arg.n};
        const TensorShape CS{arg.m, arg.n};

        // A stride equal to UNSET_STRIDE_VAL selects the layout built from
        // the shape alone; any other value is used as the row stride.
        TensorLayout AL =
                arg.A_stride == unset_stride
                        ? TensorLayout(AS, dtype::Float32())
                        : TensorLayout(AS, {ptrdiff_t(arg.A_stride), 1},
                                       dtype::Float32());
        TensorLayout BL =
                arg.B_stride == unset_stride
                        ? TensorLayout(BS, dtype::Float32())
                        : TensorLayout(BS, {ptrdiff_t(arg.B_stride), 1},
                                       dtype::Float32());
        TensorLayout CL =
                arg.C_stride == unset_stride
                        ? TensorLayout(CS, dtype::Float32())
                        : TensorLayout(CS, {ptrdiff_t(arg.C_stride), 1},
                                       dtype::Float32());

        checker.set_param(param).execl({AL, BL, CL});
    }
}

// MatrixMul test with an AlgoChecker for "CUBLAS_LT" installed as before-exec
// callback and NormalRNG inputs, in three parts:
//  1. Int8 x Int8 -> Int32, no transposes, m = n = k = 32.
//  2. Float32 and Float16 for all four transposeA / transposeB combinations.
//  3. Every entry of matrix_mul::get_matmul_args() with Float32 layouts,
//     honouring the per-tensor strides when they are set.
// Starts with require_compute_capability(7, 5).
TEST_F(CUDA, MATRIX_MUL_CUBLASLT)
{
    require_compute_capability(7, 5);
    NormalRNG normal_rng;
    Checker<MatrixMul> checker(handle_cuda());
    checker.set_rng(0, &normal_rng)
            .set_rng(1, &normal_rng)
            .set_before_exec_callback(
                    AlgoChecker<MatrixMulForward>("CUBLAS_LT"));
    using Param = MatrixMul::Param;
    size_t m = 32, n = 32, k = 32;
    // test Int8 matmul
    {
        Param param;
        param.transposeA = false;
        param.transposeB = false;
        TensorShape A{m, k}, B{k, n};
        // Dtypes and epsilon are fixed here: Int8 inputs, Int32 output.
        checker.set_param(param)
                .set_dtype(0, dtype::Int8())
                .set_dtype(1, dtype::Int8())
                .set_dtype(2, dtype::Int32())
                .set_epsilon(5e-3)
                .execs({A, B, {}});
    }
    // test float-point matmul
    for (DType dtype : std::array<DType, 2>{
                 {dtype::Float32(), dtype::Float16()}}) {
        for (unsigned mask = 0; mask < 4; ++mask) {
            Param param;
            // bit 0 selects transposeA, bit 1 selects transposeB
            param.transposeA = mask & 1;
            param.transposeB = mask & 2;
            // A transposed operand is supplied with its dimensions swapped.
            TensorShape A =
                    param.transposeA ? TensorShape{k, m} : TensorShape{m, k};
            TensorShape B =
                    param.transposeB ? TensorShape{n, k} : TensorShape{k, n};
            // Inputs share the output dtype for the floating-point cases.
            checker.set_param(param)
                    .set_dtype(0, dtype)
                    .set_dtype(1, dtype)
                    .set_dtype(2, dtype)
                    .set_epsilon(dtype == dtype::Float16() ? 5e-2 : 8e-3)
                    .execs({A, B, {}});
        }
    }
    // general tests
    auto args = matrix_mul::get_matmul_args();
    for (auto arg : args) {
        // Distinct names avoid shadowing the m/n/k declared above.
        auto am = arg.m, an = arg.n, ak = arg.k;
        auto mask = arg.mask;
        Param param;
        param.transposeA = mask & 1;
        param.transposeB = mask & 2;
        TensorShape AS, BS, CS;
        if (param.transposeA)
            AS = TensorShape{ak, am};
        else
            AS = TensorShape{am, ak};
        if (param.transposeB)
            BS = TensorShape{an, ak};
        else
            BS = TensorShape{ak, an};
        CS = TensorShape{am, an};
        // A stride equal to UNSET_STRIDE_VAL selects the layout built from
        // the shape alone; any other value is used as the row stride.
        TensorLayout AL, BL, CL;
        if (arg.A_stride == matrix_mul::TestArg::UNSET_STRIDE_VAL) {
            AL = TensorLayout(AS, dtype::Float32());
        } else {
            AL = TensorLayout(AS, {ptrdiff_t(arg.A_stride), 1},
                              dtype::Float32());
        }
        if (arg.B_stride == matrix_mul::TestArg::UNSET_STRIDE_VAL) {
            BL = TensorLayout(BS, dtype::Float32());
        } else {
            BL = TensorLayout(BS, {ptrdiff_t(arg.B_stride), 1},
                              dtype::Float32());
        }
        if (arg.C_stride == matrix_mul::TestArg::UNSET_STRIDE_VAL) {
            CL = TensorLayout(CS, dtype::Float32());
        } else {
            CL = TensorLayout(CS, {ptrdiff_t(arg.C_stride), 1},
                              dtype::Float32());
        }
        checker.set_param(param).execl({AL, BL, CL});
    }
}
// Runs a single Float32 MatrixMul with both operands transposed
// (m = 12, n = 16, k = 20) with an AlgoChecker for "CUBLAS_LT" installed as
// before-exec callback, using an epsilon of 5e-2.
// Starts with require_compute_capability(7, 5).
TEST_F(CUDA, MATRIX_MUL_CUBLASLT_SPECIAL_CASE) {
    require_compute_capability(7, 5);
    size_t m = 12, n = 16, k = 20;
    Checker<MatrixMul> checker(handle_cuda());
    checker.set_before_exec_callback(
            AlgoChecker<MatrixMulForward>("CUBLAS_LT"));

    using Param = MatrixMul::Param;

    Param param;
    param.transposeA = true;
    param.transposeB = true;
    // Both operands are transposed, so they are supplied as {k, m} and
    // {n, k} respectively.
    TensorShape A{k, m}, B{n, k};
    checker.set_param(param)
            .set_dtype(0, dtype::Float32())
            .set_dtype(1, dtype::Float32())
            .set_dtype(2, dtype::Float32())
            .set_epsilon(5e-2)
            .execs({A, B, {}});
}
// Sweeps Int8 x Int8 -> Int32 MatrixMul (no transposes) over every
// combination of m, n, k in {8, 12, ..., 64}, with NormalRNG inputs and an
// AlgoChecker for "CUBLAS_LT" installed as before-exec callback.
// Starts with require_compute_capability(7, 5).
TEST_F(CUDA, MATRIX_MUL_CUBLASLT_INT8) {
    require_compute_capability(7, 5);
    NormalRNG normal_rng;
    Checker<MatrixMul> checker(handle_cuda());
    checker.set_rng(0, &normal_rng)
            .set_rng(1, &normal_rng)
            .set_before_exec_callback(
                    AlgoChecker<MatrixMulForward>("CUBLAS_LT"));
    using Param = MatrixMul::Param;

    // Param, dtypes and epsilon are identical for every size, so they are
    // configured once before the sweep.
    Param param;
    param.transposeA = false;
    param.transposeB = false;
    checker.set_param(param)
            .set_dtype(0, dtype::Int8())
            .set_dtype(1, dtype::Int8())
            .set_dtype(2, dtype::Int32())
            .set_epsilon(5e-3);

    // test Int8 matmul
    for (size_t m = 8; m <= 64; m += 4) {
        for (size_t n = 8; n <= 64; n += 4) {
            for (size_t k = 8; k <= 64; k += 4) {
                TensorShape A{m, k}, B{k, n};
                checker.execs({A, B, {}});
            }
        }
    }
}

} // namespace test
} // namespace megdnn
// vim: syntax=cpp.doxygen