algo.h 34.5 KB
Newer Older
1 2 3 4
/**
 * \file dnn/src/cuda/conv_bias/algo.h
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
5
 * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
6 7 8
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
9 10
 * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied.
11 12 13 14 15 16
 */

#pragma once

#include "megdnn/oprs.h"

17 18
#include "src/common/algo_base.h"
#include "src/common/metahelper.h"
19
#include "src/common/utils.h"
20
#include "src/cuda/conv_bias/conv_bias_int8.cuh"
21 22 23
#include "src/cuda/conv_bias/helper.h"
#include "src/cuda/conv_bias/opr_impl.h"
#include "src/cuda/convolution_helper/parameter.cuh"
24
#include "src/cuda/cudnn_wrapper.h"
25
#include "src/cuda/handle.h"
26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44

#include <cuda.h>
#include <memory>
#include <unordered_map>

namespace megdnn {
namespace cuda {

/*!
 * \brief base class for conv bias algos
 *
 * All the algo impls should try to support non-contiguous batch dim, for group
 * conv execution.
 */
class ConvBiasForwardImpl::AlgoBase : public Algorithm {
protected:
    ~AlgoBase() = default;

public:
45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63
    enum class AlgoType : uint32_t {
        CUDA_CUDNN_CONVBIAS,
        CUDA_CHANWISE,
        CUDA_CHANWISE_SMALL,
        CUDA_CHANWISE_INT8X8X32,
        CUDA_CUDNN_CONV,
        CUDA_INPLACE_MATMUL,
        CUDA_MATMUL,
        CUDA_MATMUL_INT8X8X32,
        CUDA_BATCHED_MATMUL,
        CUDA_GROUP_CONV_GENERAL,
        CUDA_WMMA_UINT4X4X32,
        CUDA_IMPLICIT_GEMM_CHWN4_DOTPROD_INT8,
        CUDA_IMPLICIT_GEMM_NCHW4_DOTPROD_INT8,
        CUDA_IMPLICIT_GEMM_CHWN4_IMMA_INT8,
        CUDA_IMPLICIT_GEMM_NCHW4_IMMA_INT8,
        CUDA_IMPLICIT_GEMM_REORDER_FILTER_CHWN4_IMMA_INT8,
        CUDA_IMPLICIT_GEMM_UNROLL_WIDTH_CHWN4_IMMA_INT8,
        CUDA_IMPLICIT_GEMM_IMMA_NCHW32_INT8,
64
        CUDA_IMPLICIT_GEMM_IMMA_NCHW64_INT4_INT4,
65
        CUDA_IMPLICIT_GEMM_IMMA_NCHW64_UINT4_INT4,
66 67 68 69 70
        CUDA_BFLOAT16,
        CUDA_IMPLICIT_GEMM_SASS_NCHW4_DOTPROD_INT8,
        CUDA_IMPLICIT_GEMM_1X1_SASS_NCHW4_DOTPROD_INT8,
        CUDA_IMPLICIT_GEMM_SASS_NCHW32_IMMA_INT8,
        CUDA_IMPLICIT_GEMM_1X1_SASS_NCHW32_IMMA_INT8,
71 72
        CUDA_IMPLICIT_GEMM_SASS_NCHW64_IMMA_INT4_INT4,
        CUDA_IMPLICIT_GEMM_SASS_NCHW64_IMMA_UINT4_INT4,
73 74 75
    };
    using Mapper = std::unordered_map<AlgorithmDesc, AlgoBase*>;

76
    AlgoBase() : Algorithm() { m_handle_type = Handle::HandleType::CUDA; }
77 78
    struct SizeArgs : public conv_bias::BiasForwardSizeArgs {
        ConvBiasForwardImpl* opr;
M
Megvii Engine Team 已提交
79
        const PreprocessedFilter* preprocessed_filter;
80

81 82 83
        std::string to_string() const;
        SizeArgs(ConvBiasForwardImpl* opr, const TensorLayout& src,
                 const TensorLayout& filter, const TensorLayout& bias,
M
Megvii Engine Team 已提交
84 85
                 const TensorLayout& z, const TensorLayout& dst,
                 const PreprocessedFilter* preprocessed_filter = nullptr);
86 87 88 89
        SizeArgs(ConvBiasForwardImpl* opr, const TensorLayout& src,
                 const TensorLayout& filter,
                 const CanonizedFilterMeta& filter_meta,
                 const TensorLayout& bias, const TensorLayout& z,
M
Megvii Engine Team 已提交
90 91
                 const TensorLayout& dst,
                 const PreprocessedFilter* preprocessed_filter = nullptr);
92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109

        void init_conv_bias_desc(conv_bias::CUDNNForwardDescs& desc) const {
            desc.set_conv_bias(*src_layout, filter_meta, *dst_layout,
                               *bias_layout, *z_layout, opr->param());
        }

        void init_conv_desc(conv_bias::CUDNNForwardDescs& desc) const {
            desc.set_conv(*src_layout, filter_meta, *dst_layout, opr->param());
        }
    };
    struct ExecArgs : public SizeArgs {
        const TensorND *src_tensor, *filter_tensor, *bias_tensor, *z_tensor,
                *dst_tensor;
        Workspace workspace;

        ExecArgs(ConvBiasForwardImpl* opr, _megdnn_tensor_in src,
                 _megdnn_tensor_in filter, _megdnn_tensor_in bias,
                 _megdnn_tensor_in z, _megdnn_tensor_out dst,
M
Megvii Engine Team 已提交
110 111
                 _megdnn_workspace workspace,
                 const PreprocessedFilter* preprocessed_filter = nullptr);
112 113 114 115
    };
    virtual bool is_available(const SizeArgs& args) const = 0;
    virtual size_t get_workspace_in_bytes(const SizeArgs& args) const = 0;
    virtual void exec(const ExecArgs& args) const = 0;
M
Megvii Engine Team 已提交
116 117
    virtual size_t get_preprocess_workspace_in_bytes(
            const SizeArgs& args) const {
118
        MEGDNN_MARK_USED_VAR(args);
M
Megvii Engine Team 已提交
119 120 121 122
        return 0;
    }
    virtual SmallVector<TensorLayout> deduce_preprocessed_filter_layout(
            const SizeArgs& args) const {
123
        MEGDNN_MARK_USED_VAR(args);
M
Megvii Engine Team 已提交
124 125
        return {};
    }
126 127 128
    virtual void exec_preprocess(const ExecArgs& args) const {
        MEGDNN_MARK_USED_VAR(args);
    }
129 130 131 132 133

    bool is_available_wk(const SizeArgs& args, size_t limit) {
        return is_available(args) && get_workspace_in_bytes(args) <= limit;
    }

134 135
    bool is_available_attribute(
            const SizeArgs& args,
136 137
            const AlgoAttribute& positive_attr = AlgoAttribute::REPRODUCIBLE,
            const AlgoAttribute& negative_attr = AlgoAttribute::DEFAULT,
138
            size_t limit = std::numeric_limits<size_t>::max()) {
139 140 141
        return contain_attribute_all(positive_attr) &&
               !contain_attribute_any(negative_attr) &&
               is_available_wk(args, limit);
142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158
    }

    AlgoBase& check_workspace(const SizeArgs& args,
                              const Workspace& workspace) {
        auto req = get_workspace_in_bytes(args);
        megdnn_assert(
                req <= workspace.size,
                "conv bias fwd algo %s: required workspace %zu bytes, got %zu",
                name(), req, workspace.size);
        return *this;
    }

    virtual bool is_cudnn() const { return false; }
};

class ConvBiasForwardImpl::AlgoCUDNNConvBiasActivation final : public AlgoBase {
public:
159 160 161 162 163 164 165 166
    AlgoCUDNNConvBiasActivation(cudnnConvolutionFwdAlgo_t cudnn_enum)
            : m_cudnn_enum(cudnn_enum) {
        megdnn_assert(CudnnAlgoPack::conv_fwd_algos().find(cudnn_enum) !=
                      CudnnAlgoPack::conv_fwd_algos().end());
        m_attr = CudnnAlgoPack::conv_fwd_algos().at(cudnn_enum);
        m_name = ConvBiasForward::algo_name<DefaultParam>(
                "CUDNN:ConvBiasActivation:" + m_attr.name, {});
    }
167 168 169 170 171 172 173 174

    size_t get_workspace_in_bytes(const SizeArgs& args) const override;
    void exec(const ExecArgs& args) const override;
    param::Convolution get_param_convolution(const SizeArgs& args) const;
    bool is_available(const SizeArgs&) const override;

    const char* name() const override { return m_name.c_str(); }

175 176 177 178 179
    AlgoAttribute attribute() const override {
        auto ret = static_cast<AlgoAttribute>(0);
        if (m_attr.is_reproducible) {
            ret |= AlgoAttribute::REPRODUCIBLE;
        }
180 181 182
        if (m_attr.accuracy_depend_on_batch) {
            ret |= AlgoAttribute::ACCURACY_DEPEND_ON_BATCH;
        }
183 184
        return ret;
    }
185 186 187 188 189

    cudnnConvolutionFwdAlgo_t cudnn_enum() { return m_cudnn_enum; }

    bool is_cudnn() const override { return true; }

190 191 192 193 194 195 196 197
    MEGDNN_DECL_ALGO_TYPE(CUDA_CUDNN_CONVBIAS)

    std::string param() const override {
        std::string ret;
        serialize_write_pod(m_cudnn_enum, ret);
        return ret;
    }

198 199 200
private:
    std::string m_name;
    cudnnConvolutionFwdAlgo_t m_cudnn_enum;
201
    CudnnAlgoPack::Attr m_attr;
202 203 204 205 206 207 208 209 210 211 212 213 214 215 216
};

class ConvBiasForwardImpl::AlgoChanwise final : public AlgoBase {
public:
    bool is_available(const SizeArgs& args) const override;
    size_t get_workspace_in_bytes(const SizeArgs& args) const override;
    void exec(const ExecArgs& args) const override;

    const char* name() const override {
        if (m_name.empty()) {
            m_name =
                    ConvBiasForward::algo_name<DirectParam>("CHANNEL_WISE", {});
        }
        return m_name.c_str();
    }
217 218 219
    AlgoAttribute attribute() const override {
        return AlgoAttribute::REPRODUCIBLE;
    }
220

221 222
    MEGDNN_DECL_ALGO_TYPE(CUDA_CHANWISE)

223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239
private:
    mutable std::string m_name;
};

class ConvBiasForwardImpl::AlgoChanwiseSmall final : public AlgoBase {
public:
    bool is_available(const SizeArgs& args) const override;
    size_t get_workspace_in_bytes(const SizeArgs& args) const override;
    void exec(const ExecArgs& args) const override;

    const char* name() const override {
        if (m_name.empty()) {
            m_name = ConvBiasForward::algo_name<DirectParam>(
                    "CHANNEL_WISE_SMALL", {});
        }
        return m_name.c_str();
    }
240
    MEGDNN_DECL_ALGO_TYPE(CUDA_CHANWISE_SMALL)
241 242 243
    AlgoAttribute attribute() const override {
        return AlgoAttribute::REPRODUCIBLE;
    }
244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260

private:
    mutable std::string m_name;
};

class ConvBiasForwardImpl::AlgoChanwise8x8x32 final : public AlgoBase {
public:
    bool is_available(const SizeArgs& args) const override;
    size_t get_workspace_in_bytes(const SizeArgs& args) const override;
    void exec(const ExecArgs& args) const override;
    const char* name() const override {
        if (m_name.empty()) {
            m_name = ConvBiasForward::algo_name<DirectParam>(
                    "CHANNEL_WISE_8X8X32", {});
        }
        return m_name.c_str();
    }
261
    MEGDNN_DECL_ALGO_TYPE(CUDA_CHANWISE_INT8X8X32)
262 263 264
    AlgoAttribute attribute() const override {
        return AlgoAttribute::REPRODUCIBLE;
    }
265 266 267 268 269 270 271

private:
    mutable std::string m_name;
};

class ConvBiasForwardImpl::AlgoCUDNNConv final : public AlgoBase {
public:
272 273 274 275 276 277 278 279
    AlgoCUDNNConv(cudnnConvolutionFwdAlgo_t cudnn_enum)
            : m_cudnn_enum(cudnn_enum) {
        megdnn_assert(CudnnAlgoPack::conv_fwd_algos().find(cudnn_enum) !=
                      CudnnAlgoPack::conv_fwd_algos().end());
        m_attr = CudnnAlgoPack::conv_fwd_algos().at(cudnn_enum);
        m_name = ConvBiasForward::algo_name<DefaultParam>(
                "CUDNN:Convolution:" + m_attr.name, {});
    }
280 281 282 283 284

    bool is_available(const SizeArgs& args) const override;
    size_t get_workspace_in_bytes(const SizeArgs& args) const override;
    void exec(const ExecArgs& args) const override;

285 286 287 288 289
    AlgoAttribute attribute() const override {
        auto ret = static_cast<AlgoAttribute>(0);
        if (m_attr.is_reproducible) {
            ret |= AlgoAttribute::REPRODUCIBLE;
        }
290 291 292
        if (m_attr.accuracy_depend_on_batch) {
            ret |= AlgoAttribute::ACCURACY_DEPEND_ON_BATCH;
        }
293 294
        return ret;
    }
295 296 297 298 299 300

    const char* name() const override { return m_name.c_str(); }

    cudnnConvolutionFwdAlgo_t cudnn_enum() const { return m_cudnn_enum; }

    bool is_cudnn() const override { return true; }
301 302 303 304 305 306 307 308 309

    MEGDNN_DECL_ALGO_TYPE(CUDA_CUDNN_CONV)

    std::string param() const override {
        std::string ret;
        serialize_write_pod(m_cudnn_enum, ret);
        return ret;
    }

310 311 312
private:
    std::string m_name;
    cudnnConvolutionFwdAlgo_t m_cudnn_enum;
313
    CudnnAlgoPack::Attr m_attr;
314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331

    WorkspaceBundle get_workspace_bundle(void* ptr, const SizeArgs& args) const;
};

//! compute small matmul in the kernel
class ConvBiasForwardImpl::AlgoInplaceMatmul final : public AlgoBase {
public:
    bool is_available(const SizeArgs& args) const override;
    size_t get_workspace_in_bytes(const SizeArgs& args) const override;
    void exec(const ExecArgs& args) const override;

    const char* name() const override {
        if (m_name.empty()) {
            m_name = ConvBiasForward::algo_name<ConvBias::MatmulParam>(
                    "INPLACE_MATMUL", {});
        }
        return m_name.c_str();
    }
332
    MEGDNN_DECL_ALGO_TYPE(CUDA_INPLACE_MATMUL)
333 334 335
    AlgoAttribute attribute() const override {
        return AlgoAttribute::REPRODUCIBLE;
    }
336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353

private:
    mutable std::string m_name;
};

//! im2col and matmul, with dilation
class ConvBiasForwardImpl::AlgoMatmul final : public AlgoBase {
    template <typename T>
    static void exec_internal(const ExecArgs& args,
                              const WorkspaceBundle& bundle);

public:
    bool is_available(const SizeArgs& args) const override;
    size_t get_workspace_in_bytes(const SizeArgs& args) const override;
    void exec(const ExecArgs& args) const override;

    const char* name() const override {
        if (m_name.empty()) {
354 355
            m_name = ConvBiasForward::algo_name<ConvBias::MatmulParam>("MATMUL",
                                                                       {});
356 357 358
        }
        return m_name.c_str();
    }
359 360 361 362

    std::vector<SearchItem> get_subopr_list(
            const TensorLayoutArray& layouts,
            const OperatorBase* opr) const override;
363
    MEGDNN_DECL_ALGO_TYPE(CUDA_MATMUL)
364
    AlgoAttribute attribute() const override {
365 366
        return AlgoAttribute::REPRODUCIBLE |
               AlgoAttribute::ACCURACY_DEPEND_ON_BATCH;
367
    }
368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385

private:
    WorkspaceBundle get_workspace_bundle(void* ptr, const SizeArgs& args) const;
    mutable std::string m_name;
};

class ConvBiasForwardImpl::AlgoMatmul8x8x32 final : public AlgoBase {
public:
    bool is_available(const SizeArgs& args) const override;
    size_t get_workspace_in_bytes(const SizeArgs& args) const override;
    void exec(const ExecArgs& args) const override;
    const char* name() const override {
        if (m_name.empty()) {
            m_name = ConvBiasForward::algo_name<ConvBiasForward::MatmulParam>(
                    "MATMUL8X8X32", {});
        }
        return m_name.c_str();
    }
386
    MEGDNN_DECL_ALGO_TYPE(CUDA_MATMUL_INT8X8X32)
387 388 389
    AlgoAttribute attribute() const override {
        return AlgoAttribute::REPRODUCIBLE;
    }
390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409

private:
    bool need_src_unroll(const SizeArgs& args) const;
    bool need_filter_reshape(const SizeArgs& args) const;
    template <Param::Format>
    WorkspaceBundle get_bundle(const SizeArgs& args) const;
    template <Param::Format>
    void exec_internal(const ExecArgs& args) const;
    mutable std::string m_name;
};

class ConvBiasForwardImpl::AlgoBatchedMatmul final : public AlgoBase {
public:
    bool is_available(const SizeArgs& args) const override;
    size_t get_workspace_in_bytes(const SizeArgs& args) const override;
    void exec(const ExecArgs& args) const override;

    const char* name() const override {
        if (m_name.empty()) {
            m_name = ConvBiasForward::algo_name<ConvBiasForward::MatmulParam>(
410
                    "BATCHED_MATMUL", {});
411 412 413
        }
        return m_name.c_str();
    }
414 415 416 417 418

    std::vector<SearchItem> get_subopr_list(
            const TensorLayoutArray& layouts,
            const OperatorBase* opr) const override;

419
    AlgoAttribute attribute() const override {
420 421
        return AlgoAttribute::REPRODUCIBLE |
               AlgoAttribute::ACCURACY_DEPEND_ON_BATCH;
422 423
    }

424
    MEGDNN_DECL_ALGO_TYPE(CUDA_BATCHED_MATMUL)
425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441

private:
    WorkspaceBundle get_workspace_bundle(void* ptr, const SizeArgs& args) const;
    mutable std::string m_name;
};

//! implement group conv by another algo
class ConvBiasForwardImpl::AlgoGroupConvGeneral final : public AlgoBase {
public:
    AlgoGroupConvGeneral(AlgoBase* impl);

    bool is_available(const SizeArgs& args) const override;
    size_t get_workspace_in_bytes(const SizeArgs& args) const override;
    void exec(const ExecArgs& args) const override;

    const char* name() const override { return m_name.c_str(); }

442
    AlgoAttribute attribute() const override {
443 444 445 446 447 448 449 450
        auto ret = AlgoAttribute::DEFAULT;
#define cb(attr)                               \
    if (m_impl->contain_attribute_all(attr)) { \
        ret |= attr;                           \
    }
        MEGDNN_FOREACH_ALGO_ATTRIBUTE_INHERITABLE(cb)
#undef cb

451
        if (m_impl->contain_attribute_all(AlgoAttribute::REPRODUCIBLE)) {
452 453 454 455
            ret |= AlgoAttribute::REPRODUCIBLE;
        }
        return ret;
    }
456 457 458

    static void modify_size_args(SizeArgs& args, TensorLayout& src_pg,
                                 TensorLayout& dst_pg, TensorLayout& bias_pg);
459 460
    MEGDNN_DECL_ALGO_TYPE(CUDA_GROUP_CONV_GENERAL)

461 462 463 464 465 466 467 468 469 470 471 472 473 474
private:
    WorkspaceBundle get_workspace_bundle(void* ptr, const SizeArgs& args) const;
    AlgoBase* m_impl;
    std::string m_name;
};

#if CUDA_VERSION >= 10000
class ConvBiasForwardImpl::AlgoQUInt4x4x32WMMA final : public AlgoBase {
public:
    AlgoQUInt4x4x32WMMA() = default;
    bool is_available(const SizeArgs& args) const override;
    size_t get_workspace_in_bytes(const SizeArgs& args) const override;
    void exec(const ExecArgs& args) const override;
    const char* name() const override { return "QUINT4x4x32_WMMA"; }
475 476 477
    AlgoAttribute attribute() const override {
        return AlgoAttribute::REPRODUCIBLE;
    }
478

479
private:
480 481
    WorkspaceBundle get_workspace_bundle(dt_byte* raw_ptr,
                                         const SizeArgs& args) const;
482 483
    bool use_kernel_fhxfw(const SizeArgs& args) const;
    size_t get_workspace_in_bytes_do_conv(const SizeArgs& args) const;
484
    MEGDNN_DECL_ALGO_TYPE(CUDA_WMMA_UINT4X4X32)
485 486 487 488 489 490 491 492 493 494 495 496 497
};
#endif

class ConvBiasForwardImpl::AlgoInt8CHWN4DotProdImplicitGemm final
        : public AlgoBase {
public:
    AlgoInt8CHWN4DotProdImplicitGemm() = default;
    bool is_available(const SizeArgs& args) const override;
    size_t get_workspace_in_bytes(const SizeArgs& args) const override;
    void exec(const ExecArgs& args) const override;
    const char* name() const override {
        return "INT8_CHWN4_DOTPROD_IMPLICIT_GEMM";
    }
498 499 500
    AlgoAttribute attribute() const override {
        return AlgoAttribute::REPRODUCIBLE;
    }
501 502 503 504 505 506 507
    template <typename BiasVisitor>
    static void dispatch_nonlinear_mode(
            const int8_t* d_src, const int8_t* d_filter,
            BiasVisitor bias_visitor, const int8_t* d_z, int8_t* d_dst,
            const convolution::ConvParam& param, float alpha, float beta,
            float gamma, float scale, cudaStream_t stream,
            param::ConvBias::NonlineMode nonlinear_mode);
508
    MEGDNN_DECL_ALGO_TYPE(CUDA_IMPLICIT_GEMM_CHWN4_DOTPROD_INT8)
509 510 511 512 513
};

class ConvBiasForwardImpl::AlgoInt8NCHW4DotProdImplicitGemm final
        : public AlgoBase {
public:
514 515 516 517 518 519 520
    struct AlgoParam {
        int threadblock_m;
        int threadblock_n;
        int threadblock_k;
        int warp_m;
        int warp_n;
        int warp_k;
521
        int stage;
522 523 524 525
        std::string to_string() {
            /// default algorithm
            if (threadblock_m == 128 && threadblock_n == 128 &&
                threadblock_k == 32 && warp_m == 32 && warp_n == 64 &&
526
                warp_k == 32 && stage == 2) {
527 528
                return "";
            }
529 530 531
            return ssprintf("_%dX%dX%d_%dX%dX%d_%dstage", threadblock_m,
                            threadblock_n, threadblock_k, warp_m, warp_n,
                            warp_k, stage);
532 533 534 535 536 537
        }
    };
    AlgoInt8NCHW4DotProdImplicitGemm(AlgoParam algo_param)
            : m_algo_param{algo_param},
              m_name{ssprintf("INT8_NCHW4_DOTPROD_IMPLICIT_GEMM%s",
                              m_algo_param.to_string().c_str())} {}
538 539 540
    bool is_available(const SizeArgs& args) const override;
    size_t get_workspace_in_bytes(const SizeArgs& args) const override;
    void exec(const ExecArgs& args) const override;
541
    const char* name() const override { return m_name.c_str(); }
542 543 544
    AlgoAttribute attribute() const override {
        return AlgoAttribute::REPRODUCIBLE;
    }
545 546 547 548 549
    size_t get_preprocess_workspace_in_bytes(
            const SizeArgs& args) const override;
    SmallVector<TensorLayout> deduce_preprocessed_filter_layout(
            const SizeArgs& args) const override;
    void exec_preprocess(const ExecArgs& args) const override;
550 551 552 553 554 555 556
    MEGDNN_DECL_ALGO_TYPE(CUDA_IMPLICIT_GEMM_NCHW4_DOTPROD_INT8)

    std::string param() const override {
        std::string ret;
        serialize_write_pod(m_algo_param, ret);
        return ret;
    }
557 558 559 560

private:
    WorkspaceBundle get_workspace_bundle(dt_byte* raw_ptr,
                                         const SizeArgs& args) const;
561 562
    AlgoParam m_algo_param;
    std::string m_name;
563 564
};

565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586
class ConvBiasForwardImpl::AlgoFallbackNCHWQS8 final : public AlgoBase {
public:
    bool is_available(const SizeArgs& args) const override;
    size_t get_workspace_in_bytes(const SizeArgs& args) const override;
    void exec(const ExecArgs& args) const override;
    const char* name() const override {
        return "FALLBACK_CONV_NCHW_QS8";
    }
    AlgoAttribute attribute() const override {
        return AlgoAttribute::REPRODUCIBLE;
    }
    MEGDNN_DECL_ALGO_TYPE(CUDA_IMPLICIT_GEMM_NCHW4_DOTPROD_INT8)

private:
    void make_inner_layout(const SizeArgs& args, TensorLayout& inner_src_layout,
                           TensorLayout& inner_weight_layout,
                           TensorLayout& inner_dst_layout,
                           TensorLayout& inner_bias_layout,
                           TensorLayout& inner_z_layout) const;
    WorkspaceBundle get_workspace_bundle(void* ptr, const SizeArgs& args) const;
};

587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602
#if CUDA_VERSION >= 10000
class ConvBiasForwardImpl::AlgoInt8CHWN4IMMAImplicitGemm final
        : public AlgoBase {
public:
    enum class MMATileSize : uint32_t {
        IMMA16x16x16,
        IMMA32x8x16,
        IMMA8x32x16
    };
    AlgoInt8CHWN4IMMAImplicitGemm(MMATileSize mma_tile_size)
            : m_mma_tile_size{mma_tile_size},
              m_name{"INT8_CHWN4_IMMA_IMPLICIT_GEMM_" +
                     to_string(m_mma_tile_size)} {}
    bool is_available(const SizeArgs& args) const override;
    size_t get_workspace_in_bytes(const SizeArgs& args) const override;
    void exec(const ExecArgs& args) const override;
603
    const char* name() const override { return m_name.c_str(); }
604 605 606
    AlgoAttribute attribute() const override {
        return AlgoAttribute::REPRODUCIBLE;
    }
607 608 609 610 611 612 613 614 615 616
    template <typename BiasVisitor>
    static void dispatch_nonlinear_mode(
            const int8_t* d_src, const int8_t* d_filter,
            BiasVisitor bias_visitor, int8_t* d_z, int8_t* d_dst,
            const convolution::ConvParam& param, float alpha, float beta,
            float gamma, float scale, cudaStream_t stream,
            param::ConvBias::NonlineMode nonlinear_mode,
            MMATileSize mma_tile_size);
    static std::string to_string(MMATileSize mma_tile_size);

617 618 619 620 621 622 623 624
    MEGDNN_DECL_ALGO_TYPE(CUDA_IMPLICIT_GEMM_CHWN4_IMMA_INT8)

    std::string param() const override {
        std::string ret;
        serialize_write_pod(m_mma_tile_size, ret);
        return ret;
    }

625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641
private:
    MMATileSize m_mma_tile_size;
    std::string m_name;
};

class ConvBiasForwardImpl::AlgoInt8NCHW4IMMAImplicitGemm final
        : public AlgoBase {
public:
    using MMATileSize = AlgoInt8CHWN4IMMAImplicitGemm::MMATileSize;
    AlgoInt8NCHW4IMMAImplicitGemm(MMATileSize mma_tile_size)
            : m_mma_tile_size{mma_tile_size},
              m_name{"INT8_NCHW4_IMMA_IMPLICIT_GEMM_" +
                     AlgoInt8CHWN4IMMAImplicitGemm::to_string(
                             m_mma_tile_size)} {}
    bool is_available(const SizeArgs& args) const override;
    size_t get_workspace_in_bytes(const SizeArgs& args) const override;
    void exec(const ExecArgs& args) const override;
642 643 644 645 646 647 648 649
    const char* name() const override { return m_name.c_str(); }
    MEGDNN_DECL_ALGO_TYPE(CUDA_IMPLICIT_GEMM_NCHW4_IMMA_INT8)

    std::string param() const override {
        std::string ret;
        serialize_write_pod(m_mma_tile_size, ret);
        return ret;
    }
650 651 652
    AlgoAttribute attribute() const override {
        return AlgoAttribute::REPRODUCIBLE;
    }
653

654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673
private:
    WorkspaceBundle get_workspace_bundle(dt_byte* raw_ptr,
                                         const SizeArgs& args) const;
    MMATileSize m_mma_tile_size;
    std::string m_name;
};

class ConvBiasForwardImpl::AlgoInt8CHWN4IMMAImplicitGemmReorderFilter final
        : public AlgoBase {
public:
    using MMATileSize = AlgoInt8CHWN4IMMAImplicitGemm::MMATileSize;
    AlgoInt8CHWN4IMMAImplicitGemmReorderFilter(MMATileSize mma_tile_size)
            : m_mma_tile_size{mma_tile_size},
              m_name{"INT8_CHWN4_IMMA_IMPLICIT_GEMM_REORDER_FILTER_" +
                     AlgoInt8CHWN4IMMAImplicitGemm::to_string(
                             m_mma_tile_size)} {}
    bool is_available(const SizeArgs& args) const override;
    size_t get_workspace_in_bytes(const SizeArgs& args) const override;
    void exec(const ExecArgs& args) const override;
    const char* name() const override { return m_name.c_str(); }
674 675 676 677 678 679 680
    MEGDNN_DECL_ALGO_TYPE(CUDA_IMPLICIT_GEMM_REORDER_FILTER_CHWN4_IMMA_INT8)

    std::string param() const override {
        std::string ret;
        serialize_write_pod(m_mma_tile_size, ret);
        return ret;
    }
681 682 683
    AlgoAttribute attribute() const override {
        return AlgoAttribute::REPRODUCIBLE;
    }
684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702

private:
    MMATileSize m_mma_tile_size;
    std::string m_name;
};

class ConvBiasForwardImpl::AlgoInt8CHWN4IMMAImplicitGemmUnrollWidth final
        : public AlgoBase {
public:
    using MMATileSize = AlgoInt8CHWN4IMMAImplicitGemm::MMATileSize;
    AlgoInt8CHWN4IMMAImplicitGemmUnrollWidth(MMATileSize mma_tile_size)
            : m_mma_tile_size{mma_tile_size},
              m_name{"INT8_CHWN4_IMMA_IMPLICIT_GEMM_UNROLL_WIDTH_" +
                     AlgoInt8CHWN4IMMAImplicitGemm::to_string(
                             m_mma_tile_size)} {}
    bool is_available(const SizeArgs& args) const override;
    size_t get_workspace_in_bytes(const SizeArgs& args) const override;
    void exec(const ExecArgs& args) const override;
    const char* name() const override { return m_name.c_str(); }
703 704 705 706 707 708 709
    MEGDNN_DECL_ALGO_TYPE(CUDA_IMPLICIT_GEMM_UNROLL_WIDTH_CHWN4_IMMA_INT8)

    std::string param() const override {
        std::string ret;
        serialize_write_pod(m_mma_tile_size, ret);
        return ret;
    }
710 711 712
    AlgoAttribute attribute() const override {
        return AlgoAttribute::REPRODUCIBLE;
    }
713 714 715 716 717 718 719

private:
    MMATileSize m_mma_tile_size;
    std::string m_name;
};
#endif

720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742
#if CUDA_VERSION >= 10020
class ConvBiasForwardImpl::AlgoInt8NCHW32IMMAImplicitGemm final
        : public AlgoBase {
public:
    struct AlgoParam {
        int threadblock_m;
        int threadblock_n;
        int threadblock_k;
        int warp_m;
        int warp_n;
        int warp_k;
    };
    AlgoInt8NCHW32IMMAImplicitGemm(AlgoParam algo_param)
            : m_algo_param{algo_param} {
        m_name = ConvBias::algo_name<ConvBias::DirectParam>(
                ssprintf("INT8_NCHW32_IMMA_IMPLICIT_GEMM_%s",
                         to_string(m_algo_param).c_str()),
                ConvBias::DirectParam{});
    }
    bool is_available(const SizeArgs& args) const override;
    size_t get_workspace_in_bytes(const SizeArgs& args) const override;
    void exec(const ExecArgs& args) const override;
    const char* name() const override { return m_name.c_str(); }
743 744 745
    AlgoAttribute attribute() const override {
        return AlgoAttribute::REPRODUCIBLE;
    }
746
    static std::string to_string(AlgoParam algo_param);
747 748 749 750 751
    size_t get_preprocess_workspace_in_bytes(
            const SizeArgs& args) const override;
    SmallVector<TensorLayout> deduce_preprocessed_filter_layout(
            const SizeArgs& args) const override;
    void exec_preprocess(const ExecArgs& args) const override;
752 753 754 755 756 757 758
    MEGDNN_DECL_ALGO_TYPE(CUDA_IMPLICIT_GEMM_IMMA_NCHW32_INT8)

    std::string param() const override {
        std::string ret;
        serialize_write_pod(m_algo_param, ret);
        return ret;
    }
759

760 761 762 763 764 765 766 767
private:
    WorkspaceBundle get_workspace_bundle(dt_byte* raw_ptr,
                                         const SizeArgs& args) const;

    AlgoParam m_algo_param;
    std::string m_name;
};

768
class ConvBiasForwardImpl::AlgoInt4NCHW64IMMAImplicitGemmBase
769 770 771 772 773 774 775 776 777 778
        : public AlgoBase {
public:
    struct AlgoParam {
        int threadblock_m;
        int threadblock_n;
        int threadblock_k;
        int warp_m;
        int warp_n;
        int warp_k;
    };
779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822

    AlgoInt4NCHW64IMMAImplicitGemmBase(AlgoParam algo_param)
            : m_algo_param(algo_param) {}

    AlgoAttribute attribute() const override {
        return AlgoAttribute::REPRODUCIBLE;
    }
    const char* name() const override { return m_name.c_str(); }
    std::string param() const override;

    bool is_available(const SizeArgs& args) const override;
    void exec(const ExecArgs& args) const override;

    std::string to_string(AlgoParam algo_param);

protected:
    virtual DTypeEnum src_dtype() const = 0;

    // return filter_ptr, bias_ptr
    virtual std::tuple<void*, void*> prepare_filter_bias(
            const ExecArgs& args) const = 0;

    // return alpha, beta, gamma, delta, theta
    virtual std::tuple<float, float, float, float, float> get_constants(
            const ExecArgs& args) const = 0;

    virtual void do_exec(const ExecArgs& args, void* filter_ptr, void* bias_ptr,
                         void* z_ptr, convolution::ConvParam kern_param,
                         uint32_t nonlinear_mode, float alpha, float beta,
                         float gamma, float delta, float theta,
                         cudaStream_t stream) const = 0;

    void reorder_filter(const ExecArgs& args, void* reordered_filter) const;

    std::string m_name;
    AlgoParam m_algo_param;
};

class ConvBiasForwardImpl::AlgoInt4Int4NCHW64IMMAImplicitGemm final
        : public AlgoInt4NCHW64IMMAImplicitGemmBase {
public:
    using Base = AlgoInt4NCHW64IMMAImplicitGemmBase;
    using AlgoParam = Base::AlgoParam;

823
    AlgoInt4Int4NCHW64IMMAImplicitGemm(AlgoParam algo_param)
824
            : Base{algo_param} {
825 826 827 828 829
        m_name = ConvBias::algo_name<ConvBias::DirectParam>(
                ssprintf("INT4_INT4_NCHW64_IMMA_IMPLICIT_GEMM_%s",
                         to_string(m_algo_param).c_str()),
                ConvBias::DirectParam{});
    }
830

831 832 833 834 835 836 837
    size_t get_workspace_in_bytes(const SizeArgs& args) const override;
    size_t get_preprocess_workspace_in_bytes(
            const SizeArgs& args) const override;
    SmallVector<TensorLayout> deduce_preprocessed_filter_layout(
            const SizeArgs& args) const override;
    void exec_preprocess(const ExecArgs& args) const override;

838
    MEGDNN_DECL_ALGO_TYPE(CUDA_IMPLICIT_GEMM_IMMA_NCHW64_INT4_INT4)
839

840
private:
841
    DTypeEnum src_dtype() const override { return DTypeEnum::QuantizedS4; }
842

843 844 845 846 847 848 849 850 851 852
    std::tuple<void*, void*> prepare_filter_bias(
            const ExecArgs& args) const override;

    std::tuple<float, float, float, float, float> get_constants(
            const ExecArgs& args) const override;

    void do_exec(const ExecArgs& args, void* filter_ptr, void* bias_ptr,
                 void* z_ptr, convolution::ConvParam kern_param,
                 uint32_t nonlinear_mode, float alpha, float beta, float gamma,
                 float delta, float theta, cudaStream_t stream) const override;
853
};
854 855

class ConvBiasForwardImpl::AlgoUInt4Int4NCHW64IMMAImplicitGemm final
856
        : public AlgoInt4NCHW64IMMAImplicitGemmBase {
857
public:
858 859 860
    using Base = AlgoInt4NCHW64IMMAImplicitGemmBase;
    using AlgoParam = Base::AlgoParam;

861
    AlgoUInt4Int4NCHW64IMMAImplicitGemm(AlgoParam algo_param)
862
            : Base{algo_param} {
863 864 865 866 867
        m_name = ConvBias::algo_name<ConvBias::DirectParam>(
                ssprintf("UINT4_INT4_NCHW64_IMMA_IMPLICIT_GEMM_%s",
                         to_string(m_algo_param).c_str()),
                ConvBias::DirectParam{});
    }
868

869 870 871 872 873 874 875
    size_t get_workspace_in_bytes(const SizeArgs& args) const override;
    size_t get_preprocess_workspace_in_bytes(
            const SizeArgs& args) const override;
    SmallVector<TensorLayout> deduce_preprocessed_filter_layout(
            const SizeArgs& args) const override;
    void exec_preprocess(const ExecArgs& args) const override;

876
    MEGDNN_DECL_ALGO_TYPE(CUDA_IMPLICIT_GEMM_IMMA_NCHW64_UINT4_INT4)
877 878

private:
879 880 881 882 883 884 885 886 887 888 889 890 891 892 893
    DTypeEnum src_dtype() const override { return DTypeEnum::Quantized4Asymm; }

    std::tuple<void*, void*> prepare_filter_bias(
            const ExecArgs& args) const override;

    std::tuple<float, float, float, float, float> get_constants(
            const ExecArgs& args) const override;

    void do_exec(const ExecArgs& args, void* filter_ptr, void* bias_ptr,
                 void* z_ptr, convolution::ConvParam kern_param,
                 uint32_t nonlinear_mode, float alpha, float beta, float gamma,
                 float delta, float theta, cudaStream_t stream) const override;

    void update_bias(const ExecArgs& args, void* updated_bias,
                     void* reduce_filter_ptr, void* reduce_workspace) const;
894
};
895 896
#endif

897 898 899 900 901 902
class ConvBiasForwardImpl::AlgoBFloat16 final : public AlgoBase {
public:
    bool is_available(const SizeArgs& args) const override;
    size_t get_workspace_in_bytes(const SizeArgs& args) const override;
    void exec(const ExecArgs& args) const override;

903 904 905
    std::vector<SearchItem> get_subopr_list(
            const TensorLayoutArray& layouts,
            const OperatorBase* opr) const override;
906

907
    const char* name() const override { return "CONVBIAS_BFLOAT16"; }
908 909 910 911

    AlgoAttribute attribute() const override {
        return AlgoAttribute::REPRODUCIBLE;
    }
912

913
    MEGDNN_DECL_ALGO_TYPE(CUDA_BFLOAT16)
914 915 916 917
private:
    WorkspaceBundle get_workspace_bundle(void* ptr, const SizeArgs& args) const;
};

918

919 920 921
class ConvBiasForwardImpl::AlgoPack : NonCopyableObj {
private:
    AlgoBase::Mapper m_all_algos_map;
922 923 924 925 926 927

public:
    AlgoPack();

    std::vector<AlgoBase*> all_algos,
            //! non-cudnn algos, used for heuristic if cudnn is not supported
928
            non_cudnn_algos, bfloat16_algos;
929 930
    std::vector<AlgoCUDNNConvBiasActivation> cudnn_conv_bias_activations;
    std::vector<AlgoCUDNNConv> cudnn_convs;
931
    AlgoFallbackNCHWQS8 fallback_nchw_qs8;
932 933 934 935 936 937 938
    AlgoChanwise chanwise;
    AlgoChanwiseSmall chanwise_small;
    AlgoChanwise8x8x32 chanwise8x8x32;
    AlgoInplaceMatmul inplace_matmul;
    AlgoMatmul matmul;
    AlgoMatmul8x8x32 matmul8x8x32;
    AlgoBatchedMatmul batched_matmul;
939
    std::vector<AlgoInt8NCHW4DotProdImplicitGemm> int8_nchw4_dotprod;
940 941 942 943 944 945 946 947 948
    AlgoInt8CHWN4DotProdImplicitGemm int8_chwn4_dotprod;
#if CUDA_VERSION >= 10000
    AlgoQUInt4x4x32WMMA wmma_quint4x4x32;
    std::vector<AlgoInt8CHWN4IMMAImplicitGemm> int8_chwn4_imma;
    std::vector<AlgoInt8NCHW4IMMAImplicitGemm> int8_nchw4_imma;
    std::vector<AlgoInt8CHWN4IMMAImplicitGemmReorderFilter>
            int8_chwn4_imma_reorder_filter;
    std::vector<AlgoInt8CHWN4IMMAImplicitGemmUnrollWidth>
            int8_chwn4_imma_unroll_width;
949 950 951
#endif
#if CUDA_VERSION >= 10020
    std::vector<AlgoInt8NCHW32IMMAImplicitGemm> int8_nchw32_imma;
952
    std::vector<AlgoInt4Int4NCHW64IMMAImplicitGemm> int4_int4_nchw64_imma;
953
    std::vector<AlgoUInt4Int4NCHW64IMMAImplicitGemm> uint4_int4_nchw64_imma;
954 955
#endif
    std::vector<std::unique_ptr<AlgoGroupConvGeneral>> gconv_refhold;
956
    AlgoBFloat16 bfloat16;
957 958 959 960 961 962
    std::unordered_map<AlgoBase*, AlgoGroupConvGeneral*> algo2gconv;

    AlgoBase* cudnn_conv_bias_act_from_enum(cudnnConvolutionFwdAlgo_t algo);

    AlgoBase* cudnn_conv_from_enum(cudnnConvolutionFwdAlgo_t algo);

963 964
    const AlgoBase::Mapper& all_algos_map() const { return m_all_algos_map; }

965 966 967 968 969
private:
#if CUDA_VERSION >= 10000
    void fill_imma_algos();
#endif
    void fill_cudnn_algos();
970
    void fill_dp4a_algos();
971 972 973 974 975 976
};

}  // namespace cuda
}  // namespace megdnn

// vim: syntax=cpp.doxygen