algo.h 31.2 KB
Newer Older
1 2 3 4
/**
 * \file dnn/src/cuda/conv_bias/algo.h
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
5
 * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
6 7 8
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
9 10
 * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied.
11 12 13 14 15 16
 */

#pragma once

#include "megdnn/oprs.h"

17 18
#include "src/common/algo_base.h"
#include "src/common/metahelper.h"
19
#include "src/common/utils.h"
20
#include "src/cuda/conv_bias/conv_bias_int8.cuh"
21 22 23
#include "src/cuda/conv_bias/helper.h"
#include "src/cuda/conv_bias/opr_impl.h"
#include "src/cuda/convolution_helper/parameter.cuh"
24
#include "src/cuda/cudnn_wrapper.h"
25
#include "src/cuda/handle.h"
26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44

#include <cuda.h>
#include <memory>
#include <unordered_map>

namespace megdnn {
namespace cuda {

/*!
 * \brief base class for conv bias algos
 *
 * All the algo impls should try to support non-contiguous batch dim, for group
 * conv execution.
 */
class ConvBiasForwardImpl::AlgoBase : public Algorithm {
protected:
    ~AlgoBase() = default;

public:
45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63
    enum class AlgoType : uint32_t {
        CUDA_CUDNN_CONVBIAS,
        CUDA_CHANWISE,
        CUDA_CHANWISE_SMALL,
        CUDA_CHANWISE_INT8X8X32,
        CUDA_CUDNN_CONV,
        CUDA_INPLACE_MATMUL,
        CUDA_MATMUL,
        CUDA_MATMUL_INT8X8X32,
        CUDA_BATCHED_MATMUL,
        CUDA_GROUP_CONV_GENERAL,
        CUDA_WMMA_UINT4X4X32,
        CUDA_IMPLICIT_GEMM_CHWN4_DOTPROD_INT8,
        CUDA_IMPLICIT_GEMM_NCHW4_DOTPROD_INT8,
        CUDA_IMPLICIT_GEMM_CHWN4_IMMA_INT8,
        CUDA_IMPLICIT_GEMM_NCHW4_IMMA_INT8,
        CUDA_IMPLICIT_GEMM_REORDER_FILTER_CHWN4_IMMA_INT8,
        CUDA_IMPLICIT_GEMM_UNROLL_WIDTH_CHWN4_IMMA_INT8,
        CUDA_IMPLICIT_GEMM_IMMA_NCHW32_INT8,
64
        CUDA_IMPLICIT_GEMM_IMMA_NCHW64_INT4_INT4,
65 66 67 68 69
        CUDA_BFLOAT16,
        CUDA_IMPLICIT_GEMM_SASS_NCHW4_DOTPROD_INT8,
        CUDA_IMPLICIT_GEMM_1X1_SASS_NCHW4_DOTPROD_INT8,
        CUDA_IMPLICIT_GEMM_SASS_NCHW32_IMMA_INT8,
        CUDA_IMPLICIT_GEMM_1X1_SASS_NCHW32_IMMA_INT8,
70 71
        CUDA_IMPLICIT_GEMM_SASS_NCHW64_IMMA_INT4_INT4,
        CUDA_IMPLICIT_GEMM_SASS_NCHW64_IMMA_UINT4_INT4,
72 73 74
    };
    using Mapper = std::unordered_map<AlgorithmDesc, AlgoBase*>;

75
    AlgoBase() : Algorithm() { m_handle_type = Handle::HandleType::CUDA; }
76 77
    struct SizeArgs : public conv_bias::BiasForwardSizeArgs {
        ConvBiasForwardImpl* opr;
M
Megvii Engine Team 已提交
78
        const PreprocessedFilter* preprocessed_filter;
79

80 81 82
        std::string to_string() const;
        SizeArgs(ConvBiasForwardImpl* opr, const TensorLayout& src,
                 const TensorLayout& filter, const TensorLayout& bias,
M
Megvii Engine Team 已提交
83 84
                 const TensorLayout& z, const TensorLayout& dst,
                 const PreprocessedFilter* preprocessed_filter = nullptr);
85 86 87 88
        SizeArgs(ConvBiasForwardImpl* opr, const TensorLayout& src,
                 const TensorLayout& filter,
                 const CanonizedFilterMeta& filter_meta,
                 const TensorLayout& bias, const TensorLayout& z,
M
Megvii Engine Team 已提交
89 90
                 const TensorLayout& dst,
                 const PreprocessedFilter* preprocessed_filter = nullptr);
91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108

        void init_conv_bias_desc(conv_bias::CUDNNForwardDescs& desc) const {
            desc.set_conv_bias(*src_layout, filter_meta, *dst_layout,
                               *bias_layout, *z_layout, opr->param());
        }

        void init_conv_desc(conv_bias::CUDNNForwardDescs& desc) const {
            desc.set_conv(*src_layout, filter_meta, *dst_layout, opr->param());
        }
    };
    struct ExecArgs : public SizeArgs {
        const TensorND *src_tensor, *filter_tensor, *bias_tensor, *z_tensor,
                *dst_tensor;
        Workspace workspace;

        ExecArgs(ConvBiasForwardImpl* opr, _megdnn_tensor_in src,
                 _megdnn_tensor_in filter, _megdnn_tensor_in bias,
                 _megdnn_tensor_in z, _megdnn_tensor_out dst,
M
Megvii Engine Team 已提交
109 110
                 _megdnn_workspace workspace,
                 const PreprocessedFilter* preprocessed_filter = nullptr);
111 112 113 114
    };
    virtual bool is_available(const SizeArgs& args) const = 0;
    virtual size_t get_workspace_in_bytes(const SizeArgs& args) const = 0;
    virtual void exec(const ExecArgs& args) const = 0;
M
Megvii Engine Team 已提交
115 116
    virtual size_t get_preprocess_workspace_in_bytes(
            const SizeArgs& args) const {
117
        MEGDNN_MARK_USED_VAR(args);
M
Megvii Engine Team 已提交
118 119 120 121
        return 0;
    }
    virtual SmallVector<TensorLayout> deduce_preprocessed_filter_layout(
            const SizeArgs& args) const {
122
        MEGDNN_MARK_USED_VAR(args);
M
Megvii Engine Team 已提交
123 124
        return {};
    }
125 126 127
    virtual void exec_preprocess(const ExecArgs& args) const {
        MEGDNN_MARK_USED_VAR(args);
    }
128 129 130 131 132

    bool is_available_wk(const SizeArgs& args, size_t limit) {
        return is_available(args) && get_workspace_in_bytes(args) <= limit;
    }

133 134
    bool is_available_attribute(
            const SizeArgs& args,
135 136
            const AlgoAttribute& positive_attr = AlgoAttribute::REPRODUCIBLE,
            const AlgoAttribute& negative_attr = AlgoAttribute::DEFAULT,
137
            size_t limit = std::numeric_limits<size_t>::max()) {
138 139 140
        return contain_attribute_all(positive_attr) &&
               !contain_attribute_any(negative_attr) &&
               is_available_wk(args, limit);
141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157
    }

    AlgoBase& check_workspace(const SizeArgs& args,
                              const Workspace& workspace) {
        auto req = get_workspace_in_bytes(args);
        megdnn_assert(
                req <= workspace.size,
                "conv bias fwd algo %s: required workspace %zu bytes, got %zu",
                name(), req, workspace.size);
        return *this;
    }

    virtual bool is_cudnn() const { return false; }
};

class ConvBiasForwardImpl::AlgoCUDNNConvBiasActivation final : public AlgoBase {
public:
158 159 160 161 162 163 164 165
    AlgoCUDNNConvBiasActivation(cudnnConvolutionFwdAlgo_t cudnn_enum)
            : m_cudnn_enum(cudnn_enum) {
        megdnn_assert(CudnnAlgoPack::conv_fwd_algos().find(cudnn_enum) !=
                      CudnnAlgoPack::conv_fwd_algos().end());
        m_attr = CudnnAlgoPack::conv_fwd_algos().at(cudnn_enum);
        m_name = ConvBiasForward::algo_name<DefaultParam>(
                "CUDNN:ConvBiasActivation:" + m_attr.name, {});
    }
166 167 168 169 170 171 172 173

    size_t get_workspace_in_bytes(const SizeArgs& args) const override;
    void exec(const ExecArgs& args) const override;
    param::Convolution get_param_convolution(const SizeArgs& args) const;
    bool is_available(const SizeArgs&) const override;

    const char* name() const override { return m_name.c_str(); }

174 175 176 177 178
    AlgoAttribute attribute() const override {
        auto ret = static_cast<AlgoAttribute>(0);
        if (m_attr.is_reproducible) {
            ret |= AlgoAttribute::REPRODUCIBLE;
        }
179 180 181
        if (m_attr.accuracy_depend_on_batch) {
            ret |= AlgoAttribute::ACCURACY_DEPEND_ON_BATCH;
        }
182 183
        return ret;
    }
184 185 186 187 188

    cudnnConvolutionFwdAlgo_t cudnn_enum() { return m_cudnn_enum; }

    bool is_cudnn() const override { return true; }

189 190 191 192 193 194 195 196
    MEGDNN_DECL_ALGO_TYPE(CUDA_CUDNN_CONVBIAS)

    std::string param() const override {
        std::string ret;
        serialize_write_pod(m_cudnn_enum, ret);
        return ret;
    }

197 198 199
private:
    std::string m_name;
    cudnnConvolutionFwdAlgo_t m_cudnn_enum;
200
    CudnnAlgoPack::Attr m_attr;
201 202 203 204 205 206 207 208 209 210 211 212 213 214 215
};

class ConvBiasForwardImpl::AlgoChanwise final : public AlgoBase {
public:
    bool is_available(const SizeArgs& args) const override;
    size_t get_workspace_in_bytes(const SizeArgs& args) const override;
    void exec(const ExecArgs& args) const override;

    const char* name() const override {
        if (m_name.empty()) {
            m_name =
                    ConvBiasForward::algo_name<DirectParam>("CHANNEL_WISE", {});
        }
        return m_name.c_str();
    }
216 217 218
    AlgoAttribute attribute() const override {
        return AlgoAttribute::REPRODUCIBLE;
    }
219

220 221
    MEGDNN_DECL_ALGO_TYPE(CUDA_CHANWISE)

222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238
private:
    mutable std::string m_name;
};

class ConvBiasForwardImpl::AlgoChanwiseSmall final : public AlgoBase {
public:
    bool is_available(const SizeArgs& args) const override;
    size_t get_workspace_in_bytes(const SizeArgs& args) const override;
    void exec(const ExecArgs& args) const override;

    const char* name() const override {
        if (m_name.empty()) {
            m_name = ConvBiasForward::algo_name<DirectParam>(
                    "CHANNEL_WISE_SMALL", {});
        }
        return m_name.c_str();
    }
239
    MEGDNN_DECL_ALGO_TYPE(CUDA_CHANWISE_SMALL)
240 241 242
    AlgoAttribute attribute() const override {
        return AlgoAttribute::REPRODUCIBLE;
    }
243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259

private:
    mutable std::string m_name;
};

class ConvBiasForwardImpl::AlgoChanwise8x8x32 final : public AlgoBase {
public:
    bool is_available(const SizeArgs& args) const override;
    size_t get_workspace_in_bytes(const SizeArgs& args) const override;
    void exec(const ExecArgs& args) const override;
    const char* name() const override {
        if (m_name.empty()) {
            m_name = ConvBiasForward::algo_name<DirectParam>(
                    "CHANNEL_WISE_8X8X32", {});
        }
        return m_name.c_str();
    }
260
    MEGDNN_DECL_ALGO_TYPE(CUDA_CHANWISE_INT8X8X32)
261 262 263
    AlgoAttribute attribute() const override {
        return AlgoAttribute::REPRODUCIBLE;
    }
264 265 266 267 268 269 270

private:
    mutable std::string m_name;
};

class ConvBiasForwardImpl::AlgoCUDNNConv final : public AlgoBase {
public:
271 272 273 274 275 276 277 278
    AlgoCUDNNConv(cudnnConvolutionFwdAlgo_t cudnn_enum)
            : m_cudnn_enum(cudnn_enum) {
        megdnn_assert(CudnnAlgoPack::conv_fwd_algos().find(cudnn_enum) !=
                      CudnnAlgoPack::conv_fwd_algos().end());
        m_attr = CudnnAlgoPack::conv_fwd_algos().at(cudnn_enum);
        m_name = ConvBiasForward::algo_name<DefaultParam>(
                "CUDNN:Convolution:" + m_attr.name, {});
    }
279 280 281 282 283

    bool is_available(const SizeArgs& args) const override;
    size_t get_workspace_in_bytes(const SizeArgs& args) const override;
    void exec(const ExecArgs& args) const override;

284 285 286 287 288
    AlgoAttribute attribute() const override {
        auto ret = static_cast<AlgoAttribute>(0);
        if (m_attr.is_reproducible) {
            ret |= AlgoAttribute::REPRODUCIBLE;
        }
289 290 291
        if (m_attr.accuracy_depend_on_batch) {
            ret |= AlgoAttribute::ACCURACY_DEPEND_ON_BATCH;
        }
292 293
        return ret;
    }
294 295 296 297 298 299

    const char* name() const override { return m_name.c_str(); }

    cudnnConvolutionFwdAlgo_t cudnn_enum() const { return m_cudnn_enum; }

    bool is_cudnn() const override { return true; }
300 301 302 303 304 305 306 307 308

    MEGDNN_DECL_ALGO_TYPE(CUDA_CUDNN_CONV)

    std::string param() const override {
        std::string ret;
        serialize_write_pod(m_cudnn_enum, ret);
        return ret;
    }

309 310 311
private:
    std::string m_name;
    cudnnConvolutionFwdAlgo_t m_cudnn_enum;
312
    CudnnAlgoPack::Attr m_attr;
313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330

    WorkspaceBundle get_workspace_bundle(void* ptr, const SizeArgs& args) const;
};

//! compute small matmul in the kernel
class ConvBiasForwardImpl::AlgoInplaceMatmul final : public AlgoBase {
public:
    bool is_available(const SizeArgs& args) const override;
    size_t get_workspace_in_bytes(const SizeArgs& args) const override;
    void exec(const ExecArgs& args) const override;

    const char* name() const override {
        if (m_name.empty()) {
            m_name = ConvBiasForward::algo_name<ConvBias::MatmulParam>(
                    "INPLACE_MATMUL", {});
        }
        return m_name.c_str();
    }
331
    MEGDNN_DECL_ALGO_TYPE(CUDA_INPLACE_MATMUL)
332 333 334
    AlgoAttribute attribute() const override {
        return AlgoAttribute::REPRODUCIBLE;
    }
335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352

private:
    mutable std::string m_name;
};

//! im2col and matmul, with dilation
class ConvBiasForwardImpl::AlgoMatmul final : public AlgoBase {
    template <typename T>
    static void exec_internal(const ExecArgs& args,
                              const WorkspaceBundle& bundle);

public:
    bool is_available(const SizeArgs& args) const override;
    size_t get_workspace_in_bytes(const SizeArgs& args) const override;
    void exec(const ExecArgs& args) const override;

    const char* name() const override {
        if (m_name.empty()) {
353 354
            m_name = ConvBiasForward::algo_name<ConvBias::MatmulParam>("MATMUL",
                                                                       {});
355 356 357
        }
        return m_name.c_str();
    }
358 359 360 361

    std::vector<SearchItem> get_subopr_list(
            const TensorLayoutArray& layouts,
            const OperatorBase* opr) const override;
362
    MEGDNN_DECL_ALGO_TYPE(CUDA_MATMUL)
363
    AlgoAttribute attribute() const override {
364 365
        return AlgoAttribute::REPRODUCIBLE |
               AlgoAttribute::ACCURACY_DEPEND_ON_BATCH;
366
    }
367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384

private:
    WorkspaceBundle get_workspace_bundle(void* ptr, const SizeArgs& args) const;
    mutable std::string m_name;
};

class ConvBiasForwardImpl::AlgoMatmul8x8x32 final : public AlgoBase {
public:
    bool is_available(const SizeArgs& args) const override;
    size_t get_workspace_in_bytes(const SizeArgs& args) const override;
    void exec(const ExecArgs& args) const override;
    const char* name() const override {
        if (m_name.empty()) {
            m_name = ConvBiasForward::algo_name<ConvBiasForward::MatmulParam>(
                    "MATMUL8X8X32", {});
        }
        return m_name.c_str();
    }
385
    MEGDNN_DECL_ALGO_TYPE(CUDA_MATMUL_INT8X8X32)
386 387 388
    AlgoAttribute attribute() const override {
        return AlgoAttribute::REPRODUCIBLE;
    }
389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408

private:
    bool need_src_unroll(const SizeArgs& args) const;
    bool need_filter_reshape(const SizeArgs& args) const;
    template <Param::Format>
    WorkspaceBundle get_bundle(const SizeArgs& args) const;
    template <Param::Format>
    void exec_internal(const ExecArgs& args) const;
    mutable std::string m_name;
};

class ConvBiasForwardImpl::AlgoBatchedMatmul final : public AlgoBase {
public:
    bool is_available(const SizeArgs& args) const override;
    size_t get_workspace_in_bytes(const SizeArgs& args) const override;
    void exec(const ExecArgs& args) const override;

    const char* name() const override {
        if (m_name.empty()) {
            m_name = ConvBiasForward::algo_name<ConvBiasForward::MatmulParam>(
409
                    "BATCHED_MATMUL", {});
410 411 412
        }
        return m_name.c_str();
    }
413 414 415 416 417

    std::vector<SearchItem> get_subopr_list(
            const TensorLayoutArray& layouts,
            const OperatorBase* opr) const override;

418
    AlgoAttribute attribute() const override {
419 420
        return AlgoAttribute::REPRODUCIBLE |
               AlgoAttribute::ACCURACY_DEPEND_ON_BATCH;
421 422
    }

423
    MEGDNN_DECL_ALGO_TYPE(CUDA_BATCHED_MATMUL)
424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440

private:
    WorkspaceBundle get_workspace_bundle(void* ptr, const SizeArgs& args) const;
    mutable std::string m_name;
};

//! implement group conv by another algo
class ConvBiasForwardImpl::AlgoGroupConvGeneral final : public AlgoBase {
public:
    AlgoGroupConvGeneral(AlgoBase* impl);

    bool is_available(const SizeArgs& args) const override;
    size_t get_workspace_in_bytes(const SizeArgs& args) const override;
    void exec(const ExecArgs& args) const override;

    const char* name() const override { return m_name.c_str(); }

441
    AlgoAttribute attribute() const override {
442 443 444 445 446 447 448 449
        auto ret = AlgoAttribute::DEFAULT;
#define cb(attr)                               \
    if (m_impl->contain_attribute_all(attr)) { \
        ret |= attr;                           \
    }
        MEGDNN_FOREACH_ALGO_ATTRIBUTE_INHERITABLE(cb)
#undef cb

450
        if (m_impl->contain_attribute_all(AlgoAttribute::REPRODUCIBLE)) {
451 452 453 454
            ret |= AlgoAttribute::REPRODUCIBLE;
        }
        return ret;
    }
455 456 457

    static void modify_size_args(SizeArgs& args, TensorLayout& src_pg,
                                 TensorLayout& dst_pg, TensorLayout& bias_pg);
458 459
    MEGDNN_DECL_ALGO_TYPE(CUDA_GROUP_CONV_GENERAL)

460 461 462 463 464 465 466 467 468 469 470 471 472 473
private:
    WorkspaceBundle get_workspace_bundle(void* ptr, const SizeArgs& args) const;
    AlgoBase* m_impl;
    std::string m_name;
};

#if CUDA_VERSION >= 10000
class ConvBiasForwardImpl::AlgoQUInt4x4x32WMMA final : public AlgoBase {
public:
    AlgoQUInt4x4x32WMMA() = default;
    bool is_available(const SizeArgs& args) const override;
    size_t get_workspace_in_bytes(const SizeArgs& args) const override;
    void exec(const ExecArgs& args) const override;
    const char* name() const override { return "QUINT4x4x32_WMMA"; }
474 475 476
    AlgoAttribute attribute() const override {
        return AlgoAttribute::REPRODUCIBLE;
    }
477

478
private:
479 480
    WorkspaceBundle get_workspace_bundle(dt_byte* raw_ptr,
                                         const SizeArgs& args) const;
481 482
    bool use_kernel_fhxfw(const SizeArgs& args) const;
    size_t get_workspace_in_bytes_do_conv(const SizeArgs& args) const;
483
    MEGDNN_DECL_ALGO_TYPE(CUDA_WMMA_UINT4X4X32)
484 485 486 487 488 489 490 491 492 493 494 495 496
};
#endif

class ConvBiasForwardImpl::AlgoInt8CHWN4DotProdImplicitGemm final
        : public AlgoBase {
public:
    AlgoInt8CHWN4DotProdImplicitGemm() = default;
    bool is_available(const SizeArgs& args) const override;
    size_t get_workspace_in_bytes(const SizeArgs& args) const override;
    void exec(const ExecArgs& args) const override;
    const char* name() const override {
        return "INT8_CHWN4_DOTPROD_IMPLICIT_GEMM";
    }
497 498 499
    AlgoAttribute attribute() const override {
        return AlgoAttribute::REPRODUCIBLE;
    }
500 501 502 503 504 505 506
    template <typename BiasVisitor>
    static void dispatch_nonlinear_mode(
            const int8_t* d_src, const int8_t* d_filter,
            BiasVisitor bias_visitor, const int8_t* d_z, int8_t* d_dst,
            const convolution::ConvParam& param, float alpha, float beta,
            float gamma, float scale, cudaStream_t stream,
            param::ConvBias::NonlineMode nonlinear_mode);
507
    MEGDNN_DECL_ALGO_TYPE(CUDA_IMPLICIT_GEMM_CHWN4_DOTPROD_INT8)
508 509 510 511 512
};

class ConvBiasForwardImpl::AlgoInt8NCHW4DotProdImplicitGemm final
        : public AlgoBase {
public:
513 514 515 516 517 518 519
    struct AlgoParam {
        int threadblock_m;
        int threadblock_n;
        int threadblock_k;
        int warp_m;
        int warp_n;
        int warp_k;
520
        int stage;
521 522 523 524
        std::string to_string() {
            /// default algorithm
            if (threadblock_m == 128 && threadblock_n == 128 &&
                threadblock_k == 32 && warp_m == 32 && warp_n == 64 &&
525
                warp_k == 32 && stage == 2) {
526 527
                return "";
            }
528 529 530
            return ssprintf("_%dX%dX%d_%dX%dX%d_%dstage", threadblock_m,
                            threadblock_n, threadblock_k, warp_m, warp_n,
                            warp_k, stage);
531 532 533 534 535 536
        }
    };
    AlgoInt8NCHW4DotProdImplicitGemm(AlgoParam algo_param)
            : m_algo_param{algo_param},
              m_name{ssprintf("INT8_NCHW4_DOTPROD_IMPLICIT_GEMM%s",
                              m_algo_param.to_string().c_str())} {}
537 538 539
    bool is_available(const SizeArgs& args) const override;
    size_t get_workspace_in_bytes(const SizeArgs& args) const override;
    void exec(const ExecArgs& args) const override;
540
    const char* name() const override { return m_name.c_str(); }
541 542 543
    AlgoAttribute attribute() const override {
        return AlgoAttribute::REPRODUCIBLE;
    }
544 545 546 547 548
    size_t get_preprocess_workspace_in_bytes(
            const SizeArgs& args) const override;
    SmallVector<TensorLayout> deduce_preprocessed_filter_layout(
            const SizeArgs& args) const override;
    void exec_preprocess(const ExecArgs& args) const override;
549 550 551 552 553 554 555
    MEGDNN_DECL_ALGO_TYPE(CUDA_IMPLICIT_GEMM_NCHW4_DOTPROD_INT8)

    std::string param() const override {
        std::string ret;
        serialize_write_pod(m_algo_param, ret);
        return ret;
    }
556 557 558 559

private:
    WorkspaceBundle get_workspace_bundle(dt_byte* raw_ptr,
                                         const SizeArgs& args) const;
560 561
    AlgoParam m_algo_param;
    std::string m_name;
562 563
};

564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585
class ConvBiasForwardImpl::AlgoFallbackNCHWQS8 final : public AlgoBase {
public:
    bool is_available(const SizeArgs& args) const override;
    size_t get_workspace_in_bytes(const SizeArgs& args) const override;
    void exec(const ExecArgs& args) const override;
    const char* name() const override {
        return "FALLBACK_CONV_NCHW_QS8";
    }
    AlgoAttribute attribute() const override {
        return AlgoAttribute::REPRODUCIBLE;
    }
    MEGDNN_DECL_ALGO_TYPE(CUDA_IMPLICIT_GEMM_NCHW4_DOTPROD_INT8)

private:
    void make_inner_layout(const SizeArgs& args, TensorLayout& inner_src_layout,
                           TensorLayout& inner_weight_layout,
                           TensorLayout& inner_dst_layout,
                           TensorLayout& inner_bias_layout,
                           TensorLayout& inner_z_layout) const;
    WorkspaceBundle get_workspace_bundle(void* ptr, const SizeArgs& args) const;
};

586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601
#if CUDA_VERSION >= 10000
class ConvBiasForwardImpl::AlgoInt8CHWN4IMMAImplicitGemm final
        : public AlgoBase {
public:
    enum class MMATileSize : uint32_t {
        IMMA16x16x16,
        IMMA32x8x16,
        IMMA8x32x16
    };
    AlgoInt8CHWN4IMMAImplicitGemm(MMATileSize mma_tile_size)
            : m_mma_tile_size{mma_tile_size},
              m_name{"INT8_CHWN4_IMMA_IMPLICIT_GEMM_" +
                     to_string(m_mma_tile_size)} {}
    bool is_available(const SizeArgs& args) const override;
    size_t get_workspace_in_bytes(const SizeArgs& args) const override;
    void exec(const ExecArgs& args) const override;
602
    const char* name() const override { return m_name.c_str(); }
603 604 605
    AlgoAttribute attribute() const override {
        return AlgoAttribute::REPRODUCIBLE;
    }
606 607 608 609 610 611 612 613 614 615
    template <typename BiasVisitor>
    static void dispatch_nonlinear_mode(
            const int8_t* d_src, const int8_t* d_filter,
            BiasVisitor bias_visitor, int8_t* d_z, int8_t* d_dst,
            const convolution::ConvParam& param, float alpha, float beta,
            float gamma, float scale, cudaStream_t stream,
            param::ConvBias::NonlineMode nonlinear_mode,
            MMATileSize mma_tile_size);
    static std::string to_string(MMATileSize mma_tile_size);

616 617 618 619 620 621 622 623
    MEGDNN_DECL_ALGO_TYPE(CUDA_IMPLICIT_GEMM_CHWN4_IMMA_INT8)

    std::string param() const override {
        std::string ret;
        serialize_write_pod(m_mma_tile_size, ret);
        return ret;
    }

624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640
private:
    MMATileSize m_mma_tile_size;
    std::string m_name;
};

class ConvBiasForwardImpl::AlgoInt8NCHW4IMMAImplicitGemm final
        : public AlgoBase {
public:
    using MMATileSize = AlgoInt8CHWN4IMMAImplicitGemm::MMATileSize;
    AlgoInt8NCHW4IMMAImplicitGemm(MMATileSize mma_tile_size)
            : m_mma_tile_size{mma_tile_size},
              m_name{"INT8_NCHW4_IMMA_IMPLICIT_GEMM_" +
                     AlgoInt8CHWN4IMMAImplicitGemm::to_string(
                             m_mma_tile_size)} {}
    bool is_available(const SizeArgs& args) const override;
    size_t get_workspace_in_bytes(const SizeArgs& args) const override;
    void exec(const ExecArgs& args) const override;
641 642 643 644 645 646 647 648
    const char* name() const override { return m_name.c_str(); }
    MEGDNN_DECL_ALGO_TYPE(CUDA_IMPLICIT_GEMM_NCHW4_IMMA_INT8)

    std::string param() const override {
        std::string ret;
        serialize_write_pod(m_mma_tile_size, ret);
        return ret;
    }
649 650 651
    AlgoAttribute attribute() const override {
        return AlgoAttribute::REPRODUCIBLE;
    }
652

653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672
private:
    WorkspaceBundle get_workspace_bundle(dt_byte* raw_ptr,
                                         const SizeArgs& args) const;
    MMATileSize m_mma_tile_size;
    std::string m_name;
};

class ConvBiasForwardImpl::AlgoInt8CHWN4IMMAImplicitGemmReorderFilter final
        : public AlgoBase {
public:
    using MMATileSize = AlgoInt8CHWN4IMMAImplicitGemm::MMATileSize;
    AlgoInt8CHWN4IMMAImplicitGemmReorderFilter(MMATileSize mma_tile_size)
            : m_mma_tile_size{mma_tile_size},
              m_name{"INT8_CHWN4_IMMA_IMPLICIT_GEMM_REORDER_FILTER_" +
                     AlgoInt8CHWN4IMMAImplicitGemm::to_string(
                             m_mma_tile_size)} {}
    bool is_available(const SizeArgs& args) const override;
    size_t get_workspace_in_bytes(const SizeArgs& args) const override;
    void exec(const ExecArgs& args) const override;
    const char* name() const override { return m_name.c_str(); }
673 674 675 676 677 678 679
    MEGDNN_DECL_ALGO_TYPE(CUDA_IMPLICIT_GEMM_REORDER_FILTER_CHWN4_IMMA_INT8)

    std::string param() const override {
        std::string ret;
        serialize_write_pod(m_mma_tile_size, ret);
        return ret;
    }
680 681 682
    AlgoAttribute attribute() const override {
        return AlgoAttribute::REPRODUCIBLE;
    }
683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701

private:
    MMATileSize m_mma_tile_size;
    std::string m_name;
};

class ConvBiasForwardImpl::AlgoInt8CHWN4IMMAImplicitGemmUnrollWidth final
        : public AlgoBase {
public:
    using MMATileSize = AlgoInt8CHWN4IMMAImplicitGemm::MMATileSize;
    AlgoInt8CHWN4IMMAImplicitGemmUnrollWidth(MMATileSize mma_tile_size)
            : m_mma_tile_size{mma_tile_size},
              m_name{"INT8_CHWN4_IMMA_IMPLICIT_GEMM_UNROLL_WIDTH_" +
                     AlgoInt8CHWN4IMMAImplicitGemm::to_string(
                             m_mma_tile_size)} {}
    bool is_available(const SizeArgs& args) const override;
    size_t get_workspace_in_bytes(const SizeArgs& args) const override;
    void exec(const ExecArgs& args) const override;
    const char* name() const override { return m_name.c_str(); }
702 703 704 705 706 707 708
    MEGDNN_DECL_ALGO_TYPE(CUDA_IMPLICIT_GEMM_UNROLL_WIDTH_CHWN4_IMMA_INT8)

    std::string param() const override {
        std::string ret;
        serialize_write_pod(m_mma_tile_size, ret);
        return ret;
    }
709 710 711
    AlgoAttribute attribute() const override {
        return AlgoAttribute::REPRODUCIBLE;
    }
712 713 714 715 716 717 718

private:
    MMATileSize m_mma_tile_size;
    std::string m_name;
};
#endif

719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741
#if CUDA_VERSION >= 10020
class ConvBiasForwardImpl::AlgoInt8NCHW32IMMAImplicitGemm final
        : public AlgoBase {
public:
    struct AlgoParam {
        int threadblock_m;
        int threadblock_n;
        int threadblock_k;
        int warp_m;
        int warp_n;
        int warp_k;
    };
    AlgoInt8NCHW32IMMAImplicitGemm(AlgoParam algo_param)
            : m_algo_param{algo_param} {
        m_name = ConvBias::algo_name<ConvBias::DirectParam>(
                ssprintf("INT8_NCHW32_IMMA_IMPLICIT_GEMM_%s",
                         to_string(m_algo_param).c_str()),
                ConvBias::DirectParam{});
    }
    bool is_available(const SizeArgs& args) const override;
    size_t get_workspace_in_bytes(const SizeArgs& args) const override;
    void exec(const ExecArgs& args) const override;
    const char* name() const override { return m_name.c_str(); }
742 743 744
    AlgoAttribute attribute() const override {
        return AlgoAttribute::REPRODUCIBLE;
    }
745
    static std::string to_string(AlgoParam algo_param);
746 747 748 749 750
    size_t get_preprocess_workspace_in_bytes(
            const SizeArgs& args) const override;
    SmallVector<TensorLayout> deduce_preprocessed_filter_layout(
            const SizeArgs& args) const override;
    void exec_preprocess(const ExecArgs& args) const override;
751 752 753 754 755 756 757
    MEGDNN_DECL_ALGO_TYPE(CUDA_IMPLICIT_GEMM_IMMA_NCHW32_INT8)

    std::string param() const override {
        std::string ret;
        serialize_write_pod(m_algo_param, ret);
        return ret;
    }
758

759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805
private:
    WorkspaceBundle get_workspace_bundle(dt_byte* raw_ptr,
                                         const SizeArgs& args) const;

    AlgoParam m_algo_param;
    std::string m_name;
};

class ConvBiasForwardImpl::AlgoInt4Int4NCHW64IMMAImplicitGemm final
        : public AlgoBase {
public:
    struct AlgoParam {
        int threadblock_m;
        int threadblock_n;
        int threadblock_k;
        int warp_m;
        int warp_n;
        int warp_k;
    };
    AlgoInt4Int4NCHW64IMMAImplicitGemm(AlgoParam algo_param)
            : m_algo_param{algo_param} {
        m_name = ConvBias::algo_name<ConvBias::DirectParam>(
                ssprintf("INT4_INT4_NCHW64_IMMA_IMPLICIT_GEMM_%s",
                         to_string(m_algo_param).c_str()),
                ConvBias::DirectParam{});
    }
    bool is_available(const SizeArgs& args) const override;
    size_t get_workspace_in_bytes(const SizeArgs& args) const override;
    void exec(const ExecArgs& args) const override;
    const char* name() const override { return m_name.c_str(); }
    AlgoAttribute attribute() const override {
        return AlgoAttribute::REPRODUCIBLE;
    }
    static std::string to_string(AlgoParam algo_param);
    size_t get_preprocess_workspace_in_bytes(
            const SizeArgs& args) const override;
    SmallVector<TensorLayout> deduce_preprocessed_filter_layout(
            const SizeArgs& args) const override;
    void exec_preprocess(const ExecArgs& args) const override;
    MEGDNN_DECL_ALGO_TYPE(CUDA_IMPLICIT_GEMM_IMMA_NCHW64_INT4_INT4)

    std::string param() const override {
        std::string ret;
        serialize_write_pod(m_algo_param, ret);
        return ret;
    }

806 807 808 809 810 811 812 813 814
private:
    WorkspaceBundle get_workspace_bundle(dt_byte* raw_ptr,
                                         const SizeArgs& args) const;

    AlgoParam m_algo_param;
    std::string m_name;
};
#endif

815 816 817 818 819 820
class ConvBiasForwardImpl::AlgoBFloat16 final : public AlgoBase {
public:
    bool is_available(const SizeArgs& args) const override;
    size_t get_workspace_in_bytes(const SizeArgs& args) const override;
    void exec(const ExecArgs& args) const override;

821 822 823
    std::vector<SearchItem> get_subopr_list(
            const TensorLayoutArray& layouts,
            const OperatorBase* opr) const override;
824

825
    const char* name() const override { return "CONVBIAS_BFLOAT16"; }
826 827 828 829

    AlgoAttribute attribute() const override {
        return AlgoAttribute::REPRODUCIBLE;
    }
830

831
    MEGDNN_DECL_ALGO_TYPE(CUDA_BFLOAT16)
832 833 834 835
private:
    WorkspaceBundle get_workspace_bundle(void* ptr, const SizeArgs& args) const;
};

836

837 838 839
class ConvBiasForwardImpl::AlgoPack : NonCopyableObj {
private:
    AlgoBase::Mapper m_all_algos_map;
840 841 842 843 844 845

public:
    AlgoPack();

    std::vector<AlgoBase*> all_algos,
            //! non-cudnn algos, used for heuristic if cudnn is not supported
846
            non_cudnn_algos, bfloat16_algos;
847 848
    std::vector<AlgoCUDNNConvBiasActivation> cudnn_conv_bias_activations;
    std::vector<AlgoCUDNNConv> cudnn_convs;
849
    AlgoFallbackNCHWQS8 fallback_nchw_qs8;
850 851 852 853 854 855 856
    AlgoChanwise chanwise;
    AlgoChanwiseSmall chanwise_small;
    AlgoChanwise8x8x32 chanwise8x8x32;
    AlgoInplaceMatmul inplace_matmul;
    AlgoMatmul matmul;
    AlgoMatmul8x8x32 matmul8x8x32;
    AlgoBatchedMatmul batched_matmul;
857
    std::vector<AlgoInt8NCHW4DotProdImplicitGemm> int8_nchw4_dotprod;
858 859 860 861 862 863 864 865 866
    AlgoInt8CHWN4DotProdImplicitGemm int8_chwn4_dotprod;
#if CUDA_VERSION >= 10000
    AlgoQUInt4x4x32WMMA wmma_quint4x4x32;
    std::vector<AlgoInt8CHWN4IMMAImplicitGemm> int8_chwn4_imma;
    std::vector<AlgoInt8NCHW4IMMAImplicitGemm> int8_nchw4_imma;
    std::vector<AlgoInt8CHWN4IMMAImplicitGemmReorderFilter>
            int8_chwn4_imma_reorder_filter;
    std::vector<AlgoInt8CHWN4IMMAImplicitGemmUnrollWidth>
            int8_chwn4_imma_unroll_width;
867 868 869
#endif
#if CUDA_VERSION >= 10020
    std::vector<AlgoInt8NCHW32IMMAImplicitGemm> int8_nchw32_imma;
870
    std::vector<AlgoInt4Int4NCHW64IMMAImplicitGemm> int4_int4_nchw64_imma;
871 872
#endif
    std::vector<std::unique_ptr<AlgoGroupConvGeneral>> gconv_refhold;
873
    AlgoBFloat16 bfloat16;
874 875 876 877 878 879
    std::unordered_map<AlgoBase*, AlgoGroupConvGeneral*> algo2gconv;

    AlgoBase* cudnn_conv_bias_act_from_enum(cudnnConvolutionFwdAlgo_t algo);

    AlgoBase* cudnn_conv_from_enum(cudnnConvolutionFwdAlgo_t algo);

880 881
    const AlgoBase::Mapper& all_algos_map() const { return m_all_algos_map; }

882 883 884 885 886
private:
#if CUDA_VERSION >= 10000
    void fill_imma_algos();
#endif
    void fill_cudnn_algos();
887
    void fill_dp4a_algos();
888 889 890 891 892 893
};

}  // namespace cuda
}  // namespace megdnn

// vim: syntax=cpp.doxygen