algo.h 16.7 KB
Newer Older
1 2 3 4
/**
 * \file dnn/src/cuda/convolution/backward_data/algo.h
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
5
 * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
6 7 8
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
9 10
 * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied.
11 12 13 14 15
 */

#pragma once

#include <unordered_map>
16 17 18 19
#include "src/common/algo_base.h"
#include "src/common/metahelper.h"
#include "src/cuda/convolution/helper.h"
#include "src/cuda/cudnn_wrapper.h"
20 21 22 23 24 25 26 27 28 29

namespace megdnn {
namespace cuda {

/*!
 * \brief base class for convolution algos
 *
 * All the algo impls should try to support non-contiguous batch dim, for group
 * conv execution.
 */
30 31 32
class ConvolutionBackwardDataImpl::AlgoBase : public Algorithm {
protected:
    ~AlgoBase() = default;
33

34 35 36 37 38 39
public:
    enum class AlgoType : uint32_t {
        CUDA_CUDNN,
        CUDA_MATMUL,
        CUDA_CHANWISE,
        CUDA_CHANWISE_SMALL,
40
        CUDA_DEPTHWISE_LARGE_FILTER,
41 42
        CUDA_BFLOAT16,
        CUDA_GROUP_CONV_GENERAL,
43
        CUDA_IMPLICIT_GEMM_NCHW4_DOTPROD_INT8,
44
        CUDA_IMPLICIT_GEMM_NCHW_DOTPROD_INT8,
45 46 47
        CUDA_IMPLICIT_GEMM_NHWC_IMMA_INT8,
        CUDA_IMPLICIT_BATCHED_GEMM_FMA_NCHW_F32,
        CUDA_IMPLICIT_BATCHED_GEMM_HMMA_NCHW_F16,
48 49 50 51 52 53 54 55
    };
    using Mapper = std::unordered_map<AlgorithmDesc, AlgoBase*>;

    AlgoBase() : Algorithm() { m_handle_type = Handle::HandleType::CUDA; }
    struct SizeArgs {
        HandleImpl* handle;
        CanonizedFilterMeta filter_meta;
        const TensorLayout *diff_layout, *grad_layout, *filter_layout;
56
        const ConvolutionBackwardDataImpl* opr;
57 58 59 60

        std::string to_string() const;
        void init_desc(convolution::CUDNNBwdDataDescs& desc) const {
            desc.set(filter_meta, *diff_layout, *grad_layout, opr->param());
61
        }
M
Megvii Engine Team 已提交
62 63 64 65 66 67 68
        SizeArgs(
                const ConvolutionBackwardDataImpl* opr, const TensorLayout& filter,
                const TensorLayout& diff, const TensorLayout& grad);
        SizeArgs(
                const ConvolutionBackwardDataImpl* opr, const TensorLayout& filter,
                const CanonizedFilterMeta& filter_meta, const TensorLayout& diff,
                const TensorLayout& grad);
69 70

        convolution::ForwardSizeArgs as_fwd_args() const {
71
            return {handle, grad_layout, filter_layout, filter_meta, diff_layout};
72
        }
73 74 75 76 77
    };
    struct ExecArgs : public SizeArgs {
        const TensorND *filter_tensor, *diff_tensor, *grad_tensor;
        Workspace workspace;

M
Megvii Engine Team 已提交
78 79 80 81
        ExecArgs(
                const ConvolutionBackwardDataImpl* opr, _megdnn_tensor_in filter,
                _megdnn_tensor_in diff, _megdnn_tensor_out grad,
                _megdnn_workspace workspace);
82 83 84 85 86 87 88 89 90
    };
    virtual bool is_available(const SizeArgs& args) const = 0;
    virtual size_t get_workspace_in_bytes(const SizeArgs& args) const = 0;
    virtual void exec(const ExecArgs& args) const = 0;

    bool is_available_wk(const SizeArgs& args, size_t limit) {
        return is_available(args) && get_workspace_in_bytes(args) <= limit;
    }

91 92
    bool is_available_attribute(
            const SizeArgs& args,
93 94
            const AlgoAttribute& positive_attr = AlgoAttribute::REPRODUCIBLE,
            const AlgoAttribute& negative_attr = AlgoAttribute::DEFAULT,
95
            size_t limit = std::numeric_limits<size_t>::max()) {
96
        return contain_attribute_all(positive_attr) &&
M
Megvii Engine Team 已提交
97
               !contain_attribute_any(negative_attr) && is_available_wk(args, limit);
98 99
    }

M
Megvii Engine Team 已提交
100
    AlgoBase& check_workspace(const SizeArgs& args, const Workspace& workspace) {
101
        auto req = get_workspace_in_bytes(args);
M
Megvii Engine Team 已提交
102 103 104 105 106
        megdnn_assert(
                req <= workspace.size,
                "conv bwd data algo %s: "
                "required workspace %zu bytes, got %zu",
                name(), req, workspace.size);
107 108 109 110
        return *this;
    }

    virtual bool is_cudnn() const { return false; }
111 112 113 114
};

class ConvolutionBackwardDataImpl::AlgoCUDNN final : public AlgoBase {
    cudnnConvolutionBwdDataAlgo_t m_cudnn_enum;
115
    CudnnAlgoPack::Attr m_attr;
116

117
public:
M
Megvii Engine Team 已提交
118 119 120 121
    AlgoCUDNN(cudnnConvolutionBwdDataAlgo_t cudnn_enum) : m_cudnn_enum(cudnn_enum) {
        megdnn_assert(
                CudnnAlgoPack::conv_bwd_data_algos().find(cudnn_enum) !=
                CudnnAlgoPack::conv_bwd_data_algos().end());
122 123
        m_attr = CudnnAlgoPack::conv_bwd_data_algos().at(cudnn_enum);
    }
124

125 126 127
    bool is_available(const SizeArgs& args) const override;
    size_t get_workspace_in_bytes(const SizeArgs& args) const override;
    void exec(const ExecArgs& args) const override;
128

129
    const char* name() const override { return m_attr.name.c_str(); }
130 131 132 133 134
    AlgoAttribute attribute() const override {
        auto ret = static_cast<AlgoAttribute>(0);
        if (m_attr.is_reproducible) {
            ret |= AlgoAttribute::REPRODUCIBLE;
        }
135 136 137
        if (m_attr.accuracy_depend_on_batch) {
            ret |= AlgoAttribute::ACCURACY_DEPEND_ON_BATCH;
        }
138 139
        return ret;
    }
140
    cudnnConvolutionBwdDataAlgo_t cudnn_enum() const { return m_cudnn_enum; }
141

142 143
    bool is_cudnn() const override { return true; }
    MEGDNN_DECL_ALGO_TYPE(CUDA_CUDNN)
144

145 146 147 148 149
    std::string param() const override {
        std::string ret;
        serialize_write_pod(m_cudnn_enum, ret);
        return ret;
    }
150 151 152
};

//! im2col and matmul, with dilation
153 154 155
class ConvolutionBackwardDataImpl::AlgoMatmul final : public AlgoBase {
    template <typename T>
    static void exec_internal(const ExecArgs& args);
156

157 158 159 160
public:
    bool is_available(const SizeArgs& args) const override;
    size_t get_workspace_in_bytes(const SizeArgs& args) const override;
    void exec(const ExecArgs& args) const override;
161

162
    std::vector<SearchItem> get_subopr_list(
M
Megvii Engine Team 已提交
163
            const TensorLayoutArray& layouts, const OperatorBase* opr) const override;
164

165 166
    const char* name() const override { return "MATMUL"; }
    MEGDNN_DECL_ALGO_TYPE(CUDA_MATMUL)
167
    AlgoAttribute attribute() const override {
M
Megvii Engine Team 已提交
168
        return AlgoAttribute::REPRODUCIBLE | AlgoAttribute::ACCURACY_DEPEND_ON_BATCH;
169
    }
170 171
};

172 173 174 175 176
class ConvolutionBackwardDataImpl::AlgoChanwise final : public AlgoBase {
public:
    bool is_available(const SizeArgs& args) const override;
    size_t get_workspace_in_bytes(const SizeArgs& args) const override;
    void exec(const ExecArgs& args) const override;
177

178 179
    const char* name() const override { return "CHANNEL_WISE"; }
    MEGDNN_DECL_ALGO_TYPE(CUDA_CHANWISE)
M
Megvii Engine Team 已提交
180
    AlgoAttribute attribute() const override { return AlgoAttribute::REPRODUCIBLE; }
181 182
};

183 184 185 186 187
class ConvolutionBackwardDataImpl::AlgoChanwiseSmall final : public AlgoBase {
public:
    bool is_available(const SizeArgs& args) const override;
    size_t get_workspace_in_bytes(const SizeArgs& args) const override;
    void exec(const ExecArgs& args) const override;
188

189 190
    const char* name() const override { return "CHANNEL_WISE_SMALL"; }
    MEGDNN_DECL_ALGO_TYPE(CUDA_CHANWISE_SMALL)
191
    AlgoAttribute attribute() const override {
M
Megvii Engine Team 已提交
192
        return AlgoAttribute::REPRODUCIBLE | AlgoAttribute::USABLE_DEPEND_ON_SHAPE;
193
    }
194 195
};

196 197 198 199 200 201 202 203 204 205 206 207 208 209
class ConvolutionBackwardDataImpl::AlgoDepthwiseLargeFilter final : public AlgoBase {
public:
    bool is_available(const SizeArgs& args) const override;
    size_t get_workspace_in_bytes(const SizeArgs& args) const override;
    void exec(const ExecArgs& args) const override;

    const char* name() const override { return "DEPTHWISE_LARGE_FILTER"; }
    MEGDNN_DECL_ALGO_TYPE(CUDA_DEPTHWISE_LARGE_FILTER)
    AlgoAttribute attribute() const override { return AlgoAttribute::REPRODUCIBLE; }

private:
    mutable std::string m_name;
};

210 211 212 213 214 215
class ConvolutionBackwardDataImpl::AlgoBFloat16 final : public AlgoBase {
public:
    bool is_available(const SizeArgs& args) const override;
    size_t get_workspace_in_bytes(const SizeArgs& args) const override;
    void exec(const ExecArgs& args) const override;

216
    std::vector<SearchItem> get_subopr_list(
M
Megvii Engine Team 已提交
217
            const TensorLayoutArray& layouts, const OperatorBase* opr) const override;
218

M
Megvii Engine Team 已提交
219
    const char* name() const override { return "CONVOLUTION_BACKWARD_DATD_BFLOAT16"; }
220

M
Megvii Engine Team 已提交
221
    AlgoAttribute attribute() const override { return AlgoAttribute::REPRODUCIBLE; }
222 223 224

private:
    WorkspaceBundle get_workspace_bundle(void* ptr, const SizeArgs& args) const;
225
    MEGDNN_DECL_ALGO_TYPE(CUDA_BFLOAT16)
226 227
};

228
//! implement group conv by another algo
M
Megvii Engine Team 已提交
229
class ConvolutionBackwardDataImpl::AlgoGroupConvGeneral final : public AlgoBase {
230 231 232 233
public:
    bool is_available(const SizeArgs& args) const override;
    size_t get_workspace_in_bytes(const SizeArgs& args) const override;
    void exec(const ExecArgs& args) const override;
234

235
    std::vector<SearchItem> get_subopr_list(
M
Megvii Engine Team 已提交
236
            const TensorLayoutArray& layouts, const OperatorBase* opr) const override;
237

M
Megvii Engine Team 已提交
238
    const char* name() const override { return "CUDA:GROUP_CONV_BACKWARD_DATA"; }
239 240

    MEGDNN_DECL_ALGO_TYPE(CUDA_GROUP_CONV_GENERAL)
M
Megvii Engine Team 已提交
241
    AlgoAttribute attribute() const override { return AlgoAttribute::REPRODUCIBLE; }
242 243 244

private:
    WorkspaceBundle get_workspace_bundle(void* ptr, const SizeArgs& args) const;
245 246
};

247 248 249 250 251 252 253 254 255 256 257 258
class ConvolutionBackwardDataImpl::AlgoInt8NCHW4DotProdImplicitGemm final
        : public AlgoBase {
public:
    struct AlgoParam {
        int threadblock_m;
        int threadblock_n;
        int threadblock_k;
        int warp_m;
        int warp_n;
        int warp_k;
        int stage;
        std::string to_string() {
M
Megvii Engine Team 已提交
259 260 261
            return ssprintf(
                    "_%dX%dX%d_%dX%dX%d_%dstage", threadblock_m, threadblock_n,
                    threadblock_k, warp_m, warp_n, warp_k, stage);
262 263 264 265
        }
    };
    AlgoInt8NCHW4DotProdImplicitGemm(AlgoParam algo_param)
            : m_algo_param{algo_param},
M
Megvii Engine Team 已提交
266 267 268
              m_name{ssprintf(
                      "INT8_NCHW4_DOTPROD_IMPLICIT_GEMM%s",
                      m_algo_param.to_string().c_str())} {}
269 270 271 272
    bool is_available(const SizeArgs& args) const override;
    size_t get_workspace_in_bytes(const SizeArgs& args) const override;
    void exec(const ExecArgs& args) const override;
    const char* name() const override { return m_name.c_str(); }
M
Megvii Engine Team 已提交
273
    AlgoAttribute attribute() const override { return AlgoAttribute::REPRODUCIBLE; }
274 275
    MEGDNN_DECL_ALGO_TYPE(CUDA_IMPLICIT_GEMM_NCHW4_DOTPROD_INT8)
private:
M
Megvii Engine Team 已提交
276
    WorkspaceBundle get_workspace_bundle(dt_byte* raw_ptr, const SizeArgs& args) const;
277
    const void* get_available_op(const SizeArgs& args) const;
278 279 280 281
    AlgoParam m_algo_param;
    std::string m_name;
};

282 283 284 285 286 287
class ConvolutionBackwardDataImpl::AlgoInt8NCHWDotProdImplicitGemm final
        : public AlgoBase {
public:
    bool is_available(const SizeArgs& args) const override;
    size_t get_workspace_in_bytes(const SizeArgs& args) const override;
    void exec(const ExecArgs& args) const override;
M
Megvii Engine Team 已提交
288 289
    const char* name() const override { return "INT8_NCHW_DOTPROD_IMPLICIT_GEMM"; }
    AlgoAttribute attribute() const override { return AlgoAttribute::REPRODUCIBLE; }
290
    MEGDNN_DECL_ALGO_TYPE(CUDA_IMPLICIT_GEMM_NCHW_DOTPROD_INT8);
M
Megvii Engine Team 已提交
291

292
private:
M
Megvii Engine Team 已提交
293
    WorkspaceBundle get_workspace_bundle(dt_byte* raw_ptr, const SizeArgs& args) const;
294
    const void* get_available_op(const SizeArgs& args) const;
295 296
};

297 298 299 300 301 302 303 304 305 306 307 308 309
class ConvolutionBackwardDataImpl::AlgoInt8NHWCIMMAImplicitGemm final
        : public AlgoBase {
public:
    struct AlgoParam {
        int threadblock_m;
        int threadblock_n;
        int threadblock_k;
        int warp_m;
        int warp_n;
        int warp_k;
        int stage;
        int access_size;
        std::string to_string() {
M
Megvii Engine Team 已提交
310 311 312
            return ssprintf(
                    "_%dX%dX%d_%dX%dX%d_%dstage_%d", threadblock_m, threadblock_n,
                    threadblock_k, warp_m, warp_n, warp_k, stage, access_size);
313 314 315 316
        }
    };
    AlgoInt8NHWCIMMAImplicitGemm(AlgoParam algo_param)
            : m_algo_param{algo_param},
M
Megvii Engine Team 已提交
317 318 319
              m_name{ssprintf(
                      "INT8_NHWC_IMMA_IMPLICIT_GEMM%s",
                      m_algo_param.to_string().c_str())} {}
320 321 322 323
    bool is_available(const SizeArgs& args) const override;
    size_t get_workspace_in_bytes(const SizeArgs& args) const override;
    void exec(const ExecArgs& args) const override;
    const char* name() const override { return m_name.c_str(); }
M
Megvii Engine Team 已提交
324
    AlgoAttribute attribute() const override { return AlgoAttribute::REPRODUCIBLE; }
325 326
    MEGDNN_DECL_ALGO_TYPE(CUDA_IMPLICIT_GEMM_NHWC_IMMA_INT8)
private:
M
Megvii Engine Team 已提交
327
    WorkspaceBundle get_workspace_bundle(dt_byte* raw_ptr, const SizeArgs& args) const;
328
    const void* get_available_op(const SizeArgs& args) const;
M
Megvii Engine Team 已提交
329 330
    void reorder_filter(
            const ExecArgs& args, const int iterleaved, int8_t* reordered_filter) const;
331 332 333 334
    AlgoParam m_algo_param;
    std::string m_name;
};

335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410
class ConvolutionBackwardDataImpl::AlgoFloat32NCHWFMAImplicitBatchedGemm final
        : public AlgoBase {
public:
    struct AlgoParam {
        int threadblock_m;
        int threadblock_n;
        int threadblock_k;
        int warp_m;
        int warp_n;
        int warp_k;
        int stage;
        std::string to_string() {
            return ssprintf(
                    "_%dX%dX%d_%dX%dX%d_%dstage", threadblock_m, threadblock_n,
                    threadblock_k, warp_m, warp_n, warp_k, stage);
        }
    };
    AlgoFloat32NCHWFMAImplicitBatchedGemm(AlgoParam algo_param)
            : m_algo_param{algo_param},
              m_name{ssprintf(
                      "FLOAT32_NCHW_FMA_IMPLICIT_BATCHED_GEMM%s",
                      m_algo_param.to_string().c_str())} {}
    bool is_available(const SizeArgs& args) const override;
    size_t get_workspace_in_bytes(const SizeArgs& args) const override { return 0; }
    void exec(const ExecArgs& args) const override;
    const char* name() const override { return m_name.c_str(); }
    AlgoAttribute attribute() const override { return AlgoAttribute::REPRODUCIBLE; }
    MEGDNN_DECL_ALGO_TYPE(CUDA_IMPLICIT_BATCHED_GEMM_FMA_NCHW_F32)

private:
    const void* get_available_op(const SizeArgs& args) const;
    AlgoParam m_algo_param;
    std::string m_name;
};

class ConvolutionBackwardDataImpl::AlgoFloat16NCHWHMMAImplicitBatchedGemm final
        : public AlgoBase {
public:
    /// add instruction shape as member of algo param, because f16 tensor core has 2
    /// different matrix shapes (i.e. mma.884 and mma.1688)
    struct AlgoParam {
        int threadblock_m;
        int threadblock_n;
        int threadblock_k;
        int warp_m;
        int warp_n;
        int warp_k;
        int instruction_m;
        int instruction_n;
        int instruction_k;
        int stage;
        std::string to_string() {
            return ssprintf(
                    "_%dX%dX%d_%dX%dX%d_mma%dX%dX%d_%dstage", threadblock_m,
                    threadblock_n, threadblock_k, warp_m, warp_n, warp_k, instruction_m,
                    instruction_n, instruction_k, stage);
        }
    };
    AlgoFloat16NCHWHMMAImplicitBatchedGemm(AlgoParam algo_param)
            : m_algo_param{algo_param},
              m_name{ssprintf(
                      "FLOAT16_NCHW_HMMA_IMPLICIT_BATCHED_GEMM%s",
                      m_algo_param.to_string().c_str())} {}
    bool is_available(const SizeArgs& args) const override;
    size_t get_workspace_in_bytes(const SizeArgs& args) const override { return 0; }
    void exec(const ExecArgs& args) const override;
    const char* name() const override { return m_name.c_str(); }
    AlgoAttribute attribute() const override { return AlgoAttribute::REPRODUCIBLE; }
    MEGDNN_DECL_ALGO_TYPE(CUDA_IMPLICIT_BATCHED_GEMM_HMMA_NCHW_F16)

private:
    const void* get_available_op(const SizeArgs& args) const;
    AlgoParam m_algo_param;
    std::string m_name;
};

411
class ConvolutionBackwardDataImpl::AlgoPack : NonCopyableObj {
412 413
    // defined in cudnn.cpp
    void fill_cudnn_algos();
414 415
    // defined in implicit_gemm_int8_nchw4_dp4a.cpp
    void fill_int8_dp4a_algos();
416 417
    // defined in implicit_gemm_int8_nhwc_imma.cpp
    void fill_int8_imma_algos();
418
    void fill_dwconv_algos();
419

420
    AlgoBase::Mapper m_all_algos_map;
421

422 423
public:
    AlgoPack();
424

425 426 427 428
    std::vector<AlgoCUDNN> cudnn;
    AlgoMatmul matmul;
    AlgoChanwise chanwise;
    AlgoChanwiseSmall chanwise_small;
429
    AlgoDepthwiseLargeFilter depthwise_large_filter;
430
    AlgoBFloat16 bfloat16;
431
    AlgoGroupConvGeneral group;
432
    std::vector<AlgoInt8NCHW4DotProdImplicitGemm> int8_nchw4_dotprod;
433
    AlgoInt8NCHWDotProdImplicitGemm int8_nchw_dotprod;
434
    std::vector<AlgoInt8NHWCIMMAImplicitGemm> int8_nhwc_imma;
435 436
    std::vector<AlgoFloat32NCHWFMAImplicitBatchedGemm> implbmm_nchw_fma;
    std::vector<AlgoFloat16NCHWHMMAImplicitBatchedGemm> implbmm_nchw_hmma;
437

438
    std::vector<AlgoBase*>
439 440 441
            //! all algorithms
            all_algos,
            //! non-cudnn algos, used for heuristic if cudnn is not supported
442
            non_cudnn_algos, bfloat16_algos, int8_algos;
443 444

    AlgoCUDNN* cudnn_from_enum(cudnnConvolutionBwdDataAlgo_t algo);
445

446
    const AlgoBase::Mapper& all_algos_map() const { return m_all_algos_map; }
447 448
};

449 450
}  // namespace cuda
}  // namespace megdnn
451 452

// vim: syntax=cpp.doxygen