/**
 * \file dnn/src/cuda/convolution/backward_data/algo.h
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied.
 */

#pragma once

#include <unordered_map>
#include "src/common/algo_base.h"
#include "src/common/metahelper.h"
#include "src/cuda/convolution/helper.h"
#include "src/cuda/cudnn_wrapper.h"

namespace megdnn {
namespace cuda {

/*!
 * \brief base class for convolution algos
 *
 * All algo impls should try to support a non-contiguous batch dim, since
 * group conv execution relies on it.
 */
class ConvolutionBackwardDataImpl::AlgoBase : public Algorithm {
protected:
    ~AlgoBase() = default;

public:
    enum class AlgoType : uint32_t {
        CUDA_CUDNN,
        CUDA_MATMUL,
        CUDA_CHANWISE,
        CUDA_CHANWISE_SMALL,
        CUDA_BFLOAT16,
        CUDA_GROUP_CONV_GENERAL,
        CUDA_IMPLICIT_GEMM_NCHW4_DOTPROD_INT8,
        CUDA_IMPLICIT_GEMM_NCHW_DOTPROD_INT8,
        CUDA_IMPLICIT_GEMM_NHWC_IMMA_INT8,
        CUDA_IMPLICIT_BATCHED_GEMM_FMA_NCHW_F32,
        CUDA_IMPLICIT_BATCHED_GEMM_HMMA_NCHW_F16,
    };
    using Mapper = std::unordered_map<AlgorithmDesc, AlgoBase*>;

    AlgoBase() : Algorithm() { m_handle_type = Handle::HandleType::CUDA; }
    struct SizeArgs {
        HandleImpl* handle;
        CanonizedFilterMeta filter_meta;
        const TensorLayout *diff_layout, *grad_layout, *filter_layout;
        const ConvolutionBackwardDataImpl* opr;

        std::string to_string() const;
        void init_desc(convolution::CUDNNBwdDataDescs& desc) const {
            desc.set(filter_meta, *diff_layout, *grad_layout, opr->param());
        }
        SizeArgs(
                const ConvolutionBackwardDataImpl* opr, const TensorLayout& filter,
                const TensorLayout& diff, const TensorLayout& grad);
        SizeArgs(
                const ConvolutionBackwardDataImpl* opr, const TensorLayout& filter,
                const CanonizedFilterMeta& filter_meta, const TensorLayout& diff,
                const TensorLayout& grad);

        convolution::ForwardSizeArgs as_fwd_args() const {
            return {handle, grad_layout, filter_layout, filter_meta, diff_layout};
        }
    };
    struct ExecArgs : public SizeArgs {
        const TensorND *filter_tensor, *diff_tensor, *grad_tensor;
        Workspace workspace;

        ExecArgs(
                const ConvolutionBackwardDataImpl* opr, _megdnn_tensor_in filter,
                _megdnn_tensor_in diff, _megdnn_tensor_out grad,
                _megdnn_workspace workspace);
    };
    virtual bool is_available(const SizeArgs& args) const = 0;
    virtual size_t get_workspace_in_bytes(const SizeArgs& args) const = 0;
    virtual void exec(const ExecArgs& args) const = 0;

    bool is_available_wk(const SizeArgs& args, size_t limit) {
        return is_available(args) && get_workspace_in_bytes(args) <= limit;
    }

    bool is_available_attribute(
            const SizeArgs& args,
            const AlgoAttribute& positive_attr = AlgoAttribute::REPRODUCIBLE,
            const AlgoAttribute& negative_attr = AlgoAttribute::DEFAULT,
            size_t limit = std::numeric_limits<size_t>::max()) {
        return contain_attribute_all(positive_attr) &&
               !contain_attribute_any(negative_attr) && is_available_wk(args, limit);
    }

    AlgoBase& check_workspace(const SizeArgs& args, const Workspace& workspace) {
        auto req = get_workspace_in_bytes(args);
        megdnn_assert(
                req <= workspace.size,
                "conv bwd data algo %s: "
                "required workspace %zu bytes, got %zu",
                name(), req, workspace.size);
        return *this;
    }

    virtual bool is_cudnn() const { return false; }
};
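
/*!
 * Minimal caller-side sketch of the AlgoBase query protocol (illustrative
 * only; \c opr, the layouts, \c workspace_limit and the workspace allocation
 * are assumed to come from the caller):
 *
 * \code
 *     ConvolutionBackwardDataImpl::AlgoBase::SizeArgs args{
 *             opr, filter_layout, diff_layout, grad_layout};
 *     if (algo->is_available_wk(args, workspace_limit)) {
 *         size_t req = algo->get_workspace_in_bytes(args);
 *         // ... allocate a Workspace of at least req bytes ...
 *         ConvolutionBackwardDataImpl::AlgoBase::ExecArgs exec_args{
 *                 opr, filter, diff, grad, workspace};
 *         algo->check_workspace(args, workspace).exec(exec_args);
 *     }
 * \endcode
 */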

class ConvolutionBackwardDataImpl::AlgoCUDNN final : public AlgoBase {
    cudnnConvolutionBwdDataAlgo_t m_cudnn_enum;
    CudnnAlgoPack::Attr m_attr;

public:
    AlgoCUDNN(cudnnConvolutionBwdDataAlgo_t cudnn_enum) : m_cudnn_enum(cudnn_enum) {
        megdnn_assert(
                CudnnAlgoPack::conv_bwd_data_algos().find(cudnn_enum) !=
                CudnnAlgoPack::conv_bwd_data_algos().end());
        m_attr = CudnnAlgoPack::conv_bwd_data_algos().at(cudnn_enum);
    }

    bool is_available(const SizeArgs& args) const override;
    size_t get_workspace_in_bytes(const SizeArgs& args) const override;
    void exec(const ExecArgs& args) const override;

    const char* name() const override { return m_attr.name.c_str(); }
    AlgoAttribute attribute() const override {
        auto ret = static_cast<AlgoAttribute>(0);
        if (m_attr.is_reproducible) {
            ret |= AlgoAttribute::REPRODUCIBLE;
        }
        if (m_attr.accuracy_depend_on_batch) {
            ret |= AlgoAttribute::ACCURACY_DEPEND_ON_BATCH;
        }
        return ret;
    }
    cudnnConvolutionBwdDataAlgo_t cudnn_enum() const { return m_cudnn_enum; }

    bool is_cudnn() const override { return true; }
    MEGDNN_DECL_ALGO_TYPE(CUDA_CUDNN)

    std::string param() const override {
        std::string ret;
        serialize_write_pod(m_cudnn_enum, ret);
        return ret;
    }
};
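
/*!
 * \c param() writes only the raw cudnn enum (via \c serialize_write_pod), so
 * reading the POD bytes back restores the wrapped algorithm. A round-trip
 * sketch, assuming the blob layout is exactly the POD bytes:
 *
 * \code
 *     std::string blob = algo.param();
 *     cudnnConvolutionBwdDataAlgo_t e;
 *     std::memcpy(&e, blob.data(), sizeof(e));
 *     // e == algo.cudnn_enum()
 * \endcode
 */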

//! im2col and matmul, with dilation
class ConvolutionBackwardDataImpl::AlgoMatmul final : public AlgoBase {
    template <typename T>
    static void exec_internal(const ExecArgs& args);

public:
    bool is_available(const SizeArgs& args) const override;
    size_t get_workspace_in_bytes(const SizeArgs& args) const override;
    void exec(const ExecArgs& args) const override;

    std::vector<SearchItem> get_subopr_list(
            const TensorLayoutArray& layouts, const OperatorBase* opr) const override;

    const char* name() const override { return "MATMUL"; }
    MEGDNN_DECL_ALGO_TYPE(CUDA_MATMUL)
    AlgoAttribute attribute() const override {
        return AlgoAttribute::REPRODUCIBLE | AlgoAttribute::ACCURACY_DEPEND_ON_BATCH;
    }
};
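
/*!
 * Shape sketch of the im2col formulation used by AlgoMatmul above (a reading
 * aid, not the exact kernel): with filter (OC, IC, FH, FW), diff
 * (N, OC, OH, OW) and grad (N, IC, IH, IW), backward data per sample n is
 *
 * \code
 *     col    = filter^T x diff_n    // (IC*FH*FW, OC) x (OC, OH*OW)
 *     grad_n = col2im(col)          // scatter-add back to (IC, IH, IW)
 * \endcode
 */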

class ConvolutionBackwardDataImpl::AlgoChanwise final : public AlgoBase {
public:
    bool is_available(const SizeArgs& args) const override;
    size_t get_workspace_in_bytes(const SizeArgs& args) const override;
    void exec(const ExecArgs& args) const override;

    const char* name() const override { return "CHANNEL_WISE"; }
    MEGDNN_DECL_ALGO_TYPE(CUDA_CHANWISE)
    AlgoAttribute attribute() const override { return AlgoAttribute::REPRODUCIBLE; }
};

class ConvolutionBackwardDataImpl::AlgoChanwiseSmall final : public AlgoBase {
public:
    bool is_available(const SizeArgs& args) const override;
    size_t get_workspace_in_bytes(const SizeArgs& args) const override;
    void exec(const ExecArgs& args) const override;

    const char* name() const override { return "CHANNEL_WISE_SMALL"; }
    MEGDNN_DECL_ALGO_TYPE(CUDA_CHANWISE_SMALL)
    AlgoAttribute attribute() const override {
        return AlgoAttribute::REPRODUCIBLE | AlgoAttribute::USABLE_DEPEND_ON_SHAPE;
    }
};

class ConvolutionBackwardDataImpl::AlgoBFloat16 final : public AlgoBase {
public:
    bool is_available(const SizeArgs& args) const override;
    size_t get_workspace_in_bytes(const SizeArgs& args) const override;
    void exec(const ExecArgs& args) const override;

    std::vector<SearchItem> get_subopr_list(
            const TensorLayoutArray& layouts, const OperatorBase* opr) const override;

    const char* name() const override { return "CONVOLUTION_BACKWARD_DATA_BFLOAT16"; }

    AlgoAttribute attribute() const override { return AlgoAttribute::REPRODUCIBLE; }

private:
    WorkspaceBundle get_workspace_bundle(void* ptr, const SizeArgs& args) const;
    MEGDNN_DECL_ALGO_TYPE(CUDA_BFLOAT16)
};
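
//! note: following the usual megdnn mixed-precision pattern (an assumption
//! about this wrapper, not stated in this header), the bf16 tensors are
//! staged as fp32 in the bundle from get_workspace_bundle() and the actual
//! convolution is delegated to the sub-operator from get_subopr_list().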

//! implement group conv by another algo
class ConvolutionBackwardDataImpl::AlgoGroupConvGeneral final : public AlgoBase {
public:
    bool is_available(const SizeArgs& args) const override;
    size_t get_workspace_in_bytes(const SizeArgs& args) const override;
    void exec(const ExecArgs& args) const override;

    std::vector<SearchItem> get_subopr_list(
            const TensorLayoutArray& layouts, const OperatorBase* opr) const override;

    const char* name() const override { return "CUDA:GROUP_CONV_BACKWARD_DATA"; }

    MEGDNN_DECL_ALGO_TYPE(CUDA_GROUP_CONV_GENERAL)
    AlgoAttribute attribute() const override { return AlgoAttribute::REPRODUCIBLE; }

private:
    WorkspaceBundle get_workspace_bundle(void* ptr, const SizeArgs& args) const;
};
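
/*!
 * How the delegation above is commonly realized (a sketch assuming each group
 * is an independent dense convolution; names are illustrative):
 *
 * \code
 *     // filter (G, OCg, ICg, FH, FW) -> G dense sub-problems:
 *     //   diff_g (N, OCg, OH, OW) * filter_g -> grad_g (N, ICg, IH, IW)
 *     // the chosen sub-algorithm runs once per group with tensor pointers
 *     // offset by the per-group channel stride, which is why AlgoBase impls
 *     // must tolerate a non-contiguous batch dim.
 * \endcode
 */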

class ConvolutionBackwardDataImpl::AlgoInt8NCHW4DotProdImplicitGemm final
        : public AlgoBase {
public:
    struct AlgoParam {
        int threadblock_m;
        int threadblock_n;
        int threadblock_k;
        int warp_m;
        int warp_n;
        int warp_k;
        int stage;
        std::string to_string() {
            return ssprintf(
                    "_%dX%dX%d_%dX%dX%d_%dstage", threadblock_m, threadblock_n,
                    threadblock_k, warp_m, warp_n, warp_k, stage);
        }
    };
    AlgoInt8NCHW4DotProdImplicitGemm(AlgoParam algo_param)
            : m_algo_param{algo_param},
              m_name{ssprintf(
                      "INT8_NCHW4_DOTPROD_IMPLICIT_GEMM%s",
                      m_algo_param.to_string().c_str())} {}
    bool is_available(const SizeArgs& args) const override;
    size_t get_workspace_in_bytes(const SizeArgs& args) const override;
    void exec(const ExecArgs& args) const override;
    const char* name() const override { return m_name.c_str(); }
    AlgoAttribute attribute() const override { return AlgoAttribute::REPRODUCIBLE; }
    MEGDNN_DECL_ALGO_TYPE(CUDA_IMPLICIT_GEMM_NCHW4_DOTPROD_INT8)
private:
    WorkspaceBundle get_workspace_bundle(dt_byte* raw_ptr, const SizeArgs& args) const;
    const void* get_available_op(const SizeArgs& args) const;
    AlgoParam m_algo_param;
    std::string m_name;
};
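
/*!
 * Name derivation example for the pack above (illustrative tile values):
 *
 * \code
 *     AlgoInt8NCHW4DotProdImplicitGemm algo({128, 128, 32, 64, 64, 32, 2});
 *     // algo.name() ==
 *     //         "INT8_NCHW4_DOTPROD_IMPLICIT_GEMM_128X128X32_64X64X32_2stage"
 * \endcode
 */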

class ConvolutionBackwardDataImpl::AlgoInt8NCHWDotProdImplicitGemm final
        : public AlgoBase {
public:
    bool is_available(const SizeArgs& args) const override;
    size_t get_workspace_in_bytes(const SizeArgs& args) const override;
    void exec(const ExecArgs& args) const override;
    const char* name() const override { return "INT8_NCHW_DOTPROD_IMPLICIT_GEMM"; }
    AlgoAttribute attribute() const override { return AlgoAttribute::REPRODUCIBLE; }
    MEGDNN_DECL_ALGO_TYPE(CUDA_IMPLICIT_GEMM_NCHW_DOTPROD_INT8)

private:
    WorkspaceBundle get_workspace_bundle(dt_byte* raw_ptr, const SizeArgs& args) const;
    const void* get_available_op(const SizeArgs& args) const;
};

class ConvolutionBackwardDataImpl::AlgoInt8NHWCIMMAImplicitGemm final
        : public AlgoBase {
public:
    struct AlgoParam {
        int threadblock_m;
        int threadblock_n;
        int threadblock_k;
        int warp_m;
        int warp_n;
        int warp_k;
        int stage;
        int access_size;
        std::string to_string() {
            return ssprintf(
                    "_%dX%dX%d_%dX%dX%d_%dstage_%d", threadblock_m, threadblock_n,
                    threadblock_k, warp_m, warp_n, warp_k, stage, access_size);
        }
    };
    AlgoInt8NHWCIMMAImplicitGemm(AlgoParam algo_param)
            : m_algo_param{algo_param},
              m_name{ssprintf(
                      "INT8_NHWC_IMMA_IMPLICIT_GEMM%s",
                      m_algo_param.to_string().c_str())} {}
    bool is_available(const SizeArgs& args) const override;
    size_t get_workspace_in_bytes(const SizeArgs& args) const override;
    void exec(const ExecArgs& args) const override;
    const char* name() const override { return m_name.c_str(); }
    AlgoAttribute attribute() const override { return AlgoAttribute::REPRODUCIBLE; }
    MEGDNN_DECL_ALGO_TYPE(CUDA_IMPLICIT_GEMM_NHWC_IMMA_INT8)
private:
    WorkspaceBundle get_workspace_bundle(dt_byte* raw_ptr, const SizeArgs& args) const;
    const void* get_available_op(const SizeArgs& args) const;
    void reorder_filter(
            const ExecArgs& args, const int interleaved, int8_t* reordered_filter) const;
    AlgoParam m_algo_param;
    std::string m_name;
};
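
/*!
 * Name derivation example (illustrative values; the trailing field is the
 * vector access_size that distinguishes this pack from the NCHW4 one):
 *
 * \code
 *     AlgoInt8NHWCIMMAImplicitGemm algo({64, 64, 64, 32, 32, 64, 2, 16});
 *     // algo.name() ==
 *     //         "INT8_NHWC_IMMA_IMPLICIT_GEMM_64X64X64_32X32X64_2stage_16"
 * \endcode
 */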

class ConvolutionBackwardDataImpl::AlgoFloat32NCHWFMAImplicitBatchedGemm final
        : public AlgoBase {
public:
    struct AlgoParam {
        int threadblock_m;
        int threadblock_n;
        int threadblock_k;
        int warp_m;
        int warp_n;
        int warp_k;
        int stage;
        std::string to_string() {
            return ssprintf(
                    "_%dX%dX%d_%dX%dX%d_%dstage", threadblock_m, threadblock_n,
                    threadblock_k, warp_m, warp_n, warp_k, stage);
        }
    };
    AlgoFloat32NCHWFMAImplicitBatchedGemm(AlgoParam algo_param)
            : m_algo_param{algo_param},
              m_name{ssprintf(
                      "FLOAT32_NCHW_FMA_IMPLICIT_BATCHED_GEMM%s",
                      m_algo_param.to_string().c_str())} {}
    bool is_available(const SizeArgs& args) const override;
    size_t get_workspace_in_bytes(const SizeArgs& args) const override { return 0; }
    void exec(const ExecArgs& args) const override;
    const char* name() const override { return m_name.c_str(); }
    AlgoAttribute attribute() const override { return AlgoAttribute::REPRODUCIBLE; }
    MEGDNN_DECL_ALGO_TYPE(CUDA_IMPLICIT_BATCHED_GEMM_FMA_NCHW_F32)

private:
    const void* get_available_op(const SizeArgs& args) const;
    AlgoParam m_algo_param;
    std::string m_name;
};

class ConvolutionBackwardDataImpl::AlgoFloat16NCHWHMMAImplicitBatchedGemm final
        : public AlgoBase {
public:
    /// the instruction shape is part of AlgoParam because the f16 tensor
    /// cores support two different mma matrix shapes (i.e. mma.884 and mma.1688)
    struct AlgoParam {
        int threadblock_m;
        int threadblock_n;
        int threadblock_k;
        int warp_m;
        int warp_n;
        int warp_k;
        int instruction_m;
        int instruction_n;
        int instruction_k;
        int stage;
        std::string to_string() {
            return ssprintf(
                    "_%dX%dX%d_%dX%dX%d_mma%dX%dX%d_%dstage", threadblock_m,
                    threadblock_n, threadblock_k, warp_m, warp_n, warp_k, instruction_m,
                    instruction_n, instruction_k, stage);
        }
    };
    AlgoFloat16NCHWHMMAImplicitBatchedGemm(AlgoParam algo_param)
            : m_algo_param{algo_param},
              m_name{ssprintf(
                      "FLOAT16_NCHW_HMMA_IMPLICIT_BATCHED_GEMM%s",
                      m_algo_param.to_string().c_str())} {}
    bool is_available(const SizeArgs& args) const override;
    size_t get_workspace_in_bytes(const SizeArgs& args) const override { return 0; }
    void exec(const ExecArgs& args) const override;
    const char* name() const override { return m_name.c_str(); }
    AlgoAttribute attribute() const override { return AlgoAttribute::REPRODUCIBLE; }
    MEGDNN_DECL_ALGO_TYPE(CUDA_IMPLICIT_BATCHED_GEMM_HMMA_NCHW_F16)

private:
    const void* get_available_op(const SizeArgs& args) const;
    AlgoParam m_algo_param;
    std::string m_name;
};
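
/*!
 * Instruction-shape example for the f16 AlgoParam above (illustrative tile
 * values): mma.884 corresponds to instruction_{m,n,k} = {8, 8, 4} and
 * mma.1688 to {16, 8, 8}:
 *
 * \code
 *     AlgoFloat16NCHWHMMAImplicitBatchedGemm::AlgoParam p{
 *             128, 128, 32, 64, 64, 32, 16, 8, 8, 2};  // an mma.1688 tile
 *     // p.to_string() == "_128X128X32_64X64X32_mma16X8X8_2stage"
 * \endcode
 */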

class ConvolutionBackwardDataImpl::AlgoPack : NonCopyableObj {
    // defined in cudnn.cpp
    void fill_cudnn_algos();
    // defined in implicit_gemm_int8_nchw4_dp4a.cpp
    void fill_int8_dp4a_algos();
    // defined in implicit_gemm_int8_nhwc_imma.cpp
    void fill_int8_imma_algos();
    void fill_dwconv_algos();

    AlgoBase::Mapper m_all_algos_map;

public:
    AlgoPack();

    std::vector<AlgoCUDNN> cudnn;
    AlgoMatmul matmul;
    AlgoChanwise chanwise;
    AlgoChanwiseSmall chanwise_small;
    AlgoBFloat16 bfloat16;
    AlgoGroupConvGeneral group;
    std::vector<AlgoInt8NCHW4DotProdImplicitGemm> int8_nchw4_dotprod;
    AlgoInt8NCHWDotProdImplicitGemm int8_nchw_dotprod;
    std::vector<AlgoInt8NHWCIMMAImplicitGemm> int8_nhwc_imma;
    std::vector<AlgoFloat32NCHWFMAImplicitBatchedGemm> implbmm_nchw_fma;
    std::vector<AlgoFloat16NCHWHMMAImplicitBatchedGemm> implbmm_nchw_hmma;

    std::vector<AlgoBase*>
            //! all algorithms
            all_algos,
            //! non-cudnn algos, used for heuristic if cudnn is not supported
            non_cudnn_algos, bfloat16_algos, int8_algos;

    AlgoCUDNN* cudnn_from_enum(cudnnConvolutionBwdDataAlgo_t algo);

    const AlgoBase::Mapper& all_algos_map() const { return m_all_algos_map; }
};
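
/*!
 * Heuristic-selection sketch over the pack (hypothetical caller; the real
 * policy also filters on attributes and execution preference):
 *
 * \code
 *     for (ConvolutionBackwardDataImpl::AlgoBase* algo : algo_pack.all_algos) {
 *         if (algo->is_available_wk(args, workspace_limit))
 *             return algo;  // first usable algorithm wins in this sketch
 *     }
 * \endcode
 */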

}  // namespace cuda
}  // namespace megdnn

// vim: syntax=cpp.doxygen