winograd.h

/**
 * \file dnn/src/fallback/conv_bias/winograd/winograd.h
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied.
 */

#pragma once

#include <cstddef>
#include "include/megdnn/basic_types.h"
#include "include/megdnn/dtype.h"
#include "include/megdnn/thin/small_vector.h"
#include "src/fallback/conv_bias/opr_impl.h"
#include "src/fallback/matrix_mul/opr_impl.h"

#include "midout.h"
MIDOUT_DECL(megdnn_fallback_conv_bias_winograd_common)

namespace megdnn {
namespace winograd {

/**
 * \brief Winograd convolution
 *
 * The algorithm follows https://arxiv.org/abs/1509.09308.
 *
 * Format: DEFAULT
 * filter: (OC, IC, FH, FW) -> (alpha, alpha, IC, OC)
 * src: (N, C, H, W) -> (N, NR_TILES, alpha, alpha, TILE_SIZE, IC)
 *
 * We will perform gemm on:
 * (TILE_SIZE, IC) x (IC, OC) -> (TILE_SIZE, OC)
 *
 * Format: MK4
 * filter: (OC, IC, FH, FW) -> (alpha, alpha, OCB, ICB, IC_BLOCK_SIZE,
 * OC_BLOCK_SIZE)
 * src: (N, C, H, W) -> (N, NR_TILES, alpha, alpha, ICB, TILE_SIZE,
 * IC_BLOCK_SIZE)
 *
 * We will perform gemm on:
 * (OCB, ICB, IC_BLOCK_SIZE, OC_BLOCK_SIZE) x (ICB, TILE_SIZE, IC_BLOCK_SIZE)
 * = (OCB, TILE_SIZE, OC_BLOCK_SIZE)
 */
//! Default number of output channels handled by one compute work item.
//! The ConvBias constructors store it in m_unit_oc_size unless the
//! multi-thread heuristic derives a per-thread value from OC instead.
constexpr static size_t UNIT_OC_SIZE_DEFAULT = 1024;
template <typename Strategy,
          param::MatrixMul::Format format = param::MatrixMul::Format::DEFAULT>
class ConvBias {
    using output_compute_type = typename Strategy::output_compute_type;
    using input_filter_compute_type =
            typename Strategy::input_filter_compute_type;
    using stype = typename Strategy::stype;
    using dst_type = typename Strategy::dst_type;
    using NCBKernSizeParam = fallback::ConvBiasImpl::NCBKernSizeParam;
    using NCBKernParam = fallback::ConvBiasImpl::NCBKernParam;
    using NCBKernIndex = fallback::ConvBiasImpl::NCBKernIndex;
    using NCBKern = fallback::ConvBiasImpl::NCBKern;
    static_assert(
            format == param::MatrixMul::Format::DEFAULT ||
                    (format == param::MatrixMul::Format::MK4 &&
                     Strategy::IC_BLOCK_SIZE == 4 &&
                     Strategy::OC_BLOCK_SIZE == 4) ||
                    (format == param::MatrixMul::Format::MK8 &&
                     Strategy::IC_BLOCK_SIZE == 8 &&
                     Strategy::OC_BLOCK_SIZE == 8),
            "format should be default, mk4 and mk8, if mk4 IC_BLOCK_SIZE and "
            "OC_BLOCK_SIZE should be 4, if mk8 IC_BLOCK_SIZE and "
            "OC_BLOCK_SIZE should be 8");

    Strategy m_strategy;
    size_t m_unit_tile_size;
    //! m_unit_oc_size must be a multiple of Strategy::OC_BLOCK_SIZE
    size_t m_unit_oc_size;

    /**
     * \brief Describe the top-level workspace of the whole convolution.
     *
     * Slot 0 is the compute workspace of all threads, i.e. the size of the
     * per-thread bundle from get_wbundle_compute() times \c nr_threads.
     * For the NCHW / NCHW88 / NCHW44 filter formats a slot 1 is appended
     * that stores the transformed filter of every group; its size is zero
     * when a preprocessed filter is supplied. Any other format must be one
     * of the *_WINOGRAD formats and gets slot 0 only.
     *
     * \param param size param of the convolution
     * \param matmul_algo matmul algorithm, forwarded to get_wbundle_compute()
     * \return bundle constructed with a null base pointer
     */
    WorkspaceBundle get_wbundle(
            const NCBKernSizeParam& param,
            fallback::MatrixMulImpl::AlgoBase* matmul_algo) const {
        using ConvFormat = param::ConvBias::Format;
        const auto filter_format = param.filter_meta.format;
        const bool is_winograd_format =
                filter_format == ConvFormat::NCHW_WINOGRAD ||
                filter_format == ConvFormat::NCHW88_WINOGRAD ||
                filter_format == ConvFormat::NCHW44_WINOGRAD;
        const bool is_plain_format = filter_format == ConvFormat::NCHW ||
                                     filter_format == ConvFormat::NCHW88 ||
                                     filter_format == ConvFormat::NCHW44;

        const size_t oc_per_group = param.filter_meta.ocpg;
        const size_t ic_per_group = param.filter_meta.icpg;
        const size_t nr_groups = param.filter_meta.group;

        //! filter : (alpha, alpha, IC, OC) or (OCB, ICB, IC_BLOCK_SIZE,
        //! OC_BLOCK_SIZE); only needed when the filter still has to be
        //! transformed at execution time
        size_t filter_bytes_per_group = 0;
        if (!is_winograd_format && param.preprocessed_filter == nullptr) {
            filter_bytes_per_group = Strategy::ALPHA * Strategy::ALPHA *
                                     oc_per_group * ic_per_group *
                                     sizeof(input_filter_compute_type);
        }

        const size_t compute_bytes_all_threads =
                get_wbundle_compute(param, matmul_algo).total_size_in_bytes() *
                param.nr_threads;

        if (!is_plain_format) {
            megdnn_assert(param.filter_meta.format ==
                                  param::ConvBias::Format::NCHW_WINOGRAD ||
                          param.filter_meta.format ==
                                  param::ConvBias::Format::NCHW88_WINOGRAD ||
                          param.filter_meta.format ==
                                  param::ConvBias::Format::NCHW44_WINOGRAD);
            return WorkspaceBundle(nullptr, {compute_bytes_all_threads});
        }
        return WorkspaceBundle(nullptr,
                               {compute_bytes_all_threads,
                                filter_bytes_per_group * nr_groups});
    }

    /**
     * \brief Describe the compute workspace needed by one thread.
     *
     * The bundle has four slots, in this order:
     *   0. transformed input  : (alpha, alpha, unit_tile_size, IC) or
     *                           (alpha, alpha, ICB, unit_tile_size,
     *                           IC_BLOCK_SIZE)
     *   1. matmul output      : (alpha, alpha, unit_tile_size, OC) or
     *                           (alpha, alpha, OCB, unit_tile_size,
     *                           OC_BLOCK_SIZE), with OC capped at
     *                           m_unit_oc_size
     *   2. scratch for the transforms
     *   3. workspace requested by \p matmul_algo
     *
     * \return bundle constructed with a null base pointer
     */
    WorkspaceBundle get_wbundle_compute(
            const NCBKernSizeParam& param,
            fallback::MatrixMulImpl::AlgoBase* matmul_algo) const {
        const size_t ic_per_group = param.filter_meta.icpg;
        const size_t oc_per_group = param.filter_meta.ocpg;
        const size_t oc_per_unit = std::min(oc_per_group, m_unit_oc_size);
        const size_t alpha_sq = Strategy::ALPHA * Strategy::ALPHA;

        const size_t input_transform_bytes =
                alpha_sq * ic_per_group * m_unit_tile_size *
                sizeof(input_filter_compute_type);
        const size_t output_transform_bytes =
                alpha_sq * oc_per_unit * m_unit_tile_size *
                sizeof(output_compute_type);
        //! temporary storage used inside the transform routines
        const size_t transform_mid_bytes =
                2 * alpha_sq * sizeof(output_compute_type) *
                std::max(Strategy::IC_BLOCK_SIZE, Strategy::OC_BLOCK_SIZE);
        const size_t matmul_bytes = matmul_algo->get_workspace(
                get_matmul_kern_param(param, m_unit_oc_size));

        //! every thread owns its own copy of these slots, keeping the
        //! threads' data apart to limit false cache line sharing
        return WorkspaceBundle(nullptr,
                               {input_transform_bytes, output_transform_bytes,
                                transform_mid_bytes, matmul_bytes});
    }

    /**
     * \brief Describe the workspace of the filter preprocess kernels.
     *
     * The bundle contains one scratch slot per thread; slot \c i is the
     * one filter_preprocess() uses for thread \c i.
     *
     * \return bundle constructed with a null base pointer
     */
    WorkspaceBundle get_preprocess_wbundle(
            const NCBKernSizeParam& param) const {
        const size_t nr_threads = param.nr_threads;
        //! temporary storage used inside the filter transform
        const size_t scratch_bytes =
                2 * Strategy::ALPHA * Strategy::ALPHA *
                sizeof(output_compute_type) *
                std::max(Strategy::IC_BLOCK_SIZE, Strategy::OC_BLOCK_SIZE);
        SmallVector<size_t> per_thread_sizes(nr_threads, scratch_bytes);
        return WorkspaceBundle{nullptr, per_thread_sizes};
    }

public:
    //! Get the m_unit_oc_size, according to the nr_threads and
    //! output_featuremap_size. When single thread the m_unit_oc_size is set
    //! 2048 heuristicly, When multi-threads, the m_unit_oc_size is set
    //! according to nr_threads and out_featuremap_size
    ConvBias(const Strategy& strategy, size_t unit_tile_size,
             const NCBKernSizeParam& param)
            : m_strategy{strategy}, m_unit_tile_size{unit_tile_size} {
        size_t nr_threads = param.nr_threads;
        size_t OC = param.filter_meta.ocpg;
        size_t OH = param.osz[0];
        size_t OW = param.osz[1];
        if (nr_threads > 1) {
            size_t units_h = div_ceil<size_t>(OH, Strategy::OUTPUT_BLOCK_SIZE);
            size_t units_w = div_ceil<size_t>(OW, Strategy::OUTPUT_BLOCK_SIZE);
            size_t nr_units = units_h * units_w;
            size_t nr_parallism_unit =
                    div_ceil<size_t>(nr_units, unit_tile_size);
            if (nr_parallism_unit < nr_threads) {
                m_unit_oc_size = div_ceil<size_t>(OC, nr_threads);
                if (format == param::MatrixMul::Format::MK8) {
                    m_unit_oc_size = round_up<size_t>(m_unit_oc_size, 8);
                } else {
                    m_unit_oc_size = round_up<size_t>(m_unit_oc_size, 4);
                }
            } else {
                m_unit_oc_size = UNIT_OC_SIZE_DEFAULT;
            }
        } else {
            m_unit_oc_size = UNIT_OC_SIZE_DEFAULT;
        }
    }
    /**
     * \brief Construct the driver with m_unit_oc_size fixed to
     * UNIT_OC_SIZE_DEFAULT.
     */
    ConvBias(const Strategy& strategy, size_t unit_tile_size)
            : m_strategy{strategy},
              m_unit_tile_size{unit_tile_size},
              m_unit_oc_size{UNIT_OC_SIZE_DEFAULT} {}

    size_t get_workspace_size(
            const NCBKernSizeParam& param,
            fallback::MatrixMulImpl::AlgoBase* matmul_algo) const {
        return get_wbundle(param, matmul_algo).total_size_in_bytes();
    }

    size_t get_preprocess_workspace_size(
            const NCBKernSizeParam& param,
            fallback::MatrixMulImpl::AlgoBase*) const {
        return get_preprocess_wbundle(param).total_size_in_bytes();
    }

    /**
     * \brief Deduce the layout of the preprocessed (transformed) filter.
     *
     * A single layout is returned:
     *   - DEFAULT: (GROUP, alpha, alpha, OC, IC)
     *   - MK4    : (GROUP, alpha, alpha, OC / 4, IC / 4, 4, 4)
     *   - MK8    : (GROUP, alpha, alpha, OC / 8, IC / 8, 8, 8)
     *
     * The dtype is the strategy's filter dtype, except that a quantized
     * filter dtype is replaced by Float32 for MK4 and by Int16 for MK8.
     * The matmul algorithm argument is unused.
     */
    SmallVector<TensorLayout> deduce_preprocessed_filter_layout(
            const NCBKernSizeParam& param, fallback::MatrixMulImpl::AlgoBase*) {
        const size_t nr_groups = param.filter_meta.group;
        const size_t oc_per_group = param.filter_meta.ocpg;
        const size_t ic_per_group = param.filter_meta.icpg;
        const size_t alpha = Strategy::ALPHA;

        DType layout_dtype = m_strategy.filter_dtype;
        const bool is_quantized =
                layout_dtype.category() == DTypeCategory::QUANTIZED;
        if (is_quantized && format == param::MatrixMul::Format::MK4) {
            layout_dtype = dtype::Float32();
        } else if (is_quantized && format == param::MatrixMul::Format::MK8) {
            layout_dtype = dtype::Int16();
        }

        SmallVector<TensorLayout> layouts;
        if (format == param::MatrixMul::Format::DEFAULT) {
            layouts.push_back(
                    {{nr_groups, alpha, alpha, oc_per_group, ic_per_group},
                     layout_dtype});
        } else {
            //! channel pack factor of the blocked matmul format
            size_t pack = 4;
            if (format != param::MatrixMul::Format::MK4) {
                megdnn_assert(format == param::MatrixMul::Format::MK8);
                pack = 8;
            }
            layouts.push_back({{nr_groups, alpha, alpha, oc_per_group / pack,
                                ic_per_group / pack, pack, pack},
                               layout_dtype});
        }
        return layouts;
    }

    //! Used by winograd_filter_preprocess opr
    void filter_process(const stype* filter_ptr,
                        input_filter_compute_type* filter_transform_buf,
                        void* transform_mid_buf, size_t OC, size_t IC) {
        m_strategy.filter(
                filter_ptr, filter_transform_buf,
                static_cast<input_filter_compute_type*>(transform_mid_buf), OC,
                IC, 0, OC);
    }

    /**
     * \brief Kernel body that transforms the filter slice of one work item
     * at execution time.
     *
     * \c ndrange_id[0] selects the group and \c ndrange_id[2] the output
     * channel unit. A unit covers 8 output channels for NCHW88, 4 for
     * NCHW44 and a single channel otherwise. The result is written into
     * slot 1 of \p bundle_top at the offset of the group; slot 2 of the
     * calling thread's part of \p bundle_compute serves as scratch memory.
     */
    static void filter_process(Strategy strategy,
                               const WorkspaceBundle& bundle_top,
                               const WorkspaceBundle& bundle_compute,
                               const NCBKernParam& kern_param,
                               const NCBKernIndex& ncb_index) {
        const size_t group_id = ncb_index.ndrange_id[0];
        const size_t oc_id = ncb_index.ndrange_id[2];
        const size_t OC = kern_param.filter_meta.ocpg;
        const size_t IC = kern_param.filter_meta.icpg;

        //! destination of the transformed filter of this group
        const size_t filter_bytes_per_group =
                Strategy::ALPHA * Strategy::ALPHA * OC * IC *
                sizeof(input_filter_compute_type);
        const uintptr_t filter_dst_addr =
                reinterpret_cast<uintptr_t>(bundle_top.get(1)) +
                group_id * filter_bytes_per_group;
        auto* filter_transform_buf =
                reinterpret_cast<input_filter_compute_type*>(filter_dst_addr);

        //! scratch memory inside the compute workspace of this thread
        const uintptr_t scratch_addr =
                reinterpret_cast<uintptr_t>(bundle_compute.get(2)) +
                bundle_compute.total_size_in_bytes() * ncb_index.thread_id;
        auto* transform_mid_buf =
                reinterpret_cast<input_filter_compute_type*>(scratch_addr);

        const stype* filter_ptr = kern_param.filter<stype>(group_id);

        //! number of output channels covered by one oc_id
        size_t oc_pack = 1;
        const auto filter_format = kern_param.filter_meta.format;
        if (filter_format == param::ConvBias::Format::NCHW88) {
            oc_pack = 8;
        } else if (filter_format == param::ConvBias::Format::NCHW44) {
            oc_pack = 4;
        }
        const size_t oc_start = oc_pack * oc_id;
        const size_t oc_end = oc_start + oc_pack;

        strategy.filter(filter_ptr, filter_transform_buf, transform_mid_buf, OC,
                        IC, oc_start, oc_end);
    }

    /**
     * \brief Kernel body that transforms the filter slice of one work item
     * into the preprocessed filter tensor.
     *
     * \c ndrange_id[0] selects the group and \c ndrange_id[1] the output
     * channel unit. A unit covers 8 output channels for NCHW88, 4 for
     * NCHW44 and a single channel otherwise. The result is written to
     * \p preprocessed_tensor at the offset of the group; slot
     * \c thread_id of \p bundle serves as scratch memory.
     */
    static void filter_preprocess(Strategy strategy,
                                  const WorkspaceBundle& bundle,
                                  const TensorND& preprocessed_tensor,
                                  const NCBKernParam& kern_param,
                                  const NCBKernIndex& ncb_index) {
        const size_t group_id = ncb_index.ndrange_id[0];
        const size_t oc_id = ncb_index.ndrange_id[1];
        const size_t OC = kern_param.filter_meta.ocpg;
        const size_t IC = kern_param.filter_meta.icpg;

        //! destination of the transformed filter of this group
        const size_t filter_bytes_per_group =
                Strategy::ALPHA * Strategy::ALPHA * OC * IC *
                sizeof(input_filter_compute_type);
        const uintptr_t filter_dst_addr =
                reinterpret_cast<uintptr_t>(preprocessed_tensor.raw_ptr) +
                group_id * filter_bytes_per_group;
        auto* filter_transform_buf =
                reinterpret_cast<input_filter_compute_type*>(filter_dst_addr);

        //! scratch slot owned by the calling thread
        auto* transform_mid_buf = static_cast<input_filter_compute_type*>(
                bundle.get(ncb_index.thread_id));

        const stype* filter_ptr = kern_param.filter<stype>(group_id);

        //! number of output channels covered by one oc_id
        size_t oc_pack = 1;
        const auto filter_format = kern_param.filter_meta.format;
        if (filter_format == param::ConvBias::Format::NCHW88) {
            oc_pack = 8;
        } else if (filter_format == param::ConvBias::Format::NCHW44) {
            oc_pack = 4;
        }
        const size_t oc_start = oc_pack * oc_id;
        const size_t oc_end = oc_start + oc_pack;

        strategy.filter(filter_ptr, filter_transform_buf, transform_mid_buf, OC,
                        IC, oc_start, oc_end);
    }

    /**
     * \brief Kernel body of the winograd convolution for one work item.
     *
     * A work item is identified by \c ncb_index.ndrange_id as
     * (group, batch, tile block, output channel block). For it the function
     *   1. transforms the input tiles of the block (strategy.input),
     *   2. runs one matmul per (i, j) position of the ALPHA x ALPHA grid
     *      against the transformed filter,
     *   3. transforms the matmul results back and writes the destination
     *      (strategy.output).
     *
     * \param strategy winograd strategy providing input() and output()
     * \param bundle_top top-level workspace; slot 1 holds the filter that
     *        was transformed at execution time
     * \param bundle_compute per-thread compute workspace layout; every
     *        thread uses its slots shifted by
     *        total_size_in_bytes() * thread_id
     * \param matmul_algo algorithm providing the matmul kernel
     * \param matmul_param matmul parameters (taken by value); pointers,
     *        M, N, LDB, LDC and the workspace are overwritten here, all
     *        other fields keep the values supplied by the caller
     * \param unit_tile_size maximum number of tiles per work item
     * \param unit_oc_size maximum number of output channels per work item
     * \param ncb_param runtime parameters of the convolution
     * \param ncb_index work item index and thread id
     */
    static void winograd_compute(
            Strategy strategy, const WorkspaceBundle& bundle_top,
            const WorkspaceBundle& bundle_compute,
            fallback::MatrixMulImpl::AlgoBase* matmul_algo,
            fallback::MatrixMulImpl::KernParam matmul_param,
            size_t unit_tile_size, size_t unit_oc_size,
            const NCBKernParam& ncb_param, const NCBKernIndex& ncb_index) {
        size_t OC = ncb_param.filter_meta.ocpg;
        size_t IC = ncb_param.filter_meta.icpg;
        size_t IH = ncb_param.isz[0];
        size_t IW = ncb_param.isz[1];
        size_t OH = ncb_param.osz[0];
        size_t OW = ncb_param.osz[1];
        size_t PH = ncb_param.filter_meta.padding[0];
        size_t PW = ncb_param.filter_meta.padding[1];
        //! bytes occupied by the transformed filter of one group
        size_t filter_group_size = Strategy::ALPHA * Strategy::ALPHA * OC * IC *
                                   sizeof(input_filter_compute_type);
        //! stride between the compute workspaces of two threads
        size_t compute_workspace_size_per_thread =
                bundle_compute.total_size_in_bytes();

        //! number of output tiles along each spatial dimension
        size_t units_h = div_ceil<size_t>(OH, Strategy::OUTPUT_BLOCK_SIZE);
        size_t units_w = div_ceil<size_t>(OW, Strategy::OUTPUT_BLOCK_SIZE);
        size_t nr_units = units_h * units_w;

        //! decode the work item index
        size_t oc_block_id = ncb_index.ndrange_id[3];
        size_t tile_id = ncb_index.ndrange_id[2];
        size_t batch_id = ncb_index.ndrange_id[1];
        size_t group_id = ncb_index.ndrange_id[0];
        size_t thread_id = ncb_index.thread_id;

        const stype* src_ptr = ncb_param.src<stype>(batch_id, group_id);
        dst_type* dst_ptr = ncb_param.dst<dst_type>(batch_id, group_id);
        const output_compute_type* bias_ptr =
                static_cast<const output_compute_type*>(
                        ncb_param.bias<output_compute_type>(batch_id,
                                                            group_id));

        //! slot 0 of this thread: transformed input
        input_filter_compute_type* input_transform_buf =
                reinterpret_cast<input_filter_compute_type*>(
                        reinterpret_cast<uintptr_t>(bundle_compute.get(0)) +
                        compute_workspace_size_per_thread * thread_id);

        //! slot 1 of this thread: matmul results
        output_compute_type* output_transform_buf =
                reinterpret_cast<output_compute_type*>(
                        reinterpret_cast<uintptr_t>(bundle_compute.get(1)) +
                        compute_workspace_size_per_thread * thread_id);
        //! slot 2 of this thread: scratch for the transforms
        input_filter_compute_type* transform_mid_buf =
                reinterpret_cast<input_filter_compute_type*>(
                        reinterpret_cast<uintptr_t>(bundle_compute.get(2)) +
                        compute_workspace_size_per_thread * thread_id);

        //! Locate the transformed filter of this group. Every source uses
        //! the same per-group offset: a preprocessed filter tensor, a filter
        //! that already arrives in a *_WINOGRAD format, or the filter
        //! transformed at execution time into slot 1 of bundle_top.
        const input_filter_compute_type* filter_transform_buf = nullptr;
        if (nullptr != ncb_param.preprocessed_filter) {
            auto preprocess_raw_ptr =
                    ncb_param.preprocessed_filter->tensors[0].raw_ptr;
            filter_transform_buf = reinterpret_cast<input_filter_compute_type*>(
                    reinterpret_cast<uintptr_t>(preprocess_raw_ptr) +
                    group_id * filter_group_size);
        } else {
            filter_transform_buf =
                    static_cast<const input_filter_compute_type*>(
                            ncb_param.filter<input_filter_compute_type>(
                                    group_id));
            //! plain filter formats are transformed by filter_process()
            //! into bundle_top, so read the result from there instead
            if (ncb_param.filter_meta.format == param::ConvBias::Format::NCHW ||
                ncb_param.filter_meta.format ==
                        param::ConvBias::Format::NCHW88 ||
                ncb_param.filter_meta.format ==
                        param::ConvBias::Format::NCHW44) {
                filter_transform_buf =
                        reinterpret_cast<input_filter_compute_type*>(
                                reinterpret_cast<uintptr_t>(bundle_top.get(1)) +
                                group_id * filter_group_size);
            }
        }
        //! prepare matmul param: slot 3 of this thread is the matmul
        //! workspace
        matmul_param.workspace_ptr = reinterpret_cast<void*>(
                reinterpret_cast<uintptr_t>(bundle_compute.get(3)) +
                compute_workspace_size_per_thread * thread_id);
        matmul_param.workspace_size = bundle_compute.get_size(3);
        fallback::MatrixMulImpl::kern_t matmul_kern =
                matmul_algo->get_kern(matmul_param);

        //! range of tiles and output channels covered by this work item;
        //! the last block along either axis may be smaller than the unit
        size_t unit_start_idx = tile_id * unit_tile_size;
        size_t nr_tiles_in_unit =
                std::min(nr_units - unit_start_idx, unit_tile_size);
        size_t oc_start_idx = oc_block_id * unit_oc_size;
        size_t nr_oc_in_unit = std::min(OC - oc_start_idx, unit_oc_size);
        megdnn_assert(nr_oc_in_unit % Strategy::OC_BLOCK_SIZE == 0,
                      "The winograd remain oc is not times of OC_BLOCK_SIZE");
        if (format == param::MatrixMul::Format::MK4 ||
            format == param::MatrixMul::Format::MK8) {
            //! on non-x86 builds the tile count is rounded up to a multiple
            //! of 4 (presumably a requirement of the blocked matmul kernels
            //! on those platforms - TODO confirm); it must still fit into
            //! the buffers sized for unit_tile_size tiles
#if !MEGDNN_X86
            nr_tiles_in_unit = round_up<size_t>(nr_tiles_in_unit, 4);
#endif
            megdnn_assert(nr_tiles_in_unit <= unit_tile_size,
                          "nr_tiles_in_unit: %zu TILE_SIZE:%zu",
                          nr_tiles_in_unit, unit_tile_size);
        }
        //! input transform (B^T d B in the notation of the winograd paper)
        strategy.input(src_ptr, input_transform_buf, transform_mid_buf,
                       IH, IW, IC, PH, PW, unit_start_idx, nr_tiles_in_unit);

        //! one matmul per position (i, j) of the ALPHA x ALPHA grid
        rep(i, Strategy::ALPHA) rep(j, Strategy::ALPHA) {
            if (format == param::MatrixMul::Format::DEFAULT) {
                //! (tiles, IC) x (IC, OC) -> (tiles, oc of this block);
                //! B points at the first output channel of the block and
                //! keeps the full OC as leading dimension
                matmul_param.A_ptr =
                        input_transform_buf +
                        (i * Strategy::ALPHA + j) * nr_tiles_in_unit * IC;
                matmul_param.B_ptr = filter_transform_buf +
                                     (i * Strategy::ALPHA + j) * OC * IC +
                                     oc_start_idx;

                matmul_param.C_ptr = output_transform_buf +
                                     (i * Strategy::ALPHA + j) *
                                             nr_tiles_in_unit * nr_oc_in_unit;

                matmul_param.M = nr_tiles_in_unit;
                matmul_param.N = nr_oc_in_unit;
                matmul_param.LDB = OC;
                matmul_param.LDC = nr_oc_in_unit;
            } else {
                //! blocked formats: the filter is the A operand and the
                //! transformed input the B operand
                matmul_param.A_ptr = filter_transform_buf +
                                     (i * Strategy::ALPHA + j) * OC * IC +
                                     oc_start_idx * IC;

                matmul_param.B_ptr =
                        input_transform_buf +
                        (i * Strategy::ALPHA + j) * nr_tiles_in_unit * IC;

                matmul_param.C_ptr = output_transform_buf +
                                     (i * Strategy::ALPHA + j) *
                                             nr_tiles_in_unit * nr_oc_in_unit;
                matmul_param.N = nr_tiles_in_unit;
                matmul_param.M = nr_oc_in_unit;
                //! the class-level static_assert guarantees that
                //! IC_BLOCK_SIZE equals OC_BLOCK_SIZE for MK4 and MK8
                matmul_param.LDB = matmul_param.N * Strategy::IC_BLOCK_SIZE;
                matmul_param.LDC = matmul_param.N * Strategy::IC_BLOCK_SIZE;
            }
            matmul_kern(matmul_param);
        }

        //! output transform (Y = A^T m A in the notation of the winograd
        //! paper); the call also receives the bias and the nonlinearity mode
        size_t oc_end_idx = oc_start_idx + nr_oc_in_unit;
        strategy.output(
                output_transform_buf, bias_ptr, dst_ptr,
                reinterpret_cast<output_compute_type*>(transform_mid_buf),
                ncb_param.bias_mode, ncb_param.nonlineMode, OH, OW,
                oc_start_idx, oc_end_idx, unit_start_idx, nr_tiles_in_unit);
    };

    /**
     * \brief Build the kernel that writes the transformed filter into
     * \c param.preprocessed_filter->tensors[0].
     *
     * Only the NCHW / NCHW88 / NCHW44 filter formats are accepted and a
     * preprocessed filter with at least one tensor must be present. The
     * returned kernel runs over (GROUP, oc units), where an oc unit is 8
     * channels for NCHW88, 4 for NCHW44 and one channel for NCHW.
     *
     * \note The kernel keeps a reference to the destination tensor, so the
     * preprocessed filter object has to stay alive while the kernel is in
     * use.
     */
    SmallVector<NCBKern> get_preprocess_kerns(
            const NCBKernSizeParam& param, fallback::MatrixMulImpl::AlgoBase*) {
        megdnn_assert(
                param.filter_meta.format == param::ConvBias::Format::NCHW ||
                param.filter_meta.format == param::ConvBias::Format::NCHW88 ||
                param.filter_meta.format == param::ConvBias::Format::NCHW44);
        megdnn_assert(param.preprocessed_filter &&
                      param.preprocessed_filter->tensors.size() > 0);

        const size_t GROUP = param.filter_meta.group;
        const size_t OC = param.filter_meta.ocpg;
        const auto filter_format = param.filter_meta.format;
        const TensorND& preprocessed_dst =
                param.preprocessed_filter->tensors[0];
        WorkspaceBundle bundle = get_preprocess_wbundle(param);

        auto filter_process_kern =
                [strategy = m_strategy, bundle, &preprocessed_dst](
                        const NCBKernParam& ncb_param,
                        const NCBKernIndex& ncb_index) mutable {
                    MIDOUT_BEGIN(megdnn_fallback_conv_bias_winograd_common,
                                 midout_iv("filter_preprocess"_hash)) {
                        //! bind the workspace layout to the real memory
                        bundle.set(ncb_param.workspace_ptr);
                        filter_preprocess(strategy, bundle, preprocessed_dst,
                                          ncb_param, ncb_index);
                    }
                    MIDOUT_END();
                };

        //! number of work items along the output channel axis
        size_t oc_parallelism = OC;
        if (filter_format == param::ConvBias::Format::NCHW88) {
            megdnn_assert(OC % 8 == 0);
            oc_parallelism = OC / 8;
        } else if (filter_format == param::ConvBias::Format::NCHW44) {
            megdnn_assert(OC % 4 == 0);
            oc_parallelism = OC / 4;
        }

        SmallVector<NCBKern> kerns;
        kerns.push_back({filter_process_kern, {GROUP, oc_parallelism}});
        return kerns;
    }

    /**
     * \brief Build the kernels that execute the winograd convolution.
     *
     * The convolution must have stride 1 in both dimensions and a filter in
     * NCHW / NCHW88 / NCHW44 or one of the matching *_WINOGRAD formats.
     *
     * Up to two kernels are returned, in execution order:
     *   1. a filter transform kernel over (GROUP, 1, oc units); only added
     *      for the plain formats when no preprocessed filter is supplied.
     *      An oc unit is 8 channels for NCHW88, 4 for NCHW44, otherwise 1.
     *   2. the compute kernel over (GROUP, N, tile blocks, oc blocks).
     *
     * Both kernels bind the workspace layouts to
     * \c ncb_param.workspace_ptr on every invocation; the per-thread
     * compute workspace starts at slot 0 of the top-level bundle.
     *
     * \param param size param of the convolution
     * \param matmul_algo matmul algorithm used by the compute kernel
     */
    SmallVector<NCBKern> get_kerns(
            const NCBKernSizeParam& param,
            fallback::MatrixMulImpl::AlgoBase* matmul_algo) {
        size_t N = param.n;
        size_t OC = param.filter_meta.ocpg;
        size_t OH = param.osz[0];
        size_t OW = param.osz[1];
        size_t GROUP = param.filter_meta.group;
        WorkspaceBundle bundle_top = get_wbundle(param, matmul_algo);
        WorkspaceBundle bundle_compute =
                get_wbundle_compute(param, matmul_algo);
        //! only the size-related base part is filled in here; pointers and
        //! the workspace are set inside the compute kernel
        fallback::MatrixMulImpl::KernParam matmul_param;
        static_cast<fallback::MatrixMulImpl::KernSizeParam&>(matmul_param) =
                get_matmul_kern_param(param, m_unit_oc_size);

        size_t unit_tile_size = m_unit_tile_size;
        size_t unit_oc_size = m_unit_oc_size;
        size_t units_h = div_ceil<size_t>(OH, Strategy::OUTPUT_BLOCK_SIZE);
        size_t units_w = div_ceil<size_t>(OW, Strategy::OUTPUT_BLOCK_SIZE);

        size_t nr_units = units_h * units_w;
        size_t nr_hw_tiles = div_ceil<size_t>(nr_units, m_unit_tile_size);
        size_t nr_oc_tiles = div_ceil<size_t>(OC, m_unit_oc_size);

        //! The filter should process ahead
        megdnn_assert(
                param.filter_meta.stride[0] == 1 &&
                param.filter_meta.stride[1] == 1 &&
                (param.filter_meta.format == param::ConvBias::Format::NCHW ||
                 param.filter_meta.format == param::ConvBias::Format::NCHW88 ||
                 param.filter_meta.format == param::ConvBias::Format::NCHW44 ||
                 param.filter_meta.format ==
                         param::ConvBias::Format::NCHW_WINOGRAD ||
                 param.filter_meta.format ==
                         param::ConvBias::Format::NCHW88_WINOGRAD ||
                 param.filter_meta.format ==
                         param::ConvBias::Format::NCHW44_WINOGRAD));

        const auto filter_format = param.filter_meta.format;
        const bool is_plain_format =
                filter_format == param::ConvBias::Format::NCHW ||
                filter_format == param::ConvBias::Format::NCHW88 ||
                filter_format == param::ConvBias::Format::NCHW44;

        SmallVector<NCBKern> kerns;
        //! a plain-format filter without a preprocessed copy has to be
        //! transformed before the compute kernel runs
        if (param.preprocessed_filter == nullptr && is_plain_format) {
            auto filter_process_kern =
                    [strategy = m_strategy, bundle_top, bundle_compute](
                            const NCBKernParam& ncb_param,
                            const NCBKernIndex& ncb_index) mutable {
                        MIDOUT_BEGIN(megdnn_fallback_conv_bias_winograd_common,
                                     midout_iv("filter_process"_hash)) {
                            bundle_top.set(ncb_param.workspace_ptr);
                            bundle_compute.set(bundle_top.get(0));
                            //! ncb_index is a const reference; it is passed
                            //! on as-is since it cannot be moved from
                            filter_process(strategy, bundle_top, bundle_compute,
                                           ncb_param, ncb_index);
                        }
                        MIDOUT_END();
                    };
            size_t oc_parallelism = OC;
            if (filter_format == param::ConvBias::Format::NCHW88) {
                megdnn_assert(OC % 8 == 0);
                oc_parallelism = OC / 8;
            } else if (filter_format == param::ConvBias::Format::NCHW44) {
                megdnn_assert(OC % 4 == 0);
                oc_parallelism = OC / 4;
            }
            kerns.push_back({filter_process_kern, {GROUP, 1, oc_parallelism}});
        }
        auto winograd_compute_kern =
                [strategy = m_strategy, bundle_top, bundle_compute, matmul_algo,
                 matmul_param, unit_tile_size,
                 unit_oc_size](const NCBKernParam& ncb_param,
                               const NCBKernIndex& ncb_index) mutable {
                    MIDOUT_BEGIN(megdnn_fallback_conv_bias_winograd_common,
                                 midout_iv("winograd_compute"_hash)) {
                        bundle_top.set(ncb_param.workspace_ptr);
                        bundle_compute.set(bundle_top.get(0));
                        winograd_compute(strategy, bundle_top, bundle_compute,
                                         matmul_algo, matmul_param,
                                         unit_tile_size, unit_oc_size,
                                         ncb_param, ncb_index);
                    }
                    MIDOUT_END();
                };
        kerns.push_back(
                {winograd_compute_kern, {GROUP, N, nr_hw_tiles, nr_oc_tiles}});
        return kerns;
    }

    /**
     * \brief Build the size parameters of the matmul issued by the compute
     * kernel.
     *
     * DEFAULT format: (M, N, K) = (unit_tile_size, oc, IC) with dense
     * leading dimensions. Blocked formats: (M, N, K) =
     * (oc, unit_tile_size, IC); IC must be a multiple of
     * Strategy::IC_BLOCK_SIZE.
     *
     * \param param size param of the convolution
     * \param nr_oc_in_unit number of output channels per matmul; 0 selects
     *        all output channels of a group
     */
    fallback::MatrixMulImpl::KernSizeParam get_matmul_kern_param(
            const NCBKernSizeParam& param, size_t nr_oc_in_unit = 0) const {
        const size_t oc_in_unit = (nr_oc_in_unit != 0)
                                          ? nr_oc_in_unit
                                          : param.filter_meta.ocpg;
        const size_t K = param.filter_meta.icpg;
        const bool is_default_format =
                format == param::MatrixMul::Format::DEFAULT;

        const size_t M = is_default_format ? m_unit_tile_size : oc_in_unit;
        const size_t N = is_default_format ? oc_in_unit : m_unit_tile_size;

        //! dense row-major leading dimensions of the DEFAULT format
        size_t LDA = K;
        size_t LDB = N;
        size_t LDC = N;
        if (!is_default_format) {
            megdnn_assert(K % Strategy::IC_BLOCK_SIZE == 0, "invalid K: %zu",
                          K);
            LDA = K / Strategy::IC_BLOCK_SIZE * Strategy::OC_BLOCK_SIZE *
                  Strategy::IC_BLOCK_SIZE;
            //! the class-level static_assert guarantees that IC_BLOCK_SIZE
            //! equals OC_BLOCK_SIZE for the blocked formats
            LDB = N * Strategy::IC_BLOCK_SIZE;
            LDC = N * Strategy::IC_BLOCK_SIZE;
        }

        const DType operand_dtype = DType::from_enum(
                DTypeTrait<input_filter_compute_type>::enumv);
        const DType result_dtype =
                DType::from_enum(DTypeTrait<output_compute_type>::enumv);
        return {operand_dtype,
                operand_dtype,
                result_dtype,
                M,
                N,
                K,
                LDA,
                LDB,
                LDC,
                false,
                false,
                param::MatrixMul::ComputeMode::DEFAULT,
                format};
    }
};

}  // namespace winograd
}  // namespace megdnn

/**
 * \brief Declare a winograd strategy class.
 *
 * The generated class \p _cls exposes the element types and the block sizes
 * as nested aliases / static constants, holds three const dtypes, and declares
 * a constructor plus the filter / input / output transform routines. Nothing
 * is defined here.
 *
 * \param _src_ctype element type of the raw src and filter data (\c stype)
 * \param _dst_ctype element type of the final output (\c dst_type)
 * \param _in_flt_ctype element type of the transformed input and filter
 * \param _out_ctype element type of the output transform buffer and the bias
 * \param _out_block output block size, \c m
 * \param _kern kernel size of the convolution, \c r
 * \param _ic_block value of \c IC_BLOCK_SIZE
 * \param _oc_block value of \c OC_BLOCK_SIZE
 * \param _cls name of the class to declare
 */
#define MEGDNN_REG_WINOGRAD_STRATEGY(_src_ctype, _dst_ctype, _in_flt_ctype,    \
                                     _out_ctype, _out_block, _kern, _ic_block, \
                                     _oc_block, _cls)                          \
    class _cls {                                                               \
    public:                                                                    \
        /* element types */                                                    \
        using stype = _src_ctype;                                              \
        using dst_type = _dst_ctype;                                           \
        using input_filter_compute_type = _in_flt_ctype;                       \
        using output_compute_type = _out_ctype;                                \
        /* m: output block size; r: kernel size; alpha = m + r - 1 */          \
        constexpr static size_t OUTPUT_BLOCK_SIZE = _out_block;                \
        constexpr static size_t KERNEL_SIZE = _kern;                           \
        constexpr static size_t ALPHA = OUTPUT_BLOCK_SIZE + KERNEL_SIZE - 1;   \
        /* channel block sizes */                                              \
        constexpr static size_t IC_BLOCK_SIZE = _ic_block;                     \
        constexpr static size_t OC_BLOCK_SIZE = _oc_block;                     \
        /* dtypes handed to the constructor */                                 \
        const DType src_dtype;                                                 \
        const DType filter_dtype;                                              \
        const DType dst_dtype;                                                 \
        _cls(DType src_dtype, DType filter_dtype, DType dst_dtype);            \
        /* filter transform */                                                 \
        void filter(const stype* filter,                                       \
                    input_filter_compute_type* filter_transform_buf,           \
                    input_filter_compute_type* transform_mid_buf,              \
                    size_t OC, size_t IC, size_t oc_start, size_t oc_end);     \
        /* input transform */                                                  \
        void input(const stype* input,                                         \
                   input_filter_compute_type* input_transform_buf,             \
                   input_filter_compute_type* transform_mid_buf,               \
                   size_t IH, size_t IW, size_t IC, size_t PH, size_t PW,      \
                   size_t unit_start_idx, size_t nr_tiles_in_unit);            \
        /* output transform */                                                 \
        void output(const output_compute_type* output_transform_buf,           \
                    const output_compute_type* bias, dst_type* output,         \
                    output_compute_type* transform_mid_buf, BiasMode bmode,    \
                    NonlineMode nonline_mode, size_t OH, size_t OW,            \
                    size_t oc_start, size_t oc_end, size_t unit_start_idx,     \
                    size_t nr_tiles_in_unit);                                  \
    };

/**
 * \brief Out-of-class definitions for a class declared with
 * MEGDNN_REG_WINOGRAD_STRATEGY.
 *
 * Emits the namespace-scope definitions of the five static constexpr members
 * of \p _cls and defines its constructor, which copies the three dtypes into
 * \c src_dtype, \c filter_dtype and \c dst_dtype.
 *
 * \param _cls name of the strategy class
 */
#define MEGDNN_REG_WINOGRAD_STRATEGY_IMPL(_cls)                \
    constexpr size_t _cls::OUTPUT_BLOCK_SIZE;                  \
    constexpr size_t _cls::KERNEL_SIZE;                        \
    constexpr size_t _cls::ALPHA;                              \
    constexpr size_t _cls::IC_BLOCK_SIZE;                      \
    constexpr size_t _cls::OC_BLOCK_SIZE;                      \
    _cls::_cls(DType src_dt, DType filter_dt, DType dst_dt)    \
            : src_dtype(src_dt),                               \
              filter_dtype(filter_dt),                         \
              dst_dtype(dst_dt) {}

/**
 * \brief Statements forming the body of one winograd algo member function.
 *
 * The expansion marks \c param as used and then, inside a
 * MIDOUT_BEGIN / MIDOUT_END region, constructs a \p _strategy from
 * \c param.src_type, \c param.filter_type and \c param.dst_type, builds a
 * megdnn::winograd::ConvBias<_strategy, _matmul_format> from
 * (strategy, m_tile_size, param) and returns the result of calling
 * \p _fun(param, m_matmul_algo) on it.
 *
 * The names \c param, \c m_tile_size and \c m_matmul_algo are not macro
 * parameters; they must be visible where the macro is expanded.
 *
 * The midout tag is built from the stringified \p _class and \p _fun with
 * \c _hash pasted onto the latter (presumably forming a \c _hash user-defined
 * literal - confirm against midout.h).
 *
 * There is no return statement after MIDOUT_END(), so the enclosing function
 * has to provide its own fallback return after the expansion.
 *
 * \param _class algo class name, only used for the midout tag
 * \param _fun name of the ConvBias member function to call
 * \param _strategy strategy class to instantiate
 * \param _midout_flag first argument of MIDOUT_BEGIN
 * \param _matmul_format second template argument of ConvBias
 */
#define MEGDNN_WINOGRADS_ALGO_FUN_DEFINE(_class, _fun, _strategy,              \
                                         _midout_flag, _matmul_format)         \
    MEGDNN_MARK_USED_VAR(param);                                               \
    MIDOUT_BEGIN(_midout_flag, midout_iv(#_class #_fun##_hash)) {              \
        _strategy strategy(param.src_type, param.filter_type, param.dst_type); \
        return megdnn::winograd::ConvBias<_strategy, _matmul_format>(          \
                       strategy, m_tile_size, param)                           \
                ._fun(param, m_matmul_algo);                                   \
    }                                                                          \
    MIDOUT_END();

/**
 * \brief Define the five ConvBiasImpl::_algo member functions that forward to
 * megdnn::winograd::ConvBias.
 *
 * Every function body is an expansion of MEGDNN_WINOGRADS_ALGO_FUN_DEFINE
 * followed by a fallback return (0 or an empty value).
 *
 * \param _algo algo class nested in ConvBiasImpl
 * \param _strategy_cls strategy class forwarded to the inner macro
 * \param _midout_tag midout flag forwarded to the inner macro
 * \param _mm_format matmul format forwarded to the inner macro
 */
#define MEGDNN_WINOGRAD_ALGO_FUN_DEFINE_ALL(_algo, _strategy_cls, _midout_tag, \
                                            _mm_format)                        \
    /* forwards to ConvBias::get_workspace_size */                             \
    size_t ConvBiasImpl::_algo::get_workspace(                                 \
            fallback::ConvBiasImpl*, const NCBKernSizeParam& param) const {    \
        MEGDNN_WINOGRADS_ALGO_FUN_DEFINE(_algo, get_workspace_size,            \
                                         _strategy_cls, _midout_tag,           \
                                         _mm_format);                          \
        return 0;                                                              \
    }                                                                          \
    /* forwards to ConvBias::get_preprocess_workspace_size */                  \
    size_t ConvBiasImpl::_algo::get_preprocess_workspace(                      \
            fallback::ConvBiasImpl*, const NCBKernSizeParam& param) const {    \
        MEGDNN_WINOGRADS_ALGO_FUN_DEFINE(_algo, get_preprocess_workspace_size, \
                                         _strategy_cls, _midout_tag,           \
                                         _mm_format);                          \
        return 0;                                                              \
    }                                                                          \
    /* forwards to ConvBias::deduce_preprocessed_filter_layout */              \
    SmallVector<TensorLayout>                                                  \
    ConvBiasImpl::_algo::deduce_preprocessed_filter_layout(                    \
            fallback::ConvBiasImpl*, const NCBKernSizeParam& param) const {    \
        MEGDNN_WINOGRADS_ALGO_FUN_DEFINE(                                      \
                _algo, deduce_preprocessed_filter_layout, _strategy_cls,       \
                _midout_tag, _mm_format);                                      \
        return {};                                                             \
    }                                                                          \
    /* forwards to ConvBias::get_preprocess_kerns */                           \
    SmallVector<ConvBiasImpl::NCBKern>                                         \
    ConvBiasImpl::_algo::dispatch_preprocess_kerns(                            \
            fallback::ConvBiasImpl*, const NCBKernSizeParam& param) const {    \
        MEGDNN_WINOGRADS_ALGO_FUN_DEFINE(_algo, get_preprocess_kerns,          \
                                         _strategy_cls, _midout_tag,           \
                                         _mm_format);                          \
        return {};                                                             \
    }                                                                          \
    /* forwards to ConvBias::get_kerns */                                      \
    SmallVector<ConvBiasImpl::NCBKern>                                         \
    ConvBiasImpl::_algo::dispatch_kerns(                                       \
            fallback::ConvBiasImpl*, const NCBKernSizeParam& param) const {    \
        MEGDNN_WINOGRADS_ALGO_FUN_DEFINE(_algo, get_kerns, _strategy_cls,      \
                                         _midout_tag, _mm_format);             \
        return {};                                                             \
    }

// vim: syntax=cpp.doxygen