/**
 * \file dnn/src/arm_common/conv_bias/fp32/algos.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied.
 */

#include "src/arm_common/conv_bias/fp32/algos.h"
#include "src/arm_common/conv_bias/direct/multi_thread_common.h"
#include "src/arm_common/conv_bias/fp32/direct.h"
#include "src/arm_common/conv_bias/fp32/do_conv_stride1.h"
#include "src/arm_common/conv_bias/fp32/do_conv_stride2.h"
#include "src/arm_common/conv_bias/fp32/strategy.h"
#include "src/arm_common/conv_bias/img2col_helper.h"
#include "src/arm_common/conv_bias/postprocess_helper.h"
#include "src/common/opr_delegate.h"
#include "src/fallback/conv_bias/common.h"

#include "midout.h"

//! Midout tag referenced by the MIDOUT_BEGIN regions of the Winograd
//! algorithms in this file.
MIDOUT_DECL(megdnn_arm_common_winograd_fp32)

//! File-wide using-directives so names from these namespaces can be written
//! unqualified below.
using namespace megdnn;
using namespace arm_common;

/* ======================= AlgoFP32WinogradF23_4x4 ======================== */

bool ConvBiasImpl::AlgoFP32WinogradF23_4x4::usable(
34
        const NCBKernSizeParam& param,
35 36 37 38 39 40
        AlgoSelectionStrategy /*algo_selection_strategy*/) const {
    MEGDNN_MARK_USED_VAR(param);
    MIDOUT_BEGIN(megdnn_arm_common_winograd_fp32, 0, 0) {
        if (param.filter_meta.icpg % 4 != 0 || param.filter_meta.ocpg % 4 != 0)
            return false;
        using Strategy = winograd::winograd_2x3_4x4_f;
41
        using PackMode = fallback::MatrixMulImpl::AlgoBase::PackMode;
42 43 44 45
        Strategy strategy(param.src_type, param.filter_type, param.dst_type);
        auto&& matmul_param =
                megdnn::winograd::ConvBias<Strategy,
                                           param::MatrixMul::Format::MK4>(
46
                        strategy, m_tile_size, param)
47 48
                        .get_matmul_kern_param(param);
        return m_matmul_algo->usable(matmul_param) &&
49
               m_matmul_algo->packmode() == PackMode::NO_PACK &&
50 51
               (param.filter_meta.format == param::ConvBias::Format::NCHW ||
                (param.filter_meta.format ==
52
                         param::ConvBias::Format::NCHW_WINOGRAD &&
53
                 param.output_block_size == 2 &&
54 55
                 param.winograd_matmul_format ==
                         param::MatrixMul::Format::MK4)) &&
56
               !param.filter_meta.should_flip &&
57 58 59 60 61 62 63 64 65 66 67 68 69 70
               (param.filter_meta.spatial[0] == param.filter_meta.spatial[1] &&
                param.filter_meta.spatial[0] == 3) &&
               (param.filter_meta.stride[0] == param.filter_meta.stride[1] &&
                param.filter_meta.stride[0] == 1) &&
               (param.filter_meta.dilation[0] ==
                        param.filter_meta.dilation[1] &&
                param.filter_meta.dilation[0] == 1) &&
               param.compute_mode == param::ConvBias::ComputeMode::DEFAULT &&
               param.src_type.enumv() == DTypeEnum::Float32;
    }
    MIDOUT_END();
    return false;
}

71 72 73 74
MEGDNN_WINOGRAD_ALGO_FUN_DEFINE_ALL(AlgoFP32WinogradF23_4x4,
                                    winograd::winograd_2x3_4x4_f,
                                    megdnn_arm_common_winograd_fp32,
                                    param::MatrixMul::Format::MK4);
75 76 77 78

/* ======================= AlgoFP32WinogradF63 ======================== */

bool ConvBiasImpl::AlgoFP32WinogradF63::usable(
79
        const NCBKernSizeParam& param,
80 81 82 83 84
        AlgoSelectionStrategy /*algo_selection_strategy*/) const {
    MEGDNN_MARK_USED_VAR(param);
    MIDOUT_BEGIN(megdnn_arm_common_winograd_fp32, 1, 0) {
        using Strategy = winograd::winograd_6x3_1x1_f;
        Strategy strategy(param.src_type, param.filter_type, param.dst_type);
85 86 87
        auto&& matmul_param = megdnn::winograd::ConvBias<Strategy>(
                                      strategy, m_tile_size, param)
                                      .get_matmul_kern_param(param);
88
        return m_matmul_algo->usable(matmul_param) &&
89 90
               (param.filter_meta.format == param::ConvBias::Format::NCHW ||
                (param.filter_meta.format ==
91
                         param::ConvBias::Format::NCHW_WINOGRAD &&
92
                 param.output_block_size == 6 &&
93 94
                 param.winograd_matmul_format ==
                         param::MatrixMul::Format::DEFAULT)) &&
95
               !param.filter_meta.should_flip &&
96 97 98 99 100 101 102 103 104 105 106 107 108 109
               (param.filter_meta.spatial[0] == param.filter_meta.spatial[1] &&
                param.filter_meta.spatial[0] == 3) &&
               (param.filter_meta.stride[0] == param.filter_meta.stride[1] &&
                param.filter_meta.stride[0] == 1) &&
               (param.filter_meta.dilation[0] ==
                        param.filter_meta.dilation[1] &&
                param.filter_meta.dilation[0] == 1) &&
               param.compute_mode == param::ConvBias::ComputeMode::DEFAULT &&
               param.src_type.enumv() == DTypeEnum::Float32;
    }
    MIDOUT_END();
    return false;
}

110 111 112 113
MEGDNN_WINOGRAD_ALGO_FUN_DEFINE_ALL(AlgoFP32WinogradF63,
                                    winograd::winograd_6x3_1x1_f,
                                    megdnn_arm_common_winograd_fp32,
                                    param::MatrixMul::Format::DEFAULT);
114 115 116 117

/* ======================= AlgoFP32WinogradF54 ======================== */

bool ConvBiasImpl::AlgoFP32WinogradF54::usable(
118
        const NCBKernSizeParam& param,
119 120 121 122 123
        AlgoSelectionStrategy /*algo_selection_strategy*/) const {
    MEGDNN_MARK_USED_VAR(param);
    MIDOUT_BEGIN(megdnn_arm_common_winograd_fp32, 2, 0) {
        using Strategy = winograd::winograd_5x4_1x1_f;
        Strategy strategy(param.src_type, param.filter_type, param.dst_type);
124 125 126
        auto&& matmul_param = megdnn::winograd::ConvBias<Strategy>(
                                      strategy, m_tile_size, param)
                                      .get_matmul_kern_param(param);
127
        return m_matmul_algo->usable(matmul_param) &&
128 129
               (param.filter_meta.format == param::ConvBias::Format::NCHW ||
                (param.filter_meta.format ==
130
                         param::ConvBias::Format::NCHW_WINOGRAD &&
131
                 param.output_block_size == 5 &&
132 133
                 param.winograd_matmul_format ==
                         param::MatrixMul::Format::DEFAULT)) &&
134
               !param.filter_meta.should_flip &&
135 136 137 138 139 140 141 142 143 144 145 146 147 148
               (param.filter_meta.spatial[0] == param.filter_meta.spatial[1] &&
                param.filter_meta.spatial[0] == 4) &&
               (param.filter_meta.stride[0] == param.filter_meta.stride[1] &&
                param.filter_meta.stride[0] == 1) &&
               (param.filter_meta.dilation[0] ==
                        param.filter_meta.dilation[1] &&
                param.filter_meta.dilation[0] == 1) &&
               param.compute_mode == param::ConvBias::ComputeMode::DEFAULT &&
               param.src_type.enumv() == DTypeEnum::Float32;
    }
    MIDOUT_END();
    return false;
}

149 150 151 152
MEGDNN_WINOGRAD_ALGO_FUN_DEFINE_ALL(AlgoFP32WinogradF54,
                                    winograd::winograd_5x4_1x1_f,
                                    megdnn_arm_common_winograd_fp32,
                                    param::MatrixMul::Format::DEFAULT);
153 154 155 156

/* ======================= AlgoFP32WinogradF45 ======================== */

bool ConvBiasImpl::AlgoFP32WinogradF45::usable(
157
        const NCBKernSizeParam& param,
158 159 160 161 162
        AlgoSelectionStrategy /*algo_selection_strategy*/) const {
    MEGDNN_MARK_USED_VAR(param);
    MIDOUT_BEGIN(megdnn_arm_common_winograd_fp32, 3, 0) {
        using Strategy = winograd::winograd_4x5_1x1_f;
        Strategy strategy(param.src_type, param.filter_type, param.dst_type);
163 164 165
        auto&& matmul_param = megdnn::winograd::ConvBias<Strategy>(
                                      strategy, m_tile_size, param)
                                      .get_matmul_kern_param(param);
166
        return m_matmul_algo->usable(matmul_param) &&
167 168
               (param.filter_meta.format == param::ConvBias::Format::NCHW ||
                (param.filter_meta.format ==
169
                         param::ConvBias::Format::NCHW_WINOGRAD &&
170
                 param.output_block_size == 4 &&
171 172
                 param.winograd_matmul_format ==
                         param::MatrixMul::Format::DEFAULT)) &&
173
               !param.filter_meta.should_flip &&
174 175 176 177 178 179 180 181 182 183 184 185 186 187
               (param.filter_meta.spatial[0] == param.filter_meta.spatial[1] &&
                param.filter_meta.spatial[0] == 5) &&
               (param.filter_meta.stride[0] == param.filter_meta.stride[1] &&
                param.filter_meta.stride[0] == 1) &&
               (param.filter_meta.dilation[0] ==
                        param.filter_meta.dilation[1] &&
                param.filter_meta.dilation[0] == 1) &&
               param.compute_mode == param::ConvBias::ComputeMode::DEFAULT &&
               param.src_type.enumv() == DTypeEnum::Float32;
    }
    MIDOUT_END();
    return false;
}

188 189 190 191
MEGDNN_WINOGRAD_ALGO_FUN_DEFINE_ALL(AlgoFP32WinogradF45,
                                    winograd::winograd_4x5_1x1_f,
                                    megdnn_arm_common_winograd_fp32,
                                    param::MatrixMul::Format::DEFAULT);
192 193 194 195

/* ======================= AlgoFP32WinogradF63_4x4 ======================== */

bool ConvBiasImpl::AlgoFP32WinogradF63_4x4::usable(
196
        const NCBKernSizeParam& param,
197 198 199 200 201 202
        AlgoSelectionStrategy /*algo_selection_strategy*/) const {
    MEGDNN_MARK_USED_VAR(param);
    MIDOUT_BEGIN(megdnn_arm_common_winograd_fp32, 4, 0) {
        if (param.filter_meta.icpg % 4 != 0 || param.filter_meta.ocpg % 4 != 0)
            return false;
        using Strategy = winograd::winograd_6x3_4x4_f;
203
        using PackMode = fallback::MatrixMulImpl::AlgoBase::PackMode;
204 205 206 207
        Strategy strategy(param.src_type, param.filter_type, param.dst_type);
        auto&& matmul_param =
                megdnn::winograd::ConvBias<Strategy,
                                           param::MatrixMul::Format::MK4>(
208
                        strategy, m_tile_size, param)
209 210
                        .get_matmul_kern_param(param);
        return m_matmul_algo->usable(matmul_param) &&
211
               m_matmul_algo->packmode() == PackMode::NO_PACK &&
212 213
               (param.filter_meta.format == param::ConvBias::Format::NCHW ||
                (param.filter_meta.format ==
214
                         param::ConvBias::Format::NCHW_WINOGRAD &&
215
                 param.output_block_size == 6 &&
216 217
                 param.winograd_matmul_format ==
                         param::MatrixMul::Format::MK4)) &&
218
               !param.filter_meta.should_flip &&
219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234
               (param.filter_meta.spatial[0] == param.filter_meta.spatial[1] &&
                param.filter_meta.spatial[0] == 3) &&
               (param.filter_meta.stride[0] == param.filter_meta.stride[1] &&
                param.filter_meta.stride[0] == 1) &&
               (param.filter_meta.dilation[0] ==
                        param.filter_meta.dilation[1] &&
                param.filter_meta.dilation[0] == 1) &&
               param.compute_mode == param::ConvBias::ComputeMode::DEFAULT &&
               param.src_type.enumv() == DTypeEnum::Float32 &&
               param.filter_meta.icpg % 4 == 0 &&
               param.filter_meta.ocpg % 4 == 0;
    }
    MIDOUT_END();
    return false;
}

235 236 237 238
MEGDNN_WINOGRAD_ALGO_FUN_DEFINE_ALL(AlgoFP32WinogradF63_4x4,
                                    winograd::winograd_6x3_4x4_f,
                                    megdnn_arm_common_winograd_fp32,
                                    param::MatrixMul::Format::MK4);
239 240 241 242

/* =================== AlgoFP32WinogradF23_4x4_NCHW44 =================== */

bool ConvBiasImpl::AlgoFP32WinogradF23_4x4_NCHW44::usable(
243
        const NCBKernSizeParam& param,
244 245 246 247 248 249 250 251 252 253 254
        AlgoSelectionStrategy /*algo_selection_strategy*/) const {
    MEGDNN_MARK_USED_VAR(param);
    MIDOUT_BEGIN(megdnn_arm_common_winograd_fp32,
                 midout_iv("AlgoFP32WinogradF23_4x4_NCHW44"_hash)) {
        if (param.filter_meta.icpg % 4 != 0 || param.filter_meta.ocpg % 4 != 0)
            return false;
        using Strategy = winograd::winograd_F23_mk4_f_nchw44;
        Strategy strategy(param.src_type, param.filter_type, param.dst_type);
        auto&& matmul_param =
                megdnn::winograd::ConvBias<Strategy,
                                           param::MatrixMul::Format::MK4>(
255
                        strategy, m_tile_size, param)
256 257 258 259
                        .get_matmul_kern_param(param);
        return m_matmul_algo->usable(matmul_param) &&
               m_matmul_algo->packmode() ==
                       fallback::MatrixMulImpl::AlgoBase::PackMode::NO_PACK &&
260 261
               (param.filter_meta.format == param::ConvBias::Format::NCHW44 ||
                (param.filter_meta.format ==
262
                         param::ConvBias::Format::NCHW44_WINOGRAD &&
263
                 param.output_block_size == 2 &&
264 265
                 param.winograd_matmul_format ==
                         param::MatrixMul::Format::MK4)) &&
266
               !param.filter_meta.should_flip &&
267 268 269 270 271 272 273 274 275 276 277 278 279 280
               (param.filter_meta.spatial[0] == param.filter_meta.spatial[1] &&
                param.filter_meta.spatial[0] == 3) &&
               (param.filter_meta.stride[0] == param.filter_meta.stride[1] &&
                param.filter_meta.stride[0] == 1) &&
               (param.filter_meta.dilation[0] ==
                        param.filter_meta.dilation[1] &&
                param.filter_meta.dilation[0] == 1) &&
               param.compute_mode == param::ConvBias::ComputeMode::DEFAULT &&
               param.src_type.enumv() == DTypeEnum::Float32;
    }
    MIDOUT_END();
    return false;
}

281 282 283 284
MEGDNN_WINOGRAD_ALGO_FUN_DEFINE_ALL(AlgoFP32WinogradF23_4x4_NCHW44,
                                    winograd::winograd_F23_mk4_f_nchw44,
                                    megdnn_arm_common_winograd_fp32,
                                    param::MatrixMul::Format::MK4);
285 286 287 288

/* =================== AlgoFP32WinogradF63_4x4_NCHW44 ===================== */

bool ConvBiasImpl::AlgoFP32WinogradF63_4x4_NCHW44::usable(
289
        const NCBKernSizeParam& param,
290 291 292 293 294 295 296 297 298 299 300
        AlgoSelectionStrategy /*algo_selection_strategy*/) const {
    MEGDNN_MARK_USED_VAR(param);
    MIDOUT_BEGIN(megdnn_arm_common_winograd_fp32,
                 midout_iv("AlgoFP32WinogradF63_4x4_NCHW44"_hash)) {
        if (param.filter_meta.icpg % 4 != 0 || param.filter_meta.ocpg % 4 != 0)
            return false;
        using Strategy = winograd::winograd_F63_mk4_f_nchw44;
        Strategy strategy(param.src_type, param.filter_type, param.dst_type);
        auto&& matmul_param =
                megdnn::winograd::ConvBias<Strategy,
                                           param::MatrixMul::Format::MK4>(
301
                        strategy, m_tile_size, param)
302 303 304 305
                        .get_matmul_kern_param(param);
        return m_matmul_algo->usable(matmul_param) &&
               m_matmul_algo->packmode() ==
                       fallback::MatrixMulImpl::AlgoBase::PackMode::NO_PACK &&
306 307
               (param.filter_meta.format == param::ConvBias::Format::NCHW44 ||
                (param.filter_meta.format ==
308
                         param::ConvBias::Format::NCHW44_WINOGRAD &&
309
                 param.output_block_size == 6 &&
310 311
                 param.winograd_matmul_format ==
                         param::MatrixMul::Format::MK4)) &&
312
               !param.filter_meta.should_flip &&
313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328
               (param.filter_meta.spatial[0] == param.filter_meta.spatial[1] &&
                param.filter_meta.spatial[0] == 3) &&
               (param.filter_meta.stride[0] == param.filter_meta.stride[1] &&
                param.filter_meta.stride[0] == 1) &&
               (param.filter_meta.dilation[0] ==
                        param.filter_meta.dilation[1] &&
                param.filter_meta.dilation[0] == 1) &&
               param.compute_mode == param::ConvBias::ComputeMode::DEFAULT &&
               param.src_type.enumv() == DTypeEnum::Float32 &&
               param.filter_meta.icpg % 4 == 0 &&
               param.filter_meta.ocpg % 4 == 0;
    }
    MIDOUT_END();
    return false;
}

329 330 331 332
MEGDNN_WINOGRAD_ALGO_FUN_DEFINE_ALL(AlgoFP32WinogradF63_4x4_NCHW44,
                                    winograd::winograd_F63_mk4_f_nchw44,
                                    megdnn_arm_common_winograd_fp32,
                                    param::MatrixMul::Format::MK4);
333 334 335 336 337

/* ===================== direct algo ===================== */
//! Midout tag referenced by the MIDOUT_BEGIN regions of the direct, stride-1
//! and stride-2 fp32 algorithms below.
MIDOUT_DECL(megdnn_arm_common_conv_bias_f32_kimpl);

bool ConvBiasImpl::AlgoF32Direct::usable(
338
        const NCBKernSizeParam& param,
339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366
        AlgoSelectionStrategy algo_selection_strategy) const {
    MIDOUT_BEGIN(megdnn_arm_common_conv_bias_f32_kimpl, 0, 0) {
        auto&& fm = param.filter_meta;
        auto FH = fm.spatial[0];
        auto SH = fm.stride[0], SW = fm.stride[1];
        // the condition ``param.isz[0]*param.isz[1] >= 4'' and
        // ``param.osz[0]*param.osz[1] >= 4'' comes from the fact that the
        // kernel may have access to up to 4 floats after the end of the memory
        // chunk.
        bool aviliable = fm.format == param::ConvBias::Format::NCHW &&
                         param.src_type.enumv() == DTypeEnum::Float32 &&
                         param.filter_type.enumv() == DTypeEnum::Float32 &&
                         param.dst_type.enumv() == DTypeEnum::Float32 &&
                         fm.spatial_ndim == 2 && fm.dilation[0] == 1 &&
                         fm.dilation[1] == 1 &&
                         param.isz[0] * param.isz[1] >= 4 &&
                         param.osz[0] * param.osz[1] >= 4 && FH <= 7 &&
                         SH == 1 && SW == 1;
        if (algo_selection_strategy == AlgoSelectionStrategy::HEURISTIC) {
            bool large_group = param.filter_meta.group >= param.nr_threads;
            aviliable &= (large_group == m_large_group);
        }
        return aviliable;
    }
    MIDOUT_END();
    return false;
}
size_t ConvBiasImpl::AlgoF32Direct::get_workspace(
367
        const NCBKernSizeParam& param) const {
368 369 370 371 372 373 374 375 376 377 378 379 380 381 382
    MIDOUT_BEGIN(megdnn_arm_common_conv_bias_f32_kimpl, 0, 1) {
        auto wbundle = MultithreadDirectConvCommon<float, float>::get_bundle(
                param, m_large_group);
        return wbundle.total_size_in_bytes();
    }
    MIDOUT_END();
    return 0;
}
/**
 * \brief Build the kernel list for the direct fp32 convolution.
 *
 * With m_large_group set, a single kernel over {group, N, 1} is emitted. Each
 * invocation flips all OC filters (only when should_flip), pads all IC input
 * channels and then convolves all OC output channels, passing
 * {thread_id, 0, channel} where the split kernels pass ncb_index.ndrange_id.
 *
 * Otherwise the steps become separate kernels: weight flip over
 * {group, 1, OC} (only when should_flip), padding over {group, N, IC} and
 * convolution over {group, N, OC}.
 *
 * Every kernel owns a copy of the workspace bundle and binds it to
 * kern_param.workspace_ptr before use.
 */
SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoF32Direct::get_kimpls(
        const NCBKernSizeParam& param) const {
    using Common = MultithreadDirectConvCommon<float, float>;
    const auto& fm = param.filter_meta;
    const size_t N = param.n;
    const size_t IC = fm.icpg;
    const size_t OC = fm.ocpg;
    const size_t group = fm.group;
    //! kept non-const: the lambdas copy it and call set() on their copy
    WorkspaceBundle bundle = Common::get_bundle(param, m_large_group);
    SmallVector<NCBKern> ret_kerns;
    //! When group >= nr_threads, treat it as large_group, each thread process
    //! one group for better performance
    if (m_large_group) {
        //! Channel wise conv and big groups
        auto exec_one_group = [bundle](const NCBKernParam& kern_param,
                                       const NCBKernIndex& ncb_index) mutable {
            const auto& meta = kern_param.filter_meta;
            const size_t nr_ic = meta.icpg;
            const size_t nr_oc = meta.ocpg;
            bundle.set(kern_param.workspace_ptr);
            if (meta.should_flip) {
                for (size_t oc = 0; oc < nr_oc; ++oc) {
                    Common::weight_flip_kern(bundle, kern_param, ncb_index,
                                             {ncb_index.thread_id, 0, oc});
                }
            }
            for (size_t ic = 0; ic < nr_ic; ++ic) {
                Common::copy_padding_kern(bundle, kern_param, ncb_index,
                                          {ncb_index.thread_id, 0, ic});
            }
            for (size_t oc = 0; oc < nr_oc; ++oc) {
                Common::do_conv_kern(bundle, kern_param, ncb_index,
                                     fp32::conv_bias::kern_direct,
                                     {ncb_index.thread_id, 0, oc});
            }
        };
        ret_kerns.push_back({exec_one_group, {group, N, 1_z}});
        return ret_kerns;
    }

    if (fm.should_flip) {
        auto weight_flip = [bundle](const NCBKernParam& kern_param,
                                    const NCBKernIndex& ncb_index) mutable {
            bundle.set(kern_param.workspace_ptr);
            Common::weight_flip_kern(bundle, kern_param, ncb_index,
                                     ncb_index.ndrange_id);
        };
        ret_kerns.push_back({weight_flip, {group, 1_z, OC}});
    }
    auto copy_padding = [bundle](const NCBKernParam& kern_param,
                                 const NCBKernIndex& ncb_index) mutable {
        bundle.set(kern_param.workspace_ptr);
        Common::copy_padding_kern(bundle, kern_param, ncb_index,
                                  ncb_index.ndrange_id);
    };
    ret_kerns.push_back({copy_padding, {group, N, IC}});
    auto do_conv = [bundle](const NCBKernParam& kern_param,
                            const NCBKernIndex& ncb_index) mutable {
        bundle.set(kern_param.workspace_ptr);
        Common::do_conv_kern(bundle, kern_param, ncb_index,
                             fp32::conv_bias::kern_direct,
                             ncb_index.ndrange_id);
    };
    ret_kerns.push_back({do_conv, {group, N, OC}});
    return ret_kerns;
}

/**
 * \brief Return the kernels built by get_kimpls() for \p param.
 *
 * The empty list is only returned when the MIDOUT region body is not
 * executed.
 * NOTE(review): this region uses midout tag (0, 1), the same tag as
 * AlgoF32Direct::get_workspace -- confirm whether a distinct tag was intended.
 */
SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoF32Direct::dispatch_kerns(
        const NCBKernSizeParam& param) const {
    MIDOUT_BEGIN(megdnn_arm_common_conv_bias_f32_kimpl, 0, 1) {
        return get_kimpls(param);
    }
    MIDOUT_END();
    return {};
}
/* ===================== stride-1 algo ===================== */
bool ConvBiasImpl::AlgoF32DirectStride1::usable(
456
        const NCBKernSizeParam& param,
457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481
        AlgoSelectionStrategy algo_selection_strategy) const {
    MIDOUT_BEGIN(megdnn_arm_common_conv_bias_f32_kimpl, 1, 1) {
        auto&& fm = param.filter_meta;
        auto FH = fm.spatial[0];
        bool aviliable =
                param.filter_meta.format == param::ConvBias::Format::NCHW &&
                param.src_type.enumv() == DTypeEnum::Float32 &&
                param.filter_type.enumv() == DTypeEnum::Float32 &&
                param.dst_type.enumv() == DTypeEnum::Float32 &&
                !fm.should_flip && fm.spatial_ndim == 2 &&
                fm.dilation[0] == 1 && fm.dilation[1] == 1 &&
                fm.stride[0] == 1 && fm.stride[1] == 1 && FH == fm.spatial[1] &&
                (FH == 2 || FH == 3 || FH == 5 || FH == 7);
        if (algo_selection_strategy ==
            ConvBiasImpl::AlgoSelectionStrategy::HEURISTIC) {
            bool large_group = param.filter_meta.group >= param.nr_threads;
            aviliable &= (large_group == m_large_group);
        }
        return aviliable;
    }
    MIDOUT_END();
    return false;
}

size_t ConvBiasImpl::AlgoF32DirectStride1::get_workspace(
482
        const NCBKernSizeParam& param) const {
483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522
    MIDOUT_BEGIN(megdnn_arm_common_conv_bias_f32_kimpl, 1, 1) {
        auto bundle =
                MultithreadDirectConvCommon<float, float>::get_bundle_stride(
                        param, m_large_group);
        return bundle.total_size_in_bytes();
    }
    MIDOUT_END();
    return 0;
}

/**
 * \brief Build the kernel list for the stride-1 direct fp32 convolution.
 *
 * The convolution routine is chosen from the filter height FH (2, 3, 5 or 7).
 * With m_large_group set, a single kernel over {group, N, 1} pads all IC
 * input channels and then convolves all OC output channels, passing
 * {thread_id, 0, channel} where the split kernels pass ncb_index.ndrange_id.
 * Otherwise padding over {group, N, IC} and convolution over {group, N, OC}
 * are emitted as two kernels.
 *
 * Every kernel owns a copy of the workspace bundle and binds it to
 * kern_param.workspace_ptr before use.
 */
SmallVector<ConvBiasImpl::NCBKern>
ConvBiasImpl::AlgoF32DirectStride1::get_kimpls(
        const NCBKernSizeParam& param) const {
    using Common = MultithreadDirectConvCommon<float, float>;
    using Func = std::function<void(const float*, const float*, float*, size_t,
                                    size_t, size_t, size_t, size_t)>;
    const auto& fm = param.filter_meta;
    auto FH = fm.spatial[0];
    const size_t N = param.n;
    const size_t IC = fm.icpg;
    const size_t OC = fm.ocpg;
    const size_t group = fm.group;
    //! stays empty unless FH is 2, 3, 5 or 7 -- the sizes usable() accepts
    Func conv_kern_function = nullptr;

#define SWITCH_KERN_STR1()                                                \
    switch (FH) {                                                         \
        case 2:                                                           \
            conv_kern_function = fp32::conv_stride1::do_conv_2x2_stride1; \
            break;                                                        \
        case 3:                                                           \
            conv_kern_function = fp32::conv_stride1::do_conv_3x3_stride1; \
            break;                                                        \
        case 5:                                                           \
            conv_kern_function = fp32::conv_stride1::do_conv_5x5_stride1; \
            break;                                                        \
        case 7:                                                           \
            conv_kern_function = fp32::conv_stride1::do_conv_7x7_stride1; \
            break;                                                        \
    }
    SWITCH_KERN_STR1();

    //! kept non-const: the lambdas copy it and call set() on their copy
    WorkspaceBundle bundle = Common::get_bundle_stride(param, m_large_group);
    SmallVector<NCBKern> ret_kerns;
    //! When group >= nr_threads, treat it as large_group, each thread process
    //! one group for better performance
    if (m_large_group) {
        //! Channel wise conv and big groups
        auto exec_one_group = [bundle, conv_kern_function](
                                      const NCBKernParam& kern_param,
                                      const NCBKernIndex& ncb_index) mutable {
            const auto& meta = kern_param.filter_meta;
            const size_t nr_ic = meta.icpg;
            const size_t nr_oc = meta.ocpg;
            bundle.set(kern_param.workspace_ptr);
            for (size_t ic = 0; ic < nr_ic; ++ic) {
                Common::copy_padding_kern_stride(
                        bundle, kern_param, ncb_index,
                        {ncb_index.thread_id, 0, ic});
            }
            for (size_t oc = 0; oc < nr_oc; ++oc) {
                Common::do_conv_kern_stride(bundle, kern_param, ncb_index,
                                            conv_kern_function,
                                            {ncb_index.thread_id, 0, oc});
            }
        };
        ret_kerns.push_back({exec_one_group, {group, N, 1_z}});
        return ret_kerns;
    }

    auto copy_padding = [bundle](const NCBKernParam& kern_param,
                                 const NCBKernIndex& ncb_index) mutable {
        bundle.set(kern_param.workspace_ptr);
        Common::copy_padding_kern_stride(bundle, kern_param, ncb_index,
                                         ncb_index.ndrange_id);
    };
    ret_kerns.push_back({copy_padding, {group, N, IC}});
    auto do_conv = [bundle, conv_kern_function](
                           const NCBKernParam& kern_param,
                           const NCBKernIndex& ncb_index) mutable {
        bundle.set(kern_param.workspace_ptr);
        Common::do_conv_kern_stride(bundle, kern_param, ncb_index,
                                    conv_kern_function, ncb_index.ndrange_id);
    };
    ret_kerns.push_back({do_conv, {group, N, OC}});
    return ret_kerns;
}

/**
 * \brief Return the kernels built by get_kimpls() for \p param.
 *
 * The empty list is only returned when the MIDOUT region body is not
 * executed.
 */
SmallVector<ConvBiasImpl::NCBKern>
ConvBiasImpl::AlgoF32DirectStride1::dispatch_kerns(
        const NCBKernSizeParam& param) const {
    MIDOUT_BEGIN(megdnn_arm_common_conv_bias_f32_kimpl, 1, 2) {
        return get_kimpls(param);
    }
    MIDOUT_END();
    return {};
}

/* ===================== stride-2 algo ===================== */

bool ConvBiasImpl::AlgoF32DirectStride2::usable(
584
        const NCBKernSizeParam& param,
585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608
        AlgoSelectionStrategy algo_selection_strategy) const {
    MIDOUT_BEGIN(megdnn_arm_common_conv_bias_f32_kimpl, 2, 0) {
        auto&& fm = param.filter_meta;
        auto FH = fm.spatial[0];
        bool aviliable =
                param.filter_meta.format == param::ConvBias::Format::NCHW &&
                param.src_type.enumv() == DTypeEnum::Float32 &&
                param.filter_type.enumv() == DTypeEnum::Float32 &&
                param.dst_type.enumv() == DTypeEnum::Float32 &&
                !fm.should_flip && fm.spatial_ndim == 2 &&
                fm.dilation[0] == 1 && fm.dilation[1] == 1 &&
                fm.stride[0] == 2 && fm.stride[1] == 2 && FH == fm.spatial[1] &&
                (FH == 2 || FH == 3 || FH == 5 || FH == 7);
        if (algo_selection_strategy ==
            ConvBiasImpl::AlgoSelectionStrategy::HEURISTIC) {
            bool large_group = param.filter_meta.group >= param.nr_threads;
            aviliable &= (large_group == m_large_group);
        }
        return aviliable;
    }
    MIDOUT_END();
    return false;
}
size_t ConvBiasImpl::AlgoF32DirectStride2::get_workspace(
609
        const NCBKernSizeParam& param) const {
610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648
    MIDOUT_BEGIN(megdnn_arm_common_conv_bias_f32_kimpl, 2, 1) {
        auto bundle =
                MultithreadDirectConvCommon<float, float>::get_bundle_stride(
                        param, m_large_group);
        return bundle.total_size_in_bytes();
    }
    MIDOUT_END();
    return 0;
}
//! Build the list of kernels that execute the stride-2 direct fp32
//! convolution described by \p param.
//!
//! The convolution routine is chosen from the filter height (2, 3, 5 or 7).
//! Depending on m_large_group, either a single fused kernel (padding copy +
//! convolution) or two separate kernels (padding copy, then convolution) are
//! returned.
//!
//! \param param size description of the convolution
//! \return kernels together with their ND-ranges, in execution order
SmallVector<ConvBiasImpl::NCBKern>
ConvBiasImpl::AlgoF32DirectStride2::get_kimpls(
        const NCBKernSizeParam& param) const {
    const auto& fm = param.filter_meta;
    auto FH = fm.spatial[0];
    size_t N = param.n;
    size_t IC = fm.icpg;
    size_t OC = fm.ocpg;
    size_t group = fm.group;
    using Func = std::function<void(const float*, const float*, float*, size_t,
                                    size_t, size_t, size_t, size_t)>;
    Func conv_kern_function = nullptr;

    //! Select the convolution routine by filter height. For any other FH the
    //! function object stays empty; presumably the availability check of this
    //! algorithm only admits FH in {2, 3, 5, 7} -- confirm against usable().
    switch (FH) {
        case 2:
            conv_kern_function = fp32::conv_stride2::do_conv_2x2_stride2;
            break;
        case 3:
            conv_kern_function = fp32::conv_stride2::do_conv_3x3_stride2;
            break;
        case 5:
            conv_kern_function = fp32::conv_stride2::do_conv_5x5_stride2;
            break;
        case 7:
            conv_kern_function = fp32::conv_stride2::do_conv_7x7_stride2;
            break;
    }

    //! Same bundle layout as the one measured by get_workspace(). Each lambda
    //! captures its own copy and binds it to the runtime workspace pointer.
    WorkspaceBundle bundle =
            MultithreadDirectConvCommon<float, float>::get_bundle_stride(
                    param, m_large_group);
    SmallVector<NCBKern> ret_kerns;
    //! When group >= nr_threads, treat it as large_group: each thread
    //! processes one group for better performance.
    if (m_large_group) {
        //! Channel wise conv and big groups
        //! A single kernel over {group, N, 1}: each invocation first copies
        //! (pads) every input channel and then convolves every output channel.
        auto exec_one_group = [bundle, conv_kern_function](
                                      const NCBKernParam& kern_param,
                                      const NCBKernIndex& ncb_index) mutable {
            auto fm = kern_param.filter_meta;
            size_t IC = fm.icpg;
            size_t OC = fm.ocpg;
            bundle.set(kern_param.workspace_ptr);
            //! thread_id is passed as the leading workspace index here instead
            //! of the ndrange id (presumably selecting a per-thread buffer --
            //! confirm in MultithreadDirectConvCommon).
            for (size_t ic = 0; ic < IC; ic++) {
                MultithreadDirectConvCommon<float, float>::
                        copy_padding_kern_stride(bundle, kern_param, ncb_index,
                                                 {ncb_index.thread_id, 0, ic});
            }
            for (size_t oc = 0; oc < OC; oc++) {
                MultithreadDirectConvCommon<float, float>::do_conv_kern_stride(
                        bundle, kern_param, ncb_index, conv_kern_function,
                        {ncb_index.thread_id, 0, oc});
            }
        };
        ret_kerns.push_back({exec_one_group, {group, N, 1_z}});
    } else {
        //! Two kernels: the padding copy runs over {group, N, IC}, the
        //! convolution over {group, N, OC}; both use the ndrange id as the
        //! workspace index.
        auto copy_padding = [bundle](const NCBKernParam& kern_param,
                                     const NCBKernIndex& ncb_index) mutable {
            bundle.set(kern_param.workspace_ptr);
            MultithreadDirectConvCommon<float, float>::copy_padding_kern_stride(
                    bundle, kern_param, ncb_index, ncb_index.ndrange_id);
        };
        ret_kerns.push_back({copy_padding, {group, N, IC}});
        auto do_conv = [bundle, conv_kern_function](
                               const NCBKernParam& kern_param,
                               const NCBKernIndex& ncb_index) mutable {
            bundle.set(kern_param.workspace_ptr);
            MultithreadDirectConvCommon<float, float>::do_conv_kern_stride(
                    bundle, kern_param, ncb_index, conv_kern_function,
                    ncb_index.ndrange_id);
        };
        ret_kerns.push_back({do_conv, {group, N, OC}});
    }
    return ret_kerns;
}

//! Return the kernels to run for \p param.
//!
//! Forwards to get_kimpls() inside a MIDOUT region.
//!
//! \param param size description of the convolution
//! \return the kernels built by get_kimpls(); an empty list only if control
//!         falls through the MIDOUT region without returning
SmallVector<ConvBiasImpl::NCBKern>
ConvBiasImpl::AlgoF32DirectStride2::dispatch_kerns(
        const NCBKernSizeParam& param) const {
    MIDOUT_BEGIN(megdnn_arm_common_conv_bias_f32_kimpl, 2, 2) {
        return get_kimpls(param);
    }
    MIDOUT_END();
    return {};
}
// vim: syntax=cpp.doxygen