/*************************************************************************************************** * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: * * Redistributions of source code must retain the above copyright notice, this list of * conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright notice, this list of * conditions and the following disclaimer in the documentation and/or other materials * provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used * to endorse or promote products derived from this software without specific prior written * permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * **************************************************************************************************/ /** * \file dnn/src/cuda/convolution_helper/conv_trait/iconv_trait.cuh * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") * * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */ #pragma once #include "src/cuda/convolution_helper/block_tile_consumer/block_consumer.cuh" #include "src/cuda/convolution_helper/block_tile_iterator/block_tile_iterator.cuh" #include "src/cuda/convolution_helper/global_memory_visitor/global_memory_visitor.cuh" #include "src/cuda/convolution_helper/global_memory_writer/global_memory_writer.cuh" #include "src/cuda/convolution_helper/layout.cuh" #include "src/cuda/convolution_helper/parameter.cuh" namespace megdnn { namespace cuda { namespace convolution { #define COMMON_DEFS_WITH_DATA_TYPE_LAYOUT_AND_PARAM( \ _src_dtype, _filter_dtype, _smem_storage_dtype, _input_layout, \ _kern_layout, _output_layout, _conv_param) \ using src_dtype = _src_dtype; \ using filter_dtype = _filter_dtype; \ using smem_storage_dtype = _smem_storage_dtype; \ using InputLayout = _input_layout; \ using KernLayout = _kern_layout; \ using OutputLayout = _output_layout; \ using Param = _conv_param; \ static constexpr bool check_bounds = check_bounds_; #define MEGDNN_COMMA , template struct IConvTrait { COMMON_DEFS_WITH_DATA_TYPE_LAYOUT_AND_PARAM(int8_t, int8_t, int32_t, Layout, Layout, Layout, ConvParam); using RegBlockConfig = RegBlockConfig_; using ThreadConfig = ThreadConfig_; struct DataTileCount { using RegBlockConfig = RegBlockConfig; using ThreadConfig = ThreadConfig; using copy_t = ldg_dtype; using smem_storage_dtype = smem_storage_dtype; static int constexpr load_width = sizeof(copy_t) / sizeof(smem_storage_dtype); static int constexpr ldg_load_width = sizeof(copy_t) / sizeof(src_dtype); static int constexpr skew = load_width; static int constexpr block_tile_batch = RegBlockConfig::reg_n * ThreadConfig::nr_thread_x; static int constexpr block_tile_in_channel = RegBlockConfig::reg_k; static int constexpr smem_load_x = block_tile_batch / load_width; static int constexpr load_x = smem_load_x > 32 ? 32 : smem_load_x; static int constexpr load_y = ThreadConfig::nr_threads / load_x; static int constexpr smem_h = RegBlockConfig::reg_k_packed; static int constexpr smem_w = block_tile_batch; static int constexpr smem_stride = smem_w % 2 == 0 ? smem_w + skew : smem_w; static int constexpr smem_tot = smem_h * smem_stride; static int constexpr reg_h = (smem_h + load_y - 1) / load_y; static int constexpr reg_w = (smem_load_x + load_x - 1) / load_x; static bool constexpr check_bounds_h = smem_h % load_y != 0; static bool constexpr check_bounds_w = smem_load_x % load_x != 0; }; struct FilterTileCount { using RegBlockConfig = RegBlockConfig; using ThreadConfig = ThreadConfig; using copy_t = ldg_dtype; using smem_storage_dtype = smem_storage_dtype; static int constexpr load_width = sizeof(copy_t) / sizeof(smem_storage_dtype); static int constexpr ldg_load_width = sizeof(copy_t) / sizeof(filter_dtype); static int constexpr skew = load_width; static int constexpr block_tile_out_channel = RegBlockConfig::reg_m * ThreadConfig::nr_thread_y; static int constexpr block_tile_in_channel = RegBlockConfig::reg_k; static int constexpr smem_load_x = block_tile_out_channel / load_width; static int constexpr load_x = smem_load_x > 32 ? 32 : smem_load_x; static int constexpr load_y = ThreadConfig::nr_threads / load_x; static int constexpr smem_h = RegBlockConfig::reg_k_packed; static int constexpr smem_w = block_tile_out_channel; static int constexpr smem_stride = smem_w % 2 == 0 ? smem_w + skew : smem_w; static int constexpr smem_tot = smem_h * smem_stride; static int constexpr reg_h = (smem_h + load_y - 1) / load_y; static int constexpr reg_w = (smem_load_x + load_x - 1) / load_x; static bool constexpr check_bounds_h = smem_h % load_y != 0; static bool constexpr check_bounds_w = smem_load_x % load_x != 0; }; using BlockTileIterator = BlockTileIteratorBasic; using DataGlobal2ShareMemVisitor = Global2ShareMemVisitor_CIxN; using FilterGlobal2ShareMemVisitor = Global2ShareMemVisitor_CIxN; static bool constexpr pipelined = RegBlockConfig::reg_k_packed > 1; using BlockConsumer = IConvBlockConsumer; using GlobalMemoryWriter = IConvGlobalMemoryWriter; }; template struct IConvTraitUnrollWidth { COMMON_DEFS_WITH_DATA_TYPE_LAYOUT_AND_PARAM(int8_t, int8_t, int32_t, Layout, Layout, Layout, ConvParam); using RegBlockConfig = RegBlockConfig_; using ThreadConfig = ThreadConfig_; struct DataTileCount { using RegBlockConfig = RegBlockConfig; using ThreadConfig = ThreadConfig; using copy_t = ldg_dtype; using smem_storage_dtype = smem_storage_dtype; static int constexpr load_width = sizeof(copy_t) / sizeof(smem_storage_dtype); static int constexpr ldg_load_width = sizeof(copy_t) / sizeof(src_dtype); static int constexpr skew = load_width; static int constexpr block_tile_batch = RegBlockConfig::reg_n * ThreadConfig::nr_thread_x; static int constexpr block_tile_out_width = RegBlockConfig::reg_width; static int constexpr block_tile_in_channel = RegBlockConfig::reg_k; static int constexpr smem_load_x = block_tile_batch / load_width; static int constexpr load_x = smem_load_x > 32 ? 32 : smem_load_x; static int constexpr load_y = ThreadConfig::nr_threads / load_x; static int constexpr smem_h = RegBlockConfig::reg_k_packed; static int constexpr smem_w = block_tile_batch; static int constexpr img_cache = RegBlockConfig::reg_width; static int constexpr smem_stride = smem_w % 2 == 0 ? smem_w + skew : smem_w; static int constexpr smem_tot = smem_h * img_cache * smem_stride; static int constexpr reg_h = (smem_h + load_y - 1) / load_y; static int constexpr reg_w = (smem_load_x + load_x - 1) / load_x; static bool constexpr check_bounds_h = smem_h % load_y != 0; static bool constexpr check_bounds_w = smem_load_x % load_x != 0; }; MEGDNN_STATIC_ASSERT( std::is_same::filter_dtype MEGDNN_COMMA filter_dtype>::value == true, "data type of filter tensor should be int8_t"); using FilterTileCount = typename IConvTrait::FilterTileCount; using BlockTileIterator = BlockTileIteratorUnrollWidth; using DataGlobal2ShareMemVisitor = Global2ShareMemVisitor_CIxWOxN; using FilterGlobal2ShareMemVisitor = Global2ShareMemVisitor_CIxN; static bool constexpr pipelined = RegBlockConfig::reg_k_packed > 1; using BlockConsumer = IConvBlockConsumerUnrollWidth; using GlobalMemoryWriter = IConvGlobalMemoryWriterUnrollWidth; }; #undef COMMON_DEFS_WITH_DATA_TYPE_LAYOUT_AND_PARAM #undef MEGDNN_COMMA } // namespace convolution } // namespace cuda } // namespace megdnn // vim: ft=cpp syntax=cuda.doxygen foldmethod=marker foldmarker=f{{{,f}}}