/**
 * \file dnn/src/fallback/convolution/opr_impl.h
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
 * or implied.
 */
#pragma once

#include <array>
#include <unordered_map>

#include "megdnn/oprs/base.h"
#include "src/common/utils.h"
#include "src/common/algo_base.h"
#include "src/fallback/handle.h"
#include "src/naive/convolution/opr_impl.h"

namespace megdnn {

/**
 * \brief Convolution algo category
 */
enum class AlgoCategory : int32_t {
    DIRECT = 0,
    IM2COL = 1,
    WINOGRAD = 2,
    NAIVE = 3,
};

struct ConvAlgoTypePack {
    detail::AlgoDataType data_type : 32;
    AlgoCategory algo_category : 32;
};

namespace fallback {

/*!
 * \brief fallback convolution forward impl
 *
 * Note: this operator class serves multiple purposes:
 *
 * 1. canonizing conv reprs into NCBKernParam and NCBKernSizeParam, and
 *    subclasses should impl by overriding *_ncb methods
 * 2. providing a default impl for group conv by calling ncb_1g* methods
 * 3. providing a conv impl faster than naive under some cases
 * 4. providing a default impl for choosing a heuristic algorithm, by using
 *    the first algo that fits the workspace limit
 */
class ConvolutionImpl : public naive::ConvolutionForwardImpl {
public:
    using naive::ConvolutionForwardImpl::ConvolutionForwardImpl;
    using AlgoSelectionStrategy = detail::AlgoSelectionStrategy;
    using AlgoDataType = detail::AlgoDataType;

    //! implemented by exec_with_ncb_kern()
    void exec(_megdnn_tensor_in src, _megdnn_tensor_in filter,
              _megdnn_tensor_out dst, const PreprocessedFilter*,
              _megdnn_workspace workspace) override;

    void exec_preprocess(const TensorLayout& src_layout,
                         _megdnn_tensor_in filter,
                         const TensorLayout& dst_layout,
                         PreprocessedFilter* preprocessed_filter,
                         _megdnn_workspace workspace) override;

    //! implemented by get_workspace_with_ncb()
    size_t get_workspace_in_bytes(const TensorLayout& src,
                                  const TensorLayout& filter,
                                  const TensorLayout& dst,
                                  const PreprocessedFilter*) override;

    SmallVector<TensorLayout> deduce_preprocessed_filter_layout(
            const TensorLayout& src, const TensorLayout& filter,
            const TensorLayout& dst) override;

    size_t get_preprocess_workspace_in_bytes(const TensorLayout& src,
                                             const TensorLayout& filter,
                                             const TensorLayout& dst) override;

    //! implemented by get_all_algorithms_with_ncb()
    std::vector<Algorithm*> get_all_algorithms(
            const TensorLayout& src, const TensorLayout& filter,
            const TensorLayout& dst) override;

    //! implemented by get_algorithm_heuristic_with_ncb()
    Algorithm* get_algorithm_heuristic(
            const TensorLayout& src, const TensorLayout& filter,
            const TensorLayout& dst, size_t workspace_limit_in_bytes,
            const AlgoAttribute& positive_attr,
            const AlgoAttribute& negative_attr) override;

    //! size param for kernels with non-contiguous batch
    struct NCBKernSizeParam {
        uint32_t n;
        std::array<uint32_t, 2> isz, osz;
        //! filter info; group is guaranteed to be 1
        CanonizedFilterMeta filter_meta;
        DType src_type, filter_type, dst_type;
        //! stride for batch of input, output
        ptrdiff_t inp_bs, out_bs;
        //! stride for each dim of input, output
        ptrdiff_t inp_s[4], out_s[4];
        Param::ComputeMode compute_mode;
        size_t nr_threads;
        //! weight_preprocess info
        const PreprocessedFilter* preprocessed_filter;

        //! get the data type category of the param, used for selecting the
        //! algo
        AlgoDataType deduce_algo_data_type() const;
    };
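    /*
     * Illustration only: the UNPACK_CONV_F32_NCB_KERN_SIZES macro defined at
     * the end of this file expands the size fields above into local
     * variables, so a hypothetical kernel body (see NCBKernParam and
     * NCBKernIndex below) would typically start like:
     *
     *     void kern_example(const NCBKernParam& param, const NCBKernIndex&) {
     *         UNPACK_CONV_F32_NCB_KERN_SIZES(param);
     *         // N, IC, IH, IW, OC, OH, OW, FH, FW, SH, SW, PH, PW are now
     *         // local variables describing the convolution
     *     }
     */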
    //! memory param for kernels with non-contiguous batch
    struct NCBKernParam : public NCBKernSizeParam {
        const void* src_ptr;
        const void* filter_ptr;
        void* dst_ptr;
        void* workspace_ptr;
        size_t workspace_size;

        template <typename T>
        const T* src() const {
            src_type.assert_is_compatible_ctype<T>();
            return static_cast<const T*>(src_ptr);
        }

        template <typename T>
        const T* filter() const {
            filter_type.assert_is_compatible_ctype<T>();
            return static_cast<const T*>(filter_ptr);
        }

        template <typename T>
        T* dst() const {
            dst_type.assert_is_compatible_ctype<T>();
            return static_cast<T*>(dst_ptr);
        }

        template <typename T>
        T* workspace() const {
            return static_cast<T*>(workspace_ptr);
        }

        //! when the format is nchwxx and the conv is channel wise, multiple
        //! groups are packed into one group_pack_id. group_pack_size is the
        //! number of groups packed together, e.g. for a weight shape of
        //! {g/8, 1, 1, Fh, Fw, 8}
        template <typename T>
        T* dst(size_t batch_id, size_t group_pack_id,
               size_t group_pack_size = 1_z) const {
            size_t batch_offset = batch_id * out_bs * dst_type.size();
            size_t group_offset = group_pack_size * group_pack_id *
                                  filter_meta.ocpg * osz[0] * osz[1] *
                                  dst_type.size();
            return reinterpret_cast<T*>(reinterpret_cast<ptrdiff_t>(dst_ptr) +
                                        batch_offset + group_offset);
        }

        template <typename T>
        const T* src(size_t batch_id, size_t group_pack_id,
                     size_t group_pack_size = 1_z) const {
            size_t batch_offset = batch_id * inp_bs * src_type.size();
            size_t group_offset = group_pack_size * group_pack_id *
                                  filter_meta.icpg * isz[0] * isz[1] *
                                  src_type.size();
            return reinterpret_cast<const T*>(
                    reinterpret_cast<ptrdiff_t>(src_ptr) + batch_offset +
                    group_offset);
        }

        template <typename T>
        const T* filter(size_t group_pack_id,
                        size_t pack_group_size = 1_z) const {
            size_t group_offset = pack_group_size * group_pack_id *
                                  filter_meta.icpg * filter_meta.ocpg *
                                  filter_meta.spatial[0] *
                                  filter_meta.spatial[1] * filter_type.size();
            return reinterpret_cast<const T*>(
                    reinterpret_cast<ptrdiff_t>(filter_ptr) + group_offset);
        }
    };

    /**
     * \brief kernel run-time id; this information is used for locating the
     * data a kernel invocation should work on
     */
    struct NCBKernIndex {
        size_t thread_id = 0;  //!< thread id
        CpuNDRange ndrange_id;
    };

    using ncb_kern_t = thin_function<void(const NCBKernParam& param,
                                          const NCBKernIndex& ncb_index)>;
    struct NCBKern {
        ncb_kern_t kern;  //!< conv kern parallel ptr
        CpuNDRange global_size;
    };
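    /*
     * Conceptually (the real dispatch goes through the handle's kernel
     * dispatcher), each NCBKern returned by AlgoBase::dispatch_kern() is
     * invoked once per point of its global_size; thread_id and ndrange_id
     * below are pseudo variables filled in by the dispatcher:
     *
     *     for (auto&& kernel : algo->dispatch_kern(param)) {
     *         // for every ndrange_id in kernel.global_size:
     *         kernel.kern(param, NCBKernIndex{thread_id, ndrange_id});
     *     }
     */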
    class AlgoBase : public Algorithm {
    public:
        AlgoBase() : Algorithm() {
            m_handle_type = Handle::HandleType::FALLBACK;
        }

        enum class AlgoType : uint32_t {
            //! fallback
            FB_ALGO = 1 << 0,
            FB_NAIVE,
            FB_DEFAULT,
        };

        virtual ~AlgoBase() = default;
        virtual bool usable(const NCBKernSizeParam& param,
                            AlgoSelectionStrategy) const = 0;
        virtual size_t get_workspace(const NCBKernSizeParam& param) const = 0;

        virtual SmallVector<NCBKern> dispatch_kern(
                const NCBKernSizeParam& param) const = 0;

        virtual SmallVector<NCBKern> dispatch_preprocess_kern(
                const NCBKernSizeParam&) const {
            return {};
        };

        //! get the layouts of the weight_preprocess dst
        virtual SmallVector<TensorLayout> deduce_preprocessed_filter_layout(
                const NCBKernSizeParam&) const {
            return {};
        };

        //! get the workspace needed by weight_preprocess
        virtual size_t get_preprocess_workspace(
                const NCBKernSizeParam&) const {
            return 0_z;
        };

        //! temporarily used to identify whether the matmul algorithm is
        //! preferred
        virtual bool is_preferred(const NCBKernSizeParam&) const {
            return false;
        }

        bool usable_attribute(
                const NCBKernSizeParam& param,
                AlgoSelectionStrategy algo_selection_strategy,
                const AlgoAttribute& positive_attr =
                        AlgoAttribute::REPRODUCIBLE,
                const AlgoAttribute& negative_attr =
                        AlgoAttribute::DEFAULT) const {
            return contain_attribute_all(positive_attr) &&
                   !contain_attribute_any(negative_attr) &&
                   usable(param, algo_selection_strategy);
        }

        //! get the type of the algo
        virtual ConvAlgoTypePack get_algo_type() const = 0;

        using Mapper = std::unordered_map<AlgorithmDesc, AlgoBase*>;
    };

    /**
     * \brief get all the algorithms for the opr
     */
    virtual SmallVector<AlgoBase*> get_all_packed_algo();

    /**
     * \brief select algos according to the given algo type
     */
    SmallVector<AlgoBase*> select_algo_type(ConvAlgoTypePack algo_type);

protected:
    virtual void exec_with_ncb_kern(const NCBKernParam& param,
                                    Algorithm* algo);

    virtual void exec_preprocess_with_ncb_kern(const NCBKernParam& param,
                                               Algorithm* algo);

    virtual std::vector<Algorithm*> get_all_algorithms_with_ncb(
            const NCBKernSizeParam& param);

    virtual Algorithm* get_algorithm_heuristic_with_ncb(
            const NCBKernSizeParam& param, size_t workspace_limit_in_bytes,
            const AlgoAttribute& positive_attr,
            const AlgoAttribute& negative_attr);

    const char* get_algorithm_set_name() const override;

    class AlgoFallback;
    class AlgoNaive;
    class AlgoDefault;
    class AlgoPack;

private:
    NCBKernSizeParam m_prev_selected_algo_sizep;
    Algorithm* m_prev_selected_algo = nullptr;

    Algorithm* get_algorithm_from_desc(const AlgorithmDesc& desc) override;

    bool is_naive_algo(ConvolutionImpl::Algorithm* algo);

    Algorithm* get_algorithm(
            const NCBKernSizeParam& param,
            size_t workspace_size = std::numeric_limits<size_t>::max());

    NCBKernSizeParam make_ncb_kern_size_param(
            const TensorLayout& src, const TensorLayout& filter,
            const TensorLayout& dst,
            const PreprocessedFilter* preprocessed_filter);

    NCBKernParam make_ncb_kern_param(
            _megdnn_tensor_in src, _megdnn_tensor_in filter,
            _megdnn_tensor_out dst,
            const PreprocessedFilter* preprocessed_filter,
            _megdnn_workspace workspace);

    SmallVector<AlgoCategory> suggest_algo_category_order(
            const NCBKernSizeParam& param) const;

public:
    static const AlgoPack& algo_pack();
};
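/*
 * Illustration only; this is not an algorithm that exists in MegDNN. A
 * concrete forward algorithm (such as the AlgoFallback / AlgoNaive /
 * AlgoDefault classes forward declared above) derives from
 * ConvolutionImpl::AlgoBase and implements at least usable(),
 * get_workspace(), dispatch_kern() and get_algo_type(); other Algorithm
 * virtuals such as name() are omitted here. Assuming a matching forward
 * declaration inside ConvolutionImpl, such an algo could look roughly like:
 *
 *     class ConvolutionImpl::AlgoExampleDirect final : public AlgoBase {
 *         bool usable(const NCBKernSizeParam& param,
 *                     AlgoSelectionStrategy) const override {
 *             auto&& fm = param.filter_meta;
 *             return param.src_type.enumv() == DTypeEnum::Float32 &&
 *                    fm.spatial[0] == 3 && fm.spatial[1] == 3;
 *         }
 *         size_t get_workspace(const NCBKernSizeParam&) const override {
 *             return 0;
 *         }
 *         SmallVector<NCBKern> dispatch_kern(
 *                 const NCBKernSizeParam& param) const override {
 *             auto kern = [](const NCBKernParam& p,
 *                            const NCBKernIndex& ncb_index) {
 *                 // compute one (batch, group) slice of the output
 *             };
 *             return {{kern, {param.n, param.filter_meta.group}}};
 *         }
 *         ConvAlgoTypePack get_algo_type() const override {
 *             return {detail::AlgoDataType::FLOAT32, AlgoCategory::DIRECT};
 *         }
 *     };
 */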
class ConvolutionBackwardDataImpl : public naive::ConvolutionBackwardDataImpl {
public:
    using naive::ConvolutionBackwardDataImpl::ConvolutionBackwardDataImpl;

    void exec(_megdnn_tensor_in filter, _megdnn_tensor_in diff,
              _megdnn_tensor_out grad, _megdnn_workspace workspace) override;
    size_t get_workspace_in_bytes(const TensorLayout& filter,
                                  const TensorLayout& diff,
                                  const TensorLayout& grad) override;
    std::vector<Algorithm*> get_all_algorithms(
            const TensorLayout& filter, const TensorLayout& diff,
            const TensorLayout& grad) override;
    Algorithm* get_algorithm_heuristic(
            const TensorLayout& filter, const TensorLayout& diff,
            const TensorLayout& grad, size_t workspace_limit_in_bytes,
            const AlgoAttribute& positive_attr,
            const AlgoAttribute& negative_attr) override;
    const char* get_algorithm_set_name() const override;

    //! size param for kernels with non-contiguous batch
    struct NCBKernSizeParam {
        uint32_t n;
        std::array<uint32_t, 2> isz, osz;
        //! filter info; group is guaranteed to be 1
        CanonizedFilterMeta filter_meta;
        DType diff_type, filter_type, grad_type;
        TensorLayout diff_layout, filter_layout, grad_layout;
        //! stride for batch of input, output
        ptrdiff_t inp_bs, out_bs;
        //! extra_mem_size (in bytes) of memory accessible after the end of
        //! the logical memory block.
        //!
        //! this allows for eliminating unnecessary memory copies: e.g.
        //! if several bytes after the end of the tensor are accessible,
        //! some kernel implementations can use out-of-bound SIMD memory
        //! access to avoid issuing memcpy instructions.
        //!
        //! Note that although extra_mem_size bytes are accessible by the
        //! kernel implementation, the kernel implementation should not have
        //! any ``visible'' effect on any unintended memory location.
        //! This means reading and writing back the same value to some memory
        //! location within extra_mem_size is allowed, but writing a
        //! different value is not allowed.
        size_t diff_extra_mem_size, filter_extra_mem_size,
                grad_extra_mem_size;
        Param::ComputeMode compute_mode;
    };

    //! memory param for kernels with non-contiguous batch
    struct NCBKernParam : public NCBKernSizeParam {
        const void* filter_ptr;
        const void* diff_ptr;
        void* grad_ptr;
        void* workspace_ptr;
        size_t workspace_size;

        template <typename T>
        const T* diff() const {
            diff_type.assert_is_compatible_ctype<T>();
            return static_cast<const T*>(diff_ptr);
        }

        template <typename T>
        const T* filter() const {
            filter_type.assert_is_compatible_ctype<T>();
            return static_cast<const T*>(filter_ptr);
        }

        template <typename T>
        T* grad() const {
            grad_type.assert_is_compatible_ctype<T>();
            return static_cast<T*>(grad_ptr);
        }

        template <typename T>
        T* workspace() const {
            return static_cast<T*>(workspace_ptr);
        }
    };

protected:
    using ncb_kern_t = thin_function<void(const NCBKernParam& param)>;

    class AlgoBase : public Algorithm {
    protected:
        ~AlgoBase() = default;

    public:
        AlgoBase() : Algorithm() {
            m_handle_type = Handle::HandleType::FALLBACK;
        }

        enum class AlgoType : uint32_t {
            //! fallback
            FB_NAIVE = 1 << 0,
            FB_DIRECT,
            FB_MATMUL,

#if MEGDNN_AARCH64 || MEGDNN_ARMV7
            ARM_COMMON_DIRECT_STRD1_DOT_INT8X8X32 = 1 << 8,
            ARM_COMMON_DIRECT_STRD2_DOT_INT8X8X32,
            ARM_COMMON_DIRECT_STRD1_DOT_QU8,
            ARM_COMMON_DIRECT_STRD2_DOT_QU8
#endif
        };

        virtual bool usable(ConvolutionBackwardDataImpl* opr,
                            const NCBKernSizeParam& param) const = 0;
        virtual size_t get_workspace(ConvolutionBackwardDataImpl* opr,
                                     const NCBKernSizeParam& param) const = 0;
        virtual ncb_kern_t dispatch_kern(
                ConvolutionBackwardDataImpl* opr,
                const NCBKernSizeParam& param) const = 0;

        bool usable_attribute(
                ConvolutionBackwardDataImpl* opr,
                const NCBKernSizeParam& param,
                const AlgoAttribute& positive_attr =
                        AlgoAttribute::REPRODUCIBLE,
                const AlgoAttribute& negative_attr =
                        AlgoAttribute::DEFAULT) const {
            return contain_attribute_all(positive_attr) &&
                   !contain_attribute_any(negative_attr) &&
                   usable(opr, param);
        }

        virtual bool is_preferred(const NCBKernSizeParam&) const {
            return false;
        }

        //! if the algo is naive, it will not be split by group
        virtual bool is_naive() const { return false; }

        using Mapper = std::unordered_map<AlgorithmDesc, AlgoBase*>;
    };

protected:
    //! default impl calls ncb_1g_dispatch_kern()
    virtual void exec_with_ncb_kern(const NCBKernParam& param);

    //! default impl calls ncb_1g_get_workspace()
    virtual size_t get_workspace_with_ncb(const NCBKernSizeParam& param);

    //! default impl calls ncb_1g_get_all_algorithms()
    virtual std::vector<Algorithm*> get_all_algorithms_with_ncb(
            const NCBKernSizeParam& param);

    //! default impl calls ncb_1g_get_algorithm_heuristic()
    virtual Algorithm* get_algorithm_heuristic_with_ncb(
            const NCBKernSizeParam& param, size_t workspace_limit_in_bytes,
            const AlgoAttribute& positive_attr,
            const AlgoAttribute& negative_attr);

    //! get kernel pointer for float32 non-contiguous batch 1-group kernel
    virtual ncb_kern_t ncb_1g_dispatch_kern(Algorithm* algo,
                                            const NCBKernSizeParam& param);

    virtual size_t ncb_1g_get_workspace(Algorithm* algo,
                                        const NCBKernSizeParam& param);

    virtual std::vector<Algorithm*> ncb_1g_get_all_algorithms(
            const NCBKernSizeParam& param);

    /*!
     * the default impl iterates over ncb_1g_get_all_algorithms() and
     * returns the first algorithm whose workspace does not exceed the limit
     */
    virtual Algorithm* ncb_1g_get_algorithm_heuristic(
            const NCBKernSizeParam& param, size_t workspace_limit_in_bytes,
            const AlgoAttribute& positive_attr,
            const AlgoAttribute& negative_attr);
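    /*
     * The default heuristic described above is conceptually equivalent to
     * the following sketch (simplified: attribute filtering through
     * usable_attribute() is omitted, and "heuristic" is just an
     * illustrative name):
     *
     *     Algorithm* heuristic(const NCBKernSizeParam& param, size_t limit) {
     *         for (auto algo : ncb_1g_get_all_algorithms(param)) {
     *             if (ncb_1g_get_workspace(algo, param) <= limit)
     *                 return algo;
     *         }
     *         return nullptr;
     *     }
     */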
    static bool is_matrix_mul_preferred(const NCBKernSizeParam& param);

    /**
     * \brief get all the algorithms for the opr
     */
    virtual SmallVector<AlgoBase*> get_all_packed_algo();

private:
    NCBKernSizeParam m_prev_selected_algo_sizep;
    Algorithm* m_prev_selected_algo = nullptr;

    //! get algorithm set by user or by heuristic
    Algorithm* get_algorithm(const NCBKernSizeParam& param);

    NCBKernSizeParam make_ncb_kern_size_param(const TensorLayout& filter,
                                              const TensorLayout& diff,
                                              const TensorLayout& grad);

    NCBKernParam make_ncb_kern_param(_megdnn_tensor_in filter,
                                     _megdnn_tensor_in diff,
                                     _megdnn_tensor_out grad,
                                     _megdnn_workspace workspace);

    class AlgoNaive;
    class AlgoDirect;
    class AlgoMatrixMul;
    class AlgoPack;

    Algorithm* get_algorithm_from_desc(const AlgorithmDesc& desc) override;

public:
    //! maintain all the algos of this fallback opr
    static const AlgoPack& algo_pack();
};

}  // namespace fallback
}  // namespace megdnn

//! unpack NCBKernSizeParam into local variables (N, IC, IH, IW, ...)
#define UNPACK_CONV_F32_NCB_KERN_SIZES(_p)                                   \
    auto N = _p.n, IC = _p.filter_meta.icpg, IH = _p.isz[0], IW = _p.isz[1], \
         OC = _p.filter_meta.ocpg, OH = _p.osz[0], OW = _p.osz[1],           \
         FH = _p.filter_meta.spatial[0], FW = _p.filter_meta.spatial[1],     \
         SH = _p.filter_meta.stride[0], SW = _p.filter_meta.stride[1],       \
         PH = _p.filter_meta.padding[0], PW = _p.filter_meta.padding[1]

// vim: syntax=cpp.doxygen