diff --git a/dnn/src/x86/conv_bias/f32/algos.cpp b/dnn/src/x86/conv_bias/f32/algos.cpp index 04171df1487ea3bfa7a450f7a89f0e38144961fb..9140d4a6f5c1d36a944fa63b0842a656293fe362 100644 --- a/dnn/src/x86/conv_bias/f32/algos.cpp +++ b/dnn/src/x86/conv_bias/f32/algos.cpp @@ -20,7 +20,6 @@ #include "src/x86/conv_bias/postprocess_helper.h" #include "src/x86/convolution/convolution_direct_special_cases.h" #include "src/x86/handle.h" -#include "src/x86/profile.h" #include "midout.h" @@ -487,153 +486,6 @@ SmallVector ConvBiasImpl::AlgoDirectStride2::get_kimpls( const NCBKernSizeParam& param) const { GET_KERN; } -/* ===================== matmul algo ===================== */ -WorkspaceBundle ConvBiasImpl::AlgoMatrixMul::get_bundle( - const NCBKernSizeParam& param) { - UNPACK_CONV_F32_NCB_KERN_SIZES(param); - MEGDNN_MARK_USED_VAR(N); - MEGDNN_MARK_USED_VAR(OC); - auto IW2 = IH + 2 * PH; - auto IH2 = IW + 2 * PW; - bool can_matrix_mul_direct = - (FH == 1 && FW == 1 && SH == 1 && SW == 1 && PH == 0 && PW == 0); - // temp space to store padding-free src (with 4 extra floats) - // temp space to store unrolled matrix (with 4 extra floats) - // workspace for matrix mul opr - size_t part0, part1, part2; - if (can_matrix_mul_direct) { - part0 = part1 = 0; - } else { - part0 = (IC * IH2 * IW2 + 4) * sizeof(float); - part1 = (IC * FH * FW * OH * OW + 4) * sizeof(float); - } - { - TensorLayout A_, B_, C_; - A_ = TensorLayout({OC, IC * FH * FW}, dtype::Float32()); - B_ = TensorLayout({IC * FH * FW, OH * OW}, dtype::Float32()); - C_ = TensorLayout({OC, OH * OW}, dtype::Float32()); - part2 = get_matmul_opr()->get_workspace_in_bytes(A_, B_, C_); - } - return {nullptr, {part0, part1, part2}}; -} - -bool ConvBiasImpl::AlgoMatrixMul::is_preferred( - const NCBKernSizeParam& param) const { - auto&& fm = param.filter_meta; - if (fm.dilation[0] != 1 || fm.dilation[1] != 1) { - return false; - } - - // single channel conv should never use matrix mul - if (fm.ocpg == 1 || fm.icpg == 1) - return false; - // 1x1 conv should always use matrix mul - if (fm.spatial[0] == 1 && fm.spatial[1] == 1) - return true; - // if stride is not 1x1, always use matrix mul - if (fm.stride[0] != 1 || fm.stride[1] != 1) - return true; - int f = find_nearest_elem( - std::round(geometric_mean(fm.spatial[0], fm.spatial[1])), - {2, 3, 4, 5, 6, 7}); - int oc = find_nearest_elem(fm.ocpg, {4, 8, 16, 32, 64, 96, 128}); - int ic = find_nearest_elem(fm.icpg, {4, 8, 16, 32, 64, 96, 128}); - int on = std::round(geometric_mean(param.osz[0], param.osz[1])); - ProfileElement cur(f, oc, ic, on); - auto H = static_cast(inplace_cpu_handle().get()); - auto&& target = std::lower_bound(H->profile_cache().begin(), - H->profile_cache().end(), cur); - megdnn_assert_internal(target->f == cur.f); - megdnn_assert_internal(target->oc == cur.oc); - megdnn_assert_internal(target->ic == cur.ic); - return on < target->on_threshold; -} - -MatrixMul* ConvBiasImpl::AlgoMatrixMul::get_matmul_opr() { - static CpuOprDelegationStorage<> storage; - return storage.get(); -} - -void ConvBiasImpl::AlgoMatrixMul::kimpl(const NCBKernParam& param, - const NCBKernIndex& ncb_index) { - UNPACK_CONV_F32_NCB_KERN_SIZES(param); - auto IH2 = IH + 2 * PH; - auto IW2 = IW + 2 * PW; - size_t group_id = ncb_index.ndrange_id[0]; - bool is_xcorr = !param.filter_meta.should_flip; - auto bundle = get_bundle(param); - bundle.set(param.workspace_ptr); - // workspace = tmp..src2 - - for (size_t n = 0; n < N; ++n) { - float* src = const_cast(param.src(n, group_id)); - float* dst = param.dst(n, group_id); - float* bias_ptr = static_cast( - const_cast(param.bias(n, group_id))); - - float *B, *src2; - if (FH == 1 && FW == 1 && SH == 1 && SW == 1 && PH == 0 && PW == 0) { - // special case: 1x1 - B = src; - } else { - src2 = static_cast(bundle.get(0)); - // copy src to src2; - float* src2_ptr = src2; - const float* src_ptr = src; - rep(ic, IC) { - if (PH != 0) { - std::memset(src2_ptr, 0, sizeof(float) * PH * IW2); - src2_ptr += PH * IW2; - } - rep(ih, IH) { - if (PW != 0) - rep(pw, PW) * (src2_ptr++) = 0.0f; - std::memcpy(src2_ptr, src_ptr, sizeof(float) * IW); - src2_ptr += IW; - src_ptr += IW; - if (PW != 0) - rep(pw, PW) * (src2_ptr++) = 0.0f; - } - if (PH != 0) { - std::memset(src2_ptr, 0, sizeof(float) * PH * IW2); - src2_ptr += PH * IW2; - } - } - - B = static_cast(bundle.get(1)); - if (SH == 1 && SW == 1) { - if (is_xcorr) { - img2col(src2, B, OC, OH, OW, IC, IH2, IW2, FH, FW); - } else { - img2col(src2, B, OC, OH, OW, IC, IH2, IW2, FH, FW); - } - } else { - if (is_xcorr) { - img2col_stride(src2, B, OC, OH, OW, IC, IH2, IW2, FH, - FW, SH, SW); - } else { - img2col_stride(src2, B, OC, OH, OW, IC, IH2, IW2, FH, - FW, SH, SW); - } - } - } - { - TensorND A_, B_, C_; - A_.layout = TensorLayout({OC, IC * FH * FW}, dtype::Float32()); - A_.raw_ptr = const_cast(param.filter(group_id)); - B_.layout = TensorLayout({IC * FH * FW, OH * OW}, dtype::Float32()); - B_.raw_ptr = B; - C_.layout = TensorLayout({OC, OH * OW}, dtype::Float32()); - C_.raw_ptr = dst; - Workspace workspace(static_cast(bundle.get(2)), - bundle.get_size(2)); - get_matmul_opr()->exec(A_, B_, C_, workspace); - } - PostProcess::run(dst, bias_ptr, dst, param.bias_mode, - param.nonlineMode, param.bias_type, - param.dst_type, 1_z, OC, OH, OW); - } -} #if MEGDNN_X86_WITH_MKL_DNN static inline void mkldnn_fp32_conv_instance( diff --git a/dnn/src/x86/conv_bias/f32/algos.h b/dnn/src/x86/conv_bias/f32/algos.h index b3565cece19736a457f18ca84cd645786737c857..7a4ed9c26f57455244e29ca519f157a917e193ba 100644 --- a/dnn/src/x86/conv_bias/f32/algos.h +++ b/dnn/src/x86/conv_bias/f32/algos.h @@ -123,47 +123,6 @@ public: MEGDNN_WINOGRAD_ALGO_FUN_DECLARE(AlgoDataType::FLOAT32); }; -/* ===================== matmul algo ===================== */ -class ConvBiasImpl::AlgoMatrixMul final : public AlgoBase { - static MatrixMul* get_matmul_opr(); - static WorkspaceBundle get_bundle(const NCBKernSizeParam& param); - static void kimpl(const NCBKernParam& param, const NCBKernIndex&); - -public: - bool is_reproducible() const override { return true; } - const char* name() const override { return "X86_CONV_BIAS_MATMUL"; } - - bool usable(const NCBKernSizeParam& param, - AlgoSelectionStrategy) const override { - auto&& fm = param.filter_meta; - return fm.format == Param::Format::NCHW && fm.spatial_ndim == 2 && - param.src_type.enumv() == DTypeEnum::Float32 && - param.filter_type.enumv() == DTypeEnum::Float32 && - param.dst_type.enumv() == DTypeEnum::Float32 && - fm.dilation[0] == 1 && fm.dilation[1] == 1 && - //! The matmul opr is only used in single thread - //! TODO:support the no pack matmul algo in fallback im2col + - //! matmul - param.nr_threads == 1_z; - } - - bool is_preferred(const NCBKernSizeParam&) const override; - - size_t get_workspace(const NCBKernSizeParam& param) const override { - return get_bundle(param).total_size_in_bytes(); - } - SmallVector dispatch_kerns( - const NCBKernSizeParam& param) const override { - size_t group = param.filter_meta.group; - return {{kimpl, {group, 1_z, 1_z}}}; - } - - void* type() const override; - ConvAlgoTypePack get_algo_type() const override { - return {AlgoDataType::FLOAT32, AlgoCategory::IM2COL}; - } -}; - #if MEGDNN_X86_WITH_MKL_DNN class ConvBiasImpl::AlgoMkldnnConv final : public AlgoBase { static void kern_mkldnn_fp32(const NCBKernParam& param, diff --git a/dnn/src/x86/conv_bias/opr_impl.cpp b/dnn/src/x86/conv_bias/opr_impl.cpp index b4759ea1e3988ee95feb6c61ff9992953b419308..515666052831e2d8b83fcb50a8b6a4adcb0754e1 100644 --- a/dnn/src/x86/conv_bias/opr_impl.cpp +++ b/dnn/src/x86/conv_bias/opr_impl.cpp @@ -47,10 +47,6 @@ void* ConvBiasImpl::AlgoDirectStride2::type() const { return x86_algo_type; } -void* ConvBiasImpl::AlgoMatrixMul::type() const { - return x86_algo_type; -} - void* ConvBiasImpl::AlgoDirectAvx2Stride1Int8::type() const { return x86_algo_type; } @@ -82,7 +78,6 @@ class ConvBiasImpl::AlgoPack : NonCopyableObj { AlgoAVX2DirectConvStride2 avx2_stride2_direct; AlgoChanWiseAvx2Stride1Qint8 avx2_stride1_chanwsie_qint8; AlgoChanWiseAvx2Stride2Qint8 avx2_stride2_chanwsie_qint8; - AlgoMatrixMul matmul; #if MEGDNN_X86_WITH_MKL_DNN AlgoMkldnnMatmulQint8 mkldnn_matmul_qint8; //! Because the mkldnnconv need handle @@ -107,7 +102,6 @@ public: all_algos.emplace_back(&avx2_stride2_chanwsie_qint8); all_algos.emplace_back(&avx2_stride1_direct_int8); all_algos.emplace_back(&avx2_stride2_direct); - all_algos.emplace_back(&matmul); static CpuOprDelegationStorage<> storage; auto matmul_opr = storage.get(); diff --git a/dnn/src/x86/conv_bias/opr_impl.h b/dnn/src/x86/conv_bias/opr_impl.h index 38dca3d3023865a0158eef7d0e8fd34109d5cc33..e98aae5d99b5b5e9d5eca0257edad870f94709f9 100644 --- a/dnn/src/x86/conv_bias/opr_impl.h +++ b/dnn/src/x86/conv_bias/opr_impl.h @@ -31,7 +31,6 @@ public: class AlgoDirectStride2; class AlgoFP32WinogradF63_8x8; class AlgoFP32WinogradF23_8x8; - class AlgoMatrixMul; class AlgoDirectAvx2Stride1Int8; class AlgoAVX2DirectConvStride2; class AlgoChanWiseAvx2Stride1Qint8; diff --git a/dnn/src/x86/handle.h b/dnn/src/x86/handle.h index 0443a6dcfad31f58618d788575cdc4af99edb14e..488ee4d5d59cb141a160effcbcf9bd078d75aa24 100644 --- a/dnn/src/x86/handle.h +++ b/dnn/src/x86/handle.h @@ -11,8 +11,6 @@ #pragma once #include "src/fallback/handle.h" -#include "src/x86/profile.h" - #if MEGDNN_X86_WITH_MKL_DNN #include #endif @@ -22,8 +20,6 @@ namespace x86 { class HandleImpl : public fallback::HandleImpl { public: - const ProfileCache& profile_cache() { return m_profile_cache; } - HandleImpl(megcoreComputingHandle_t computing_handle, HandleType type = HandleType::X86); @@ -37,7 +33,6 @@ public: #endif private: - ProfileCache m_profile_cache = get_profile_cache(); #if MEGDNN_X86_WITH_MKL_DNN dnnl::engine m_mkldnn_engine; dnnl::stream m_mkldnn_stream; diff --git a/dnn/src/x86/profile.cpp b/dnn/src/x86/profile.cpp deleted file mode 100644 index b5eb532c115f39bddd9196c2b9304724173e2574..0000000000000000000000000000000000000000 --- a/dnn/src/x86/profile.cpp +++ /dev/null @@ -1,324 +0,0 @@ -/** - * \file dnn/src/x86/profile.cpp - * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") - * - * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - */ -#include "src/x86/profile.h" - -namespace megdnn { -namespace x86 { - -ProfileCache get_profile_cache() -{ - ProfileCache vec; - vec.clear(); - vec.reserve(294); - vec.push_back(ProfileElement(2, 4, 4, 49)); - vec.push_back(ProfileElement(2, 4, 8, 25)); - vec.push_back(ProfileElement(2, 4, 16, 19)); - vec.push_back(ProfileElement(2, 4, 32, 14)); - vec.push_back(ProfileElement(2, 4, 64, 13)); - vec.push_back(ProfileElement(2, 4, 96, 15)); - vec.push_back(ProfileElement(2, 4, 128, 15)); - vec.push_back(ProfileElement(2, 8, 4, 241)); - vec.push_back(ProfileElement(2, 8, 8, 121)); - vec.push_back(ProfileElement(2, 8, 16, 57)); - vec.push_back(ProfileElement(2, 8, 32, 29)); - vec.push_back(ProfileElement(2, 8, 64, 17)); - vec.push_back(ProfileElement(2, 8, 96, 39)); - vec.push_back(ProfileElement(2, 8, 128, 29)); - vec.push_back(ProfileElement(2, 16, 4, 1000000000)); - vec.push_back(ProfileElement(2, 16, 8, 273)); - vec.push_back(ProfileElement(2, 16, 16, 177)); - vec.push_back(ProfileElement(2, 16, 32, 137)); - vec.push_back(ProfileElement(2, 16, 64, 1000000000)); - vec.push_back(ProfileElement(2, 16, 96, 1000000000)); - vec.push_back(ProfileElement(2, 16, 128, 1000000000)); - vec.push_back(ProfileElement(2, 32, 4, 1000000000)); - vec.push_back(ProfileElement(2, 32, 8, 1000000000)); - vec.push_back(ProfileElement(2, 32, 16, 1000000000)); - vec.push_back(ProfileElement(2, 32, 32, 1000000000)); - vec.push_back(ProfileElement(2, 32, 64, 1000000000)); - vec.push_back(ProfileElement(2, 32, 96, 1000000000)); - vec.push_back(ProfileElement(2, 32, 128, 1000000000)); - vec.push_back(ProfileElement(2, 64, 4, 1000000000)); - vec.push_back(ProfileElement(2, 64, 8, 1000000000)); - vec.push_back(ProfileElement(2, 64, 16, 1000000000)); - vec.push_back(ProfileElement(2, 64, 32, 1000000000)); - vec.push_back(ProfileElement(2, 64, 64, 1000000000)); - vec.push_back(ProfileElement(2, 64, 96, 1000000000)); - vec.push_back(ProfileElement(2, 64, 128, 1000000000)); - vec.push_back(ProfileElement(2, 96, 4, 1000000000)); - vec.push_back(ProfileElement(2, 96, 8, 1000000000)); - vec.push_back(ProfileElement(2, 96, 16, 1000000000)); - vec.push_back(ProfileElement(2, 96, 32, 1000000000)); - vec.push_back(ProfileElement(2, 96, 64, 1000000000)); - vec.push_back(ProfileElement(2, 96, 96, 1000000000)); - vec.push_back(ProfileElement(2, 96, 128, 1000000000)); - vec.push_back(ProfileElement(2, 128, 4, 1000000000)); - vec.push_back(ProfileElement(2, 128, 8, 1000000000)); - vec.push_back(ProfileElement(2, 128, 16, 1000000000)); - vec.push_back(ProfileElement(2, 128, 32, 1000000000)); - vec.push_back(ProfileElement(2, 128, 64, 1000000000)); - vec.push_back(ProfileElement(2, 128, 96, 1000000000)); - vec.push_back(ProfileElement(2, 128, 128, 1000000000)); - vec.push_back(ProfileElement(3, 4, 4, 10)); - vec.push_back(ProfileElement(3, 4, 8, 5)); - vec.push_back(ProfileElement(3, 4, 16, 7)); - vec.push_back(ProfileElement(3, 4, 32, 7)); - vec.push_back(ProfileElement(3, 4, 64, 6)); - vec.push_back(ProfileElement(3, 4, 96, 5)); - vec.push_back(ProfileElement(3, 4, 128, 5)); - vec.push_back(ProfileElement(3, 8, 4, 14)); - vec.push_back(ProfileElement(3, 8, 8, 13)); - vec.push_back(ProfileElement(3, 8, 16, 13)); - vec.push_back(ProfileElement(3, 8, 32, 13)); - vec.push_back(ProfileElement(3, 8, 64, 11)); - vec.push_back(ProfileElement(3, 8, 96, 11)); - vec.push_back(ProfileElement(3, 8, 128, 12)); - vec.push_back(ProfileElement(3, 16, 4, 37)); - vec.push_back(ProfileElement(3, 16, 8, 29)); - vec.push_back(ProfileElement(3, 16, 16, 21)); - vec.push_back(ProfileElement(3, 16, 32, 19)); - vec.push_back(ProfileElement(3, 16, 64, 14)); - vec.push_back(ProfileElement(3, 16, 96, 13)); - vec.push_back(ProfileElement(3, 16, 128, 13)); - vec.push_back(ProfileElement(3, 32, 4, 69)); - vec.push_back(ProfileElement(3, 32, 8, 105)); - vec.push_back(ProfileElement(3, 32, 16, 105)); - vec.push_back(ProfileElement(3, 32, 32, 49)); - vec.push_back(ProfileElement(3, 32, 64, 29)); - vec.push_back(ProfileElement(3, 32, 96, 27)); - vec.push_back(ProfileElement(3, 32, 128, 39)); - vec.push_back(ProfileElement(3, 64, 4, 193)); - vec.push_back(ProfileElement(3, 64, 8, 161)); - vec.push_back(ProfileElement(3, 64, 16, 137)); - vec.push_back(ProfileElement(3, 64, 32, 113)); - vec.push_back(ProfileElement(3, 64, 64, 1000000000)); - vec.push_back(ProfileElement(3, 64, 96, 1000000000)); - vec.push_back(ProfileElement(3, 64, 128, 1000000000)); - vec.push_back(ProfileElement(3, 96, 4, 1000000000)); - vec.push_back(ProfileElement(3, 96, 8, 305)); - vec.push_back(ProfileElement(3, 96, 16, 1000000000)); - vec.push_back(ProfileElement(3, 96, 32, 1000000000)); - vec.push_back(ProfileElement(3, 96, 64, 1000000000)); - vec.push_back(ProfileElement(3, 96, 96, 1000000000)); - vec.push_back(ProfileElement(3, 96, 128, 1000000000)); - vec.push_back(ProfileElement(3, 128, 4, 1000000000)); - vec.push_back(ProfileElement(3, 128, 8, 1000000000)); - vec.push_back(ProfileElement(3, 128, 16, 1000000000)); - vec.push_back(ProfileElement(3, 128, 32, 1000000000)); - vec.push_back(ProfileElement(3, 128, 64, 1000000000)); - vec.push_back(ProfileElement(3, 128, 96, 1000000000)); - vec.push_back(ProfileElement(3, 128, 128, 1000000000)); - vec.push_back(ProfileElement(4, 4, 4, 7)); - vec.push_back(ProfileElement(4, 4, 8, 7)); - vec.push_back(ProfileElement(4, 4, 16, 5)); - vec.push_back(ProfileElement(4, 4, 32, 6)); - vec.push_back(ProfileElement(4, 4, 64, 5)); - vec.push_back(ProfileElement(4, 4, 96, 5)); - vec.push_back(ProfileElement(4, 4, 128, 5)); - vec.push_back(ProfileElement(4, 8, 4, 14)); - vec.push_back(ProfileElement(4, 8, 8, 12)); - vec.push_back(ProfileElement(4, 8, 16, 5)); - vec.push_back(ProfileElement(4, 8, 32, 6)); - vec.push_back(ProfileElement(4, 8, 64, 6)); - vec.push_back(ProfileElement(4, 8, 96, 6)); - vec.push_back(ProfileElement(4, 8, 128, 5)); - vec.push_back(ProfileElement(4, 16, 4, 14)); - vec.push_back(ProfileElement(4, 16, 8, 14)); - vec.push_back(ProfileElement(4, 16, 16, 13)); - vec.push_back(ProfileElement(4, 16, 32, 13)); - vec.push_back(ProfileElement(4, 16, 64, 13)); - vec.push_back(ProfileElement(4, 16, 96, 13)); - vec.push_back(ProfileElement(4, 16, 128, 13)); - vec.push_back(ProfileElement(4, 32, 4, 37)); - vec.push_back(ProfileElement(4, 32, 8, 31)); - vec.push_back(ProfileElement(4, 32, 16, 29)); - vec.push_back(ProfileElement(4, 32, 32, 21)); - vec.push_back(ProfileElement(4, 32, 64, 21)); - vec.push_back(ProfileElement(4, 32, 96, 29)); - vec.push_back(ProfileElement(4, 32, 128, 21)); - vec.push_back(ProfileElement(4, 64, 4, 137)); - vec.push_back(ProfileElement(4, 64, 8, 113)); - vec.push_back(ProfileElement(4, 64, 16, 89)); - vec.push_back(ProfileElement(4, 64, 32, 69)); - vec.push_back(ProfileElement(4, 64, 64, 45)); - vec.push_back(ProfileElement(4, 64, 96, 37)); - vec.push_back(ProfileElement(4, 64, 128, 35)); - vec.push_back(ProfileElement(4, 96, 4, 137)); - vec.push_back(ProfileElement(4, 96, 8, 113)); - vec.push_back(ProfileElement(4, 96, 16, 105)); - vec.push_back(ProfileElement(4, 96, 32, 77)); - vec.push_back(ProfileElement(4, 96, 64, 53)); - vec.push_back(ProfileElement(4, 96, 96, 45)); - vec.push_back(ProfileElement(4, 96, 128, 39)); - vec.push_back(ProfileElement(4, 128, 4, 137)); - vec.push_back(ProfileElement(4, 128, 8, 121)); - vec.push_back(ProfileElement(4, 128, 16, 153)); - vec.push_back(ProfileElement(4, 128, 32, 97)); - vec.push_back(ProfileElement(4, 128, 64, 1000000000)); - vec.push_back(ProfileElement(4, 128, 96, 1000000000)); - vec.push_back(ProfileElement(4, 128, 128, 1000000000)); - vec.push_back(ProfileElement(5, 4, 4, 8)); - vec.push_back(ProfileElement(5, 4, 8, 9)); - vec.push_back(ProfileElement(5, 4, 16, 5)); - vec.push_back(ProfileElement(5, 4, 32, 5)); - vec.push_back(ProfileElement(5, 4, 64, 5)); - vec.push_back(ProfileElement(5, 4, 96, 5)); - vec.push_back(ProfileElement(5, 4, 128, 5)); - vec.push_back(ProfileElement(5, 8, 4, 7)); - vec.push_back(ProfileElement(5, 8, 8, 6)); - vec.push_back(ProfileElement(5, 8, 16, 5)); - vec.push_back(ProfileElement(5, 8, 32, 5)); - vec.push_back(ProfileElement(5, 8, 64, 5)); - vec.push_back(ProfileElement(5, 8, 96, 5)); - vec.push_back(ProfileElement(5, 8, 128, 5)); - vec.push_back(ProfileElement(5, 16, 4, 21)); - vec.push_back(ProfileElement(5, 16, 8, 12)); - vec.push_back(ProfileElement(5, 16, 16, 12)); - vec.push_back(ProfileElement(5, 16, 32, 11)); - vec.push_back(ProfileElement(5, 16, 64, 11)); - vec.push_back(ProfileElement(5, 16, 96, 11)); - vec.push_back(ProfileElement(5, 16, 128, 11)); - vec.push_back(ProfileElement(5, 32, 4, 23)); - vec.push_back(ProfileElement(5, 32, 8, 14)); - vec.push_back(ProfileElement(5, 32, 16, 14)); - vec.push_back(ProfileElement(5, 32, 32, 13)); - vec.push_back(ProfileElement(5, 32, 64, 13)); - vec.push_back(ProfileElement(5, 32, 96, 13)); - vec.push_back(ProfileElement(5, 32, 128, 13)); - vec.push_back(ProfileElement(5, 64, 4, 77)); - vec.push_back(ProfileElement(5, 64, 8, 39)); - vec.push_back(ProfileElement(5, 64, 16, 37)); - vec.push_back(ProfileElement(5, 64, 32, 29)); - vec.push_back(ProfileElement(5, 64, 64, 29)); - vec.push_back(ProfileElement(5, 64, 96, 21)); - vec.push_back(ProfileElement(5, 64, 128, 21)); - vec.push_back(ProfileElement(5, 96, 4, 113)); - vec.push_back(ProfileElement(5, 96, 8, 77)); - vec.push_back(ProfileElement(5, 96, 16, 61)); - vec.push_back(ProfileElement(5, 96, 32, 39)); - vec.push_back(ProfileElement(5, 96, 64, 37)); - vec.push_back(ProfileElement(5, 96, 96, 31)); - vec.push_back(ProfileElement(5, 96, 128, 29)); - vec.push_back(ProfileElement(5, 128, 4, 113)); - vec.push_back(ProfileElement(5, 128, 8, 97)); - vec.push_back(ProfileElement(5, 128, 16, 69)); - vec.push_back(ProfileElement(5, 128, 32, 53)); - vec.push_back(ProfileElement(5, 128, 64, 39)); - vec.push_back(ProfileElement(5, 128, 96, 31)); - vec.push_back(ProfileElement(5, 128, 128, 31)); - vec.push_back(ProfileElement(6, 4, 4, 7)); - vec.push_back(ProfileElement(6, 4, 8, 3)); - vec.push_back(ProfileElement(6, 4, 16, 5)); - vec.push_back(ProfileElement(6, 4, 32, 4)); - vec.push_back(ProfileElement(6, 4, 64, 5)); - vec.push_back(ProfileElement(6, 4, 96, 4)); - vec.push_back(ProfileElement(6, 4, 128, 4)); - vec.push_back(ProfileElement(6, 8, 4, 11)); - vec.push_back(ProfileElement(6, 8, 8, 5)); - vec.push_back(ProfileElement(6, 8, 16, 5)); - vec.push_back(ProfileElement(6, 8, 32, 5)); - vec.push_back(ProfileElement(6, 8, 64, 5)); - vec.push_back(ProfileElement(6, 8, 96, 5)); - vec.push_back(ProfileElement(6, 8, 128, 5)); - vec.push_back(ProfileElement(6, 16, 4, 13)); - vec.push_back(ProfileElement(6, 16, 8, 11)); - vec.push_back(ProfileElement(6, 16, 16, 11)); - vec.push_back(ProfileElement(6, 16, 32, 5)); - vec.push_back(ProfileElement(6, 16, 64, 5)); - vec.push_back(ProfileElement(6, 16, 96, 5)); - vec.push_back(ProfileElement(6, 16, 128, 11)); - vec.push_back(ProfileElement(6, 32, 4, 21)); - vec.push_back(ProfileElement(6, 32, 8, 14)); - vec.push_back(ProfileElement(6, 32, 16, 13)); - vec.push_back(ProfileElement(6, 32, 32, 13)); - vec.push_back(ProfileElement(6, 32, 64, 13)); - vec.push_back(ProfileElement(6, 32, 96, 13)); - vec.push_back(ProfileElement(6, 32, 128, 13)); - vec.push_back(ProfileElement(6, 64, 4, 39)); - vec.push_back(ProfileElement(6, 64, 8, 29)); - vec.push_back(ProfileElement(6, 64, 16, 29)); - vec.push_back(ProfileElement(6, 64, 32, 21)); - vec.push_back(ProfileElement(6, 64, 64, 21)); - vec.push_back(ProfileElement(6, 64, 96, 21)); - vec.push_back(ProfileElement(6, 64, 128, 21)); - vec.push_back(ProfileElement(6, 96, 4, 97)); - vec.push_back(ProfileElement(6, 96, 8, 61)); - vec.push_back(ProfileElement(6, 96, 16, 39)); - vec.push_back(ProfileElement(6, 96, 32, 37)); - vec.push_back(ProfileElement(6, 96, 64, 29)); - vec.push_back(ProfileElement(6, 96, 96, 29)); - vec.push_back(ProfileElement(6, 96, 128, 21)); - vec.push_back(ProfileElement(6, 128, 4, 77)); - vec.push_back(ProfileElement(6, 128, 8, 61)); - vec.push_back(ProfileElement(6, 128, 16, 39)); - vec.push_back(ProfileElement(6, 128, 32, 37)); - vec.push_back(ProfileElement(6, 128, 64, 29)); - vec.push_back(ProfileElement(6, 128, 96, 29)); - vec.push_back(ProfileElement(6, 128, 128, 23)); - vec.push_back(ProfileElement(7, 4, 4, 5)); - vec.push_back(ProfileElement(7, 4, 8, 4)); - vec.push_back(ProfileElement(7, 4, 16, 4)); - vec.push_back(ProfileElement(7, 4, 32, 4)); - vec.push_back(ProfileElement(7, 4, 64, 4)); - vec.push_back(ProfileElement(7, 4, 96, 4)); - vec.push_back(ProfileElement(7, 4, 128, 3)); - vec.push_back(ProfileElement(7, 8, 4, 5)); - vec.push_back(ProfileElement(7, 8, 8, 5)); - vec.push_back(ProfileElement(7, 8, 16, 5)); - vec.push_back(ProfileElement(7, 8, 32, 5)); - vec.push_back(ProfileElement(7, 8, 64, 5)); - vec.push_back(ProfileElement(7, 8, 96, 5)); - vec.push_back(ProfileElement(7, 8, 128, 5)); - vec.push_back(ProfileElement(7, 16, 4, 13)); - vec.push_back(ProfileElement(7, 16, 8, 11)); - vec.push_back(ProfileElement(7, 16, 16, 5)); - vec.push_back(ProfileElement(7, 16, 32, 5)); - vec.push_back(ProfileElement(7, 16, 64, 5)); - vec.push_back(ProfileElement(7, 16, 96, 5)); - vec.push_back(ProfileElement(7, 16, 128, 5)); - vec.push_back(ProfileElement(7, 32, 4, 21)); - vec.push_back(ProfileElement(7, 32, 8, 13)); - vec.push_back(ProfileElement(7, 32, 16, 13)); - vec.push_back(ProfileElement(7, 32, 32, 13)); - vec.push_back(ProfileElement(7, 32, 64, 13)); - vec.push_back(ProfileElement(7, 32, 96, 13)); - vec.push_back(ProfileElement(7, 32, 128, 12)); - vec.push_back(ProfileElement(7, 64, 4, 37)); - vec.push_back(ProfileElement(7, 64, 8, 21)); - vec.push_back(ProfileElement(7, 64, 16, 14)); - vec.push_back(ProfileElement(7, 64, 32, 14)); - vec.push_back(ProfileElement(7, 64, 64, 14)); - vec.push_back(ProfileElement(7, 64, 96, 13)); - vec.push_back(ProfileElement(7, 64, 128, 14)); - vec.push_back(ProfileElement(7, 96, 4, 61)); - vec.push_back(ProfileElement(7, 96, 8, 39)); - vec.push_back(ProfileElement(7, 96, 16, 37)); - vec.push_back(ProfileElement(7, 96, 32, 31)); - vec.push_back(ProfileElement(7, 96, 64, 21)); - vec.push_back(ProfileElement(7, 96, 96, 21)); - vec.push_back(ProfileElement(7, 96, 128, 21)); - vec.push_back(ProfileElement(7, 128, 4, 61)); - vec.push_back(ProfileElement(7, 128, 8, 31)); - vec.push_back(ProfileElement(7, 128, 16, 37)); - vec.push_back(ProfileElement(7, 128, 32, 11)); - vec.push_back(ProfileElement(7, 128, 64, 13)); - vec.push_back(ProfileElement(7, 128, 96, 23)); - vec.push_back(ProfileElement(7, 128, 128, 21)); - return vec; -} - -} // namespace fallback -} // namespace megdnn - -// vim: syntax=cpp.doxygen - - - diff --git a/dnn/src/x86/profile.h b/dnn/src/x86/profile.h deleted file mode 100644 index c01aa2d5a0da7f0da38bc5687f202b3e7cc31d02..0000000000000000000000000000000000000000 --- a/dnn/src/x86/profile.h +++ /dev/null @@ -1,45 +0,0 @@ -/** - * \file dnn/src/x86/profile.h - * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") - * - * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - */ -#pragma once -#include - -namespace megdnn { -namespace x86 { - -struct ProfileElement { - // when output_size > on_threshold, DIRECT is faster, - // otherwise MATRIX_MUL is faster - int f, ic, oc, on_threshold; - ProfileElement(int f, int ic, int oc, int on_threshold): - f(f), ic(ic), oc(oc), on_threshold(on_threshold) - { - } - bool operator<(const ProfileElement &rhs) const - { - if (this->f < rhs.f) return true; - if (this->f > rhs.f) return false; - if (this->ic < rhs.ic) return true; - if (this->ic > rhs.ic) return false; - if (this->oc < rhs.oc) return true; - if (this->oc > rhs.oc) return false; - return false; - } -}; -using ProfileCache = std::vector; - -ProfileCache get_profile_cache(); - -} // namespace fallback -} // namespace megdnn - -// vim: syntax=cpp.doxygen - - diff --git a/dnn/src/x86/separable_conv/opr_impl.cpp b/dnn/src/x86/separable_conv/opr_impl.cpp index 8d43664cfea1a5a17417038314ca6df4f305e2ba..2926af58608e65e293ef5b3f10244c3e4aa96bcf 100644 --- a/dnn/src/x86/separable_conv/opr_impl.cpp +++ b/dnn/src/x86/separable_conv/opr_impl.cpp @@ -63,7 +63,6 @@ #include "./sep_conv_filter.h" #include "src/common/utils.h" #include "src/x86/utils.h" -#include "src/x86/profile.h" #include "src/x86/handle.h" #include diff --git a/dnn/src/x86/separable_filter/opr_impl.cpp b/dnn/src/x86/separable_filter/opr_impl.cpp index 81dcb39dda1d9d17fd0c066858406f02cbb3b835..eba9c356f685e9d52fa87cf272a30c54bb971014 100644 --- a/dnn/src/x86/separable_filter/opr_impl.cpp +++ b/dnn/src/x86/separable_filter/opr_impl.cpp @@ -14,7 +14,6 @@ #include "src/common/cv/helper.h" #include "src/common/utils.h" #include "src/x86/utils.h" -#include "src/x86/profile.h" #include "src/x86/handle.h" #include diff --git a/dnn/test/x86/conv_bias.cpp b/dnn/test/x86/conv_bias.cpp index df8c540425e82317ac18a1b7d4099147e0c8b722..e17582b591bc8a94562626e82e0b48a901f76ce0 100644 --- a/dnn/test/x86/conv_bias.cpp +++ b/dnn/test/x86/conv_bias.cpp @@ -1599,73 +1599,6 @@ TEST_F(X86_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_QINT8_FILTER_PREPROCESS) { #undef cb } - -TEST_F(X86, CONV_BIAS_MATMUL) { - using namespace conv_bias; - std::vector args; - - auto run = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel, - size_t p, NonlineMode nonline_mode) { - if (w + 2 * p < kernel || h + 2 * p < kernel) - return; - param::ConvBias param; - param.stride_h = 1; - param.stride_w = 1; - param.pad_h = p; - param.pad_w = p; - param.nonlineMode = nonline_mode; - - //! no bias - param.sparse = param::ConvBias::Sparse::DENSE; - args.emplace_back(param, TensorShape{1, ic, h, w}, - TensorShape{oc, ic, kernel, kernel}, TensorShape{}); - //! bias channel - args.emplace_back(param, TensorShape{2, ic, h, w}, - TensorShape{oc, ic, kernel, kernel}, - TensorShape{1, oc, 1, 1}); - //! bias - args.emplace_back(param, TensorShape{2, ic, h, w}, - TensorShape{oc, ic, kernel, kernel}, - TensorShape{2, oc, (h + param.pad_h * 2 - kernel) + 1, - (w + param.pad_w * 2 - kernel) + 1}); - //! gruop - param.sparse = param::ConvBias::Sparse::GROUP; - args.emplace_back( - param, TensorShape{2, 2 * ic, h, w}, - TensorShape{2, oc, ic, kernel, kernel}, - TensorShape{2, 2 * oc, (h + param.pad_h * 2 - kernel) + 1, - (w + param.pad_w * 2 - kernel) + 1}); - }; - - for (size_t kernel : {2, 3, 5, 7}) - for (size_t ic : {1, 2, 3, 4}) - for (size_t oc : {1, 2, 3, 4}) - for (size_t p : {0, 2}) - for (size_t size : {20, 21, 22, 23, 24}) - for (NonlineMode nonline_mode : - {NonlineMode::RELU, NonlineMode::SIGMOID, - NonlineMode::H_SWISH, NonlineMode::IDENTITY}) { - run(oc, ic, size, size, kernel, p, nonline_mode); - } - - Checker checker(handle()); - checker.set_before_exec_callback( - conv_bias::ConvBiasAlgoChecker( - "X86_CONV_BIAS_MATMUL")); - checker.set_epsilon(1); - UniformIntRNG rng{-50, 50}; - checker.set_dtype(0, dtype::Float32()) - .set_dtype(1, dtype::Float32()) - .set_dtype(2, dtype::Float32()) - .set_rng(0, &rng) - .set_rng(1, &rng) - .set_rng(2, &rng); - - for (auto&& arg : args) { - checker.set_param(arg.param).exec( - {arg.src, arg.filter, arg.bias, {}, {}}); - } -} #if MEGDNN_WITH_BENCHMARK #if MEGDNN_X86_WITH_MKL_DNN static void x86_benchmark_fp32_mkldnn(Handle* handle) { diff --git a/dnn/test/x86/convolution.cpp b/dnn/test/x86/convolution.cpp index 97a22b096ed1bd6be6206d756d38470fc3d89740..9d42b5001fc8d4b6f360da7d8564a7bf609a7514 100644 --- a/dnn/test/x86/convolution.cpp +++ b/dnn/test/x86/convolution.cpp @@ -182,49 +182,6 @@ TEST_F(X86, DEFAULT_CONV_DIRECT_STRIDE2) { } } -TEST_F(X86, DEFAULT_CONV_MATMUL) { - using namespace convolution; - std::vector args; - - auto run = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel, - size_t p) { - if (w + 2 * p < kernel || h + 2 * p < kernel) - return; - param::Convolution param; - param.stride_h = 1; - param.stride_w = 1; - param.pad_h = p; - param.pad_w = p; - - //! no bias - args.emplace_back(param, TensorShape{1, ic, h, w}, - TensorShape{oc, ic, kernel, kernel}); - }; - - for (size_t kernel : {2, 3, 5, 7}) - for (size_t ic : {1, 2, 3, 4}) - for (size_t oc : {1, 2, 3, 4}) - for (size_t p : {0, 2}) - for (size_t size : {20, 21, 22, 23, 24}) { - run(oc, ic, size, size, kernel, p); - } - - Checker checker(handle()); - checker.set_before_exec_callback(AlgoChecker( - "CONVOLUTION_DEFAULT_X86_CONV_BIAS_MATMUL")); - UniformIntRNG rng{-50, 50}; - checker.set_dtype(0, dtype::Float32()) - .set_dtype(1, dtype::Float32()) - .set_dtype(2, dtype::Float32()) - .set_rng(0, &rng) - .set_rng(1, &rng) - .set_rng(2, &rng); - - for (auto&& arg : args) { - checker.set_param(arg.param).exec({arg.src, arg.filter, {}}); - } -} - #if MEGDNN_X86_WITH_MKL_DNN TEST_F(X86, CONVOLUTION_FORWARD_INT8) { Checker checker(handle());