Commit e39f9386 authored by Megvii Engine Team

refactor(dnn): remove ProfileCache and matmul algo in x86

GitOrigin-RevId: 55a700d7476a8cff404ef990eb39be393eaee5b5
Parent 28dbadf7
@@ -20,7 +20,6 @@
#include "src/x86/conv_bias/postprocess_helper.h"
#include "src/x86/convolution/convolution_direct_special_cases.h"
#include "src/x86/handle.h"
#include "src/x86/profile.h"
#include "midout.h"
@@ -487,153 +486,6 @@ SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoDirectStride2::get_kimpls(
const NCBKernSizeParam& param) const {
GET_KERN;
}
/* ===================== matmul algo ===================== */
WorkspaceBundle ConvBiasImpl::AlgoMatrixMul::get_bundle(
const NCBKernSizeParam& param) {
UNPACK_CONV_F32_NCB_KERN_SIZES(param);
MEGDNN_MARK_USED_VAR(N);
MEGDNN_MARK_USED_VAR(OC);
auto IH2 = IH + 2 * PH;
auto IW2 = IW + 2 * PW;
bool can_matrix_mul_direct =
(FH == 1 && FW == 1 && SH == 1 && SW == 1 && PH == 0 && PW == 0);
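// For a 1x1 filter with unit stride and no padding, the NCHW src is
// already laid out as the (IC x OH*OW) im2col matrix, so neither the
// padded copy nor the unrolled matrix below is needed.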
// temp space to store padding-free src (with 4 extra floats)
// temp space to store unrolled matrix (with 4 extra floats)
// workspace for matrix mul opr
size_t part0, part1, part2;
if (can_matrix_mul_direct) {
part0 = part1 = 0;
} else {
part0 = (IC * IH2 * IW2 + 4) * sizeof(float);
part1 = (IC * FH * FW * OH * OW + 4) * sizeof(float);
}
{
TensorLayout A_, B_, C_;
A_ = TensorLayout({OC, IC * FH * FW}, dtype::Float32());
B_ = TensorLayout({IC * FH * FW, OH * OW}, dtype::Float32());
C_ = TensorLayout({OC, OH * OW}, dtype::Float32());
part2 = get_matmul_opr()->get_workspace_in_bytes(A_, B_, C_);
}
return {nullptr, {part0, part1, part2}};
}
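// A minimal standalone sketch (an illustration added for this writeup, not
// part of the original sources) of how the three parts above come out for an
// assumed shape IC=16, IH=IW=32, FH=FW=3, PH=PW=1, SH=SW=1, hence OH=OW=32;
// the names mirror get_bundle above.
static void workspace_sizes_sketch() {
    size_t IC = 16, IH = 32, IW = 32, FH = 3, FW = 3, PH = 1, PW = 1;
    size_t OH = IH + 2 * PH - FH + 1, OW = IW + 2 * PW - FW + 1;  // 32 x 32
    size_t IH2 = IH + 2 * PH, IW2 = IW + 2 * PW;  // zero-padded src: 34 x 34
    size_t part0 = (IC * IH2 * IW2 + 4) * sizeof(float);          // 74000 B
    size_t part1 = (IC * FH * FW * OH * OW + 4) * sizeof(float);  // 589840 B
    // part2 is whatever get_matmul_opr()->get_workspace_in_bytes(...) reports.
    (void)part0;
    (void)part1;
}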
bool ConvBiasImpl::AlgoMatrixMul::is_preferred(
const NCBKernSizeParam& param) const {
auto&& fm = param.filter_meta;
if (fm.dilation[0] != 1 || fm.dilation[1] != 1) {
return false;
}
// single channel conv should never use matrix mul
if (fm.ocpg == 1 || fm.icpg == 1)
return false;
// 1x1 conv should always use matrix mul
if (fm.spatial[0] == 1 && fm.spatial[1] == 1)
return true;
// if stride is not 1x1, always use matrix mul
if (fm.stride[0] != 1 || fm.stride[1] != 1)
return true;
int f = find_nearest_elem<int>(
std::round(geometric_mean(fm.spatial[0], fm.spatial[1])),
{2, 3, 4, 5, 6, 7});
int oc = find_nearest_elem<int>(fm.ocpg, {4, 8, 16, 32, 64, 96, 128});
int ic = find_nearest_elem<int>(fm.icpg, {4, 8, 16, 32, 64, 96, 128});
int on = std::round(geometric_mean(param.osz[0], param.osz[1]));
ProfileElement cur(f, oc, ic, on);
auto H = static_cast<HandleImpl*>(inplace_cpu_handle().get());
auto&& target = std::lower_bound(H->profile_cache().begin(),
H->profile_cache().end(), cur);
megdnn_assert_internal(target->f == cur.f);
megdnn_assert_internal(target->oc == cur.oc);
megdnn_assert_internal(target->ic == cur.ic);
return on < target->on_threshold;
}
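// A hedged sketch of the grid snapping that is_preferred relies on.
// find_nearest_elem is not shown in this diff; it is assumed here to return
// the candidate closest to x (requires <algorithm>, <cmath>,
// <initializer_list>).
template <typename T>
T find_nearest_elem_sketch(T x, std::initializer_list<T> candidates) {
    return *std::min_element(
            candidates.begin(), candidates.end(),
            [&](T a, T b) { return std::abs(a - x) < std::abs(b - x); });
}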
MatrixMul* ConvBiasImpl::AlgoMatrixMul::get_matmul_opr() {
static CpuOprDelegationStorage<> storage;
return storage.get<MatrixMul>();
}
void ConvBiasImpl::AlgoMatrixMul::kimpl(const NCBKernParam& param,
const NCBKernIndex& ncb_index) {
UNPACK_CONV_F32_NCB_KERN_SIZES(param);
auto IH2 = IH + 2 * PH;
auto IW2 = IW + 2 * PW;
size_t group_id = ncb_index.ndrange_id[0];
bool is_xcorr = !param.filter_meta.should_flip;
auto bundle = get_bundle(param);
bundle.set(param.workspace_ptr);
// workspace layout: part0 = zero-padded src copy (src2),
// part1 = im2col matrix (B), part2 = matmul workspace
for (size_t n = 0; n < N; ++n) {
float* src = const_cast<float*>(param.src<float>(n, group_id));
float* dst = param.dst<float>(n, group_id);
float* bias_ptr = static_cast<float*>(
const_cast<void*>(param.bias<void>(n, group_id)));
float *B, *src2;
if (FH == 1 && FW == 1 && SH == 1 && SW == 1 && PH == 0 && PW == 0) {
// special case: 1x1
B = src;
} else {
src2 = static_cast<float*>(bundle.get(0));
// copy src into src2, adding zero padding around each channel
float* src2_ptr = src2;
const float* src_ptr = src;
rep(ic, IC) {
if (PH != 0) {
std::memset(src2_ptr, 0, sizeof(float) * PH * IW2);
src2_ptr += PH * IW2;
}
rep(ih, IH) {
if (PW != 0)
rep(pw, PW) *src2_ptr++ = 0.0f;
std::memcpy(src2_ptr, src_ptr, sizeof(float) * IW);
src2_ptr += IW;
src_ptr += IW;
if (PW != 0)
rep(pw, PW) *src2_ptr++ = 0.0f;
}
if (PH != 0) {
std::memset(src2_ptr, 0, sizeof(float) * PH * IW2);
src2_ptr += PH * IW2;
}
}
B = static_cast<float*>(bundle.get(1));
if (SH == 1 && SW == 1) {
if (is_xcorr) {
img2col<true>(src2, B, OC, OH, OW, IC, IH2, IW2, FH, FW);
} else {
img2col<false>(src2, B, OC, OH, OW, IC, IH2, IW2, FH, FW);
}
} else {
if (is_xcorr) {
img2col_stride<true>(src2, B, OC, OH, OW, IC, IH2, IW2, FH,
FW, SH, SW);
} else {
img2col_stride<false>(src2, B, OC, OH, OW, IC, IH2, IW2, FH,
FW, SH, SW);
}
}
}
{
TensorND A_, B_, C_;
A_.layout = TensorLayout({OC, IC * FH * FW}, dtype::Float32());
A_.raw_ptr = const_cast<float*>(param.filter<float>(group_id));
B_.layout = TensorLayout({IC * FH * FW, OH * OW}, dtype::Float32());
B_.raw_ptr = B;
C_.layout = TensorLayout({OC, OH * OW}, dtype::Float32());
C_.raw_ptr = dst;
Workspace workspace(static_cast<dt_byte*>(bundle.get(2)),
bundle.get_size(2));
get_matmul_opr()->exec(A_, B_, C_, workspace);
}
PostProcess<float>::run(dst, bias_ptr, dst, param.bias_mode,
param.nonlineMode, param.bias_type,
param.dst_type, 1_z, OC, OH, OW);
}
}
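// For clarity, a naive reference (an illustration added here, not part of
// the original sources) of the im2col layout the kernel above produces:
// row (ic*FH + fh)*FW + fw of B holds the input-window element for every
// output position, so dst = filter(OC x IC*FH*FW) * B(IC*FH*FW x OH*OW).
// Cross-correlation is assumed; src2 is already zero-padded.
static void im2col_ref_sketch(const float* src2, float* B, size_t IC,
                              size_t IH2, size_t IW2, size_t FH, size_t FW,
                              size_t OH, size_t OW, size_t SH, size_t SW) {
    for (size_t ic = 0; ic < IC; ++ic)
        for (size_t fh = 0; fh < FH; ++fh)
            for (size_t fw = 0; fw < FW; ++fw)
                for (size_t oh = 0; oh < OH; ++oh)
                    for (size_t ow = 0; ow < OW; ++ow)
                        B[((ic * FH + fh) * FW + fw) * OH * OW + oh * OW + ow] =
                                src2[(ic * IH2 + oh * SH + fh) * IW2 +
                                     ow * SW + fw];
}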
#if MEGDNN_X86_WITH_MKL_DNN
static inline void mkldnn_fp32_conv_instance(
......
@@ -123,47 +123,6 @@ public:
MEGDNN_WINOGRAD_ALGO_FUN_DECLARE(AlgoDataType::FLOAT32);
};
/* ===================== matmul algo ===================== */
class ConvBiasImpl::AlgoMatrixMul final : public AlgoBase {
static MatrixMul* get_matmul_opr();
static WorkspaceBundle get_bundle(const NCBKernSizeParam& param);
static void kimpl(const NCBKernParam& param, const NCBKernIndex&);
public:
bool is_reproducible() const override { return true; }
const char* name() const override { return "X86_CONV_BIAS_MATMUL"; }
bool usable(const NCBKernSizeParam& param,
AlgoSelectionStrategy) const override {
auto&& fm = param.filter_meta;
return fm.format == Param::Format::NCHW && fm.spatial_ndim == 2 &&
param.src_type.enumv() == DTypeEnum::Float32 &&
param.filter_type.enumv() == DTypeEnum::Float32 &&
param.dst_type.enumv() == DTypeEnum::Float32 &&
fm.dilation[0] == 1 && fm.dilation[1] == 1 &&
//! The matmul opr is only used in single thread
//! TODO:support the no pack matmul algo in fallback im2col +
//! matmul
param.nr_threads == 1_z;
}
bool is_preferred(const NCBKernSizeParam&) const override;
size_t get_workspace(const NCBKernSizeParam& param) const override {
return get_bundle(param).total_size_in_bytes();
}
SmallVector<NCBKern> dispatch_kerns(
const NCBKernSizeParam& param) const override {
size_t group = param.filter_meta.group;
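//! ndrange {group, 1, 1}: one kernel instance per convolution group;
//! kimpl itself loops over the batch dimension.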
return {{kimpl, {group, 1_z, 1_z}}};
}
void* type() const override;
ConvAlgoTypePack get_algo_type() const override {
return {AlgoDataType::FLOAT32, AlgoCategory::IM2COL};
}
};
#if MEGDNN_X86_WITH_MKL_DNN
class ConvBiasImpl::AlgoMkldnnConv final : public AlgoBase {
static void kern_mkldnn_fp32(const NCBKernParam& param,
......
@@ -47,10 +47,6 @@ void* ConvBiasImpl::AlgoDirectStride2::type() const {
return x86_algo_type;
}
void* ConvBiasImpl::AlgoMatrixMul::type() const {
return x86_algo_type;
}
void* ConvBiasImpl::AlgoDirectAvx2Stride1Int8::type() const {
return x86_algo_type;
}
@@ -82,7 +78,6 @@ class ConvBiasImpl::AlgoPack : NonCopyableObj {
AlgoAVX2DirectConvStride2 avx2_stride2_direct;
AlgoChanWiseAvx2Stride1Qint8 avx2_stride1_chanwsie_qint8;
AlgoChanWiseAvx2Stride2Qint8 avx2_stride2_chanwsie_qint8;
AlgoMatrixMul matmul;
#if MEGDNN_X86_WITH_MKL_DNN
AlgoMkldnnMatmulQint8 mkldnn_matmul_qint8;
//! Because the mkldnnconv need handle
@@ -107,7 +102,6 @@ public:
all_algos.emplace_back(&avx2_stride2_chanwsie_qint8);
all_algos.emplace_back(&avx2_stride1_direct_int8);
all_algos.emplace_back(&avx2_stride2_direct);
all_algos.emplace_back(&matmul);
static CpuOprDelegationStorage<> storage;
auto matmul_opr = storage.get<MatrixMul>();
......
@@ -31,7 +31,6 @@ public:
class AlgoDirectStride2;
class AlgoFP32WinogradF63_8x8;
class AlgoFP32WinogradF23_8x8;
class AlgoMatrixMul;
class AlgoDirectAvx2Stride1Int8;
class AlgoAVX2DirectConvStride2;
class AlgoChanWiseAvx2Stride1Qint8;
......
@@ -11,8 +11,6 @@
#pragma once
#include "src/fallback/handle.h"
#include "src/x86/profile.h"
#if MEGDNN_X86_WITH_MKL_DNN
#include <mkldnn.hpp>
#endif
@@ -22,8 +20,6 @@ namespace x86 {
class HandleImpl : public fallback::HandleImpl {
public:
const ProfileCache& profile_cache() { return m_profile_cache; }
HandleImpl(megcoreComputingHandle_t computing_handle,
HandleType type = HandleType::X86);
@@ -37,7 +33,6 @@ public:
#endif
private:
ProfileCache m_profile_cache = get_profile_cache();
#if MEGDNN_X86_WITH_MKL_DNN
dnnl::engine m_mkldnn_engine;
dnnl::stream m_mkldnn_stream;
......
/**
* \file dnn/src/x86/profile.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*/
#include "src/x86/profile.h"
namespace megdnn {
namespace x86 {
ProfileCache get_profile_cache()
{
ProfileCache vec;
vec.clear();
vec.reserve(294);
vec.push_back(ProfileElement(2, 4, 4, 49));
vec.push_back(ProfileElement(2, 4, 8, 25));
vec.push_back(ProfileElement(2, 4, 16, 19));
vec.push_back(ProfileElement(2, 4, 32, 14));
vec.push_back(ProfileElement(2, 4, 64, 13));
vec.push_back(ProfileElement(2, 4, 96, 15));
vec.push_back(ProfileElement(2, 4, 128, 15));
vec.push_back(ProfileElement(2, 8, 4, 241));
vec.push_back(ProfileElement(2, 8, 8, 121));
vec.push_back(ProfileElement(2, 8, 16, 57));
vec.push_back(ProfileElement(2, 8, 32, 29));
vec.push_back(ProfileElement(2, 8, 64, 17));
vec.push_back(ProfileElement(2, 8, 96, 39));
vec.push_back(ProfileElement(2, 8, 128, 29));
vec.push_back(ProfileElement(2, 16, 4, 1000000000));
vec.push_back(ProfileElement(2, 16, 8, 273));
vec.push_back(ProfileElement(2, 16, 16, 177));
vec.push_back(ProfileElement(2, 16, 32, 137));
vec.push_back(ProfileElement(2, 16, 64, 1000000000));
vec.push_back(ProfileElement(2, 16, 96, 1000000000));
vec.push_back(ProfileElement(2, 16, 128, 1000000000));
vec.push_back(ProfileElement(2, 32, 4, 1000000000));
vec.push_back(ProfileElement(2, 32, 8, 1000000000));
vec.push_back(ProfileElement(2, 32, 16, 1000000000));
vec.push_back(ProfileElement(2, 32, 32, 1000000000));
vec.push_back(ProfileElement(2, 32, 64, 1000000000));
vec.push_back(ProfileElement(2, 32, 96, 1000000000));
vec.push_back(ProfileElement(2, 32, 128, 1000000000));
vec.push_back(ProfileElement(2, 64, 4, 1000000000));
vec.push_back(ProfileElement(2, 64, 8, 1000000000));
vec.push_back(ProfileElement(2, 64, 16, 1000000000));
vec.push_back(ProfileElement(2, 64, 32, 1000000000));
vec.push_back(ProfileElement(2, 64, 64, 1000000000));
vec.push_back(ProfileElement(2, 64, 96, 1000000000));
vec.push_back(ProfileElement(2, 64, 128, 1000000000));
vec.push_back(ProfileElement(2, 96, 4, 1000000000));
vec.push_back(ProfileElement(2, 96, 8, 1000000000));
vec.push_back(ProfileElement(2, 96, 16, 1000000000));
vec.push_back(ProfileElement(2, 96, 32, 1000000000));
vec.push_back(ProfileElement(2, 96, 64, 1000000000));
vec.push_back(ProfileElement(2, 96, 96, 1000000000));
vec.push_back(ProfileElement(2, 96, 128, 1000000000));
vec.push_back(ProfileElement(2, 128, 4, 1000000000));
vec.push_back(ProfileElement(2, 128, 8, 1000000000));
vec.push_back(ProfileElement(2, 128, 16, 1000000000));
vec.push_back(ProfileElement(2, 128, 32, 1000000000));
vec.push_back(ProfileElement(2, 128, 64, 1000000000));
vec.push_back(ProfileElement(2, 128, 96, 1000000000));
vec.push_back(ProfileElement(2, 128, 128, 1000000000));
vec.push_back(ProfileElement(3, 4, 4, 10));
vec.push_back(ProfileElement(3, 4, 8, 5));
vec.push_back(ProfileElement(3, 4, 16, 7));
vec.push_back(ProfileElement(3, 4, 32, 7));
vec.push_back(ProfileElement(3, 4, 64, 6));
vec.push_back(ProfileElement(3, 4, 96, 5));
vec.push_back(ProfileElement(3, 4, 128, 5));
vec.push_back(ProfileElement(3, 8, 4, 14));
vec.push_back(ProfileElement(3, 8, 8, 13));
vec.push_back(ProfileElement(3, 8, 16, 13));
vec.push_back(ProfileElement(3, 8, 32, 13));
vec.push_back(ProfileElement(3, 8, 64, 11));
vec.push_back(ProfileElement(3, 8, 96, 11));
vec.push_back(ProfileElement(3, 8, 128, 12));
vec.push_back(ProfileElement(3, 16, 4, 37));
vec.push_back(ProfileElement(3, 16, 8, 29));
vec.push_back(ProfileElement(3, 16, 16, 21));
vec.push_back(ProfileElement(3, 16, 32, 19));
vec.push_back(ProfileElement(3, 16, 64, 14));
vec.push_back(ProfileElement(3, 16, 96, 13));
vec.push_back(ProfileElement(3, 16, 128, 13));
vec.push_back(ProfileElement(3, 32, 4, 69));
vec.push_back(ProfileElement(3, 32, 8, 105));
vec.push_back(ProfileElement(3, 32, 16, 105));
vec.push_back(ProfileElement(3, 32, 32, 49));
vec.push_back(ProfileElement(3, 32, 64, 29));
vec.push_back(ProfileElement(3, 32, 96, 27));
vec.push_back(ProfileElement(3, 32, 128, 39));
vec.push_back(ProfileElement(3, 64, 4, 193));
vec.push_back(ProfileElement(3, 64, 8, 161));
vec.push_back(ProfileElement(3, 64, 16, 137));
vec.push_back(ProfileElement(3, 64, 32, 113));
vec.push_back(ProfileElement(3, 64, 64, 1000000000));
vec.push_back(ProfileElement(3, 64, 96, 1000000000));
vec.push_back(ProfileElement(3, 64, 128, 1000000000));
vec.push_back(ProfileElement(3, 96, 4, 1000000000));
vec.push_back(ProfileElement(3, 96, 8, 305));
vec.push_back(ProfileElement(3, 96, 16, 1000000000));
vec.push_back(ProfileElement(3, 96, 32, 1000000000));
vec.push_back(ProfileElement(3, 96, 64, 1000000000));
vec.push_back(ProfileElement(3, 96, 96, 1000000000));
vec.push_back(ProfileElement(3, 96, 128, 1000000000));
vec.push_back(ProfileElement(3, 128, 4, 1000000000));
vec.push_back(ProfileElement(3, 128, 8, 1000000000));
vec.push_back(ProfileElement(3, 128, 16, 1000000000));
vec.push_back(ProfileElement(3, 128, 32, 1000000000));
vec.push_back(ProfileElement(3, 128, 64, 1000000000));
vec.push_back(ProfileElement(3, 128, 96, 1000000000));
vec.push_back(ProfileElement(3, 128, 128, 1000000000));
vec.push_back(ProfileElement(4, 4, 4, 7));
vec.push_back(ProfileElement(4, 4, 8, 7));
vec.push_back(ProfileElement(4, 4, 16, 5));
vec.push_back(ProfileElement(4, 4, 32, 6));
vec.push_back(ProfileElement(4, 4, 64, 5));
vec.push_back(ProfileElement(4, 4, 96, 5));
vec.push_back(ProfileElement(4, 4, 128, 5));
vec.push_back(ProfileElement(4, 8, 4, 14));
vec.push_back(ProfileElement(4, 8, 8, 12));
vec.push_back(ProfileElement(4, 8, 16, 5));
vec.push_back(ProfileElement(4, 8, 32, 6));
vec.push_back(ProfileElement(4, 8, 64, 6));
vec.push_back(ProfileElement(4, 8, 96, 6));
vec.push_back(ProfileElement(4, 8, 128, 5));
vec.push_back(ProfileElement(4, 16, 4, 14));
vec.push_back(ProfileElement(4, 16, 8, 14));
vec.push_back(ProfileElement(4, 16, 16, 13));
vec.push_back(ProfileElement(4, 16, 32, 13));
vec.push_back(ProfileElement(4, 16, 64, 13));
vec.push_back(ProfileElement(4, 16, 96, 13));
vec.push_back(ProfileElement(4, 16, 128, 13));
vec.push_back(ProfileElement(4, 32, 4, 37));
vec.push_back(ProfileElement(4, 32, 8, 31));
vec.push_back(ProfileElement(4, 32, 16, 29));
vec.push_back(ProfileElement(4, 32, 32, 21));
vec.push_back(ProfileElement(4, 32, 64, 21));
vec.push_back(ProfileElement(4, 32, 96, 29));
vec.push_back(ProfileElement(4, 32, 128, 21));
vec.push_back(ProfileElement(4, 64, 4, 137));
vec.push_back(ProfileElement(4, 64, 8, 113));
vec.push_back(ProfileElement(4, 64, 16, 89));
vec.push_back(ProfileElement(4, 64, 32, 69));
vec.push_back(ProfileElement(4, 64, 64, 45));
vec.push_back(ProfileElement(4, 64, 96, 37));
vec.push_back(ProfileElement(4, 64, 128, 35));
vec.push_back(ProfileElement(4, 96, 4, 137));
vec.push_back(ProfileElement(4, 96, 8, 113));
vec.push_back(ProfileElement(4, 96, 16, 105));
vec.push_back(ProfileElement(4, 96, 32, 77));
vec.push_back(ProfileElement(4, 96, 64, 53));
vec.push_back(ProfileElement(4, 96, 96, 45));
vec.push_back(ProfileElement(4, 96, 128, 39));
vec.push_back(ProfileElement(4, 128, 4, 137));
vec.push_back(ProfileElement(4, 128, 8, 121));
vec.push_back(ProfileElement(4, 128, 16, 153));
vec.push_back(ProfileElement(4, 128, 32, 97));
vec.push_back(ProfileElement(4, 128, 64, 1000000000));
vec.push_back(ProfileElement(4, 128, 96, 1000000000));
vec.push_back(ProfileElement(4, 128, 128, 1000000000));
vec.push_back(ProfileElement(5, 4, 4, 8));
vec.push_back(ProfileElement(5, 4, 8, 9));
vec.push_back(ProfileElement(5, 4, 16, 5));
vec.push_back(ProfileElement(5, 4, 32, 5));
vec.push_back(ProfileElement(5, 4, 64, 5));
vec.push_back(ProfileElement(5, 4, 96, 5));
vec.push_back(ProfileElement(5, 4, 128, 5));
vec.push_back(ProfileElement(5, 8, 4, 7));
vec.push_back(ProfileElement(5, 8, 8, 6));
vec.push_back(ProfileElement(5, 8, 16, 5));
vec.push_back(ProfileElement(5, 8, 32, 5));
vec.push_back(ProfileElement(5, 8, 64, 5));
vec.push_back(ProfileElement(5, 8, 96, 5));
vec.push_back(ProfileElement(5, 8, 128, 5));
vec.push_back(ProfileElement(5, 16, 4, 21));
vec.push_back(ProfileElement(5, 16, 8, 12));
vec.push_back(ProfileElement(5, 16, 16, 12));
vec.push_back(ProfileElement(5, 16, 32, 11));
vec.push_back(ProfileElement(5, 16, 64, 11));
vec.push_back(ProfileElement(5, 16, 96, 11));
vec.push_back(ProfileElement(5, 16, 128, 11));
vec.push_back(ProfileElement(5, 32, 4, 23));
vec.push_back(ProfileElement(5, 32, 8, 14));
vec.push_back(ProfileElement(5, 32, 16, 14));
vec.push_back(ProfileElement(5, 32, 32, 13));
vec.push_back(ProfileElement(5, 32, 64, 13));
vec.push_back(ProfileElement(5, 32, 96, 13));
vec.push_back(ProfileElement(5, 32, 128, 13));
vec.push_back(ProfileElement(5, 64, 4, 77));
vec.push_back(ProfileElement(5, 64, 8, 39));
vec.push_back(ProfileElement(5, 64, 16, 37));
vec.push_back(ProfileElement(5, 64, 32, 29));
vec.push_back(ProfileElement(5, 64, 64, 29));
vec.push_back(ProfileElement(5, 64, 96, 21));
vec.push_back(ProfileElement(5, 64, 128, 21));
vec.push_back(ProfileElement(5, 96, 4, 113));
vec.push_back(ProfileElement(5, 96, 8, 77));
vec.push_back(ProfileElement(5, 96, 16, 61));
vec.push_back(ProfileElement(5, 96, 32, 39));
vec.push_back(ProfileElement(5, 96, 64, 37));
vec.push_back(ProfileElement(5, 96, 96, 31));
vec.push_back(ProfileElement(5, 96, 128, 29));
vec.push_back(ProfileElement(5, 128, 4, 113));
vec.push_back(ProfileElement(5, 128, 8, 97));
vec.push_back(ProfileElement(5, 128, 16, 69));
vec.push_back(ProfileElement(5, 128, 32, 53));
vec.push_back(ProfileElement(5, 128, 64, 39));
vec.push_back(ProfileElement(5, 128, 96, 31));
vec.push_back(ProfileElement(5, 128, 128, 31));
vec.push_back(ProfileElement(6, 4, 4, 7));
vec.push_back(ProfileElement(6, 4, 8, 3));
vec.push_back(ProfileElement(6, 4, 16, 5));
vec.push_back(ProfileElement(6, 4, 32, 4));
vec.push_back(ProfileElement(6, 4, 64, 5));
vec.push_back(ProfileElement(6, 4, 96, 4));
vec.push_back(ProfileElement(6, 4, 128, 4));
vec.push_back(ProfileElement(6, 8, 4, 11));
vec.push_back(ProfileElement(6, 8, 8, 5));
vec.push_back(ProfileElement(6, 8, 16, 5));
vec.push_back(ProfileElement(6, 8, 32, 5));
vec.push_back(ProfileElement(6, 8, 64, 5));
vec.push_back(ProfileElement(6, 8, 96, 5));
vec.push_back(ProfileElement(6, 8, 128, 5));
vec.push_back(ProfileElement(6, 16, 4, 13));
vec.push_back(ProfileElement(6, 16, 8, 11));
vec.push_back(ProfileElement(6, 16, 16, 11));
vec.push_back(ProfileElement(6, 16, 32, 5));
vec.push_back(ProfileElement(6, 16, 64, 5));
vec.push_back(ProfileElement(6, 16, 96, 5));
vec.push_back(ProfileElement(6, 16, 128, 11));
vec.push_back(ProfileElement(6, 32, 4, 21));
vec.push_back(ProfileElement(6, 32, 8, 14));
vec.push_back(ProfileElement(6, 32, 16, 13));
vec.push_back(ProfileElement(6, 32, 32, 13));
vec.push_back(ProfileElement(6, 32, 64, 13));
vec.push_back(ProfileElement(6, 32, 96, 13));
vec.push_back(ProfileElement(6, 32, 128, 13));
vec.push_back(ProfileElement(6, 64, 4, 39));
vec.push_back(ProfileElement(6, 64, 8, 29));
vec.push_back(ProfileElement(6, 64, 16, 29));
vec.push_back(ProfileElement(6, 64, 32, 21));
vec.push_back(ProfileElement(6, 64, 64, 21));
vec.push_back(ProfileElement(6, 64, 96, 21));
vec.push_back(ProfileElement(6, 64, 128, 21));
vec.push_back(ProfileElement(6, 96, 4, 97));
vec.push_back(ProfileElement(6, 96, 8, 61));
vec.push_back(ProfileElement(6, 96, 16, 39));
vec.push_back(ProfileElement(6, 96, 32, 37));
vec.push_back(ProfileElement(6, 96, 64, 29));
vec.push_back(ProfileElement(6, 96, 96, 29));
vec.push_back(ProfileElement(6, 96, 128, 21));
vec.push_back(ProfileElement(6, 128, 4, 77));
vec.push_back(ProfileElement(6, 128, 8, 61));
vec.push_back(ProfileElement(6, 128, 16, 39));
vec.push_back(ProfileElement(6, 128, 32, 37));
vec.push_back(ProfileElement(6, 128, 64, 29));
vec.push_back(ProfileElement(6, 128, 96, 29));
vec.push_back(ProfileElement(6, 128, 128, 23));
vec.push_back(ProfileElement(7, 4, 4, 5));
vec.push_back(ProfileElement(7, 4, 8, 4));
vec.push_back(ProfileElement(7, 4, 16, 4));
vec.push_back(ProfileElement(7, 4, 32, 4));
vec.push_back(ProfileElement(7, 4, 64, 4));
vec.push_back(ProfileElement(7, 4, 96, 4));
vec.push_back(ProfileElement(7, 4, 128, 3));
vec.push_back(ProfileElement(7, 8, 4, 5));
vec.push_back(ProfileElement(7, 8, 8, 5));
vec.push_back(ProfileElement(7, 8, 16, 5));
vec.push_back(ProfileElement(7, 8, 32, 5));
vec.push_back(ProfileElement(7, 8, 64, 5));
vec.push_back(ProfileElement(7, 8, 96, 5));
vec.push_back(ProfileElement(7, 8, 128, 5));
vec.push_back(ProfileElement(7, 16, 4, 13));
vec.push_back(ProfileElement(7, 16, 8, 11));
vec.push_back(ProfileElement(7, 16, 16, 5));
vec.push_back(ProfileElement(7, 16, 32, 5));
vec.push_back(ProfileElement(7, 16, 64, 5));
vec.push_back(ProfileElement(7, 16, 96, 5));
vec.push_back(ProfileElement(7, 16, 128, 5));
vec.push_back(ProfileElement(7, 32, 4, 21));
vec.push_back(ProfileElement(7, 32, 8, 13));
vec.push_back(ProfileElement(7, 32, 16, 13));
vec.push_back(ProfileElement(7, 32, 32, 13));
vec.push_back(ProfileElement(7, 32, 64, 13));
vec.push_back(ProfileElement(7, 32, 96, 13));
vec.push_back(ProfileElement(7, 32, 128, 12));
vec.push_back(ProfileElement(7, 64, 4, 37));
vec.push_back(ProfileElement(7, 64, 8, 21));
vec.push_back(ProfileElement(7, 64, 16, 14));
vec.push_back(ProfileElement(7, 64, 32, 14));
vec.push_back(ProfileElement(7, 64, 64, 14));
vec.push_back(ProfileElement(7, 64, 96, 13));
vec.push_back(ProfileElement(7, 64, 128, 14));
vec.push_back(ProfileElement(7, 96, 4, 61));
vec.push_back(ProfileElement(7, 96, 8, 39));
vec.push_back(ProfileElement(7, 96, 16, 37));
vec.push_back(ProfileElement(7, 96, 32, 31));
vec.push_back(ProfileElement(7, 96, 64, 21));
vec.push_back(ProfileElement(7, 96, 96, 21));
vec.push_back(ProfileElement(7, 96, 128, 21));
vec.push_back(ProfileElement(7, 128, 4, 61));
vec.push_back(ProfileElement(7, 128, 8, 31));
vec.push_back(ProfileElement(7, 128, 16, 37));
vec.push_back(ProfileElement(7, 128, 32, 11));
vec.push_back(ProfileElement(7, 128, 64, 13));
vec.push_back(ProfileElement(7, 128, 96, 23));
vec.push_back(ProfileElement(7, 128, 128, 21));
return vec;
}
} // namespace x86
} // namespace megdnn
// vim: syntax=cpp.doxygen
/**
* \file dnn/src/x86/profile.h
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*/
#pragma once
#include <vector>
namespace megdnn {
namespace x86 {
struct ProfileElement {
// when output_size > on_threshold, DIRECT is faster,
// otherwise MATRIX_MUL is faster
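// e.g. an entry (f=2, ic=4, oc=4, on_threshold=49) means: for filters
// snapped to 2x2 with 4 input and 4 output channels, MATRIX_MUL wins
// while the output spatial size is below 49.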
int f, ic, oc, on_threshold;
ProfileElement(int f, int ic, int oc, int on_threshold):
f(f), ic(ic), oc(oc), on_threshold(on_threshold)
{
}
bool operator<(const ProfileElement &rhs) const
{
if (this->f < rhs.f) return true;
if (this->f > rhs.f) return false;
if (this->ic < rhs.ic) return true;
if (this->ic > rhs.ic) return false;
if (this->oc < rhs.oc) return true;
if (this->oc > rhs.oc) return false;
return false;
}
};
using ProfileCache = std::vector<ProfileElement>;
ProfileCache get_profile_cache();
} // namespace x86
} // namespace megdnn
// vim: syntax=cpp.doxygen
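// A hedged usage sketch (an addition for this writeup, not in the original
// sources): the cache is ordered by (f, ic, oc) via operator< above, so a
// threshold lookup is a std::lower_bound over the fully populated grid. The
// removed x86 code asserted the hit; the end()/mismatch check here is extra
// safety. (Assumes <algorithm> and this header are included.)
inline bool prefer_matmul_sketch(const megdnn::x86::ProfileCache& cache,
                                 int f, int ic, int oc, int output_size) {
    megdnn::x86::ProfileElement key(f, ic, oc, 0);
    auto it = std::lower_bound(cache.begin(), cache.end(), key);
    if (it == cache.end() || it->f != f || it->ic != ic || it->oc != oc)
        return false;  // shape not in the profiled grid: prefer direct conv
    return output_size < it->on_threshold;
}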
@@ -63,7 +63,6 @@
#include "./sep_conv_filter.h"
#include "src/common/utils.h"
#include "src/x86/utils.h"
#include "src/x86/profile.h"
#include "src/x86/handle.h"
#include <cstring>
......
@@ -14,7 +14,6 @@
#include "src/common/cv/helper.h"
#include "src/common/utils.h"
#include "src/x86/utils.h"
#include "src/x86/profile.h"
#include "src/x86/handle.h"
#include <cstring>
......
@@ -1599,73 +1599,6 @@ TEST_F(X86_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_QINT8_FILTER_PREPROCESS) {
#undef cb
}
TEST_F(X86, CONV_BIAS_MATMUL) {
using namespace conv_bias;
std::vector<TestArg> args;
auto run = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel,
size_t p, NonlineMode nonline_mode) {
if (w + 2 * p < kernel || h + 2 * p < kernel)
return;
param::ConvBias param;
param.stride_h = 1;
param.stride_w = 1;
param.pad_h = p;
param.pad_w = p;
param.nonlineMode = nonline_mode;
//! no bias
param.sparse = param::ConvBias::Sparse::DENSE;
args.emplace_back(param, TensorShape{1, ic, h, w},
TensorShape{oc, ic, kernel, kernel}, TensorShape{});
//! bias channel
args.emplace_back(param, TensorShape{2, ic, h, w},
TensorShape{oc, ic, kernel, kernel},
TensorShape{1, oc, 1, 1});
//! bias
args.emplace_back(param, TensorShape{2, ic, h, w},
TensorShape{oc, ic, kernel, kernel},
TensorShape{2, oc, (h + param.pad_h * 2 - kernel) + 1,
(w + param.pad_w * 2 - kernel) + 1});
//! group
param.sparse = param::ConvBias::Sparse::GROUP;
args.emplace_back(
param, TensorShape{2, 2 * ic, h, w},
TensorShape{2, oc, ic, kernel, kernel},
TensorShape{2, 2 * oc, (h + param.pad_h * 2 - kernel) + 1,
(w + param.pad_w * 2 - kernel) + 1});
};
for (size_t kernel : {2, 3, 5, 7})
for (size_t ic : {1, 2, 3, 4})
for (size_t oc : {1, 2, 3, 4})
for (size_t p : {0, 2})
for (size_t size : {20, 21, 22, 23, 24})
for (NonlineMode nonline_mode :
{NonlineMode::RELU, NonlineMode::SIGMOID,
NonlineMode::H_SWISH, NonlineMode::IDENTITY}) {
run(oc, ic, size, size, kernel, p, nonline_mode);
}
Checker<ConvBias> checker(handle());
checker.set_before_exec_callback(
conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
"X86_CONV_BIAS_MATMUL"));
checker.set_epsilon(1);
UniformIntRNG rng{-50, 50};
checker.set_dtype(0, dtype::Float32())
.set_dtype(1, dtype::Float32())
.set_dtype(2, dtype::Float32())
.set_rng(0, &rng)
.set_rng(1, &rng)
.set_rng(2, &rng);
for (auto&& arg : args) {
checker.set_param(arg.param).exec(
{arg.src, arg.filter, arg.bias, {}, {}});
}
}
#if MEGDNN_WITH_BENCHMARK
#if MEGDNN_X86_WITH_MKL_DNN
static void x86_benchmark_fp32_mkldnn(Handle* handle) {
......
@@ -182,49 +182,6 @@ TEST_F(X86, DEFAULT_CONV_DIRECT_STRIDE2) {
}
}
TEST_F(X86, DEFAULT_CONV_MATMUL) {
using namespace convolution;
std::vector<TestArg> args;
auto run = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel,
size_t p) {
if (w + 2 * p < kernel || h + 2 * p < kernel)
return;
param::Convolution param;
param.stride_h = 1;
param.stride_w = 1;
param.pad_h = p;
param.pad_w = p;
//! no bias
args.emplace_back(param, TensorShape{1, ic, h, w},
TensorShape{oc, ic, kernel, kernel});
};
for (size_t kernel : {2, 3, 5, 7})
for (size_t ic : {1, 2, 3, 4})
for (size_t oc : {1, 2, 3, 4})
for (size_t p : {0, 2})
for (size_t size : {20, 21, 22, 23, 24}) {
run(oc, ic, size, size, kernel, p);
}
Checker<ConvolutionForward> checker(handle());
checker.set_before_exec_callback(AlgoChecker<ConvolutionForward>(
"CONVOLUTION_DEFAULT_X86_CONV_BIAS_MATMUL"));
UniformIntRNG rng{-50, 50};
checker.set_dtype(0, dtype::Float32())
.set_dtype(1, dtype::Float32())
.set_dtype(2, dtype::Float32())
.set_rng(0, &rng)
.set_rng(1, &rng)
.set_rng(2, &rng);
for (auto&& arg : args) {
checker.set_param(arg.param).exec({arg.src, arg.filter, {}});
}
}
#if MEGDNN_X86_WITH_MKL_DNN
TEST_F(X86, CONVOLUTION_FORWARD_INT8) {
Checker<ConvolutionForward> checker(handle());
......