提交 54b5db17 编写于 作者: M Megvii Engine Team

feat(x86/rvv): add AGENT_NCHW_NCHW44 algo

GitOrigin-RevId: 8cf6c3fac004c533c616a0266ea21376edb60f4d
上级 eaa18018
#pragma once
#include "src/common/opr_delegate.h"
#include "src/fallback/conv_bias/opr_impl.h"
#include "src/fallback/matrix_mul/opr_impl.h"
......@@ -249,6 +250,26 @@ public:
MEGDNN_DECL_ALGO_TYPE(GI_COMMON_DIRECT_NCHW_NCHW44_FP32)
};
/*!
 * \brief fp32 direct conv-bias algorithm registered under the name
 * "F32_CONV_AGENT_NCHW_NCHW44".
 *
 * The algorithm is reported as REPRODUCIBLE and is classified as a
 * FLOAT32 / DIRECT algorithm for algorithm-type based lookup.
 */
class ConvBiasImpl::AlgoF32DirectNCHWNCHW44AGENT final : public AlgoBase {
    SmallVector<NCBKern> get_kimpls(const NCBKernSizeParam& param) const;

public:
    AlgoF32DirectNCHWNCHW44AGENT() {}

    //! name used to select this algorithm explicitly
    const char* name() const override { return "F32_CONV_AGENT_NCHW_NCHW44"; }

    //! attributes advertised to the algorithm chooser
    AlgoAttribute attribute() const override { return AlgoAttribute::REPRODUCIBLE; }

    //! data type / category pair of this algorithm
    ConvAlgoTypePack get_algo_type() const override {
        return {AlgoDataType::FLOAT32, AlgoCategory::DIRECT};
    }

    //! whether the algorithm can handle the given problem description
    bool usable(
            const NCBKernSizeParam& param,
            AlgoSelectionStrategy algo_selection_strategy) const override;

    //! workspace size in bytes required for the given problem description
    size_t get_workspace(const NCBKernSizeParam& param) const override;

    //! kernels to be executed, in order, for the given problem description
    SmallVector<NCBKern> dispatch_kerns(const NCBKernSizeParam& param) const override;

    MEGDNN_DECL_ALGO_TYPE(GI_COMMON_DIRECT_NCHW_NCHW44_AGENT_FP32)
};
class ConvBiasImpl::AlgoF32ChannelWiseNCHW44 final : public AlgoBase {
SmallVector<NCBKern> get_kimpls(const NCBKernSizeParam& param) const;
......
#include "megdnn/opr_param_defs.h"
#include "megdnn/oprs.h"
#include "src/common/nchw_nchwxx_valid.h"
#include "src/common/opr_delegate.h"
#include "src/fallback/conv_bias/gi/fp32/algos.h"
#include "src/fallback/elemwise_helper/elemwise_op.h"
#include "midout.h"
using namespace megdnn;
using namespace fallback;
MIDOUT_DECL(megdnn_fallback_conv_bias_fp32_nchw_nchw44_agent)
namespace {
/*!
 * \brief Build a dense, NCHW-format ConvBias param from a kernel size param.
 *
 * Nonlinearity, padding, stride and dilation are copied from \p p; the mode
 * is CONVOLUTION when the filter meta asks for flipping and
 * CROSS_CORRELATION otherwise. Compute mode is always DEFAULT.
 */
param::ConvBias get_param_convbias(const ConvBiasImpl::NCBKernSizeParam& p) {
    const auto& meta = p.filter_meta;
    const param::ConvBias::Mode conv_mode =
            meta.should_flip ? param::ConvBias::Mode::CONVOLUTION
                             : param::ConvBias::Mode::CROSS_CORRELATION;
    return param::ConvBias{
            p.nonlineMode,
            conv_mode,
            param::ConvBias::Sparse::DENSE,
            ConvBias::Param::Format::NCHW,
            meta.padding[0],
            meta.padding[1],
            meta.stride[0],
            meta.stride[1],
            meta.dilation[0],
            meta.dilation[1],
            megdnn::param::ConvBias::ComputeMode::DEFAULT};
}
/*!
 * \brief Compute every tensor layout used by the agent algorithm.
 *
 * The returned array is indexed as follows:
 *   [0] src                {N, IC, IH, IW}
 *   [1] filter (plain)     {OC, IC, FH, FW}
 *   [2] filter (4-packed)  {OC / 4, FH, FW, IC, 4}
 *   [3] bias (plain)       {1, OC, 1, 1}           or empty
 *   [4] bias (4-packed)    {1, OC / 4, 1, 1, 4}    or empty
 *   [5] dst (plain)        {N, OC, OH, OW}
 *   [6] dst (4-packed)     {N, OC / 4, OH, OW, 4}
 *   [7] filter (reshaped)  {OC / 4, 4, IC, FH, FW}
 *   [8] bias (reshaped)    {1, OC / 4, 4, 1, 1}    or empty
 *   [9] dst (reshaped)     {N, OC / 4, 4, OH, OW}
 *
 * The bias layouts are only given a shape when the bias mode is
 * BROADCAST_CHANNEL_BIAS; otherwise they stay empty.
 */
TensorLayoutArray get_layouts(const ConvBiasImpl::NCBKernSizeParam& p) {
    UNPACK_CONV_NCB_KERN_SIZES(p);
    MEGDNN_MARK_USED_VAR(SH);
    MEGDNN_MARK_USED_VAR(SW);
    MEGDNN_MARK_USED_VAR(PH);
    MEGDNN_MARK_USED_VAR(PW);
    MEGDNN_MARK_USED_VAR(OW);
    MEGDNN_MARK_USED_VAR(OH);

    //! input
    TensorLayout src({N, IC, IH, IW}, p.src_type);

    //! filter: 4-packed source, reshaped view target and plain view
    TensorLayout filter_packed({OC / 4, FH, FW, IC, 4}, p.filter_type);
    TensorLayout filter_reshaped({OC / 4, 4, IC, FH, FW}, p.filter_type);
    TensorLayout filter_plain({OC, IC, FH, FW}, p.filter_type);

    //! bias: stays empty unless a per-channel bias is present
    TensorLayout bias_packed{{}, p.bias_type};
    TensorLayout bias_plain{{}, p.bias_type};
    TensorLayout bias_reshaped{{}, p.bias_type};
    const bool has_channel_bias = p.bias_mode == BiasMode::BROADCAST_CHANNEL_BIAS;
    if (has_channel_bias) {
        bias_packed = TensorLayout({1, OC / 4, 1, 1, 4}, p.bias_type);
        bias_reshaped = TensorLayout({1, OC / 4, 4, 1, 1}, p.bias_type);
        bias_plain = TensorLayout({1, OC, 1, 1}, p.bias_type);
    }

    //! output: plain result, reshaped view and 4-packed destination
    TensorLayout dst_plain = TensorLayout({N, OC, OH, OW}, p.dst_type);
    TensorLayout dst_reshaped = TensorLayout({N, OC / 4, 4, OH, OW}, p.dst_type);
    TensorLayout dst_packed = TensorLayout({N, OC / 4, OH, OW, 4}, p.dst_type);

    return {src,        filter_plain,    filter_packed, bias_plain,
            bias_packed, dst_plain,      dst_packed,    filter_reshaped,
            bias_reshaped, dst_reshaped};
}
/*!
 * \brief Describe the workspace needed by the agent algorithm.
 *
 * The bundle has four slots:
 *   0: relayouted (plain) filter
 *   1: relayouted (plain) bias
 *   2: workspace requested by \p conv_bias_op for the plain convolution
 *   3: plain convolution output
 *
 * Note that this overwrites the param of \p conv_bias_op with the param
 * derived from \p param before querying its workspace size. The bundle is
 * returned with a null base pointer.
 */
static WorkspaceBundle get_bundle(
        const ConvBiasImpl::NCBKernSizeParam& param,
        const std::unique_ptr<ConvBias>& conv_bias_op) {
    auto all_layouts = get_layouts(param);
    const TensorLayout& src = all_layouts[0];
    const TensorLayout& filter = all_layouts[1];
    const TensorLayout& bias = all_layouts[3];
    const TensorLayout& dst = all_layouts[5];

    size_t filter_bytes = filter.span().dist_byte();
    size_t bias_bytes = bias.span().dist_byte();

    //! the delegated operator must carry the plain-format param before it
    //! is asked for its workspace size
    conv_bias_op->param() = get_param_convbias(param);
    auto empty_z = TensorLayout();
    auto conv_bytes = conv_bias_op->get_workspace_in_bytes(
            src, filter, bias, empty_z, dst, nullptr);

    auto dst_bytes = dst.span().dist_byte();
    return {nullptr, {filter_bytes, bias_bytes, conv_bytes, dst_bytes}};
}
}; // namespace
namespace {
inline bool is_usable(
const DTypeEnum src_dtype, const DTypeEnum filter_dtype,
const DTypeEnum dst_dtype,
const ConvolutionBase<param::Convolution>::CanonizedFilterMeta& fm,
const BiasMode bias_mode, const param::ConvBias::NonlineMode nonline_mode) {
bool ok_type =
((src_dtype == DTypeEnum::Float32 && filter_dtype == DTypeEnum::Float32 &&
(dst_dtype == DTypeEnum::Float32))) &&
(fm.format == param::Convolution::Format::NCHW44);
bool ok_nonline = nonline_mode == param::ConvBias::NonlineMode::IDENTITY ||
nonline_mode == param::ConvBias::NonlineMode::RELU ||
nonline_mode == param::ConvBias::NonlineMode::SIGMOID ||
nonline_mode == param::ConvBias::NonlineMode::H_SWISH;
bool ok_src_dst =
fm.icpg < 4 && (fm.ocpg % 4 == 0 && fm.ocpg >= 4) && fm.group == 1;
bool ok_filter = fm.spatial_ndim == 2 && fm.spatial[0] == fm.spatial[1] &&
(fm.spatial[0] == 2 || fm.spatial[0] == 3 || fm.spatial[0] == 5 ||
fm.spatial[0] == 7);
bool ok_slide = fm.dilation[0] == 1 && fm.dilation[1] == 1 &&
fm.stride[0] == fm.stride[1] &&
(fm.stride[0] == 1 || fm.stride[1] == 2);
bool ok_conv = !fm.should_flip && bias_mode != BiasMode::BIAS;
bool avaible =
ok_type && ok_nonline && ok_src_dst && ok_filter && ok_slide && ok_conv;
return avaible;
}
}; // namespace
bool ConvBiasImpl::AlgoF32DirectNCHWNCHW44AGENT::usable(
const NCBKernSizeParam& param, AlgoSelectionStrategy) const {
return is_usable(
param.src_type.enumv(), param.filter_type.enumv(), param.dst_type.enumv(),
param.filter_meta, param.bias_mode, param.nonlineMode);
}
//! Total workspace in bytes: the size of the bundle built by get_bundle()
//! with a freshly created ConvBias operator from the handle stored in
//! \p param. Returns 0 if control leaves the midout region without returning.
size_t ConvBiasImpl::AlgoF32DirectNCHWNCHW44AGENT::get_workspace(
        const NCBKernSizeParam& param) const {
    MIDOUT_BEGIN(
            megdnn_fallback_conv_bias_fp32_nchw_nchw44_agent,
            midout_iv("AlgoF32DirectNCHWNCHW44AGENT::get_workspace"_hash)) {
        auto delegate_opr = param.handle->create_operator<ConvBias>();
        auto ws_bundle = get_bundle(param, delegate_opr);
        return ws_bundle.total_size_in_bytes();
    }
    MIDOUT_END();
    return 0;
}
/*!
 * \brief Build the kernel list of the agent algorithm.
 *
 * The returned kernels are, in order:
 *   1. one kernel that relayouts the 4-packed filter (and the bias, when a
 *      bias layout exists) into plain layouts stored in workspace slots 0/1;
 *   2. every kernel of the heuristically chosen plain-format ConvBias
 *      algorithm, wrapped so that it reads filter/bias from slots 0/1, uses
 *      slot 2 as its own workspace and writes its output to slot 3;
 *   3. one kernel that relayouts the plain output in slot 3 into the
 *      4-packed destination tensor.
 *
 * Workspace slots are those defined by get_bundle().
 *
 * \param k_param problem description used to pick the delegated algorithm
 *        and to size the layouts
 * \return the kernels described above; the vector is returned after the
 *         midout region so that every control path yields a value
 */
SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoF32DirectNCHWNCHW44AGENT::
        dispatch_kerns(const NCBKernSizeParam& k_param) const {
    SmallVector<ConvBiasImpl::NCBKern> ret_kerns;
    MIDOUT_BEGIN(
            megdnn_fallback_conv_bias_fp32_nchw_nchw44_agent,
            midout_iv("AlgoF32DirectNCHWNCHW44AGENT::dispatch_kerns"_hash)) {
        //! stage 1: 4-packed filter / bias -> plain layout in the workspace
        auto filter_and_bias_dimshuffle = [](const NCBKernParam& kern_param,
                                             const NCBKernIndex&) {
            auto layouts = get_layouts(kern_param);
            auto filter_layout_44 = layouts[2];
            auto bias_layout44 = layouts[4];
            auto filter_layout_reshape = layouts[7];
            auto bias_layout_reshape = layouts[8];
            auto conv_bias_op = kern_param.handle->create_operator<ConvBias>();
            auto bundle = get_bundle(kern_param, conv_bias_op);
            bundle.set(kern_param.workspace_ptr);
            auto weight_ws = bundle.get(0);
            auto bias_ws = bundle.get(1);
            //! relayout bias and weight
            TensorND chw_weight_t = TensorND(weight_ws, filter_layout_reshape);
            //! view {OC/4, FH, FW, IC, 4} as {OC/4, 4, IC, FH, FW}
            TensorND weight44_t = TensorND(
                    kern_param.filter_ptr.get_ptr(),
                    filter_layout_44.dimshuffle({0, 4, 3, 1, 2}));
            auto relayout_op = inplace_cpu_handle()->create_operator<Relayout>();
            relayout_op->exec(weight44_t, chw_weight_t);
            TensorND chw_bias_t = TensorND(bias_ws, bias_layout_reshape);
            //! the bias layout is empty when there is no per-channel bias
            if (bias_layout44.ndim != 0) {
                //! view {1, OC/4, 1, 1, 4} as {1, OC/4, 4, 1, 1}
                TensorND bias44_t = TensorND(
                        kern_param.bias_ptr.get_ptr(),
                        bias_layout44.dimshuffle({0, 1, 4, 2, 3}));
                relayout_op->exec(bias44_t, chw_bias_t);
            }
        };
        ret_kerns.push_back({filter_and_bias_dimshuffle, {1}});

        //! stage 2: wrap the kernels of the delegated plain-format algorithm
        auto do_agent_conv = [&ret_kerns, &k_param]() {
            auto layouts = get_layouts(k_param);
            auto src_layout = layouts[0];
            auto filter_layout = layouts[1];
            auto bias_layout = layouts[3];
            auto dst_layout = layouts[5];
            //! do chw conv
            auto conv_bias_op = k_param.handle->create_operator<ConvBias>();
            conv_bias_op->param() = get_param_convbias(k_param);
            auto dummy_z = TensorND();
            auto&& conv_bias_algo =
                    static_cast<ConvBiasImpl*>(conv_bias_op.get())
                            ->get_algorithm_heuristic(
                                    src_layout, filter_layout, bias_layout,
                                    dummy_z.layout, dst_layout,
                                    std::numeric_limits<size_t>::max(),
                                    AlgoAttribute::DEFAULT, AlgoAttribute::DEFAULT);
            //! the delegated algorithm sees the same problem in NCHW format
            auto new_param = k_param;
            new_param.filter_meta.format = ConvBias::Param::Format::NCHW;
            auto&& conv_bias_kerns =
                    static_cast<AlgoBase*>(conv_bias_algo)->dispatch_kerns(new_param);
            for (size_t i = 0; i < conv_bias_kerns.size(); i++) {
                auto&& kernel = conv_bias_kerns[i];
                auto run = [kernel](
                                   const NCBKernParam& p,
                                   const NCBKernIndex& ncb_index) {
                    auto conv_bias_op = p.handle->create_operator<ConvBias>();
                    auto bundle = get_bundle(p, conv_bias_op);
                    bundle.set(p.workspace_ptr);
                    auto weight_ws = bundle.get(0);
                    auto bias_ws = bundle.get(1);
                    auto chw_conv_ws = bundle.get(2);
                    auto chw_conv_ws_size = bundle.get_size(2);
                    auto chw_conv_dst_ws = bundle.get(3);
                    //! redirect filter / bias / dst / workspace to the
                    //! corresponding workspace slots
                    auto param = p;
                    param.filter_ptr = weight_ws;
                    param.bias_ptr = bias_ws;
                    param.dst_ptr = chw_conv_dst_ws;
                    param.workspace_ptr = chw_conv_ws;
                    param.workspace_size = chw_conv_ws_size;
                    kernel.kern(param, {ncb_index.thread_id, ncb_index.ndrange_id});
                };
                ret_kerns.push_back({run, kernel.global_size});
            }
        };
        do_agent_conv();

        //! stage 3: plain output in the workspace -> 4-packed destination
        auto dest_dimshuffle = [](const NCBKernParam& kern_param, const NCBKernIndex&) {
            auto layouts = get_layouts(kern_param);
            auto dst_layout44 = layouts[6];
            auto dst_layout_reshape = layouts[9];
            auto conv_bias_op = kern_param.handle->create_operator<ConvBias>();
            auto bundle = get_bundle(kern_param, conv_bias_op);
            bundle.set(kern_param.workspace_ptr);
            auto chw_conv_dst_ws = bundle.get(3);
            //! relayout dst to dst44 tensor
            TensorND chw44_dst_t = TensorND(kern_param.dst_ptr.get_ptr(), dst_layout44);
            auto relayout_op = inplace_cpu_handle()->create_operator<Relayout>();
            //! view {N, OC/4, 4, OH, OW} as {N, OC/4, OH, OW, 4}
            relayout_op->exec(
                    {chw_conv_dst_ws, dst_layout_reshape.dimshuffle({0, 1, 3, 4, 2})},
                    chw44_dst_t);
        };
        ret_kerns.push_back({dest_dimshuffle, {1}});
    }
    MIDOUT_END();
    //! returning here (instead of only inside the midout region) guarantees
    //! that the function never flows off its end without a value
    return ret_kerns;
}
// vim: syntax=cpp.doxygen
/**
* \file
dnn/src/fallback/conv_bias/gi/fp32/f32_direct_nchw_nchw44_algo.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
* or implied.
*/
#include "megdnn/oprs.h"
#include "src/common/nchw_nchwxx_valid.h"
#include "src/common/opr_delegate.h"
......
......@@ -84,7 +84,8 @@ class ConvBiasImpl::AlgoPack : NonCopyableObj {
AlgoBase::Mapper m_all_algos_map;
SmallVector<fallback::ConvBiasImpl::AlgoBase*> m_gi_winograd_algos;
AlgoF32DirectNCHWNCHW44 f32_direct_stride2_nchw_nchw44;
AlgoF32DirectNCHWNCHW44 f32_nchw_nchw44;
AlgoF32DirectNCHWNCHW44AGENT f32_nchw_nchw44_agent;
AlgoF32ChannelWiseNCHW44 f32_chanel_wise_nchw44;
AlgoF32DirectNCHW44 f32_direct_nchw44;
......@@ -94,8 +95,17 @@ class ConvBiasImpl::AlgoPack : NonCopyableObj {
public:
AlgoPack() {
// fallback gi fp32 algo
m_all_algos.emplace_back(&f32_direct_stride2_nchw_nchw44);
//! fallback gi fp32 algo
//! f32_nchw_nchw44_agent is currently faster than f32_nchw_nchw44
//! on x86 and rvv platforms, so we adjust the heuristic order.
#if MEGDNN_AARCH64 || MEGDNN_ARMV7
m_all_algos.emplace_back(&f32_nchw_nchw44);
m_all_algos.emplace_back(&f32_nchw_nchw44_agent);
#else
m_all_algos.emplace_back(&f32_nchw_nchw44_agent);
m_all_algos.emplace_back(&f32_nchw_nchw44);
#endif
m_all_algos.emplace_back(&f32_chanel_wise_nchw44);
m_all_algos.emplace_back(&f32_direct_nchw44);
m_all_algos.emplace_back(&f32_direct_stride1);
......@@ -471,7 +481,8 @@ ConvBiasImpl::NCBKernSizeParam ConvBiasImpl::make_ncb_kern_size_param(
param().compute_mode,
nr_threads,
reinterpret_cast<const ConvolutionForward::PreprocessedFilter*>(
preprocessed_filter)},
preprocessed_filter),
handle()},
bias.dtype,
bias.stride[0],
bias_mode,
......@@ -491,6 +502,7 @@ ConvBiasImpl::NCBKernParam ConvBiasImpl::make_ncb_kern_param(
ret.dst_ptr = dst.get_ref_ptr();
ret.workspace_ptr = workspace.raw_ptr;
ret.workspace_size = workspace.size;
ret.handle = handle();
return ret;
}
......
......@@ -228,6 +228,7 @@ public:
GI_COMMON_DIRECT_STRD2_FP32,
GI_COMMON_DIRECT_NCHW44_FP32,
GI_COMMON_DIRECT_NCHW_NCHW44_FP32,
GI_COMMON_DIRECT_NCHW_NCHW44_AGENT_FP32,
GI_COMMON_CHWNWISE_NCHW44_F32,
#if MEGDNN_X86
......@@ -389,6 +390,7 @@ private:
class AlgoF32DirectStride1;
class AlgoF32DirectStride2;
class AlgoF32DirectNCHWNCHW44;
class AlgoF32DirectNCHWNCHW44AGENT;
class AlgoF32ChannelWiseNCHW44;
class AlgoF32DirectNCHW44;
......
......@@ -242,7 +242,8 @@ ConvolutionImpl::NCBKernSizeParam ConvolutionImpl::make_ncb_kern_size_param(
{dst.stride[0], dst.stride[1], dst.stride[2], dst.stride[3]},
param().compute_mode,
nr_threads,
preprocessed_filter};
preprocessed_filter,
handle()};
}
ConvolutionImpl::NCBKernParam ConvolutionImpl::make_ncb_kern_param(
......
......@@ -101,6 +101,7 @@ public:
const PreprocessedFilter* preprocessed_filter;
//! get the data type category of the param for select the algo
AlgoDataType deduce_algo_data_type() const;
Handle* handle;
};
//! memory param for kernels with non-contiguous batch
......
......@@ -354,6 +354,31 @@ TEST_F(FALLBACK_MULTI_THREADS, CONVBIAS_GI_NCHW_NCHW44_F32_S1) {
handle(), "F32_CONV_NCHW_NCHW44");
}
//! Generate one FALLBACK_MULTI_THREADS test per bias mode for the algorithm
//! named "F32_CONV_AGENT_NCHW_NCHW44". _MODE is the single bias mode passed
//! to get_nchw44_conv_bias_args and _SUFFIX is pasted onto the test name.
//! Arguments use the list {2, 3, 5, 7} together with ONLY_IDENTITY_NLMODE.
//! NOTE(review): the literal 2 is presumably the stride (the test name says
//! S2) and the trailing false/true flags are not explained here -- confirm
//! against the declaration of get_nchw44_conv_bias_args.
#define CB(_MODE, _SUFFIX) \
TEST_F(FALLBACK_MULTI_THREADS, CONVBIAS_GI_NCHW_NCHW44_F32_S2_AGENT_##_SUFFIX) { \
check_conv_bias( \
conv_bias::get_nchw44_conv_bias_args( \
{2, 3, 5, 7}, ONLY_IDENTITY_NLMODE, {_MODE}, 2, false, true), \
handle(), "F32_CONV_AGENT_NCHW_NCHW44"); \
}
//! instantiated for the no-bias and the broadcast-channel-bias cases
CB(megdnn::BiasMode::NO_BIAS, NO_BIAS);
CB(megdnn::BiasMode::BROADCAST_CHANNEL_BIAS, BROADCAST_CHANNEL_BIAS);
#undef CB
//! Generate one FALLBACK_MULTI_THREADS test per nonlinearity for the
//! algorithm named "F32_CONV_AGENT_NCHW_NCHW44". _MODE is the single
//! nonlinearity passed to get_nchw44_conv_bias_args (together with
//! ONLY_BR_BIASMODE) and _SUFFIX is pasted onto the test name.
//! NOTE(review): the test name has a fixed "IDENTITY_" before the suffix, so
//! the generated names read e.g. ..._S1_AGENT_IDENTITY_RELU; the nonlinearity
//! actually exercised is the one given by the suffix.
//! NOTE(review): the literal 1 is presumably the stride (the test name says
//! S1) and the trailing false/true flags are not explained here -- confirm
//! against the declaration of get_nchw44_conv_bias_args.
#define CB(_MODE, _SUFFIX) \
TEST_F(FALLBACK_MULTI_THREADS, \
CONVBIAS_GI_NCHW_NCHW44_F32_S1_AGENT_IDENTITY_##_SUFFIX) { \
check_conv_bias( \
conv_bias::get_nchw44_conv_bias_args( \
{2, 3, 5, 7}, {_MODE}, ONLY_BR_BIASMODE, 1, false, true), \
handle(), "F32_CONV_AGENT_NCHW_NCHW44"); \
}
//! instantiated for the four nonlinearities accepted by the algorithm
CB(param::ConvBias::NonlineMode::IDENTITY, IDENTITY);
CB(param::ConvBias::NonlineMode::RELU, RELU);
CB(param::ConvBias::NonlineMode::H_SWISH, H_SWISH);
CB(param::ConvBias::NonlineMode::SIGMOID, SIGMOID);
#undef CB
std::vector<conv_bias::TestArg> get_nchw44_channel_wise_args(
std::vector<size_t> kernel, size_t stride, bool no_bias, bool no_nonlinemode,
bool no_full_bias) {
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册