提交 2aba0378 编写于 作者: M Megvii Engine Team

refactor(mgb/dnn): fix group conv is_available

GitOrigin-RevId: b2799091689336cfc626315885f3d296fd13e70f
上级 4a92346b
......@@ -74,6 +74,21 @@ std::vector<typename Opr::Algorithm*> get_all_algorithms(
return ret;
}
/*!
 * \brief tell whether at least one algorithm listed in Opr::algo_pack() is
 *      available for the given size arguments
 *
 * \tparam Opr operator implementation type; it must provide a static
 *      algo_pack() whose all_algos member can be iterated and yields
 *      pointer-like handles exposing is_available()
 * \param args size arguments, passed unchanged to every is_available() call
 * \return true once some algorithm accepts \p args (later algorithms are not
 *      queried); false if none accepts it or all_algos is empty
 */
template <class Opr>
bool has_available_algo(const typename Opr::AlgoBase::SizeArgs& args) {
    auto&& candidates = Opr::algo_pack().all_algos;
    bool found = false;
    auto cursor = candidates.begin();
    // stop at the first algorithm that reports itself as available
    while (!found && cursor != candidates.end()) {
        auto algo = *cursor;
        found = algo->is_available(args) ? true : false;
        ++cursor;
    }
    return found;
}
/*!
* \brief a helper function to get an algorithm matching an attribute. If an
* algorithm with the specified attribute is required, and the given algorithm matches that
......
......@@ -454,8 +454,6 @@ public:
return AlgoAttribute::REPRODUCIBLE;
}
static void modify_size_args(SizeArgs& args, TensorLayout& src_pg,
TensorLayout& dst_pg, TensorLayout& bias_pg);
MEGDNN_DECL_ALGO_TYPE(CUDA_GROUP_CONV_GENERAL)
private:
......@@ -578,11 +576,6 @@ public:
const OperatorBase* opr) const override;
private:
void make_inner_layout(const SizeArgs& args, TensorLayout& inner_src_layout,
TensorLayout& inner_weight_layout,
TensorLayout& inner_dst_layout,
TensorLayout& inner_bias_layout,
TensorLayout& inner_z_layout) const;
WorkspaceBundle get_workspace_bundle(void* ptr, const SizeArgs& args) const;
};
......
......@@ -14,6 +14,7 @@
#include "src/cuda/conv_bias/algo.h"
#include "src/cuda/cudnn_wrapper.h"
#include "src/cuda/relayout_format/opr_impl.h"
#include "src/cuda/relayout_format/relayout_format.h"
#include "src/cuda/utils.h"
using namespace megdnn;
......@@ -37,18 +38,21 @@ inline void deduce_reformat_layout(std::unique_ptr<RelayoutFormat>& relayout,
dst_layout = src_layout;
}
}
} // namespace
void ConvBiasForwardImpl::AlgoFallbackNCHWQS8::make_inner_layout(
const SizeArgs& args, TensorLayout& inner_src_layout,
TensorLayout& inner_weight_layout, TensorLayout& inner_dst_layout,
TensorLayout& inner_bias_layout, TensorLayout& inner_z_layout) const {
std::pair<TensorLayoutArray, ConvBiasForwardImpl::Param> sub_opr_config(
const ConvBiasForwardImpl::AlgoBase::SizeArgs& args) {
TensorLayout inner_src_layout;
TensorLayout inner_filter_layout;
TensorLayout inner_bias_layout;
TensorLayout inner_z_layout;
TensorLayout inner_dst_layout;
auto relayout_src = args.handle->create_operator<RelayoutFormat>();
deduce_reformat_layout(relayout_src, *args.src_layout, inner_src_layout,
RelayoutFormat::Param::Mode::NCHW_NCHW4, 0,
args.filter_meta.group);
deduce_reformat_layout(relayout_src, *args.filter_layout,
inner_weight_layout,
inner_filter_layout,
RelayoutFormat::Param::Mode::NCHW_NCHW4_WEIGHT);
bool dst_float = args.dst_layout->dtype.enumv() == DTypeEnum::Float32;
if (dst_float) {
......@@ -67,7 +71,32 @@ void ConvBiasForwardImpl::AlgoFallbackNCHWQS8::make_inner_layout(
RelayoutFormat::Param::Mode::NCHW_NCHW4, 0,
args.filter_meta.group);
}
};
megdnn::param::ConvBias inner_conv_param = args.opr->param();
if (args.dst_layout->dtype.enumv() == DTypeEnum::Float32) {
inner_conv_param.format = megdnn::param::ConvBias::Format::NCHW4_NCHW;
} else {
inner_conv_param.format = megdnn::param::ConvBias::Format::NCHW4;
}
std::pair<TensorLayoutArray, ConvBiasForwardImpl::Param> ret;
ret.first = {inner_src_layout, inner_filter_layout, inner_bias_layout,
inner_z_layout, inner_dst_layout};
ret.second = inner_conv_param;
return ret;
}
/*!
 * \brief build the inner ConvBias operator together with the layouts it
 *      should be queried or executed with
 *
 * The operator is created from args.handle, the execution policy is
 * transferred from args.opr via set_execution_policy(), and its param is
 * taken from sub_opr_config(args).
 *
 * \param args size arguments of the outer operator
 * \return pair of (layout array from sub_opr_config, owning pointer to the
 *      configured inner operator)
 */
std::pair<TensorLayoutArray, std::unique_ptr<ConvBiasForward>> prepare_sub_opr(
        const ConvBiasForwardImpl::AlgoBase::SizeArgs& args) {
    auto inner_opr = args.handle->create_operator<ConvBias>();
    ConvBiasForward* inner_raw = inner_opr.get();
    // the policy is copied before the param is overwritten, as in the
    // original statement order
    set_execution_policy<ConvBiasForward, ConvBiasForward*>(args.opr,
                                                            inner_raw);
    const auto layouts_and_param = sub_opr_config(args);
    inner_opr->param() = layouts_and_param.second;
    return {layouts_and_param.first, std::move(inner_opr)};
}
} // namespace
std::vector<Algorithm::SearchItem>
ConvBiasForwardImpl::AlgoFallbackNCHWQS8::get_subopr_list(
......@@ -75,28 +104,12 @@ ConvBiasForwardImpl::AlgoFallbackNCHWQS8::get_subopr_list(
const ConvBiasForwardImpl* o = static_cast<const ConvBiasForwardImpl*>(opr);
SizeArgs args(const_cast<ConvBiasForwardImpl*>(o), layouts[0], layouts[1],
layouts[2], layouts[3], layouts[4], nullptr);
TensorLayout inner_src_layout;
TensorLayout inner_weight_layout;
TensorLayout inner_dst_layout;
TensorLayout inner_bias_layout;
TensorLayout inner_z_layout;
make_inner_layout(args, inner_src_layout, inner_weight_layout,
inner_dst_layout, inner_bias_layout, inner_z_layout);
Param inner_conv_param = o->param();
if (layouts[4].dtype.enumv() == DTypeEnum::Float32) {
inner_conv_param.format = Param::Format::NCHW4_NCHW;
} else {
inner_conv_param.format = Param::Format::NCHW4;
}
auto&& config = sub_opr_config(args);
std::string param_str;
Algorithm::serialize_write_pod(inner_conv_param, param_str);
return {{Algorithm::OprType::CONVBIAS_FORWARD,
param_str,
{inner_src_layout, inner_weight_layout, inner_bias_layout,
inner_z_layout, inner_dst_layout}}};
Algorithm::serialize_write_pod(config.second, param_str);
return {{Algorithm::OprType::CONVBIAS_FORWARD, param_str, config.first}};
}
bool ConvBiasForwardImpl::AlgoFallbackNCHWQS8::is_available(
......@@ -115,39 +128,46 @@ bool ConvBiasForwardImpl::AlgoFallbackNCHWQS8::is_available(
args.bias_layout->shape[2] == 1 &&
args.bias_layout->shape[3] == 1);
bool is_ok = is_format_ok && is_version_ok && is_dtype_ok && is_bias_ok;
return is_ok;
if (!is_ok) {
return false;
}
auto config = prepare_sub_opr(args);
AlgoBase::SizeArgs sub_args{
static_cast<ConvBiasForwardImpl*>(config.second.get()),
config.first[0],
config.first[1],
config.first[2],
config.first[3],
config.first[4]};
bool is_relayout_ok = true;
if (args.dst_layout->dtype.enumv() != DTypeEnum::Float32) {
is_relayout_ok = relayout_format::RelayoutFormatFast::usable(
config.first[4], *args.dst_layout,
RelayoutFormat::Param::Mode::NCHW4_NCHW);
}
return is_relayout_ok && has_available_algo<ConvBiasForwardImpl>(sub_args);
}
WorkspaceBundle ConvBiasForwardImpl::AlgoFallbackNCHWQS8::get_workspace_bundle(
void* ptr, const SizeArgs& args) const {
TensorLayout inner_src_layout;
TensorLayout inner_weight_layout;
TensorLayout inner_dst_layout;
TensorLayout inner_bias_layout;
TensorLayout inner_z_layout;
make_inner_layout(args, inner_src_layout, inner_weight_layout,
inner_dst_layout, inner_bias_layout, inner_z_layout);
Param inner_conv_param = args.opr->param();
auto config = prepare_sub_opr(args);
size_t ws_dst = 0, ws_bias = 0, ws_z = 0;
if (args.dst_layout->dtype.enumv() == DTypeEnum::Float32) {
inner_conv_param.format = Param::Format::NCHW4_NCHW;
} else {
inner_conv_param.format = Param::Format::NCHW4;
ws_dst = inner_dst_layout.span().dist_byte();
ws_bias = inner_bias_layout.span().dist_byte();
ws_z = inner_z_layout.span().dist_byte();
if (args.dst_layout->dtype.enumv() != DTypeEnum::Float32) {
ws_bias = config.first[2].span().dist_byte();
ws_z = config.first[3].span().dist_byte();
ws_dst = config.first[4].span().dist_byte();
}
auto opr = args.handle->create_operator<ConvBiasForward>();
opr->param() = inner_conv_param;
set_execution_policy<ConvBiasForward, ConvBiasForward*>(args.opr,
opr.get());
return WorkspaceBundle(
ptr,
{inner_src_layout.span().dist_byte(),
inner_weight_layout.span().dist_byte(), ws_dst, ws_bias, ws_z,
opr->get_workspace_in_bytes(inner_src_layout, inner_weight_layout,
inner_bias_layout, inner_z_layout,
inner_dst_layout, nullptr)});
size_t inner_ws = config.second->get_workspace_in_bytes(
config.first[0], config.first[1], config.first[2], config.first[3],
config.first[4], nullptr);
return WorkspaceBundle(ptr, {config.first[0].span().dist_byte(),
config.first[1].span().dist_byte(), ws_bias,
ws_z, ws_dst, inner_ws});
}
size_t ConvBiasForwardImpl::AlgoFallbackNCHWQS8::get_workspace_in_bytes(
......@@ -177,44 +197,33 @@ void ConvBiasForwardImpl::AlgoFallbackNCHWQS8::exec(
relayout_nchw4_nchw->param() = nchw4_nchw_trans;
auto bundle = get_workspace_bundle(args.workspace.raw_ptr, args);
TensorLayout inner_src_layout;
TensorLayout inner_weight_layout;
TensorLayout inner_dst_layout;
TensorLayout inner_bias_layout;
TensorLayout inner_z_layout;
make_inner_layout(args, inner_src_layout, inner_weight_layout,
inner_dst_layout, inner_bias_layout, inner_z_layout);
TensorND inner_src(bundle.get(0), inner_src_layout);
TensorND inner_weight(bundle.get(1), inner_weight_layout);
TensorND inner_dst(bundle.get(2), inner_dst_layout);
TensorND inner_bias(bundle.get(3), inner_bias_layout);
TensorND inner_z(bundle.get(4), inner_z_layout);
bool dst_float = args.dst_layout->dtype.enumv() == DTypeEnum::Float32;
auto config = prepare_sub_opr(args);
TensorND inner_src(bundle.get(0), config.first[0]);
TensorND inner_weight(bundle.get(1), config.first[1]);
TensorND inner_bias(bundle.get(2), config.first[2]);
TensorND inner_z(bundle.get(3), config.first[3]);
TensorND inner_dst(bundle.get(4), config.first[4]);
Param inner_conv_param = args.opr->param();
inner_conv_param.format =
dst_float ? Param::Format::NCHW4_NCHW : Param::Format::NCHW4;
auto inner_opr = args.handle->create_operator<ConvBiasForward>();
inner_opr->param() = inner_conv_param;
set_execution_policy<ConvBiasForward, ConvBiasForward*>(args.opr,
inner_opr.get());
bool dst_float = args.dst_layout->dtype.enumv() == DTypeEnum::Float32;
relayout_nchw_nchw4->exec(*args.src_tensor, inner_src, {});
relayout_weight->exec(*args.filter_tensor, inner_weight, {});
if (dst_float) {
inner_opr->exec(inner_src, inner_weight, *args.bias_tensor,
*args.z_tensor, *args.dst_tensor, nullptr,
config.second->exec(
inner_src, inner_weight, *args.bias_tensor, *args.z_tensor,
*args.dst_tensor, nullptr,
Workspace((dt_byte*)bundle.get(5), bundle.get_size(5)));
} else {
if (inner_bias_layout.ndim > 0) {
if (inner_bias.layout.ndim > 0) {
relayout_nchw_nchw4->exec(*args.bias_tensor, inner_bias, {});
}
if (inner_z_layout.ndim > 0) {
if (inner_z.layout.ndim > 0) {
relayout_nchw_nchw4->exec(*args.z_tensor, inner_z, {});
}
inner_opr->exec(inner_src, inner_weight, inner_bias, inner_z, inner_dst,
config.second->exec(
inner_src, inner_weight, inner_bias, inner_z, inner_dst,
nullptr,
Workspace((dt_byte*)bundle.get(5), bundle.get_size(5)));
relayout_nchw4_nchw->exec(inner_dst, *args.dst_tensor, {});
......
......@@ -21,20 +21,7 @@ namespace {
std::pair<TensorLayoutArray, ConvBiasForwardImpl::Param> sub_opr_config(
const ConvBiasForwardImpl::AlgoBase::SizeArgs& args) {
TensorLayout src_pg = *args.src_layout;
SmallVector<size_t> flt_shape(0);
std::vector<ptrdiff_t> flt_stride(0);
size_t idx = 0;
// check if the first dim is group
if (args.filter_layout->ndim > args.src_layout->ndim)
++idx;
for (; idx < args.filter_layout->ndim; ++idx) {
flt_shape.push_back(args.filter_layout->shape[idx]);
flt_stride.push_back(args.filter_layout->stride[idx]);
}
TensorLayout filter_pg(flt_shape, flt_stride,
args.filter_layout->dtype,
args.filter_layout->format);
TensorLayout filter_pg = *args.filter_layout;
TensorLayout bias_pg = *args.bias_layout;
TensorLayout z_pg = *args.z_layout;
TensorLayout dst_pg = *args.dst_layout;
......@@ -50,6 +37,8 @@ std::pair<TensorLayoutArray, ConvBiasForwardImpl::Param> sub_opr_config(
"invalid conv format");
c_pos = 3;
}
filter_pg.remove_axis_inplace(0);
src_pg.shape[c_pos] /= nr_grp;
bias_pg.ndim = 0;
dst_pg.shape[c_pos] /= nr_grp;
......@@ -107,10 +96,27 @@ bool ConvBiasForwardImpl::AlgoGroupConvGeneral::is_available(
param.format == param::ConvBias::Format::NCHW32)
return false;
auto config = prepare_sub_opr(args);
return get_algorithm(static_cast<ConvBiasForwardImpl*>(config.second.get()),
config.first[0], config.first[1], config.first[2],
config.first[3], config.first[4]);
auto dst_layout = *args.dst_layout;
if (dst_layout.dtype.enumv() != args.bias_layout->dtype.enumv()) {
dst_layout.dtype = DType();
args.opr->check_or_deduce_dtype_fwd(args.src_layout->dtype,
args.filter_layout->dtype,
dst_layout.dtype);
}
auto conv_args = args;
conv_args.dst_layout = &dst_layout;
auto config = prepare_sub_opr(conv_args);
AlgoBase::SizeArgs sub_args{
static_cast<ConvBiasForwardImpl*>(config.second.get()),
config.first[0],
config.first[1],
config.first[2],
config.first[3],
config.first[4]};
bool ret = has_available_algo<ConvBiasForwardImpl>(sub_args);
return ret;
}
WorkspaceBundle ConvBiasForwardImpl::AlgoGroupConvGeneral::get_workspace_bundle(
......@@ -125,7 +131,9 @@ WorkspaceBundle ConvBiasForwardImpl::AlgoGroupConvGeneral::get_workspace_bundle(
sizes.push_back(dst_layout.span().dist_byte());
}
auto config = prepare_sub_opr(args);
auto conv_args = args;
conv_args.dst_layout = &dst_layout;
auto config = prepare_sub_opr(conv_args);
size_t mm_ws = config.second->get_workspace_in_bytes(
config.first[0], config.first[1], config.first[2],
config.first[3], config.first[4], nullptr);
......
......@@ -197,11 +197,10 @@ ConvBiasForward::Algorithm* ConvBiasForwardImpl::get_algorithm_heuristic(
return algo;
}
if (args.filter_meta.group > 1) {
if (auto algo = megdnn::get_algo_match_attribute<ConvBiasForwardImpl>(
&sm_algo_pack.group, positive_attr, negative_attr)){
return algo;
}
if (args.filter_meta.group > 1 &&
sm_algo_pack.group.is_available_attribute(
args, positive_attr, negative_attr, workspace_limit_in_bytes)) {
return &sm_algo_pack.group;
}
if (sm_algo_pack.fallback_nchw_qs8.is_available_attribute(
......
......@@ -19,21 +19,11 @@ using namespace convolution;
namespace {
std::pair<TensorLayoutArray, Convolution::Param> sub_opr_config(
const ConvolutionBackwardDataImpl::AlgoBase::SizeArgs& args) {
SmallVector<size_t> flt_shape(0);
std::vector<ptrdiff_t> flt_stride(0);
size_t idx = 0;
// check if the first dim is group
if (args.filter_layout->ndim > args.diff_layout->ndim)
++idx;
for (; idx < args.filter_layout->ndim; ++idx) {
flt_shape.push_back(args.filter_layout->shape[idx]);
flt_stride.push_back(args.filter_layout->stride[idx]);
}
TensorLayout filter_pg(flt_shape, flt_stride, args.filter_layout->dtype,
args.filter_layout->format);
TensorLayout filter_pg = *args.filter_layout;
TensorLayout diff_pg = *args.diff_layout;
TensorLayout grad_pg = *args.grad_layout;
filter_pg.remove_axis_inplace(0);
auto nr_grp = args.filter_meta.group;
size_t c_pos = 1;
diff_pg.shape[c_pos] /= nr_grp;
......@@ -92,9 +82,11 @@ bool ConvolutionBackwardDataImpl::AlgoGroupConvGeneral::is_available(
}
auto config = prepare_sub_opr(args);
return get_algorithm(
AlgoBase::SizeArgs sub_args{
static_cast<ConvolutionBackwardDataImpl*>(config.second.get()),
config.first[0], config.first[1], config.first[2]);
config.first[0], config.first[1], config.first[2]};
return has_available_algo<ConvolutionBackwardDataImpl>(sub_args);
}
WorkspaceBundle
......
......@@ -18,21 +18,11 @@ using namespace convolution;
namespace {
std::pair<TensorLayoutArray, Convolution::Param> sub_opr_config(
const ConvolutionBackwardFilterImpl::AlgoBase::SizeArgs& args) {
SmallVector<size_t> flt_shape(0);
std::vector<ptrdiff_t> flt_stride(0);
size_t idx = 0;
// check if the first dim is group
if (args.grad_layout->ndim > args.diff_layout->ndim)
++idx;
for (; idx < args.grad_layout->ndim; ++idx) {
flt_shape.push_back(args.grad_layout->shape[idx]);
flt_stride.push_back(args.grad_layout->stride[idx]);
}
TensorLayout filter_pg(flt_shape, flt_stride, args.grad_layout->dtype,
args.grad_layout->format);
TensorLayout filter_pg = *args.grad_layout;
TensorLayout src_pg = *args.src_layout;
TensorLayout diff_pg = *args.diff_layout;
filter_pg.remove_axis_inplace(0);
auto nr_grp = args.grad_filter_meta.group;
size_t c_pos = 1;
src_pg.shape[c_pos] /= nr_grp;
......@@ -88,9 +78,11 @@ bool ConvolutionBackwardFilterImpl::AlgoGroupConvGeneral::is_available(
}
auto config = prepare_sub_opr(args);
return get_algorithm(
AlgoBase::SizeArgs sub_args{
static_cast<ConvolutionBackwardFilterImpl*>(config.second.get()),
config.first[0], config.first[1], config.first[2]);
config.first[0], config.first[1], config.first[2]};
return has_available_algo<ConvolutionBackwardFilterImpl>(sub_args);
}
WorkspaceBundle
......
......@@ -173,12 +173,10 @@ ConvolutionBackwardDataImpl::get_algorithm_heuristic(
return algo;
}
if (args.filter_meta.group > 1) {
if (auto algo = megdnn::get_algo_match_attribute<
ConvolutionBackwardDataImpl>(
&sm_algo_pack.group, positive_attr, negative_attr)) {
return algo;
}
if (args.filter_meta.group > 1 &&
sm_algo_pack.group.is_available_attribute(
args, positive_attr, negative_attr, workspace_limit_in_bytes)) {
return &sm_algo_pack.group;
}
if (args.filter_layout->dtype.enumv() !=
......@@ -302,12 +300,10 @@ ConvolutionBackwardFilterImpl::get_algorithm_heuristic(
return algo;
}
if (args.grad_filter_meta.group > 1) {
if (auto algo = megdnn::get_algo_match_attribute<
ConvolutionBackwardFilterImpl>(
&sm_algo_pack.group, positive_attr, negative_attr)) {
return algo;
}
if (args.grad_filter_meta.group > 1 &&
sm_algo_pack.group.is_available_attribute(
args, positive_attr, negative_attr, workspace_limit_in_bytes)) {
return &sm_algo_pack.group;
}
if (args.src_layout->dtype.enumv() != DTypeTrait<dtype::BFloat16>::enumv) {
......
......@@ -18,22 +18,11 @@ using namespace convolution3d;
namespace {
std::pair<TensorLayoutArray, Convolution3DBackwardDataImpl::Param>
sub_opr_config(const Convolution3DBackwardDataImpl::AlgoBase::SizeArgs& args) {
SmallVector<size_t> flt_shape(0);
std::vector<ptrdiff_t> flt_stride(0);
size_t idx = 0;
// check if the first dim is group
if (args.filter_layout->ndim > args.grad_layout->ndim)
++idx;
for (; idx < args.filter_layout->ndim; ++idx) {
flt_shape.push_back(args.filter_layout->shape[idx]);
flt_stride.push_back(args.filter_layout->stride[idx]);
}
TensorLayout filter_pg(flt_shape, flt_stride,
args.filter_layout->dtype,
args.filter_layout->format);
TensorLayout filter_pg = *args.filter_layout;
TensorLayout diff_pg = *args.diff_layout;
TensorLayout grad_pg = *args.grad_layout;
filter_pg.remove_axis_inplace(0);
auto nr_grp = args.filter_meta.group;
size_t c_pos = 1;
diff_pg.shape[c_pos] /= nr_grp;
......@@ -84,9 +73,11 @@ bool Convolution3DBackwardDataImpl::AlgoGroupConvGeneral::is_available(
}
auto config = prepare_sub_opr(args);
return get_algorithm(
AlgoBase::SizeArgs sub_args{
static_cast<Convolution3DBackwardDataImpl*>(config.second.get()),
config.first[0], config.first[1], config.first[2]);
config.first[0], config.first[1], config.first[2]};
return has_available_algo<Convolution3DBackwardDataImpl>(sub_args);
}
WorkspaceBundle
......
......@@ -19,21 +19,12 @@ namespace {
std::pair<TensorLayoutArray, Convolution3DBackwardFilterImpl::Param>
sub_opr_config(
const Convolution3DBackwardFilterImpl::AlgoBase::SizeArgs& args) {
SmallVector<size_t> flt_shape(0);
std::vector<ptrdiff_t> flt_stride(0);
size_t idx = 0;
// check if the first dim is group
if (args.grad_layout->ndim > args.src_layout->ndim)
++idx;
for (; idx < args.grad_layout->ndim; ++idx) {
flt_shape.push_back(args.grad_layout->shape[idx]);
flt_stride.push_back(args.grad_layout->stride[idx]);
}
TensorLayout grad_pg(flt_shape, flt_stride, args.grad_layout->dtype,
args.grad_layout->format);
TensorLayout grad_pg = *args.grad_layout;
TensorLayout src_pg = *args.src_layout;
TensorLayout diff_pg = *args.diff_layout;
grad_pg.remove_axis_inplace(0);
auto nr_grp = args.grad_filter_meta.group;
size_t c_pos = 1;
src_pg.shape[c_pos] /= nr_grp;
......@@ -86,9 +77,11 @@ bool Convolution3DBackwardFilterImpl::AlgoGroupConvGeneral::is_available(
}
auto config = prepare_sub_opr(args);
return get_algorithm(
AlgoBase::SizeArgs sub_args{
static_cast<Convolution3DBackwardFilterImpl*>(config.second.get()),
config.first[0], config.first[1], config.first[2]);
config.first[0], config.first[1], config.first[2]};
return has_available_algo<Convolution3DBackwardFilterImpl>(sub_args);
}
WorkspaceBundle
......
......@@ -19,20 +19,7 @@ namespace {
std::pair<TensorLayoutArray, Convolution3DForwardImpl::Param> sub_opr_config(
const Convolution3DForwardImpl::AlgoBase::SizeArgs& args) {
TensorLayout src_pg = *args.src_layout;
SmallVector<size_t> flt_shape(0);
std::vector<ptrdiff_t> flt_stride(0);
size_t idx = 0;
// check if the first dim is group
if (args.filter_layout->ndim > args.src_layout->ndim)
++idx;
for (; idx < args.filter_layout->ndim; ++idx) {
flt_shape.push_back(args.filter_layout->shape[idx]);
flt_stride.push_back(args.filter_layout->stride[idx]);
}
TensorLayout filter_pg(flt_shape, flt_stride,
args.filter_layout->dtype,
args.filter_layout->format);
TensorLayout filter_pg = *args.filter_layout;
TensorLayout dst_pg = *args.dst_layout;
auto nr_grp = args.filter_meta.group;
......@@ -45,6 +32,7 @@ std::pair<TensorLayoutArray, Convolution3DForwardImpl::Param> sub_opr_config(
"invalid conv format");
c_pos = 4;
}
filter_pg.remove_axis_inplace(0);
src_pg.shape[c_pos] /= nr_grp;
dst_pg.shape[c_pos] /= nr_grp;
......@@ -92,9 +80,11 @@ bool Convolution3DForwardImpl::AlgoGroupConvGeneral::is_available(
}
auto config = prepare_sub_opr(args);
return get_algorithm(
AlgoBase::SizeArgs sub_args{
static_cast<Convolution3DForwardImpl*>(config.second.get()),
config.first[0], config.first[1], config.first[2]);
config.first[0], config.first[1], config.first[2]};
return has_available_algo<Convolution3DForwardImpl>(sub_args);
}
WorkspaceBundle
......
......@@ -89,13 +89,10 @@ Convolution3DForwardImpl::get_algorithm_heuristic(
return algo;
}
if (args.filter_meta.group > 1) {
if (auto algo =
megdnn::get_algo_match_attribute<Convolution3DForwardImpl>(
&sm_algo_pack.group, positive_attr,
negative_attr)) {
return algo;
}
if (args.filter_meta.group > 1 &&
sm_algo_pack.group.is_available_attribute(
args, positive_attr, negative_attr, workspace_limit_in_bytes)) {
return &sm_algo_pack.group;
}
return megdnn::get_algo_match_attribute<Convolution3DForwardImpl>(
......@@ -189,12 +186,10 @@ Convolution3DBackwardDataImpl::get_algorithm_heuristic(
return algo;
}
if (args.filter_meta.group > 1) {
if (auto algo = megdnn::get_algo_match_attribute<
Convolution3DBackwardDataImpl>(
&sm_algo_pack.group, positive_attr, negative_attr)) {
return algo;
}
if (args.filter_meta.group > 1 &&
sm_algo_pack.group.is_available_attribute(
args, positive_attr, negative_attr, workspace_limit_in_bytes)) {
return &sm_algo_pack.group;
}
return megdnn::get_algo_match_attribute<Convolution3DBackwardDataImpl>(
......@@ -272,12 +267,10 @@ Convolution3DBackwardFilterImpl::get_algorithm_heuristic(
return algo;
}
if (args.grad_filter_meta.group > 1) {
if (auto algo = megdnn::get_algo_match_attribute<
Convolution3DBackwardFilterImpl>(
&sm_algo_pack.group, positive_attr, negative_attr)) {
return algo;
}
if (args.grad_filter_meta.group > 1 &&
sm_algo_pack.group.is_available_attribute(
args, positive_attr, negative_attr, workspace_limit_in_bytes)) {
return &sm_algo_pack.group;
}
return megdnn::get_algo_match_attribute<Convolution3DBackwardFilterImpl>(
......
......@@ -467,7 +467,7 @@ CudnnAlgoPack::conv_bwd_data_algos() {
DEF_ALGO(CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT, true, true),
DEF_ALGO(CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING, true, true),
#if CUDNN_MAJOR >= 5
DEF_ALGO(CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD, true, false),
DEF_ALGO(CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD, true, true),
#if CUDNN_MAJOR >= 6 || CUDNN_MINOR >= 1
DEF_ALGO(CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED, true, false),
#endif
......
......@@ -94,7 +94,7 @@ void RelayoutFormatImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_out dst,
param().mode == Param::Mode::NCHW_NCHW4_WEIGHT;
if (is_trans_4bits || is_nchw_nchw4) {
bool is_usable = relayout_format::RelayoutFormatFast::usable(
src.layout, dst.layout);
src.layout, dst.layout, param().mode);
megdnn_assert(is_usable,
"RelayoutFormatFast kernel is not usable for "
"transforming %s(%s) to %s(%s).",
......
......@@ -12,6 +12,9 @@
#include "src/cuda/relayout_format/relayout_format.cuh"
#include "src/cuda/relayout_format/relayout_format.h"
#include "src/common/utils.h"
#include "megdnn/dtype.h"
using namespace megdnn;
using namespace cuda;
......@@ -35,8 +38,38 @@ inline void get_scale_zeropoint(const DType& tensor_dtype, float& scale,
} // namespace
bool relayout_format::RelayoutFormatFast::usable(
const TensorLayout& src_layout, const TensorLayout& dst_layout) {
return relayout_format_cuda_usable(src_layout, dst_layout);
const TensorLayout& src_layout, const TensorLayout& dst_layout,
const RelayoutFormat::Param::Mode& mode) {
bool is_all_continue =
src_layout.is_contiguous() && dst_layout.is_contiguous();
bool is_all_int32 =
(src_layout.dtype.enumv() == DTypeEnum::QuantizedS32 &&
dst_layout.dtype.enumv() == DTypeEnum::QuantizedS32);
bool is_all_int8 =
(src_layout.dtype.enumv() == DTypeEnum::Uint8 &&
dst_layout.dtype.enumv() == DTypeEnum::QuantizedS8) ||
(src_layout.dtype.enumv() == DTypeEnum::Quantized8Asymm &&
dst_layout.dtype.enumv() == DTypeEnum::QuantizedS8) ||
(src_layout.dtype.enumv() == DTypeEnum::Quantized8Asymm &&
dst_layout.dtype.enumv() == DTypeEnum::Quantized8Asymm) ||
(src_layout.dtype.enumv() == DTypeEnum::QuantizedS8 &&
dst_layout.dtype.enumv() == DTypeEnum::QuantizedS8);
bool is_all_int4 =
(src_layout.dtype.enumv() == DTypeEnum::QuantizedS4 &&
dst_layout.dtype.enumv() == DTypeEnum::QuantizedS4) ||
(src_layout.dtype.enumv() == DTypeEnum::Quantized4Asymm &&
dst_layout.dtype.enumv() == DTypeEnum::Quantized4Asymm);
bool is_nchw4_nchw_ok = true;
if (mode == RelayoutFormat::Param::Mode::NCHW4_NCHW) {
is_nchw4_nchw_ok =
(src_layout.dtype.enumv() ==
DTypeEnum::Quantized8Asymm ||
src_layout.dtype.enumv() == DTypeEnum::QuantizedS8) &&
src_layout.dtype == dst_layout.dtype;
}
return is_all_continue && (is_all_int32 || is_all_int8 || is_all_int4) &&
is_nchw4_nchw_ok;
}
void relayout_format::RelayoutFormatFast::exec(const TensorND& src,
......
......@@ -461,28 +461,6 @@ void relayout_format::relayout_format_cuda_nchw_nchwx(
}
}
bool relayout_format::relayout_format_cuda_usable(
const TensorLayout& src_layout, const TensorLayout& dst_layout) {
bool is_all_continue =
src_layout.is_contiguous() && dst_layout.is_contiguous();
bool is_all_int32 =
(src_layout.dtype.enumv().ev == DTypeEnum::Ev::QuantizedS32 &&
dst_layout.dtype.enumv().ev == DTypeEnum::Ev::QuantizedS32);
bool is_all_int8 =
(src_layout.dtype.enumv().ev == DTypeEnum::Ev::Uint8 &&
dst_layout.dtype.enumv().ev == DTypeEnum::Ev::QuantizedS8) ||
(src_layout.dtype.enumv().ev == DTypeEnum::Ev::Quantized8Asymm &&
dst_layout.dtype.enumv().ev == DTypeEnum::Ev::QuantizedS8) ||
(src_layout.dtype.enumv().ev == DTypeEnum::Ev::QuantizedS8 &&
dst_layout.dtype.enumv().ev == DTypeEnum::Ev::QuantizedS8);
bool is_all_int4 =
(src_layout.dtype.enumv().ev == DTypeEnum::Ev::QuantizedS4 &&
dst_layout.dtype.enumv().ev == DTypeEnum::Ev::QuantizedS4) ||
(src_layout.dtype.enumv().ev == DTypeEnum::Ev::Quantized4Asymm &&
dst_layout.dtype.enumv().ev == DTypeEnum::Ev::Quantized4Asymm);
return is_all_continue && (is_all_int32 || is_all_int8 || is_all_int4);
}
void relayout_format::relayout_format_cuda_nchwx_nchw(
const TensorND& src, const TensorND& dst, const cudaStream_t& stream,
const float src_scale, const float dst_scale,
......
......@@ -25,9 +25,6 @@ void relayout_format_cuda_nchw_nchwx(
const uint8_t src_zero_point = 0, const uint8_t dst_zero_point = 0,
const int group = 1);
bool relayout_format_cuda_usable(const TensorLayout& src_layout,
const TensorLayout& dst_layout);
void relayout_format_cuda_nchw4_nchw(const TensorND& src, const TensorND& dst,
const cudaStream_t& stream,
const int group);
......
......@@ -22,7 +22,9 @@ namespace relayout_format {
struct RelayoutFormatFast {
static bool usable(const TensorLayout& src_layout,
const TensorLayout& dst_layout);
const TensorLayout& dst_layout,
const RelayoutFormat::Param::Mode& mode =
RelayoutFormat::Param::Mode::NCHW_NCHW4);
static void exec(const TensorND& src, const TensorND& dst,
cudaStream_t stream, RelayoutFormat::Param::Mode mode,
int group);
......
......@@ -164,9 +164,9 @@ public:
}
std::vector<Algorithm::Info::Desc> ret;
megdnn_assert(layouts.size() == OprTrait<Opr>::arity);
for (auto algo_info :
AlgoProxy<Opr, OprTrait<Opr>::arity>::get_all_algorithms_info(
opr, layouts)) {
auto vec = AlgoProxy<Opr, OprTrait<Opr>::arity>::get_all_algorithms_info(
opr, layouts);
for (auto algo_info : vec) {
if (!(algo_info.attribute &
AlgoAttribute::ACCURACY_DEPEND_ON_BATCH) &&
(algo_info.attribute & AlgoAttribute::REPRODUCIBLE) &&
......
......@@ -40,16 +40,8 @@ TEST_F(CUDA, SHAKE_CONV_BIAS_FORWARD) {
{64, 64, 30, 30},
{}});
ConvBias::Param param;
// group
param.sparse = ConvBias::Param::Sparse::GROUP;
checker.set_param(param);
checker.exec({{64, 16, 32, 32}, {2, 32, 8, 3, 3}, {}, {}, {}});
checker.exec({{64, 16, 32, 32}, {2, 32, 8, 3, 3}, {1, 64, 1, 1}, {}, {}});
checker.exec({{64, 16, 32, 32},
{2, 32, 8, 3, 3},
{1, 64, 1, 1},
{64, 64, 30, 30},
{}});
// FIXME: currently group conv cannot get the attributes of its sub-operator,
// so we just ignore group conv here.
}
TEST_F(CUDA, SHAKE_CONV_BIAS_FORWARD_QS8_NCHW) {
......@@ -248,15 +240,10 @@ TEST_F(CUDA, SHAKE_CONVOLUTION_BACKWARD_DATA) {
.set_dtype(1, dtype::Float32())
.set_rng(0, &default_rng)
.set_rng(1, &default_rng);
// ConvolutionBackwardData
checker.exec({{8, 16, 3, 3}, {64, 8, 5, 5}, {64, 16, 7, 7}});
// group
ConvolutionBackwardData::Param param;
param.sparse = Convolution::Param::Sparse::GROUP;
checker.set_param(param);
checker.exec({{2, 16, 32, 3, 3}, {2, 32, 5, 5}, {2, 64, 7, 7}});
checker.exec({{2, 8, 32, 3, 3}, {64, 16, 19, 19}, {64, 64, 21, 21}});
// FIXME: currently group conv cannot get the attributes of its sub-operator,
// so we just ignore group conv here.
}
TEST_F(CUDA, SHAKE_CONVOLUTION_BACKWARD_FILTER) {
......@@ -266,14 +253,10 @@ TEST_F(CUDA, SHAKE_CONVOLUTION_BACKWARD_FILTER) {
.set_dtype(1, dtype::Float32())
.set_rng(0, &default_rng)
.set_rng(1, &default_rng);
// ConvolutionBackwardFilter
checker.exec({{2, 64, 7, 7}, {2, 32, 5, 5}, {32, 64, 3, 3}});
// group
ConvolutionBackwardFilter::Param param;
param.sparse = Convolution::Param::Sparse::GROUP;
checker.set_param(param);
checker.exec({{2, 64, 7, 7}, {2, 32, 5, 5}, {2, 16, 32, 3, 3}});
// FIXME: currently group conv cannot get the attributes of its sub-operator,
// so we just ignore group conv here.
}
} // namespace test
......
......@@ -226,11 +226,11 @@ TEST_F(CUDA, CONV_BIAS_NCHW_QS8) {
ConvBias::Param param;
param.format = ConvBias::Param::Format::NCHW;
checker.set_dtype(0, dtype::QuantizedS8(2.5f))
.set_dtype(1, dtype::QuantizedS8(2.5f))
.set_dtype(2, dtype::QuantizedS32(6.25f))
.set_dtype(3, dtype::QuantizedS8(0.25f))
.set_dtype(4, dtype::QuantizedS8(0.25f))
checker.set_dtype(0, dtype::QuantizedS8(1.f))
.set_dtype(1, dtype::QuantizedS8(1.f))
.set_dtype(2, dtype::QuantizedS32(1.f))
.set_dtype(3, dtype::QuantizedS8(1.f))
.set_dtype(4, dtype::QuantizedS8(1.f))
.set_rng(0, &int_rng)
.set_rng(1, &int_rng)
.set_rng(2, &int_rng)
......
......@@ -37,6 +37,7 @@ TEST_F(CUDA, RELAYOUT_FORMAT) {
TEST_F(CUDA, RELAYOUT_FORMAT_NCHW4_NCHW) {
Checker<RelayoutFormat> checker(handle_cuda());
UniformIntRNG rng{-50, 50};
UniformIntRNG u8_rng{0, 255};
param::RelayoutFormat param;
param.mode = param::RelayoutFormat::Mode::NCHW4_NCHW;
......@@ -46,6 +47,12 @@ TEST_F(CUDA, RELAYOUT_FORMAT_NCHW4_NCHW) {
.set_param(param)
.execs({{1, 1, 2, 2, 4}, {}});
checker.set_dtype(0, dtype::Quantized8Asymm{1.f, 128})
.set_dtype(1, dtype::Quantized8Asymm{1.f, 128})
.set_rng(0, &u8_rng)
.set_param(param)
.execs({{1, 1, 2, 2, 4}, {}});
checker.set_dtype(0, dtype::QuantizedS8{0.1f})
.set_dtype(1, dtype::QuantizedS8{0.1f})
.set_rng(0, &rng)
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册