Commit 5d350fc8 authored by Megvii Engine Team

feat(dnn/cuda): add int8 deconv and fix the cutlass conv wrapper based on the modified cutlass 2.4

GitOrigin-RevId: 49e0565e8a882a455387c95a80a8037c29166a76
Parent 4917534b
......@@ -5,6 +5,7 @@ dnn/src/cuda/conv_bias/int8_imma/kimpl/* binary
dnn/src/cuda/batch_conv_bias/int8/kimpl/* binary
dnn/src/cuda/matrix_mul/fp32_simt/kimpl/* binary
dnn/src/cuda/sass/prebuilt/map_defs.cpp binary
dnn/src/cuda/convolution/backward_data/int8/kimpl/* binary
tools/mlir/mlir-tblgen filter=lfs diff=lfs merge=lfs -text
*.caffemodel filter=lfs diff=lfs merge=lfs -text
imperative/python/test/integration/data/*.mge filter=lfs diff=lfs merge=lfs -text
......
......@@ -46,7 +46,7 @@ void make_canonized_filter_meta_nchw_nhwc(
size_t src_ndim, const TensorLayout& filter, const Param& param,
typename ConvolutionBase<Parameter>::CanonizedFilterMeta& ret) {
megdnn_assert(param.format == Param::Format::NCHW ||
param.format == Param::Format::NHWC );
param.format == Param::Format::NHWC);
auto img_ndim = src_ndim - 2;
size_t flt_start, flt_spatial_start, ocpg_pos, icpg_pos;
if (param.sparse == Param::Sparse::DENSE) {
......@@ -320,8 +320,8 @@ void make_canonized_filter_meta_nchwxx(
img_ndim, filter.ndim);
megdnn_assert((filter[filter.ndim - 1] == pack_size &&
filter[filter.ndim - 2] == pack_size) ||
(filter[filter.ndim - 1] == 2 * pack_size &&
filter[filter.ndim - 2] == 2 * pack_size),
(filter[filter.ndim - 1] == 2 * pack_size &&
filter[filter.ndim - 2] == 2 * pack_size),
"last 2 dim of filter must be %zu, but got %zu, %zu",
pack_size, filter[filter.ndim - 2],
filter[filter.ndim - 1]);
......@@ -684,7 +684,8 @@ ConvolutionBase<Parameter>::deduce_layout_fwd(const TensorLayout& src,
}
if (param().format == Param::Format::NCHW44 ||
param().format == Param::Format::NCHW44_DOT) {
        //!support nchw44 filter change to 88 for int8 winogradf23_88 using MK8 matmul
        //! support nchw44 filter change to 88 for int8 winogradf23_88 using
        //! MK8 matmul
megdnn_assert((src.ndim == 4 && filter.ndim == 5 &&
filter[filter.ndim - 1] == 4) ||
(src.ndim == 5 &&
......@@ -716,7 +717,7 @@ ConvolutionBase<Parameter>::deduce_layout_fwd(const TensorLayout& src,
"currently only convolution on 2D image is supported");
auto cflt = make_canonized_filter_meta(src.ndim, filter);
if (param().format == Param::Format::NCHW ||
param().format == Param::Format::NHWC ) {
param().format == Param::Format::NHWC) {
size_t src_or_dst_c_pos = 0;
size_t src_or_dst_spatial_start = 0;
if (param().format == Param::Format::NCHW) {
......@@ -790,7 +791,7 @@ ConvolutionBase<Parameter>::deduce_layout_fwd(const TensorLayout& src,
dst[3] = infer_conv_shape(src[3], cflt.dilated_spatial[1],
cflt.stride[1], cflt.padding[1]);
dst[4] = 32;
} else if (param().format == Param::Format::NCHW88 ) {
} else if (param().format == Param::Format::NCHW88) {
megdnn_assert(src.ndim == 5 || (src.ndim == 4 && src[1] <= 8),
"invalid src ndim for NCHW88, expected=5 or 4, got=%zu",
src.ndim);
......@@ -1042,10 +1043,10 @@ void ConvolutionBackwardData::deduce_dtype(DType filter, DType diff,
}
megdnn_assert(param().compute_mode != Param::ComputeMode::FLOAT32
#if !MEGDNN_DISABLE_FLOAT16
|| filter.enumv() == DTypeEnum::Float16
|| filter.enumv() == DTypeEnum::BFloat16
|| filter.enumv() == DTypeEnum::Float16 ||
filter.enumv() == DTypeEnum::BFloat16
#endif
,
,
"ComputeMode::FLOAT32 is only available for Float16/BFloat16 "
"input / output.");
}
......@@ -1096,6 +1097,24 @@ void ConvolutionBackwardData::deduce_layout(const TensorLayout& filter,
diff[i + src_or_dst_spatial_start], cflt.dilated_spatial[i],
cflt.stride[i], cflt.padding[i]);
}
} else if (param().format == Param::Format::NCHW4) {
megdnn_assert(diff.ndim == 5,
"valid diff ndim for NCHW4, expected=5, got=%zu",
diff.ndim);
megdnn_assert(cflt.group == 1, "%s", errmsg().c_str());
megdnn_assert(cflt.ocpg * cflt.group == diff[1] * 4, "%s",
errmsg().c_str());
grad.ndim = diff.ndim;
grad[0] = diff[0];
auto ic = cflt.icpg * cflt.group;
megdnn_assert(ic % 4 == 0);
grad[1] = ic / 4;
grad[2] = deduce(diff[2], cflt.dilated_spatial[0], cflt.stride[0],
cflt.padding[0]);
grad[3] = deduce(diff[3], cflt.dilated_spatial[1], cflt.stride[1],
cflt.padding[1]);
megdnn_assert(diff[4] == 4);
grad[4] = 4;
} else {
megdnn_assert(param().format == Param::Format::NHWCD4);
megdnn_assert(diff.ndim == 5,
......
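Note: the NCHW4 branch added above deduces the gradient's spatial size with the usual transposed-convolution relation. A minimal sketch, assuming `deduce` implements the standard formula `grad = (diff - 1) * stride + dilated_filter - 2 * pad` (the helper name below is hypothetical):

```cpp
#include <cassert>
#include <cstddef>

// Standard transposed-convolution (deconv) size relation, assumed to match
// the `deduce` helper used in the NCHW4 branch of deduce_layout above.
size_t deduce_deconv_size(size_t diff, size_t dilated_filter, size_t stride,
                          size_t pad) {
    assert(diff > 0);
    return (diff - 1) * stride + dilated_filter - 2 * pad;
}

int main() {
    // diff spatial 7, 3x3 filter, stride 2, pad 1 -> grad spatial 13;
    // channels are handled separately in the diff (grad[1] = ic / 4, grad[4] = 4).
    assert(deduce_deconv_size(7, 3, 2, 1) == 13);
    return 0;
}
```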
......@@ -62,22 +62,21 @@ void megdnn::cuda::cutlass_wrapper::
threadblock_k_>; \
using WarpShape = cutlass::gemm::GemmShape<warp_m_, warp_n_, warp_k_>; \
using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; \
using Convolution = cutlass::convolution::device::Convolution< \
using Convolution = cutlass::conv::device::Convolution< \
int8_t, cutlass::layout::TensorNCxHWx<32>, int8_t, \
cutlass::layout::TensorCxRSKx<32>, ElementOutput, \
cutlass::layout::TensorNCxHWx<32>, int32_t, \
cutlass::layout::TensorNCxHWx<32>, int32_t, \
cutlass::convolution::ConvType::kConvolution, \
cutlass::conv::ConvType::kConvolution, \
cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, \
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, \
cutlass::convolution::threadblock:: \
ConvolutionNCxHWxThreadblockSwizzle< \
cutlass::convolution::ConvType::kConvolution>, \
cutlass::conv::threadblock:: \
ConvolutionFpropNCxHWxThreadblockSwizzle, \
2, 16, 16, NeedLoadFromConstMem>; \
typename Convolution::ConvolutionParameter conv_param{ \
param.n, param.ci, param.co, param.hi, param.wi, \
param.fh, param.fw, param.ho, param.wo, param.sh, \
param.sw, param.ph, param.pw, 1, 1}; \
typename Convolution::ConvolutionParameter conv_param( \
param.n, param.hi, param.wi, param.ci, param.co, param.fh, \
param.fw, param.ho, param.wo, param.ph, param.pw, param.sh, \
param.sw, 1, 1, cutlass::conv::Mode::kCrossCorrelation); \
return cutlass_convolution_wrapper<Convolution>( \
d_src, d_filter, d_bias, d_z, d_dst, workspace, conv_param, \
epilogue, stream); \
......@@ -186,22 +185,21 @@ void megdnn::cuda::cutlass_wrapper::
threadblock_k_>; \
using WarpShape = cutlass::gemm::GemmShape<warp_m_, warp_n_, warp_k_>; \
using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; \
using Convolution = cutlass::convolution::device::Convolution< \
using Convolution = cutlass::conv::device::Convolution< \
int8_t, cutlass::layout::TensorNCxHWx<32>, int8_t, \
cutlass::layout::TensorCxRSKx<32>, ElementOutput, \
cutlass::layout::TensorNCxHWx<4>, int32_t, \
cutlass::layout::TensorNCxHWx<4>, int32_t, \
cutlass::convolution::ConvType::kConvolution, \
cutlass::conv::ConvType::kConvolution, \
cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, \
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, \
cutlass::convolution::threadblock:: \
ConvolutionNCxHWxThreadblockSwizzle< \
cutlass::convolution::ConvType::kConvolution>, \
cutlass::conv::threadblock:: \
ConvolutionFpropNCxHWxThreadblockSwizzle, \
2, 16, 16, NeedLoadFromConstMem>; \
typename Convolution::ConvolutionParameter conv_param{ \
param.n, param.ci, param.co, param.hi, param.wi, \
param.fh, param.fw, param.ho, param.wo, param.sh, \
param.sw, param.ph, param.pw, 1, 1}; \
typename Convolution::ConvolutionParameter conv_param( \
param.n, param.hi, param.wi, param.ci, param.co, param.fh, \
param.fw, param.ho, param.wo, param.ph, param.pw, param.sh, \
param.sw, 1, 1, cutlass::conv::Mode::kCrossCorrelation); \
return cutlass_convolution_wrapper<Convolution>( \
d_src, d_filter, d_bias, d_z, d_dst, workspace, conv_param, \
epilogue, stream); \
......@@ -311,22 +309,21 @@ void megdnn::cuda::cutlass_wrapper::
threadblock_k_>; \
using WarpShape = cutlass::gemm::GemmShape<warp_m_, warp_n_, warp_k_>; \
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; \
using Convolution = cutlass::convolution::device::Convolution< \
using Convolution = cutlass::conv::device::Convolution< \
int8_t, cutlass::layout::TensorNCxHWx<4>, int8_t, \
cutlass::layout::TensorCxRSKx<4>, ElementOutput, \
cutlass::layout::TensorNCxHWx<4>, int32_t, \
cutlass::layout::TensorNCxHWx<4>, int32_t, \
cutlass::convolution::ConvType::kConvolution, \
cutlass::conv::ConvType::kConvolution, \
cutlass::arch::OpClassSimt, cutlass::arch::Sm61, \
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, \
cutlass::convolution::threadblock:: \
ConvolutionNCxHWxThreadblockSwizzle< \
cutlass::convolution::ConvType::kConvolution>, \
cutlass::conv::threadblock:: \
ConvolutionFpropNCxHWxThreadblockSwizzle, \
stage_, 4, aligned_, NeedLoadFromConstMem>; \
typename Convolution::ConvolutionParameter conv_param{ \
param.n, param.ci, param.co, param.hi, param.wi, \
param.fh, param.fw, param.ho, param.wo, param.sh, \
param.sw, param.ph, param.pw, 1, 1}; \
typename Convolution::ConvolutionParameter conv_param( \
param.n, param.hi, param.wi, param.ci, param.co, param.fh, \
param.fw, param.ho, param.wo, param.ph, param.pw, param.sh, \
param.sw, 1, 1, cutlass::conv::Mode::kCrossCorrelation); \
return cutlass_convolution_wrapper<Convolution>( \
d_src, d_filter, d_bias, d_z, d_dst, workspace, conv_param, \
epilogue, stream); \
......@@ -441,23 +438,22 @@ void megdnn::cuda::cutlass_wrapper::
threadblock_k_>; \
using WarpShape = cutlass::gemm::GemmShape<warp_m_, warp_n_, warp_k_>; \
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; \
using Convolution = cutlass::convolution::device::Convolution< \
using Convolution = cutlass::conv::device::Convolution< \
int8_t, cutlass::layout::TensorNCxHWx<4>, int8_t, \
cutlass::layout::TensorCxRSKx<4>, ElementOutput, \
cutlass::layout::TensorNCHW, float, \
cutlass::layout::TensorNCHW, int32_t, \
cutlass::convolution::ConvType::kConvolution, \
cutlass::conv::ConvType::kConvolution, \
cutlass::arch::OpClassSimt, cutlass::arch::Sm61, \
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, \
cutlass::convolution::threadblock:: \
ConvolutionNCxHWxThreadblockSwizzle< \
cutlass::convolution::ConvType::kConvolution>, \
cutlass::conv::threadblock:: \
ConvolutionFpropNCxHWxThreadblockSwizzle, \
stages_, 4, aligned_, NeedLoadFromConstMem, \
cutlass::arch::OpMultiplyAdd>; \
typename Convolution::ConvolutionParameter conv_param{ \
param.n, param.ci, param.co, param.hi, param.wi, \
param.fh, param.fw, param.ho, param.wo, param.sh, \
param.sw, param.ph, param.pw, 1, 1}; \
typename Convolution::ConvolutionParameter conv_param( \
param.n, param.hi, param.wi, param.ci, param.co, param.fh, \
param.fw, param.ho, param.wo, param.ph, param.pw, param.sh, \
param.sw, 1, 1, cutlass::conv::Mode::kCrossCorrelation); \
return cutlass_convolution_wrapper<Convolution>( \
d_src, d_filter, d_bias, d_z, d_dst, workspace, conv_param, \
epilogue, stream); \
......@@ -572,36 +568,35 @@ void megdnn::cuda::cutlass_wrapper::
threadblock_k_>; \
using WarpShape = cutlass::gemm::GemmShape<warp_m_, warp_n_, warp_k_>; \
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; \
using Convolution = cutlass::convolution::device::Convolution< \
using Convolution = cutlass::conv::device::Convolution< \
int8_t, cutlass::layout::TensorNCxHWx<4>, int8_t, \
cutlass::layout::TensorCxRSKx<4>, ElementOutput, \
cutlass::layout::TensorNCxHWx<32>, int32_t, \
cutlass::layout::TensorNCxHWx<32>, int32_t, \
cutlass::convolution::ConvType::kConvolution, \
cutlass::conv::ConvType::kConvolution, \
cutlass::arch::OpClassSimt, cutlass::arch::Sm61, \
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, \
cutlass::convolution::threadblock:: \
ConvolutionNCxHWxThreadblockSwizzle< \
cutlass::convolution::ConvType::kConvolution>, \
cutlass::conv::threadblock:: \
ConvolutionFpropNCxHWxThreadblockSwizzle, \
stages_, 4, aligned_, NeedLoadFromConstMem>; \
typename Convolution::ConvolutionParameter conv_param{ \
param.n, param.ci, param.co, param.hi, param.wi, \
param.fh, param.fw, param.ho, param.wo, param.sh, \
param.sw, param.ph, param.pw, 1, 1}; \
typename Convolution::ConvolutionParameter conv_param( \
param.n, param.hi, param.wi, param.ci, param.co, param.fh, \
param.fw, param.ho, param.wo, param.ph, param.pw, param.sh, \
param.sw, 1, 1, cutlass::conv::Mode::kCrossCorrelation); \
return cutlass_convolution_wrapper<Convolution>( \
d_src, d_filter, d_bias, d_z, d_dst, workspace, conv_param, \
epilogue, stream); \
}
#define DISPATCH_KERNEL \
DISPATCH_KERNEL_WITH_TILE_SHAPE(128, 128, 32, 64, 32, 32, 2, 16); \
DISPATCH_KERNEL_WITH_TILE_SHAPE(128, 64, 32, 64, 32, 32, 2, 16); \
DISPATCH_KERNEL_WITH_TILE_SHAPE(64, 128, 32, 64, 32, 32, 2, 16); \
DISPATCH_KERNEL_WITH_TILE_SHAPE(128, 32, 32, 64, 32, 32, 2, 16); \
DISPATCH_KERNEL_WITH_TILE_SHAPE(32, 128, 32, 32, 64, 32, 2, 16); \
DISPATCH_KERNEL_WITH_TILE_SHAPE(64, 64, 32, 64, 32, 32, 2, 16); \
DISPATCH_KERNEL_WITH_TILE_SHAPE(32, 64, 32, 32, 64, 32, 2, 16); \
DISPATCH_KERNEL_WITH_TILE_SHAPE(64, 32, 32, 64, 32, 32, 2, 16); \
DISPATCH_KERNEL_WITH_TILE_SHAPE(32, 32, 32, 32, 32, 32, 2, 16); \
DISPATCH_KERNEL_WITH_TILE_SHAPE(128, 128, 32, 64, 32, 32, 2, 16); \
DISPATCH_KERNEL_WITH_TILE_SHAPE(128, 64, 32, 64, 32, 32, 2, 16); \
DISPATCH_KERNEL_WITH_TILE_SHAPE(64, 128, 32, 64, 32, 32, 2, 16); \
DISPATCH_KERNEL_WITH_TILE_SHAPE(128, 32, 32, 64, 32, 32, 2, 16); \
DISPATCH_KERNEL_WITH_TILE_SHAPE(32, 128, 32, 32, 64, 32, 2, 16); \
DISPATCH_KERNEL_WITH_TILE_SHAPE(64, 64, 32, 64, 32, 32, 2, 16); \
DISPATCH_KERNEL_WITH_TILE_SHAPE(32, 64, 32, 32, 64, 32, 2, 16); \
DISPATCH_KERNEL_WITH_TILE_SHAPE(64, 32, 32, 64, 32, 32, 2, 16); \
DISPATCH_KERNEL_WITH_TILE_SHAPE(32, 32, 32, 32, 32, 32, 2, 16); \
megdnn_assert(false, \
"unsupported threadblock shape (%dx%dx%d) and warp shape " \
"(%dx%dx%d)", \
......
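All of the macro bodies touched above apply the same two changes: the wrapper now instantiates `cutlass::conv::device::Convolution` with the fprop threadblock swizzle instead of the old `cutlass::convolution` types, and it hands the geometry to `ConvolutionParameter` in a new order with the convolution mode appended. A self-contained sketch of the reorder, using hypothetical plain structs (the real `ConvolutionParameter` constructor lives in the modified cutlass 2.4 fork):

```cpp
#include <cstdio>

// Hypothetical structs used only to illustrate the argument reorder above.
struct OldOrder {  // old brace-init: {n, ci, co, hi, wi, fh, fw, ho, wo, sh, sw, ph, pw, dh, dw}
    int n, ci, co, hi, wi, fh, fw, ho, wo, sh, sw, ph, pw, dh, dw;
};
struct NewOrder {  // new ctor: (n, hi, wi, ci, co, fh, fw, ho, wo, ph, pw, sh, sw, dh, dw, mode)
    int n, hi, wi, ci, co, fh, fw, ho, wo, ph, pw, sh, sw, dh, dw;
    bool cross_correlation;  // stands in for cutlass::conv::Mode::kCrossCorrelation
};

// Rearranges the old field order into the new one, mirroring the macro edits.
NewOrder to_new_order(const OldOrder& p) {
    return {p.n,  p.hi, p.wi, p.ci, p.co, p.fh, p.fw, p.ho,
            p.wo, p.ph, p.pw, p.sh, p.sw, p.dh, p.dw, true};
}

int main() {
    OldOrder old_p{1, 16, 32, 28, 28, 3, 3, 28, 28, 1, 1, 1, 1, 1, 1};
    NewOrder new_p = to_new_order(old_p);
    std::printf("N=%d H=%d W=%d C=%d K=%d\n", new_p.n, new_p.hi, new_p.wi,
                new_p.ci, new_p.co);
    return 0;
}
```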
......@@ -29,28 +29,30 @@ void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper(
cudaStream_t stream) {
typename Convolution::TensorRefSrc tensor_src{
const_cast<typename Convolution::ElementSrc*>(d_src),
Convolution::LayoutSrc::packed({conv_param.n(), conv_param.hi(),
conv_param.wi(), conv_param.ci()})};
Convolution::LayoutSrc::packed(
{conv_param.N, conv_param.H, conv_param.W, conv_param.C})};
typename Convolution::TensorRefFilter tensor_filter{
const_cast<typename Convolution::ElementFilter*>(d_filter),
Convolution::LayoutFilter::packed({conv_param.co(), conv_param.fh(),
conv_param.fw(),
conv_param.ci()})};
Convolution::LayoutFilter::packed(
{conv_param.K, conv_param.R, conv_param.S, conv_param.C})};
typename Convolution::TensorRefBias tensor_bias{
const_cast<typename Convolution::ElementBias*>(d_bias),
Convolution::LayoutBias::packed({1, 1, 1, conv_param.co()})};
Convolution::LayoutBias::packed({1, 1, 1, conv_param.K})};
typename Convolution::TensorRefDst tensor_z{
const_cast<typename Convolution::ElementDst*>(d_z),
Convolution::LayoutDst::packed({conv_param.n(), conv_param.ho(),
conv_param.wo(), conv_param.co()})};
Convolution::LayoutDst::packed(
{conv_param.N, conv_param.P, conv_param.Q, conv_param.K})};
typename Convolution::TensorRefDst tensor_dst{
d_dst,
Convolution::LayoutDst::packed({conv_param.n(), conv_param.ho(),
conv_param.wo(), conv_param.co()})};
typename Convolution::Arguments arguments{
conv_param, tensor_src, tensor_filter,
tensor_bias, tensor_z, tensor_dst.non_const_ref(),
epilogue};
Convolution::LayoutDst::packed(
{conv_param.N, conv_param.P, conv_param.Q, conv_param.K})};
typename Convolution::Arguments arguments{conv_param,
tensor_src.non_const_ref(),
tensor_filter.non_const_ref(),
tensor_bias.non_const_ref(),
tensor_z.non_const_ref(),
tensor_dst.non_const_ref(),
epilogue};
Convolution conv_op;
cutlass_check(conv_op.initialize(arguments, workspace));
cutlass_check(conv_op(stream));
......
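The wrapper body above also moves from the old accessor methods to public, Implicit-GEMM-style members on the parameter object, and passes every tensor ref through `non_const_ref()`. A hypothetical stand-in type, only to document the name mapping visible in the packed-layout calls (src `{N, H, W, C}`, filter `{K, R, S, C}`, dst `{N, P, Q, K}`):

```cpp
#include <cstdio>

// Hypothetical stand-in for the new parameter type; member names as used in this hunk.
struct ProblemSize {
    int N, H, W, C;  // batch, input spatial, input channels
    int K, R, S;     // output channels, filter spatial
    int P, Q;        // output spatial
    // Old-style accessors, kept only for comparison with the pre-change wrapper.
    int n() const { return N; }
    int hi() const { return H; }
    int wi() const { return W; }
    int ci() const { return C; }
    int co() const { return K; }
    int fh() const { return R; }
    int fw() const { return S; }
    int ho() const { return P; }
    int wo() const { return Q; }
};

int main() {
    ProblemSize p{1, 28, 28, 16, 32, 3, 3, 28, 28};
    // Same shapes the wrapper packs: src {N,H,W,C}, dst {N,P,Q,K}.
    std::printf("src={%d,%d,%d,%d} dst={%d,%d,%d,%d}\n",
                p.n(), p.hi(), p.wi(), p.ci(), p.N, p.P, p.Q, p.K);
    return 0;
}
```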
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.