Commit b4687ce8 authored by Megvii Engine Team, committed by huangxinda

feat(dnn/cuda): add convolution with i8 input and u4 output

GitOrigin-RevId: 8be439abf1f448a6be33ea0e57c48e674c5c94c4
Parent 00083d13
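
Background for the diff below: the dp4a implicit-GEMM conv-bias epilogue writes a saturating linear combination of the int32 convolution accumulator, the bias, and an optional fused residual tensor z. Supporting an asymmetric u4 (Quantized4Asymm) destination threads two extra terms through that combination: theta, the destination zero point, and delta, which cancels z's zero point. A minimal sketch of the per-element math, assuming the clamped linear-combination form implied by the parameter setup in this commit; epilogue_u4 and its signature are illustrative names, not code from the tree, and the epilogue's separate scale parameter (used by some activation modes) is omitted:

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    // Sketch only: dst = clamp(round(alpha * acc + beta * bias + gamma * z
    //                                 + delta + theta)) into the u4 range.
    // theta is the destination zero point; delta = -z_zero * gamma, so
    // gamma * z + delta == gamma * (z - z_zero) and z's zero point cancels.
    inline uint8_t epilogue_u4(int32_t acc, int32_t bias, uint8_t z,
                               float alpha, float beta, float gamma,
                               float delta, float theta) {
        float f = alpha * acc + beta * bias + gamma * z + delta + theta;
        int q = static_cast<int>(std::lround(f));  // round to nearest
        return static_cast<uint8_t>(std::min(15, std::max(0, q)));  // u4 clamp
    }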
@@ -960,7 +960,7 @@ void megdnn::cuda::cutlass_wrapper::
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, \
cutlass::conv::threadblock:: \
ConvolutionFpropNCxHWxThreadblockSwizzle, \
-stages_, 4, aligned_, NeedLoadFromConstMem, \
+stages_, 4, aligned_, true, \
cutlass::arch::OpMultiplyAddSaturate>; \
typename Convolution::ConvolutionParameter conv_param( \
param.n, param.hi, param.wi, param.ci, param.co, param.fh, \
@@ -1020,7 +1020,7 @@ void megdnn::cuda::cutlass_wrapper::
ElementOutput, 8, ElementAccumulator, ElementBias,
ElementCompute>;
typename EpilogueOp::Params epilogue{alpha, beta, gamma,
-scale, detla, theta};
+scale, delta, theta};
DISPATCH_KERNEL;
}
default:
......
/**
* \file
- * dnn/src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl
+ * dnn/src/cuda/conv_bias/int8/implicit_gemm_conv_bias_cutlass_wrapper.cuinl
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
......
@@ -181,6 +181,12 @@ void ConvBiasForwardImpl::AlgoInt8NCHW4DotProdImplicitGemm::exec(
float alpha = src_scale * filter_scale;
float beta = 1.f;
float dst_scale = 1.f;
+float gamma = 0.f;
+float theta = 0.f;
+if (args.dst_layout->dtype.enumv() == DTypeEnum::Quantized4Asymm) {
+    theta = args.dst_layout->dtype.param<dtype::Quantized4Asymm>()
+                    .zero_point;
+}
if (args.bias_layout->dtype.enumv() == DTypeEnum::QuantizedS32) {
megdnn_assert(args.dst_layout->dtype.category() ==
DTypeCategory::QUANTIZED);
@@ -189,7 +195,7 @@ void ConvBiasForwardImpl::AlgoInt8NCHW4DotProdImplicitGemm::exec(
dst_scale = get_scale(args.dst_layout->dtype);
alpha /= dst_scale, beta = bias_scale / dst_scale;
}
-float gamma = 0.f;
+float delta = 0.f;
if (args.z_layout->ndim > 0) {
gamma = 1.f;
if (args.z_layout->dtype.category() == DTypeCategory::QUANTIZED) {
@@ -198,6 +204,12 @@ void ConvBiasForwardImpl::AlgoInt8NCHW4DotProdImplicitGemm::exec(
float z_scale = get_scale(args.z_layout->dtype);
gamma = z_scale / dst_scale;
}
+if (args.z_layout->dtype.enumv() == DTypeEnum::Quantized4Asymm) {
+    uint8_t z_zero =
+            args.z_layout->dtype.param<dtype::Quantized4Asymm>()
+                    .zero_point;
+    delta = -z_zero * gamma;
+}
}
uint32_t nonlinear_mode = static_cast<uint32_t>(param.nonlineMode);
bool nonunity_kernel = !(fh == 1 && fw == 1);
@@ -244,14 +256,15 @@ void ConvBiasForwardImpl::AlgoInt8NCHW4DotProdImplicitGemm::exec(
DISPATCH(false);
#undef cb
} else if (param.format == Format::NCHW4_NHWC) {
-#define cb(_nonunity_kernel) \
+#define cb(_signedness) \
cutlass_wrapper::do_conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nhwc< \
-_nonunity_kernel>( \
+_signedness>( \
args.src_tensor->compatible_ptr<int8_t>(), filter_ptr, \
args.bias_tensor->compatible_ptr<int32_t>(), \
reinterpret_cast<int8_t*>(args.z_tensor->raw_ptr), \
reinterpret_cast<int8_t*>(args.dst_tensor->raw_ptr), nullptr, \
-kern_param, nonlinear_mode, alpha, beta, gamma, dst_scale, \
+kern_param, nonlinear_mode, alpha, beta, gamma, delta, theta, \
+dst_scale, \
cutlass_wrapper::GemmCoord{m_algo_param.threadblock_m, \
m_algo_param.threadblock_n, \
m_algo_param.threadblock_k}, \
......@@ -259,7 +272,13 @@ void ConvBiasForwardImpl::AlgoInt8NCHW4DotProdImplicitGemm::exec(
m_algo_param.warp_n, \
m_algo_param.warp_k}, \
m_algo_param.stage, stream);
-cb(true);
+if (args.dst_layout->dtype.enumv() == DTypeEnum::QuantizedS4) {
+    cb(true);
+} else {
+    megdnn_assert(args.dst_layout->dtype.enumv() ==
+                  DTypeEnum::Quantized4Asymm);
+    cb(false);
+}
#undef cb
} else {
megdnn_assert(param.format == Format::NCHW4_NCHW32);
......
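
A worked example of the new terms, using illustrative values rather than anything from the diff: with z_scale = 0.25, dst_scale = 0.5 and a z zero point of 8, the setup above gives gamma = 0.25 / 0.5 = 0.5 and delta = -8 * 0.5 = -4.0; a Quantized4Asymm destination with zero point 7 gives theta = 7.0. The epilogue then stores alpha * acc + beta * bias + 0.5 * z - 4.0 + 7.0, saturated to the unsigned 4-bit range.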
@@ -3801,7 +3801,6 @@ void FoldingConvBiasDimshufflePass::apply(OptState& opt) const {
return false;
auto in_dtype = typecvt->input(0)->dtype(),
out_dtype = typecvt->output(0)->dtype();
printf("%s, %s\n", in_dtype.name(), out_dtype.name());
bool is_s82s4 = in_dtype.enumv() == DTypeEnum::QuantizedS8 &&
(out_dtype.enumv() == DTypeEnum::QuantizedS4 ||
out_dtype.enumv() == DTypeEnum::Quantized4Asymm);
......
@@ -4159,14 +4159,7 @@ TEST(TestGoptInference, FoldingConvDimshuffle) {
REQUIRE_GPU(1);
auto cn = CompNode::load("gpu0");
cn.activate();
auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop;
auto sm_ver = prop.major * 10 + prop.minor;
if (sm_ver < 61) {
printf("This testcast ignored due to insufficient cuda cap(got: %d, "
"expected: %d)\n",
sm_ver, 61);
return;
}
REQUIRE_CUDA_COMPUTE_CAPABILITY(6, 1);
HostTensorGenerator<dtype::Int8> gen;
auto graph = ComputingGraph::make();
@@ -4240,14 +4233,7 @@ TEST(TestGoptInference, FoldingConvDimshuffleNCHW4NCHW32) {
REQUIRE_GPU(1);
auto cn = CompNode::load("gpu0");
cn.activate();
auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop;
auto sm_ver = prop.major * 10 + prop.minor;
if (sm_ver < 61) {
printf("This testcast ignored due to insufficient cuda cap(got: %d, "
"expected: %d)\n",
sm_ver, 61);
return;
}
REQUIRE_CUDA_COMPUTE_CAPABILITY(6, 1);
HostTensorGenerator<dtype::Int8> gen;
auto graph = ComputingGraph::make();
@@ -4326,14 +4312,7 @@ TEST(TestGoptInference, FoldingConvDimshuffleNCHW32NCHW4) {
REQUIRE_GPU(1);
auto cn = CompNode::load("gpu0");
cn.activate();
auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop;
auto sm_ver = prop.major * 10 + prop.minor;
if (sm_ver < 75) {
printf("This testcast ignored due to insufficient cuda cap(got: %d, "
"expected: %d)\n",
sm_ver, 75);
return;
}
REQUIRE_CUDA_COMPUTE_CAPABILITY(7, 5);
HostTensorGenerator<dtype::Int8> gen;
auto graph = ComputingGraph::make();
@@ -4405,14 +4384,7 @@ TEST(TestGoptInference, FoldingConvDimshuffleNCHW4NHWC) {
REQUIRE_GPU(1);
auto cn = CompNode::load("gpu0");
cn.activate();
auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop;
auto sm_ver = prop.major * 10 + prop.minor;
if (sm_ver < 75) {
printf("This testcast ignored due to insufficient cuda cap(got: %d, "
"expected: %d)\n",
sm_ver, 75);
return;
}
REQUIRE_CUDA_COMPUTE_CAPABILITY(7, 5);
HostTensorGenerator<dtype::Int8> gen;
auto graph = ComputingGraph::make();
@@ -4466,7 +4438,6 @@ TEST(TestGoptInference, FoldingConvDimshuffleNCHW4NHWC) {
->writeto_fpath(output_file(
"TestGoptInference.FoldingConvDimshuffleNCHW4NHWC.json"));
size_t nr_dimshuffle = find_opr_num<opr::TypeCvt>(y_fuse);
printf("%zu \n", nr_dimshuffle);
ASSERT_EQ(3u, find_opr_num<opr::Dimshuffle>(y_fuse));
bool found = false;
cg::DepOprIter{[&found](cg::OperatorNodeBase* opr) {
......
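
The four tests above swap the repeated hand-rolled SM-version guard for REQUIRE_CUDA_COMPUTE_CAPABILITY. Its real definition lives in the MegEngine test utilities; the following is only a plausible reconstruction inferred from the guard it replaces (note it assumes a CompNode named cn in scope, as in these tests):

    #define REQUIRE_CUDA_COMPUTE_CAPABILITY(x, y)                             \
        do {                                                                  \
            auto&& prop =                                                     \
                    CompNodeEnv::from_comp_node(cn).cuda_env().device_prop;   \
            int got = prop.major * 10 + prop.minor;                           \
            if (got < (x)*10 + (y)) {                                         \
                printf("test skipped due to insufficient cuda cap "           \
                       "(got: %d, expected: %d)\n",                           \
                       got, (x)*10 + (y));                                    \
                return;                                                       \
            }                                                                 \
        } while (0)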