Commit 89ad33ae authored by Megvii Engine Team

feat(dnn/cuda): support weight preprocessing for cutlass algorithms

GitOrigin-RevId: 7b77579acd8b61864dee06b3bba00d5c1b88cc8e
Parent 33e8879a
......@@ -428,6 +428,11 @@ public:
void exec(const ExecArgs& args) const override;
const char* name() const override { return m_name.c_str(); }
bool is_reproducible() const override { return true; }
size_t get_preprocess_workspace_in_bytes(
const SizeArgs& args) const override;
SmallVector<TensorLayout> deduce_preprocessed_filter_layout(
const SizeArgs& args) const override;
void exec_preprocess(const ExecArgs& args) const override;
private:
WorkspaceBundle get_workspace_bundle(dt_byte* raw_ptr,
......@@ -560,6 +565,11 @@ public:
const char* name() const override { return m_name.c_str(); }
bool is_reproducible() const override { return true; }
static std::string to_string(AlgoParam algo_param);
size_t get_preprocess_workspace_in_bytes(
const SizeArgs& args) const override;
SmallVector<TensorLayout> deduce_preprocessed_filter_layout(
const SizeArgs& args) const override;
void exec_preprocess(const ExecArgs& args) const override;
private:
WorkspaceBundle get_workspace_bundle(dt_byte* raw_ptr,
......
......@@ -65,8 +65,12 @@ bool ConvBiasForwardImpl::AlgoInt8NCHW32IMMAImplicitGemm::is_available(
WorkspaceBundle
ConvBiasForwardImpl::AlgoInt8NCHW32IMMAImplicitGemm::get_workspace_bundle(
dt_byte* raw_ptr, const SizeArgs& args) const {
size_t ws_filter = args.filter_layout->span().dist_byte();
return WorkspaceBundle{raw_ptr, {ws_filter}};
if (args.preprocessed_filter) {
return WorkspaceBundle{raw_ptr, {}};
} else {
size_t ws_filter = args.filter_layout->span().dist_byte();
return WorkspaceBundle{raw_ptr, {ws_filter}};
}
}
size_t
......@@ -82,12 +86,12 @@ void ConvBiasForwardImpl::AlgoInt8NCHW32IMMAImplicitGemm::exec(
auto&& fm = args.filter_meta;
UNPACK_CONV_BIAS_NCHW32_PARAM(*(args.src_layout), fm, *(args.dst_layout),
param);
auto ws = get_workspace_bundle(args.workspace.raw_ptr, args);
auto ws_filter = ws.get(0);
auto&& stream = cuda_stream(args.opr->handle());
// reformat filter from nchw32 to chwn32
{
int8_t* filter_ptr = nullptr;
if (args.preprocessed_filter == nullptr) {
filter_ptr = reinterpret_cast<int8_t*>(args.workspace.raw_ptr);
// reformat filter from nchw32 to chwn32
TensorLayout src{{co, ci / 32, fh, fw, 32}, dtype::Int8()};
src.init_contiguous_stride();
TensorLayout dst = src;
......@@ -99,11 +103,14 @@ void ConvBiasForwardImpl::AlgoInt8NCHW32IMMAImplicitGemm::exec(
TensorND ts_src, ts_dst;
ts_src.raw_ptr = args.filter_tensor->raw_ptr;
ts_src.layout = src;
ts_dst.raw_ptr = ws_filter;
ts_dst.raw_ptr = args.workspace.raw_ptr;
ts_dst.layout = dst;
auto&& transpose =
args.opr->handle()->create_operator<RelayoutForward>();
transpose->exec(ts_src, ts_dst);
} else {
filter_ptr = reinterpret_cast<int8_t*>(
args.preprocessed_filter->tensors[0].raw_ptr);
}
ConvParam kern_param;
......@@ -131,8 +138,7 @@ void ConvBiasForwardImpl::AlgoInt8NCHW32IMMAImplicitGemm::exec(
uint32_t nonlinear_mode = static_cast<uint32_t>(param.nonlineMode);
if (fh == 1 && fw == 1) {
cutlass_wrapper::do_conv_bias_int8_implicit_gemm_imma_ncdiv32hw32<
false>(args.src_tensor->compatible_ptr<int8_t>(),
reinterpret_cast<int8_t*>(ws_filter),
false>(args.src_tensor->compatible_ptr<int8_t>(), filter_ptr,
args.bias_tensor->compatible_ptr<int32_t>(), z_dev_ptr,
args.dst_tensor->compatible_ptr<int8_t>(), nullptr,
kern_param, nonlinear_mode, alpha, beta, gamma,
......@@ -146,8 +152,7 @@ void ConvBiasForwardImpl::AlgoInt8NCHW32IMMAImplicitGemm::exec(
stream);
} else {
cutlass_wrapper::do_conv_bias_int8_implicit_gemm_imma_ncdiv32hw32<true>(
args.src_tensor->compatible_ptr<int8_t>(),
reinterpret_cast<int8_t*>(ws_filter),
args.src_tensor->compatible_ptr<int8_t>(), filter_ptr,
args.bias_tensor->compatible_ptr<int32_t>(), z_dev_ptr,
args.dst_tensor->compatible_ptr<int8_t>(), nullptr, kern_param,
nonlinear_mode, alpha, beta, gamma, dst_scale,
......@@ -167,6 +172,41 @@ std::string ConvBiasForwardImpl::AlgoInt8NCHW32IMMAImplicitGemm::to_string(
algo_param.threadblock_n, algo_param.threadblock_k,
algo_param.warp_m, algo_param.warp_n, algo_param.warp_k);
}
size_t ConvBiasForwardImpl::AlgoInt8NCHW32IMMAImplicitGemm::
get_preprocess_workspace_in_bytes(const SizeArgs& args) const {
return 0_z;
}
SmallVector<TensorLayout> ConvBiasForwardImpl::
AlgoInt8NCHW32IMMAImplicitGemm::deduce_preprocessed_filter_layout(
const SizeArgs& args) const {
return {args.filter_layout->collapse_contiguous()};
}
void ConvBiasForwardImpl::AlgoInt8NCHW32IMMAImplicitGemm::exec_preprocess(
const ExecArgs& args) const {
using Format = Param::Format;
auto&& param = args.opr->param();
auto&& fm = args.filter_meta;
UNPACK_CONV_BIAS_NCHW32_PARAM(*(args.src_layout), fm, *(args.dst_layout),
param);
TensorLayout src{{co, ci / 32, fh, fw, 32}, dtype::Int8()};
src.init_contiguous_stride();
TensorLayout dst = src;
dst.stride[0] = 32;
dst.stride[1] = co * fh * fw * 32;
dst.stride[2] = co * fw * 32;
dst.stride[3] = co * 32;
dst.stride[4] = 1;
TensorND ts_src, ts_dst;
ts_src.raw_ptr = args.filter_tensor->raw_ptr;
ts_src.layout = src;
ts_dst.raw_ptr = args.preprocessed_filter->tensors[0].raw_ptr;
ts_dst.layout = dst;
auto&& transpose = args.opr->handle()->create_operator<RelayoutForward>();
transpose->exec(ts_src, ts_dst);
}
#endif
// vim: syntax=cpp.doxygen
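
Reviewer note: the stride table in `exec_preprocess` above encodes an NCHW32 -> CHWN32 permutation of the filter. A minimal standalone sketch of the same index mapping follows (plain CPU loops; the function name and raw pointers are illustrative only and not part of this patch). In the patch itself the copy is done on the GPU by `RelayoutForward` using the strided destination layout; the loops only document which element goes where.

```cpp
#include <cstddef>
#include <cstdint>

// Filter is stored as (co, ci/32, fh, fw, 32) contiguously (NCHW32).
// The destination strides in exec_preprocess place the co axis just
// outside the 32-channel block, i.e. CHWN32.
void reorder_nchw32_to_chwn32(const int8_t* src, int8_t* dst, int co, int ci,
                              int fh, int fw) {
    const int cb = ci / 32;  // number of 32-channel input blocks
    for (int n = 0; n < co; ++n)
        for (int c = 0; c < cb; ++c)
            for (int h = 0; h < fh; ++h)
                for (int w = 0; w < fw; ++w)
                    for (int k = 0; k < 32; ++k) {
                        // source offset: contiguous NCHW32
                        size_t s = ((((size_t)n * cb + c) * fh + h) * fw + w) * 32 + k;
                        // destination offset: CHWN32, matching the stride
                        // table {32, co*fh*fw*32, co*fw*32, co*32, 1}
                        size_t d = ((((size_t)c * fh + h) * fw + w) * co + n) * 32 + k;
                        dst[d] = src[s];
                    }
}
```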
......@@ -62,8 +62,12 @@ bool ConvBiasForwardImpl::AlgoInt8NCHW4DotProdImplicitGemm::is_available(
WorkspaceBundle
ConvBiasForwardImpl::AlgoInt8NCHW4DotProdImplicitGemm::get_workspace_bundle(
dt_byte* raw_ptr, const SizeArgs& args) const {
size_t ws_filter = args.filter_layout->span().dist_byte();
return WorkspaceBundle{raw_ptr, {ws_filter}};
if (args.preprocessed_filter) {
return WorkspaceBundle{raw_ptr, {}};
} else {
size_t ws_filter = args.filter_layout->span().dist_byte();
return WorkspaceBundle{raw_ptr, {ws_filter}};
}
}
size_t
......@@ -79,12 +83,12 @@ void ConvBiasForwardImpl::AlgoInt8NCHW4DotProdImplicitGemm::exec(
auto&& fm = args.filter_meta;
UNPACK_CONV_BIAS_NCHW4_PARAM(*(args.src_layout), fm, *(args.dst_layout),
param);
auto ws = get_workspace_bundle(args.workspace.raw_ptr, args);
auto ws_filter = ws.get(0);
auto&& stream = cuda_stream(args.opr->handle());
// reformat filter from nchw4 to chwn4
{
int8_t* filter_ptr = nullptr;
if (args.preprocessed_filter == nullptr) {
filter_ptr = reinterpret_cast<int8_t*>(args.workspace.raw_ptr);
// reformat filter from nchw4 to chwn4
TensorLayout src{{co, ci / 4 * fh * fw}, dtype::Int32()};
src.init_contiguous_stride();
TensorLayout dst = src;
......@@ -92,11 +96,14 @@ void ConvBiasForwardImpl::AlgoInt8NCHW4DotProdImplicitGemm::exec(
TensorND ts_src, ts_dst;
ts_src.raw_ptr = args.filter_tensor->raw_ptr;
ts_src.layout = src;
ts_dst.raw_ptr = ws_filter;
ts_dst.raw_ptr = args.workspace.raw_ptr;
ts_dst.layout = dst;
auto&& transpose =
args.opr->handle()->create_operator<RelayoutForward>();
transpose->exec(ts_src, ts_dst);
} else {
filter_ptr = reinterpret_cast<int8_t*>(
args.preprocessed_filter->tensors[0].raw_ptr);
}
convolution::ConvParam kern_param;
......@@ -124,8 +131,7 @@ void ConvBiasForwardImpl::AlgoInt8NCHW4DotProdImplicitGemm::exec(
uint32_t nonlinear_mode = static_cast<uint32_t>(param.nonlineMode);
if (fh == 1 && fw == 1) {
cutlass_wrapper::do_conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4<false>(
args.src_tensor->compatible_ptr<int8_t>(),
reinterpret_cast<int8_t*>(ws_filter),
args.src_tensor->compatible_ptr<int8_t>(), filter_ptr,
args.bias_tensor->compatible_ptr<int32_t>(), z_dev_ptr,
args.dst_tensor->compatible_ptr<int8_t>(), nullptr, kern_param,
nonlinear_mode, alpha, beta, gamma, dst_scale,
......@@ -138,8 +144,7 @@ void ConvBiasForwardImpl::AlgoInt8NCHW4DotProdImplicitGemm::exec(
stream);
} else {
cutlass_wrapper::do_conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4<true>(
args.src_tensor->compatible_ptr<int8_t>(),
reinterpret_cast<int8_t*>(ws_filter),
args.src_tensor->compatible_ptr<int8_t>(), filter_ptr,
args.bias_tensor->compatible_ptr<int32_t>(), z_dev_ptr,
args.dst_tensor->compatible_ptr<int8_t>(), nullptr, kern_param,
nonlinear_mode, alpha, beta, gamma, dst_scale,
......@@ -153,4 +158,35 @@ void ConvBiasForwardImpl::AlgoInt8NCHW4DotProdImplicitGemm::exec(
}
}
size_t ConvBiasForwardImpl::AlgoInt8NCHW4DotProdImplicitGemm::
get_preprocess_workspace_in_bytes(const SizeArgs& args) const {
return 0_z;
}
SmallVector<TensorLayout> ConvBiasForwardImpl::
AlgoInt8NCHW4DotProdImplicitGemm::deduce_preprocessed_filter_layout(
const SizeArgs& args) const {
return {args.filter_layout->collapse_contiguous()};
}
void ConvBiasForwardImpl::AlgoInt8NCHW4DotProdImplicitGemm::exec_preprocess(
const ExecArgs& args) const {
using Format = Param::Format;
auto&& param = args.opr->param();
auto&& fm = args.filter_meta;
UNPACK_CONV_BIAS_NCHW4_PARAM(*(args.src_layout), fm, *(args.dst_layout),
param);
TensorLayout src{{co, ci / 4 * fh * fw}, dtype::Int32()};
src.init_contiguous_stride();
TensorLayout dst = src;
dst.stride[0] = 1, dst.stride[1] = dst[0];
TensorND ts_src, ts_dst;
ts_src.raw_ptr = args.filter_tensor->raw_ptr;
ts_src.layout = src;
ts_dst.raw_ptr = args.preprocessed_filter->tensors[0].raw_ptr;
ts_dst.layout = dst;
auto&& transpose = args.opr->handle()->create_operator<RelayoutForward>();
transpose->exec(ts_src, ts_dst);
}
// vim: syntax=cpp.doxygen
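
Reviewer note: the NCHW4 path above packs each group of 4 consecutive int8 weights into one int32 word and then transposes a (co, ci/4*fh*fw) matrix into column-major order, which yields CHWN4. A hypothetical CPU-side equivalent (illustrative helper only, assumes the filter buffers are 4-byte aligned):

```cpp
#include <cstddef>
#include <cstdint>

// Transpose the (co, k) int32 view of the filter, with k = ci/4 * fh * fw.
// This matches the destination strides {1, co} set in exec_preprocess.
void reorder_nchw4_to_chwn4(const int8_t* src8, int8_t* dst8, int co, int ci,
                            int fh, int fw) {
    const int32_t* src = reinterpret_cast<const int32_t*>(src8);
    int32_t* dst = reinterpret_cast<int32_t*>(dst8);
    const int k = ci / 4 * fh * fw;
    for (int n = 0; n < co; ++n)
        for (int j = 0; j < k; ++j)
            dst[static_cast<size_t>(j) * co + n] =
                    src[static_cast<size_t>(n) * k + j];  // column-major store
}
```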
......@@ -1084,6 +1084,42 @@ TEST_F(CUDA, CONV_BIAS_INT8_CHWN4_UNROLL_WIDTH_TENSORCORE_1x1_ALGO_2) {
}
TEST_F(CUDA, CUTLASS_WEIGHT_PREPROCESS) {
require_compute_capability(6, 1);
Checker<ConvBiasForward, OprWeightPreprocessProxy<ConvBiasForward>> checker(
handle_cuda());
auto check = [&checker](const std::string& algo) {
checker.set_before_exec_callback(
conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(algo.c_str()));
UniformIntRNG rng{-16, 16};
UniformIntRNG bias_rng{-50, 50};
UniformIntRNG const_rng{1, 1};
checker.set_rng(0, &rng)
.set_rng(1, &rng)
.set_rng(2, &bias_rng)
.set_rng(3, &rng)
.set_dtype(0, dtype::QuantizedS8{1.2f})
.set_dtype(1, dtype::QuantizedS8{1.3f})
.set_dtype(2, dtype::QuantizedS32{1.2f * 1.3f})
.set_dtype(3, dtype::QuantizedS8{1.3f})
.set_dtype(4, dtype::QuantizedS8{1.0f})
.set_epsilon(1 + 1e-3)
.set_max_avg_error(1e-1)
.set_max_avg_biased_error(1e-3);
param::ConvBias param;
param.pad_h = param.pad_w = 1;
param.stride_h = param.stride_w = 2;
param.format = param::ConvBias::Format::NCHW4;
checker.set_param(param).execs({{16, 4, 14, 14, 4},
{16, 4, 3, 3, 4},
{1, 4, 1, 1, 4},
{},
{}});
};
check("INT8_NCHW4_DOTPROD_IMPLICIT_GEMM_128X32X32_64X32X32");
check("INT8_NCHW4_DOTPROD_IMPLICIT_GEMM_16X64X8_16X64X8");
}
#if CUDA_VERSION >= 10020
/// \note: we only check several cases and block sizes in megdnn_test, the
/// full testcases are written in cutlass repository
......