From b4687ce8da766540455d0923788ad3e9fbc8f8e4 Mon Sep 17 00:00:00 2001 From: Megvii Engine Team Date: Mon, 24 May 2021 19:03:53 +0800 Subject: [PATCH] feat(dnn/cuda): add convolution with i8 input and u4 output GitOrigin-RevId: 8be439abf1f448a6be33ea0e57c48e674c5c94c4 --- .../conv_bias/cutlass_convolution_wrapper.cu | 4 +- ...licit_gemm_conv_bias_cutlass_wrapper.cuinl | 2 +- .../implicit_gemm_int8_nchw4_dp4a.cpp | 29 +++++++++++--- ...sh_s8_128x128x32_64x32x32_2_nc4hw4_nhwc.cu | Bin 1804 -> 1859 bytes ...ish_s8_128x32x32_64x32x32_2_nc4hw4_nhwc.cu | Bin 1802 -> 1857 bytes ...ish_s8_128x64x32_64x32x32_2_nc4hw4_nhwc.cu | Bin 1802 -> 1857 bytes ...sh_s8_16x128x16_16x128x16_1_nc4hw4_nhwc.cu | Bin 1803 -> 1858 bytes ...hswish_s8_16x64x8_16x64x8_2_nc4hw4_nhwc.cu | Bin 1795 -> 1850 bytes ...ish_s8_32x128x32_32x64x32_2_nc4hw4_nhwc.cu | Bin 1802 -> 1857 bytes ...wish_s8_32x32x32_32x32x32_2_nc4hw4_nhwc.cu | Bin 1800 -> 1855 bytes ...wish_s8_32x64x32_32x64x32_2_nc4hw4_nhwc.cu | Bin 1800 -> 1855 bytes ...ish_s8_64x128x32_64x32x32_2_nc4hw4_nhwc.cu | Bin 1802 -> 1857 bytes ...wish_s8_64x32x32_64x32x32_2_nc4hw4_nhwc.cu | Bin 1800 -> 1855 bytes ...wish_s8_64x64x32_64x32x32_2_nc4hw4_nhwc.cu | Bin 1800 -> 1855 bytes ...ty_s8_128x128x32_64x32x32_2_nc4hw4_nhwc.cu | Bin 1800 -> 1855 bytes ...ity_s8_128x32x32_64x32x32_2_nc4hw4_nhwc.cu | Bin 1798 -> 1853 bytes ...ity_s8_128x64x32_64x32x32_2_nc4hw4_nhwc.cu | Bin 1798 -> 1853 bytes ...ty_s8_16x128x16_16x128x16_1_nc4hw4_nhwc.cu | Bin 1799 -> 1854 bytes ...entity_s8_16x64x8_16x64x8_2_nc4hw4_nhwc.cu | Bin 1791 -> 1846 bytes ...ity_s8_32x128x32_32x64x32_2_nc4hw4_nhwc.cu | Bin 1798 -> 1853 bytes ...tity_s8_32x32x32_32x32x32_2_nc4hw4_nhwc.cu | Bin 1796 -> 1851 bytes ...tity_s8_32x64x32_32x64x32_2_nc4hw4_nhwc.cu | Bin 1796 -> 1851 bytes ...ity_s8_64x128x32_64x32x32_2_nc4hw4_nhwc.cu | Bin 1798 -> 1853 bytes ...tity_s8_64x32x32_64x32x32_2_nc4hw4_nhwc.cu | Bin 1796 -> 1851 bytes ...tity_s8_64x64x32_64x32x32_2_nc4hw4_nhwc.cu | Bin 1796 -> 1851 bytes ...lu_s8_128x128x32_64x32x32_2_nc4hw4_nhwc.cu | Bin 1800 -> 1855 bytes ...elu_s8_128x32x32_64x32x32_2_nc4hw4_nhwc.cu | Bin 1798 -> 1853 bytes ...elu_s8_128x64x32_64x32x32_2_nc4hw4_nhwc.cu | Bin 1798 -> 1853 bytes ...lu_s8_16x128x16_16x128x16_1_nc4hw4_nhwc.cu | Bin 1799 -> 1854 bytes ...p_relu_s8_16x64x8_16x64x8_2_nc4hw4_nhwc.cu | Bin 1791 -> 1846 bytes ...elu_s8_32x128x32_32x64x32_2_nc4hw4_nhwc.cu | Bin 1798 -> 1853 bytes ...relu_s8_32x32x32_32x32x32_2_nc4hw4_nhwc.cu | Bin 1796 -> 1851 bytes ...relu_s8_32x64x32_32x64x32_2_nc4hw4_nhwc.cu | Bin 1796 -> 1851 bytes ...elu_s8_64x128x32_64x32x32_2_nc4hw4_nhwc.cu | Bin 1798 -> 1853 bytes ...relu_s8_64x32x32_64x32x32_2_nc4hw4_nhwc.cu | Bin 1796 -> 1851 bytes ...relu_s8_64x64x32_64x32x32_2_nc4hw4_nhwc.cu | Bin 1796 -> 1851 bytes ...sh_s8_128x128x32_64x32x32_2_nc4hw4_nhwc.cu | Bin 0 -> 1861 bytes ...ish_s8_128x32x32_64x32x32_2_nc4hw4_nhwc.cu | Bin 0 -> 1859 bytes ...ish_s8_128x64x32_64x32x32_2_nc4hw4_nhwc.cu | Bin 0 -> 1859 bytes ...sh_s8_16x128x16_16x128x16_1_nc4hw4_nhwc.cu | Bin 0 -> 1860 bytes ...hswish_s8_16x64x8_16x64x8_2_nc4hw4_nhwc.cu | Bin 0 -> 1852 bytes ...ish_s8_32x128x32_32x64x32_2_nc4hw4_nhwc.cu | Bin 0 -> 1859 bytes ...wish_s8_32x32x32_32x32x32_2_nc4hw4_nhwc.cu | Bin 0 -> 1857 bytes ...wish_s8_32x64x32_32x64x32_2_nc4hw4_nhwc.cu | Bin 0 -> 1857 bytes ...ish_s8_64x128x32_64x32x32_2_nc4hw4_nhwc.cu | Bin 0 -> 1859 bytes ...wish_s8_64x32x32_64x32x32_2_nc4hw4_nhwc.cu | Bin 0 -> 1857 bytes ...wish_s8_64x64x32_64x32x32_2_nc4hw4_nhwc.cu | Bin 0 -> 1857 bytes ...ty_s8_128x128x32_64x32x32_2_nc4hw4_nhwc.cu | Bin 0 -> 1857 bytes ...ity_s8_128x32x32_64x32x32_2_nc4hw4_nhwc.cu | Bin 0 -> 1855 bytes ...ity_s8_128x64x32_64x32x32_2_nc4hw4_nhwc.cu | Bin 0 -> 1855 bytes ...ty_s8_16x128x16_16x128x16_1_nc4hw4_nhwc.cu | Bin 0 -> 1856 bytes ...entity_s8_16x64x8_16x64x8_2_nc4hw4_nhwc.cu | Bin 0 -> 1848 bytes ...ity_s8_32x128x32_32x64x32_2_nc4hw4_nhwc.cu | Bin 0 -> 1855 bytes ...tity_s8_32x32x32_32x32x32_2_nc4hw4_nhwc.cu | Bin 0 -> 1853 bytes ...tity_s8_32x64x32_32x64x32_2_nc4hw4_nhwc.cu | Bin 0 -> 1853 bytes ...ity_s8_64x128x32_64x32x32_2_nc4hw4_nhwc.cu | Bin 0 -> 1855 bytes ...tity_s8_64x32x32_64x32x32_2_nc4hw4_nhwc.cu | Bin 0 -> 1853 bytes ...tity_s8_64x64x32_64x32x32_2_nc4hw4_nhwc.cu | Bin 0 -> 1853 bytes ...lu_s8_128x128x32_64x32x32_2_nc4hw4_nhwc.cu | Bin 0 -> 1857 bytes ...elu_s8_128x32x32_64x32x32_2_nc4hw4_nhwc.cu | Bin 0 -> 1855 bytes ...elu_s8_128x64x32_64x32x32_2_nc4hw4_nhwc.cu | Bin 0 -> 1855 bytes ...lu_s8_16x128x16_16x128x16_1_nc4hw4_nhwc.cu | Bin 0 -> 1856 bytes ...p_relu_s8_16x64x8_16x64x8_2_nc4hw4_nhwc.cu | Bin 0 -> 1848 bytes ...elu_s8_32x128x32_32x64x32_2_nc4hw4_nhwc.cu | Bin 0 -> 1855 bytes ...relu_s8_32x32x32_32x32x32_2_nc4hw4_nhwc.cu | Bin 0 -> 1853 bytes ...relu_s8_32x64x32_32x64x32_2_nc4hw4_nhwc.cu | Bin 0 -> 1853 bytes ...elu_s8_64x128x32_64x32x32_2_nc4hw4_nhwc.cu | Bin 0 -> 1855 bytes ...relu_s8_64x32x32_64x32x32_2_nc4hw4_nhwc.cu | Bin 0 -> 1853 bytes ...relu_s8_64x64x32_64x32x32_2_nc4hw4_nhwc.cu | Bin 0 -> 1853 bytes src/gopt/impl/tensor_reformat.cpp | 1 - src/gopt/test/inference.cpp | 37 ++---------------- 71 files changed, 31 insertions(+), 42 deletions(-) create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_128x128x32_64x32x32_2_nc4hw4_nhwc.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_128x32x32_64x32x32_2_nc4hw4_nhwc.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_128x64x32_64x32x32_2_nc4hw4_nhwc.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_16x128x16_16x128x16_1_nc4hw4_nhwc.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_16x64x8_16x64x8_2_nc4hw4_nhwc.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_32x128x32_32x64x32_2_nc4hw4_nhwc.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_32x32x32_32x32x32_2_nc4hw4_nhwc.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_32x64x32_32x64x32_2_nc4hw4_nhwc.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_64x128x32_64x32x32_2_nc4hw4_nhwc.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_64x32x32_64x32x32_2_nc4hw4_nhwc.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_64x64x32_64x32x32_2_nc4hw4_nhwc.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_128x128x32_64x32x32_2_nc4hw4_nhwc.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_128x32x32_64x32x32_2_nc4hw4_nhwc.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_128x64x32_64x32x32_2_nc4hw4_nhwc.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_16x128x16_16x128x16_1_nc4hw4_nhwc.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_16x64x8_16x64x8_2_nc4hw4_nhwc.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_32x128x32_32x64x32_2_nc4hw4_nhwc.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_32x32x32_32x32x32_2_nc4hw4_nhwc.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_32x64x32_32x64x32_2_nc4hw4_nhwc.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_64x128x32_64x32x32_2_nc4hw4_nhwc.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_64x32x32_64x32x32_2_nc4hw4_nhwc.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_64x64x32_64x32x32_2_nc4hw4_nhwc.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_128x128x32_64x32x32_2_nc4hw4_nhwc.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_128x32x32_64x32x32_2_nc4hw4_nhwc.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_128x64x32_64x32x32_2_nc4hw4_nhwc.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_16x128x16_16x128x16_1_nc4hw4_nhwc.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_16x64x8_16x64x8_2_nc4hw4_nhwc.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_32x128x32_32x64x32_2_nc4hw4_nhwc.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_32x32x32_32x32x32_2_nc4hw4_nhwc.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_32x64x32_32x64x32_2_nc4hw4_nhwc.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_64x128x32_64x32x32_2_nc4hw4_nhwc.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_64x32x32_64x32x32_2_nc4hw4_nhwc.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_64x64x32_64x32x32_2_nc4hw4_nhwc.cu diff --git a/dnn/src/cuda/conv_bias/cutlass_convolution_wrapper.cu b/dnn/src/cuda/conv_bias/cutlass_convolution_wrapper.cu index 4e3cba3e1..7ad77f3ef 100644 --- a/dnn/src/cuda/conv_bias/cutlass_convolution_wrapper.cu +++ b/dnn/src/cuda/conv_bias/cutlass_convolution_wrapper.cu @@ -960,7 +960,7 @@ void megdnn::cuda::cutlass_wrapper:: ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, \ cutlass::conv::threadblock:: \ ConvolutionFpropNCxHWxThreadblockSwizzle, \ - stages_, 4, aligned_, NeedLoadFromConstMem, \ + stages_, 4, aligned_, true, \ cutlass::arch::OpMultiplyAddSaturate>; \ typename Convolution::ConvolutionParameter conv_param( \ param.n, param.hi, param.wi, param.ci, param.co, param.fh, \ @@ -1020,7 +1020,7 @@ void megdnn::cuda::cutlass_wrapper:: ElementOutput, 8, ElementAccumulator, ElementBias, ElementCompute>; typename EpilogueOp::Params epilogue{alpha, beta, gamma, - scale, detla, theta}; + scale, delta, theta}; DISPATCH_KERNEL; } default: diff --git a/dnn/src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl b/dnn/src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl index 9f09ce41d..6d1582bd8 100644 --- a/dnn/src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl +++ b/dnn/src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl @@ -1,6 +1,6 @@ /** * \file - * dnn/src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl + * dnn/src/cuda/conv_bias/int8/implicit_gemm_conv_bias_cutlass_wrapper.cuinl * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") * * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. diff --git a/dnn/src/cuda/conv_bias/implicit_gemm_int8_nchw4_dp4a.cpp b/dnn/src/cuda/conv_bias/implicit_gemm_int8_nchw4_dp4a.cpp index 87672047a..3287b80aa 100644 --- a/dnn/src/cuda/conv_bias/implicit_gemm_int8_nchw4_dp4a.cpp +++ b/dnn/src/cuda/conv_bias/implicit_gemm_int8_nchw4_dp4a.cpp @@ -181,6 +181,12 @@ void ConvBiasForwardImpl::AlgoInt8NCHW4DotProdImplicitGemm::exec( float alpha = src_scale * filter_scale; float beta = 1.f; float dst_scale = 1.f; + float gamma = 0.f; + float theta = 0.f; + if (args.dst_layout->dtype.enumv() == DTypeEnum::Quantized4Asymm) { + theta = args.dst_layout->dtype.param() + .zero_point; + } if (args.bias_layout->dtype.enumv() == DTypeEnum::QuantizedS32) { megdnn_assert(args.dst_layout->dtype.category() == DTypeCategory::QUANTIZED); @@ -189,7 +195,7 @@ void ConvBiasForwardImpl::AlgoInt8NCHW4DotProdImplicitGemm::exec( dst_scale = get_scale(args.dst_layout->dtype); alpha /= dst_scale, beta = bias_scale / dst_scale; } - float gamma = 0.f; + float delta = 0.f; if (args.z_layout->ndim > 0) { gamma = 1.f; if (args.z_layout->dtype.category() == DTypeCategory::QUANTIZED) { @@ -198,6 +204,12 @@ void ConvBiasForwardImpl::AlgoInt8NCHW4DotProdImplicitGemm::exec( float z_scale = get_scale(args.z_layout->dtype); gamma = z_scale / dst_scale; } + if (args.z_layout->dtype.enumv() == DTypeEnum::Quantized4Asymm) { + uint8_t z_zero = + args.z_layout->dtype.param() + .zero_point; + delta = -z_zero * gamma; + } } uint32_t nonlinear_mode = static_cast(param.nonlineMode); bool nonunity_kernel = !(fh == 1 && fw == 1); @@ -244,14 +256,15 @@ void ConvBiasForwardImpl::AlgoInt8NCHW4DotProdImplicitGemm::exec( DISPATCH(false); #undef cb } else if (param.format == Format::NCHW4_NHWC) { -#define cb(_nonunity_kernel) \ +#define cb(_signedness) \ cutlass_wrapper::do_conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nhwc< \ - _nonunity_kernel>( \ + _signedness>( \ args.src_tensor->compatible_ptr(), filter_ptr, \ args.bias_tensor->compatible_ptr(), \ reinterpret_cast(args.z_tensor->raw_ptr), \ reinterpret_cast(args.dst_tensor->raw_ptr), nullptr, \ - kern_param, nonlinear_mode, alpha, beta, gamma, dst_scale, \ + kern_param, nonlinear_mode, alpha, beta, gamma, delta, theta, \ + dst_scale, \ cutlass_wrapper::GemmCoord{m_algo_param.threadblock_m, \ m_algo_param.threadblock_n, \ m_algo_param.threadblock_k}, \ @@ -259,7 +272,13 @@ void ConvBiasForwardImpl::AlgoInt8NCHW4DotProdImplicitGemm::exec( m_algo_param.warp_n, \ m_algo_param.warp_k}, \ m_algo_param.stage, stream); - cb(true); + if (args.dst_layout->dtype.enumv() == DTypeEnum::QuantizedS4) { + cb(true); + } else { + megdnn_assert(args.dst_layout->dtype.enumv() == + DTypeEnum::Quantized4Asymm); + cb(false); + } #undef cb } else { megdnn_assert(param.format == Format::NCHW4_NCHW32); diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_128x128x32_64x32x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_128x128x32_64x32x32_2_nc4hw4_nhwc.cu index 42e715a0b9d61076178dfb86fc5cf4f51dba2169..4af7d5a52b4f5d61b2517ebd01d773c8d2fe53f0 100644 GIT binary patch delta 50 zcmeC-JIuG?Iosq0HYM?jlA^?b#G=Gpg;WqTz5vA5w4UfE!CjDFz@?m;my(&r1pwTN B5wQRO delta 11 ScmX@i*Tc8rIoo78c2xiyg#+dQ diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_128x32x32_64x32x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_128x32x32_64x32x32_2_nc4hw4_nhwc.cu index 53e3dafd4c775db8b6cb8e54e0f80d6c844a29ac..f60407caa4fa15602e99ac0ae40b56c6fc7a45a0 100644 GIT binary patch delta 42 tcmeC;JIJ@;Dcj^aHYM?jlA^?b#G=Gpg;WqTz5vA5w4UfEG5J3mHvm}V548XQ delta 11 ScmX@e*TuKtDcfWjc2xiyO9S2j diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_128x64x32_64x32x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_128x64x32_64x32x32_2_nc4hw4_nhwc.cu index 6f32c76ed2ad9ba018db915181f175cb44073c85..71218fb46b6f23a505ac31bb6f0a268a7efbfb06 100644 GIT binary patch delta 42 tcmeC;JIJ@;Dcj^aHYM?jlA^?b#G=Gpg;WqTz5vA5w4UfEG5J3mHvm}V548XQ delta 11 ScmX@e*TuKtDcfWjc2xiyO9S2j diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_16x128x16_16x128x16_1_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_16x128x16_16x128x16_1_nc4hw4_nhwc.cu index 673a85ed8946ef8011e153b8529e35ee4e166e9b..b69d716104bc8bb4b1a15f657271a2d41f1440ef 100644 GIT binary patch delta 41 scmeC?JH)r)8QbJ~HYM?jlA^?b#G=Gpg;WqTz5vA5w4UfEF^QcU07upix&QzG delta 11 ScmX@a*Uh)#8QWx8c2xiyXanK^ diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_16x64x8_16x64x8_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_16x64x8_16x64x8_2_nc4hw4_nhwc.cu index 759f06620bc1346d1ba696c626313c5cc6d26b2f..db3398a5478bb5b280422a13ab98a264ec0bb2e4 100644 GIT binary patch delta 36 pcmZqX+r_ux9@}ICHboKFijtzlfW)H2T!mB+Grl0PX!0F4bpY~94PyWR delta 17 YcmdnR*UY!!9vhRU^+Z33$@=Wv05#|Z9{>OV diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_32x128x32_32x64x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_32x128x32_32x64x32_2_nc4hw4_nhwc.cu index 7f956905bd0d23049ecaf75b6dc9adaa4486e06c..30a823bda1af114e178865b8eb35efc2a1d410d3 100644 GIT binary patch delta 42 tcmeC;JIJ@;Dcj^aHYM?jlA^?b#G=Gpg;WqTz5vA5w4UfEG5J3mHvm}V548XQ delta 11 ScmX@e*TuKtDcfWjc2xiyO9S2j diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_32x32x32_32x32x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_32x32x32_32x32x32_2_nc4hw4_nhwc.cu index e4c880fb6cee3aa1adde4caa0e3e44b0f0bc2404..c186f0eac8b4348a15ca772b00a70ba9404c455a 100644 GIT binary patch delta 35 ocmeC++t0V*G27%CHYMSTlA^?b#G=Gpg;WqTz96w^@i_@% diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_128x64x32_64x32x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_128x64x32_64x32x32_2_nc4hw4_nhwc.cu index 15fc0d1bf9b2a33f1b0da0bb790c6d11f0733890..b6c4af9a6ac318fafd6c584070d28b8b768a5aae 100644 GIT binary patch delta 35 ocmZqU+sn7%A=~6CHYMSTlA^?b#G=Gpg;WqTz96w^@_jaS0Py7vbN~PV delta 25 gcmdnX*T%QuAsdsX^+Z1j?t=URF6Gp`l*}|P0Bs%z>i_@% diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_16x128x16_16x128x16_1_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_16x128x16_16x128x16_1_nc4hw4_nhwc.cu index 743c95abaa70aa5c6070f6631f32d2461b1c4146..adb9b8566beb07ae89a936d32f56101214578bec 100644 GIT binary patch delta 35 ocmZqY+sC)z5!>WyHYMSTlA^?b#G=Gpg;WqTz96w^@&h(?0P$rFc>n+a delta 25 gcmdnT*Uq=$5gU`H^+Z1j?t=URF6Gp`l*}|P0BvFi?f?J) diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_16x64x8_16x64x8_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_16x64x8_16x64x8_2_nc4hw4_nhwc.cu index 78624fb9a267297e67b0231d6a33b729567404a1..d4b3ed4e154a27b95c262e9e1d6aa89a45315754 100644 GIT binary patch delta 50 zcmey*yNz$dEw;(UY)ax4B}It=iA9OI3aKDwd;y59X+6i_@% diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_32x32x32_32x32x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_32x32x32_32x32x32_2_nc4hw4_nhwc.cu index a50f3970bba155b65ca6558565a2969e1091fe4c..4df8e99bb09c4c5f4891b85bba33722ce7caece1 100644 GIT binary patch delta 36 pcmZqS+s(J(KHFqNHboKFijtzlfW)H2T!mB+Grl0PX!2b)bpZ0u4QK!W delta 17 YcmdnZ*TT2qJ{yy!^+Z33$p-A)05%r|BLDyZ diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_32x64x32_32x64x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_32x64x32_32x64x32_2_nc4hw4_nhwc.cu index 5005718657f201ae62d6adf67e17ac5a987f9398..14c996d0795b8ade42f35223a17284e3bec34845 100644 GIT binary patch delta 36 pcmZqS+s(J(KHFqNHboKFijtzlfW)H2T!mB+Grl0PX!2b)bpZ0u4QK!W delta 17 YcmdnZ*TT2qJ{yy!^+Z33$p-A)05%r|BLDyZ diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_64x128x32_64x32x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_64x128x32_64x32x32_2_nc4hw4_nhwc.cu index 3d4f9e641c4600adf4a3989780e898f0d67a31cd..32b5cc5732e3645750cd4c81d01ec0db8848dc13 100644 GIT binary patch delta 35 ocmZqU+sn7%A=~6CHYMSTlA^?b#G=Gpg;WqTz96w^@_jaS0Py7vbN~PV delta 25 gcmdnX*T%QuAsdsX^+Z1j?t=URF6Gp`l*}|P0Bs%z>i_@% diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_64x32x32_64x32x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_64x32x32_64x32x32_2_nc4hw4_nhwc.cu index 395346c3af177cf7c48c1684180cadfe0dce0cec..3f96498034644023a52410eacbb45fd7fb014b0d 100644 GIT binary patch delta 36 pcmZqS+s(J(KHFqNHboKFijtzlfW)H2T!mB+Grl0PX!2b)bpZ0u4QK!W delta 17 YcmdnZ*TT2qJ{yy!^+Z33$p-A)05%r|BLDyZ diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_64x64x32_64x32x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_64x64x32_64x32x32_2_nc4hw4_nhwc.cu index ca3cc8195ba3814106d80a410809f741287b2a28..e29625afa4a4dd393275f81fd685706e086f4ff7 100644 GIT binary patch delta 36 pcmZqS+s(J(KHFqNHboKFijtzlfW)H2T!mB+Grl0PX!2b)bpZ0u4QK!W delta 17 YcmdnZ*TT2qJ{yy!^+Z33$p-A)05%r|BLDyZ diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_128x128x32_64x32x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_128x128x32_64x32x32_2_nc4hw4_nhwc.cu index 1a0bdfc4118cb5ca093161ddeca3c7574600d851..89a9158fc8afa9de2abc879bf83dc3c51deaec7b 100644 GIT binary patch delta 35 ocmeC++t0V*G27%CHYMSTlA^?b#G=Gpg;WqTz96w^@i_@% diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_128x64x32_64x32x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_128x64x32_64x32x32_2_nc4hw4_nhwc.cu index c802ed9243360f1c6c4b35446f1cbe3431b0a7a0..e16227c5ae2fd40a1f3508b9b16ff82360dcf112 100644 GIT binary patch delta 35 ocmZqU+sn7%A=~6CHYMSTlA^?b#G=Gpg;WqTz96w^@_jaS0Py7vbN~PV delta 25 gcmdnX*T%QuAsdsX^+Z1j?t=URF6Gp`l*}|P0Bs%z>i_@% diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_16x128x16_16x128x16_1_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_16x128x16_16x128x16_1_nc4hw4_nhwc.cu index 97335b6a99bf5beea5017d022391b02b17fba2bf..346ca7f69dee13a1c8c21a7ba607518d33db329c 100644 GIT binary patch delta 35 ocmZqY+sC)z5!>WyHYMSTlA^?b#G=Gpg;WqTz96w^@&h(?0P$rFc>n+a delta 25 gcmdnT*Uq=$5gU`H^+Z1j?t=URF6Gp`l*}|P0BvFi?f?J) diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_16x64x8_16x64x8_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_16x64x8_16x64x8_2_nc4hw4_nhwc.cu index ce38777d2ad0f17178a6be3802f1b59c5b88deb4..810561c6d9718f10cdd7ab43548895041409cd4a 100644 GIT binary patch delta 50 zcmey*yNz$dEw;(UY)ax4B}It=iA9OI3aKDwd;y59X+6i_@% diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_32x32x32_32x32x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_32x32x32_32x32x32_2_nc4hw4_nhwc.cu index a291162cd18836f5554dc105eb745b4390f29d14..d7f9d9c31f12188e67d7948dbf799fe5294fbf7d 100644 GIT binary patch delta 36 pcmZqS+s(J(KHFqNHboKFijtzlfW)H2T!mB+Grl0PX!2b)bpZ0u4QK!W delta 17 YcmdnZ*TT2qJ{yy!^+Z33$p-A)05%r|BLDyZ diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_32x64x32_32x64x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_32x64x32_32x64x32_2_nc4hw4_nhwc.cu index f99ad2d47d03fd93e757739428516db137eda6ad..68113c775ba0448d9463f240aa08974a989372a9 100644 GIT binary patch delta 36 pcmZqS+s(J(KHFqNHboKFijtzlfW)H2T!mB+Grl0PX!2b)bpZ0u4QK!W delta 17 YcmdnZ*TT2qJ{yy!^+Z33$p-A)05%r|BLDyZ diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_64x128x32_64x32x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_64x128x32_64x32x32_2_nc4hw4_nhwc.cu index a1cbf38b007ff5de0af8a35030999dcda8225c7a..902fdab85dc153494a5929202345918bf433f807 100644 GIT binary patch delta 35 ocmZqU+sn7%A=~6CHYMSTlA^?b#G=Gpg;WqTz96w^@_jaS0Py7vbN~PV delta 25 gcmdnX*T%QuAsdsX^+Z1j?t=URF6Gp`l*}|P0Bs%z>i_@% diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_64x32x32_64x32x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_64x32x32_64x32x32_2_nc4hw4_nhwc.cu index 198baf758c8ecb97c947bca5d6c393f0315a9aa7..65ec55762adef1e444f9ecbf7ecf203c7275a557 100644 GIT binary patch delta 36 pcmZqS+s(J(KHFqNHboKFijtzlfW)H2T!mB+Grl0PX!2b)bpZ0u4QK!W delta 17 YcmdnZ*TT2qJ{yy!^+Z33$p-A)05%r|BLDyZ diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_64x64x32_64x32x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_64x64x32_64x32x32_2_nc4hw4_nhwc.cu index 00a266e755c907b6193a6f7c77de833edee1c271..b51e432df0f8f8a237d4d4e8e0f6793f6eee5abc 100644 GIT binary patch delta 36 pcmZqS+s(J(KHFqNHboKFijtzlfW)H2T!mB+Grl0PX!2b)bpZ0u4QK!W delta 17 YcmdnZ*TT2qJ{yy!^+Z33$p-A)05%r|BLDyZ diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_128x128x32_64x32x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_128x128x32_64x32x32_2_nc4hw4_nhwc.cu new file mode 100644 index 0000000000000000000000000000000000000000..053849749c35c41b1a826998a10307ef58d17e1e GIT binary patch literal 1861 zcmb7F+in^$5PjdTaEVm45h;Su6gf0fN(hnKBvlqEeY1?$fW@wD`NAb%-|=2xONmGx zFkXAk?3pu{jZVZAeYhWvZXO=UcsP2xCV$#dtA)f&D#Ou=8Y$!qsVQRKrPSK!L>oHG zC>jk0C=nFwtP>1rZ|ASw70wdWID7WeTb`V0YG}qCH;p%7tTTeSGnxu&feb)|WT{U$ zYFNWs%qO(Plw6Vt7_>yDQ^ABIGoEE+drnIJWM!z<+!@Vl zw=%XObHsOvm};XmncJ1Hb7Fg>-RZ5v@4Q1Uy0C^#he+0)uewCeSFC|%TyjGlPtasj zsaG8;6ZsGe8USyVdMA`b-%*5++h{I9n+kz(2m#}SF9qWm*8qJmM?l*3h-(Hc+JREK zQQl!Z=F%$jFj(I`ulwDr=9?jd^;7(F2Pp-?7*uoJ$T8_y z&P{;f%S+0ep(cyGb_yb(QvQ|lIsq>+kt}dZt9fux= z2eecoK^dPVQbM1=90+SZrm8mI9fUB-POC2awIaIuR4N8yDwqSiMywfOJWP4UrHc(a zMG1lNQ*sY+z5#bDQW$yxI?Z?QLx+M0*f8hbhrF>a0AFgT?VvdnUumD%s&PYn(3+=c zrOd)=%BrM=i^HyJet%cvzR>7n_*;_vBXFEi*%y!AY4092#<@3JxvKi6<|tG`?4XG< l;t+g2U{kw2%yzhThUT6T58Hy>=gZf=Pg?2dgv&%sqraPwVYvVR literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_128x32x32_64x32x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_128x32x32_64x32x32_2_nc4hw4_nhwc.cu new file mode 100644 index 0000000000000000000000000000000000000000..9b45e053c6f6bfa119fa316879e63c726c75c479 GIT binary patch literal 1859 zcmb7FYj4^x6#brG;ZbSYDmnyZ9dcQvj#4UZZKnpQ`(_!(fJI_ke(>n8-*q04F{)B2 z1joMT=ALsOiB7~6eYhWvZXO=UcsP2xCV$#dtA)f&D#Ou=8Y$!qsVQRKrPSK!L>oHG zC>jk0C=nFwtP>1rZ|AR_70wdWID7WeTb`V0YG}qCH;p4O))~Rv8BGPX5DY+sWT{U$ zYFNWs%qO(Plw6Vt7_>yDQ^ABIGoEE+yHCpa$;wczx%tApkZB`|0^0>QlBY;W>!@Vl zw=%XObHsOvm};XmncJ1Hb7Fg>-RZ5*J8+SUZn@|X$-47Zm&o~wHPDPpZm8o4nrtfZ zs!e4gAE=-KgsoEVgp%kxiV$)e%_T@vAutXMFi!YVFphBz&$~T5zkAg@8ZuZv#XonDQV)zlG}oQH`NF&B!S-FC(OdBa^MV1mKV)Bs z5Xg@~AdD9^9rn3X!{!)2>Om;i7!p`SvZYX*$Jw{9dk3LlFdo4@o>R^HkmDvg?}T5e z+}6jw=|%Tg%XSmNRi$F5EG*#5)pnax=@jLnlhT6`VL;Onn#8!P9_(5l$qLnfWt0}9J z77h-(s`>q04f{f)kKu1g@=w6q5tV)M=$-cNQDdBYvz4o=Z)%Q0CBzPzC?gKRw*wZn f)5FmY*Ur$~GvZ-cMECjfwXc&_Iy&Jp5!2`|`Ili$ literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_128x64x32_64x32x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_128x64x32_64x32x32_2_nc4hw4_nhwc.cu new file mode 100644 index 0000000000000000000000000000000000000000..da56c4af7c70240b82b744f4626339a007de3898 GIT binary patch literal 1859 zcmb7FYj4^x6#brG;ZbSYDmnyZ9dcQvj#4UZZKnpQ`(_!(fJI_ke(>n8-*q04F{)At z1;@VU=ALsOj!wiBeYhWvZXO=UcsP2xCV$#dtA)f&D#Ou=8Y$!qsVQRKrPSK!L>oHG zC>jk0C=nFwtP>1rZ|AR_70wdWID7WeTb`V0YG}qCH;p4O))~Rv8BGPX5DY+sWT{U$ zYFNWs%qO(Plw6Vt7_>yDQ^ABIGoEE+yHCpa$;wczx%tApkZB`|0^0>QlBY;W>!@Vl zw=%XObHsOvm};XmncJ1Hb7Fg>-RZ3_y6f`}$)91-A(D0Ht1glA6>Fdwm)uat6ExXW z;#HfwfpDc{F6Oeu{tYAf+A{gJ`ZhdGmvJ&x7r^K$^GW3+4p_aDT|Y z5FwBsgFqNBYC7z5r-scje$<0dt}!IAh-6ElIFGY$U-u3|!C*XsXFR8x_aVnkbkS|1 z^G>PU*2lhSVRejUyNTderDE%lYmus%2WoC^I88AA0mXcsB)^2@)C^QM5t4>Fxr;*$ z0=Y~V6;Mn2HamHq5$Y8T^D3#y`v3bhRn+C6eWr!-MNpGevBf_%cpKWHD6erIcWwd^ zzPzNo8A`H9Yo`bVw9G%!z9<*YcreTNwo+92yZ5ORI^BQ>F?Bxl($yvOtK-n&@Pd{~ zBq-yvL`rBAm;&LM52>ojcN>95*=W^azg9z6pGw6*Oa)V5*M~JDjE5=DxOA~$rzjyX ze9G7Z&m(ZRB88C`pwoQyK6EIEfDKdbeaIW@0`R4V+Rm9n{7U=8R(%`zL1&(#l`;#f zDXWqe4i3Aj`TboD`$D6S;crRuE8y*j%D#B?PJ8#LG0wf&%2m}jHAkTmVh2r>5r^RJ ifJN=}aJ0jxgV>PiYUwYx!?>Lt>V?`A0 zC2{QY9iQ(zmqbTmhTh#?jV|x*$mDADa6x`|qfQ5jxm1Ru6*W@GIZ`vkyvwMy(UCTE zo>Mf6W0VLAcGd}ow72O?XQi_QwN4(r^p+r61=$eian+3u4nezG!DYi>R#~R zDiis@1;r4z%DfXwqHicd$Za&2piPayI8eYi;Y-0d#tpy_%n=0b2E?@k7VSV8-6-!c zo^WZExr^5~kLzLYynQevUO$X~>>!mO7=voAH+}WOo5y&2*XPrN%u5E~exH3QLLff@ zfiPY)ba>948kS;wuj5d!W5{3`$(BNSKF+^>d3F#g2ICPt$0^l(*zLCwWEmd)O1Z6% z|1kxBZB*xC>yc}bsksMkE;XDc7=MRyzDSavLUL+il~0AFp-^tdp$dUruFDc=+o8>xgV>PiYUwYx!?>K4Nj1^J# z5-0Zgj=%GrOQIt&MelB}MwfSYWPCMxxFEl~QKy5%Oe(|Ck{T)G45=w%-sRNV=tvtn zD<~SpF-ipmJL?2P+S}}1sHJ6}Ejlek2$2jE+!8pbZz!1z45OxFN+5wAppq#FicNmYk zw94GYtDDEwuy@`*7!t1@k{>(BMhuKWG1r^Cdg0AuyuIu5=|SdI1mJ$3eI-H&ehdO( zJa6dmJa%eWhVi|QL#-x|y^6^;gz7vgzJ7Uj5UK^^5j>NOYCi1t+o<1bqrpbEt%v_N zY@==))p^+Z<67iu<{>VZ8BSA-ze5>cr0Gu~IW@5=CPLCsBsWQ@K8UQ)Rr(t_-=?Na zGD3qYys|_Zn*RGe%@uVe=y{-(>=mg=uGsvq0=y0lQPtMC6a5w?jQvIX$h2 zfRg1U_3DfVt8#0rM4h`^pF5%RHE57f=R+TzpFyKK4y_FzXrV-k3O-AvgcgDJ56^Ns z)!n_@2vKaBtUeqz>gW8!MljG)LHqA|uwjJpRn7}8U1HcVN(oGz4fhb{0k~_C!>kL? zY5Ddxbf|}b4dd-?$V=-2@Tq~?&X|4iZR}H9&y)}!bmcKxDl@m5vN~zuV85$|-(U5V zuQYlemIWzi|Lcr4ZE^3N_U>L|Tza#$tGaC(jzT3Q4tl5{4#7_YUK*$S*eBy0h$4w0<4-1LZCZdd~?xa5X99;4Z= z6tCP=Ch~y_8ba7A@lGg-zM}{sx6@pLG$jJ#zyRZzuLa{6*8l@BM-a2?6W0t_v%B5B2ez?7T+75b`&7&d1?c?O<9#ZImF^J}Rvp0Wu`!uZn1=73~Uog)Zfcr!C zxd?&$6a>O}Rny@mcWT%Y;|Dzq z5k5aByctR|PirR+1Qg1C(mpR1&Ui4(_9`jL{N4G)37zafgb8&%^wQ-8^sD30;qZah zO2jDTi&#o%6PNV4=?5CI#e-20H%)&<~m4OPvVL;MQ+*p_`8_(5l$qKz^u zt0^m!77h-(s`>q04*OiAkKwf@*%k11L`7dbc&EL4&=_alZ0V}(o0_9g2@?lRloE&F j?tn$@^l-GJtur+9jCfe)(S5#n?RL^iM<-myVjlek`G{dn literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_32x32x32_32x32x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_32x32x32_32x32x32_2_nc4hw4_nhwc.cu new file mode 100644 index 0000000000000000000000000000000000000000..5a3898bd2b3b9958475973c6e79156745da26256 GIT binary patch literal 1857 zcmbVNYfsxS6#brG;T37xP+3LkHe@;^mX#uHz(gvsZ0uL zY3juH+}v~S!_k45p?BA#@sFDuG8v8UF36wbsMkYcE|uYEMU514j?@e>?{aEwbf68L z7Zi=-7^Q-OoppjC?QQnlUga!Bouhj%z2)hVriK>WanpGP#yTUIJEFOu7J>nYkSzBp zM;&WekNK4Ln378}1%sX_bS{{1WX_9%Y~PbQezG!DYi>RC>-$vB4*ksO)|R@HY4_wob=b_wOjOwWP@xqATnFA4w`ex4Rt(4(@mpY^HQ0} z2PPhBO+}6eaTOM~2 zTummn4!IV&ntNd8GQ(+#@pmZYi!}WuB&Q};#Z*Wd%H%2u6$so4T@^qp>)Yg%X-4R@ zB3KqlOVxj$XSt%T1U+S1310;@%@tcbD#6Rp6jgOi%D77ti16Vd=iN||RaW~|AcHzp z$6xA&Gak$ymu{23Yo9xz^9_iQQ0GG{ou5IsIu88}4```GiV8kYrGzel84&K}h-zAV z_ae}!`>fdb|)3PJ4H&F)qE?##PfdEk~ge5(iCG5QpI8fJJL}f3%~uGqm)Kcvx1^ SeLQ>aY|=_c2VACN7X1Y#yI|J< literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_32x64x32_32x64x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_32x64x32_32x64x32_2_nc4hw4_nhwc.cu new file mode 100644 index 0000000000000000000000000000000000000000..0d0c034a6996fc7a5179d0f77860a64ae2b98dfb GIT binary patch literal 1857 zcmbVNYfsxS6#brG;T37xP+3LWF=RR<)|Db{z(kPPH_LV0#%f|)e)Pew-*sNiSP?L( zN*(*2n|scEI64q>^zM2x{eE*p?kCf`3-bFo>h+LVNM$%$QzM03AT>wKyNp^J9cV)r zIYra)7$t&&oppjC?QQzpS?Mf6oudaYz2(V~riSL+anpGP#yTUIJEEDO7J>nYkSy~F zM;&WekNJf5n35|p1B0H(btaf_WWn>CH2b8ApR5hlnww9|3z>DID6m~}BYB2|w2n#! zersbZa!34-h`BaOlhm$-O^H1vgZ}1ZxEb_!i$0NTn68IJrfb$g3of~#jwfigt+lH+ zm5F>{g2oWF%DfXwqOT}I$Za*3piGUxI1s=%;VZ#7#x1}I%n^j_PKoOVEZTuGx>ep` ze9xs-=4QOPdf1GH=iOIB#+$qN#|~1-fiY<2hO-wxc=a%Dz6Bb+6kjke8G!qJ_N54c z{5=SS@v^1EQ|{ET6ysYx4%He%{>n%i3gvm6fBF36Ak+)SQ+UQH)qFJQchSkPiw6Bl zxW>i*TOM~2{OU|J4!IVYT6kdQQp0J2@i!>tizN9eB&TMq@|ln{l*v^bDiFBkx-5WJ z)|=!MX-4R@Bv=$lOVxkh=b56e06k?|3SR~_%M@GwRf3nHDaz`)FXAptAi~GTjCVst zmRaqWfq<69M~+Xbg)<(^ioHgPI(^qZb3$iZ5Fw_{hgLd2gKl*k`Ws%*N{IyJe33{A zT>>*8Jc|)kxA<-&(5U*XJ{+}5==?*a7>KE02JG6fWrXo0<2jcuHtY~31V&F4d*JyB z+@;81;05TsSiKD$3L;>`jC&jMyLAEh*g`d9W*@)GKCyM%27b_(hiI+L(rU`;q=ke1 zu3CP7)uX=D=zaKGk>Ub)IijjBZoSjq-D->rZ?<+-_f5-DsD#)-6XnDqxH(|aI^7@b cWaA7iJR=^KWpp3Up1Yg0($N8ziI_)!0E;kS>i_@% literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_64x128x32_64x32x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_64x128x32_64x32x32_2_nc4hw4_nhwc.cu new file mode 100644 index 0000000000000000000000000000000000000000..29a4bc98f1dceac62de0ff233041d413750dc110 GIT binary patch literal 1859 zcmb7F*-jfV6n&qsaD!B}C^X7a8krP{h7hC{s7NIGW;q@wu`**@Ub67@y`C*o8bo-R z@!0p=x#!%)(TSL&4|k*S_5D4Wj>eBy%B5B2ez?7T+75b`&7&d1?c?O<9#ZImF^J}Rvp0Wu`!uZn1=73~Uog)Zfcr!C zxd?&$6a>O}Rny@mcWT%Y;|Dzq z5k5aByctR|PirR+1O&}~(mpR1&Ui4(_9`jL{N4G)37zafgb8&%^wQ-8^sD30;qZah zO2jDTi&#o%6PNV4=?5CI#e-20H%)&<~m4OPvVL;MQ+*p_`8_(5l$qKz^u zt0^m!77h-(s`>q04*OiAkKwf@*%k11L`7dbc&EL4&=_alZ0V}(o0_9g2@?lRloE&F j?tn$@^l-GJtur+9jCfe)(S5#n?RL^iM<-myVjlekET&;s literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_64x32x32_64x32x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_64x32x32_64x32x32_2_nc4hw4_nhwc.cu new file mode 100644 index 0000000000000000000000000000000000000000..358017ff7a0cff8bebef91d24623d5fbb0c2fe82 GIT binary patch literal 1857 zcmbVNYfsxS6#brG;T37xP+3Lk7&09a>q?O}U?P>+H_LV0#%f|)e)Pew-*sNiSP?KO zO&$B5n|scEBsvgt^zLRn`F?v#rsK)|CHZ|E^?FDwq%s_>sgXi1keVarT}G{q4z!_* zoTABSgc3o)&N{)6_BMU)taO&3&e4OH-ty!~Q$us^xaqtCW1SJq9nnls3&8+HNS678 zqmDJK$9zJ2Ovx3Qfk98?IulGdvfz16ntf8mPu7NN&CMs~h0Hop6xc4gkvu~}T1O=V zzqPRyxg&l+#9SMtNov=^ro^6+)4}GX57%zdCz1`)^?=B9%{pkoB{$UZ1kJX!cJ-z* zkq=DJ2%=V*cS1?@6-5ZSt>zMxsSy|l0vIQJB^bxJ1sH-kf{@)AaovDLJ5WZq$~%mw zTv}yrN1N-1&2VtheKlmXxsQMBAe9^#gJy0pd+~+W52NN?pwUb51@n>txZh`AiV(<8 zK_HBmEghb6r-r2%-|11P))?|vM$%9y&*S{d=O+iDUND}(Jx;0S!_z?*o%F-660ULa z|CYyH1XrDj#v#`tQwtBwTxvK?F#ZOme3>Lah2+$XR6Y}uhBCR1Lj?l2T$csV%6gNW zBFzY$l?00-X{q|}^E^}36`-d~OX16)W|?Bkze?~bG(}lm(<1J|1R{KV%y>6cWSP}| z83^cEyyW<_S~%mutk`R$sMB}jGbePm1rcKEd}yVMbLdvbp}*k)t&~Vm&KHT4&?PVf z!o3(#b&Kyd0*$KA>ce5Hgf2c*ih-C4X27luTSgd7z+H(9 z23~;9i`CoEp&$Y_%(%B9zgri8k1bR)X7=%`>=RqJZQuuud5G4^EUl)jPFgtF@2chZ zS3T-Wjoyd96)8RdFGp1M#hrKByE~0>;my{r>b_|?3Y8E$Xri1r1YZtVv`+U&JKi`$ Y3(ts$Wf|Sa^XI-zTIuM3%S6nhKZ|x?>i_@% literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_64x64x32_64x32x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_64x64x32_64x32x32_2_nc4hw4_nhwc.cu new file mode 100644 index 0000000000000000000000000000000000000000..f913d6a0d4eb1279a7eb8bf3c6997b0657cb3408 GIT binary patch literal 1857 zcmb7FYj4^x6#brG;ZbSYDmqkPUFEV$T}r96wVeu5_suenAr^^k`N5;Ve%EJ`bbWtMCd1L=75Q@-bvj7Qr7|3?sF6a>k(wdqT}G{qj&w~)XH9DY56|tiOh<*A0taH!QkDvVOYi6Pd193(dLYhB}_0>894M z-c%;?feDHsYL$5>ltkZAgpk{4Evb#P zwl4Nr_PU2yp0*KObtbkBxfYq4dtl~L!)b!?4=ClUB>5#IrzTeUR7e`i;Wuqcv-s{cRFGDTehddajDz6@%bDYp2h1aCr9l+`sU;x0@e!sq9V zw?jpiS?!jAfR@EePS2`^Gak%}y{!~=`tE$@gw8e~!k9WATIuovy47*$Z+JjUB@&eL zc_Jlr3Cw_SFGf_|;=7GNqw2HzaL_2B%TJYJAf|#DuxrDH5yr!e=Ulq6VaF&TFnX%k z1J46+HzI?97of9Z^*(ech=2_>jLn(f!dClef%o>#MW&a_(5YHqm?oXt0}9K z77q5iYWV$KkNQ%hkKu1giVwis5mkNh;GOpFL1SEav$d9*3ZI#ecbo(&2ia+NqMltkZAfRNp4ErmJ`niLYl3)zF+5YtC3-6x8`duK+vG}}s!2sMJ zvM)sN5y#><8d``oEv3ydFi=qomY92Sw(6pH&O{q}Y5!q*JOV_2gF)qDsEZlioT z@0HTkPWEjt`fYUH^S@)3UGUjPRW53WT#H1_9dNS?!)c814=CvCIQ}IhrzTYCR7mOz z1$d>pNPZ*Zbz<@qBQz+;q+oqOvA=#JNed z7B4ReZ~JO2GTJKw0e$k9bT7({H4e=3vs#2Ib@wi@LML0GA5rUk8(m#Ow^|PU4G(Cg zM2u2CkEMhTff?Y}d_+|(zB}+V${wpOhmG>N`cx_gVk(#cyB2I1VLVEB3jH4$c8X#G zqo<7B^LzvDRwOX+Jam@t-iP+3;IV$jy$^YlSr2?^pz0BGh+kUGt^Jdt;Y99>=H&4;e9i&tQW01}Er*FRS?m4XA1sc5-pEoZUfcr!C zg$SPf1O&o(+0bF1J2h;9@uLoX!A6k6B9fXyaUP}LzV03PlEHWk_h>;iA3~1X=)C8D zrE;~8ebbBnv6kI7f~!hJ?T~AcsJR1bc40V;G5!JNd>zNXgyhtODxC^ReVqWUR2R8# zB)rZ`o?(Or1-HCD8iM}+JWCX{IcT3_p?ML`G*N8%PxamUmM98ql1H4IL~HT#lJK@K z#v-A;A`s9ge@XYETv+44EZ?g|s4{o&5-W7F1^N-S&Uew(CG@K0(B1HWR!YPu~o<`YX)#0#FJy)Mf#Xw926JXbY4I_+42~VN>Bg0NnOknVov3s6J z;BG|%vlR+T_t`cX3lIS}M5VBj%C5TfYF!mfUj`>P3j&TDp1akyY+X1m{k3~CB zLbu9Uj3-=X$~=UdyXVcYf7L!35^kQNpF2pY2*x0r?N8sl@a{RR?*eJwiqD%D48Z*% z`$7ayegXnvylm*O&z%~!!1z&zzF;FrVG&79p*WAyZ(sKge92%uhG(>(nhznzZFJFZ zqw`*=T0XozYaE#6d$kBv=I&i$g-*6WKcd$8F1oseUbP&$8(z>#i5R7P z9!m)g0u#VL^8r;g_-@0~C|j&L95$-w>Qkv0h^b%#>^iVvgz+fhDRh5i*eQw$44yJ} z&+`b}tw>^}{GcaK(OQ{hrYWnE z<_`|LYWV$K4f;Z(kN$5(^6~$6L}f=jI;)+1)EMX9vdUG}F%3t)5+V!jlM;*Iy8(;F f>EUQc8*6Cp8F9ZXqWgUL+LuWy9h`6(i&^j&U%Ow` literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_16x128x16_16x128x16_1_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_16x128x16_16x128x16_1_nc4hw4_nhwc.cu new file mode 100644 index 0000000000000000000000000000000000000000..d25af0697f71594cf3d19f314e8acb5790ab170a GIT binary patch literal 1856 zcmbtVTTk0C6n@XI@QO5TsH`gMy2^A&tU!@AU?P>+o8>xgYc;VgUwYx!?>M(+tcaq$ zI8J=NxslSslezG=HYi>R<_6K+fFfOPwt?&)CD?KgKsvfg6dBXY53Ei~tn8|rw1rrS!t z>Z3A|4{Xo~B3G$*LP_)uMF_dA<`UGY5Eus%7$j@6;vYLmDGA1)o9j(qz3}F7RNwXa^dR$s0l431Ux*OM zPe34ymkk}BbEk$aFuvELP_Z%Ou!y9lP@Kov*DucwLd{@2hBaPL&4=B78$p&~>6gmY zKK{oP{IyY)i`paCB2{w_+}y%&nqd4L3i={RehSH{8L4b4Bn^cEyfR%Rzmf4eF?os+ z8WiO60%<7v@B1uO)a9V(Bn#1taHgqZ%fHI+Iy6O5S(7~C+$3I$r>B&+Lp2r|?G}Ob z$~1wT`DM;{u*$c!236*6ed>fxx4=H8&WA2KKZ9m<9NHUR&`OB}Wqh7U2@L`xAguX> zsycl4A+RW0tU4Svs^|PeDHv#}UUE9?bXH^ntMh( VY>VhVp1t&I(n?21Tqa@`{Q>lgUI`i}ROEe#^{ zWiR%e**RzCc%vgRL+@^{#+P?@WO6lrxFEl~QKy5%TxrYEnp!2)9O)Ti-btEe(UGxq zo>DX(jZh*e#AQw}WPG-GX|G6@pw`KwS3cv(iJ_LJ+;Q7F0FyZ@m^-0T&+Ac9>QxGKGMSNR1RsI5OvHO6qk|)=$=!8pG`;=7o~2C<<(s+$t`SP?@8O zLEb7j6R9J8v%HXz;prr+D_?x#et-eTP&YO!W5H0O$2>Ue^t+e*7? zQQOD|CTIjzYw4ZP3VlNnLT+og0%a-$#(@CF3111uF>U~c5RRZ^Hz2MZuxJO8bgR9? zc*0et?cHc|^SBxI&f5n=Mw^HD#|~1;fiY<2dec`wc=I@_zxDa_pzwkLq~8}`h!Dt6 zKp>2l4IQ3yri61cQ0i7^ z`2WB*>b6nUhdLiOLh88(UT$GIO)&lrgM5)BKZWAdj&wQ|iiRNpPN^vx-{^MTl)SwN z4GLoUa5U8X_kAWcbvfuc#X|6+oT=1o`Pb-O2Qw5?HOVW^L*lh~dXl^yCb8(|X;BDh zn17_YJL@4TU#c~zx_0ZO6Gm=8j1mGTPFZ`Pc>wNONI2&Lbe2E8 z4IL&SV8hY&HsodI0`RGUs!x}F{L1(wt4@=^5BzwH*4i#JLs`|du(3Z?Bk!;3gfBFD zA8sp>pZl*@R2t&mJLBEG!8p&BRjDe&G!lhLh#j~mB@V$410EW?`>Va$I7{=)h=<3b Tx{qfs{gR9}(GgdPm_>g8Thm^O literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_32x128x32_32x64x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_32x128x32_32x64x32_2_nc4hw4_nhwc.cu new file mode 100644 index 0000000000000000000000000000000000000000..c52e7751f34cf2ca663e3599fdaeb1e2de9a0447 GIT binary patch literal 1855 zcmb7FTTk0C6n@XI@QO5TsI2N`tTG)E3#CXKFp)~^&2k;LwVK$LWB0%XoaIsi7&i+_a9sWY!2~&uAiO2EhPCNS3&m zqgG~Ehq;(`n38KU1A~r8bt0IsWWm#v)cd52pKJ`(nwu}o37NElz_VR(BYA>^%q*1* z{8q-9NG)*#BIepCO+?J46?R9mA?f$F-QIB9@A=jFz%P14vcYmQAac23Ewtd08)|us zX1hwhYEzlOc`_)3z*XX`P!fGd0YY}Cxdd@41je2N#xY+D#xZUHMqrL0YC9yh?XhSN zO6X2Gi}94pOqu&|d;7E<4KCY9L&EK2^m7j>6~P!}vxC{27v4UF^<5y%Tk(1Gf&sWc zWM7Ej$xlHbj8_dEUUH{~Eir!3p)c47QdmS%Qz*`(^xM}L2fk!5p1?C&Qq4#GUK>G@ z+o<0wm8*RmGCdz0W7!3tZB(VAcF46z)WQKZyEL4}82^BBzKY{tLUL+CmCl5uzD|Hv zs*Bt=5?<#e&oDy6f?Hl64MG3^o+pah9Q2Z7p?ML`EKzLrPxamSmM98qnn#?QL>uw^ zoba|U#v-A;A`s9fzodI!F0652mhaUfRGGVTi4{880sV+t=ey|g0(#YQ=x+ExYb9co z@TuMkp36_AVj!l139#?Lh7rc&gs0H`kzuDOCNOx) z*gel9a5o}>ap$4)eDyxGF9nbF6YhP;>&$xKa|2aRm_z(Z`#7sQ*7Jj&JVhI2R+*-( zN}4}7?5g4ScQxn>jXwI>JT2ZTnnjt+!EO<$sOAd^s zvz(&QV1P0~!OnTXkO^-7+*#=?L#?Cxpn~Jsk)f96+;iJ{1;%+Rm_MRYPzS*PL?|Xh z#!<^z)?p!|9j4WiOu(Qcaw7#3p3HcjlkGmK;wLLhjp6na3qr|Ol0>!(ZWWhEDCenS z;I}q*BKIWph?p9y4G|evUib}hr{tu&F0S38OBCzPS3RQUE7n3YuDGS1XK1pivsZ6w zn}o;(4IpkUgBMz%Zzw^?ZwyzEn;L;}q=0e8mx6JO8-PBTBM91`65ozkv;#@H(ZORp z=E`Y%Ggx2Uulv38_NyU-^VB|&*GczP<|PAgzt6r*LL@(i z1Yx{rq}7Pv_L|{zYn>Pl=KawDXk1z_gqmeUO5@6gN_S@ugPPVGSF6QO8q6JV8_vh)NJvn`7Wa%%Dx&G5f>)uN<2JB-j2Ok*0Wm%(yNMe z{H0oW8^G*w)7FK%4$=!FH$Xq7K130npMg?62W`UxT56G@oX;|)Kp&w0crV6NE$`h% zq){oXKI}J|=lnyJF-WEc{qF?WFv55!c@El7EjvURfw5D?9(le3cPS(cy9k{YtGA(J zD@1IZZ*N2Xa6SSb8mR4n*~hQ4&s;rFB0rGiAzEp>aE7wFXz^gbtA^j-^_VYn^gjMA zNiqCij;K<^ZSW@eTZ3`o&DE}I#WWnnPDnk-Cnp}kR|6J})BVv7*WS{?GvaYsM)&dT Px$lzECOP0L6Vv1`TWMca literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_32x64x32_32x64x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_32x64x32_32x64x32_2_nc4hw4_nhwc.cu new file mode 100644 index 0000000000000000000000000000000000000000..5e3ec9315a264a28743a802cb2e58df38465eff2 GIT binary patch literal 1853 zcmbVNYfsxS6#brG;T37xP+3LWF=RR;GKb9-enjygvVUin<`BSS6CgcG*&3QX>-WbTNjlIBnhK%`=+ z=K^(d%X-Xn+GAR+$P@y4GBc@U(vi8yGSWOJRsCdbsWHNSVqU7W6GegTQdlKYBvtOH zVvx5E&SmC^9}+n;RvRL@P)@on$xq3kzd0Fh2L0WlPZS$2)eUamjeOvOVyIiE-bt;{R}>-Swniw>rbb{KC}7ORN-}|Q3owFk1Vy`3;<^EgcA%7Q zwRafb3zcho6K}2_HlyKr_tlVibC>+sK`KEo2G!hf`r-?(9^&R*pwUb9h47LAq~8}` ziV(=(gFqNBTRJ@DPAywtd~4z`uL*RptfZk(?kCxo&rdeO#9%yuYqFq5j0XKKIvIA+ zpkE2sr1<~H<1T_<-H9flFf!G156s-c3d%A52E%;8`A?|?wXx2oQqeFbz$!Ck>s$S9 z+EO$aq0^FDF&-^F|Gl53nz{n?lw&D*S-*6^Hvs1yS+HTZvLz*Z2(<5Xne{lv0E#0eZbRqcW2D{z-G zg~KjDXT|Dm=r9Ta8_u`4A;0G?03TbZ=78D9uZrh+ZP>sMEP065+AecLS>3d-u|HKS z@2~opFEx4}{#K+o{9mr9a>T87#=BdCagi;rQ`L@XB?^;}IIvGf9D=U~ELyw!s~vBg ZrA1~W!m_OH%XoaIsi7&i+_a9sWY!2~&uAiO2EhPCNS3&m zqgG~Ehq;(`n38KU1A~r8bt0IsWWm#v)cd52pKJ`(nwu}o37NElz_VR(BYA>^%q*1* z{8q-9NG)*#BIepCO+?J46?R9mAvqsxyS?GI-}9?}&o6pJvcYmQAac23Ewtd08)|us zX1hwhYEzlOc`_)3z*XX`P!fGd0YY}Cxdd@41je2N#xY+D#xZUHMqrL0YC9yh?XhSN zO6X2Gi}94pOqu&|d;7E<4KCY9L&EK2^m7j>6~P!}vxC{27v4UF^<5y%Tk(1Gf&sWc zWM7Ej$xlHbj8_dEUUH{~Eir!3p)c47QdmS%Qz*`(^xM}L2fk!5p1?C&Qq4!_gEoRB zw^6@WDp&hBWZLf?W7!3tZB(VAcF46z)WQKZyEL4}82^BBzKY{tLUL+CmCl5uzD|Hv zs*Bt=5?<#e&oDy6f?Hl64MG3^o+pah9Q2Z7p?ML`EKzLrPxamSmM98qnn#?QL>uw^ zoba|U#v-A;A`p-=zodI!F0652mhaUfRGGVTi4{880sV+t=ey|g0(#YQ=x+ExYb9co z@TuMkp36_AVj!l139#?Lh7rc&gs0H`kzuDOCNOx) z*gel9a5o}>ap$4)eDyxGF9nbF6YhP;>&$xKa|2aRm_z(Z`#7sQ*7Jj&JVhI2R+*-( zN}4}7?5g4ScQxn>jXwI+H_LV0)@ou~e)Pew-*sNiSP?L( zO`Z6jn|sc^$Bhoe486MS+L?Ul(=)H!)sT66P>c_Fh-6a}_RZY0l;kk(Pj zAa5OPMec|n5HZt6X(CcCop4)X&&cUubJB;;ZqX-_4HoMGk&89!pgEV^P{&g=-PY>W z50#00;DTbPTV>t}CDB(DA>_82OVFl9U>qo5obr`m9OD*X2;m5dc4x$O0~YN-8Qm)H zFrIK}mAQ>K*AJWF;G+9#NW8gEe(WHXAQ*#cZZLiEh1U;p^DfZnrTRj6$pF&ti!Vh8 z5y#>;WWkg8w~Shn*J1$QxmIvDkKeK0<3afw!YQxrY%K- z5jrcW730y;^WXbfrl>1GPdS#7m*q?|#g=~!-&HU~IaiaS;vytji^s=|cf%}}{p^>8 zfRe>aj!&zFGakZQz^k8;Qw$f zj;Gq*y9a?r<*<5l*czUT50zpdrUL)(4A=_7c$D!Ryq_3$h*AQ_PE~v0`3l^X$l$OG z&{?s18#;_az=refZOHG|1>j=~)f_PU_*L<#tqmLafh7;oTA8KQl+{fO8~anW^8Tui z`BJ0z;b%pP!~f-qDo5OTr@gz=7#G=WovL36z@oLgzuM8p8CqmU WJS@xVKAu1KzoeCp4!BIkEcye#tY2vW literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_64x64x32_64x32x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_64x64x32_64x32x32_2_nc4hw4_nhwc.cu new file mode 100644 index 0000000000000000000000000000000000000000..6bd542819c27f4f056a9ede1a2d655ce9afe09bc GIT binary patch literal 1853 zcmb7FYj4^x6#brG;ZbSYDmqkPUFEV$9i>#-+D;8p_suen0gJ@8{NT}Fzw5lnI;v6$ zf)n3!bI-Z=nCM8%(1*Lx`1<~yOh)6!EArU5BpOJz7(QzM0(BQ-aoCWRCbg5i@O+CL-a|3AZKooSgMH@ac8KqSpG=#cU>YY#$eMb>OZmYQjZ7KxDfda+}UkS!BZU6=lj-Y6FPFy=+(GHZ- zt?~}z371xx`{CyHX*1|wwhx93H;?hp9i$WlV^GcYr*FRS_GwtZ3#56gz7Sq8fb{#~ z3lReO2?&JovZ2FE?$oda#t(WJ<~4>67M0W#iv2kI_VvX^m>7)5aE%vK^TAoajl#Fr zErqL7?2GJm4~aZ&BluLEs1tH6QZ@I$%qaHOVW^L*lh~eolEi z%wo~cZczxRn7`!otXw$bAuNAZi%>Q0&Zkc3bPM!j>U=QKZVG?2o_Q{At@UH=j#_s-V cM;m8oo*D75EUNo_@!J2ARysQ3G7+=rFY+p1a{vGU literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_128x128x32_64x32x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_128x128x32_64x32x32_2_nc4hw4_nhwc.cu new file mode 100644 index 0000000000000000000000000000000000000000..b417533b06071833b8dae039f976a1e4e6a62960 GIT binary patch literal 1857 zcmb7F+in^$5PjdTaEVm45h;Su6gf0fN(hnKBvn94-z?)bVBxhbugxW2-|^nqQX-NE zjMttsJLk-0qZ2VhAMS^vn}-K78IGQ=$)9%AY9TRK+H$m}RtYsndWM*HlBQ{NVl15} z6paQ06blM=sS^wtpDtdz%bmrjarW$$PkDT1sHF*a+&12TNu3qUolz-h3c&zGC?WM;r0vjLdixH1-46W6_-e;)KSI2 zZ)u#0#1Y>mVrHy1#B%9L+9T~wZxep!9dglyHEcRWvF>8sC2Fx|4K(M9Tk3d>rrT1x z@=)8z2PS9$QETa)&p1=;6sLBelc`WN6v`9S zAaF}eUI4YMSINoJj8HEpm=#G))&HMoQd5_K_M^{*&qJC@&6fX^;B9D%yt*b?)S1b6 zEnZ$EZ-$D@v)ai60gbYkv@eQ{vmVT{vr33EefM5EVdNGh7*pp%D_vbew>l2}4G(Cg zMT`psiaskZ`?V6f`cxp z(Nn}8c)kI5DXz{0UD$L7&rzw5ln7*(kh zf@9xvbMCp1L?>c~KHLvSHxCbFG8{c!lRxdK)k0#fwB=|`trBXE^b9fYBu&%k#8^5{ zC>jk0C>9j#QYRQPK3%+amOG14ZoGi zw=_;g;)w4OF*8;hV!8Ar?U8n;w>j^?MJ~Fl-aEu||r zwT*lT1Py?+mfi`i(03FerYI2`UgmOH7{s zTGFfBWLZY2m&cdYNKMrLpJ!52mx1=<%yrKxO{Hebe=6`cv_xK9lZnH2ml~>?GKbb%7 literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_128x64x32_64x32x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_128x64x32_64x32x32_2_nc4hw4_nhwc.cu new file mode 100644 index 0000000000000000000000000000000000000000..a0ea06e11a4ec43da9e2c9e7099cec93b035087b GIT binary patch literal 1855 zcmb7F+in^$5PjdTaEVm45h;Su6gf0fN(hnKBvn94-z?)bVBxhbugxW2-|^nqQlbzE zEna)hc+Q#2Mkiv1KHLvSHxCbFG8{c!lRxdK)k0#fwB=|`trBXE^b9fYBu&%k#8^5{ zC>jk0C>9j#QYRQPK3%+amOG14WM;r0vjLdixH1-46W6_-e;)KSI2 zZ)u#0#1Y>mVrHy1#B%9L+9T~wZ*$SzoOej}42up?th-ouiCU~#1I@YOmO37z>9&-v z+|)Mmfe0Ev&{}#Yv_juegpk`Bu0WU)fpLf*bVDMZecl%G5!Iid>zNXgyPf=bTSo+hB|qI3IuA2 z$@5=JdX<|j%Lw&yep!vwME(DLCN*^#XrE`UdmhqMYPS5R0&hc09d(!MAb&U!G*_9`LD?A?3mgppg2U`(A4opf~xz3Mo0H@u*g7BNcr zJXQ*t1SUXuW&^5h@ZCnBQM6fk*soR4)u%!+5L3eh*mYse2;-sT30H1x*(r(%44xwP z!1D;)t&lM80(6$G-iHq55U^pwy$^Ylx&VBsp{gl!h+knJr)Ad$e$bbvXszusHI$V} z3kQc?)%^Z02Ys&5$MClz+4z4uqM|Dvy))iD8jLeln{sDy8(;Z f>EURH8)s?e8S${pqx*dM+Ly^_6P<7si&^v+`e9$Y literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_16x128x16_16x128x16_1_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_16x128x16_16x128x16_1_nc4hw4_nhwc.cu new file mode 100644 index 0000000000000000000000000000000000000000..bfaab36c3bc1234cf7475b3eb84ccc021d9a1d33 GIT binary patch literal 1856 zcmbtVTTk0C6n@XI@QO5TsH`gMy2^A&tU!@AU?P>+o8>xgYt^wW$L@t+zvJARu_B80 z;yCg7j?edDX(jZh*e*kw*IWPG-G>8x;;pw`KwS3cv(iJ_LJ+;Q7F0FyZ@m^-0T&+Ac9>QxG6jQVjGwG6HHOup)yAm z1HYAVCQ?UykBFJE+7Qd7C)t2>` zgsV*3yV2(6aWm|lw-1JlHV^TS9i)^4W6;d?rmtRj^Ej&S`h0qjdBFhO@3SvN2;?Uq z5XQ@f4$rw$%N7{ln^CCN81h#{Qd20-riFy)=MXh+=2ozbv|^``582;v52DCP4+DQFTH0b$K2 zRMp|T4}nG5X4T=aQ92ZI_v$tV&uq*zc<0_g6LP z3yt1~$BN|h|MiHMMuWc zc}mfEG(w4>V3#?;kn!2#rL)3Wf?6k!UipkCCx%*@a>s4!08HkrVD5xUK{E&jAVM+e z6OLM$WgX@d+F@F)$P^4ZA~jMl;mDk)DXI5K89!NDY7DoZm={X6q9|m$V-Ftqx!DTrw5rA8G!qJ_5}$c`3Vq& z@v_0g^W3Rr3ykm0D0FHJwJRd2Qz*{k^y`;r2ccIm9>X(UP{W7aejD|BZ8Ru(t3CX` zVHihX3M|Y?>dN~=&VT|b#4-`#nY4I?a+-yMNf-B zfXDojYIW9wRlZg8P*v{MODBxn0s}F1KB(yY3`FWU$QnM-N{a-ge4Z!;0s-lVXFi!K zW$!jZ7Nx|h!(pR)&Oeld0hb!2zf)kt2;(csQ?6WW*)d88Oq??IkmdonYawCI1?Vh) zdmB1*L%@d7_BP~Y<^u4kfvP9WzWhr2B&#M$$PYAmjMmyNGecQbv~aNBRm1PEYQh&B zy${QZE} ziDRGd_&eXZBsvi@^xNC&tow zO3`>YM2VnampQ?Z@!8_Fv%*<|8fQ;l`HUxLhFY3($8F;XOy;a$?u<%7Gav&Hp_udu zM~%#~7V`;hF|Af)3I;8a8Y!4?WX{u+?Dk2?pR6r4hTAX93nd#-6rx>ntGGl$WsWKa zeklI=5i?`8A(l%|vOei{HtkM-)9r-Sc`qzFM6upt-6LwTW(_pwid*VM-f79Yq$boDg?%XKgJ1P3C1z50R~`>fU)Zn*9=&+2T8is z-eEl9D%19UxVe4W40@N%qanl1WBhXuDb>IjBy+v#n-|_b4R?2eG;i?>=0ybH{*Zkk zLI{2W0%5$Y>F_dkYS{wg2Qv)C8bkUDCc6-d^EmzX^~FIb7mUa7j2G1KLATRHkmM%n zc1q=TJ`S0l_l~h_gU=?aQn7Q$jgWfoAvCwJoF*9mfKt9nl3zk`YKJKF0lHBuAx|L>X9)a9U;aTdB4l%`U%%rZICM9Bpp_O0 zO8God3Yr8aKzQZ@s%r54MhK&9v+8hAtDwtIrD7nah6%9m!kQ7rBgs>)Tx{7XN(c;| zl6#2r2;7a3Fzy0$mapE24&@NAVZyx+d7Zfce6FE(Q|3^7rG1iBT^r(qzC1;1ZI_v$ ztV&uqIP9wC_jfhu3ynU8*NWuh|7}KPS3G!Uyn8Sh=iai)Rn;{$N1+m82Q8Emhv2^f hi`wa7wxf-+H1~{nSQhL)U%d8bGTKBZTqR-_{RPLTU$FoH literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_32x32x32_32x32x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_32x32x32_32x32x32_2_nc4hw4_nhwc.cu new file mode 100644 index 0000000000000000000000000000000000000000..9ccac5c9e44b1d9a8e193979c2dfd9470c937045 GIT binary patch literal 1853 zcmbVNYfsxS6#brG;T37xP+3LkHe@;^)|Db{z(gvsZ0uL zY3kVb+}v~SBgui7qIcKB(T|%OG9HfZF36wbq|-rSrnKc~MXeHQhV&G%;3ah~IWU&a za*9TS0m=jgJLd&MCb;=?XQi_YwT|wC3XW$-hFY3)&u!}!80W2E{)kFJ9Rvdqp_mLA zM=fVrhlPxGm{v=tpR6o3hTBgp2qjxd64@@eRa_#WoTrL` z-`d!T+>_8FVrr~5#Bv#kJ0&OGb#d($U7}cTzUmP*U$GXNam6k5JVTRBtz5mSZ4x36 zG=P}33|?r3zM%vmzcE~aE;Rz<7(T`sUkb)CZUFjVjv!!vN_;zF(GDc(MhB1am@B94 z&0u|XzwYVB|&*GczP<|PAgzt6rDA(9`1Ko~C? zIy~i0Et_L}YX-4aQ^;N!$(BNSp5|XaKRJlig7FCM>6{wgKk2qn`KyH6y7+&~<2Hh; z&cxOsH$v)J0A_w}In6Ns4uyP?Wxs^t)DCn$5sJn#xr6!xX1OWT-^lqkHARvUIxXoH zCDPFJ-{+~+)EA(qJWJWjkS0>I#iIhej15uN*0_keFiBV9;X(3ttj98^-7=6~m8jz{ z)xz5VW{(TEPTqBpUKqIn2~z4qY^3uu=v2?4x8VUTwa8G;XPHvaBhdfxUW})@y>}at zM%88YVZTv8=N~G?Kuit&ziYyV5ynHwbFO@9*&)gZjGZd>$nzDrOCe#{Md-9xy$v19 zA!6fvdmHkH^AY&aKy8Q2K7N&b=IWsm`9WJAqLsD_XDF+a77zBjYWV$KkNHxg_wjE@ zisAoqL{(SZ25*ADH5eD(TYJ zzUL~}_9ot3J#0q9^X{u5@#Zf1v4d1(U<{18;q=7|uO8y&F5u{;_=0(v0l431Uy=}# zzXyUaUbc96nme^@f$^=0L$4-Ky)u#}h4MVfzI=Xi5V{592|SYpHGDMachSkPiw6CQ zxW>i*TOM~2{OU?H4!IFh&pl-37M9Z#<8RQ&7is!aC{Asxv#C%tw8D|xjL>PBUeO{gPXB$MNlje=dYWfRdl}MHYPS5V0WX6f%HFy!qApC5wRn7#yc_zl ztZBat1eh!?IX8#7 zp#ShJ##1ft-9|{GQdxaCYBkXLhl(*EQ-l6@B5WC9JeEA;$|aT^qLjecsbUX#z5;hC zBn-O%ofWIMp+h?aY?yCvLw?U)06w-*&5+rbUuB=>wP-_rAj?Cv)^?d2%IczpgZ-{r zet*?tzU1hA_*;=;_`e)crHWhcjCZ#NwauhltaUh|LI0Rn}ShP;}M?2m) ZOAF74hh-Vv$Ft|YOGcaMfU8u@qCZ>DUrzu4 literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_64x128x32_64x32x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_64x128x32_64x32x32_2_nc4hw4_nhwc.cu new file mode 100644 index 0000000000000000000000000000000000000000..6cf4781cf5ceecbd071edd1e43f8caf822958c91 GIT binary patch literal 1855 zcmb7FTTk0C6n@XI@QO5TsI2N`tTG)EE2T&qFp)~^&2k;Lwd&ZGWB0E} ziDRGd_&eXZBsvi@^xNC&tow zO3`>YM2VnampQ?Z@!8_Fv%*<|8fQ;l`HUxLhFY3($8F;XOy;a$?u<%7Gav&Hp_udu zM~%#~7V`;hF|Af)3I;8a8Y!4?WX{u+?Dk2?pR6r4hTAX93nd#-6rx>ntGGl$WsWKa zeklI=5i?`8A(l%|vOYQQZQ7mwrrQatZYL}{M6upt-6LwTW(_pwid*VM-f79Yq$boDg?%XKgJ1P3C1z50R~`>fU)Zn*9=&+2T8is z-eEl9D%19UxVe4W40@N%qanl1WBhXuDb>IjBy+v#n-|_b4R?2eG;i?>=0ybH{*Zkk zLI{2W0%5$Y>F_dkYS{wg2Qv)C8bkUDCc6-d^EmzX^~FIb7mUa7j2G1K!FjKVAjwVC z?Uc&xd>k_Ec8;-ZgU=?aQn7Q$jgWfoAvCwJoF*9mfKt9nl3zk`YKJKF0lHBuAx|L>X9)a9U;aTdB4l%`U%%rZICM9Bpp_O0 zO8God3Yr8aKzQZ@s%r54MhK&9v+8hAtDwtIrD7nah6%9m!kQ7rBgs>)Tx{7XN(c;| zl6#2r2;7a3Fzy0$mapE24&@NAVZyx+d7Zfce6FE(Q|3^7rG1iBT^r(qzC1;1ZI_v$ ztV&uqIP9wC_jfhu3ynU8*NWuh|7}KPS3G!Uyn8Sh=iai)Rn;{$N1+m82Q8Emhv2^f hi`wa7wxf-+H1~{nSQhL)U%d8bGTKBZTqR-_{RR4>U%LPR literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_64x32x32_64x32x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_64x32x32_64x32x32_2_nc4hw4_nhwc.cu new file mode 100644 index 0000000000000000000000000000000000000000..affc60c8489ec0485d93bc4d4ff710f62e2c6b93 GIT binary patch literal 1853 zcmbVNZBN@U5dNND;T37xP+3Lk7&09a%Sw?pU?P>+H_LV0)~aJ$j@=i2{myx7#)^PR zY3kVbT<)H`=Mo)=8G3g!8h^jNC6m$k{*wGYj(R;L=1NGY=19*F^G?z{j}DBb z^NgZ#9HUfFu*;ob$oPEm+*#=?MV+GuuYAtaBSS6CxZ}3-3QX>-VD5-YL30QOAVM+e zQ;s^hWj*Fo+GAR+$P^5EA~RAj;mDk48EN)O6+c;9Y7DoZm={WRq9|m$aS zkq>#G7-H7aJE0Z&iXw#E)^G*9)Ci12_!y^rB^bxJ1sH-kf`Hu_aovDLJCLMX?H$Gw zu5xW}7N`F{&!ncV06opKq`eGjDm7dF)qqz)5M^&oil_^dWGx;aCGUoQENj{? z0|7RRmmHr~3uir;6?+X4b@6Vzbi&9jP>@jPgOD!HL8*>|w&4M-v`A6L=c!Vl5zv3Q z7vrgx_iiJkQK_sx9JU(h;zPw4kf}lcI}x^wFdj*sape-r4pB;A>{PLbJYRvk5)y`8 zfX<55+t8sM0yfOIw;{jhE&v}}sAkCQ%dfIe^IEhaKak}iT5G$^4P|xF!ohx5Ex*6& zF<)}@KK!jnG5lYSs8Yq9cgDLrgK^<4uU*xuX*mj=kT{S~MjV2#1}s{q`=cFgoTY_l W#KW?T?&JA$-zB3>bih?AX3-y96<<#P literal 0 HcmV?d00001 diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_64x64x32_64x32x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_64x64x32_64x32x32_2_nc4hw4_nhwc.cu new file mode 100644 index 0000000000000000000000000000000000000000..7cb844de92653384af46b7df4171e727d48affb0 GIT binary patch literal 1853 zcmb7FZBN@U5dNND;T37xP+3LWF=RR+H_LV0)~aJ$j@=i2{myx7#)>FX znmYD9m%HcgxkN`|hCbYl#@F}vWHK5*UXeeiQKy5%TxrYEnp!2)9O)Ti-btEe(UGxq zo>DX(4pAZ~*kw*IWPG-G?W}N?pw`KgS3cv(iJ_LJ+;Q7F0FyZ@m^-0T&+Ac9>QxG6jQup)yAm z1HYAVCQ?UypNN^U+7Qd7C)qhU>u=!db;F|94T~;OtiM?IiCU~#3(dLWmO7rG>9%67 z+SE4kArCZ!n6>myXobF`2qCvMTmdf?0^<-q#tB~u#xZUH24IdLV0TVjJ7CccBlzVP;GSicLXc`LqPUSt67_t_UD zgybhc5XQ>}4=;14mMt)TFvHNRF;uUJq)wqYkJE2oUmS#P!FUY!ctH&xob}r%{CeGz zxZ1@&%U<^o%hNW3tExopkQ*WO+(Tw=VL44O{sE1Al_bA};?xdxIu(kBHhF~pgUnJ> zl)q8)x-@x_5jro@%Uh(u>Hp6&sj16BFY_#DFG8A1&6fW(;7t%j(OZ)|>f9t=i|1#_ z+o2zens$pofXVzNr)TBDSr2CUUQI+*ygM(QFmekN#MJp9q{|CXs^g$-ct9&H5|r|J zq7-NZ^dIi|c&g;R+X!itDyt3$jRv~>R5Au+YS8~qgbgE%N0O&px!AH}ln@v@W$Yo( z18_G&!mta_S-yH7Iux0SstUcw#&>=RuwHA z?041h`@0(R1xFvl--_hJ|LurMRXliSyn8Sh=iai)Ri&DSqtFSl0|}+XA^2**qH(%E d+R?^YntMh(EQ{zqU%d8RGTKB(TqR-_{RNJ@Us(VE literal 0 HcmV?d00001 diff --git a/src/gopt/impl/tensor_reformat.cpp b/src/gopt/impl/tensor_reformat.cpp index 8789444fc..6cce73595 100644 --- a/src/gopt/impl/tensor_reformat.cpp +++ b/src/gopt/impl/tensor_reformat.cpp @@ -3801,7 +3801,6 @@ void FoldingConvBiasDimshufflePass::apply(OptState& opt) const { return false; auto in_dtype = typecvt->input(0)->dtype(), out_dtype = typecvt->output(0)->dtype(); - printf("%s, %s\n", in_dtype.name(), out_dtype.name()); bool is_s82s4 = in_dtype.enumv() == DTypeEnum::QuantizedS8 && (out_dtype.enumv() == DTypeEnum::QuantizedS4 || out_dtype.enumv() == DTypeEnum::Quantized4Asymm); diff --git a/src/gopt/test/inference.cpp b/src/gopt/test/inference.cpp index b8598a45d..633ee3472 100644 --- a/src/gopt/test/inference.cpp +++ b/src/gopt/test/inference.cpp @@ -4159,14 +4159,7 @@ TEST(TestGoptInference, FoldingConvDimshuffle) { REQUIRE_GPU(1); auto cn = CompNode::load("gpu0"); cn.activate(); - auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop; - auto sm_ver = prop.major * 10 + prop.minor; - if (sm_ver < 61) { - printf("This testcast ignored due to insufficient cuda cap(got: %d, " - "expected: %d)\n", - sm_ver, 61); - return; - } + REQUIRE_CUDA_COMPUTE_CAPABILITY(6, 1); HostTensorGenerator gen; auto graph = ComputingGraph::make(); @@ -4240,14 +4233,7 @@ TEST(TestGoptInference, FoldingConvDimshuffleNCHW4NCHW32) { REQUIRE_GPU(1); auto cn = CompNode::load("gpu0"); cn.activate(); - auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop; - auto sm_ver = prop.major * 10 + prop.minor; - if (sm_ver < 61) { - printf("This testcast ignored due to insufficient cuda cap(got: %d, " - "expected: %d)\n", - sm_ver, 61); - return; - } + REQUIRE_CUDA_COMPUTE_CAPABILITY(6, 1); HostTensorGenerator gen; auto graph = ComputingGraph::make(); @@ -4326,14 +4312,7 @@ TEST(TestGoptInference, FoldingConvDimshuffleNCHW32NCHW4) { REQUIRE_GPU(1); auto cn = CompNode::load("gpu0"); cn.activate(); - auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop; - auto sm_ver = prop.major * 10 + prop.minor; - if (sm_ver < 75) { - printf("This testcast ignored due to insufficient cuda cap(got: %d, " - "expected: %d)\n", - sm_ver, 75); - return; - } + REQUIRE_CUDA_COMPUTE_CAPABILITY(7, 5); HostTensorGenerator gen; auto graph = ComputingGraph::make(); @@ -4405,14 +4384,7 @@ TEST(TestGoptInference, FoldingConvDimshuffleNCHW4NHWC) { REQUIRE_GPU(1); auto cn = CompNode::load("gpu0"); cn.activate(); - auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop; - auto sm_ver = prop.major * 10 + prop.minor; - if (sm_ver < 75) { - printf("This testcast ignored due to insufficient cuda cap(got: %d, " - "expected: %d)\n", - sm_ver, 75); - return; - } + REQUIRE_CUDA_COMPUTE_CAPABILITY(7, 5); HostTensorGenerator gen; auto graph = ComputingGraph::make(); @@ -4466,7 +4438,6 @@ TEST(TestGoptInference, FoldingConvDimshuffleNCHW4NHWC) { ->writeto_fpath(output_file( "TestGoptInference.FoldingConvDimshuffleNCHW4NHWC.json")); size_t nr_dimshuffle = find_opr_num(y_fuse); - printf("%zu \n", nr_dimshuffle); ASSERT_EQ(3u, find_opr_num(y_fuse)); bool found = false; cg::DepOprIter{[&found](cg::OperatorNodeBase* opr) { -- GitLab