diff --git a/dnn/src/cuda/conv_bias/cutlass_convolution_wrapper.cu b/dnn/src/cuda/conv_bias/cutlass_convolution_wrapper.cu
index 46a65af564233e246b1f551ed03c2a118ef0a034..667b5771dd0c23fd75300241466733792dc9f1c9 100644
--- a/dnn/src/cuda/conv_bias/cutlass_convolution_wrapper.cu
+++ b/dnn/src/cuda/conv_bias/cutlass_convolution_wrapper.cu
@@ -286,7 +286,8 @@ void megdnn::cuda::cutlass_wrapper::
                 uint32_t /* nonlinear_mode */, float /* alpha */,
                 float /* beta */, float /* gamma */, float /* scale */,
                 const GemmCoord& /* threadblock_shape */,
-                const GemmCoord& /* warp_shape */, cudaStream_t /* stream */) {}
+                const GemmCoord& /* warp_shape */, int /* stages */,
+                cudaStream_t /* stream */) {}
 #else
 template <bool NeedLoadFromConstMem>
 void megdnn::cuda::cutlass_wrapper::
@@ -296,15 +297,15 @@ void megdnn::cuda::cutlass_wrapper::
                 int* workspace, const convolution::ConvParam& param,
                 uint32_t nonlinear_mode, float alpha, float beta, float gamma,
                 float scale, const GemmCoord& threadblock_shape,
-                const GemmCoord& warp_shape, cudaStream_t stream) {
+                const GemmCoord& warp_shape, int stages, cudaStream_t stream) {
 #define DISPATCH_KERNEL_WITH_TILE_SHAPE(threadblock_m_, threadblock_n_,        \
                                         threadblock_k_, warp_m_, warp_n_,      \
-                                        warp_k_, stage_, aligned_)                     \
+                                        warp_k_, stage_, aligned_)             \
     if (threadblock_shape.m() == threadblock_m_ &&                             \
         threadblock_shape.n() == threadblock_n_ &&                             \
         threadblock_shape.k() == threadblock_k_ &&                             \
         warp_shape.m() == warp_m_ && warp_shape.n() == warp_n_ &&              \
-        warp_shape.k() == warp_k_) {                                           \
+        warp_shape.k() == warp_k_ && stages == stage_) {                       \
         using ThreadBlockShape =                                               \
                 cutlass::gemm::GemmShape<threadblock_m_, threadblock_n_,       \
                                          threadblock_k_>;                      \
@@ -397,7 +398,8 @@ void megdnn::cuda::cutlass_wrapper::
                     uint32_t nonlinear_mode, float alpha, float beta,        \
                     float gamma, float scale,                                \
                     const GemmCoord& threadblock_shape,                      \
-                    const GemmCoord& warp_shape, cudaStream_t stream);
+                    const GemmCoord& warp_shape, int stages,                 \
+                    cudaStream_t stream);
 INST(true);
 INST(false);
 #undef INST
@@ -414,7 +416,8 @@ void megdnn::cuda::cutlass_wrapper::
                 uint32_t /* nonlinear_mode */, float /* alpha */,
                 float /* beta */, float /* gamma */, float /* scale */,
                 const GemmCoord& /* threadblock_shape */,
-                const GemmCoord& /* warp_shape */, cudaStream_t /* stream */) {}
+                const GemmCoord& /* warp_shape */, int /* stages */,
+                cudaStream_t /* stream */) {}
 #else
 template <bool NeedLoadFromConstMem>
 void megdnn::cuda::cutlass_wrapper::
@@ -424,15 +427,15 @@ void megdnn::cuda::cutlass_wrapper::
                 int* workspace, const convolution::ConvParam& param,
                 uint32_t nonlinear_mode, float alpha, float beta, float gamma,
                 float scale, const GemmCoord& threadblock_shape,
-                const GemmCoord& warp_shape, cudaStream_t stream) {
+                const GemmCoord& warp_shape, int stages, cudaStream_t stream) {
 #define DISPATCH_KERNEL_WITH_TILE_SHAPE(threadblock_m_, threadblock_n_,        \
                                         threadblock_k_, warp_m_, warp_n_,      \
-                                        warp_k_, aligned_)                     \
+                                        warp_k_, stages_, aligned_)            \
     if (threadblock_shape.m() == threadblock_m_ &&                             \
         threadblock_shape.n() == threadblock_n_ &&                             \
         threadblock_shape.k() == threadblock_k_ &&                             \
         warp_shape.m() == warp_m_ && warp_shape.n() == warp_n_ &&              \
-        warp_shape.k() == warp_k_) {                                           \
+        warp_shape.k() == warp_k_ && stages == stages_) {                      \
         using ThreadBlockShape =                                               \
                 cutlass::gemm::GemmShape<threadblock_m_, threadblock_n_,       \
                                          threadblock_k_>;                      \
@@ -449,7 +452,7 @@ void megdnn::cuda::cutlass_wrapper::
                 cutlass::convolution::threadblock::                            \
                         ConvolutionNCxHWxThreadblockSwizzle<                   \
                                 cutlass::convolution::ConvType::kConvolution>, \
-                2, 4, aligned_, NeedLoadFromConstMem,                          \
+                stages_, 4, aligned_, NeedLoadFromConstMem,                    \
                 cutlass::arch::OpMultiplyAdd>;                                 \
         typename Convolution::ConvolutionParameter conv_param{                 \
                 param.n,  param.ci, param.co, param.hi, param.wi,              \
@@ -460,16 +463,17 @@ void megdnn::cuda::cutlass_wrapper::
                 epilogue, stream);                                             \
     }
 #define DISPATCH_KERNEL                                                      \
-    DISPATCH_KERNEL_WITH_TILE_SHAPE(128, 128, 32, 64, 32, 32, 16);           \
-    DISPATCH_KERNEL_WITH_TILE_SHAPE(128, 64, 32, 64, 32, 32, 16);            \
-    DISPATCH_KERNEL_WITH_TILE_SHAPE(64, 128, 32, 64, 32, 32, 16);            \
-    DISPATCH_KERNEL_WITH_TILE_SHAPE(128, 32, 32, 64, 32, 32, 16);            \
-    DISPATCH_KERNEL_WITH_TILE_SHAPE(32, 128, 32, 32, 64, 32, 16);            \
-    DISPATCH_KERNEL_WITH_TILE_SHAPE(64, 64, 32, 64, 32, 32, 16);             \
-    DISPATCH_KERNEL_WITH_TILE_SHAPE(32, 64, 32, 32, 64, 32, 16);             \
-    DISPATCH_KERNEL_WITH_TILE_SHAPE(64, 32, 32, 64, 32, 32, 16);             \
-    DISPATCH_KERNEL_WITH_TILE_SHAPE(32, 32, 32, 32, 32, 32, 16);             \
-    DISPATCH_KERNEL_WITH_TILE_SHAPE(16, 64, 8, 16, 64, 8, 4);                \
+    DISPATCH_KERNEL_WITH_TILE_SHAPE(128, 128, 32, 64, 32, 32, 2, 16);        \
+    DISPATCH_KERNEL_WITH_TILE_SHAPE(128, 64, 32, 64, 32, 32, 2, 16);         \
+    DISPATCH_KERNEL_WITH_TILE_SHAPE(64, 128, 32, 64, 32, 32, 2, 16);         \
+    DISPATCH_KERNEL_WITH_TILE_SHAPE(128, 32, 32, 64, 32, 32, 2, 16);         \
+    DISPATCH_KERNEL_WITH_TILE_SHAPE(32, 128, 32, 32, 64, 32, 2, 16);         \
+    DISPATCH_KERNEL_WITH_TILE_SHAPE(64, 64, 32, 64, 32, 32, 2, 16);          \
+    DISPATCH_KERNEL_WITH_TILE_SHAPE(32, 64, 32, 32, 64, 32, 2, 16);          \
+    DISPATCH_KERNEL_WITH_TILE_SHAPE(64, 32, 32, 64, 32, 32, 2, 16);          \
+    DISPATCH_KERNEL_WITH_TILE_SHAPE(32, 32, 32, 32, 32, 32, 2, 16);          \
+    DISPATCH_KERNEL_WITH_TILE_SHAPE(16, 128, 16, 16, 128, 16, 1, 8);         \
+    DISPATCH_KERNEL_WITH_TILE_SHAPE(16, 64, 8, 16, 64, 8, 2, 4);             \
     megdnn_assert(false,                                                     \
                   "unsupported threadblock shape (%dx%dx%d) and warp shape " \
                   "(%dx%dx%d)",                                              \
@@ -525,7 +529,8 @@ void megdnn::cuda::cutlass_wrapper::
                     uint32_t nonlinear_mode, float alpha, float beta,    \
                     float gamma, float scale,                            \
                     const GemmCoord& threadblock_shape,                  \
-                    const GemmCoord& warp_shape, cudaStream_t stream);
+                    const GemmCoord& warp_shape, int stages,             \
+                    cudaStream_t stream);
 INST(true);
 INST(false);
 #undef INST
@@ -542,7 +547,8 @@ void megdnn::cuda::cutlass_wrapper::
                 uint32_t /* nonlinear_mode */, float /* alpha */,
                 float /* beta */, float /* gamma */, float /* scale */,
                 const GemmCoord& /* threadblock_shape */,
-                const GemmCoord& /* warp_shape */, cudaStream_t /* stream */) {}
+                const GemmCoord& /* warp_shape */, int /* stages */,
+                cudaStream_t /* stream */) {}
 #else
 template <bool NeedLoadFromConstMem>
 void megdnn::cuda::cutlass_wrapper::
@@ -552,15 +558,15 @@ void megdnn::cuda::cutlass_wrapper::
                 int* workspace, const convolution::ConvParam& param,
                 uint32_t nonlinear_mode, float alpha, float beta, float gamma,
                 float scale, const GemmCoord& threadblock_shape,
-                const GemmCoord& warp_shape, cudaStream_t stream) {
+                const GemmCoord& warp_shape, int stages, cudaStream_t stream) {
 #define DISPATCH_KERNEL_WITH_TILE_SHAPE(threadblock_m_, threadblock_n_,        \
                                         threadblock_k_, warp_m_, warp_n_,      \
-                                        warp_k_, aligned_)                     \
+                                        warp_k_, stages_, aligned_)            \
     if (threadblock_shape.m() == threadblock_m_ &&                             \
         threadblock_shape.n() == threadblock_n_ &&                             \
         threadblock_shape.k() == threadblock_k_ &&                             \
         warp_shape.m() == warp_m_ && warp_shape.n() == warp_n_ &&              \
-        warp_shape.k() == warp_k_) {                                           \
+        warp_shape.k() == warp_k_ && stages == stages_) {                      \
         using ThreadBlockShape =                                               \
                 cutlass::gemm::GemmShape<threadblock_m_, threadblock_n_,       \
                                          threadblock_k_>;                      \
@@ -577,7 +583,7 @@ void megdnn::cuda::cutlass_wrapper::
                 cutlass::convolution::threadblock::                            \
                         ConvolutionNCxHWxThreadblockSwizzle<                   \
                                 cutlass::convolution::ConvType::kConvolution>, \
-                2, 4, aligned_, NeedLoadFromConstMem>;                         \
+                stages_, 4, aligned_, NeedLoadFromConstMem>;                   \
         typename Convolution::ConvolutionParameter conv_param{                 \
                 param.n,  param.ci, param.co, param.hi, param.wi,              \
                 param.fh, param.fw, param.ho, param.wo, param.sh,              \
@@ -587,15 +593,15 @@ void megdnn::cuda::cutlass_wrapper::
                 epilogue, stream);                                             \
     }
 #define DISPATCH_KERNEL                                                      \
-    DISPATCH_KERNEL_WITH_TILE_SHAPE(128, 128, 32, 64, 32, 32, 16);           \
-    DISPATCH_KERNEL_WITH_TILE_SHAPE(128, 64, 32, 64, 32, 32, 16);            \
-    DISPATCH_KERNEL_WITH_TILE_SHAPE(64, 128, 32, 64, 32, 32, 16);            \
-    DISPATCH_KERNEL_WITH_TILE_SHAPE(128, 32, 32, 64, 32, 32, 16);            \
-    DISPATCH_KERNEL_WITH_TILE_SHAPE(32, 128, 32, 32, 64, 32, 16);            \
-    DISPATCH_KERNEL_WITH_TILE_SHAPE(64, 64, 32, 64, 32, 32, 16);             \
-    DISPATCH_KERNEL_WITH_TILE_SHAPE(32, 64, 32, 32, 64, 32, 16);             \
-    DISPATCH_KERNEL_WITH_TILE_SHAPE(64, 32, 32, 64, 32, 32, 16);             \
-    DISPATCH_KERNEL_WITH_TILE_SHAPE(32, 32, 32, 32, 32, 32, 16);             \
+    DISPATCH_KERNEL_WITH_TILE_SHAPE(128, 128, 32, 64, 32, 32, 2, 16);        \
+    DISPATCH_KERNEL_WITH_TILE_SHAPE(128, 64, 32, 64, 32, 32, 2, 16);         \
+    DISPATCH_KERNEL_WITH_TILE_SHAPE(64, 128, 32, 64, 32, 32, 2, 16);         \
+    DISPATCH_KERNEL_WITH_TILE_SHAPE(128, 32, 32, 64, 32, 32, 2, 16);         \
+    DISPATCH_KERNEL_WITH_TILE_SHAPE(32, 128, 32, 32, 64, 32, 2, 16);         \
+    DISPATCH_KERNEL_WITH_TILE_SHAPE(64, 64, 32, 64, 32, 32, 2, 16);          \
+    DISPATCH_KERNEL_WITH_TILE_SHAPE(32, 64, 32, 32, 64, 32, 2, 16);          \
+    DISPATCH_KERNEL_WITH_TILE_SHAPE(64, 32, 32, 64, 32, 32, 2, 16);          \
+    DISPATCH_KERNEL_WITH_TILE_SHAPE(32, 32, 32, 32, 32, 32, 2, 16);          \
     megdnn_assert(false,                                                     \
                   "unsupported threadblock shape (%dx%dx%d) and warp shape " \
                   "(%dx%dx%d)",                                              \
@@ -651,7 +657,8 @@ void megdnn::cuda::cutlass_wrapper::
                     uint32_t nonlinear_mode, float alpha, float beta,        \
                     float gamma, float scale,                                \
                     const GemmCoord& threadblock_shape,                      \
-                    const GemmCoord& warp_shape, cudaStream_t stream);
+                    const GemmCoord& warp_shape, int stages,                 \
+                    cudaStream_t stream);
 INST(true);
 INST(false);
 #undef INST
diff --git a/dnn/src/cuda/conv_bias/cutlass_convolution_wrapper.cuh b/dnn/src/cuda/conv_bias/cutlass_convolution_wrapper.cuh
index 71c158568f745c7e2f91aad717e5e2dbd0b361e9..0a9511d268de0ba3807c79a8156e735b61abbb25 100644
--- a/dnn/src/cuda/conv_bias/cutlass_convolution_wrapper.cuh
+++ b/dnn/src/cuda/conv_bias/cutlass_convolution_wrapper.cuh
@@ -56,7 +56,7 @@ void do_conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4(
         const convolution::ConvParam& param, uint32_t nonlinear_mode,
         float alpha, float beta, float gamma, float scale,
         const GemmCoord& threadblock_shape, const GemmCoord& warp_shape,
-        cudaStream_t stream);
+        int stages, cudaStream_t stream);
 
 template <bool NeedLoadFromConstMem>
 void do_conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw(
@@ -65,7 +65,7 @@ void do_conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw(
         const convolution::ConvParam& param, uint32_t nonlinear_mode,
         float alpha, float beta, float gamma, float scale,
         const GemmCoord& threadblock_shape, const GemmCoord& warp_shape,
-        cudaStream_t stream);
+        int stages, cudaStream_t stream);
 
 template <bool NeedLoadFromConstMem>
 void do_conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32(
@@ -74,7 +74,7 @@ void do_conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32(
         const convolution::ConvParam& param, uint32_t nonlinear_mode,
         float alpha, float beta, float gamma, float scale,
         const GemmCoord& threadblock_shape, const GemmCoord& warp_shape,
-        cudaStream_t stream);
+        int stages, cudaStream_t stream);
 
 }  // namespace cutlass_wrapper
 }  // namespace cuda
diff --git a/dnn/src/cuda/conv_bias/implicit_gemm_int8_nchw4_dp4a.cpp b/dnn/src/cuda/conv_bias/implicit_gemm_int8_nchw4_dp4a.cpp
index 76cec7d2040e80c5dae5361931c979f549b8ff16..ace4620a314ab68b4ee75e9cd0e6fdf26c75ac75 100644
--- a/dnn/src/cuda/conv_bias/implicit_gemm_int8_nchw4_dp4a.cpp
+++ b/dnn/src/cuda/conv_bias/implicit_gemm_int8_nchw4_dp4a.cpp
@@ -32,8 +32,11 @@ bool ConvBiasForwardImpl::AlgoInt8NCHW4DotProdImplicitGemm::is_available(
     if (!conv_bias::check_bias_share_in_channel(*(args.bias_layout),
                                                 param.format))
         return false;
-    if (param.format != Format::NCHW4 && param.format != Format::NCHW4_NCHW &&
-        param.format != Format::NCHW4_NCHW32)
+    if (param.format == Format::NCHW4_NCHW32) {
+        if (m_algo_param.threadblock_m % 32 != 0)
+            return false;
+    } else if (param.format != Format::NCHW4_NCHW &&
+               param.format != Format::NCHW4)
         return false;
     size_t n = args.src_layout->operator[](0),
            ci = args.src_layout->operator[](1) * 4,
@@ -187,7 +190,7 @@ void ConvBiasForwardImpl::AlgoInt8NCHW4DotProdImplicitGemm::exec(
                     cutlass_wrapper::GemmCoord{m_algo_param.warp_m,
                                                m_algo_param.warp_n,
                                                m_algo_param.warp_k},
-                    stream);
+                    m_algo_param.stage, stream);
         } else if (param.format == Format::NCHW4_NCHW) {
             cutlass_wrapper::
                     do_conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw<false>(
@@ -205,7 +208,7 @@ void ConvBiasForwardImpl::AlgoInt8NCHW4DotProdImplicitGemm::exec(
                             cutlass_wrapper::GemmCoord{m_algo_param.warp_m,
                                                        m_algo_param.warp_n,
                                                        m_algo_param.warp_k},
-                            stream);
+                            m_algo_param.stage, stream);
         } else {
             megdnn_assert(param.format == Format::NCHW4_NCHW32);
             cutlass_wrapper::
@@ -225,7 +228,7 @@ void ConvBiasForwardImpl::AlgoInt8NCHW4DotProdImplicitGemm::exec(
                             cutlass_wrapper::GemmCoord{m_algo_param.warp_m,
                                                        m_algo_param.warp_n,
                                                        m_algo_param.warp_k},
-                            stream);
+                            m_algo_param.stage, stream);
         }
     } else {
         if (param.format == Format::NCHW4) {
@@ -242,7 +245,7 @@ void ConvBiasForwardImpl::AlgoInt8NCHW4DotProdImplicitGemm::exec(
                     cutlass_wrapper::GemmCoord{m_algo_param.warp_m,
                                                m_algo_param.warp_n,
                                                m_algo_param.warp_k},
-                    stream);
+                    m_algo_param.stage, stream);
         } else if (param.format == Format::NCHW4_NCHW) {
             cutlass_wrapper::
                     do_conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw<true>(
@@ -260,7 +263,7 @@ void ConvBiasForwardImpl::AlgoInt8NCHW4DotProdImplicitGemm::exec(
                             cutlass_wrapper::GemmCoord{m_algo_param.warp_m,
                                                        m_algo_param.warp_n,
                                                        m_algo_param.warp_k},
-                            stream);
+                            m_algo_param.stage, stream);
 
         } else {
             megdnn_assert(param.format == Format::NCHW4_NCHW32);
@@ -281,7 +284,7 @@ void ConvBiasForwardImpl::AlgoInt8NCHW4DotProdImplicitGemm::exec(
                             cutlass_wrapper::GemmCoord{m_algo_param.warp_m,
                                                        m_algo_param.warp_n,
                                                        m_algo_param.warp_k},
-                            stream);
+                            m_algo_param.stage, stream);
         }
     }
     after_kernel_launch();
diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_16x128x16_16x128x16_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_16x128x16_16x128x16_hswish.cu
index ab01f989dadf003d7073fb228910bdc6050460b8..0c779fe15d9856d56b82073d75e6c284fdf267af 100644
Binary files a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_16x128x16_16x128x16_hswish.cu and b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_16x128x16_16x128x16_hswish.cu differ
diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_16x128x16_16x128x16_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_16x128x16_16x128x16_id.cu
index 9f901437080cba0a608bbc613716d45356c1c870..85fdecd79fda6911989011249fdb9b5c2f050410 100644
Binary files a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_16x128x16_16x128x16_id.cu and b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_16x128x16_16x128x16_id.cu differ
diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_16x128x16_16x128x16_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_16x128x16_16x128x16_relu.cu
index 5dfd371c8bdcd07754f633083d69edd5c36850ef..c088e6f6ec533ac73e814d8f65b420a89b129b67 100644
Binary files a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_16x128x16_16x128x16_relu.cu and b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_16x128x16_16x128x16_relu.cu differ
diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_16x128x16_16x128x16_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_16x128x16_16x128x16_hswish.cu
index 1c7115e9d9b4e077f0b265463b4c45710751ec6f..14226ecd3b901e9aac04d5444f25e7ac6114efdb 100644
Binary files a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_16x128x16_16x128x16_hswish.cu and b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_16x128x16_16x128x16_hswish.cu differ
diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_16x128x16_16x128x16_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_16x128x16_16x128x16_id.cu
index 374f51e9660a374dd54912933f94785123c5e458..449dee42ef976533a07d19f702196fbe9a19ec54 100644
Binary files a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_16x128x16_16x128x16_id.cu and b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_16x128x16_16x128x16_id.cu differ
diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_16x128x16_16x128x16_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_16x128x16_16x128x16_relu.cu
index 76ecaad0dd28cb71a79c83b7470737e24741710e..0ed7466912ebf3b72a88f4555ecc03afeb3652b1 100644
Binary files a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_16x128x16_16x128x16_relu.cu and b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_16x128x16_16x128x16_relu.cu differ
diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_16x128x16_16x128x16_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_16x128x16_16x128x16_hswish.cu
new file mode 100644
index 0000000000000000000000000000000000000000..8865080e356c1386e72b302a6160bf6c88d97ebe
Binary files /dev/null and b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_16x128x16_16x128x16_hswish.cu differ
diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_16x128x16_16x128x16_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_16x128x16_16x128x16_id.cu
new file mode 100644
index 0000000000000000000000000000000000000000..6a93ab6171cf7b4163c863df6d4bc59a588bcab7
Binary files /dev/null and b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_16x128x16_16x128x16_id.cu differ
diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_16x128x16_16x128x16_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_16x128x16_16x128x16_relu.cu
new file mode 100644
index 0000000000000000000000000000000000000000..6e01aaaf1e058b2ca59ba90945c422d4a2a9c9cc
Binary files /dev/null and b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_16x128x16_16x128x16_relu.cu differ
diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_16x128x16_16x128x16_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_16x128x16_16x128x16_hswish.cu
new file mode 100644
index 0000000000000000000000000000000000000000..ddea6211e503043d806664aafd7d4bec23bfe154
Binary files /dev/null and b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_16x128x16_16x128x16_hswish.cu differ
diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_16x128x16_16x128x16_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_16x128x16_16x128x16_id.cu
new file mode 100644
index 0000000000000000000000000000000000000000..0aafd7fe2ea4d678f887ead8980b8490b7d4ec97
Binary files /dev/null and b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_16x128x16_16x128x16_id.cu differ
diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_16x128x16_16x128x16_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_16x128x16_16x128x16_relu.cu
new file mode 100644
index 0000000000000000000000000000000000000000..0809ddee14cae1c5c3514fbb1dbf6ae4872bc26d
Binary files /dev/null and b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_16x128x16_16x128x16_relu.cu differ
diff --git a/src/gopt/test/inference.cpp b/src/gopt/test/inference.cpp
index f6e0f5409cdc274568fd801fa88613941e60476d..c6e040b27c21771fa3dac2a6f2bd6261fece5c79 100644
--- a/src/gopt/test/inference.cpp
+++ b/src/gopt/test/inference.cpp
@@ -3895,6 +3895,9 @@ TEST(TestGoptInference, FoldingConvDimshuffle) {
                           .apply({{y}})
                           .endpoint_vars(),
                   y_fuse);
+    gopt::modify_opr_algo_strategy_inplace(
+            {y_fuse},
+            opr::mixin::AlgoChooserHelper::ExecutionPolicy::Strategy::PROFILE);
     graph->compile({{y_fuse, {}}})
             ->to_json()
             ->writeto_fpath(output_file(
@@ -3976,6 +3979,9 @@ TEST(TestGoptInference, FoldingConvDimshuffleNCHW4NCHW32) {
                           .apply({{y}})
                           .endpoint_vars(),
                   y_fuse);
+    gopt::modify_opr_algo_strategy_inplace(
+            {y_fuse},
+            opr::mixin::AlgoChooserHelper::ExecutionPolicy::Strategy::PROFILE);
     graph->compile({{y_fuse, {}}})
             ->to_json()
             ->writeto_fpath(output_file(