MegEngine / MegEngine · Commit 11f022ff
Authored Aug 11, 2021 by Megvii Engine Team
feat(dnn/cuda): add nhwc int8 imma conv and conv fuse typecvt
GitOrigin-RevId: 229e1eb4be0fe29932117f226523b446c9ca665e
Parent: 03e80759
Showing 33 changed files with 1042 additions and 271 deletions (+1042, -271)
dnn/scripts/cutlass_generator/conv2d_operation.py                          +19   -17
dnn/scripts/cutlass_generator/generator.py                                 +102  -29
dnn/scripts/cutlass_generator/library.py                                   +18   -0
dnn/scripts/cutlass_generator/list.bzl                                     +0    -0
dnn/src/common/convolution.cpp                                             +4    -1
dnn/src/cuda/conv_bias/algo.cpp                                            +30   -1
dnn/src/cuda/conv_bias/algo.h                                              +45   -1
dnn/src/cuda/conv_bias/cutlass_convolution_base.cpp                        +16   -5
dnn/src/cuda/conv_bias/cutlass_reorder_filter.cu                           +45   -26
dnn/src/cuda/conv_bias/cutlass_reorder_filter.cuh                          +3    -3
dnn/src/cuda/conv_bias/implicit_gemm_int4_int4_nhwc_imma.cpp               +16   -2
dnn/src/cuda/conv_bias/implicit_gemm_int4_nchw64_imma_base.cpp             +10   -2
dnn/src/cuda/conv_bias/implicit_gemm_int4_nhwc_imma_base.cpp               +34   -27
dnn/src/cuda/conv_bias/implicit_gemm_int8_nchw32_imma.cpp                  +10   -2
dnn/src/cuda/conv_bias/implicit_gemm_int8_nchw4_dp4a.cpp                   +12   -5
dnn/src/cuda/conv_bias/implicit_gemm_int8_nhwc_imma.cpp                    +294  -0
dnn/src/cuda/conv_bias/implicit_gemm_uint4_int4_nhwc_imma.cpp              +29   -10
dnn/src/cuda/conv_bias/opr_impl.h                                          +1    -0
dnn/src/cuda/convolution/backward_data/algo.h                              +2    -0
dnn/src/cuda/convolution/backward_data/implicit_gemm_int8_nchw4_dp4a.cpp   +41   -30
dnn/src/cuda/convolution/backward_data/implicit_gemm_int8_nchw_dp4a.cpp    +42   -27
dnn/src/cuda/cutlass/convolution_operation.h                               +2    -2
dnn/src/cuda/cutlass/library.h                                             +1    -1
dnn/src/cuda/cutlass/operation_table.cpp                                   +15   -13
dnn/src/cuda/cutlass/operation_table.h                                     +6    -6
dnn/src/cuda/cutlass/util.cu                                               +29   -0
dnn/src/cuda/cutlass/util.h                                                +4    -0
dnn/src/cuda/matrix_mul/algos.h                                            +4    -0
dnn/src/cuda/matrix_mul/cutlass_float32_simt.cpp                           +36   -28
dnn/src/cuda/matrix_mul/cutlass_float32_simt_split_k.cpp                   +36   -29
dnn/test/cuda/conv_bias_int8.cpp                                           +130  -1
dnn/test/cuda/conv_test_utils.cpp                                          +6    -2
dnn/test/cuda/convolution.cpp                                              +0    -1
dnn/scripts/cutlass_generator/conv2d_operation.py @ 11f022ff

@@ -19,8 +19,8 @@ class Conv2dOperation:
   #
   def __init__(self, conv_kind, conv_type, arch, tile_description, src, flt, bias, dst, element_epilogue, \
                epilogue_functor = EpilogueFunctor.LinearCombination, swizzling_functor = SwizzlingFunctor.Identity4, \
-               need_load_from_const = True, implicit_gemm_mode = ImplicitGemmMode.GemmNT, without_shared_load = False, \
-               required_cuda_ver_major = 9, required_cuda_ver_minor = 2):
+               special_optimization = SpecialOptimizeDesc.NoneSpecialOpt, implicit_gemm_mode = ImplicitGemmMode.GemmNT, \
+               without_shared_load = False, required_cuda_ver_major = 9, required_cuda_ver_minor = 2):
     self.operation_kind = OperationKind.Conv2d
     self.conv_kind = conv_kind

@@ -34,7 +34,7 @@ class Conv2dOperation:
     self.element_epilogue = element_epilogue
     self.epilogue_functor = epilogue_functor
     self.swizzling_functor = swizzling_functor
-    self.need_load_from_const = need_load_from_const
+    self.special_optimization = special_optimization
     self.implicit_gemm_mode = implicit_gemm_mode
     self.without_shared_load = without_shared_load
     self.required_cuda_ver_major = required_cuda_ver_major

@@ -60,16 +60,18 @@ class Conv2dOperation:
     else:
       inst_shape = ''

-    unity_kernel = ''
-    if not self.need_load_from_const:
-      unity_kernel = '_1x1'
+    special_opt = ''
+    if self.special_optimization == SpecialOptimizeDesc.ConvFilterUnity:
+      special_opt = '_1x1'
+    elif self.special_optimization == SpecialOptimizeDesc.DeconvDoubleUpsampling:
+      special_opt = '_s2'

     reorder_k = ''
     if self.without_shared_load:
       reorder_k = '_roc'

     return "%s%s%s%s%s%s_%s" % (ShortDataTypeNames[self.accumulator_type()], \
-      inst_shape, intermediate_type, ConvKindNames[self.conv_kind], unity_kernel, \
+      inst_shape, intermediate_type, ConvKindNames[self.conv_kind], special_opt, \
       reorder_k, ShortEpilogueNames[self.epilogue_functor])

   #

@@ -183,7 +185,7 @@ using Convolution =
     ${stages},
     ${alignment_src},
     ${alignment_filter},
-    ${nonuninity_kernel},
+    ${special_optimization},
     ${math_operator},
     ${implicit_gemm_mode},
     ${without_shared_load}>;

@@ -226,7 +228,7 @@ using Convolution =
       'stages': str(operation.tile_description.stages),
       'alignment_src': str(operation.src.alignment),
       'alignment_filter': str(operation.flt.alignment),
-      'nonuninity_kernel': str(operation.need_load_from_const).lower(),
+      'special_optimization': SpecialOptimizeDescTag[operation.special_optimization],
       'math_operator': MathOperationTag[operation.tile_description.math_instruction.math_operation],
       'implicit_gemm_mode': ImplicitGemmModeTag[operation.implicit_gemm_mode],
       'without_shared_load': str(operation.without_shared_load).lower()

@@ -266,7 +268,7 @@ using Deconvolution =
     ${stages},
     ${alignment_src},
     ${alignment_filter},
-    ${nonuninity_kernel},
+    ${special_optimization},
     ${math_operator},
     ${implicit_gemm_mode}>;
 """

@@ -308,7 +310,7 @@ using Deconvolution =
       'stages': str(operation.tile_description.stages),
       'alignment_src': str(operation.src.alignment),
       'alignment_filter': str(operation.flt.alignment),
-      'nonuninity_kernel': str(operation.need_load_from_const).lower(),
+      'special_optimization': SpecialOptimizeDescTag[operation.special_optimization],
       'math_operator': MathOperationTag[operation.tile_description.math_instruction.math_operation],
       'implicit_gemm_mode': ImplicitGemmModeTag[operation.implicit_gemm_mode]
     }

@@ -323,9 +325,9 @@ using Deconvolution =
 ###################################################################################################

 #
-def GenerateConv2d(conv_kind, tile_descriptions, src_layout, flt_layout, dst_layout, dst_type, min_cc, src_align = 32, flt_align = 32, dst_align = 128, \
-                   skip_unity_kernel = False, implicit_gemm_mode = ImplicitGemmMode.GemmNT, without_shared_load = False, required_cuda_ver_major = 9, \
-                   required_cuda_ver_minor = 2):
+def GenerateConv2d(conv_kind, tile_descriptions, src_layout, flt_layout, dst_layout, dst_type, min_cc, src_align = 32, flt_align = 32, dst_align = 32, \
+                   use_special_optimization = SpecialOptimizeDesc.NoneSpecialOpt, implicit_gemm_mode = ImplicitGemmMode.GemmNT, without_shared_load = False, \
+                   required_cuda_ver_major = 9, required_cuda_ver_minor = 2):
   operations = []

   element_epilogue = DataType.f32

@@ -412,10 +414,10 @@ def GenerateConv2d(conv_kind, tile_descriptions, src_layout, flt_layout, dst_lay
     bias = TensorDescription(bias_type, dst_layout, max(1, int(32 / DataTypeSize[bias_type])))
     dst = TensorDescription(dst_type, dst_layout, int(dst_align / DataTypeSize[dst_type]))

-    new_operation = Conv2dOperation(conv_kind, ConvType.Convolution, min_cc, tile, src, flt, bias, dst, element_epilogue, epilogue, swizzling_functor, True, implicit_gemm_mode, without_shared_load, required_cuda_ver_major, required_cuda_ver_minor)
+    new_operation = Conv2dOperation(conv_kind, ConvType.Convolution, min_cc, tile, src, flt, bias, dst, element_epilogue, epilogue, swizzling_functor, SpecialOptimizeDesc.NoneSpecialOpt, implicit_gemm_mode, without_shared_load, required_cuda_ver_major, required_cuda_ver_minor)
     operations.append(new_operation)
-    if not skip_unity_kernel:
-      new_operation = Conv2dOperation(conv_kind, ConvType.Convolution, min_cc, tile, src, flt, bias, dst, element_epilogue, epilogue, swizzling_functor, False, implicit_gemm_mode, without_shared_load, required_cuda_ver_major, required_cuda_ver_minor)
+    if use_special_optimization != SpecialOptimizeDesc.NoneSpecialOpt:
+      new_operation = Conv2dOperation(conv_kind, ConvType.Convolution, min_cc, tile, src, flt, bias, dst, element_epilogue, epilogue, swizzling_functor, use_special_optimization, implicit_gemm_mode, without_shared_load, required_cuda_ver_major, required_cuda_ver_minor)
       operations.append(new_operation)
   return operations
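For orientation, a minimal runnable sketch of what the new special_optimization field does to the generated kernel name. The enum mirrors the SpecialOptimizeDesc added in library.py further below; the helper name special_opt_suffix is ours, not MegEngine's:

import enum

class SpecialOptimizeDesc(enum.Enum):
    NoneSpecialOpt = 0
    ConvFilterUnity = 1
    DeconvDoubleUpsampling = 2

def special_opt_suffix(desc):
    # '_1x1' marks kernels specialized for unit (1x1) convolution filters;
    # '_s2' marks the stride-2 deconv "double upsampling" specialization.
    if desc == SpecialOptimizeDesc.ConvFilterUnity:
        return '_1x1'
    elif desc == SpecialOptimizeDesc.DeconvDoubleUpsampling:
        return '_s2'
    return ''

assert special_opt_suffix(SpecialOptimizeDesc.ConvFilterUnity) == '_1x1'
assert special_opt_suffix(SpecialOptimizeDesc.NoneSpecialOpt) == ''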
dnn/scripts/cutlass_generator/generator.py @ 11f022ff

@@ -168,10 +168,10 @@ def GenerateConv2d_Simt(args):
             for dst_type, dst_layout in zip(dst_types, dst_layouts):
                 if dst_type == DataType.s4 or dst_type == DataType.u4:
                     min_cc = 75
-                    skip_unity_kernel = True
+                    use_special_optimization = SpecialOptimizeDesc.NoneSpecialOpt
                 else:
                     min_cc = 61
-                    skip_unity_kernel = False
+                    use_special_optimization = SpecialOptimizeDesc.ConvFilterUnity
                 tile_descriptions = [
                     TileDescription([128, 128, 32], 2, [2, 4, 1], math_inst, min_cc, max_cc),
                     TileDescription([128,  64, 32], 2, [2, 2, 1], math_inst, min_cc, max_cc),

@@ -183,9 +183,15 @@ def GenerateConv2d_Simt(args):
                     TileDescription([ 16, 128, 16], 1, [1, 1, 1], math_inst, min_cc, max_cc),
                     TileDescription([ 16,  64,  8], 2, [1, 1, 1], math_inst, min_cc, max_cc),
                 ]
-                operations += GenerateConv2d(ConvKind.Fprop, tile_descriptions, layout[0], layout[1],
-                                             dst_layout, dst_type, min_cc, 32, 32, 32,
-                                             skip_unity_kernel)
+                for tile in tile_descriptions:
+                    if dst_layout == LayoutType.TensorNC32HW32 and tile.threadblock_shape[0] > 32:
+                        continue
+                    if (dst_layout == LayoutType.TensorNCHW or dst_layout == LayoutType.TensorNHWC) \
+                        and tile.threadblock_shape[0] > 16:
+                        continue
+                    operations += GenerateConv2d(ConvKind.Fprop, [tile], layout[0], layout[1],
+                                                 dst_layout, dst_type, min_cc, 32, 32, 32,
+                                                 use_special_optimization)
    return operations
@@ -214,6 +220,8 @@ def GenerateConv2d_TensorOp_8816(args):
         DataType.s8,
     ]

+    use_special_optimization = SpecialOptimizeDesc.ConvFilterUnity
+
     min_cc = 75
     max_cc = 1024

@@ -232,28 +240,69 @@ def GenerateConv2d_TensorOp_8816(args):
                     TileDescription([ 64, 128, 64], 2, [2, 2, 1], math_inst, min_cc, max_cc),
                     TileDescription([128,  64, 32], 1, [2, 2, 1], math_inst, min_cc, max_cc),
                     TileDescription([128,  32, 32], 1, [2, 1, 1], math_inst, min_cc, max_cc),
                     TileDescription([ 64, 128, 32], 1, [2, 2, 1], math_inst, min_cc, max_cc),
                     TileDescription([ 32, 128, 32], 1, [1, 2, 1], math_inst, min_cc, max_cc),
                 ]
                 operations += GenerateConv2d(ConvKind.Fprop, tile_descriptions, layout[0], layout[1],
-                                             dst_layout, dst_type, min_cc, 128, 128, 64,
-                                             False, ImplicitGemmMode.GemmTN, True, cuda_major, cuda_minor)
+                                             dst_layout, dst_type, min_cc, 128, 128, 64,
+                                             use_special_optimization, ImplicitGemmMode.GemmTN, True, cuda_major, cuda_minor)
             else:
                 assert dst_layout == LayoutType.TensorNC4HW4
                 tile_descriptions = [
                     TileDescription([128, 256, 64], 2, [2, 4, 1], math_inst, min_cc, max_cc),
                     TileDescription([256, 128, 64], 2, [4, 2, 1], math_inst, min_cc, max_cc),
                     TileDescription([128, 128, 64], 2, [2, 2, 1], math_inst, min_cc, max_cc),
                     TileDescription([128,  64, 64], 2, [2, 2, 1], math_inst, min_cc, max_cc),
                     TileDescription([ 64, 128, 64], 2, [2, 2, 1], math_inst, min_cc, max_cc),
                     TileDescription([128,  64, 32], 1, [2, 2, 1], math_inst, min_cc, max_cc),
                     TileDescription([128,  32, 32], 1, [2, 1, 1], math_inst, min_cc, max_cc),
                     TileDescription([ 64, 128, 32], 1, [2, 2, 1], math_inst, min_cc, max_cc),
                     TileDescription([ 32, 128, 32], 1, [1, 2, 1], math_inst, min_cc, max_cc),
                 ]
                 operations += GenerateConv2d(ConvKind.Fprop, tile_descriptions, layout[0], layout[1],
-                                             dst_layout, dst_type, min_cc, 128, 128, 64,
-                                             False, ImplicitGemmMode.GemmNT, False, cuda_major, cuda_minor)
+                                             dst_layout, dst_type, min_cc, 128, 128, 64,
+                                             use_special_optimization, ImplicitGemmMode.GemmNT, False, cuda_major, cuda_minor)
+
+    layouts_nhwc = [
+        (LayoutType.TensorNHWC, LayoutType.TensorNC4HW4, 32),
+        (LayoutType.TensorNHWC, LayoutType.TensorNC8HW8, 64),
+        (LayoutType.TensorNHWC, LayoutType.TensorNC16HW16, 128),
+    ]
+
+    dst_layouts_nhwc = [
+        LayoutType.TensorNHWC,
+    ]
+
+    for math_inst in math_instructions:
+        for layout in layouts_nhwc:
+            for dst_layout in dst_layouts_nhwc:
+                dst_type = math_inst.element_b
+                tile_descriptions = [
+                    TileDescription([128, 32, 32], 1, [2, 1, 1], math_inst, min_cc, max_cc),
+                    TileDescription([ 64, 16, 32], 2, [1, 1, 1], math_inst, min_cc, max_cc),
+                ]
+                for tile in tile_descriptions:
+                    dst_align = 32 if tile.threadblock_shape[1] == 16 else 64
+                    operations += GenerateConv2d(ConvKind.Fprop, [tile], layout[0], layout[1],
+                                                 dst_layout, dst_type, min_cc, layout[2], layout[2],
+                                                 dst_align, use_special_optimization,
+                                                 ImplicitGemmMode.GemmTN, False, cuda_major, cuda_minor)
+                    if tile.threadblock_shape[1] == 16 or tile.threadblock_shape[1] == 32:
+                        operations += GenerateConv2d(ConvKind.Fprop, [tile], layout[0], layout[1],
+                                                     dst_layout, dst_type, min_cc, layout[2], layout[2],
+                                                     dst_align, use_special_optimization,
+                                                     ImplicitGemmMode.GemmTN, True, cuda_major, cuda_minor)
+
+    out_dtypes = [DataType.s4, DataType.u4, DataType.f32]
+
+    #INT8x8x4 and INT8x8x32
+    for math_inst in math_instructions:
+        for layout in layouts_nhwc:
+            for dst_layout in dst_layouts_nhwc:
+                for out_dtype in out_dtypes:
+                    tile_descriptions = [
+                        TileDescription([128, 32, 32], 1, [2, 1, 1], math_inst, min_cc, max_cc),
+                        TileDescription([ 64, 16, 32], 2, [1, 1, 1], math_inst, min_cc, max_cc),
+                    ]
+                    for tile in tile_descriptions:
+                        dst_align = 4 * DataTypeSize[out_dtype] if tile.threadblock_shape[1] == 16 \
+                                    or out_dtype == DataType.f32 else 8 * DataTypeSize[out_dtype]
+                        operations += GenerateConv2d(ConvKind.Fprop, [tile], layout[0], layout[1],
+                                                     dst_layout, out_dtype, min_cc, layout[2], layout[2],
+                                                     dst_align, use_special_optimization,
+                                                     ImplicitGemmMode.GemmTN, False, cuda_major, cuda_minor)
+                        if tile.threadblock_shape[1] == 16 or (tile.threadblock_shape[1] == 32
+                                                               and out_dtype != DataType.f32):
+                            operations += GenerateConv2d(ConvKind.Fprop, [tile], layout[0], layout[1],
+                                                         dst_layout, out_dtype, min_cc, layout[2], layout[2],
+                                                         dst_align, use_special_optimization,
+                                                         ImplicitGemmMode.GemmTN, True, cuda_major, cuda_minor)

    return operations
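The dst_align values threaded through these calls are bit counts: elements per vectorized store times the element's bit width from DataTypeSize. A small sketch of the rule used in the INT8x8x4/INT8x8x32 loop above (dtype names as plain strings here, not the generator's DataType enum):

DataTypeSize = {'s4': 4, 'u4': 4, 's8': 8, 'f32': 32}  # bits per element

def nhwc_dst_align(out_dtype, threadblock_n):
    # narrow (N=16) tiles and f32 outputs store 4 elements per access, else 8
    elems = 4 if (threadblock_n == 16 or out_dtype == 'f32') else 8
    return elems * DataTypeSize[out_dtype]

assert nhwc_dst_align('s4', 16) == 16     # 4 x int4 -> 16-bit stores
assert nhwc_dst_align('s8', 32) == 64     # 8 x int8 -> 64-bit stores
assert nhwc_dst_align('f32', 32) == 128   # 4 x f32  -> 128-bit stores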
@@ -281,6 +330,8 @@ def GenerateConv2d_TensorOp_8832(args):
         LayoutType.TensorNC64HW64,
     ]

+    use_special_optimization = SpecialOptimizeDesc.ConvFilterUnity
+
     min_cc = 75
     max_cc = 1024

@@ -298,8 +349,8 @@ def GenerateConv2d_TensorOp_8832(args):
                 TileDescription([128, 64, 64], 1, [2, 1, 1], math_inst, min_cc, max_cc),
             ]
             operations += GenerateConv2d(ConvKind.Fprop, tile_descriptions, layout[0], layout[1],
-                                         dst_layout, dst_type, min_cc, 128, 128, 64,
-                                         False, ImplicitGemmMode.GemmTN, True, cuda_major, cuda_minor)
+                                         dst_layout, dst_type, min_cc, 128, 128, 64,
+                                         use_special_optimization, ImplicitGemmMode.GemmTN, True, cuda_major, cuda_minor)

     layouts_nhwc = [
         (LayoutType.TensorNHWC, LayoutType.TensorNC8HW8, 32),

@@ -316,18 +367,39 @@ def GenerateConv2d_TensorOp_8832(args):
            for dst_layout in dst_layouts_nhwc:
                dst_type = math_inst.element_b
                tile_descriptions = [
                    TileDescription([128, 16, 64], 2, [1, 1, 1], math_inst, min_cc, max_cc),
                    TileDescription([128, 32, 64], 1, [2, 1, 1], math_inst, min_cc, max_cc),
                    TileDescription([128, 64, 64], 1, [2, 1, 1], math_inst, min_cc, max_cc),
                ]
                for tile in tile_descriptions:
-                   operations += GenerateConv2d(ConvKind.Fprop, [tile], layout[0], layout[1],
-                                                dst_layout, dst_type, min_cc, layout[2], layout[2],
-                                                32, False, ImplicitGemmMode.GemmTN, False,
-                                                cuda_major, cuda_minor)
+                   dst_align = 16 if tile.threadblock_shape[1] == 16 else 32
+                   operations += GenerateConv2d(ConvKind.Fprop, [tile], layout[0], layout[1],
+                                                dst_layout, dst_type, min_cc, layout[2], layout[2],
+                                                dst_align, use_special_optimization,
+                                                ImplicitGemmMode.GemmTN, False, cuda_major, cuda_minor)
                    if tile.threadblock_shape[1] == 32 or tile.threadblock_shape[1] == 64:
                        dst_align = 32 if tile.threadblock_shape[1] == 32 else 64
-                       operations += GenerateConv2d(ConvKind.Fprop, [tile], layout[0], layout[1],
-                                                    dst_layout, dst_type, min_cc, layout[2], layout[2],
-                                                    dst_align, False, ImplicitGemmMode.GemmTN, True,
-                                                    cuda_major, cuda_minor)
+                       operations += GenerateConv2d(ConvKind.Fprop, [tile], layout[0], layout[1],
+                                                    dst_layout, dst_type, min_cc, layout[2], layout[2],
+                                                    dst_align, use_special_optimization,
+                                                    ImplicitGemmMode.GemmTN, True, cuda_major, cuda_minor)
+
+   # INT4x4x8
+   for math_inst in math_instructions:
+       for layout in layouts_nhwc:
+           for dst_layout in dst_layouts_nhwc:
+               tile_descriptions = [
+                   TileDescription([128, 16, 64], 2, [1, 1, 1], math_inst, min_cc, max_cc),
+                   TileDescription([128, 32, 64], 1, [2, 1, 1], math_inst, min_cc, max_cc),
+                   TileDescription([128, 64, 64], 1, [2, 1, 1], math_inst, min_cc, max_cc),
+               ]
+               for tile in tile_descriptions:
+                   dst_align = 32 if tile.threadblock_shape[1] == 16 else 64
+                   operations += GenerateConv2d(ConvKind.Fprop, [tile], layout[0], layout[1],
+                                                dst_layout, DataType.s8, min_cc, layout[2], layout[2],
+                                                dst_align, use_special_optimization,
+                                                ImplicitGemmMode.GemmTN, False, cuda_major, cuda_minor)
+                   if tile.threadblock_shape[1] == 32 or tile.threadblock_shape[1] == 64:
+                       dst_align = 64 if tile.threadblock_shape[1] == 32 else 128
+                       operations += GenerateConv2d(ConvKind.Fprop, [tile], layout[0], layout[1],
+                                                    dst_layout, DataType.s8, min_cc, layout[2], layout[2],
+                                                    dst_align, use_special_optimization,
+                                                    ImplicitGemmMode.GemmTN, True, cuda_major, cuda_minor)

    return operations
@@ -354,6 +426,8 @@ def GenerateDeconv_Simt(args):
         DataType.s8,
     ]

+    use_special_optimization = SpecialOptimizeDesc.DeconvDoubleUpsampling
+
     min_cc = 61
     max_cc = 1024

@@ -361,7 +435,6 @@ def GenerateDeconv_Simt(args):
     for layout in layouts:
         for dst_type, dst_layout in zip(dst_types, dst_layouts):
             tile_descriptions = [
                 TileDescription([ 64, 128, 32], 2, [1, 4, 1], math_inst, min_cc, max_cc),
                 TileDescription([ 32, 128, 32], 2, [1, 2, 1], math_inst, min_cc, max_cc),
-                TileDescription([ 16, 128, 16], 2, [1, 2, 1], math_inst, min_cc, max_cc),
                 TileDescription([ 16, 128, 16], 1, [1, 1, 1], math_inst, min_cc, max_cc),

@@ -369,7 +442,7 @@ def GenerateDeconv_Simt(args):
             ]
             operations += GenerateConv2d(ConvKind.Dgrad, tile_descriptions, layout[0], layout[1],
-                                         dst_layout, dst_type, min_cc, 32, 32, 32, True)
+                                         dst_layout, dst_type, min_cc, 32, 32, 32, use_special_optimization)
     return operations
 ################################################################################
dnn/scripts/cutlass_generator/library.py @ 11f022ff

@@ -562,6 +562,24 @@ StrideSupportNames = {
     StrideSupport.Unity: 'unity_stride',
 }

+class SpecialOptimizeDesc(enum.Enum):
+    NoneSpecialOpt = enum_auto()
+    ConvFilterUnity = enum_auto()
+    DeconvDoubleUpsampling = enum_auto()
+
+SpecialOptimizeDescNames = {
+    SpecialOptimizeDesc.NoneSpecialOpt: 'none',
+    SpecialOptimizeDesc.ConvFilterUnity: 'conv_filter_unity',
+    SpecialOptimizeDesc.DeconvDoubleUpsampling: 'deconv_double_upsampling',
+}
+
+SpecialOptimizeDescTag = {
+    SpecialOptimizeDesc.NoneSpecialOpt: 'cutlass::conv::SpecialOptimizeDesc::NONE',
+    SpecialOptimizeDesc.ConvFilterUnity: 'cutlass::conv::SpecialOptimizeDesc::CONV_FILTER_UNITY',
+    SpecialOptimizeDesc.DeconvDoubleUpsampling: 'cutlass::conv::SpecialOptimizeDesc::DECONV_DOUBLE_UPSAMPLING',
+}
+
 class ImplicitGemmMode(enum.Enum):
     GemmNT = enum_auto()
     GemmTN = enum_auto()
dnn/scripts/cutlass_generator/list.bzl @ 11f022ff

This diff is suppressed by .gitattributes.
dnn/src/common/convolution.cpp @ 11f022ff

@@ -553,7 +553,10 @@ void ConvolutionBase<Parameter>::check_or_deduce_dtype_fwd(DType src,
             dst.valid() &&
             (dst.enumv() == src.enumv() ||
              ((dst.enumv() == DTypeEnum::QuantizedS4 ||
                dst.enumv() == DTypeEnum::Quantized4Asymm) &&
-              src.enumv() == DTypeEnum::QuantizedS8));
+              src.enumv() == DTypeEnum::QuantizedS8) ||
+             ((src.enumv() == DTypeEnum::QuantizedS4 ||
+               src.enumv() == DTypeEnum::Quantized4Asymm) &&
+              dst.enumv() == DTypeEnum::QuantizedS8));
     if (cond_dst) {
         supported_dst_dtype.push_back(dst);
     }
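Read as a predicate, the condition now accepts same-dtype output plus both directions of the int8/int4 fused type conversion from the commit title. A hedged sketch with dtype names as strings (the real check uses DTypeEnum values):

def dst_dtype_ok(src, dst):
    if dst == src:
        return True
    if dst in ('QuantizedS4', 'Quantized4Asymm') and src == 'QuantizedS8':
        return True
    if src in ('QuantizedS4', 'Quantized4Asymm') and dst == 'QuantizedS8':
        return True
    return False

assert dst_dtype_ok('QuantizedS8', 'QuantizedS4')      # conv + typecvt down
assert dst_dtype_ok('Quantized4Asymm', 'QuantizedS8')  # conv + typecvt up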
dnn/src/cuda/conv_bias/algo.cpp @ 11f022ff

@@ -71,6 +71,9 @@ ConvBiasForwardImpl::AlgoPack::AlgoPack() {
     for (auto&& algo : int8_nchw32_imma) {
         all_algos.push_back(&algo);
     }
+    for (auto&& algo : int8_nhwc_imma) {
+        all_algos.push_back(&algo);
+    }
     for (auto&& algo : int4_int4_nchw64_imma) {
         all_algos.push_back(&algo);
     }

@@ -236,7 +239,21 @@ void ConvBiasForwardImpl::AlgoPack::fill_imma_algos() {
         int8_nchw32_imma.emplace_back(
                 AlgoParam{32, 128, 32, 32, 64, 32, 8, 8, 16, 1});
     }
+    {
+        using AlgoParam = AlgoInt8NHWCIMMAImplicitGemm::AlgoParam;
+        int8_nhwc_imma.emplace_back(
+                AlgoParam{64, 16, 32, 64, 16, 32, 8, 8, 16, 2, 16});
+        int8_nhwc_imma.emplace_back(
+                AlgoParam{64, 16, 32, 64, 16, 32, 8, 8, 16, 2, 8});
+        int8_nhwc_imma.emplace_back(
+                AlgoParam{64, 16, 32, 64, 16, 32, 8, 8, 16, 2, 4});
+        int8_nhwc_imma.emplace_back(
+                AlgoParam{128, 32, 32, 64, 32, 32, 8, 8, 16, 1, 16});
+        int8_nhwc_imma.emplace_back(
+                AlgoParam{128, 32, 32, 64, 32, 32, 8, 8, 16, 1, 8});
+        int8_nhwc_imma.emplace_back(
+                AlgoParam{128, 32, 32, 64, 32, 32, 8, 8, 16, 1, 4});
+    }
     {
         using AlgoParam = AlgoInt4Int4NCHW64IMMAImplicitGemm::AlgoParam;
         int4_int4_nchw64_imma.emplace_back(

@@ -261,6 +278,12 @@ void ConvBiasForwardImpl::AlgoPack::fill_imma_algos() {
     }
     {
         using AlgoParam = AlgoInt4Int4NHWCIMMAImplicitGemm::AlgoParam;
+        int4_int4_nhwc_imma.emplace_back(
+                AlgoParam{128, 16, 64, 128, 16, 64, 8, 8, 32, 2, 32});
+        int4_int4_nhwc_imma.emplace_back(
+                AlgoParam{128, 16, 64, 128, 16, 64, 8, 8, 32, 2, 16});
+        int4_int4_nhwc_imma.emplace_back(
+                AlgoParam{128, 16, 64, 128, 16, 64, 8, 8, 32, 2, 8});
         int4_int4_nhwc_imma.emplace_back(
                 AlgoParam{128, 32, 64, 64, 32, 64, 8, 8, 32, 1, 32});
         int4_int4_nhwc_imma.emplace_back(

@@ -276,6 +299,12 @@ void ConvBiasForwardImpl::AlgoPack::fill_imma_algos() {
     }
     {
         using AlgoParam = AlgoUInt4Int4NHWCIMMAImplicitGemm::AlgoParam;
+        uint4_int4_nhwc_imma.emplace_back(
+                AlgoParam{128, 16, 64, 128, 16, 64, 8, 8, 32, 2, 32});
+        uint4_int4_nhwc_imma.emplace_back(
+                AlgoParam{128, 16, 64, 128, 16, 64, 8, 8, 32, 2, 16});
+        uint4_int4_nhwc_imma.emplace_back(
+                AlgoParam{128, 16, 64, 128, 16, 64, 8, 8, 32, 2, 8});
         uint4_int4_nhwc_imma.emplace_back(
                 AlgoParam{128, 32, 64, 64, 32, 64, 8, 8, 32, 1, 32});
         uint4_int4_nhwc_imma.emplace_back(
dnn/src/cuda/conv_bias/algo.h @ 11f022ff

@@ -72,6 +72,7 @@ public:
         CUDA_IMPLICIT_GEMM_REORDER_FILTER_CHWN4_IMMA_INT8,
         CUDA_IMPLICIT_GEMM_UNROLL_WIDTH_CHWN4_IMMA_INT8,
         CUDA_IMPLICIT_GEMM_IMMA_NCHW32_INT8,
+        CUDA_IMPLICIT_GEMM_IMMA_NHWC_INT8,
        CUDA_IMPLICIT_GEMM_IMMA_NCHW64_INT4_INT4,
        CUDA_IMPLICIT_GEMM_IMMA_NCHW64_UINT4_INT4,
        CUDA_IMPLICIT_GEMM_IMMA_NHWC_INT4_INT4,

@@ -524,6 +525,7 @@ public:
  * +
  * +--- AlgoInt8NCHW4DotProdImplicitGemm
  * +--- AlgoInt8NCHW32IMMAImplicitGemm
+ * +--- AlgoInt8NHWCIMMAImplicitGemm
  * +
  * +--- AlgoInt4NCHW64IMMAImplicitGemmBase
  * +----+--- AlgoInt4Int4NCHW64IMMAImplicitGemm

@@ -582,7 +584,7 @@ public:
     // operation (cutlass kernel) from the global OperationTable
     const cutlass::library::Operation* get_cutlass_conv_op(
             const SizeArgs& args, ConvOperator conv_op, ConvType conv_type,
-            bool load_from_const, bool without_shared_load) const;
+            bool use_conv_filter_unity_opt, bool without_shared_load) const;

     // execute the cutlass kernel found by get_cutlass_conv_op. we give
     // subclasses full freedom to decide where and how these arguments are

@@ -829,6 +831,47 @@ private:
     std::string m_name;
 };

+class ConvBiasForwardImpl::AlgoInt8NHWCIMMAImplicitGemm final
+        : public AlgoCutlassConvolutionBase {
+public:
+    AlgoInt8NHWCIMMAImplicitGemm(AlgoParam algo_param)
+            : AlgoCutlassConvolutionBase(algo_param) {
+        m_name = ConvBias::algo_name<ConvBias::DirectParam>(
+                ssprintf("INT8_NHWC_IMMA_IMPLICIT_GEMM_%s",
+                         to_string(m_algo_param).c_str()),
+                ConvBias::DirectParam{});
+    }
+    bool is_available(const SizeArgs& args) const override;
+    size_t get_workspace_in_bytes(const SizeArgs& args) const override;
+    void exec(const ExecArgs& args) const override;
+    const char* name() const override { return m_name.c_str(); }
+    AlgoAttribute attribute() const override {
+        return AlgoAttribute::REPRODUCIBLE;
+    }
+    static std::string to_string(AlgoParam algo_param);
+    size_t get_preprocess_workspace_in_bytes(
+            const SizeArgs& args) const override;
+    SmallVector<TensorLayout> deduce_preprocessed_filter_layout(
+            const SizeArgs& args) const override;
+    void exec_preprocess(const ExecArgs& args) const override;
+    MEGDNN_DECL_ALGO_TYPE(CUDA_IMPLICIT_GEMM_IMMA_NHWC_INT8)
+
+    std::string param() const override {
+        std::string ret;
+        serialize_write_pod(m_algo_param, ret);
+        return ret;
+    }
+
+private:
+    std::tuple<float, float, float, float, float> get_constants(
+            const ExecArgs& args) const;
+
+    void reorder_filter(const ExecArgs& args, int interleaved,
+                        void* reordered_filter) const;
+
+    std::string m_name;
+};
+
 class ConvBiasForwardImpl::AlgoInt4NCHW64IMMAImplicitGemmBase
         : public AlgoCutlassConvolutionBase {
 public:

@@ -1087,6 +1130,7 @@ public:
 #endif
 #if CUDA_VERSION >= 10020
     std::vector<AlgoInt8NCHW32IMMAImplicitGemm> int8_nchw32_imma;
+    std::vector<AlgoInt8NHWCIMMAImplicitGemm> int8_nhwc_imma;
     std::vector<AlgoInt4Int4NCHW64IMMAImplicitGemm> int4_int4_nchw64_imma;
     std::vector<AlgoUInt4Int4NCHW64IMMAImplicitGemm> uint4_int4_nchw64_imma;
     std::vector<AlgoInt4Int4NHWCIMMAImplicitGemm> int4_int4_nhwc_imma;
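The constructor above stitches the algorithm name out of the AlgoParam fields. A sketch of the resulting string, assuming the argument order matches to_string() in the new implicit_gemm_int8_nhwc_imma.cpp below:

def algo_name(tb_m, tb_n, tb_k, w_m, w_n, w_k, stage, access_size):
    param = "%dX%dX%d_%dX%dX%d_%d_%d" % (tb_m, tb_n, tb_k,
                                         w_m, w_n, w_k, stage, access_size)
    return "INT8_NHWC_IMMA_IMPLICIT_GEMM_" + param

# corresponds to one of the AlgoParam entries registered in algo.cpp above
assert algo_name(128, 32, 32, 64, 32, 32, 1, 16) == \
    "INT8_NHWC_IMMA_IMPLICIT_GEMM_128X32X32_64X32X32_1_16"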
dnn/src/cuda/conv_bias/cutlass_convolution_base.cpp @ 11f022ff

@@ -140,6 +140,11 @@ LayoutPack get_layout_pack(const param::ConvBias::Format format,
                     LayoutTypeID::kTensorNC64HW64};
         case Format::NHWC:
             switch (access_type) {
+                case 4:
+                    return {LayoutTypeID::kTensorNHWC,
+                            LayoutTypeID::kTensorNC4HW4,
+                            LayoutTypeID::kTensorNHWC,
+                            LayoutTypeID::kTensorNHWC};
                 case 8:
                     return {LayoutTypeID::kTensorNHWC,
                             LayoutTypeID::kTensorNC8HW8,

@@ -192,12 +197,18 @@ EpilogueType get_epilogue_type(const param::ConvBias::NonlineMode mode,
 const Operation*
 ConvBiasForwardImpl::AlgoCutlassConvolutionBase::get_cutlass_conv_op(
         const SizeArgs& args, ConvOperator conv_op, ConvType conv_type,
-        bool load_from_const, bool without_shared_load) const {
-    using Format = param::ConvBias::Format;
+        bool use_conv_filter_unity_opt, bool without_shared_load) const {
     auto&& param = args.opr->param();
     auto layouts = get_layout_pack(param.format, m_algo_param.access_size);
-    auto epilogue_type = get_epilogue_type(param.nonlineMode,
-                                           param.format != Format::NCHW4_NCHW);
+    auto epilogue_type = get_epilogue_type(
+            param.nonlineMode,
+            args.dst_layout->dtype.enumv() != DTypeEnum::Float32);
+
+    cutlass::conv::SpecialOptimizeDesc special_optimization =
+            (use_conv_filter_unity_opt)
+                    ? cutlass::conv::SpecialOptimizeDesc::CONV_FILTER_UNITY
+                    : cutlass::conv::SpecialOptimizeDesc::NONE;

     ConvolutionKey key{convert_conv_op(conv_op),
                        convert_dtype(args.src_layout->dtype.enumv()),
                        layouts.src,

@@ -219,7 +230,7 @@ ConvBiasForwardImpl::AlgoCutlassConvolutionBase::get_cutlass_conv_op(
                        m_algo_param.instruction_k,
                        epilogue_type,
                        m_algo_param.stage,
-                       load_from_const,
+                       special_optimization,
                        without_shared_load};

     return Singleton::get().operation_table.find_op(key);
dnn/src/cuda/conv_bias/cutlass_reorder_filter.cu @ 11f022ff

@@ -144,28 +144,48 @@ void megdnn::cuda::cutlass_wrapper::reorder_ncxhwx_imma_filter(
                     IC, FH, FW, trans_oc);
     after_kernel_launch();
 }

-template <uint32_t size_bits, uint32_t alignbits>
+template <uint32_t size_bits>
 void megdnn::cuda::cutlass_wrapper::reorder_nhwc_imma_filter(
         int8_t* dst_filter, const int8_t* src_filter, uint32_t OC, uint32_t IC,
-        uint32_t FH, uint32_t FW, bool trans_oc, uint32_t oc_interleaved,
-        cudaStream_t stream) {
-    static constexpr uint32_t elements_per_access = alignbits / size_bits;
-    uint32_t nr_threads =
-            query_blocksize_for_kernel(reinterpret_cast<const void*>(
-                    reorder_nhwc_imma_filter_kernel<size_bits, alignbits, 32>));
+        uint32_t FH, uint32_t FW, bool trans_oc, uint32_t alignbits,
+        uint32_t interleaved, cudaStream_t stream) {
+    const uint32_t elements_per_access = alignbits / size_bits;
+
+    void (*kern)(int8_t* __restrict__, const int8_t* __restrict__, uint32_t,
+                 uint32_t, uint32_t, uint32_t, bool);
+    kern = nullptr;
+    auto get_kern = [&kern](const uint32_t alignbits,
+                            const uint32_t interleaved) {
+#define DISPATCH_KERNEL(alignbits_, interleaved_)                          \
+    if (alignbits == alignbits_ && interleaved == interleaved_) {          \
+        kern = reorder_nhwc_imma_filter_kernel<size_bits, alignbits_,      \
+                                               interleaved_>;              \
+        return;                                                            \
+    }
+        DISPATCH_KERNEL(128, 16);
+        DISPATCH_KERNEL(64, 16);
+        DISPATCH_KERNEL(32, 16);
+        DISPATCH_KERNEL(128, 32);
+        DISPATCH_KERNEL(64, 32);
+        DISPATCH_KERNEL(32, 32);
+        DISPATCH_KERNEL(128, 64);
+        DISPATCH_KERNEL(64, 64);
+        DISPATCH_KERNEL(32, 64);
+#undef DISPATCH_KERNEL
+    };
+    get_kern(alignbits, interleaved);
+
+    uint32_t nr_threads = query_blocksize_for_kernel(kern);
     uint32_t vthreads = DIVUP(OC * IC * FH * FW, elements_per_access);
     nr_threads = std::min(nr_threads, vthreads);
     uint32_t nr_blocks = DIVUP(vthreads, nr_threads);

-    if (oc_interleaved == 32) {
-        reorder_nhwc_imma_filter_kernel<size_bits, alignbits, 32>
-                <<<nr_blocks, nr_threads, 0, stream>>>(
-                        dst_filter, src_filter, OC, IC, FH, FW, trans_oc);
-    } else {
-        reorder_nhwc_imma_filter_kernel<size_bits, alignbits, 64>
-                <<<nr_blocks, nr_threads, 0, stream>>>(
-                        dst_filter, src_filter, OC, IC, FH, FW, trans_oc);
-    }
+    kern<<<nr_blocks, nr_threads, 0, stream>>>(dst_filter, src_filter, OC, IC,
+                                               FH, FW, trans_oc);

     after_kernel_launch();
 }

@@ -180,15 +200,14 @@ INST(8, 32)
 INST(4, 64)
 #undef INST

-#define INST(_size_bits, _alignbits)                                       \
-    template void megdnn::cuda::cutlass_wrapper::reorder_nhwc_imma_filter< \
-            _size_bits, _alignbits>(                                       \
+#define INST(_size_bits)                                                   \
+    template void                                                          \
+    megdnn::cuda::cutlass_wrapper::reorder_nhwc_imma_filter<_size_bits>(   \
             int8_t * dst_filter, const int8_t* src_filter, uint32_t OC,    \
             uint32_t IC, uint32_t FH, uint32_t FW, bool trans_oc,          \
-            uint32_t oc_interleaved, cudaStream_t stream);
-INST(4, 32)
-INST(4, 64)
-INST(4, 128)
+            uint32_t alignbits, uint32_t interleaved, cudaStream_t stream);
+INST(4)
+INST(8)
 #undef INST

 // vim: syntax=cuda.doxygen
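The dispatch above selects a template instantiation at run time from the pair (alignbits, interleaved); alignbits is the bit width of one vectorized load, so the element count per access falls out directly. A tiny sketch of that arithmetic:

def elements_per_access(size_bits, alignbits):
    # e.g. int4 elements (size_bits=4) with 128-bit loads: 32 per access
    return alignbits // size_bits

assert elements_per_access(4, 128) == 32  # int4 filter, 128-bit vector loads
assert elements_per_access(8, 64) == 8    # int8 filter, 64-bit vector loads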
dnn/src/cuda/conv_bias/cutlass_reorder_filter.cuh @ 11f022ff

@@ -23,11 +23,11 @@ void reorder_ncxhwx_imma_filter(int8_t* dst_filter, const int8_t* src_filter,
                                 uint32_t FW, bool trans_oc,
                                 cudaStream_t stream);

-template <uint32_t size_bits, uint32_t alignbits>
+template <uint32_t size_bits>
 void reorder_nhwc_imma_filter(int8_t* dst_filter, const int8_t* src_filter,
                               uint32_t OC, uint32_t IC, uint32_t FH,
-                              uint32_t FW, bool trans_oc,
-                              uint32_t oc_interleaved, cudaStream_t stream);
+                              uint32_t FW, bool trans_oc, uint32_t alignbits,
+                              uint32_t interleaved, cudaStream_t stream);
 }  // namespace cutlass_wrapper
 }  // namespace cuda
 }  // namespace megdnn
dnn/src/cuda/conv_bias/implicit_gemm_int4_int4_nhwc_imma.cpp @ 11f022ff

@@ -68,13 +68,27 @@ ConvBiasForwardImpl::AlgoInt4Int4NHWCIMMAImplicitGemm::get_constants(
           args.filter_layout->dtype.param<dtype::QuantizedS4>().scale,
           bias_scale =
                   args.bias_layout->dtype.param<dtype::QuantizedS32>().scale,
-          dst_scale = args.dst_layout->dtype.param<dtype::QuantizedS4>().scale;
+          dst_scale;
+
+    if (args.dst_layout->dtype.enumv() == DTypeEnum::QuantizedS4) {
+        dst_scale = args.dst_layout->dtype.param<dtype::QuantizedS4>().scale;
+    } else {  // DTypeEnum::QuantizedS8
+        megdnn_assert(args.dst_layout->dtype.enumv() == DTypeEnum::QuantizedS8);
+        dst_scale = args.dst_layout->dtype.param<dtype::QuantizedS8>().scale;
+    }

     float alpha = src_scale * filter_scale / dst_scale,
           beta = bias_scale / dst_scale, gamma = 0.f, delta = 0.f, theta = 0.f;

     if (args.z_layout->ndim > 0) {
-        float z_scale = args.z_layout->dtype.param<dtype::QuantizedS4>().scale;
+        float z_scale;
+        if (args.z_layout->dtype.enumv() == DTypeEnum::QuantizedS4) {
+            z_scale = args.z_layout->dtype.param<dtype::QuantizedS4>().scale;
+        } else {  // DTypeEnum::QuantizedS8
+            megdnn_assert(args.z_layout->dtype.enumv() ==
+                          DTypeEnum::QuantizedS8);
+            z_scale = args.z_layout->dtype.param<dtype::QuantizedS8>().scale;
+        }
         gamma = z_scale / dst_scale;
     }
dnn/src/cuda/conv_bias/implicit_gemm_int4_nchw64_imma_base.cpp @ 11f022ff

@@ -76,6 +76,14 @@ bool ConvBiasForwardImpl::AlgoInt4NCHW64IMMAImplicitGemmBase::is_available(
     if (fh * fw > kMaxFilterPixels)
         return false;

+    bool use_conv_filter_unity_opt = (fh == 1 && fw == 1);
+    bool without_shared_load = true;
+
+    const auto* op = get_cutlass_conv_op(args, ConvOperator::kFprop,
+                                         ConvType::kConvolution,
+                                         use_conv_filter_unity_opt,
+                                         without_shared_load);
+    if (op == nullptr)
+        return false;
+
     return true;
 }

@@ -110,7 +118,7 @@ void ConvBiasForwardImpl::AlgoInt4NCHW64IMMAImplicitGemmBase::exec(
     float dst_scale = 0.f;
     float threshold = 0.f;
     uint8_t src_zero = 0;
-    bool load_from_const = !(fh == 1 && fw == 1);
+    bool use_conv_filter_unity_opt = (fh == 1 && fw == 1);
     bool without_shared_load = true;

     if (args.dst_layout->dtype.enumv() == DTypeEnum::Quantized4Asymm) {

@@ -126,7 +134,7 @@ void ConvBiasForwardImpl::AlgoInt4NCHW64IMMAImplicitGemmBase::exec(
     const auto* op = get_cutlass_conv_op(args, ConvOperator::kFprop,
                                          ConvType::kConvolution,
-                                         load_from_const, without_shared_load);
+                                         use_conv_filter_unity_opt,
+                                         without_shared_load);

     execute_cutlass_conv_op(op, args.src_tensor->raw_ptr, filter_ptr, bias_ptr,
                             z_ptr, args.dst_tensor->raw_ptr, nullptr, n, hi, wi,
dnn/src/cuda/conv_bias/implicit_gemm_int4_nhwc_imma_base.cpp @ 11f022ff

@@ -56,8 +56,11 @@ bool ConvBiasForwardImpl::AlgoInt4NHWCIMMAImplicitGemmBase::is_available(
     if (args.src_layout->dtype.enumv() != src_dtype() ||
         args.filter_layout->dtype.enumv() != DTypeEnum::QuantizedS4 ||
-        args.bias_layout->dtype.enumv() != DTypeEnum::QuantizedS32 ||
-        args.dst_layout->dtype.enumv() != src_dtype())
+        args.bias_layout->dtype.enumv() != DTypeEnum::QuantizedS32)
+        return false;
+
+    if (!(args.dst_layout->dtype.enumv() == src_dtype() ||
+          args.dst_layout->dtype.enumv() == DTypeEnum::QuantizedS8))
         return false;

     // uint4 do not support H_SWISH activition

@@ -83,6 +86,16 @@ bool ConvBiasForwardImpl::AlgoInt4NHWCIMMAImplicitGemmBase::is_available(
     if ((co % 8 != 0) || (ci % m_algo_param.access_size != 0))
         return false;

+    bool use_conv_filter_unity_opt = (fh == 1 && fw == 1);
+    bool without_shared_load = ((co % m_algo_param.threadblock_n == 0) &&
+                                (m_algo_param.threadblock_n == 32 ||
+                                 m_algo_param.threadblock_n == 64));
+
+    const auto* op = get_cutlass_conv_op(args, ConvOperator::kFprop,
+                                         ConvType::kConvolution,
+                                         use_conv_filter_unity_opt,
+                                         without_shared_load);
+    if (op == nullptr)
+        return false;
+
     return true;
 }

@@ -117,26 +130,31 @@ void ConvBiasForwardImpl::AlgoInt4NHWCIMMAImplicitGemmBase::exec(
     float dst_scale = 0.f;
     float threshold = 0.f;
     uint8_t src_zero = 0;
-    bool load_from_const = !(fh == 1 && fw == 1);
+    bool use_conv_filter_unity_opt = (fh == 1 && fw == 1);
     bool without_shared_load = ((co % m_algo_param.threadblock_n == 0) &&
                                 (m_algo_param.threadblock_n == 32 ||
                                  m_algo_param.threadblock_n == 64));

+    if (args.src_layout->dtype.enumv() == DTypeEnum::Quantized4Asymm) {
+        src_zero = args.src_layout->dtype.param<dtype::Quantized4Asymm>()
+                           .zero_point;
+    }
+
     if (args.dst_layout->dtype.enumv() == DTypeEnum::Quantized4Asymm) {
         dst_scale =
                 args.dst_layout->dtype.param<dtype::Quantized4Asymm>().scale;
-        src_zero = args.src_layout->dtype.param<dtype::Quantized4Asymm>()
-                           .zero_point;
-    } else {  // DTypeEnum::QuantizedS4
+    } else if (args.dst_layout->dtype.enumv() == DTypeEnum::QuantizedS4) {
         dst_scale = args.dst_layout->dtype.param<dtype::QuantizedS4>().scale;
+    } else {  // DTypeEnum::QuantizedS8
+        dst_scale = args.dst_layout->dtype.param<dtype::QuantizedS8>().scale;
     }

     cudaStream_t stream = cuda_stream(args.opr->handle());

     const auto* op = get_cutlass_conv_op(args, ConvOperator::kFprop,
                                          ConvType::kConvolution,
-                                         load_from_const, without_shared_load);
+                                         use_conv_filter_unity_opt,
+                                         without_shared_load);

     execute_cutlass_conv_op(op, args.src_tensor->raw_ptr, filter_ptr, bias_ptr,
                             z_ptr, args.dst_tensor->raw_ptr, nullptr, n, hi, wi,

@@ -166,29 +184,18 @@ void ConvBiasForwardImpl::AlgoInt4NHWCIMMAImplicitGemmBase::reorder_filter(
     cudaStream_t stream = cuda_stream(args.opr->handle());

     // reformat filter from nhwc to ncxhwx and reorder oc
-    // use trans_oc threadblock_n must be 32 or 64
+    // use trans_oc threadblock_n must be 32 or 64 and src dtype == dest dtype
     bool trans_oc = ((co % m_algo_param.threadblock_n == 0) &&
                      (m_algo_param.threadblock_n == 32 ||
                       m_algo_param.threadblock_n == 64));
-    uint32_t oc_iterleave = (m_algo_param.threadblock_n == 64) ? 64 : 32;
+    uint32_t oc_iterleaved = (m_algo_param.threadblock_n == 64) ? 64 : 32;

-    if (iterleaved == 8) {
-        cutlass_wrapper::reorder_nhwc_imma_filter<4, 32>(
-                reinterpret_cast<int8_t*>(reordered_filter),
-                reinterpret_cast<int8_t*>(args.filter_tensor->raw_ptr), co, ci,
-                fh, fw, trans_oc, oc_iterleave, stream);
-    } else if (iterleaved == 16) {
-        cutlass_wrapper::reorder_nhwc_imma_filter<4, 64>(
-                reinterpret_cast<int8_t*>(reordered_filter),
-                reinterpret_cast<int8_t*>(args.filter_tensor->raw_ptr), co, ci,
-                fh, fw, trans_oc, oc_iterleave, stream);
-    } else {
-        megdnn_assert(iterleaved == 32);
-        cutlass_wrapper::reorder_nhwc_imma_filter<4, 128>(
-                reinterpret_cast<int8_t*>(reordered_filter),
-                reinterpret_cast<int8_t*>(args.filter_tensor->raw_ptr), co, ci,
-                fh, fw, trans_oc, oc_iterleave, stream);
-    }
+    uint32_t alignbits = iterleaved * 4;
+
+    cutlass_wrapper::reorder_nhwc_imma_filter<4>(
+            reinterpret_cast<int8_t*>(reordered_filter),
+            reinterpret_cast<int8_t*>(args.filter_tensor->raw_ptr), co, ci, fh,
+            fw, trans_oc, alignbits, oc_iterleaved, stream);
 }
 #endif
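A compact restatement of the gating logic that recurs in is_available(), exec() and reorder_filter() above (the helper name is ours, not MegEngine API): the shared-load-free fast path for the int4 NHWC case needs the output channel count to tile evenly into a 32- or 64-wide threadblock N.

def without_shared_load_int4_nhwc(co, threadblock_n):
    return co % threadblock_n == 0 and threadblock_n in (32, 64)

assert without_shared_load_int4_nhwc(64, 32)
assert not without_shared_load_int4_nhwc(40, 32)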
dnn/src/cuda/conv_bias/implicit_gemm_int8_nchw32_imma.cpp @ 11f022ff

@@ -77,6 +77,14 @@ bool ConvBiasForwardImpl::AlgoInt8NCHW32IMMAImplicitGemm::is_available(
     // FIXME: too large filter size is not supported now
     size_t kMaxFilterPixels = 848 / (2 * m_algo_param.warp_k / 32) - 2;
     available &= fh * fw <= kMaxFilterPixels;

+    bool use_conv_filter_unity_opt = (fh == 1 && fw == 1);
+    bool without_shared_load = (param.format == Format::NCHW32);
+
+    const auto* op = get_cutlass_conv_op(args, ConvOperator::kFprop,
+                                         ConvType::kConvolution,
+                                         use_conv_filter_unity_opt,
+                                         without_shared_load);
+    available &= (op != nullptr);
+
     return available;
 }

@@ -155,12 +163,12 @@ void ConvBiasForwardImpl::AlgoInt8NCHW32IMMAImplicitGemm::exec(
         gamma = z_scale / dst_scale;
     }
     float delta = 0.f, theta = 0.f, threshold = 0.f;

-    bool load_from_const = !(fh == 1 && fw == 1);
+    bool use_conv_filter_unity_opt = (fh == 1 && fw == 1);
     bool without_shared_load = (param.format == Format::NCHW32);

     const auto* op = get_cutlass_conv_op(args, ConvOperator::kFprop,
                                          ConvType::kConvolution,
-                                         load_from_const, without_shared_load);
+                                         use_conv_filter_unity_opt,
+                                         without_shared_load);

     execute_cutlass_conv_op(op, args.src_tensor->raw_ptr, filter_ptr,
                             args.bias_tensor->raw_ptr,
dnn/src/cuda/conv_bias/implicit_gemm_int8_nchw4_dp4a.cpp @ 11f022ff

@@ -98,7 +98,14 @@ bool ConvBiasForwardImpl::AlgoInt8NCHW4DotProdImplicitGemm::is_available(
     // FIXME: too large filter size is not supported now
     size_t kMaxFilterPixels = 848 / (2 * m_algo_param.warp_k / 4) - 2;
     available &= fh * fw <= kMaxFilterPixels;

+    bool use_conv_filter_unity_opt = (fh == 1 && fw == 1);
+    bool without_shared_load = false;
+
+    const auto* op = get_cutlass_conv_op(args, ConvOperator::kFprop,
+                                         ConvType::kConvolution,
+                                         use_conv_filter_unity_opt,
+                                         without_shared_load);
+    available &= (op != nullptr);
+
     return available;
 }

@@ -213,12 +220,12 @@ void ConvBiasForwardImpl::AlgoInt8NCHW4DotProdImplicitGemm::exec(
         }
     }
     float threshold = 0.f;
-    bool load_from_const = !(fh == 1 && fw == 1);
+    bool use_conv_filter_unity_opt = (fh == 1 && fw == 1);
     bool without_shared_load = false;

-    const auto* op = get_cutlass_conv_op(args, ConvOperator::kFprop,
-                                         ConvType::kConvolution,
-                                         load_from_const, without_shared_load);
+    const auto* op = get_cutlass_conv_op(args, ConvOperator::kFprop,
+                                         ConvType::kConvolution,
+                                         use_conv_filter_unity_opt,
+                                         without_shared_load);

     execute_cutlass_conv_op(op, args.src_tensor->raw_ptr, filter_ptr,
                             args.bias_tensor->raw_ptr,
dnn/src/cuda/conv_bias/implicit_gemm_int8_nhwc_imma.cpp @ 11f022ff (new file, mode 100644)

/**
 * \file dnn/src/cuda/conv_bias/implicit_gemm_int8_nhwc_imma.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
 * or implied.
 */

#include "src/common/conv_bias.h"
#include "src/cuda/conv_bias/algo.h"
#include "src/cuda/conv_bias/cutlass_reorder_filter.cuh"
#include "src/cuda/convolution_helper/parameter.cuh"
#include "src/cuda/utils.h"

using namespace megdnn;
using namespace cuda;
using namespace convolution;

#if CUDA_VERSION >= 10020
bool ConvBiasForwardImpl::AlgoInt8NHWCIMMAImplicitGemm::is_available(
        const SizeArgs& args) const {
    if (args.bias_layout->ndim <= 0)
        return false;

    using Param = param::ConvBias;
    using Format = Param::Format;
    using Sparse = Param::Sparse;
    using Mode = Param::Mode;
    using NonlineMode = megdnn::param::ConvBias::NonlineMode;

    auto&& param = args.opr->param();

    if (!check_bias_share_in_channel(*(args.bias_layout), param.format))
        return false;

    if (param.format != Format::NHWC || param.sparse != Sparse::DENSE ||
        param.mode != Mode::CROSS_CORRELATION)
        return false;

    if (param.nonlineMode != NonlineMode::IDENTITY &&
        param.nonlineMode != NonlineMode::RELU &&
        param.nonlineMode != NonlineMode::H_SWISH)
        return false;

    if (args.src_layout->dtype.enumv() != DTypeEnum::QuantizedS8 ||
        args.filter_layout->dtype.enumv() != DTypeEnum::QuantizedS8)
        return false;

    auto dst_dtype = args.dst_layout->dtype.enumv();

    if (!(dst_dtype == DTypeEnum::QuantizedS8 ||
          dst_dtype == DTypeEnum::QuantizedS4 ||
          dst_dtype == DTypeEnum::Quantized4Asymm ||
          dst_dtype == DTypeEnum::Float32))
        return false;

    if (!(args.bias_layout->dtype.enumv() == DTypeEnum::QuantizedS32 ||
          (args.bias_layout->dtype.enumv() == DTypeEnum::Float32 &&
           dst_dtype == DTypeEnum::Float32)))
        return false;

    if (!is_compute_capability_required(7, 5))
        return false;

    size_t co = args.filter_layout->operator[](0),
           ci = args.filter_layout->operator[](3),
           fh = args.filter_layout->operator[](1),
           fw = args.filter_layout->operator[](2);

    // param buffer size is 4K, use 3.4K to store precomputed offset
    size_t kMaxFilterPixels =
            848 / (m_algo_param.warp_k / m_algo_param.access_size) - 1;
    if (fh * fw > kMaxFilterPixels)
        return false;
    // co should be aligned with 4, and ci should be aligned with
    // algo_param.access_size
    if ((co % 4 != 0) || (ci % m_algo_param.access_size != 0))
        return false;

    bool use_conv_filter_unity_opt = (fh == 1 && fw == 1);
    bool without_shared_load = ((co % m_algo_param.threadblock_n == 0) &&
                                (m_algo_param.threadblock_n == 16 ||
                                 (m_algo_param.threadblock_n == 32 &&
                                  dst_dtype != DTypeEnum::Float32)));

    const auto* op = get_cutlass_conv_op(args, ConvOperator::kFprop,
                                         ConvType::kConvolution,
                                         use_conv_filter_unity_opt,
                                         without_shared_load);
    if (op == nullptr)
        return false;

    return true;
}

size_t
ConvBiasForwardImpl::AlgoInt8NHWCIMMAImplicitGemm::get_workspace_in_bytes(
        const SizeArgs& args) const {
    if (args.preprocessed_filter) {
        return 0;
    } else {
        return args.filter_layout->span().dist_byte();
    }
}

size_t ConvBiasForwardImpl::AlgoInt8NHWCIMMAImplicitGemm::
        get_preprocess_workspace_in_bytes(const SizeArgs& args) const {
    return 0;
}

SmallVector<TensorLayout> ConvBiasForwardImpl::AlgoInt8NHWCIMMAImplicitGemm::
        deduce_preprocessed_filter_layout(const SizeArgs& args) const {
    return {args.filter_layout->collapse_contiguous()};
}

void ConvBiasForwardImpl::AlgoInt8NHWCIMMAImplicitGemm::exec_preprocess(
        const ExecArgs& args) const {
    void* filter_ptr = args.preprocessed_filter->tensors[0].raw_ptr;
    reorder_filter(args, m_algo_param.access_size, filter_ptr);
}

std::tuple<float, float, float, float, float>
ConvBiasForwardImpl::AlgoInt8NHWCIMMAImplicitGemm::get_constants(
        const ExecArgs& args) const {
    float src_scale = args.src_layout->dtype.param<dtype::QuantizedS8>().scale,
          filter_scale =
                  args.filter_layout->dtype.param<dtype::QuantizedS8>().scale,
          bias_scale = 1.f, dst_scale;

    if (args.bias_layout->dtype.enumv() == DTypeEnum::QuantizedS32) {
        bias_scale = args.bias_layout->dtype.param<dtype::QuantizedS32>().scale;
    }

    uint8_t dst_zero = 0;

    if (args.dst_layout->dtype.enumv() == DTypeEnum::QuantizedS8) {
        dst_scale = args.dst_layout->dtype.param<dtype::QuantizedS8>().scale;
    } else if (args.dst_layout->dtype.enumv() == DTypeEnum::QuantizedS4) {
        dst_scale = args.dst_layout->dtype.param<dtype::QuantizedS4>().scale;
    } else if (args.dst_layout->dtype.enumv() == DTypeEnum::Quantized4Asymm) {
        dst_scale =
                args.dst_layout->dtype.param<dtype::Quantized4Asymm>().scale;
        dst_zero = args.dst_layout->dtype.param<dtype::Quantized4Asymm>()
                           .zero_point;
    } else {  // DTypeEnum::Float32
        megdnn_assert(args.dst_layout->dtype.enumv() == DTypeEnum::Float32);
        dst_scale = 1.f;
    }

    float alpha = src_scale * filter_scale / dst_scale,
          beta = bias_scale / dst_scale, gamma = 0.f, delta = 0.f,
          theta = dst_zero;

    if (args.z_layout->ndim > 0) {
        float z_scale;
        if (args.z_layout->dtype.enumv() == DTypeEnum::QuantizedS8) {
            z_scale = args.z_layout->dtype.param<dtype::QuantizedS8>().scale;
            gamma = z_scale / dst_scale;
        } else if (args.z_layout->dtype.enumv() == DTypeEnum::QuantizedS4) {
            z_scale = args.z_layout->dtype.param<dtype::QuantizedS4>().scale;
            gamma = z_scale / dst_scale;
        } else if (args.z_layout->dtype.enumv() ==
                   DTypeEnum::Quantized4Asymm) {
            z_scale = args.z_layout->dtype.param<dtype::Quantized4Asymm>().scale;
            uint8_t z_zero = args.z_layout->dtype.param<dtype::Quantized4Asymm>()
                                     .zero_point;
            gamma = z_scale / dst_scale;
            delta = -z_zero * gamma;
        } else {  // DTypeEnum::Float32
            megdnn_assert(args.z_layout->dtype.enumv() == DTypeEnum::Float32);
            gamma = 1.f;
        }
    }

    if (args.opr->param().nonlineMode ==
        param::ConvBias::NonlineMode::IDENTITY) {
        delta += theta;
        theta = 0.f;
    }

    return {alpha, beta, gamma, delta, theta};
}

void ConvBiasForwardImpl::AlgoInt8NHWCIMMAImplicitGemm::exec(
        const ExecArgs& args) const {
    auto&& param = args.opr->param();
    auto&& fm = args.filter_meta;
    size_t n = args.src_layout->operator[](0),
           ci = args.src_layout->operator[](3),
           hi = args.src_layout->operator[](1),
           wi = args.src_layout->operator[](2);
    size_t co = args.dst_layout->operator[](3),
           ho = args.dst_layout->operator[](1),
           wo = args.dst_layout->operator[](2);
    UNPACK_CONV_PARAMETER(fm, param);
    MARK_USED_VAR
    void* filter_ptr = nullptr;
    void* bias_ptr = nullptr;
    void* z_ptr = nullptr;

    if (args.preprocessed_filter) {
        filter_ptr = args.preprocessed_filter->tensors[0].raw_ptr;
    } else {
        filter_ptr = reinterpret_cast<void*>(args.workspace.raw_ptr);
        reorder_filter(args, m_algo_param.access_size, filter_ptr);
    }
    bias_ptr = args.bias_tensor->raw_ptr;

    if (args.z_layout->ndim > 0)
        z_ptr = args.z_tensor->raw_ptr;

    // \note these constants of cutlass epilogue will be passed to method
    // `execute_cutlass_conv_op` by pointer and interpreted as ElementCompute*,
    // a different dtype here results in undefined epilogue behaviors
    float alpha, beta, gamma, delta, theta;
    std::tie(alpha, beta, gamma, delta, theta) = get_constants(args);

    float dst_scale = 1.f;
    float threshold = 0.f;
    bool use_conv_filter_unity_opt = (fh == 1 && fw == 1);

    auto dst_dtype = args.dst_layout->dtype.enumv();

    bool without_shared_load = ((co % m_algo_param.threadblock_n == 0) &&
                                (m_algo_param.threadblock_n == 16 ||
                                 (m_algo_param.threadblock_n == 32 &&
                                  dst_dtype != DTypeEnum::Float32)));

    if (dst_dtype == DTypeEnum::QuantizedS8) {  // DTypeEnum::QuantizedS8
        dst_scale = args.dst_layout->dtype.param<dtype::QuantizedS8>().scale;
    } else if (dst_dtype == DTypeEnum::QuantizedS4) {
        dst_scale = args.dst_layout->dtype.param<dtype::QuantizedS4>().scale;
    } else if (dst_dtype == DTypeEnum::Quantized4Asymm) {
        dst_scale =
                args.dst_layout->dtype.param<dtype::Quantized4Asymm>().scale;
    } else {  // DTypeEnum::Float32
        dst_scale = 1.f;
    }

    cudaStream_t stream = cuda_stream(args.opr->handle());

    const auto* op = get_cutlass_conv_op(args, ConvOperator::kFprop,
                                         ConvType::kConvolution,
                                         use_conv_filter_unity_opt,
                                         without_shared_load);

    execute_cutlass_conv_op(op, args.src_tensor->raw_ptr, filter_ptr, bias_ptr,
                            z_ptr, args.dst_tensor->raw_ptr, nullptr, n, hi, wi,
                            ci, co, fh, fw, ho, wo, ph, pw, sh, sw, dh, dw,
                            &alpha, &beta, &gamma, &delta, &theta, &threshold,
                            &dst_scale, stream);

    after_kernel_launch();
}

std::string ConvBiasForwardImpl::AlgoInt8NHWCIMMAImplicitGemm::to_string(
        AlgoParam algo_param) {
    return ssprintf("%dX%dX%d_%dX%dX%d_%d_%d", algo_param.threadblock_m,
                    algo_param.threadblock_n, algo_param.threadblock_k,
                    algo_param.warp_m, algo_param.warp_n, algo_param.warp_k,
                    algo_param.stage, algo_param.access_size);
}

void ConvBiasForwardImpl::AlgoInt8NHWCIMMAImplicitGemm::reorder_filter(
        const ExecArgs& args, const int iterleaved,
        void* reordered_filter) const {
    size_t co = args.filter_layout->operator[](0),
           ci = args.filter_layout->operator[](3),
           fh = args.filter_layout->operator[](1),
           fw = args.filter_layout->operator[](2);

    cudaStream_t stream = cuda_stream(args.opr->handle());

    // reformat filter from nhwc to ncxhwx and reorder oc
    // use trans_oc threadblock_n must be 16 or 32 and src dtype == dest dtype
    bool trans_oc = ((co % m_algo_param.threadblock_n == 0) &&
                     (m_algo_param.threadblock_n == 16 ||
                      (m_algo_param.threadblock_n == 32 &&
                       args.dst_layout->dtype.enumv() != DTypeEnum::Float32)));
    uint32_t oc_iterleaved = (m_algo_param.threadblock_n == 32) ? 32 : 16;

    uint32_t alignbits = iterleaved * 8;

    cutlass_wrapper::reorder_nhwc_imma_filter<8>(
            reinterpret_cast<int8_t*>(reordered_filter),
            reinterpret_cast<int8_t*>(args.filter_tensor->raw_ptr), co, ci, fh,
            fw, trans_oc, alignbits, oc_iterleaved, stream);
}
#endif

// vim: syntax=cpp.doxygen
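A sketch, not MegEngine API, of how get_constants() above folds the quantization parameters into the five epilogue constants; the epilogue then computes roughly dst = alpha*conv + beta*bias + gamma*z + delta, with theta carrying the asymmetric output zero point:

def epilogue_constants(src_scale, filter_scale, bias_scale, dst_scale,
                       z_scale=None, z_zero=0, dst_zero=0, identity=True):
    alpha = src_scale * filter_scale / dst_scale
    beta = bias_scale / dst_scale
    gamma = z_scale / dst_scale if z_scale is not None else 0.0
    delta = -z_zero * gamma        # folds the asymmetric z zero point
    theta = float(dst_zero)        # asymmetric destination zero point
    if identity:
        # identity nonlinearity lets theta be merged into delta
        delta += theta
        theta = 0.0
    return alpha, beta, gamma, delta, theta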
dnn/src/cuda/conv_bias/implicit_gemm_uint4_int4_nhwc_imma.cpp @ 11f022ff

@@ -102,22 +102,41 @@ ConvBiasForwardImpl::AlgoUInt4Int4NHWCIMMAImplicitGemm::get_constants(
           args.filter_layout->dtype.param<dtype::QuantizedS4>().scale,
           bias_scale =
                   args.bias_layout->dtype.param<dtype::QuantizedS32>().scale,
           dst_scale;

+    uint8_t dst_zero = 0;
+
     if (args.dst_layout->dtype.enumv() == DTypeEnum::Quantized4Asymm) {
         dst_scale =
                 args.dst_layout->dtype.param<dtype::Quantized4Asymm>().scale;
-        uint8_t dst_zero =
-                args.dst_layout->dtype.param<dtype::Quantized4Asymm>()
-                        .zero_point;
+        dst_zero = args.dst_layout->dtype.param<dtype::Quantized4Asymm>()
+                           .zero_point;
+    } else {  // DTypeEnum::QuantizedS8
+        megdnn_assert(args.dst_layout->dtype.enumv() == DTypeEnum::QuantizedS8);
+        dst_scale = args.dst_layout->dtype.param<dtype::QuantizedS8>().scale;
     }

     float alpha = src_scale * filter_scale / dst_scale,
           beta = bias_scale / dst_scale, gamma = 0.f, delta = 0.f,
           theta = dst_zero;

     if (args.z_layout->ndim > 0) {
-        float z_scale =
-                args.z_layout->dtype.param<dtype::Quantized4Asymm>().scale;
-        gamma = z_scale / dst_scale;
-        uint8_t z_zero =
-                args.z_layout->dtype.param<dtype::Quantized4Asymm>().zero_point;
-        delta = -z_zero * gamma;
+        float z_scale;
+        if (args.z_layout->dtype.enumv() == DTypeEnum::Quantized4Asymm) {
+            z_scale = args.z_layout->dtype.param<dtype::Quantized4Asymm>().scale;
+            uint8_t z_zero = args.z_layout->dtype.param<dtype::Quantized4Asymm>()
+                                     .zero_point;
+            gamma = z_scale / dst_scale;
+            delta = -z_zero * gamma;
+        } else {  // DTypeEnum::QuantizedS8
+            megdnn_assert(args.z_layout->dtype.enumv() ==
+                          DTypeEnum::QuantizedS8);
+            z_scale = args.z_layout->dtype.param<dtype::QuantizedS8>().scale;
+            gamma = z_scale / dst_scale;
+        }
     }

     // identity epilogue has no theta:
dnn/src/cuda/conv_bias/opr_impl.h
浏览文件 @
11f022ff
...
...
@@ -65,6 +65,7 @@ public:
    class AlgoInt8CHWN4IMMAImplicitGemmReorderFilter;
    class AlgoInt8CHWN4IMMAImplicitGemmUnrollWidth;
    class AlgoInt8NCHW32IMMAImplicitGemm;
    class AlgoInt8NHWCIMMAImplicitGemm;
    class AlgoInt4NCHW64IMMAImplicitGemmBase;
    class AlgoInt4Int4NCHW64IMMAImplicitGemm;
    class AlgoUInt4Int4NCHW64IMMAImplicitGemm;
dnn/src/cuda/convolution/backward_data/algo.h
@@ -275,6 +275,7 @@ public:
private:
    WorkspaceBundle get_workspace_bundle(dt_byte* raw_ptr, const SizeArgs& args) const;
    const void* get_available_op(const SizeArgs& args) const;
    AlgoParam m_algo_param;
    std::string m_name;
};
@@ -295,6 +296,7 @@ public:
private:
    WorkspaceBundle get_workspace_bundle(dt_byte* raw_ptr, const SizeArgs& args) const;
    const void* get_available_op(const SizeArgs& args) const;
};

class ConvolutionBackwardDataImpl::AlgoPack : NonCopyableObj {
dnn/src/cuda/convolution/backward_data/implicit_gemm_int8_nchw4_dp4a.cpp
@@ -20,6 +20,43 @@
using namespace megdnn;
using namespace cuda;

const void* ConvolutionBackwardDataImpl::AlgoInt8NCHW4DotProdImplicitGemm::
        get_available_op(const SizeArgs& args) const {
    using namespace cutlass::library;
    auto&& fm = args.filter_meta;
    size_t sh = fm.stride[0], sw = fm.stride[1];
    cutlass::conv::SpecialOptimizeDesc special_optimization =
            (sh == 2 && sw == 2)
                    ? cutlass::conv::SpecialOptimizeDesc::DECONV_DOUBLE_UPSAMPLING
                    : cutlass::conv::SpecialOptimizeDesc::NONE;
    ConvolutionKey key{
            cutlass::conv::Operator::kDgrad,
            NumericTypeID::kS8,
            LayoutTypeID::kTensorNC4HW4,
            NumericTypeID::kS8,
            LayoutTypeID::kTensorK4RSC4,
            NumericTypeID::kS8,
            LayoutTypeID::kTensorNC4HW4,
            NumericTypeID::kS32,
            LayoutTypeID::kTensorNC4HW4,
            cutlass::conv::ConvType::kConvolution,
            m_algo_param.threadblock_m,
            m_algo_param.threadblock_n,
            m_algo_param.threadblock_k,
            m_algo_param.warp_m,
            m_algo_param.warp_n,
            m_algo_param.warp_k,
            1,
            1,
            4,
            cutlass::epilogue::EpilogueType::kBiasAddLinearCombinationClamp,
            m_algo_param.stage,
            special_optimization,
            false};
    return (void*)Singleton::get().operation_table.find_op(key);
}

bool ConvolutionBackwardDataImpl::AlgoInt8NCHW4DotProdImplicitGemm::is_available(
        const SizeArgs& args) const {
    auto&& fm = args.filter_meta;
@@ -51,6 +88,7 @@ bool ConvolutionBackwardDataImpl::AlgoInt8NCHW4DotProdImplicitGemm::
    // FIXME: too large filter size is not supported now
    available &= fm.spatial[0] * fm.spatial[1] <=
                 (uint32_t)(848 / (2 * m_algo_param.warp_k / 4) - 2);
    available &= (get_available_op(args) != nullptr);
    // only support sm_61 or later, platform should have fast native int8
    // support
    available &= is_compute_capability_required(6, 1);
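As a worked instance of the bound above (parameter value assumed for illustration): with warp_k = 32 the limit evaluates to 848 / (2 * 32 / 4) - 2 = 51 filter taps, so a 7x7 filter passes while an 8x8 one does not:

// Worked instance of the filter-tap bound, assuming warp_k == 32.
static_assert(848 / (2 * 32 / 4) - 2 == 51, "tap limit for warp_k = 32");
static_assert(7 * 7 <= 51, "7x7 filters are accepted");
static_assert(8 * 8 > 51, "8x8 filters are rejected");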
@@ -105,40 +143,14 @@ void ConvolutionBackwardDataImpl::AlgoInt8NCHW4DotProdImplicitGemm::exec(
            args.grad_layout->dtype.param<dtype::QuantizedS8>().scale;

    // \note these constants of cutlass epilogue will be passed to struct
    // `ConvolutionArguments` by pointer and interpreted as ElementCompute*,
    // a different dtype here results in undefined epilogue behaviors
    float alpha = diff_scale * filter_scale / grad_scale, beta = 0.f, gamma = 0.f,
          delta = 0.f;

    using namespace cutlass::library;

    const Operation* op = (const Operation*)get_available_op(args);

    // gcc prints warnings when size_t values are implicitly narrowed to int
    cutlass::conv::Conv2dProblemSize problem_size{
@@ -167,7 +179,6 @@ void ConvolutionBackwardDataImpl::AlgoPack::fill_int8_dp4a_algos() {
    int8_nchw4_dotprod.emplace_back(AlgoParam{16, 128, 16, 16, 64, 16, 2});
    int8_nchw4_dotprod.emplace_back(AlgoParam{16, 128, 16, 16, 128, 16, 1});
    int8_nchw4_dotprod.emplace_back(AlgoParam{32, 128, 32, 32, 64, 32, 2});
    int8_nchw4_dotprod.emplace_back(AlgoParam{64, 128, 32, 64, 32, 32, 2});
}
// vim: syntax=cpp.doxygen
dnn/src/cuda/convolution/backward_data/implicit_gemm_int8_nchw_dp4a.cpp
@@ -19,6 +19,44 @@
using namespace megdnn;
using namespace cuda;

const void* ConvolutionBackwardDataImpl::AlgoInt8NCHWDotProdImplicitGemm::
        get_available_op(const SizeArgs& args) const {
    using namespace cutlass::library;
    auto&& fm = args.filter_meta;
    size_t sh = fm.stride[0], sw = fm.stride[1];
    cutlass::conv::SpecialOptimizeDesc special_optimization =
            (sh == 2 && sw == 2)
                    ? cutlass::conv::SpecialOptimizeDesc::DECONV_DOUBLE_UPSAMPLING
                    : cutlass::conv::SpecialOptimizeDesc::NONE;
    // only use 16x64x8_16x64x8_2stages impl
    ConvolutionKey key{
            cutlass::conv::Operator::kDgrad,
            NumericTypeID::kS8,
            LayoutTypeID::kTensorNC4HW4,
            NumericTypeID::kS8,
            LayoutTypeID::kTensorK4RSC4,
            NumericTypeID::kS8,
            LayoutTypeID::kTensorNC4HW4,
            NumericTypeID::kS32,
            LayoutTypeID::kTensorNC4HW4,
            cutlass::conv::ConvType::kConvolution,
            16,
            64,
            8,
            16,
            64,
            8,
            1,
            1,
            4,
            cutlass::epilogue::EpilogueType::kBiasAddLinearCombinationClamp,
            2,
            special_optimization,
            false};
    return (void*)Singleton::get().operation_table.find_op(key);
}

bool ConvolutionBackwardDataImpl::AlgoInt8NCHWDotProdImplicitGemm::is_available(
        const SizeArgs& args) const {
    auto&& fm = args.filter_meta;
@@ -52,6 +90,9 @@ bool ConvolutionBackwardDataImpl::AlgoInt8NCHWDotProdImplicitGemm::is_available(
    available &= (fm.dilation[0] == 1 && fm.dilation[1] == 1);
    // FIXME: too large filter size is not supported now
    available &= fm.spatial[0] * fm.spatial[1] <= (848 / (2 * 8 / 4) - 2);
    available &= (get_available_op(args) != nullptr);
    // only support sm_61 or later, platform should have fast native int8
    // support
    available &= is_compute_capability_required(6, 1);
@@ -138,33 +179,7 @@ void ConvolutionBackwardDataImpl::AlgoInt8NCHWDotProdImplicitGemm::exec(
    using namespace cutlass::library;

    const Operation* op = (const Operation*)get_available_op(args);

    // gcc prints warnings when size_t values are implicitly narrowed to int
    cutlass::conv::Conv2dProblemSize problem_size{
dnn/src/cuda/cutlass/convolution_operation.h
@@ -119,8 +119,8 @@ public:
        m_description.threadblock_swizzle =
                ThreadblockSwizzleMap<typename Operator::ThreadblockSwizzle>::kId;
        m_description.special_optimization = Operator::kSpecialOpt;
        m_description.gemm_mode = Operator::kGemmMode;
        m_description.without_shared_load = Operator::kWithoutSharedLoad;
    }
dnn/src/cuda/cutlass/library.h
@@ -487,7 +487,7 @@ struct ConvolutionDescription : public OperationDescription {
    ThreadblockSwizzleID threadblock_swizzle;
    conv::SpecialOptimizeDesc special_optimization;
    conv::ImplicitGemmMode gemm_mode;
    bool without_shared_load;
};
dnn/src/cuda/cutlass/operation_table.cpp
@@ -124,7 +124,7 @@ ConvolutionKey get_convolution_key_from_desc(
    key.epilogue_type = desc.epilogue_type;
    key.stages = desc.tile_description.threadblock_stages;
    key.special_optimization = desc.special_optimization;
    key.without_shared_load = desc.without_shared_load;

    return key;
@@ -156,23 +156,25 @@ void OperationTable::append(Manifest const& manifest) {
/////////////////////////////////////////////////////////////////////////////////////////////////
Operation const* OperationTable::find_op(GemmKey const& key) const {
    if (gemm_operations.count(key)) {
        auto const& ops = gemm_operations.at(key);
        megdnn_assert(
                ops.size() == 1, "exactly one kernel expected, got %zu", ops.size());
        return ops[0];
    }
    return nullptr;
}
/////////////////////////////////////////////////////////////////////////////////////////////////
Operation const* OperationTable::find_op(ConvolutionKey const& key) const {
    if (convolution_operations.count(key) > 0) {
        auto const& ops = convolution_operations.at(key);
        megdnn_assert(
                ops.size() == 1, "exactly one kernel expected, got %zu", ops.size());
        return ops[0];
    }
    return nullptr;
}
/////////////////////////////////////////////////////////////////////////////////////////////////
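Both overloads now report a missing kernel by returning nullptr instead of asserting, which is what lets the get_available_op helpers introduced in this commit double as cheap availability probes. A stripped-down sketch of the pattern, with generic names rather than the actual table types:

#include <cassert>
#include <unordered_map>
#include <vector>

// Stripped-down sketch of the table behind find_op: the lookup itself is the
// availability probe, so a missing key is a normal result, not an error.
template <typename Key, typename Op, typename Hasher>
struct OpTable {
    std::unordered_map<Key, std::vector<Op const*>, Hasher> ops;

    Op const* find(Key const& key) const {
        auto it = ops.find(key);
        if (it == ops.end())
            return nullptr;              // caller treats this algo as unavailable
        assert(it->second.size() == 1);  // exactly one kernel per key expected
        return it->second[0];
    }
};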
dnn/src/cuda/cutlass/operation_table.h
@@ -211,7 +211,7 @@ struct ConvolutionKey {
    epilogue::EpilogueType epilogue_type;
    int stages;
    conv::SpecialOptimizeDesc special_optimization;
    bool without_shared_load;

    inline bool operator==(ConvolutionKey const& rhs) const {
@@ -234,7 +234,7 @@ struct ConvolutionKey {
               (instruction_shape_n == rhs.instruction_shape_n) &&
               (instruction_shape_k == rhs.instruction_shape_k) &&
               (epilogue_type == rhs.epilogue_type) && (stages == rhs.stages) &&
               (special_optimization == rhs.special_optimization) &&
               (without_shared_load == rhs.without_shared_load);
    }
@@ -270,8 +270,8 @@ struct ConvolutionKey {
"
\n
instruction_shape: "
+
instruction_shape_str
+
"
\n
epilogue_type: "
+
to_string
(
epilogue_type
)
+
"
\n
stages: "
+
std
::
to_string
(
stages
)
+
"
\n
need_load_from_const_mem
: "
+
to_string
(
need_load_from_const_mem
)
+
"
\n
special_optimization
: "
+
to_string
(
special_optimization
)
+
"
\n
without_shared_load: "
+
to_string
(
without_shared_load
)
+
"
\n
}"
;
}
@@ -308,8 +308,8 @@ struct ConvolutionKeyHasher {
                            sizeof(key.instruction_shape_k))
                    .update(&key.epilogue_type, sizeof(key.epilogue_type))
                    .update(&key.stages, sizeof(key.stages))
                    .update(&key.special_optimization,
                            sizeof(key.special_optimization))
                    .update(&key.without_shared_load,
                            sizeof(key.without_shared_load))
                    .digest();
dnn/src/cuda/cutlass/util.cu
@@ -1566,6 +1566,35 @@ char const* to_string(MathOperationID math_op, bool pretty) {
///////////////////////////////////////////////////////////////////////////////////////////////////
static struct {
    char const* text;
    char const* pretty;
    conv::SpecialOptimizeDesc enumerant;
} SpecialOptimizeDesc_enumerants[] = {
        {"none_special_opt", "NoneSpecialOpt", conv::SpecialOptimizeDesc::NONE},
        {"conv_filter_unity", "ConvFilterUnity",
         conv::SpecialOptimizeDesc::CONV_FILTER_UNITY},
        {"deconv_double_upsampling", "DeconvDoubleUpsampling",
         conv::SpecialOptimizeDesc::DECONV_DOUBLE_UPSAMPLING},
};

/// Converts a SpecialOptimizeDesc enumerant to a string
char const* to_string(conv::SpecialOptimizeDesc special_opt, bool pretty) {
    for (auto const& possible : SpecialOptimizeDesc_enumerants) {
        if (special_opt == possible.enumerant) {
            if (pretty) {
                return possible.pretty;
            } else {
                return possible.text;
            }
        }
    }
    return pretty ? "Invalid" : "invalid";
}
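As with the neighboring converters, to_string(conv::SpecialOptimizeDesc::DECONV_DOUBLE_UPSAMPLING) yields "deconv_double_upsampling" (or "DeconvDoubleUpsampling" with pretty = true), and any unmapped value falls through to "invalid".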
///////////////////////////////////////////////////////////////////////////////////////////////////
static struct {
    char const* text;
    char const* pretty;
dnn/src/cuda/cutlass/util.h
@@ -207,6 +207,10 @@ char const* to_string(bool val, bool pretty = false);
/// Converts a MathOperationID enumerant to a string
char const* to_string(MathOperationID math_op, bool pretty = false);

/// Converts a SpecialOptimizeDesc enumerant to a string
char const* to_string(conv::SpecialOptimizeDesc special_opt, bool pretty = false);

/// Converts an ImplicitGemmMode enumerant to a string
char const* to_string(conv::ImplicitGemmMode mode, bool pretty = false);
dnn/src/cuda/matrix_mul/algos.h
@@ -235,6 +235,7 @@ public:
              m_name{ssprintf(
                      "CUTLASS_FLOAT32_SIMT_%s", m_algo_param.to_string().c_str())} {}
    bool is_available(const SizeArgs& args) const override;
    size_t get_workspace_in_bytes(const SizeArgs& args) const override;
    const char* name() const override { return m_name.c_str(); }
    AlgoAttribute attribute() const override {
@@ -260,6 +261,7 @@ private:
    void do_exec(const ExecArgs& args) const override;
    int min_alignment_requirement() const override { return 1; }
    std::string m_name;
    const void* get_available_op(const SizeArgs& args) const;
};

class MatrixMulForwardImpl::AlgoFloat32SIMTSplitK final
@@ -270,6 +272,7 @@ public:
              m_name{ssprintf(
                      "CUTLASS_FLOAT32_SIMT_SPLIT_K_%s",
                      m_algo_param.to_string().c_str())} {}
    bool is_available(const SizeArgs& args) const override;
    size_t get_workspace_in_bytes(const SizeArgs& args) const override;
    const char* name() const override { return m_name.c_str(); }
    AlgoAttribute attribute() const override {
@@ -297,6 +300,7 @@ private:
    void do_exec(const ExecArgs& args) const override;
    int min_alignment_requirement() const override { return 1; }
    std::string m_name;
    const void* get_available_op(const SizeArgs& args) const;
};

class MatrixMulForwardImpl::AlgoFloat32SIMTGemvBatchedStrided final
dnn/src/cuda/matrix_mul/cutlass_float32_simt.cpp
@@ -19,6 +19,39 @@
using namespace megdnn;
using namespace cuda;

const void* MatrixMulForwardImpl::AlgoFloat32SIMT::get_available_op(
        const SizeArgs& args) const {
    using namespace cutlass::library;
    auto&& param = args.opr->param();
    auto layoutA =
            param.transposeA ? LayoutTypeID::kColumnMajor : LayoutTypeID::kRowMajor;
    auto layoutB =
            param.transposeB ? LayoutTypeID::kColumnMajor : LayoutTypeID::kRowMajor;
    int alignment = min_alignment_requirement();
    GemmKey key{NumericTypeID::kF32,
                layoutA,
                NumericTypeID::kF32,
                layoutB,
                NumericTypeID::kF32,
                LayoutTypeID::kRowMajor,
                NumericTypeID::kF32,
                m_algo_param.threadblock_m,
                m_algo_param.threadblock_n,
                m_algo_param.threadblock_k,
                m_algo_param.warp_m,
                m_algo_param.warp_n,
                m_algo_param.warp_k,
                1,
                1,
                1,
                2,
                alignment,
                alignment,
                SplitKMode::kNone};
    return (void*)Singleton::get().operation_table.find_op(key);
}

bool MatrixMulForwardImpl::AlgoFloat32SIMT::is_available(const SizeArgs& args) const {
    bool available =
@@ -34,6 +67,8 @@ bool MatrixMulForwardImpl::AlgoFloat32SIMT::is_available(
                          m_algo_param.threadblock_n <= y_grid_limit);
    available &= (get_available_op(args) != nullptr);
    return available;
}
@@ -61,34 +96,7 @@ void MatrixMulForwardImpl::AlgoFloat32SIMT::do_exec(
    using namespace cutlass::library;

    const Operation* op = (const Operation*)get_available_op(args);

    GemmArguments gemm_args{problem_size,
                            args.tensor_a.raw_ptr,
dnn/src/cuda/matrix_mul/cutlass_float32_simt_split_k.cpp
@@ -19,6 +19,39 @@
using namespace megdnn;
using namespace cuda;

const void* MatrixMulForwardImpl::AlgoFloat32SIMTSplitK::get_available_op(
        const SizeArgs& args) const {
    using namespace cutlass::library;
    auto&& param = args.opr->param();
    auto layoutA =
            param.transposeA ? LayoutTypeID::kColumnMajor : LayoutTypeID::kRowMajor;
    auto layoutB =
            param.transposeB ? LayoutTypeID::kColumnMajor : LayoutTypeID::kRowMajor;
    int alignment = min_alignment_requirement();
    GemmKey key{NumericTypeID::kF32,
                layoutA,
                NumericTypeID::kF32,
                layoutB,
                NumericTypeID::kF32,
                LayoutTypeID::kRowMajor,
                NumericTypeID::kF32,
                m_algo_param.threadblock_m,
                m_algo_param.threadblock_n,
                m_algo_param.threadblock_k,
                m_algo_param.warp_m,
                m_algo_param.warp_n,
                m_algo_param.warp_k,
                1,
                1,
                1,
                2,
                alignment,
                alignment,
                SplitKMode::kParallel};
    return (void*)Singleton::get().operation_table.find_op(key);
}

bool MatrixMulForwardImpl::AlgoFloat32SIMTSplitK::is_available(
        const SizeArgs& args) const {
    auto&& param = args.opr->param();
@@ -35,6 +68,8 @@ bool MatrixMulForwardImpl::AlgoFloat32SIMTSplitK::is_available(
    available &= ((m + m_algo_param.threadblock_m - 1) / m_algo_param.threadblock_m <=
                  y_grid_limit);
    available &= (get_available_op(args) != nullptr);
    return available;
}
@@ -66,35 +101,7 @@ void MatrixMulForwardImpl::AlgoFloat32SIMTSplitK::do_exec(
    float alpha = 1.f, beta = 0.f;
    using namespace cutlass::library;

    const Operation* op = (const Operation*)get_available_op(args);

    GemmArguments gemm_args{problem_size,
                            args.tensor_a.raw_ptr,
dnn/test/cuda/conv_bias_int8.cpp
@@ -882,6 +882,125 @@ TEST_F(CUDA, CUTLASS_CONV_BIAS_INT8_NCHW32_IMMA) {
            ConvBias::DirectParam{});
    check(algo);
}

TEST_F(CUDA, CUTLASS_CONV_BIAS_INT8_NHWC) {
    require_compute_capability(7, 5);
    Checker<ConvBiasForward> checker(handle_cuda());
    auto check = [&checker](const std::string& algo) {
        checker.set_before_exec_callback(
                conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(algo.c_str()));
        UniformIntRNG rng{-8, 8};
        UniformIntRNG bias_rng{-50, 50};
        checker.set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_rng(2, &bias_rng)
                .set_rng(3, &rng)
                .set_dtype(0, dtype::QuantizedS8{1.2f})
                .set_dtype(1, dtype::QuantizedS8{1.3f})
                .set_dtype(2, dtype::QuantizedS32{1.2f * 1.3f})
                .set_dtype(3, dtype::QuantizedS8{19.990229f})
                .set_dtype(4, dtype::QuantizedS8{19.990228f})
                .set_epsilon(1e-3);
        param::ConvBias param;
        param.pad_h = param.pad_w = 1;
        param.stride_h = param.stride_w = 1;
        param.format = param::ConvBias::Format::NHWC;
        checker.set_param(param).execs(
                {{16, 7, 7, 16}, {32, 3, 3, 16}, {1, 1, 1, 32}, {}, {}});
        param.pad_h = param.pad_w = 0;
        param.nonlineMode = param::ConvBias::NonlineMode::RELU;
        checker.set_param(param).execs(
                {{16, 7, 7, 16}, {16, 1, 1, 16}, {1, 1, 1, 16}, {}, {}});
    };
    std::string algo = ConvBias::algo_name<ConvBias::DirectParam>(
            "INT8_NHWC_IMMA_IMPLICIT_GEMM_64X16X32_64X16X32_2_16",
            ConvBias::DirectParam{});
    check(algo);
    algo = ConvBias::algo_name<ConvBias::DirectParam>(
            "INT8_NHWC_IMMA_IMPLICIT_GEMM_128X32X32_64X32X32_1_16",
            ConvBias::DirectParam{});
    check(algo);
}

TEST_F(CUDA, CUTLASS_CONV_BIAS_INT8_NHWC_UINT4_WEIGHT_PREPROCESS) {
    require_compute_capability(7, 5);
    Checker<ConvBiasForward, OprWeightPreprocessProxy<ConvBiasForward>> checker(
            handle_cuda());
    auto check = [&checker](const std::string& algo) {
        checker.set_before_exec_callback(
                conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(algo.c_str()));
        UniformIntRNG rng{-8, 8};
        UniformIntRNG bias_rng{-50, 50};
        UniformIntRNG rng_u4{0, 15};
        checker.set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_rng(2, &bias_rng)
                .set_rng(3, &rng_u4)
                .set_dtype(0, dtype::QuantizedS8{0.2f})
                .set_dtype(1, dtype::QuantizedS8{0.3f})
                .set_dtype(2, dtype::QuantizedS32{0.2f * 0.3f})
                .set_dtype(3, dtype::Quantized4Asymm{0.5f, 8})
                .set_dtype(4, dtype::Quantized4Asymm{0.5f, 4})
                .set_epsilon(1 + 1e-3);
        param::ConvBias param;
        param.pad_h = param.pad_w = 1;
        param.stride_h = param.stride_w = 1;
        param.format = param::ConvBias::Format::NHWC;
        checker.set_param(param).execs(
                {{16, 7, 7, 16}, {32, 3, 3, 16}, {1, 1, 1, 32}, {}, {}});
        param.pad_h = param.pad_w = 0;
        param.nonlineMode = param::ConvBias::NonlineMode::RELU;
        checker.set_param(param).execs(
                {{16, 7, 7, 16}, {16, 1, 1, 16}, {1, 1, 1, 16}, {}, {}});
    };
    std::string algo = ConvBias::algo_name<ConvBias::DirectParam>(
            "INT8_NHWC_IMMA_IMPLICIT_GEMM_64X16X32_64X16X32_2_16",
            ConvBias::DirectParam{});
    check(algo);
    algo = ConvBias::algo_name<ConvBias::DirectParam>(
            "INT8_NHWC_IMMA_IMPLICIT_GEMM_128X32X32_64X32X32_1_16",
            ConvBias::DirectParam{});
    check(algo);
}

TEST_F(CUDA, CUTLASS_CONV_BIAS_INT8_NHWC_FLOAT) {
    require_compute_capability(7, 5);
    Checker<ConvBiasForward> checker(handle_cuda());
    auto check = [&checker](const std::string& algo) {
        checker.set_before_exec_callback(
                conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(algo.c_str()));
        UniformIntRNG rng{-8, 8};
        UniformFloatRNG float_rng{-50, 50};
        checker.set_rng(0, &rng)
                .set_rng(1, &rng)
                .set_rng(2, &float_rng)
                .set_rng(3, &float_rng)
                .set_dtype(0, dtype::QuantizedS8(1.9980618f))
                .set_dtype(1, dtype::QuantizedS8(1.9980927f))
                .set_dtype(2, dtype::Float32())
                .set_dtype(3, dtype::Float32())
                .set_dtype(4, dtype::Float32());
        param::ConvBias param;
        param.pad_h = param.pad_w = 1;
        param.stride_h = param.stride_w = 1;
        param.format = param::ConvBias::Format::NHWC;
        checker.set_param(param).execs(
                {{16, 7, 7, 16}, {32, 3, 3, 16}, {1, 1, 1, 32}, {}, {}});
        param.pad_h = param.pad_w = 0;
        param.nonlineMode = param::ConvBias::NonlineMode::RELU;
        checker.set_param(param).execs(
                {{16, 7, 7, 16}, {16, 1, 1, 16}, {1, 1, 1, 16}, {}, {}});
    };
    std::string algo = ConvBias::algo_name<ConvBias::DirectParam>(
            "INT8_NHWC_IMMA_IMPLICIT_GEMM_64X16X32_64X16X32_2_16",
            ConvBias::DirectParam{});
    check(algo);
    algo = ConvBias::algo_name<ConvBias::DirectParam>(
            "INT8_NHWC_IMMA_IMPLICIT_GEMM_128X32X32_64X32X32_1_16",
            ConvBias::DirectParam{});
    check(algo);
}
#endif
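In these NHWC cases the tensor shapes read {N, H, W, C} for src, {O, FH, FW, I} for the filter, and {1, 1, 1, O} for the bias. A small sketch of the output-shape arithmetic for the first testcase, with a helper name made up for illustration:

#include <cstdio>

// Hypothetical helper: output extent of a padded, strided convolution.
constexpr int conv_out(int in, int filter, int pad, int stride) {
    return (in + 2 * pad - filter) / stride + 1;
}

int main() {
    // src {16, 7, 7, 16} conv filter {32, 3, 3, 16}, pad 1, stride 1
    int oh = conv_out(7, 3, 1, 1), ow = conv_out(7, 3, 1, 1);
    std::printf("dst = {16, %d, %d, 32}\n", oh, ow);  // {16, 7, 7, 32}
}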
TEST_F(CUDA, CUTLASS_CONV_BIAS_INT8_NCHW4_NCHW) {
@@ -969,7 +1088,7 @@ TEST_F(CUDA, CUTLASS_CONV_BIAS_INT8_NCHW32_NCHW4) {
    checker.set_before_exec_callback(conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
            ConvBias::algo_name<ConvBias::DirectParam>(
                    "INT8_NCHW32_IMMA_IMPLICIT_GEMM_32X128X32_32X64X32_1",
                    ConvBias::DirectParam{})
                    .c_str()));
    checker.set_dtype(0, dtype::QuantizedS8(1.9980618f))
@@ -1109,6 +1228,16 @@ TEST_F(CUDA, BENCHMARK_CUTLASS_CONV_BIAS_INT8_NCHW32) {
"DIRECT:INT8_NCHW32_IMMA_IMPLICIT_GEMM"
,
param
::
ConvBias
::
Format
::
NCHW32
);
}
TEST_F
(
CUDA
,
BENCHMARK_CUTLASS_CONV_BIAS_INT8_NHWC
)
{
require_compute_capability
(
7
,
5
);
benchmark_target_algo_with_cudnn_tsc
(
handle_cuda
(),
get_det_first_bench_args
(
16
),
dtype
::
QuantizedS8
{
1.2
f
},
dtype
::
QuantizedS8
{
1.3
f
},
dtype
::
QuantizedS32
{
1.2
f
*
1.3
f
},
dtype
::
QuantizedS8
{
1.0
f
},
"DIRECT:INT8_NHWC_IMMA_IMPLICIT_GEMM"
,
param
::
ConvBias
::
Format
::
NHWC
);
}
#endif
TEST_F(CUDA, BENCHMARK_CUTLASS_CONV_BIAS_INT8_NCHW4) {
dnn/test/cuda/conv_test_utils.cpp
@@ -102,9 +102,7 @@ std::vector<BenchArgs> get_det_first_bench_args(size_t batch) {
    args.emplace_back(BenchArgs{batch, 16, 384, 640, 16, 3, 1});
    args.emplace_back(BenchArgs{batch, 16, 384, 640, 32, 3, 2});
    args.emplace_back(BenchArgs{batch, 32, 184, 320, 32, 3, 1});
    args.emplace_back(BenchArgs{batch, 32, 384, 640, 64, 3, 2});
    args.emplace_back(BenchArgs{batch, 32, 184, 320, 32, 1, 1});
    args.emplace_back(BenchArgs{batch, 32, 384, 640, 64, 1, 2});
    return args;
}
@@ -333,6 +331,9 @@ void benchmark_target_algo_with_cudnn_tsc(
                          .reshape({shape[0], shape[1] / 4, 4, shape[2], shape[3]})
                          .dimshuffle({1, 3, 4, 0, 2}));
    } else if (format == Format::NHWC) {
        ret = static_cast<TensorShape>(
                TensorLayout{shape, dtype}.dimshuffle({0, 2, 3, 1}));
    }
    return ret;
};
@@ -363,6 +364,9 @@ void benchmark_target_algo_with_cudnn_tsc(
        if ((format == Format::CHWN4 || format == Format::NCHW4) &&
            (arg.ci % 16 != 0))
            continue;
        // skip test cases that cannot enable the nhwc tensorcore kernel
        if ((format == Format::NHWC) && (arg.ci % 4 != 0 || arg.co % 4 != 0))
            continue;
        Format format_cudnn = arg.ci % 32 == 0 && arg.co % 32 == 0
                                    ? Format::NCHW32
                                    : Format::NCHW4;
dnn/test/cuda/convolution.cpp
@@ -327,7 +327,6 @@ TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA_INT8_NCHW4_DP4A) {
    all_params.emplace_back(AlgoParam{16, 128, 16, 16, 64, 16, 2});
    all_params.emplace_back(AlgoParam{16, 128, 16, 16, 128, 16, 1});
    all_params.emplace_back(AlgoParam{32, 128, 32, 32, 64, 32, 2});
    all_params.emplace_back(AlgoParam{64, 128, 32, 64, 32, 32, 2});

    for (auto algo_param : all_params) {
        Checker<ConvolutionBackwardData> checker(handle_cuda());