From 14e9ad625d9b54765ff1e4a245e6abbd10d3dd8b Mon Sep 17 00:00:00 2001 From: Megvii Engine Team Date: Wed, 6 Apr 2022 20:56:31 +0800 Subject: [PATCH] fix(megdnn): emit define-but-not-referenced and extra-;-ignored warning on cuda9.0~cuda9.1 GitOrigin-RevId: f6db42e3958499c214bf3fc6f15ed7e8d8321da3 --- dnn/include/megdnn/arch.h | 26 ++++++++--------- dnn/include/megdnn/dtype.h | 12 ++++---- dnn/include/megdnn/thin/small_vector.h | 2 +- dnn/src/common/basic_types.cpp | 4 +-- dnn/src/common/elemwise_helper.cuh | 4 +-- dnn/src/common/named_tensor.cpp | 2 +- dnn/src/cuda/argsort/argsort.cu | 2 ++ dnn/src/cuda/checksum/kern.cu | 2 ++ dnn/src/cuda/conv_bias/chanwise/kern.cuh | 6 ++-- .../conv_bias/matmul/inplace_matmul_impl.cu | 2 ++ .../reduce_with_scale_data.cu | 4 ++- dnn/src/cuda/conv_bias/reduce_filter.cu | 2 ++ .../backward_filter/inplace_matmul_impl.cu | 2 ++ .../forward/inplace_matmul_impl.cu | 2 ++ dnn/src/cuda/convolution_helper/config.cuh | 4 +-- .../conv_trait/ibatch_conv_trait.cuh | 6 ++-- .../conv_trait/iconv_imma_trait.cuh | 10 +++---- .../conv_trait/iconv_trait.cuh | 4 +-- .../global_memory_visitor_cixhw.cuh | 2 +- .../global_memory_visitor_imma_cixn.cuh | 2 +- .../global_memory_visitor_imma_cixwixn.cuh | 2 +- .../global_memory_visitor_imma_cixwoxn.cuh | 2 +- .../global_memory_visitor_imma_fwxco.cuh | 2 +- dnn/src/cuda/cumsum/kern_impl.cu | 2 ++ dnn/src/cuda/elemwise/special_kerns.inl | 2 +- .../kernel_common/diagnostic_epilogue.cuh | 24 +++++++++++++++ .../kernel_common/diagnostic_prologue.cuh | 29 +++++++++++++++++++ .../local_share_bwd_data_f32_implicit_gemm.cu | 2 ++ ...ocal_share_bwd_filter_f32_implicit_gemm.cu | 2 ++ ...cal_share_fwd_chwn_f32_batch_size_aware.cu | 4 ++- ...d_chwn_f32_batch_size_aware_small_image.cu | 2 ++ .../uint4x4x32_wmma/wmma_matrix_mul_u4.cu | 2 +- dnn/src/cuda/pooling/pooling2d_qint.cu | 7 +++-- dnn/src/cuda/query_blocksize_impl.cu | 2 ++ .../cuda/relayout_format/relayout_format.cu | 5 +++- .../relayout_format/relayout_format_kern.cuh | 6 ++-- .../relayout_format/relayout_format_utils.cuh | 2 +- dnn/src/cuda/resize/forward.cu | 2 ++ dnn/src/cuda/type_cvt/kern.cu | 2 ++ dnn/src/cuda/warp_affine/warp_affine.cu | 2 ++ dnn/src/cuda/warp_perspective/forward.cu | 2 ++ dnn/src/naive/relayout_format/opr_impl.cpp | 2 +- src/opr/test/basic_arith/elemwise.cpp | 4 +-- 43 files changed, 152 insertions(+), 59 deletions(-) create mode 100644 dnn/src/cuda/kernel_common/diagnostic_epilogue.cuh create mode 100644 dnn/src/cuda/kernel_common/diagnostic_prologue.cuh diff --git a/dnn/include/megdnn/arch.h b/dnn/include/megdnn/arch.h index ebaa59f8c..ce6f76cbc 100644 --- a/dnn/include/megdnn/arch.h +++ b/dnn/include/megdnn/arch.h @@ -36,15 +36,15 @@ #define MEGDNN_ALWAYS_INLINE inline __attribute__((__always_inline__)) #endif -#define MEGDNN_DEPRECATED __attribute__((deprecated)) -#define MEGDNN_PACKED __attribute__((packed)) -#define MEGDNN_CONSTEXPR constexpr -#define MEGDNN_NOEXCEPT noexcept -#define MEGDNN_STATIC_ASSERT static_assert -#define MEGDNN_FINAL final -#define MEGDNN_NORETURN __attribute__((noreturn)) -#define MEGDNN_WARN_UNUSED_RESULT __attribute__((warn_unused_result)) -#define MEGDNN_ATTRIBUTE_TARGET(simd) __attribute__((target(simd))) +#define MEGDNN_DEPRECATED __attribute__((deprecated)) +#define MEGDNN_PACKED __attribute__((packed)) +#define MEGDNN_CONSTEXPR constexpr +#define MEGDNN_NOEXCEPT noexcept +#define MEGDNN_STATIC_ASSERT(cond, msg) static_assert(cond, msg); +#define MEGDNN_FINAL final +#define MEGDNN_NORETURN __attribute__((noreturn)) +#define 
MEGDNN_WARN_UNUSED_RESULT __attribute__((warn_unused_result)) +#define MEGDNN_ATTRIBUTE_TARGET(simd) __attribute__((target(simd))) #if defined(__clang_major__) && (__clang_major__ >= 7) #define MEGDNN_LAMBDA_ATTRIBUTE_TARGET(simd) __attribute__((target(simd))) #else @@ -64,10 +64,10 @@ #define MEGDNN_DEPRECATED #define MEGDNN_PACKED -#define MEGDNN_CONSTEXPR constexpr -#define MEGDNN_NOEXCEPT noexcept -#define MEGDNN_STATIC_ASSERT static_assert -#define MEGDNN_FINAL final +#define MEGDNN_CONSTEXPR constexpr +#define MEGDNN_NOEXCEPT noexcept +#define MEGDNN_STATIC_ASSERT(cond, msg) static_assert(cond, msg); +#define MEGDNN_FINAL final #if defined(_MSC_VER) #define MEGDNN_NORETURN __declspec(noreturn) diff --git a/dnn/include/megdnn/dtype.h b/dnn/include/megdnn/dtype.h index 7a8c876bd..7b381bcf4 100644 --- a/dnn/include/megdnn/dtype.h +++ b/dnn/include/megdnn/dtype.h @@ -311,11 +311,11 @@ public: #ifdef __clang__ #pragma clang diagnostic pop #endif -MEGDNN_STATIC_ASSERT(sizeof(dt_byte) == 1, "bad dt_byte size"); -MEGDNN_STATIC_ASSERT(sizeof(dt_qint1) == 1, "bad dt_qint1 size"); -MEGDNN_STATIC_ASSERT(sizeof(dt_quint8) == 1, "bad dt_quint8 size"); -MEGDNN_STATIC_ASSERT(sizeof(dt_qint16) == 2, "bad dt_qint16 size"); -MEGDNN_STATIC_ASSERT(sizeof(dt_qint32) == 4, "bad dt_qint32 size"); +MEGDNN_STATIC_ASSERT(sizeof(dt_byte) == 1, "bad dt_byte size") +MEGDNN_STATIC_ASSERT(sizeof(dt_qint1) == 1, "bad dt_qint1 size") +MEGDNN_STATIC_ASSERT(sizeof(dt_quint8) == 1, "bad dt_quint8 size") +MEGDNN_STATIC_ASSERT(sizeof(dt_qint16) == 2, "bad dt_qint16 size") +MEGDNN_STATIC_ASSERT(sizeof(dt_qint32) == 4, "bad dt_qint32 size") typedef float dt_float32; typedef int32_t dt_int32; typedef int16_t dt_int16; @@ -613,7 +613,7 @@ template struct log { static MEGDNN_CONSTEXPR size_t value = log<(n >> 1)>::value + 1; #if MEGDNN_CC_HOST - MEGDNN_STATIC_ASSERT((n & (n - 1)) == 0, "only full power number can have log"); + MEGDNN_STATIC_ASSERT((n & (n - 1)) == 0, "only full power number can have log") #endif }; template <> diff --git a/dnn/include/megdnn/thin/small_vector.h b/dnn/include/megdnn/thin/small_vector.h index b6ccf1901..80d5404a6 100644 --- a/dnn/include/megdnn/thin/small_vector.h +++ b/dnn/include/megdnn/thin/small_vector.h @@ -333,7 +333,7 @@ public: } else if (n > this->size()) { if (this->capacity() < n) this->grow(n); - for (auto it = this->end(), end = this->begin() + n; it != end; ++it) + for (iterator it = this->end(), end = this->begin() + n; it != end; ++it) new (&*it) T(); this->set_end(this->begin() + n); } diff --git a/dnn/src/common/basic_types.cpp b/dnn/src/common/basic_types.cpp index 35375ed03..be2d97696 100644 --- a/dnn/src/common/basic_types.cpp +++ b/dnn/src/common/basic_types.cpp @@ -144,7 +144,7 @@ size_t TensorShape::total_nr_elems() const { } bool TensorShape::eq_shape(const TensorShape& rhs) const { - MEGDNN_STATIC_ASSERT(MAX_NDIM == 7, "please update the code"); + MEGDNN_STATIC_ASSERT(MAX_NDIM == 7, "please update the code") if (ndim == rhs.ndim) { size_t eq = 0; switch (ndim) { @@ -379,7 +379,7 @@ bool TensorLayout::eq_layout(const TensorLayout& rhs) const { dtype == rhs.dtype, "could not compare layout on different dtypes: %s vs %s", dtype.name(), rhs.dtype.name()); - MEGDNN_STATIC_ASSERT(MAX_NDIM == 7, "please update the code"); + MEGDNN_STATIC_ASSERT(MAX_NDIM == 7, "please update the code") auto ax = [](size_t shape0, size_t shape1, ptrdiff_t stride0, ptrdiff_t stride1) { return (shape0 == shape1) & ((shape0 <= 1) | (stride0 == stride1)); diff --git 
a/dnn/src/common/elemwise_helper.cuh b/dnn/src/common/elemwise_helper.cuh index e91f92a78..e7ca6f74e 100644 --- a/dnn/src/common/elemwise_helper.cuh +++ b/dnn/src/common/elemwise_helper.cuh @@ -98,10 +98,10 @@ MEGDNN_DEVICE MEGDNN_HOST inline T round_mulh_saturate(T a, T b) { MEGDNN_STATIC_ASSERT( std::numeric_limits::digits <= 32, "Portable RMULH is not supported for integer " - "types larger than 32 bits."); + "types larger than 32 bits.") MEGDNN_STATIC_ASSERT( std::numeric_limits::is_integer, - "Input types should be integer for RMULH"); + "Input types should be integer for RMULH") bool overflow = a == b && a == DTypeTrait::min(); // TODO: This really should be // rounding_shift_right_away_from_zero, but we haven't yet found a fast way diff --git a/dnn/src/common/named_tensor.cpp b/dnn/src/common/named_tensor.cpp index 9e952475d..2b97230c1 100644 --- a/dnn/src/common/named_tensor.cpp +++ b/dnn/src/common/named_tensor.cpp @@ -185,7 +185,7 @@ NamedTensorShape::NamedTensorShape(std::initializer_list init_shape) : NamedTensorShape(SmallVector{init_shape}) {} bool NamedTensorShape::eq_shape(const NamedTensorShape& rhs) const { - MEGDNN_STATIC_ASSERT(MAX_NDIM == 7, "please update the code"); + MEGDNN_STATIC_ASSERT(MAX_NDIM == 7, "please update the code") if (ndim == rhs.ndim) { size_t eq = 0; switch (ndim) { diff --git a/dnn/src/cuda/argsort/argsort.cu b/dnn/src/cuda/argsort/argsort.cu index 25236c9fe..16326f034 100644 --- a/dnn/src/cuda/argsort/argsort.cu +++ b/dnn/src/cuda/argsort/argsort.cu @@ -16,6 +16,7 @@ #include "src/cuda/cub/device/device_radix_sort.cuh" #include "src/cuda/cub/device/device_segmented_radix_sort.cuh" +#include "src/cuda/kernel_common/diagnostic_prologue.cuh" using namespace megdnn; using namespace cuda; @@ -175,4 +176,5 @@ INST_CUB_SORT(uint64_t) #undef INST_FORWARD } // namespace cuda } // namespace megdnn +#include "src/cuda/kernel_common/diagnostic_epilogue.cuh" // vim: ft=cuda syntax=cuda.doxygen diff --git a/dnn/src/cuda/checksum/kern.cu b/dnn/src/cuda/checksum/kern.cu index 8edc8a4d0..81d5b433e 100644 --- a/dnn/src/cuda/checksum/kern.cu +++ b/dnn/src/cuda/checksum/kern.cu @@ -10,6 +10,7 @@ */ #include "./kern.cuh" +#include "src/cuda/kernel_common/diagnostic_prologue.cuh" #include "src/cuda/reduce_helper.cuh" #include "src/cuda/utils.cuh" @@ -63,4 +64,5 @@ void megdnn::cuda::checksum::calc( size_t megdnn::cuda::checksum::get_workspace_in_bytes(size_t nr_elem) { return get_reduce_workspace_in_bytes(1, nr_elem, 1); } +#include "src/cuda/kernel_common/diagnostic_epilogue.cuh" // vim: ft=cpp syntax=cpp.doxygen diff --git a/dnn/src/cuda/conv_bias/chanwise/kern.cuh b/dnn/src/cuda/conv_bias/chanwise/kern.cuh index 58d239e8c..15cb5ec59 100644 --- a/dnn/src/cuda/conv_bias/chanwise/kern.cuh +++ b/dnn/src/cuda/conv_bias/chanwise/kern.cuh @@ -25,13 +25,13 @@ namespace conv_bias { namespace chanwise { struct Param { - uint32_t batch, src_chl, src_h, src_w, chl_mul, flt_h, flt_w, out_h, out_w, pad_h, - pad_w, stride_h, stride_w, dilation_h, dilation_w; + int batch, src_chl, src_h, src_w, chl_mul, flt_h, flt_w, out_h, out_w, pad_h, pad_w, + stride_h, stride_w, dilation_h, dilation_w; bool is_compute_deafult; #if MEGDNN_CC_HOST static Param from_fwd_args( const BiasForwardSizeArgs& args, bool is_compute_deafult_ = true) { -#define U(v) static_cast(v) +#define U(v) static_cast(v) auto&& src = args.src_layout->shape; auto&& dst = args.dst_layout->shape; auto&& fm = args.filter_meta; diff --git a/dnn/src/cuda/conv_bias/matmul/inplace_matmul_impl.cu 
b/dnn/src/cuda/conv_bias/matmul/inplace_matmul_impl.cu index af6c1860d..be44c4491 100644 --- a/dnn/src/cuda/conv_bias/matmul/inplace_matmul_impl.cu +++ b/dnn/src/cuda/conv_bias/matmul/inplace_matmul_impl.cu @@ -9,6 +9,7 @@ * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */ #include "src/cuda/conv_bias/matmul/inplace_matmul_impl.cuh" +#include "src/cuda/kernel_common/diagnostic_prologue.cuh" #include "src/cuda/utils.cuh" using namespace megdnn; @@ -386,4 +387,5 @@ void conv_bias::exec_inplace_matmul_fwd( after_kernel_launch(); } +#include "src/cuda/kernel_common/diagnostic_epilogue.cuh" // vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/conv_bias/quint4x4x32_wmma/reduce_with_scale_data.cu b/dnn/src/cuda/conv_bias/quint4x4x32_wmma/reduce_with_scale_data.cu index 6041f267b..c436eb47e 100644 --- a/dnn/src/cuda/conv_bias/quint4x4x32_wmma/reduce_with_scale_data.cu +++ b/dnn/src/cuda/conv_bias/quint4x4x32_wmma/reduce_with_scale_data.cu @@ -37,6 +37,7 @@ #include "./reduce_with_scale_data.cuh" #include "./wmma_conv_integer_u4.cuh" #include "src/cuda/cub/util_ptx.cuh" +#include "src/cuda/kernel_common/diagnostic_prologue.cuh" using namespace megdnn; using namespace cuda; @@ -50,7 +51,7 @@ template < struct TileCounter { MEGDNN_STATIC_ASSERT( thread_blk_x % WARP_SIZE == 0, - "thread block size in dim x not divided by warpSize"); + "thread block size in dim x not divided by warpSize") static const size_t spatial_tile_x = thread_blk_x * pixels_per_thread_x; static const size_t spatial_tile_y = thread_blk_y * pixels_per_thread_y; static const size_t global_load_tile_x = @@ -678,4 +679,5 @@ void megdnn::cuda::do_dispatch_reduce_with_scale_data_u4( after_kernel_launch(); } +#include "src/cuda/kernel_common/diagnostic_epilogue.cuh" // vim: ft=cpp syntax=cuda.doxygen diff --git a/dnn/src/cuda/conv_bias/reduce_filter.cu b/dnn/src/cuda/conv_bias/reduce_filter.cu index 779c3184e..28d027e3c 100644 --- a/dnn/src/cuda/conv_bias/reduce_filter.cu +++ b/dnn/src/cuda/conv_bias/reduce_filter.cu @@ -40,6 +40,7 @@ #include "src/cuda/utils.cuh" #include "src/cuda/integer_subbyte_utils.cuh" +#include "src/cuda/kernel_common/diagnostic_prologue.cuh" #include "src/cuda/reduce_helper.cuh" using namespace megdnn; @@ -163,4 +164,5 @@ size_t megdnn::cuda::do_dispatch_reduce_workspace_in_bytes( return get_reduce_workspace_in_bytes>(A, B, C); } +#include "src/cuda/kernel_common/diagnostic_epilogue.cuh" // vim: ft=cpp syntax=cuda.doxygen diff --git a/dnn/src/cuda/convolution3d/backward_filter/inplace_matmul_impl.cu b/dnn/src/cuda/convolution3d/backward_filter/inplace_matmul_impl.cu index 0f06a3fe5..9053adec0 100644 --- a/dnn/src/cuda/convolution3d/backward_filter/inplace_matmul_impl.cu +++ b/dnn/src/cuda/convolution3d/backward_filter/inplace_matmul_impl.cu @@ -11,6 +11,7 @@ #include #include #include "./inplace_matmul_impl.cuh" +#include "src/cuda/kernel_common/diagnostic_prologue.cuh" #include "src/cuda/utils.cuh" using namespace megdnn; using namespace cuda; @@ -429,4 +430,5 @@ void convolution3d::exec_inplace_matmul_bwd_filter( after_kernel_launch(); } +#include "src/cuda/kernel_common/diagnostic_epilogue.cuh" // vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/convolution3d/forward/inplace_matmul_impl.cu b/dnn/src/cuda/convolution3d/forward/inplace_matmul_impl.cu index 4fc447473..56b9cd747 100644 --- a/dnn/src/cuda/convolution3d/forward/inplace_matmul_impl.cu +++ b/dnn/src/cuda/convolution3d/forward/inplace_matmul_impl.cu @@ -11,6 +11,7 @@ #include #include #include 
"./inplace_matmul_impl.cuh" +#include "src/cuda/kernel_common/diagnostic_prologue.cuh" #include "src/cuda/utils.cuh" using namespace megdnn; using namespace cuda; @@ -401,4 +402,5 @@ void convolution3d::exec_inplace_matmul_fwd( after_kernel_launch(); } +#include "src/cuda/kernel_common/diagnostic_epilogue.cuh" // vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/convolution_helper/config.cuh b/dnn/src/cuda/convolution_helper/config.cuh index 35218ba1a..6163a357a 100644 --- a/dnn/src/cuda/convolution_helper/config.cuh +++ b/dnn/src/cuda/convolution_helper/config.cuh @@ -54,9 +54,9 @@ struct RegBlockConfig { static int constexpr reg_n = reg_n_; static int constexpr reg_k = reg_k_; MEGDNN_STATIC_ASSERT( - reg_m % pack_size == 0, "reg_m must be a multiple of pack_size"); + reg_m % pack_size == 0, "reg_m must be a multiple of pack_size") MEGDNN_STATIC_ASSERT( - reg_k % pack_size == 0, "reg_k must be a multiple of pack_size"); + reg_k % pack_size == 0, "reg_k must be a multiple of pack_size") static int constexpr reg_k_packed = reg_k / pack_size; static int constexpr reg_m_packed = reg_m / pack_size; static int constexpr reg_width = reg_width_; diff --git a/dnn/src/cuda/convolution_helper/conv_trait/ibatch_conv_trait.cuh b/dnn/src/cuda/convolution_helper/conv_trait/ibatch_conv_trait.cuh index c67bb20df..f4c7c6a87 100644 --- a/dnn/src/cuda/convolution_helper/conv_trait/ibatch_conv_trait.cuh +++ b/dnn/src/cuda/convolution_helper/conv_trait/ibatch_conv_trait.cuh @@ -57,7 +57,7 @@ namespace convolution { using KernLayout = _kern_layout; \ using OutputLayout = _output_layout; \ using Param = _conv_param; \ - static constexpr bool check_bounds = check_bounds_; + static constexpr bool check_bounds = check_bounds_ #define MEGDNN_COMMA , template < @@ -80,7 +80,7 @@ struct IBatchConvTrait_f1x1s1x1 { static int constexpr block_tile_batch = RegBlockConfig::reg_n; MEGDNN_STATIC_ASSERT( block_tile_batch == 1, - "this algorithm does not unroll on batch dimension"); + "this algorithm does not unroll on batch dimension") static int constexpr block_tile_out_height_width = RegBlockConfig::reg_width * ThreadConfig::nr_thread_x; static int constexpr block_tile_in_channel = RegBlockConfig::reg_k; @@ -162,7 +162,7 @@ struct IBatchConvTrait { static int constexpr block_tile_batch = RegBlockConfig::reg_n; MEGDNN_STATIC_ASSERT( block_tile_batch == 1, - "this algorithm does not unroll on batch dimension"); + "this algorithm does not unroll on batch dimension") static int constexpr block_tile_out_height_width = RegBlockConfig::reg_width * ThreadConfig::nr_thread_x; static int constexpr block_tile_in_channel = RegBlockConfig::reg_k; diff --git a/dnn/src/cuda/convolution_helper/conv_trait/iconv_imma_trait.cuh b/dnn/src/cuda/convolution_helper/conv_trait/iconv_imma_trait.cuh index 0481e7394..443151779 100644 --- a/dnn/src/cuda/convolution_helper/conv_trait/iconv_imma_trait.cuh +++ b/dnn/src/cuda/convolution_helper/conv_trait/iconv_imma_trait.cuh @@ -54,7 +54,7 @@ namespace convolution { using KernLayout = _kern_layout; \ using OutputLayout = _output_layout; \ using Param = _conv_param; \ - static constexpr bool check_bounds = check_bounds_; + static constexpr bool check_bounds = check_bounds_ #define MEGDNN_COMMA , template < @@ -183,7 +183,7 @@ struct IConvIMMATraitReorderFilter { check_bounds MEGDNN_COMMA IMMAConfig MEGDNN_COMMA WarpTileConfig MEGDNN_COMMA ThreadConfig>::src_dtype MEGDNN_COMMA src_dtype>::value == true, - "data type of input tensor should be int8_t"); + "data type of input tensor should be int8_t") using 
DataTileCount = typename IConvIMMATrait< check_bounds, IMMAConfig, WarpTileConfig, ThreadConfig>::DataTileCount; struct FilterTileCount { @@ -284,7 +284,7 @@ struct IConvIMMATraitUnrollWidth { check_bounds MEGDNN_COMMA IMMAConfig MEGDNN_COMMA WarpTileConfig MEGDNN_COMMA ThreadConfig>:: filter_dtype MEGDNN_COMMA filter_dtype>::value == true, - "data type of filter tensor should be int8_t"); + "data type of filter tensor should be int8_t") using FilterTileCount = typename IConvIMMATraitReorderFilter< check_bounds, IMMAConfig, WarpTileConfig, ThreadConfig>::FilterTileCount; using BlockTileIterator = @@ -345,7 +345,7 @@ struct IConvIMMATraitUnrollWidthV2 { MEGDNN_STATIC_ASSERT( WarpTileConfig::warp_tile_k == 1, "kernel unrolling along width axis assumes tile k " - "in warp-level must be 1"); + "in warp-level must be 1") using copy_t = int4; using smem_storage_dtype = smem_storage_dtype; static int constexpr load_width = sizeof(copy_t) / sizeof(smem_storage_dtype); @@ -388,7 +388,7 @@ struct IConvIMMATraitUnrollWidthV2 { MEGDNN_STATIC_ASSERT( WarpTileConfig::warp_tile_k == 1, "kernel unrolling along width axis assumes tile k " - "in warp-level must be 1"); + "in warp-level must be 1") using copy_t = int4; using smem_storage_dtype = smem_storage_dtype; static int constexpr load_width = sizeof(copy_t) / sizeof(smem_storage_dtype); diff --git a/dnn/src/cuda/convolution_helper/conv_trait/iconv_trait.cuh b/dnn/src/cuda/convolution_helper/conv_trait/iconv_trait.cuh index d4c52724e..5adcbdd9b 100644 --- a/dnn/src/cuda/convolution_helper/conv_trait/iconv_trait.cuh +++ b/dnn/src/cuda/convolution_helper/conv_trait/iconv_trait.cuh @@ -54,7 +54,7 @@ namespace convolution { using KernLayout = _kern_layout; \ using OutputLayout = _output_layout; \ using Param = _conv_param; \ - static constexpr bool check_bounds = check_bounds_; + static constexpr bool check_bounds = check_bounds_ #define MEGDNN_COMMA , template < @@ -175,7 +175,7 @@ struct IConvTraitUnrollWidth { check_bounds MEGDNN_COMMA ldg_dtype MEGDNN_COMMA RegBlockConfig MEGDNN_COMMA ThreadConfig>:: filter_dtype MEGDNN_COMMA filter_dtype>::value == true, - "data type of filter tensor should be int8_t"); + "data type of filter tensor should be int8_t") using FilterTileCount = typename IConvTrait< check_bounds, ldg_dtype, RegBlockConfig, ThreadConfig>::FilterTileCount; using BlockTileIterator = diff --git a/dnn/src/cuda/convolution_helper/global_memory_visitor/global_memory_visitor_cixhw.cuh b/dnn/src/cuda/convolution_helper/global_memory_visitor/global_memory_visitor_cixhw.cuh index 554202fe4..881c8ab96 100644 --- a/dnn/src/cuda/convolution_helper/global_memory_visitor/global_memory_visitor_cixhw.cuh +++ b/dnn/src/cuda/convolution_helper/global_memory_visitor/global_memory_visitor_cixhw.cuh @@ -108,7 +108,7 @@ struct Global2ShareMemVisitor_CIxHW; DEF(true, Layout) copy_t reg[TileCount::reg_h][TileCount::reg_w][TileCount::reg_d]; -MEGDNN_STATIC_ASSERT(load_width == 4, "load four element from src tensor per time"); +MEGDNN_STATIC_ASSERT(load_width == 4, "load four element from src tensor per time") __device__ Global2ShareMemVisitor_CIxHW( smem_storage_dtype* smem_, const int* __restrict__ offset_) diff --git a/dnn/src/cuda/convolution_helper/global_memory_visitor/global_memory_visitor_imma_cixn.cuh b/dnn/src/cuda/convolution_helper/global_memory_visitor/global_memory_visitor_imma_cixn.cuh index 38ff7d6a7..d8e386c6b 100644 --- a/dnn/src/cuda/convolution_helper/global_memory_visitor/global_memory_visitor_imma_cixn.cuh +++ 
b/dnn/src/cuda/convolution_helper/global_memory_visitor/global_memory_visitor_imma_cixn.cuh @@ -169,7 +169,7 @@ const int gl_load_x = tid - gl_load_y * TileCount::load_x; copy_t reg[TileCount::reg_h][TileCount::reg_w]; MEGDNN_STATIC_ASSERT( std::is_same::value == true, - "ldg data type must be int4 for this memory visitor"); + "ldg data type must be int4 for this memory visitor") __device__ __forceinline__ void init_stride(Layout layout) { stride = layout.channel_stride / TileCount::ldg_load_width; diff --git a/dnn/src/cuda/convolution_helper/global_memory_visitor/global_memory_visitor_imma_cixwixn.cuh b/dnn/src/cuda/convolution_helper/global_memory_visitor/global_memory_visitor_imma_cixwixn.cuh index 0c44f5d52..3a3f2791c 100644 --- a/dnn/src/cuda/convolution_helper/global_memory_visitor/global_memory_visitor_imma_cixwixn.cuh +++ b/dnn/src/cuda/convolution_helper/global_memory_visitor/global_memory_visitor_imma_cixwixn.cuh @@ -65,7 +65,7 @@ const int gl_load_x = tid - gl_load_y * TileCount::load_x; copy_t reg[TileCount::reg_h][TileCount::reg_w][TileCount::reg_d]; MEGDNN_STATIC_ASSERT( std::is_same::value == true, - "ldg data type must be int4 for this memory visitor"); + "ldg data type must be int4 for this memory visitor") __device__ __forceinline__ void init_stride(Layout layout) { stride = layout.channel_stride / TileCount::ldg_load_width; diff --git a/dnn/src/cuda/convolution_helper/global_memory_visitor/global_memory_visitor_imma_cixwoxn.cuh b/dnn/src/cuda/convolution_helper/global_memory_visitor/global_memory_visitor_imma_cixwoxn.cuh index cb14ea429..6b55c0205 100644 --- a/dnn/src/cuda/convolution_helper/global_memory_visitor/global_memory_visitor_imma_cixwoxn.cuh +++ b/dnn/src/cuda/convolution_helper/global_memory_visitor/global_memory_visitor_imma_cixwoxn.cuh @@ -66,7 +66,7 @@ const int gl_load_x = tid - gl_load_y * TileCount::load_x; copy_t reg[TileCount::reg_h][TileCount::reg_w][TileCount::reg_d]; MEGDNN_STATIC_ASSERT( std::is_same::value == true, - "ldg data type must be int4 for this memory visitor"); + "ldg data type must be int4 for this memory visitor") __device__ __forceinline__ void init_stride(Layout layout) { stride = layout.channel_stride / TileCount::ldg_load_width; diff --git a/dnn/src/cuda/convolution_helper/global_memory_visitor/global_memory_visitor_imma_fwxco.cuh b/dnn/src/cuda/convolution_helper/global_memory_visitor/global_memory_visitor_imma_fwxco.cuh index 6b576a347..6e67bfc36 100644 --- a/dnn/src/cuda/convolution_helper/global_memory_visitor/global_memory_visitor_imma_fwxco.cuh +++ b/dnn/src/cuda/convolution_helper/global_memory_visitor/global_memory_visitor_imma_fwxco.cuh @@ -63,7 +63,7 @@ const int gl_load_x = tid - gl_load_y * TileCount::load_x; copy_t reg[TileCount::reg_h][TileCount::reg_w]; MEGDNN_STATIC_ASSERT( std::is_same::value == true, - "ldg data type must be int4 for this memory visitor"); + "ldg data type must be int4 for this memory visitor") __device__ __forceinline__ void init_stride(Layout layout) { stride = layout.width_stride / TileCount::ldg_load_width; diff --git a/dnn/src/cuda/cumsum/kern_impl.cu b/dnn/src/cuda/cumsum/kern_impl.cu index d16a6c288..08dcfc03a 100644 --- a/dnn/src/cuda/cumsum/kern_impl.cu +++ b/dnn/src/cuda/cumsum/kern_impl.cu @@ -12,6 +12,7 @@ #include "./kern.cuh" #include "./kern_helper.cuh" #include "./kern_impl.cuinl" +#include "src/cuda/kernel_common/diagnostic_prologue.cuh" using namespace megdnn::cuda; using namespace cumsum::detail::cubwrap; @@ -88,4 +89,5 @@ void cumsum::get_BX_BY( BY = 512 / BX; } +#include 
"src/cuda/kernel_common/diagnostic_epilogue.cuh" // vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/elemwise/special_kerns.inl b/dnn/src/cuda/elemwise/special_kerns.inl index c5cffcdc3..160f072c6 100644 --- a/dnn/src/cuda/elemwise/special_kerns.inl +++ b/dnn/src/cuda/elemwise/special_kerns.inl @@ -125,7 +125,7 @@ struct OpCallerBinary, PVis0, PVis1> { PVis0 par0; PVis1 par1; MEGDNN_STATIC_ASSERT( - PVis0::packed_size == PVis1::packed_size, "vector size mismatch"); + PVis0::packed_size == PVis1::packed_size, "vector size mismatch") static const uint32_t packed_size = PVis0::packed_size; __device__ __forceinline__ void thread_init(uint32_t idx) { diff --git a/dnn/src/cuda/kernel_common/diagnostic_epilogue.cuh b/dnn/src/cuda/kernel_common/diagnostic_epilogue.cuh new file mode 100644 index 000000000..1b87671ae --- /dev/null +++ b/dnn/src/cuda/kernel_common/diagnostic_epilogue.cuh @@ -0,0 +1,24 @@ +/** + * \file dnn/src/cuda/kernel_common/diagnostic_epilogue.cuh + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#ifdef __GNUC__ +#if CUDA_VERSION < 9020 +#pragma GCC diagnostic pop +#endif +#endif + +#ifdef MEGDNN_DIAGNOSTIC_PROLOGUE_INCLUDED +#undef MEGDNN_DIAGNOSTIC_PROLOGUE_INCLUDED +#else +#error "diagnostic_epilogue.h must be included after diagnostic_prologue.h" +#endif + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/kernel_common/diagnostic_prologue.cuh b/dnn/src/cuda/kernel_common/diagnostic_prologue.cuh new file mode 100644 index 000000000..7ef1ced30 --- /dev/null +++ b/dnn/src/cuda/kernel_common/diagnostic_prologue.cuh @@ -0,0 +1,29 @@ +/** + * \file dnn/src/cuda/kernel_common/diagnostic_prologue.cuh + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#ifdef MEGDNN_DIAGNOSTIC_PROLOGUE_INCLUDED +#error "diagnostic_prologue.h included twice without including diagnostic_epilogue.h" +#else +#define MEGDNN_DIAGNOSTIC_PROLOGUE_INCLUDED +#endif + +//! see +//! https://stackoverflow.com/questions/49836419/how-to-hide-nvccs-function-was-declared-but-never-referenced-warnings +//! for more details. +#ifdef __GNUC__ +#if CUDA_VERSION < 9020 +#pragma GCC diagnostic push +#pragma diag_suppress 177 // suppress "function was declared but never referenced + // warning" +#endif +#endif + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/local_share/backward_data/local_share_bwd_data_f32_implicit_gemm.cu b/dnn/src/cuda/local_share/backward_data/local_share_bwd_data_f32_implicit_gemm.cu index e58bc7bed..19dd3e1f8 100644 --- a/dnn/src/cuda/local_share/backward_data/local_share_bwd_data_f32_implicit_gemm.cu +++ b/dnn/src/cuda/local_share/backward_data/local_share_bwd_data_f32_implicit_gemm.cu @@ -9,6 +9,7 @@ * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
*/ #include "./local_share_bwd_data.cuh" +#include "src/cuda/kernel_common/diagnostic_prologue.cuh" using namespace megdnn; using namespace cuda; @@ -580,4 +581,5 @@ void megdnn::cuda::local_share_bwd_data::_do_local_share_bwd_data_implicit_gemm( } } +#include "src/cuda/kernel_common/diagnostic_epilogue.cuh" // vim: syntax=cuda.doxygen diff --git a/dnn/src/cuda/local_share/backward_filter/local_share_bwd_filter_f32_implicit_gemm.cu b/dnn/src/cuda/local_share/backward_filter/local_share_bwd_filter_f32_implicit_gemm.cu index 36bfab579..5bb958781 100644 --- a/dnn/src/cuda/local_share/backward_filter/local_share_bwd_filter_f32_implicit_gemm.cu +++ b/dnn/src/cuda/local_share/backward_filter/local_share_bwd_filter_f32_implicit_gemm.cu @@ -10,6 +10,7 @@ * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */ #include "./local_share_bwd_filter.cuh" +#include "src/cuda/kernel_common/diagnostic_prologue.cuh" using namespace megdnn; using namespace cuda; @@ -495,4 +496,5 @@ void megdnn::cuda::local_share_bwd_filter::_do_local_share_bwd_filter_implicit_g } } +#include "src/cuda/kernel_common/diagnostic_epilogue.cuh" // vim: syntax=cuda.doxygen diff --git a/dnn/src/cuda/local_share/forward/local_share_fwd_chwn_f32_batch_size_aware.cu b/dnn/src/cuda/local_share/forward/local_share_fwd_chwn_f32_batch_size_aware.cu index 0dd1a8894..60f6b454e 100644 --- a/dnn/src/cuda/local_share/forward/local_share_fwd_chwn_f32_batch_size_aware.cu +++ b/dnn/src/cuda/local_share/forward/local_share_fwd_chwn_f32_batch_size_aware.cu @@ -9,6 +9,7 @@ * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */ #include "./local_share_forward.cuh" +#include "src/cuda/kernel_common/diagnostic_prologue.cuh" using namespace megdnn; using namespace cuda; @@ -54,7 +55,7 @@ struct FilterTileCount { MEGDNN_STATIC_ASSERT( smem_w % ThreadConfig::nr_thread_x == 0, - "col of share memory must be divided by nr_thread_x"); + "col of share memory must be divided by nr_thread_x") static int const reg_h = (smem_h + ThreadConfig::nr_thread_y - 1) / ThreadConfig::nr_thread_y; static int const reg_w = smem_w / ThreadConfig::nr_thread_x; @@ -1292,4 +1293,5 @@ void megdnn::cuda::local_share::_do_local_share_convolution_large_batch_size( } } +#include "src/cuda/kernel_common/diagnostic_epilogue.cuh" // vim: syntax=cuda.doxygen diff --git a/dnn/src/cuda/local_share/forward/local_share_fwd_chwn_f32_batch_size_aware_small_image.cu b/dnn/src/cuda/local_share/forward/local_share_fwd_chwn_f32_batch_size_aware_small_image.cu index 5fd46d85c..7578f2217 100644 --- a/dnn/src/cuda/local_share/forward/local_share_fwd_chwn_f32_batch_size_aware_small_image.cu +++ b/dnn/src/cuda/local_share/forward/local_share_fwd_chwn_f32_batch_size_aware_small_image.cu @@ -10,6 +10,7 @@ * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
*/ #include "./local_share_forward.cuh" +#include "src/cuda/kernel_common/diagnostic_prologue.cuh" using namespace megdnn; using namespace cuda; @@ -582,4 +583,5 @@ void megdnn::cuda::local_share:: } } +#include "src/cuda/kernel_common/diagnostic_epilogue.cuh" // vim: syntax=cuda.doxygen diff --git a/dnn/src/cuda/matrix_mul/uint4x4x32_wmma/wmma_matrix_mul_u4.cu b/dnn/src/cuda/matrix_mul/uint4x4x32_wmma/wmma_matrix_mul_u4.cu index 8ecd6b42f..b944019f6 100644 --- a/dnn/src/cuda/matrix_mul/uint4x4x32_wmma/wmma_matrix_mul_u4.cu +++ b/dnn/src/cuda/matrix_mul/uint4x4x32_wmma/wmma_matrix_mul_u4.cu @@ -79,7 +79,7 @@ struct GlobalToShareMemStream { GlobalToShareMemStreamConfig_::BlockSize == GlobalToShareMemStreamConfig_::CACHE_SIZE * BlockConfig_::WARPS_PER_BLOCK, - "Block size mismatch"); + "Block size mismatch") uint8_t* smem; const uint8_t* g_ptr; diff --git a/dnn/src/cuda/pooling/pooling2d_qint.cu b/dnn/src/cuda/pooling/pooling2d_qint.cu index e9da27256..2f85b1256 100644 --- a/dnn/src/cuda/pooling/pooling2d_qint.cu +++ b/dnn/src/cuda/pooling/pooling2d_qint.cu @@ -11,6 +11,7 @@ */ #include "./pooling2d_qint.cuh" #include "src/common/opr_param_defs_enumv.cuh" +#include "src/cuda/kernel_common/diagnostic_prologue.cuh" #include "src/cuda/query_blocksize.cuh" using namespace megdnn; @@ -350,7 +351,7 @@ __global__ void pooling2d_device_template_nchwc( static int constexpr section = pack_byte / sizeof(ldg_type); MEGDNN_STATIC_ASSERT( ldg_width == ldg_width_assert, - "pooling2d (NCHW64) kernel must use 128bit width ldg instruction"); + "pooling2d (NCHW64) kernel must use 128bit width ldg instruction") const int c_packed = param.c / pack_size; const int batch = tid / (param.ho * param.wo * c_packed * section); const int batch_residual = tid - batch * param.ho * param.wo * c_packed * section; @@ -402,7 +403,7 @@ __global__ void pooling2d_device_template_nhwc( static int constexpr ldg_width_bytes = sizeof(ldg_type); MEGDNN_STATIC_ASSERT( ldg_width == ldg_width_assert, - "pooling2d (NHWC) kernel must ldg_width == ldg_width_assert"); + "pooling2d (NHWC) kernel must ldg_width == ldg_width_assert") const int c_packed = param.c / pack_size; const int batch = tid / (param.ho * param.wo * c_packed); const int batch_residual = tid - batch * param.ho * param.wo * c_packed; @@ -691,4 +692,6 @@ void megdnn::cuda::pooling2d::do_pooling2d_int4_nhwc( kern<<>>(d_src, d_dst, param, zero_point); after_kernel_launch(); } + +#include "src/cuda/kernel_common/diagnostic_epilogue.cuh" // vim: syntax=cuda.doxygen diff --git a/dnn/src/cuda/query_blocksize_impl.cu b/dnn/src/cuda/query_blocksize_impl.cu index 86044a96b..9adb04eae 100644 --- a/dnn/src/cuda/query_blocksize_impl.cu +++ b/dnn/src/cuda/query_blocksize_impl.cu @@ -9,6 +9,7 @@ * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */ +#include "src/cuda/kernel_common/diagnostic_prologue.cuh" #include "src/cuda/query_blocksize.cuh" #include "src/cuda/utils.cuh" @@ -51,4 +52,5 @@ LaunchConfig cuda::detail::query_launch_config_for_kernel_uncached( return ret; } +#include "src/cuda/kernel_common/diagnostic_epilogue.cuh" // vim: ft=cpp syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/dnn/src/cuda/relayout_format/relayout_format.cu b/dnn/src/cuda/relayout_format/relayout_format.cu index 1e3410a4c..998861085 100644 --- a/dnn/src/cuda/relayout_format/relayout_format.cu +++ b/dnn/src/cuda/relayout_format/relayout_format.cu @@ -10,6 +10,7 @@ * implied. 
*/ +#include "src/cuda/kernel_common/diagnostic_prologue.cuh" #include "src/cuda/query_blocksize.cuh" #include "src/cuda/relayout_format/relayout_format_kern.cuh" @@ -104,7 +105,7 @@ __global__ void kern_nchw_nchwx( MEGDNN_STATIC_ASSERT( std::is_same::value, "Currently this kernel only support accessing tensor " - "src and dst in same data type."); + "src and dst in same data type.") n_stride_src /= size_src_type; ic_stride /= size_src_type; n_stride_dst /= size_dst_type; @@ -585,3 +586,5 @@ void relayout_format::relayout_format_cuda_nchw_nchw4_weight( ic_stride, oc_stride_dst, group_stride_src, group_stride_dst, 0, {}); after_kernel_launch(); } + +#include "src/cuda/kernel_common/diagnostic_epilogue.cuh" diff --git a/dnn/src/cuda/relayout_format/relayout_format_kern.cuh b/dnn/src/cuda/relayout_format/relayout_format_kern.cuh index 889130173..7e37cb36c 100644 --- a/dnn/src/cuda/relayout_format/relayout_format_kern.cuh +++ b/dnn/src/cuda/relayout_format/relayout_format_kern.cuh @@ -478,12 +478,12 @@ struct RelayoutProblem { using Transpose = Transpose_; using CudaPostProcess = CudaPostProcess_; MEGDNN_STATIC_ASSERT( - SrcIterator::chan_blk == DstIterator::chan_blk, "channel block mismatch"); + SrcIterator::chan_blk == DstIterator::chan_blk, "channel block mismatch") MEGDNN_STATIC_ASSERT( - SrcIterator::width == DstIterator::width, "width block mismatch"); + SrcIterator::width == DstIterator::width, "width block mismatch") MEGDNN_STATIC_ASSERT( SrcIterator::size_nbits == DstIterator::size_nbits, - "size in bits of elements mismatch"); + "size in bits of elements mismatch") static constexpr int pack_chan = SrcIterator::chan_blk; static constexpr int pack_width = SrcIterator::width; using DnnSrcType = typename CudaPostProcess::SrcType; diff --git a/dnn/src/cuda/relayout_format/relayout_format_utils.cuh b/dnn/src/cuda/relayout_format/relayout_format_utils.cuh index daa2cf8f6..5ebba2191 100644 --- a/dnn/src/cuda/relayout_format/relayout_format_utils.cuh +++ b/dnn/src/cuda/relayout_format/relayout_format_utils.cuh @@ -80,7 +80,7 @@ inline __device__ DstType make_zero_pad(const uint8_t zero_point) { template <> inline __device__ char4 make_zero_pad(const uint8_t zero_point) { - char izp = reinterpret_cast(zero_point); + signed char izp = reinterpret_cast(zero_point); return {izp, izp, izp, izp}; } diff --git a/dnn/src/cuda/resize/forward.cu b/dnn/src/cuda/resize/forward.cu index deeb21f93..71907a5c2 100644 --- a/dnn/src/cuda/resize/forward.cu +++ b/dnn/src/cuda/resize/forward.cu @@ -16,6 +16,7 @@ #include "src/common/resize.cuh" #include "src/cuda/cv/kernel_common.cuh" +#include "src/cuda/kernel_common/diagnostic_prologue.cuh" using namespace megdnn; using namespace cuda; @@ -319,4 +320,5 @@ INST(int8_t); } // namespace cuda } // namespace megdnn +#include "src/cuda/kernel_common/diagnostic_epilogue.cuh" // vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/type_cvt/kern.cu b/dnn/src/cuda/type_cvt/kern.cu index 94ec995ba..007fd1bef 100644 --- a/dnn/src/cuda/type_cvt/kern.cu +++ b/dnn/src/cuda/type_cvt/kern.cu @@ -13,6 +13,7 @@ #include "megdnn/dtype.h" #include "src/cuda/elemwise_helper.cuh" #include "src/cuda/elemwise_helper_q4.cuh" +#include "src/cuda/kernel_common/diagnostic_prologue.cuh" using namespace megdnn; using namespace cuda; @@ -410,4 +411,5 @@ MEGDNN_FOREACH_COMPUTING_CTYPE(INST_SRC_NORMAL_LOWBIT) } // namespace cuda } // namespace megdnn +#include "src/cuda/kernel_common/diagnostic_epilogue.cuh" // vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/warp_affine/warp_affine.cu 
b/dnn/src/cuda/warp_affine/warp_affine.cu index c52f86e94..d13686ce9 100644 --- a/dnn/src/cuda/warp_affine/warp_affine.cu +++ b/dnn/src/cuda/warp_affine/warp_affine.cu @@ -12,6 +12,7 @@ #include #include "src/common/rounding_converter.cuh" +#include "src/cuda/kernel_common/diagnostic_prologue.cuh" #include "src/cuda/utils.cuh" #include "src/cuda/warp_affine/common.cuh" @@ -288,4 +289,5 @@ INST(int8_t) } // namespace cuda } // namespace megdnn +#include "src/cuda/kernel_common/diagnostic_epilogue.cuh" // vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/warp_perspective/forward.cu b/dnn/src/cuda/warp_perspective/forward.cu index b6c5e8cf8..2d251f094 100644 --- a/dnn/src/cuda/warp_perspective/forward.cu +++ b/dnn/src/cuda/warp_perspective/forward.cu @@ -16,6 +16,7 @@ #include "src/common/rounding_converter.cuh" #include "src/cuda/error_info.cuh" #include "src/cuda/integer_subbyte_utils.cuh" +#include "src/cuda/kernel_common/diagnostic_prologue.cuh" #include "src/cuda/utils.cuh" #include "src/cuda/warp_perspective/common.cuh" @@ -1760,4 +1761,5 @@ INST(dt_quint8, uint8_t, float) } // namespace cuda } // namespace megdnn +#include "src/cuda/kernel_common/diagnostic_epilogue.cuh" // vim: syntax=cpp.doxygen diff --git a/dnn/src/naive/relayout_format/opr_impl.cpp b/dnn/src/naive/relayout_format/opr_impl.cpp index 0f2f91002..0831bb7a7 100644 --- a/dnn/src/naive/relayout_format/opr_impl.cpp +++ b/dnn/src/naive/relayout_format/opr_impl.cpp @@ -51,7 +51,7 @@ void lowbit_recursive_cp( MEGDNN_STATIC_ASSERT( !(8_z % size_nbits), "size in bits of lowbit data type can only be 1, 2, 4 " - "or 8"); + "or 8") if (idx < (src.layout.ndim - 1)) { for (size_t i = 0; i < src.layout[idx]; ++i) { lowbit_recursive_cp( diff --git a/src/opr/test/basic_arith/elemwise.cpp b/src/opr/test/basic_arith/elemwise.cpp index 2ed8d8bd2..b602c1d51 100644 --- a/src/opr/test/basic_arith/elemwise.cpp +++ b/src/opr/test/basic_arith/elemwise.cpp @@ -130,10 +130,10 @@ T do_round_mulh_saturate(T a, T b) { MEGDNN_STATIC_ASSERT( std::numeric_limits::digits <= 32, "Portable RMULH is not supported for integer " - "types larger than 32 bits."); + "types larger than 32 bits.") MEGDNN_STATIC_ASSERT( std::numeric_limits::is_integer, - "Input types should be integer for RMULH"); + "Input types should be integer for RMULH") bool overflow = a == b && a == DTypeTrait::min(); // TODO: This really should be // rounding_shift_right_away_from_zero, but we haven't yet found a fast -- GitLab
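Two warnings are addressed by this patch. The "extra ';' ignored" warning on cuda9.0/9.1 comes from macro expansions that leave a stray semicolon behind: the patch moves the terminating ';' of MEGDNN_STATIC_ASSERT into the macro itself (so every call site in dnn/ drops its own), and conversely removes the trailing ';' from the DEF() trait macros so their call sites supply it. The "declared but never referenced" warning (nvcc diagnostic 177) is handled by the new diagnostic_prologue.cuh / diagnostic_epilogue.cuh pair, which each affected .cu file now uses to bracket its contents: on GCC hosts building with CUDA older than 9.2 the prologue pushes the diagnostic state and suppresses 177, and the epilogue pops it again and verifies that the two headers are included as a pair (on newer CUDA or non-GCC hosts they reduce to that pairing check, so the wrapping is harmless). The snippet below is a minimal sketch of the intended usage in a kernel source, not code taken from the diff; the MEGDNN_STATIC_ASSERT body is copied from the patched arch.h, while unused_device_helper and copy_kernel are made-up placeholders.

// sketch.cu -- illustrative only, assuming the megdnn source tree on the include path
// Copied from dnn/include/megdnn/arch.h after this patch: the macro supplies the
// trailing ';' itself, so call sites are written without one.
#define MEGDNN_STATIC_ASSERT(cond, msg) static_assert(cond, msg);

MEGDNN_STATIC_ASSERT(sizeof(int) == 4, "bad int size")  // note: no trailing ';'

// Bracket the translation unit with the new headers, exactly as the patched
// kernel sources do.
#include "src/cuda/kernel_common/diagnostic_prologue.cuh"

// A static helper that some instantiations never reference; without the prologue,
// nvcc on CUDA 9.0/9.1 would emit warning 177 ("declared but never referenced") for it.
static __device__ int unused_device_helper(int x) {
    return x + 1;
}

// An ordinary kernel, unaffected by the suppression.
__global__ void copy_kernel(const int* src, int* dst, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        dst[i] = src[i];
}

#include "src/cuda/kernel_common/diagnostic_epilogue.cuh"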