Commit 14e9ad62 authored by Megvii Engine Team

fix(megdnn): fix declared-but-never-referenced and extra-';'-ignored warnings emitted on cuda9.0~cuda9.1

GitOrigin-RevId: f6db42e3958499c214bf3fc6f15ed7e8d8321da3
Parent 4c0bff1d
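
The core of this change is in the MEGDNN_STATIC_ASSERT macro: the trailing semicolon now lives inside the macro (#define MEGDNN_STATIC_ASSERT(cond, msg) static_assert(cond, msg);), so every call site throughout the diff drops its own ';'. Presumably, on build configurations where the macro is compiled out (for example device passes on old nvcc, which are not shown in this diff), a call site that kept its own semicolon would otherwise leave a stray ';' that cuda9.0/9.1 reports as "extra ';' ignored". A minimal sketch of the resulting call-site convention, using a hypothetical DemoSizes struct purely for illustration:

#define MEGDNN_STATIC_ASSERT(cond, msg) static_assert(cond, msg);

struct DemoSizes {  // hypothetical example type, not part of this commit
    int payload[4];
    // no trailing ';' at the call site: the macro already supplies it (or
    // expands to nothing where the assert is compiled out), so nvcc never
    // sees a stray semicolon at class or namespace scope
    MEGDNN_STATIC_ASSERT(sizeof(int) == 4, "bad int size")
};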
......@@ -36,15 +36,15 @@
#define MEGDNN_ALWAYS_INLINE inline __attribute__((__always_inline__))
#endif
#define MEGDNN_DEPRECATED __attribute__((deprecated))
#define MEGDNN_PACKED __attribute__((packed))
#define MEGDNN_CONSTEXPR constexpr
#define MEGDNN_NOEXCEPT noexcept
#define MEGDNN_STATIC_ASSERT static_assert
#define MEGDNN_FINAL final
#define MEGDNN_NORETURN __attribute__((noreturn))
#define MEGDNN_WARN_UNUSED_RESULT __attribute__((warn_unused_result))
#define MEGDNN_ATTRIBUTE_TARGET(simd) __attribute__((target(simd)))
#define MEGDNN_DEPRECATED __attribute__((deprecated))
#define MEGDNN_PACKED __attribute__((packed))
#define MEGDNN_CONSTEXPR constexpr
#define MEGDNN_NOEXCEPT noexcept
#define MEGDNN_STATIC_ASSERT(cond, msg) static_assert(cond, msg);
#define MEGDNN_FINAL final
#define MEGDNN_NORETURN __attribute__((noreturn))
#define MEGDNN_WARN_UNUSED_RESULT __attribute__((warn_unused_result))
#define MEGDNN_ATTRIBUTE_TARGET(simd) __attribute__((target(simd)))
#if defined(__clang_major__) && (__clang_major__ >= 7)
#define MEGDNN_LAMBDA_ATTRIBUTE_TARGET(simd) __attribute__((target(simd)))
#else
......@@ -64,10 +64,10 @@
#define MEGDNN_DEPRECATED
#define MEGDNN_PACKED
#define MEGDNN_CONSTEXPR constexpr
#define MEGDNN_NOEXCEPT noexcept
#define MEGDNN_STATIC_ASSERT static_assert
#define MEGDNN_FINAL final
#define MEGDNN_CONSTEXPR constexpr
#define MEGDNN_NOEXCEPT noexcept
#define MEGDNN_STATIC_ASSERT(cond, msg) static_assert(cond, msg);
#define MEGDNN_FINAL final
#if defined(_MSC_VER)
#define MEGDNN_NORETURN __declspec(noreturn)
......
......@@ -311,11 +311,11 @@ public:
#ifdef __clang__
#pragma clang diagnostic pop
#endif
MEGDNN_STATIC_ASSERT(sizeof(dt_byte) == 1, "bad dt_byte size");
MEGDNN_STATIC_ASSERT(sizeof(dt_qint1) == 1, "bad dt_qint1 size");
MEGDNN_STATIC_ASSERT(sizeof(dt_quint8) == 1, "bad dt_quint8 size");
MEGDNN_STATIC_ASSERT(sizeof(dt_qint16) == 2, "bad dt_qint16 size");
MEGDNN_STATIC_ASSERT(sizeof(dt_qint32) == 4, "bad dt_qint32 size");
MEGDNN_STATIC_ASSERT(sizeof(dt_byte) == 1, "bad dt_byte size")
MEGDNN_STATIC_ASSERT(sizeof(dt_qint1) == 1, "bad dt_qint1 size")
MEGDNN_STATIC_ASSERT(sizeof(dt_quint8) == 1, "bad dt_quint8 size")
MEGDNN_STATIC_ASSERT(sizeof(dt_qint16) == 2, "bad dt_qint16 size")
MEGDNN_STATIC_ASSERT(sizeof(dt_qint32) == 4, "bad dt_qint32 size")
typedef float dt_float32;
typedef int32_t dt_int32;
typedef int16_t dt_int16;
......@@ -613,7 +613,7 @@ template <uint16_t n>
struct log {
static MEGDNN_CONSTEXPR size_t value = log<(n >> 1)>::value + 1;
#if MEGDNN_CC_HOST
MEGDNN_STATIC_ASSERT((n & (n - 1)) == 0, "only full power number can have log");
MEGDNN_STATIC_ASSERT((n & (n - 1)) == 0, "only full power number can have log")
#endif
};
template <>
......
......@@ -333,7 +333,7 @@ public:
} else if (n > this->size()) {
if (this->capacity() < n)
this->grow(n);
for (auto it = this->end(), end = this->begin() + n; it != end; ++it)
for (iterator it = this->end(), end = this->begin() + n; it != end; ++it)
new (&*it) T();
this->set_end(this->begin() + n);
}
......
......@@ -144,7 +144,7 @@ size_t TensorShape::total_nr_elems() const {
}
bool TensorShape::eq_shape(const TensorShape& rhs) const {
MEGDNN_STATIC_ASSERT(MAX_NDIM == 7, "please update the code");
MEGDNN_STATIC_ASSERT(MAX_NDIM == 7, "please update the code")
if (ndim == rhs.ndim) {
size_t eq = 0;
switch (ndim) {
......@@ -379,7 +379,7 @@ bool TensorLayout::eq_layout(const TensorLayout& rhs) const {
dtype == rhs.dtype,
"could not compare layout on different dtypes: %s vs %s", dtype.name(),
rhs.dtype.name());
MEGDNN_STATIC_ASSERT(MAX_NDIM == 7, "please update the code");
MEGDNN_STATIC_ASSERT(MAX_NDIM == 7, "please update the code")
auto ax = [](size_t shape0, size_t shape1, ptrdiff_t stride0, ptrdiff_t stride1) {
return (shape0 == shape1) & ((shape0 <= 1) | (stride0 == stride1));
......
......@@ -98,10 +98,10 @@ MEGDNN_DEVICE MEGDNN_HOST inline T round_mulh_saturate(T a, T b) {
MEGDNN_STATIC_ASSERT(
std::numeric_limits<T>::digits <= 32,
"Portable RMULH is not supported for integer "
"types larger than 32 bits.");
"types larger than 32 bits.")
MEGDNN_STATIC_ASSERT(
std::numeric_limits<T>::is_integer,
"Input types should be integer for RMULH");
"Input types should be integer for RMULH")
bool overflow = a == b && a == DTypeTrait<T>::min();
// TODO: This really should be
// rounding_shift_right_away_from_zero, but we haven't yet found a fast way
......
......@@ -185,7 +185,7 @@ NamedTensorShape::NamedTensorShape(std::initializer_list<Dimension> init_shape)
: NamedTensorShape(SmallVector<Dimension>{init_shape}) {}
bool NamedTensorShape::eq_shape(const NamedTensorShape& rhs) const {
MEGDNN_STATIC_ASSERT(MAX_NDIM == 7, "please update the code");
MEGDNN_STATIC_ASSERT(MAX_NDIM == 7, "please update the code")
if (ndim == rhs.ndim) {
size_t eq = 0;
switch (ndim) {
......
......@@ -16,6 +16,7 @@
#include "src/cuda/cub/device/device_radix_sort.cuh"
#include "src/cuda/cub/device/device_segmented_radix_sort.cuh"
#include "src/cuda/kernel_common/diagnostic_prologue.cuh"
using namespace megdnn;
using namespace cuda;
......@@ -175,4 +176,5 @@ INST_CUB_SORT(uint64_t)
#undef INST_FORWARD
} // namespace cuda
} // namespace megdnn
#include "src/cuda/kernel_common/diagnostic_epilogue.cuh"
// vim: ft=cuda syntax=cuda.doxygen
......@@ -10,6 +10,7 @@
*/
#include "./kern.cuh"
#include "src/cuda/kernel_common/diagnostic_prologue.cuh"
#include "src/cuda/reduce_helper.cuh"
#include "src/cuda/utils.cuh"
......@@ -63,4 +64,5 @@ void megdnn::cuda::checksum::calc(
size_t megdnn::cuda::checksum::get_workspace_in_bytes(size_t nr_elem) {
return get_reduce_workspace_in_bytes<ChecksumOp>(1, nr_elem, 1);
}
#include "src/cuda/kernel_common/diagnostic_epilogue.cuh"
// vim: ft=cpp syntax=cpp.doxygen
......@@ -25,13 +25,13 @@ namespace conv_bias {
namespace chanwise {
struct Param {
uint32_t batch, src_chl, src_h, src_w, chl_mul, flt_h, flt_w, out_h, out_w, pad_h,
pad_w, stride_h, stride_w, dilation_h, dilation_w;
int batch, src_chl, src_h, src_w, chl_mul, flt_h, flt_w, out_h, out_w, pad_h, pad_w,
stride_h, stride_w, dilation_h, dilation_w;
bool is_compute_deafult;
#if MEGDNN_CC_HOST
static Param from_fwd_args(
const BiasForwardSizeArgs& args, bool is_compute_deafult_ = true) {
#define U(v) static_cast<uint32_t>(v)
#define U(v) static_cast<int>(v)
auto&& src = args.src_layout->shape;
auto&& dst = args.dst_layout->shape;
auto&& fm = args.filter_meta;
......
......@@ -9,6 +9,7 @@
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*/
#include "src/cuda/conv_bias/matmul/inplace_matmul_impl.cuh"
#include "src/cuda/kernel_common/diagnostic_prologue.cuh"
#include "src/cuda/utils.cuh"
using namespace megdnn;
......@@ -386,4 +387,5 @@ void conv_bias::exec_inplace_matmul_fwd(
after_kernel_launch();
}
#include "src/cuda/kernel_common/diagnostic_epilogue.cuh"
// vim: syntax=cpp.doxygen
......@@ -37,6 +37,7 @@
#include "./reduce_with_scale_data.cuh"
#include "./wmma_conv_integer_u4.cuh"
#include "src/cuda/cub/util_ptx.cuh"
#include "src/cuda/kernel_common/diagnostic_prologue.cuh"
using namespace megdnn;
using namespace cuda;
......@@ -50,7 +51,7 @@ template <
struct TileCounter {
MEGDNN_STATIC_ASSERT(
thread_blk_x % WARP_SIZE == 0,
"thread block size in dim x not divided by warpSize");
"thread block size in dim x not divided by warpSize")
static const size_t spatial_tile_x = thread_blk_x * pixels_per_thread_x;
static const size_t spatial_tile_y = thread_blk_y * pixels_per_thread_y;
static const size_t global_load_tile_x =
......@@ -678,4 +679,5 @@ void megdnn::cuda::do_dispatch_reduce_with_scale_data_u4(
after_kernel_launch();
}
#include "src/cuda/kernel_common/diagnostic_epilogue.cuh"
// vim: ft=cpp syntax=cuda.doxygen
......@@ -40,6 +40,7 @@
#include "src/cuda/utils.cuh"
#include "src/cuda/integer_subbyte_utils.cuh"
#include "src/cuda/kernel_common/diagnostic_prologue.cuh"
#include "src/cuda/reduce_helper.cuh"
using namespace megdnn;
......@@ -163,4 +164,5 @@ size_t megdnn::cuda::do_dispatch_reduce_workspace_in_bytes(
return get_reduce_workspace_in_bytes<ReduceWithScaleInt4Op<false>>(A, B, C);
}
#include "src/cuda/kernel_common/diagnostic_epilogue.cuh"
// vim: ft=cpp syntax=cuda.doxygen
......@@ -11,6 +11,7 @@
#include <stdio.h>
#include <iostream>
#include "./inplace_matmul_impl.cuh"
#include "src/cuda/kernel_common/diagnostic_prologue.cuh"
#include "src/cuda/utils.cuh"
using namespace megdnn;
using namespace cuda;
......@@ -429,4 +430,5 @@ void convolution3d::exec_inplace_matmul_bwd_filter(
after_kernel_launch();
}
#include "src/cuda/kernel_common/diagnostic_epilogue.cuh"
// vim: syntax=cpp.doxygen
......@@ -11,6 +11,7 @@
#include <stdio.h>
#include <iostream>
#include "./inplace_matmul_impl.cuh"
#include "src/cuda/kernel_common/diagnostic_prologue.cuh"
#include "src/cuda/utils.cuh"
using namespace megdnn;
using namespace cuda;
......@@ -401,4 +402,5 @@ void convolution3d::exec_inplace_matmul_fwd(
after_kernel_launch();
}
#include "src/cuda/kernel_common/diagnostic_epilogue.cuh"
// vim: syntax=cpp.doxygen
......@@ -54,9 +54,9 @@ struct RegBlockConfig {
static int constexpr reg_n = reg_n_;
static int constexpr reg_k = reg_k_;
MEGDNN_STATIC_ASSERT(
reg_m % pack_size == 0, "reg_m must be a multiple of pack_size");
reg_m % pack_size == 0, "reg_m must be a multiple of pack_size")
MEGDNN_STATIC_ASSERT(
reg_k % pack_size == 0, "reg_k must be a multiple of pack_size");
reg_k % pack_size == 0, "reg_k must be a multiple of pack_size")
static int constexpr reg_k_packed = reg_k / pack_size;
static int constexpr reg_m_packed = reg_m / pack_size;
static int constexpr reg_width = reg_width_;
......
......@@ -57,7 +57,7 @@ namespace convolution {
using KernLayout = _kern_layout; \
using OutputLayout = _output_layout; \
using Param = _conv_param; \
static constexpr bool check_bounds = check_bounds_;
static constexpr bool check_bounds = check_bounds_
#define MEGDNN_COMMA ,
template <
......@@ -80,7 +80,7 @@ struct IBatchConvTrait_f1x1s1x1 {
static int constexpr block_tile_batch = RegBlockConfig::reg_n;
MEGDNN_STATIC_ASSERT(
block_tile_batch == 1,
"this algorithm does not unroll on batch dimension");
"this algorithm does not unroll on batch dimension")
static int constexpr block_tile_out_height_width =
RegBlockConfig::reg_width * ThreadConfig::nr_thread_x;
static int constexpr block_tile_in_channel = RegBlockConfig::reg_k;
......@@ -162,7 +162,7 @@ struct IBatchConvTrait {
static int constexpr block_tile_batch = RegBlockConfig::reg_n;
MEGDNN_STATIC_ASSERT(
block_tile_batch == 1,
"this algorithm does not unroll on batch dimension");
"this algorithm does not unroll on batch dimension")
static int constexpr block_tile_out_height_width =
RegBlockConfig::reg_width * ThreadConfig::nr_thread_x;
static int constexpr block_tile_in_channel = RegBlockConfig::reg_k;
......
......@@ -54,7 +54,7 @@ namespace convolution {
using KernLayout = _kern_layout; \
using OutputLayout = _output_layout; \
using Param = _conv_param; \
static constexpr bool check_bounds = check_bounds_;
static constexpr bool check_bounds = check_bounds_
#define MEGDNN_COMMA ,
template <
......@@ -183,7 +183,7 @@ struct IConvIMMATraitReorderFilter {
check_bounds MEGDNN_COMMA IMMAConfig MEGDNN_COMMA
WarpTileConfig MEGDNN_COMMA ThreadConfig>::src_dtype
MEGDNN_COMMA src_dtype>::value == true,
"data type of input tensor should be int8_t");
"data type of input tensor should be int8_t")
using DataTileCount = typename IConvIMMATrait<
check_bounds, IMMAConfig, WarpTileConfig, ThreadConfig>::DataTileCount;
struct FilterTileCount {
......@@ -284,7 +284,7 @@ struct IConvIMMATraitUnrollWidth {
check_bounds MEGDNN_COMMA IMMAConfig MEGDNN_COMMA
WarpTileConfig MEGDNN_COMMA ThreadConfig>::
filter_dtype MEGDNN_COMMA filter_dtype>::value == true,
"data type of filter tensor should be int8_t");
"data type of filter tensor should be int8_t")
using FilterTileCount = typename IConvIMMATraitReorderFilter<
check_bounds, IMMAConfig, WarpTileConfig, ThreadConfig>::FilterTileCount;
using BlockTileIterator =
......@@ -345,7 +345,7 @@ struct IConvIMMATraitUnrollWidthV2 {
MEGDNN_STATIC_ASSERT(
WarpTileConfig::warp_tile_k == 1,
"kernel unrolling along width axis assumes tile k "
"in warp-level must be 1");
"in warp-level must be 1")
using copy_t = int4;
using smem_storage_dtype = smem_storage_dtype;
static int constexpr load_width = sizeof(copy_t) / sizeof(smem_storage_dtype);
......@@ -388,7 +388,7 @@ struct IConvIMMATraitUnrollWidthV2 {
MEGDNN_STATIC_ASSERT(
WarpTileConfig::warp_tile_k == 1,
"kernel unrolling along width axis assumes tile k "
"in warp-level must be 1");
"in warp-level must be 1")
using copy_t = int4;
using smem_storage_dtype = smem_storage_dtype;
static int constexpr load_width = sizeof(copy_t) / sizeof(smem_storage_dtype);
......
......@@ -54,7 +54,7 @@ namespace convolution {
using KernLayout = _kern_layout; \
using OutputLayout = _output_layout; \
using Param = _conv_param; \
static constexpr bool check_bounds = check_bounds_;
static constexpr bool check_bounds = check_bounds_
#define MEGDNN_COMMA ,
template <
......@@ -175,7 +175,7 @@ struct IConvTraitUnrollWidth {
check_bounds MEGDNN_COMMA ldg_dtype MEGDNN_COMMA
RegBlockConfig MEGDNN_COMMA ThreadConfig>::
filter_dtype MEGDNN_COMMA filter_dtype>::value == true,
"data type of filter tensor should be int8_t");
"data type of filter tensor should be int8_t")
using FilterTileCount = typename IConvTrait<
check_bounds, ldg_dtype, RegBlockConfig, ThreadConfig>::FilterTileCount;
using BlockTileIterator =
......
......@@ -108,7 +108,7 @@ struct Global2ShareMemVisitor_CIxHW;
DEF(true, Layout<NCHW4>)
copy_t reg[TileCount::reg_h][TileCount::reg_w][TileCount::reg_d];
MEGDNN_STATIC_ASSERT(load_width == 4, "load four element from src tensor per time");
MEGDNN_STATIC_ASSERT(load_width == 4, "load four element from src tensor per time")
__device__ Global2ShareMemVisitor_CIxHW(
smem_storage_dtype* smem_, const int* __restrict__ offset_)
......
......@@ -169,7 +169,7 @@ const int gl_load_x = tid - gl_load_y * TileCount::load_x;
copy_t reg[TileCount::reg_h][TileCount::reg_w];
MEGDNN_STATIC_ASSERT(
std::is_same<copy_t MEGDNN_COMMA int4>::value == true,
"ldg data type must be int4 for this memory visitor");
"ldg data type must be int4 for this memory visitor")
__device__ __forceinline__ void init_stride(Layout<Format::CHWN16> layout) {
stride = layout.channel_stride / TileCount::ldg_load_width;
......
......@@ -65,7 +65,7 @@ const int gl_load_x = tid - gl_load_y * TileCount::load_x;
copy_t reg[TileCount::reg_h][TileCount::reg_w][TileCount::reg_d];
MEGDNN_STATIC_ASSERT(
std::is_same<copy_t MEGDNN_COMMA int4>::value == true,
"ldg data type must be int4 for this memory visitor");
"ldg data type must be int4 for this memory visitor")
__device__ __forceinline__ void init_stride(Layout<Format::CHWN4> layout) {
stride = layout.channel_stride / TileCount::ldg_load_width;
......
......@@ -66,7 +66,7 @@ const int gl_load_x = tid - gl_load_y * TileCount::load_x;
copy_t reg[TileCount::reg_h][TileCount::reg_w][TileCount::reg_d];
MEGDNN_STATIC_ASSERT(
std::is_same<copy_t MEGDNN_COMMA int4>::value == true,
"ldg data type must be int4 for this memory visitor");
"ldg data type must be int4 for this memory visitor")
__device__ __forceinline__ void init_stride(Layout<Format::CHWN4> layout) {
stride = layout.channel_stride / TileCount::ldg_load_width;
......
......@@ -63,7 +63,7 @@ const int gl_load_x = tid - gl_load_y * TileCount::load_x;
copy_t reg[TileCount::reg_h][TileCount::reg_w];
MEGDNN_STATIC_ASSERT(
std::is_same<copy_t MEGDNN_COMMA int4>::value == true,
"ldg data type must be int4 for this memory visitor");
"ldg data type must be int4 for this memory visitor")
__device__ __forceinline__ void init_stride(Layout<Format::CHWN16> layout) {
stride = layout.width_stride / TileCount::ldg_load_width;
......
......@@ -12,6 +12,7 @@
#include "./kern.cuh"
#include "./kern_helper.cuh"
#include "./kern_impl.cuinl"
#include "src/cuda/kernel_common/diagnostic_prologue.cuh"
using namespace megdnn::cuda;
using namespace cumsum::detail::cubwrap;
......@@ -88,4 +89,5 @@ void cumsum::get_BX_BY(
BY = 512 / BX;
}
#include "src/cuda/kernel_common/diagnostic_epilogue.cuh"
// vim: syntax=cpp.doxygen
......@@ -125,7 +125,7 @@ struct OpCallerBinary<FuseOpWrapper<Op>, PVis0, PVis1> {
PVis0 par0;
PVis1 par1;
MEGDNN_STATIC_ASSERT(
PVis0::packed_size == PVis1::packed_size, "vector size mismatch");
PVis0::packed_size == PVis1::packed_size, "vector size mismatch")
static const uint32_t packed_size = PVis0::packed_size;
__device__ __forceinline__ void thread_init(uint32_t idx) {
......
/**
* \file dnn/src/cuda/kernel_common/diagnostic_epilogue.cuh
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*/
#ifdef __GNUC__
#if CUDA_VERSION < 9020
#pragma GCC diagnostic pop
#endif
#endif
#ifdef MEGDNN_DIAGNOSTIC_PROLOGUE_INCLUDED
#undef MEGDNN_DIAGNOSTIC_PROLOGUE_INCLUDED
#else
#error "diagnostic_epilogue.h must be included after diagnostic_prologue.h"
#endif
// vim: syntax=cpp.doxygen
/**
* \file dnn/src/cuda/kernel_common/diagnostic_prologue.cuh
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*/
#ifdef MEGDNN_DIAGNOSTIC_PROLOGUE_INCLUDED
#error "diagnostic_prologue.h included twice without including diagnostic_epilogue.h"
#else
#define MEGDNN_DIAGNOSTIC_PROLOGUE_INCLUDED
#endif
//! see
//! https://stackoverflow.com/questions/49836419/how-to-hide-nvccs-function-was-declared-but-never-referenced-warnings
//! for more details.
#ifdef __GNUC__
#if CUDA_VERSION < 9020
#pragma GCC diagnostic push
#pragma diag_suppress 177 // suppress "function was declared but never referenced
// warning"
#endif
#endif
// vim: syntax=cpp.doxygen
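
The diagnostic_prologue.cuh / diagnostic_epilogue.cuh pair above is meant to bracket whole CUDA translation units: the prologue is included after the file's other headers, pushes the GCC diagnostic state and suppresses nvcc warning 177 ("declared but never referenced") when building with CUDA older than 9.2, and the epilogue pops that state and fails the build if the pair is mismatched. A minimal usage sketch, assuming a hypothetical source file and kernel (only the two include paths come from this commit):

// hypothetical_kernel.cu -- illustrative sketch, not part of this commit
#include "src/cuda/kernel_common/diagnostic_prologue.cuh"  // push + diag_suppress 177 on CUDA < 9.2

namespace {
// a helper that some instantiations never reference; without the prologue,
// cuda9.0/9.1 would emit warning 177 for it
__device__ __forceinline__ int unused_helper(int x) { return x + 1; }
}  // namespace

__global__ void demo_kernel(int* dst) { dst[threadIdx.x] = threadIdx.x; }

#include "src/cuda/kernel_common/diagnostic_epilogue.cuh"  // pop + pairing check
// vim: syntax=cuda.doxygen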
......@@ -9,6 +9,7 @@
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*/
#include "./local_share_bwd_data.cuh"
#include "src/cuda/kernel_common/diagnostic_prologue.cuh"
using namespace megdnn;
using namespace cuda;
......@@ -580,4 +581,5 @@ void megdnn::cuda::local_share_bwd_data::_do_local_share_bwd_data_implicit_gemm(
}
}
#include "src/cuda/kernel_common/diagnostic_epilogue.cuh"
// vim: syntax=cuda.doxygen
......@@ -10,6 +10,7 @@
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*/
#include "./local_share_bwd_filter.cuh"
#include "src/cuda/kernel_common/diagnostic_prologue.cuh"
using namespace megdnn;
using namespace cuda;
......@@ -495,4 +496,5 @@ void megdnn::cuda::local_share_bwd_filter::_do_local_share_bwd_filter_implicit_g
}
}
#include "src/cuda/kernel_common/diagnostic_epilogue.cuh"
// vim: syntax=cuda.doxygen
......@@ -9,6 +9,7 @@
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*/
#include "./local_share_forward.cuh"
#include "src/cuda/kernel_common/diagnostic_prologue.cuh"
using namespace megdnn;
using namespace cuda;
......@@ -54,7 +55,7 @@ struct FilterTileCount {
MEGDNN_STATIC_ASSERT(
smem_w % ThreadConfig::nr_thread_x == 0,
"col of share memory must be divided by nr_thread_x");
"col of share memory must be divided by nr_thread_x")
static int const reg_h =
(smem_h + ThreadConfig::nr_thread_y - 1) / ThreadConfig::nr_thread_y;
static int const reg_w = smem_w / ThreadConfig::nr_thread_x;
......@@ -1292,4 +1293,5 @@ void megdnn::cuda::local_share::_do_local_share_convolution_large_batch_size(
}
}
#include "src/cuda/kernel_common/diagnostic_epilogue.cuh"
// vim: syntax=cuda.doxygen
......@@ -10,6 +10,7 @@
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*/
#include "./local_share_forward.cuh"
#include "src/cuda/kernel_common/diagnostic_prologue.cuh"
using namespace megdnn;
using namespace cuda;
......@@ -582,4 +583,5 @@ void megdnn::cuda::local_share::
}
}
#include "src/cuda/kernel_common/diagnostic_epilogue.cuh"
// vim: syntax=cuda.doxygen
......@@ -79,7 +79,7 @@ struct GlobalToShareMemStream {
GlobalToShareMemStreamConfig_::BlockSize ==
GlobalToShareMemStreamConfig_::CACHE_SIZE *
BlockConfig_::WARPS_PER_BLOCK,
"Block size mismatch");
"Block size mismatch")
uint8_t* smem;
const uint8_t* g_ptr;
......
......@@ -11,6 +11,7 @@
*/
#include "./pooling2d_qint.cuh"
#include "src/common/opr_param_defs_enumv.cuh"
#include "src/cuda/kernel_common/diagnostic_prologue.cuh"
#include "src/cuda/query_blocksize.cuh"
using namespace megdnn;
......@@ -350,7 +351,7 @@ __global__ void pooling2d_device_template_nchwc(
static int constexpr section = pack_byte / sizeof(ldg_type);
MEGDNN_STATIC_ASSERT(
ldg_width == ldg_width_assert,
"pooling2d (NCHW64) kernel must use 128bit width ldg instruction");
"pooling2d (NCHW64) kernel must use 128bit width ldg instruction")
const int c_packed = param.c / pack_size;
const int batch = tid / (param.ho * param.wo * c_packed * section);
const int batch_residual = tid - batch * param.ho * param.wo * c_packed * section;
......@@ -402,7 +403,7 @@ __global__ void pooling2d_device_template_nhwc(
static int constexpr ldg_width_bytes = sizeof(ldg_type);
MEGDNN_STATIC_ASSERT(
ldg_width == ldg_width_assert,
"pooling2d (NHWC) kernel must ldg_width == ldg_width_assert");
"pooling2d (NHWC) kernel must ldg_width == ldg_width_assert")
const int c_packed = param.c / pack_size;
const int batch = tid / (param.ho * param.wo * c_packed);
const int batch_residual = tid - batch * param.ho * param.wo * c_packed;
......@@ -691,4 +692,6 @@ void megdnn::cuda::pooling2d::do_pooling2d_int4_nhwc(
kern<<<nr_blocks, nr_threads, 0, stream>>>(d_src, d_dst, param, zero_point);
after_kernel_launch();
}
#include "src/cuda/kernel_common/diagnostic_epilogue.cuh"
// vim: syntax=cuda.doxygen
......@@ -9,6 +9,7 @@
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*/
#include "src/cuda/kernel_common/diagnostic_prologue.cuh"
#include "src/cuda/query_blocksize.cuh"
#include "src/cuda/utils.cuh"
......@@ -51,4 +52,5 @@ LaunchConfig cuda::detail::query_launch_config_for_kernel_uncached(
return ret;
}
#include "src/cuda/kernel_common/diagnostic_epilogue.cuh"
// vim: ft=cpp syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
......@@ -10,6 +10,7 @@
* implied.
*/
#include "src/cuda/kernel_common/diagnostic_prologue.cuh"
#include "src/cuda/query_blocksize.cuh"
#include "src/cuda/relayout_format/relayout_format_kern.cuh"
......@@ -104,7 +105,7 @@ __global__ void kern_nchw_nchwx(
MEGDNN_STATIC_ASSERT(
std::is_same<SrcType MEGDNN_COMMA DstType>::value,
"Currently this kernel only support accessing tensor "
"src and dst in same data type.");
"src and dst in same data type.")
n_stride_src /= size_src_type;
ic_stride /= size_src_type;
n_stride_dst /= size_dst_type;
......@@ -585,3 +586,5 @@ void relayout_format::relayout_format_cuda_nchw_nchw4_weight(
ic_stride, oc_stride_dst, group_stride_src, group_stride_dst, 0, {});
after_kernel_launch();
}
#include "src/cuda/kernel_common/diagnostic_epilogue.cuh"
......@@ -478,12 +478,12 @@ struct RelayoutProblem {
using Transpose = Transpose_;
using CudaPostProcess = CudaPostProcess_;
MEGDNN_STATIC_ASSERT(
SrcIterator::chan_blk == DstIterator::chan_blk, "channel block mismatch");
SrcIterator::chan_blk == DstIterator::chan_blk, "channel block mismatch")
MEGDNN_STATIC_ASSERT(
SrcIterator::width == DstIterator::width, "width block mismatch");
SrcIterator::width == DstIterator::width, "width block mismatch")
MEGDNN_STATIC_ASSERT(
SrcIterator::size_nbits == DstIterator::size_nbits,
"size in bits of elements mismatch");
"size in bits of elements mismatch")
static constexpr int pack_chan = SrcIterator::chan_blk;
static constexpr int pack_width = SrcIterator::width;
using DnnSrcType = typename CudaPostProcess::SrcType;
......
......@@ -80,7 +80,7 @@ inline __device__ DstType make_zero_pad(const uint8_t zero_point) {
template <>
inline __device__ char4 make_zero_pad<char4>(const uint8_t zero_point) {
char izp = reinterpret_cast<const char&>(zero_point);
signed char izp = reinterpret_cast<const signed char&>(zero_point);
return {izp, izp, izp, izp};
}
......
......@@ -16,6 +16,7 @@
#include "src/common/resize.cuh"
#include "src/cuda/cv/kernel_common.cuh"
#include "src/cuda/kernel_common/diagnostic_prologue.cuh"
using namespace megdnn;
using namespace cuda;
......@@ -319,4 +320,5 @@ INST(int8_t);
} // namespace cuda
} // namespace megdnn
#include "src/cuda/kernel_common/diagnostic_epilogue.cuh"
// vim: syntax=cpp.doxygen
......@@ -13,6 +13,7 @@
#include "megdnn/dtype.h"
#include "src/cuda/elemwise_helper.cuh"
#include "src/cuda/elemwise_helper_q4.cuh"
#include "src/cuda/kernel_common/diagnostic_prologue.cuh"
using namespace megdnn;
using namespace cuda;
......@@ -410,4 +411,5 @@ MEGDNN_FOREACH_COMPUTING_CTYPE(INST_SRC_NORMAL_LOWBIT)
} // namespace cuda
} // namespace megdnn
#include "src/cuda/kernel_common/diagnostic_epilogue.cuh"
// vim: syntax=cpp.doxygen
......@@ -12,6 +12,7 @@
#include <cstdio>
#include "src/common/rounding_converter.cuh"
#include "src/cuda/kernel_common/diagnostic_prologue.cuh"
#include "src/cuda/utils.cuh"
#include "src/cuda/warp_affine/common.cuh"
......@@ -288,4 +289,5 @@ INST(int8_t)
} // namespace cuda
} // namespace megdnn
#include "src/cuda/kernel_common/diagnostic_epilogue.cuh"
// vim: syntax=cpp.doxygen
......@@ -16,6 +16,7 @@
#include "src/common/rounding_converter.cuh"
#include "src/cuda/error_info.cuh"
#include "src/cuda/integer_subbyte_utils.cuh"
#include "src/cuda/kernel_common/diagnostic_prologue.cuh"
#include "src/cuda/utils.cuh"
#include "src/cuda/warp_perspective/common.cuh"
......@@ -1760,4 +1761,5 @@ INST(dt_quint8, uint8_t, float)
} // namespace cuda
} // namespace megdnn
#include "src/cuda/kernel_common/diagnostic_epilogue.cuh"
// vim: syntax=cpp.doxygen
......@@ -51,7 +51,7 @@ void lowbit_recursive_cp(
MEGDNN_STATIC_ASSERT(
!(8_z % size_nbits),
"size in bits of lowbit data type can only be 1, 2, 4 "
"or 8");
"or 8")
if (idx < (src.layout.ndim - 1)) {
for (size_t i = 0; i < src.layout[idx]; ++i) {
lowbit_recursive_cp<size_nbits>(
......
......@@ -130,10 +130,10 @@ T do_round_mulh_saturate(T a, T b) {
MEGDNN_STATIC_ASSERT(
std::numeric_limits<T>::digits <= 32,
"Portable RMULH is not supported for integer "
"types larger than 32 bits.");
"types larger than 32 bits.")
MEGDNN_STATIC_ASSERT(
std::numeric_limits<T>::is_integer,
"Input types should be integer for RMULH");
"Input types should be integer for RMULH")
bool overflow = a == b && a == DTypeTrait<T>::min();
// TODO: This really should be
// rounding_shift_right_away_from_zero, but we haven't yet found a fast
......