Commit 0398a786 authored by Megvii Engine Team

fix(build/windows/cuda/llvm): fix windows bazel build with cuda

* Adapt to the new llvm/clang-11 toolchain
* Fix the Windows Bazel build with CUDA
* Add a Windows Bazel CUDA build CI job
* Optimize the Windows Bazel CI scripts

GitOrigin-RevId: 6ea7c66585348590d8f8a34150853c776c16352c
Parent: b9c37112
@@ -26,7 +26,7 @@ public:
     size_t get_workspace_in_bytes(const TensorLayout& /*src*/,
                                   const TensorLayout& /*mask_offset*/,
                                   const TensorLayout& /*mask_val*/,
-                                  const TensorLayout& /*dst*/) {
+                                  const TensorLayout& /*dst*/) override {
         return 0;
     };
     void set_error_tracker(void* tracker) override {
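Note: the `override` additions in this hunk and the next are presumably part of the "Adapt to the new llvm/clang-11 toolchain" item: clang's -Winconsistent-missing-override diagnostic (fatal under -Werror) fires when a class marks some overriding members `override` but not others, as set_error_tracker already was. A minimal sketch with hypothetical type names, not MegDNN code:

    struct OprBase {
        virtual ~OprBase() = default;
        virtual size_t get_workspace_in_bytes() const { return 0; }
        virtual void set_error_tracker(void*) {}
    };
    struct OprImpl : OprBase {
        // clang: 'get_workspace_in_bytes' overrides a member function but is
        // not marked 'override' [-Winconsistent-missing-override]
        size_t get_workspace_in_bytes() const { return 0; }
        void set_error_tracker(void*) override {}  // this marking triggers the check
    };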
@@ -43,7 +43,7 @@ public:
                 const TensorLayout& /* out_diff */,
                 const TensorLayout& /* out_count */,
                 const TensorLayout& /* data_diff */,
-                const TensorLayout& /* trans_diff */) {
+                const TensorLayout& /* trans_diff */) override {
         return 0ULL;
     };
@@ -192,13 +192,11 @@ void megdnn::cuda::exec_span_qsum(const int32_t* qSumA, const uint32_t M,
                                   int32_t* dst, const uint32_t strd,
                                   const int32_t scaler_bias,
                                   cudaStream_t stream) {
-    constexpr size_t TX = 32, TY = 32;
-    constexpr size_t BX = 32, BY = 32;
+    constexpr uint32_t TX = 32, TY = 32, BX = 32, BY = 32;
     dim3 nthreads{TX, TY};
-    dim3 nblocks{static_cast<uint32_t>(DIVUP(N, BX)),
-                 static_cast<uint32_t>(DIVUP(M, BY))};
-    span_qsum<TX, TY, BX, BY><<<nblocks, nthreads, 0, stream>>>(qSumA, M, qSumB, N, dst, strd,
-                                                                scaler_bias);
+    dim3 nblocks{DIVUP(N, BX), DIVUP(M, BY)};
+    span_qsum<TX, TY, BX, BY><<<nblocks, nthreads, 0, stream>>>(
+            qSumA, M, qSumB, N, dst, strd, scaler_bias);
     after_kernel_launch();
 }
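Note: this hunk and the two wmma hunks below all trade `constexpr size_t` for `constexpr uint32_t`, which is what lets the static_cast<uint32_t> calls disappear. dim3's fields are `unsigned int`; list-initializing them from a runtime 64-bit size_t (the result of DIVUP against a size_t constant) is a narrowing conversion that clang rejects, while a constant expression whose value provably fits is exempt. A sketch under those assumptions, with dim3_like standing in for CUDA's dim3:

    #include <cstdint>
    #define DIVUP(x, y) (((x) + (y)-1) / (y))
    struct dim3_like { unsigned x, y, z; };  // stand-in for CUDA's dim3

    void launch_shape(uint32_t M, uint32_t N) {
        constexpr uint32_t BX = 32, BY = 32;
        // Were BX/BY size_t, DIVUP(N, BX) would be a runtime size_t and the
        // braced initializer below an ill-formed narrowing under clang.
        dim3_like nblocks{DIVUP(N, BX), DIVUP(M, BY), 1};
        (void)nblocks;
    }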
@@ -46,10 +46,7 @@ using namespace wmma::experimental::precision;
 namespace wmma_matrix_mul_u4 {
-constexpr size_t WMMA_M = 8;
-constexpr size_t WMMA_N = 8;
-constexpr size_t WMMA_K = 32;
-constexpr size_t WARP_SIZE = 32;
+constexpr uint32_t WMMA_M = 8, WMMA_N = 8, WMMA_K = 32, WARP_SIZE = 32;
 template <size_t WARP_X_, size_t WARP_Y_, size_t ROW_PER_WARP_,
           size_t COL_PER_WARP_>
@@ -334,10 +331,8 @@ __global__ void u4_gemm_template_device_nt(const uint8_t* /*A*/,
 void _do_dispatch_wmma_matrix_mul_u4(const uint8_t* A, const uint8_t* B,
                                      int32_t* C, int M, int N, int K, int lda,
                                      int ldb, int ldc, cudaStream_t stream) {
-    constexpr size_t warp_x = 4;
-    constexpr size_t warp_y = 4;
-    constexpr size_t row_per_warp = 4;
-    constexpr size_t col_per_warp = 4;
+    constexpr uint32_t warp_x = 4, warp_y = 4, row_per_warp = 4,
+                       col_per_warp = 4;
     typedef BlockConfig<warp_x, warp_y, row_per_warp, col_per_warp>
             BlockConfig_;
     dim3 block{warp_x * WARP_SIZE, warp_y};
@@ -110,8 +110,10 @@ void do_cvt_normal_s32(_megdnn_tensor_in src, _megdnn_tensor_out dst) {
     float scale = dst.layout.dtype.param<dtype::QuantizedS32>().scale;
     float dscale = 1.f / scale;
     for (size_t i = 0; i < n; ++i) {
-        dptr[i] = saturate<int32_t, float>(std::round(sptr[i] * dscale),
-                                           -2147483648, 2147483647);
+        dptr[i] = saturate<int32_t, float>(
+                std::round(sptr[i] * dscale),
+                static_cast<float>(std::numeric_limits<int32_t>::min()),
+                static_cast<float>(std::numeric_limits<int32_t>::max()));
     }
 }
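Note: the same replacement repeats in the next four hunks. Two plausible clang-11 triggers, hedged since the commit message does not spell them out: `-2147483648` is parsed as unary minus applied to the literal 2147483648, which does not fit in `int`; and 2147483647 is not exactly representable as `float` (it rounds to 2147483648.0f), which -Wimplicit-const-int-float-conversion flags. The explicit static_cast<float> keeps the same saturation bounds while making the conversion intentional:

    #include <cstdint>
    #include <limits>

    // clang-11: implicit conversion from 'int' to 'float' changes value
    // from 2147483647 to 2147483648 [-Wimplicit-const-int-float-conversion]
    // float hi_bad = 2147483647;
    float lo = static_cast<float>(std::numeric_limits<int32_t>::min());  // exact: -2^31
    float hi = static_cast<float>(std::numeric_limits<int32_t>::max());  // rounds to 2^31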
@@ -219,8 +221,10 @@ void do_cvt_s8_s32(_megdnn_tensor_in src, _megdnn_tensor_out dst) {
     float dst_scale = dst.layout.dtype.param<dtype::QuantizedS32>().scale;
     float scale = src_scale / dst_scale;
     for (size_t i = 0; i < n; ++i) {
-        dptr[i] = saturate<int32_t, float>(std::round(sptr[i] * scale),
-                                           -2147483648, 2147483647);
+        dptr[i] = saturate<int32_t, float>(
+                std::round(sptr[i] * scale),
+                static_cast<float>(std::numeric_limits<int32_t>::min()),
+                static_cast<float>(std::numeric_limits<int32_t>::max()));
     }
 }
@@ -232,8 +236,10 @@ void do_cvt_s32_s32(_megdnn_tensor_in src, _megdnn_tensor_out dst) {
     float dst_scale = dst.layout.dtype.param<dtype::QuantizedS32>().scale;
     float scale = src_scale / dst_scale;
     for (size_t i = 0; i < n; ++i) {
-        dptr[i] = saturate<int32_t, float>(std::round(sptr[i] * scale),
-                                           -2147483648, 2147483647);
+        dptr[i] = saturate<int32_t, float>(
+                std::round(sptr[i] * scale),
+                static_cast<float>(std::numeric_limits<int32_t>::min()),
+                static_cast<float>(std::numeric_limits<int32_t>::max()));
     }
 }
@@ -247,9 +253,10 @@ void do_cvt_asymm8_s32(_megdnn_tensor_in src, _megdnn_tensor_out dst) {
     float dst_scale = dst.layout.dtype.param<dtype::QuantizedS32>().scale;
     float scale = src_scale / dst_scale;
     for (size_t i = 0; i < n; ++i) {
-        dptr[i] =
-                saturate<int32_t, float>(std::round((sptr[i] - src_zp) * scale),
-                                         -2147483648, 2147483647);
+        dptr[i] = saturate<int32_t, float>(
+                std::round((sptr[i] - src_zp) * scale),
+                static_cast<float>(std::numeric_limits<int32_t>::min()),
+                static_cast<float>(std::numeric_limits<int32_t>::max()));
     }
 }
@@ -66,8 +66,10 @@ inline dt_quint8 QConverter::convert(const float& src, const uint8_t& zp) {
 template <>
 inline dt_qint32 QConverter::convert(const float& src) {
-    return dt_qint32(
-            saturate<int32_t, float>(std::round(src), -2147483648, 2147483647));
+    return dt_qint32(saturate<int32_t, float>(
+            std::round(src),
+            static_cast<float>(std::numeric_limits<int32_t>::min()),
+            static_cast<float>(std::numeric_limits<int32_t>::max())));
 }
 template <>
@@ -101,7 +101,8 @@ public:
     ComputingSequence(const std::shared_ptr<ComputingGraph>& graph)
             : m_owner_graph_refkeep{graph},
               m_owner_graph{ComputingGraphImpl::downcast(graph.get())},
-              m_have_parent_graph{m_owner_graph->m_parent_graph} {}
+              m_have_parent_graph{
+                      static_cast<bool>(m_owner_graph->m_parent_graph)} {}
     GraphExecutable::ExecEnv& exec_env() { return m_exec_env; }
@@ -371,9 +371,11 @@ serialization::IndexDescMaskDump::from_index_desc(const IndexDesc &desc) {
     ret.nr_item = desc.size();
     for (size_t i = 0; i < desc.size(); ++ i) {
         auto &&s = desc[i];
-        ret.items[i] = {
-                static_cast<int8_t>(s.axis.get_raw()),
-                s.begin.node(), s.end.node(), s.step.node(), s.idx.node()};
+        ret.items[i] = {static_cast<int8_t>(s.axis.get_raw()),
+                        static_cast<bool>(s.begin.node()),
+                        static_cast<bool>(s.end.node()),
+                        static_cast<bool>(s.step.node()),
+                        static_cast<bool>(s.idx.node())};
     }
     return ret;
 }
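Note on the last two hunks: the new static_cast<bool> wrappers line up with P1957R2, the C++ defect report (implemented in clang-11) that makes pointer-to-bool a narrowing conversion inside braced initializer lists, turning the previously implicit conversions into hard errors. A minimal sketch, assuming node() returns a pointer and m_parent_graph is pointer-like:

    struct ItemDump { signed char axis; bool begin, end; };  // hypothetical aggregate

    void demo(int* p) {
        // ItemDump bad{0, p, p};  // clang-11: pointer -> bool is narrowing here
        ItemDump ok{0, static_cast<bool>(p), p != nullptr};  // either spelling works
        bool have{static_cast<bool>(p)};  // same fix for the bool member initializer
        (void)ok;
        (void)have;
    }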