From 0398a7867f509872c15dfe95492f2c097ee15c09 Mon Sep 17 00:00:00 2001
From: Megvii Engine Team
Date: Fri, 11 Dec 2020 08:22:44 +0800
Subject: [PATCH] fix(build/windows/cuda/llvm): fix windows bazel build with
 cuda

* Adapt to the new version llvm/clang-11

* fix windows bazel build with cuda

* add windows bazel build cuda ci

* opt windows bazel ci scripts

GitOrigin-RevId: 6ea7c66585348590d8f8a34150853c776c16352c
---
 dnn/src/cuda/dct/opr_impl.h                   |  2 +-
 .../cuda/deformable_ps_roi_pooling/opr_impl.h |  2 +-
 .../preprocess_quantize_sum.cu                | 10 +++-----
 .../uint4x4x32_wmma/wmma_matrix_mul_u4.cu     | 11 +++-----
 dnn/src/fallback/type_cvt/opr_impl.cpp        | 25 ++++++++++++-------
 dnn/src/x86/quantized_converter.h             |  6 +++--
 src/core/impl/graph/cg_impl_seq.h             |  3 ++-
 src/opr/impl/internal/indexing_helper.cpp     |  8 +++---
 8 files changed, 36 insertions(+), 31 deletions(-)

diff --git a/dnn/src/cuda/dct/opr_impl.h b/dnn/src/cuda/dct/opr_impl.h
index 60c899b11..cb10b3906 100644
--- a/dnn/src/cuda/dct/opr_impl.h
+++ b/dnn/src/cuda/dct/opr_impl.h
@@ -26,7 +26,7 @@ public:
     size_t get_workspace_in_bytes(const TensorLayout& /*src*/,
                                   const TensorLayout& /*mask_offset*/,
                                   const TensorLayout& /*mask_val*/,
-                                  const TensorLayout& /*dst*/) {
+                                  const TensorLayout& /*dst*/) override {
         return 0;
     };
     void set_error_tracker(void* tracker) override {
diff --git a/dnn/src/cuda/deformable_ps_roi_pooling/opr_impl.h b/dnn/src/cuda/deformable_ps_roi_pooling/opr_impl.h
index 245604de5..92f1da1db 100644
--- a/dnn/src/cuda/deformable_ps_roi_pooling/opr_impl.h
+++ b/dnn/src/cuda/deformable_ps_roi_pooling/opr_impl.h
@@ -43,7 +43,7 @@ public:
                                   const TensorLayout& /* out_diff */,
                                   const TensorLayout& /* out_count */,
                                   const TensorLayout& /* data_diff */,
-                                  const TensorLayout& /* trans_diff */) {
+                                  const TensorLayout& /* trans_diff */) override {
         return 0ULL;
     };
 
diff --git a/dnn/src/cuda/matrix_mul/uint4x4x32_wmma/preprocess_quantize_sum.cu b/dnn/src/cuda/matrix_mul/uint4x4x32_wmma/preprocess_quantize_sum.cu
index c27d77a4f..53de87484 100644
--- a/dnn/src/cuda/matrix_mul/uint4x4x32_wmma/preprocess_quantize_sum.cu
+++ b/dnn/src/cuda/matrix_mul/uint4x4x32_wmma/preprocess_quantize_sum.cu
@@ -192,13 +192,11 @@ void megdnn::cuda::exec_span_qsum(const int32_t* qSumA, const uint32_t M,
                                   int32_t* dst, const uint32_t strd,
                                   const int32_t scaler_bias,
                                   cudaStream_t stream) {
-    constexpr size_t TX = 32, TY = 32;
-    constexpr size_t BX = 32, BY = 32;
+    constexpr uint32_t TX = 32, TY = 32, BX = 32, BY = 32;
     dim3 nthreads{TX, TY};
-    dim3 nblocks{static_cast<uint32_t>(DIVUP(N, BX)),
-                 static_cast<uint32_t>(DIVUP(M, BY))};
-    span_qsum<<<nblocks, nthreads, 0, stream>>>(qSumA, M, qSumB, N, dst, strd,
-                                                scaler_bias);
+    dim3 nblocks{DIVUP(N, BX), DIVUP(M, BY)};
+    span_qsum<<<nblocks, nthreads, 0, stream>>>(
+            qSumA, M, qSumB, N, dst, strd, scaler_bias);
     after_kernel_launch();
 }
 
diff --git a/dnn/src/cuda/matrix_mul/uint4x4x32_wmma/wmma_matrix_mul_u4.cu b/dnn/src/cuda/matrix_mul/uint4x4x32_wmma/wmma_matrix_mul_u4.cu
index 0fb02c1e6..4e524f6dd 100644
--- a/dnn/src/cuda/matrix_mul/uint4x4x32_wmma/wmma_matrix_mul_u4.cu
+++ b/dnn/src/cuda/matrix_mul/uint4x4x32_wmma/wmma_matrix_mul_u4.cu
@@ -46,10 +46,7 @@ using namespace wmma::experimental::precision;
 
 namespace wmma_matrix_mul_u4 {
 
-constexpr size_t WMMA_M = 8;
-constexpr size_t WMMA_N = 8;
-constexpr size_t WMMA_K = 32;
-constexpr size_t WARP_SIZE = 32;
+constexpr uint32_t WMMA_M = 8, WMMA_N = 8, WMMA_K = 32, WARP_SIZE = 32;
 
 template
 __global__ void u4_gemm_template_device_nt(const uint8_t* /*A*/,
@@ -334,10 +331,8 @@ __global__ void u4_gemm_template_device_nt(const uint8_t* /*A*/,
 void _do_dispatch_wmma_matrix_mul_u4(const uint8_t* A, const uint8_t* B,
                                      int32_t* C, int M, int N, int K, int lda,
                                      int ldb, int ldc, cudaStream_t stream) {
-    constexpr size_t warp_x = 4;
-    constexpr size_t warp_y = 4;
-    constexpr size_t row_per_warp = 4;
-    constexpr size_t col_per_warp = 4;
+    constexpr uint32_t warp_x = 4, warp_y = 4, row_per_warp = 4,
+                       col_per_warp = 4;
     typedef BlockConfig BlockConfig_;
     dim3 block{warp_x * WARP_SIZE, warp_y};
 
diff --git a/dnn/src/fallback/type_cvt/opr_impl.cpp b/dnn/src/fallback/type_cvt/opr_impl.cpp
index 04020f9d9..e01054684 100644
--- a/dnn/src/fallback/type_cvt/opr_impl.cpp
+++ b/dnn/src/fallback/type_cvt/opr_impl.cpp
@@ -110,8 +110,10 @@ void do_cvt_normal_s32(_megdnn_tensor_in src, _megdnn_tensor_out dst) {
     float scale = dst.layout.dtype.param<dtype::QuantizedS32>().scale;
     float dscale = 1.f / scale;
     for (size_t i = 0; i < n; ++i) {
-        dptr[i] = saturate(std::round(sptr[i] * dscale),
-                           -2147483648, 2147483647);
+        dptr[i] = saturate(
+                std::round(sptr[i] * dscale),
+                static_cast<float>(std::numeric_limits<int32_t>::min()),
+                static_cast<float>(std::numeric_limits<int32_t>::max()));
     }
 }
 
@@ -219,8 +221,10 @@ void do_cvt_s8_s32(_megdnn_tensor_in src, _megdnn_tensor_out dst) {
     float dst_scale = dst.layout.dtype.param<dtype::QuantizedS32>().scale;
     float scale = src_scale / dst_scale;
     for (size_t i = 0; i < n; ++i) {
-        dptr[i] = saturate(std::round(sptr[i] * scale),
-                           -2147483648, 2147483647);
+        dptr[i] = saturate(
+                std::round(sptr[i] * scale),
+                static_cast<float>(std::numeric_limits<int32_t>::min()),
+                static_cast<float>(std::numeric_limits<int32_t>::max()));
     }
 }
 
@@ -232,8 +236,10 @@ void do_cvt_s32_s32(_megdnn_tensor_in src, _megdnn_tensor_out dst) {
     float dst_scale = dst.layout.dtype.param<dtype::QuantizedS32>().scale;
     float scale = src_scale / dst_scale;
     for (size_t i = 0; i < n; ++i) {
-        dptr[i] = saturate(std::round(sptr[i] * scale),
-                           -2147483648, 2147483647);
+        dptr[i] = saturate(
+                std::round(sptr[i] * scale),
+                static_cast<float>(std::numeric_limits<int32_t>::min()),
+                static_cast<float>(std::numeric_limits<int32_t>::max()));
     }
 }
 
@@ -247,9 +253,10 @@ void do_cvt_asymm8_s32(_megdnn_tensor_in src, _megdnn_tensor_out dst) {
     float dst_scale = dst.layout.dtype.param<dtype::QuantizedS32>().scale;
     float scale = src_scale / dst_scale;
     for (size_t i = 0; i < n; ++i) {
-        dptr[i] =
-                saturate(std::round((sptr[i] - src_zp) * scale),
-                         -2147483648, 2147483647);
+        dptr[i] = saturate(
+                std::round((sptr[i] - src_zp) * scale),
+                static_cast<float>(std::numeric_limits<int32_t>::min()),
+                static_cast<float>(std::numeric_limits<int32_t>::max()));
     }
 }
 
diff --git a/dnn/src/x86/quantized_converter.h b/dnn/src/x86/quantized_converter.h
index a0635b54f..96bbed986 100644
--- a/dnn/src/x86/quantized_converter.h
+++ b/dnn/src/x86/quantized_converter.h
@@ -66,8 +66,10 @@ inline dt_quint8 QConverter::convert(const float& src, const uint8_t& zp) {
 
 template <>
 inline dt_qint32 QConverter::convert(const float& src) {
-    return dt_qint32(
-            saturate(std::round(src), -2147483648, 2147483647));
+    return dt_qint32(saturate(
+            std::round(src),
+            static_cast<float>(std::numeric_limits<int32_t>::min()),
+            static_cast<float>(std::numeric_limits<int32_t>::max())));
 }
 
 template <>
diff --git a/src/core/impl/graph/cg_impl_seq.h b/src/core/impl/graph/cg_impl_seq.h
index 0eee81ab2..625091bcd 100644
--- a/src/core/impl/graph/cg_impl_seq.h
+++ b/src/core/impl/graph/cg_impl_seq.h
@@ -101,7 +101,8 @@ public:
     ComputingSequence(const std::shared_ptr<ComputingGraph>& graph)
             : m_owner_graph_refkeep{graph},
               m_owner_graph{ComputingGraphImpl::downcast(graph.get())},
-              m_have_parent_graph{m_owner_graph->m_parent_graph} {}
+              m_have_parent_graph{
+                      static_cast<bool>(m_owner_graph->m_parent_graph)} {}
 
     GraphExecutable::ExecEnv& exec_env() { return m_exec_env; }
 
diff --git a/src/opr/impl/internal/indexing_helper.cpp b/src/opr/impl/internal/indexing_helper.cpp
index 3b4520035..df6a81a9c 100644
--- a/src/opr/impl/internal/indexing_helper.cpp
+++ b/src/opr/impl/internal/indexing_helper.cpp
@@ -371,9 +371,11 @@ serialization::IndexDescMaskDump::from_index_desc(const IndexDesc &desc) {
     ret.nr_item = desc.size();
     for (size_t i = 0; i < desc.size(); ++ i) {
         auto &&s = desc[i];
-        ret.items[i] = {
-            static_cast<int8_t>(s.axis.get_raw()),
-            s.begin.node(), s.end.node(), s.step.node(), s.idx.node()};
+        ret.items[i] = {static_cast<int8_t>(s.axis.get_raw()),
+                        static_cast<bool>(s.begin.node()),
+                        static_cast<bool>(s.end.node()),
+                        static_cast<bool>(s.step.node()),
+                        static_cast<bool>(s.idx.node())};
     }
     return ret;
 }
-- 
GitLab
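Not part of the patch: the recurring clang-11 problems these hunks work around are missing override specifiers and implicit conversions inside braced initializer lists (size_t narrowing in dim3 and aggregate initialization, and pointer-to-bool in member initializers, which newer clang treats as narrowing per P1957R2). Below is a minimal standalone sketch of the same errors and of the explicit-cast style used throughout the diff; the names (OprBase, OprImpl, MaskItem, make_item) are hypothetical and do not come from the MegEngine sources.

#include <cstddef>
#include <cstdint>

struct OprBase {
    virtual std::size_t get_workspace_in_bytes() const { return 0; }
    virtual void set_error_tracker(void*) {}
    virtual ~OprBase() = default;
};

struct OprImpl : OprBase {
    // Leaving one override unmarked while another is marked trips clang's
    // -Winconsistent-missing-override (an error under -Werror); the two
    // opr_impl.h hunks above add the missing specifier for this reason.
    std::size_t get_workspace_in_bytes() const override { return 0; }
    void set_error_tracker(void*) override {}
};

struct MaskItem {
    std::int8_t axis;
    bool begin;
};

MaskItem make_item(std::size_t raw_axis, const void* begin_node) {
    // return {raw_axis, begin_node};
    // The line above is rejected by clang-11: size_t -> int8_t is a narrowing
    // conversion in a braced initializer list, and pointer -> bool is treated
    // the same way by newer clang (P1957R2). The patch makes both explicit:
    return {static_cast<std::int8_t>(raw_axis), static_cast<bool>(begin_node)};
}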