Commit 0398a786 authored by Megvii Engine Team

fix(build/windows/cuda/llvm): fix windows bazel build with cuda

* Adapt to the new llvm/clang-11 toolchain
* Fix the Windows Bazel build with CUDA
* Add a Windows Bazel CUDA build CI job
* Optimize the Windows Bazel CI scripts

GitOrigin-RevId: 6ea7c66585348590d8f8a34150853c776c16352c
Parent: b9c37112
@@ -26,7 +26,7 @@ public:
     size_t get_workspace_in_bytes(const TensorLayout& /*src*/,
                                   const TensorLayout& /*mask_offset*/,
                                   const TensorLayout& /*mask_val*/,
-                                  const TensorLayout& /*dst*/) {
+                                  const TensorLayout& /*dst*/) override {
         return 0;
     };
     void set_error_tracker(void* tracker) override {
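Note: the `override` additions in this hunk and the next are presumably part of the "Adapt to the new llvm/clang-11 toolchain" item: clang's -Winconsistent-missing-override diagnostic (fatal under -Werror) fires when a class marks some overriding members `override` but not others, as set_error_tracker already was. A minimal sketch with hypothetical type names, not MegDNN code:

    struct OprBase {
        virtual ~OprBase() = default;
        virtual size_t get_workspace_in_bytes() const { return 0; }
        virtual void set_error_tracker(void*) {}
    };
    struct OprImpl : OprBase {
        // clang: 'get_workspace_in_bytes' overrides a member function but is
        // not marked 'override' [-Winconsistent-missing-override]
        size_t get_workspace_in_bytes() const { return 0; }
        void set_error_tracker(void*) override {}  // this marking triggers the check
    };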
@@ -43,7 +43,7 @@ public:
                 const TensorLayout& /* out_diff */,
                 const TensorLayout& /* out_count */,
                 const TensorLayout& /* data_diff */,
-                const TensorLayout& /* trans_diff */) {
+                const TensorLayout& /* trans_diff */) override {
         return 0ULL;
     };
@@ -192,13 +192,11 @@ void megdnn::cuda::exec_span_qsum(const int32_t* qSumA, const uint32_t M,
                                   int32_t* dst, const uint32_t strd,
                                   const int32_t scaler_bias,
                                   cudaStream_t stream) {
-    constexpr size_t TX = 32, TY = 32;
-    constexpr size_t BX = 32, BY = 32;
+    constexpr uint32_t TX = 32, TY = 32, BX = 32, BY = 32;
     dim3 nthreads{TX, TY};
-    dim3 nblocks{static_cast<uint32_t>(DIVUP(N, BX)),
-                 static_cast<uint32_t>(DIVUP(M, BY))};
-    span_qsum<TX, TY, BX, BY><<<nblocks, nthreads, 0, stream>>>(qSumA, M, qSumB, N, dst, strd,
-                                                                scaler_bias);
+    dim3 nblocks{DIVUP(N, BX), DIVUP(M, BY)};
+    span_qsum<TX, TY, BX, BY><<<nblocks, nthreads, 0, stream>>>(
+            qSumA, M, qSumB, N, dst, strd, scaler_bias);
     after_kernel_launch();
 }
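Note: this hunk and the two wmma hunks below all trade `constexpr size_t` for `constexpr uint32_t`, which is what lets the static_cast<uint32_t> calls disappear. dim3's fields are `unsigned int`; list-initializing them from a runtime 64-bit size_t (the result of DIVUP against a size_t constant) is a narrowing conversion that clang rejects, while a constant expression whose value provably fits is exempt. A sketch under those assumptions, with dim3_like standing in for CUDA's dim3:

    #include <cstdint>
    #define DIVUP(x, y) (((x) + (y)-1) / (y))
    struct dim3_like { unsigned x, y, z; };  // stand-in for CUDA's dim3

    void launch_shape(uint32_t M, uint32_t N) {
        constexpr uint32_t BX = 32, BY = 32;
        // Were BX/BY size_t, DIVUP(N, BX) would be a runtime size_t and the
        // braced initializer below an ill-formed narrowing under clang.
        dim3_like nblocks{DIVUP(N, BX), DIVUP(M, BY), 1};
        (void)nblocks;
    }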
@@ -46,10 +46,7 @@ using namespace wmma::experimental::precision;
 namespace wmma_matrix_mul_u4 {
-constexpr size_t WMMA_M = 8;
-constexpr size_t WMMA_N = 8;
-constexpr size_t WMMA_K = 32;
-constexpr size_t WARP_SIZE = 32;
+constexpr uint32_t WMMA_M = 8, WMMA_N = 8, WMMA_K = 32, WARP_SIZE = 32;
 template <size_t WARP_X_, size_t WARP_Y_, size_t ROW_PER_WARP_,
           size_t COL_PER_WARP_>
@@ -334,10 +331,8 @@ __global__ void u4_gemm_template_device_nt(const uint8_t* /*A*/,
 void _do_dispatch_wmma_matrix_mul_u4(const uint8_t* A, const uint8_t* B,
                                      int32_t* C, int M, int N, int K, int lda,
                                      int ldb, int ldc, cudaStream_t stream) {
-    constexpr size_t warp_x = 4;
-    constexpr size_t warp_y = 4;
-    constexpr size_t row_per_warp = 4;
-    constexpr size_t col_per_warp = 4;
+    constexpr uint32_t warp_x = 4, warp_y = 4, row_per_warp = 4,
+                       col_per_warp = 4;
     typedef BlockConfig<warp_x, warp_y, row_per_warp, col_per_warp>
             BlockConfig_;
     dim3 block{warp_x * WARP_SIZE, warp_y};
@@ -110,8 +110,10 @@ void do_cvt_normal_s32(_megdnn_tensor_in src, _megdnn_tensor_out dst) {
     float scale = dst.layout.dtype.param<dtype::QuantizedS32>().scale;
     float dscale = 1.f / scale;
     for (size_t i = 0; i < n; ++i) {
-        dptr[i] = saturate<int32_t, float>(std::round(sptr[i] * dscale),
-                                           -2147483648, 2147483647);
+        dptr[i] = saturate<int32_t, float>(
+                std::round(sptr[i] * dscale),
+                static_cast<float>(std::numeric_limits<int32_t>::min()),
+                static_cast<float>(std::numeric_limits<int32_t>::max()));
     }
 }
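Note: the same replacement repeats in the next four hunks. Two plausible clang-11 triggers, hedged since the commit message does not spell them out: `-2147483648` is parsed as unary minus applied to the literal 2147483648, which does not fit in `int`; and 2147483647 is not exactly representable as `float` (it rounds to 2147483648.0f), which -Wimplicit-const-int-float-conversion flags. The explicit static_cast<float> keeps the same saturation bounds while making the conversion intentional:

    #include <cstdint>
    #include <limits>

    // clang-11: implicit conversion from 'int' to 'float' changes value
    // from 2147483647 to 2147483648 [-Wimplicit-const-int-float-conversion]
    // float hi_bad = 2147483647;
    float lo = static_cast<float>(std::numeric_limits<int32_t>::min());  // exact: -2^31
    float hi = static_cast<float>(std::numeric_limits<int32_t>::max());  // rounds to 2^31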
@@ -219,8 +221,10 @@ void do_cvt_s8_s32(_megdnn_tensor_in src, _megdnn_tensor_out dst) {
     float dst_scale = dst.layout.dtype.param<dtype::QuantizedS32>().scale;
     float scale = src_scale / dst_scale;
     for (size_t i = 0; i < n; ++i) {
-        dptr[i] = saturate<int32_t, float>(std::round(sptr[i] * scale),
-                                           -2147483648, 2147483647);
+        dptr[i] = saturate<int32_t, float>(
+                std::round(sptr[i] * scale),
+                static_cast<float>(std::numeric_limits<int32_t>::min()),
+                static_cast<float>(std::numeric_limits<int32_t>::max()));
     }
 }
@@ -232,8 +236,10 @@ void do_cvt_s32_s32(_megdnn_tensor_in src, _megdnn_tensor_out dst) {
     float dst_scale = dst.layout.dtype.param<dtype::QuantizedS32>().scale;
     float scale = src_scale / dst_scale;
     for (size_t i = 0; i < n; ++i) {
-        dptr[i] = saturate<int32_t, float>(std::round(sptr[i] * scale),
-                                           -2147483648, 2147483647);
+        dptr[i] = saturate<int32_t, float>(
+                std::round(sptr[i] * scale),
+                static_cast<float>(std::numeric_limits<int32_t>::min()),
+                static_cast<float>(std::numeric_limits<int32_t>::max()));
     }
 }
@@ -247,9 +253,10 @@ void do_cvt_asymm8_s32(_megdnn_tensor_in src, _megdnn_tensor_out dst) {
     float dst_scale = dst.layout.dtype.param<dtype::QuantizedS32>().scale;
     float scale = src_scale / dst_scale;
     for (size_t i = 0; i < n; ++i) {
-        dptr[i] =
-                saturate<int32_t, float>(std::round((sptr[i] - src_zp) * scale),
-                                         -2147483648, 2147483647);
+        dptr[i] = saturate<int32_t, float>(
+                std::round((sptr[i] - src_zp) * scale),
+                static_cast<float>(std::numeric_limits<int32_t>::min()),
+                static_cast<float>(std::numeric_limits<int32_t>::max()));
     }
 }
@@ -66,8 +66,10 @@ inline dt_quint8 QConverter::convert(const float& src, const uint8_t& zp) {
 template <>
 inline dt_qint32 QConverter::convert(const float& src) {
-    return dt_qint32(
-            saturate<int32_t, float>(std::round(src), -2147483648, 2147483647));
+    return dt_qint32(saturate<int32_t, float>(
+            std::round(src),
+            static_cast<float>(std::numeric_limits<int32_t>::min()),
+            static_cast<float>(std::numeric_limits<int32_t>::max())));
 }
 template <>
@@ -101,7 +101,8 @@ public:
     ComputingSequence(const std::shared_ptr<ComputingGraph>& graph)
             : m_owner_graph_refkeep{graph},
               m_owner_graph{ComputingGraphImpl::downcast(graph.get())},
-              m_have_parent_graph{m_owner_graph->m_parent_graph} {}
+              m_have_parent_graph{
+                      static_cast<bool>(m_owner_graph->m_parent_graph)} {}
     GraphExecutable::ExecEnv& exec_env() { return m_exec_env; }
@@ -371,9 +371,11 @@ serialization::IndexDescMaskDump::from_index_desc(const IndexDesc &desc) {
     ret.nr_item = desc.size();
     for (size_t i = 0; i < desc.size(); ++ i) {
         auto &&s = desc[i];
-        ret.items[i] = {
-                static_cast<int8_t>(s.axis.get_raw()),
-                s.begin.node(), s.end.node(), s.step.node(), s.idx.node()};
+        ret.items[i] = {static_cast<int8_t>(s.axis.get_raw()),
+                        static_cast<bool>(s.begin.node()),
+                        static_cast<bool>(s.end.node()),
+                        static_cast<bool>(s.step.node()),
+                        static_cast<bool>(s.idx.node())};
     }
     return ret;
 }
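Note on the last two hunks: the new static_cast<bool> wrappers line up with P1957R2, the C++ defect report (implemented in clang-11) that makes pointer-to-bool a narrowing conversion inside braced initializer lists, turning the previously implicit conversions into hard errors. A minimal sketch, assuming node() returns a pointer and m_parent_graph is pointer-like:

    struct ItemDump { signed char axis; bool begin, end; };  // hypothetical aggregate

    void demo(int* p) {
        // ItemDump bad{0, p, p};  // clang-11: pointer -> bool is narrowing here
        ItemDump ok{0, static_cast<bool>(p), p != nullptr};  // either spelling works
        bool have{static_cast<bool>(p)};  // same fix for the bool member initializer
        (void)ok;
        (void)have;
    }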