Commit 8da2f698 authored by Megvii Engine Team

feat(dnn/cuda): support warp perspective/pooling op when channel not aligned to 64

GitOrigin-RevId: 39f29ec990ebb4c61e8d217911976b51810df34c
Parent c218d4b0
......@@ -568,7 +568,7 @@ TensorLayout LowbitsAlignedTensorFormatBase::collapse_contiguous_spec(
res.stride[0] = 1;
return res;
}
if (res.shape[i] == 1) {
if (res.shape[i] == 1 && res.stride[i] != 1) {
res.remove_axis_inplace(i);
}
}
......
......@@ -16,6 +16,7 @@
#include "src/cuda/int_fastdiv.cuh"
#include "src/cuda/query_blocksize.cuh"
#include "src/cuda/utils.cuh"
#include "src/cuda/integer_subbyte_utils.cuh"
/*
* please note that all arithmetics on GPU are 32-bit for best performance; this
......@@ -633,7 +634,7 @@ public:
int vec_idx = offset_ >> 1;
int lane_idx = offset_ & 0x1;
Storage item = Storage(unpack_integer_4bits<true>(
Storage item = Storage(integer_subbyte::unpack_integer_4bits<true>(
*(Storage*)&Super::m_ptr[vec_idx], lane_idx * 4));
dt_qint4 result(item);
......@@ -664,7 +665,7 @@ public:
int vec_idx = offset_ >> 1;
int lane_idx = offset_ & 0x1;
Storage item = Storage(unpack_integer_4bits<false>(
Storage item = Storage(integer_subbyte::unpack_integer_4bits<false>(
*(Storage*)&Super::m_ptr[vec_idx], lane_idx * 4));
dt_quint4 result(item);
......
......@@ -15,6 +15,7 @@
#include "src/cuda/elemwise_helper_q4.cuh"
#include "src/cuda/elemwise_multi_type/kern.cuh"
#include "src/cuda/utils.cuh"
#include "src/cuda/integer_subbyte_utils.cuh"
namespace megdnn {
namespace cuda {
......@@ -380,10 +381,10 @@ struct QuantizedMultiTypeOp<
}
__device__ __forceinline__ void operator()(uint32_t idx, src_vect_type a) {
dst_storage x = apply(
src_storage(unpack_integer_4bits<src_signedness>(a.x, 0)));
dst_storage y = apply(
src_storage(unpack_integer_4bits<src_signedness>(a.x, 4)));
dst_storage x = apply(src_storage(
integer_subbyte::unpack_integer_4bits<src_signedness>(a.x, 0)));
dst_storage y = apply(src_storage(
integer_subbyte::unpack_integer_4bits<src_signedness>(a.x, 4)));
*(dst_vect_type*)(&dst[idx]) =
elemwise_intl::VectTypeTrait<ctype_dst>::make_vector(x, y);
......@@ -470,14 +471,14 @@ struct QuantizedMultiTypeOp<
__device__ __forceinline__ void operator()(uint32_t idx, src_vect_type a,
src_vect_type b) {
src_storage a_x =
src_storage(unpack_integer_4bits<src_signedness>(a.x, 0));
src_storage a_y =
src_storage(unpack_integer_4bits<src_signedness>(a.x, 4));
src_storage b_x =
src_storage(unpack_integer_4bits<src_signedness>(b.x, 0));
src_storage b_y =
src_storage(unpack_integer_4bits<src_signedness>(b.x, 4));
src_storage a_x = src_storage(
integer_subbyte::unpack_integer_4bits<src_signedness>(a.x, 0));
src_storage a_y = src_storage(
integer_subbyte::unpack_integer_4bits<src_signedness>(a.x, 4));
src_storage b_x = src_storage(
integer_subbyte::unpack_integer_4bits<src_signedness>(b.x, 0));
src_storage b_y = src_storage(
integer_subbyte::unpack_integer_4bits<src_signedness>(b.x, 4));
dst_storage x = apply(a_x, b_x), y = apply(a_y, b_y);
......@@ -572,18 +573,18 @@ struct QuantizedMultiTypeOp<
__device__ __forceinline__ void operator()(uint32_t idx, src_vect_type a,
src_vect_type b,
src_vect_type c) {
src_storage a_x =
src_storage(unpack_integer_4bits<src_signedness>(a.x, 0));
src_storage a_y =
src_storage(unpack_integer_4bits<src_signedness>(a.x, 4));
src_storage b_x =
src_storage(unpack_integer_4bits<src_signedness>(b.x, 0));
src_storage b_y =
src_storage(unpack_integer_4bits<src_signedness>(b.x, 4));
src_storage c_x =
src_storage(unpack_integer_4bits<src_signedness>(c.x, 0));
src_storage c_y =
src_storage(unpack_integer_4bits<src_signedness>(c.x, 4));
src_storage a_x = src_storage(
integer_subbyte::unpack_integer_4bits<src_signedness>(a.x, 0));
src_storage a_y = src_storage(
integer_subbyte::unpack_integer_4bits<src_signedness>(a.x, 4));
src_storage b_x = src_storage(
integer_subbyte::unpack_integer_4bits<src_signedness>(b.x, 0));
src_storage b_y = src_storage(
integer_subbyte::unpack_integer_4bits<src_signedness>(b.x, 4));
src_storage c_x = src_storage(
integer_subbyte::unpack_integer_4bits<src_signedness>(c.x, 0));
src_storage c_y = src_storage(
integer_subbyte::unpack_integer_4bits<src_signedness>(c.x, 4));
dst_storage x = apply(a_x, b_x, c_x), y = apply(a_y, b_y, c_y);
......
/**
* \file dnn/src/cuda/integer_subbyte_utils.cuh
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#if MEGDNN_CC_CUDA
#pragma once
#include "src/cuda/utils.cuh"
namespace megdnn {
namespace cuda {
namespace integer_subbyte {
template <bool signedness>
struct integer_trait;
template <>
struct integer_trait<true> {
using type = int;
};
template <>
struct integer_trait<false> {
using type = unsigned;
};
MEGDNN_DEVICE __forceinline__ static int transform_int8_to_int4x8(
int s0, int s1, int s2, int s3, int s4, int s5, int s6, int s7) {
unsigned out;
#if __CUDA_ARCH__ >= 750 && \
((__CUDACC_VER_MAJOR__ > 10) || \
((__CUDACC_VER_MAJOR__ >= 10) && (__CUDACC_VER_MINOR__ >= 2)))
asm volatile(
"{ .reg .u32 r4;"
"cvt.pack.sat.s4.s32.b32 r4, %8, %7, 0;"
"cvt.pack.sat.s4.s32.b32 r4, %6, %5, r4;"
"cvt.pack.sat.s4.s32.b32 r4, %4, %3, r4;"
"cvt.pack.sat.s4.s32.b32 %0, %2, %1, r4;"
"}"
: "=r"(out)
: "r"(s0), "r"(s1), "r"(s2), "r"(s3), "r"(s4), "r"(s5), "r"(s6),
"r"(s7));
#else
#define CVT_SAT_S4_S32(r, bits) \
r = r <= -8 ? -8 : r; \
r = r > 7 ? 7 : r; \
r = (((unsigned)r & 0xf) << bits);
CVT_SAT_S4_S32(s0, 0)
CVT_SAT_S4_S32(s1, 4)
CVT_SAT_S4_S32(s2, 8)
CVT_SAT_S4_S32(s3, 12)
CVT_SAT_S4_S32(s4, 16)
CVT_SAT_S4_S32(s5, 20)
CVT_SAT_S4_S32(s6, 24)
CVT_SAT_S4_S32(s7, 28)
out = s0 + s1 + s2 + s3 + s4 + s5 + s6 + s7;
#undef CVT_SAT_S4_S32
#endif
return reinterpret_cast<int const&>(out);
}
MEGDNN_DEVICE __forceinline__ static int transform_int8_to_uint4x8(
int s0, int s1, int s2, int s3, int s4, int s5, int s6, int s7) {
unsigned out;
#if __CUDA_ARCH__ >= 750 && \
((__CUDACC_VER_MAJOR__ > 10) || \
((__CUDACC_VER_MAJOR__ >= 10) && (__CUDACC_VER_MINOR__ >= 2)))
asm volatile(
"{ .reg .u32 r4;"
"cvt.pack.sat.u4.s32.b32 r4, %8, %7, 0;"
"cvt.pack.sat.u4.s32.b32 r4, %6, %5, r4;"
"cvt.pack.sat.u4.s32.b32 r4, %4, %3, r4;"
"cvt.pack.sat.u4.s32.b32 %0, %2, %1, r4;"
"}"
: "=r"(out)
: "r"(s0), "r"(s1), "r"(s2), "r"(s3), "r"(s4), "r"(s5), "r"(s6),
"r"(s7));
#else
#define CVT_SAT_U4_S32(r, bits) \
r = r <= 0 ? 0 : r; \
r = r > 15 ? 15 : r; \
r = (((unsigned)r & 0xf) << bits);
CVT_SAT_U4_S32(s0, 0)
CVT_SAT_U4_S32(s1, 4)
CVT_SAT_U4_S32(s2, 8)
CVT_SAT_U4_S32(s3, 12)
CVT_SAT_U4_S32(s4, 16)
CVT_SAT_U4_S32(s5, 20)
CVT_SAT_U4_S32(s6, 24)
CVT_SAT_U4_S32(s7, 28)
out = s0 + s1 + s2 + s3 + s4 + s5 + s6 + s7;
#undef CVT_SAT_U4_S32
#endif
return reinterpret_cast<int const&>(out);
}
template <bool signedness, typename T>
MEGDNN_DEVICE __forceinline__ static int unpack_integer_4bits(T storage,
int bits) {
//! size in bits of 32 bit integer - 4 bits
static constexpr int shift = 28;
using type = typename integer_trait<signedness>::type;
unsigned intermediate = static_cast<unsigned>(storage);
type result = reinterpret_cast<type&>(intermediate);
return (result << (shift - bits)) >> shift;
}
MEGDNN_DEVICE __forceinline__ static void transform_int4x8_to_int8(
int (&result)[8], const int& source) {
#pragma unroll
for (int i = 0; i < 8; i++) {
result[i] = unpack_integer_4bits<true>(
reinterpret_cast<unsigned const&>(source), (i << 2));
}
}
MEGDNN_DEVICE __forceinline__ static void transform_uint4x8_to_int8(
int (&result)[8], const int& source) {
#pragma unroll
for (int i = 0; i < 8; i++) {
result[i] = unpack_integer_4bits<false>(
reinterpret_cast<unsigned const&>(source), (i << 2));
}
}
MEGDNN_DEVICE __forceinline__ static void transform_int4x2_to_int8(
int (&result)[2], const uint8_t& source) {
result[0] = unpack_integer_4bits<true>(source, 0);
result[1] = unpack_integer_4bits<true>(source, 4);
}
MEGDNN_DEVICE __forceinline__ static void transform_uint4x2_to_int8(
int (&result)[2], const uint8_t& source) {
result[0] = unpack_integer_4bits<false>(source, 0);
result[1] = unpack_integer_4bits<false>(source, 4);
}
} // namespace integer_subbyte
} // namespace cuda
} // namespace megdnn
#endif
// vim: ft=cpp syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
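Note: the unpack path above recovers one 4-bit lane by shifting the selected nibble to the top of a 32-bit word and shifting it back down, so the signed variant sign-extends while the unsigned variant zero-fills. A minimal host-side sketch of the same arithmetic (plain C++; the helper name unpack4 and the test values are illustrative, not part of the source):
// Host-side sketch of unpack_integer_4bits: move the chosen nibble to the top
// of a 32-bit word, then shift back down (arithmetic shift sign-extends,
// logical shift zero-fills).
#include <cassert>
#include <cstdint>

template <bool signedness>
static int unpack4(uint32_t storage, int bits) {
    constexpr int shift = 28;                      // 32 bits minus one 4-bit lane
    uint32_t shifted = storage << (shift - bits);  // nibble at `bits..bits+3` now at the top
    if (signedness)
        return static_cast<int32_t>(shifted) >> shift;  // arithmetic shift: sign-extend
    return static_cast<int>(shifted >> shift);           // logical shift: zero-fill
}

int main() {
    uint32_t packed = 0xF8;                   // low nibble 0x8, high nibble 0xF
    assert(unpack4<true>(packed, 0) == -8);   // signed view of 0x8
    assert(unpack4<true>(packed, 4) == -1);   // signed view of 0xF
    assert(unpack4<false>(packed, 0) == 8);   // unsigned view of 0x8
    assert(unpack4<false>(packed, 4) == 15);  // unsigned view of 0xF
    return 0;
}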
/**
* \file dnn/src/cuda/memory_utils.cuh
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#if MEGDNN_CC_CUDA
#pragma once
#include "src/cuda/utils.cuh"
namespace megdnn {
namespace cuda {
namespace memory {
/////////////////////////////////////////////////////////////////////////////////////////////////
template <typename AccessType, int LoadBytes>
struct global_load;
/////////////////////////////////////////////////////////////////////////////////////////////////
//
// Specializations
//
/////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////////////////////////////////////
// The redundant mov PTX instruction is used to enforce the compiler to
// initialize data to zero before ld.global
template <typename AccessType>
struct global_load<AccessType, 32> {
MEGDNN_DEVICE __forceinline__ global_load(AccessType& D, void const* ptr,
bool pred_guard, int val = 0) {
uint4* data = reinterpret_cast<uint4*>(&D);
asm volatile(
"{\n"
" .reg .pred p;\n"
" setp.ne.b32 p, %9, 0;\n"
" mov.b32 %0, %10;\n"
" mov.b32 %1, %10;\n"
" mov.b32 %2, %10;\n"
" mov.b32 %3, %10;\n"
" mov.b32 %4, %10;\n"
" mov.b32 %5, %10;\n"
" mov.b32 %6, %10;\n"
" mov.b32 %7, %10;\n"
" @p ld.global.v4.u32 {%0, %1, %2, %3}, [%8];\n"
" @p ld.global.v4.u32 {%4, %5, %6, %7}, [%11];\n"
"}\n"
: "=r"(data[0].x), "=r"(data[0].y), "=r"(data[0].z),
"=r"(data[0].w), "=r"(data[1].x), "=r"(data[1].y),
"=r"(data[1].z), "=r"(data[1].w)
: "l"(ptr), "r"((int)pred_guard),
"r"(reinterpret_cast<unsigned&>(val)),
"l"(((uint8_t*)ptr) + 16));
}
};
template <typename AccessType>
struct global_load<AccessType, 16> {
MEGDNN_DEVICE __forceinline__ global_load(AccessType& D, void const* ptr,
bool pred_guard, int val) {
uint4& data = reinterpret_cast<uint4&>(D);
asm volatile(
"{\n"
" .reg .pred p;\n"
" setp.ne.b32 p, %5, 0;\n"
" mov.b32 %0, %6;\n"
" mov.b32 %1, %6;\n"
" mov.b32 %2, %6;\n"
" mov.b32 %3, %6;\n"
" @p ld.global.v4.u32 {%0, %1, %2, %3}, [%4];\n"
"}\n"
: "=r"(data.x), "=r"(data.y), "=r"(data.z), "=r"(data.w)
: "l"(ptr), "r"((int)pred_guard),
"r"(reinterpret_cast<unsigned&>(val)));
}
};
template <typename AccessType>
struct global_load<AccessType, 8> {
MEGDNN_DEVICE __forceinline__ global_load(AccessType& D, void const* ptr,
bool pred_guard, int val) {
uint2& data = reinterpret_cast<uint2&>(D);
asm volatile(
"{\n"
" .reg .pred p;\n"
" setp.ne.b32 p, %3, 0;\n"
" mov.b32 %0, %4;\n"
" mov.b32 %1, %4;\n"
" @p ld.global.v2.u32 {%0, %1}, [%2];\n"
"}\n"
: "=r"(data.x), "=r"(data.y)
: "l"(ptr), "r"((int)pred_guard),
"r"(reinterpret_cast<unsigned&>(val)));
}
};
template <typename AccessType>
struct global_load<AccessType, 4> {
MEGDNN_DEVICE __forceinline__ global_load(AccessType& D, void const* ptr,
bool pred_guard, int val) {
unsigned& data = reinterpret_cast<unsigned&>(D);
asm volatile(
"{\n"
" .reg .pred p;\n"
" setp.ne.b32 p, %2, 0;\n"
" mov.b32 %0, %3;\n"
" @p ld.global.u32 %0, [%1];\n"
"}\n"
: "=r"(data)
: "l"(ptr), "r"((int)pred_guard),
"r"(reinterpret_cast<unsigned&>(val)));
}
};
template <typename AccessType>
struct global_load<AccessType, 1> {
MEGDNN_DEVICE __forceinline__ global_load(AccessType& D, void const* ptr,
bool pred_guard, int val) {
if (pred_guard)
D = *(reinterpret_cast<AccessType const*>(ptr));
else {
unsigned uv = reinterpret_cast<unsigned&>(val);
uint8_t& data = reinterpret_cast<uint8_t&>(D);
data = uv & 0xff;
}
}
};
/////////////////////////////////////////////////////////////////////////////////////////////////
template <
/// Fragment type to store loaded data
typename AccessType,
/// The bytes of loading
int LoadBytes>
struct global_store;
/////////////////////////////////////////////////////////////////////////////////////////////////
//
// Specializations
//
/////////////////////////////////////////////////////////////////////////////////////////////////
template <typename AccessType>
struct global_store<AccessType, 32> {
MEGDNN_DEVICE __forceinline__ global_store(AccessType const& D, void* ptr,
bool pred_guard) {
uint4 const* data = reinterpret_cast<uint4 const*>(&D);
asm volatile(
"{\n"
" .reg .pred p;\n"
" setp.ne.b32 p, %5, 0;\n"
" @p st.global.v4.u32 [%0], {%1, %2, %3, %4};\n"
" @p st.global.v4.u32 [%6], {%7, %8, %9, %10};\n"
"}\n"
:
: "l"(ptr), "r"(data[0].x), "r"(data[0].y), "r"(data[0].z),
"r"(data[0].w), "r"((int)pred_guard),
"l"(((uint8_t*)ptr) + 16), "r"(data[1].x), "r"(data[1].y),
"r"(data[1].z), "r"(data[1].w));
}
};
template <typename AccessType>
struct global_store<AccessType, 16> {
MEGDNN_DEVICE __forceinline__ global_store(AccessType const& D, void* ptr,
bool pred_guard) {
uint4 const& data = reinterpret_cast<uint4 const&>(D);
asm volatile(
"{\n"
" .reg .pred p;\n"
" setp.ne.b32 p, %5, 0;\n"
" @p st.global.v4.u32 [%0], {%1, %2, %3, %4};\n"
"}\n"
:
: "l"(ptr), "r"(data.x), "r"(data.y), "r"(data.z), "r"(data.w),
"r"((int)pred_guard));
}
};
template <typename AccessType>
struct global_store<AccessType, 8> {
MEGDNN_DEVICE __forceinline__ global_store(AccessType const& D, void* ptr,
bool pred_guard) {
uint2 const& data = reinterpret_cast<uint2 const&>(D);
asm volatile(
"{\n"
" .reg .pred p;\n"
" setp.ne.b32 p, %3, 0;\n"
" @p st.global.v2.u32 [%0], {%1, %2};\n"
"}\n"
:
: "l"(ptr), "r"(data.x), "r"(data.y), "r"((int)pred_guard));
}
};
template <typename AccessType>
struct global_store<AccessType, 4> {
MEGDNN_DEVICE __forceinline__ global_store(AccessType const& D, void* ptr,
bool pred_guard) {
uint32_t const& data = reinterpret_cast<uint32_t const&>(D);
asm volatile(
"{\n"
" .reg .pred p;\n"
" setp.ne.b32 p, %2, 0;\n"
" @p st.global.u32 [%0], %1;\n"
"}\n"
:
: "l"(ptr), "r"(data), "r"((int)pred_guard));
}
};
template <typename AccessType>
struct global_store<AccessType, 2> {
MEGDNN_DEVICE __forceinline__ global_store(AccessType const& D, void* ptr,
bool pred_guard) {
uint16_t const& data = reinterpret_cast<uint16_t const&>(D);
asm volatile(
"{\n"
" .reg .pred p;\n"
" setp.ne.b32 p, %2, 0;\n"
" @p st.global.u16 [%0], %1;\n"
"}\n"
:
: "l"(ptr), "h"(data), "r"((int)pred_guard));
}
};
template <typename AccessType>
struct global_store<AccessType, 1> {
MEGDNN_DEVICE __forceinline__ global_store(AccessType const& D, void* ptr,
bool pred_guard) {
if (pred_guard)
*(reinterpret_cast<AccessType*>(ptr)) = D;
}
};
} // namespace memory
} // namespace cuda
} // namespace megdnn
#endif
// vim: ft=cpp syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
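As the comment at the top of this file notes, the redundant mov instructions preload every destination register with the fill value, so lanes whose predicate is false still come back with well-defined data instead of garbage. A hedged usage sketch of the guarded load/store pair (the kernel, buffer names and launch shape are illustrative, not from the source; it assumes compilation with nvcc inside the megdnn build so that MEGDNN_CC_CUDA is set):
// Illustrative CUDA kernel: copy 16-byte packets, with out-of-bounds lanes
// reading the fill value and skipping the store entirely.
#include <cstdint>
#include "src/cuda/memory_utils.cuh"

namespace {
using AccessType = uint4;  // one 16-byte packet per thread

__global__ void copy_with_tail_guard(const uint8_t* src, uint8_t* dst,
                                     int valid_bytes, int fill) {
    int offset = (blockIdx.x * blockDim.x + threadIdx.x) * int(sizeof(AccessType));
    bool guard = offset + int(sizeof(AccessType)) <= valid_bytes;
    AccessType frag;
    // guard == false: frag is filled with `fill`, no global memory is read
    megdnn::cuda::memory::global_load<AccessType, 16>(frag, src + offset, guard, fill);
    // guard == false: the store is predicated off
    megdnn::cuda::memory::global_store<AccessType, 16>(frag, dst + offset, guard);
}
}  // namespace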
......@@ -83,12 +83,7 @@ bool RelayoutForwardImpl::Param::try_copy_contig() {
return false;
if (lsrc.stride[0] != 1 || ldst.stride[0] != 1)
return false;
size_t copy_size;
if (ldst.dtype.is_low_bit()) {
copy_size = ldst.access_bytes();
} else {
copy_size = ldst.total_nr_elems() * dtype_size();
}
size_t copy_size = ldst.span().dist_byte();
cuda_check(cudaMemcpyAsync(m_dst.raw_ptr, m_src.raw_ptr, copy_size,
cudaMemcpyDeviceToDevice, m_opr->stream()));
......@@ -191,7 +186,6 @@ bool RelayoutForwardImpl::Param::try_copy_last_contig() {
}
void RelayoutForwardImpl::Param::copy_general() {
copy_noncontig_general(m_dst, m_src, m_opr->stream());
}
......
......@@ -6,14 +6,15 @@
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#include "megdnn/basic_types.h"
#include "src/cuda/int_fastdiv.cuh"
#include "src/cuda/integer_subbyte_utils.cuh"
#include "src/cuda/utils.cuh"
#pragma once
namespace megdnn {
......@@ -56,13 +57,13 @@ y
template <int ndim, typename ctype, ContigType contig_type>
class ParamElemVisitor;
#define PARAM_ELEM_VISITOR_COMMON_DEV \
devfunc ctype *ptr() { return m_ptr; } \
devfunc ctype &at(uint32_t idx) { return m_ptr[offset(idx)]; }
devfunc ctype* ptr() { return m_ptr; } \
devfunc ctype& at(uint32_t idx) { return m_ptr[offset(idx)]; }
//! specialization for CONTIG_OTHER
template <int ndim, typename ctype>
class ParamElemVisitor<ndim, ctype, CONTIG_OTHER> {
ctype *__restrict m_ptr;
ctype* __restrict m_ptr;
int m_stride[ndim];
//! m_shape_highdim[i] = original_shape[i + 1]
......@@ -75,7 +76,7 @@ class ParamElemVisitor<ndim, ctype, CONTIG_OTHER> {
public:
static const int NDIM = ndim;
void host_init(const TensorND &rv, int grid_size, int block_size);
void host_init(const TensorND& rv, int grid_size, int block_size);
#if MEGDNN_CC_CUDA
devfunc void thread_init(uint32_t) {}
......@@ -86,7 +87,7 @@ public:
int offset = 0;
#pragma unroll
for (int i = ndim - 1; i >= 1; --i) {
Uint32Fastdiv &shp = m_shape_highdim[i - 1];
Uint32Fastdiv& shp = m_shape_highdim[i - 1];
uint32_t idx_div = idx / shp;
offset += (idx - idx_div * shp.divisor()) * m_stride[i];
idx = idx_div;
......@@ -102,12 +103,12 @@ public:
//! specialization for CONTIG_FULL
template <int ndim, typename ctype>
class ParamElemVisitor<ndim, ctype, CONTIG_FULL> {
ctype *__restrict m_ptr;
ctype* __restrict m_ptr;
public:
static const int NDIM = ndim;
void host_init(const TensorND &rv, int grid_size, int block_size);
void host_init(const TensorND& rv, int grid_size, int block_size);
#if MEGDNN_CC_CUDA
devfunc void thread_init(uint32_t) {}
......@@ -126,7 +127,6 @@ template <int ndim>
class ParamElemVisitor<ndim, dt_quint4, CONTIG_OTHER> {
using Storage = uint8_t;
protected:
Storage* __restrict m_ptr;
int m_stride[ndim];
int m_shape[ndim];
......@@ -205,7 +205,6 @@ public:
for (int i = 0; i < ndim; ++i) {
valid &= (shape_idx[i] < m_shape[i]);
}
#pragma unroll
for (int i = 0; i < ndim - 1; ++i) {
idx = (idx + shape_idx[i]) * m_shape[i + 1];
}
......@@ -213,7 +212,6 @@ public:
}
return idx;
}
devfunc Storage* ptr() { return m_ptr; }
devfunc Storage at(uint32_t idx) {
......@@ -221,7 +219,7 @@ public:
int vec_idx = offset_ >> 1;
int lane_idx = offset_ & 0x1;
Storage item = Storage(unpack_integer_4bits<false>(
Storage item = Storage(integer_subbyte::unpack_integer_4bits<false>(
*(Storage*)&m_ptr[vec_idx], lane_idx * 4));
return item;
......@@ -235,7 +233,7 @@ public:
#endif
};
} // namespace cuda
} // namespace megdnn
} // namespace cuda
} // namespace megdnn
// vim: ft=cpp syntax=cpp.doxygen
......@@ -143,6 +143,109 @@ struct global_load_with_zero_point<AccessType, 1> {
}
};
/////////////////////////////////////////////////////////////////////////////////////////////////
template <
/// Fragment type to store loaded data
typename AccessType,
/// The bytes of loading
int LoadBytes>
struct global_store;
/////////////////////////////////////////////////////////////////////////////////////////////////
//
// Specializations
//
/////////////////////////////////////////////////////////////////////////////////////////////////
template <typename AccessType>
struct global_store<AccessType, 32> {
devfunc global_store(AccessType const& D, void* ptr, bool pred_guard) {
uint4 const* data = reinterpret_cast<uint4 const*>(&D);
asm volatile(
"{\n"
" .reg .pred p;\n"
" setp.ne.b32 p, %5, 0;\n"
" @p st.global.v4.u32 [%0], {%1, %2, %3, %4};\n"
" @p st.global.v4.u32 [%6], {%7, %8, %9, %10};\n"
"}\n"
:
: "l"(ptr), "r"(data[0].x), "r"(data[0].y), "r"(data[0].z),
"r"(data[0].w), "r"((int)pred_guard),
"l"(((uint8_t*)ptr) + 16), "r"(data[1].x), "r"(data[1].y),
"r"(data[1].z), "r"(data[1].w));
}
};
template <typename AccessType>
struct global_store<AccessType, 16> {
devfunc global_store(AccessType const& D, void* ptr, bool pred_guard) {
uint4 const& data = reinterpret_cast<uint4 const&>(D);
asm volatile(
"{\n"
" .reg .pred p;\n"
" setp.ne.b32 p, %5, 0;\n"
" @p st.global.v4.u32 [%0], {%1, %2, %3, %4};\n"
"}\n"
:
: "l"(ptr), "r"(data.x), "r"(data.y), "r"(data.z), "r"(data.w),
"r"((int)pred_guard));
}
};
template <typename AccessType>
struct global_store<AccessType, 8> {
devfunc global_store(AccessType const& D, void* ptr, bool pred_guard) {
uint2 const& data = reinterpret_cast<uint2 const&>(D);
asm volatile(
"{\n"
" .reg .pred p;\n"
" setp.ne.b32 p, %3, 0;\n"
" @p st.global.v2.u32 [%0], {%1, %2};\n"
"}\n"
:
: "l"(ptr), "r"(data.x), "r"(data.y), "r"((int)pred_guard));
}
};
template <typename AccessType>
struct global_store<AccessType, 4> {
devfunc global_store(AccessType const& D, void* ptr, bool pred_guard) {
uint32_t const& data = reinterpret_cast<uint32_t const&>(D);
asm volatile(
"{\n"
" .reg .pred p;\n"
" setp.ne.b32 p, %2, 0;\n"
" @p st.global.u32 [%0], %1;\n"
"}\n"
:
: "l"(ptr), "r"(data), "r"((int)pred_guard));
}
};
template <typename AccessType>
struct global_store<AccessType, 2> {
devfunc global_store(AccessType const& D, void* ptr, bool pred_guard) {
uint16_t const& data = reinterpret_cast<uint16_t const&>(D);
asm volatile(
"{\n"
" .reg .pred p;\n"
" setp.ne.b32 p, %2, 0;\n"
" @p st.global.u16 [%0], %1;\n"
"}\n"
:
: "l"(ptr), "h"(data), "r"((int)pred_guard));
}
};
template <typename AccessType>
struct global_store<AccessType, 1> {
devfunc global_store(AccessType const& D, void* ptr, bool pred_guard) {
if (pred_guard)
*(reinterpret_cast<AccessType*>(ptr)) = D;
}
};
#undef devfunc
} // namespace relayout_format
} // namespace cuda
......
......@@ -10,17 +10,14 @@
* implied.
*/
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "cutlass/fast_math.h"
#include "cutlass/arch/memory.h"
#pragma GCC diagnostic pop
#include "src/cuda/int_fastdiv.cuh"
#include "src/cuda/query_blocksize.cuh"
#include "src/cuda/relayout_format/relayout_format.cuh"
#include "src/cuda/relayout_format/helper.cuh"
#include "src/cuda/integer_subbyte_utils.cuh"
#include "src/cuda/memory_utils.cuh"
using namespace megdnn;
using namespace cuda;
using namespace integer_subbyte;
namespace {
......@@ -322,26 +319,34 @@ struct Translayout<2, 64, SrcType, dtype::QuantizedS4, dtype::QuantizedS4,
int* dst_frag = reinterpret_cast<int*>(dst_width);
#pragma unroll
for (int i = 0; i < 64; i += 8) {
#define unpack_int4x2(_idx) \
intermediate[_idx][0] = unpack_integer_4bits<true>( \
reinterpret_cast<uint8_t&>(read_channel[i + _idx]), 0); \
intermediate[_idx][1] = unpack_integer_4bits<true>( \
reinterpret_cast<uint8_t&>(read_channel[i + _idx]), 4);
// clang-format off
unpack_int4x2(0)
unpack_int4x2(1)
unpack_int4x2(2)
unpack_int4x2(3)
unpack_int4x2(4)
unpack_int4x2(5)
unpack_int4x2(6)
unpack_int4x2(7)
// clang-format on
transform_int4x2_to_int8(
intermediate[0],
reinterpret_cast<uint8_t&>(read_channel[i + 0]));
transform_int4x2_to_int8(
intermediate[1],
reinterpret_cast<uint8_t&>(read_channel[i + 1]));
transform_int4x2_to_int8(
intermediate[2],
reinterpret_cast<uint8_t&>(read_channel[i + 2]));
transform_int4x2_to_int8(
intermediate[3],
reinterpret_cast<uint8_t&>(read_channel[i + 3]));
transform_int4x2_to_int8(
intermediate[4],
reinterpret_cast<uint8_t&>(read_channel[i + 4]));
transform_int4x2_to_int8(
intermediate[5],
reinterpret_cast<uint8_t&>(read_channel[i + 5]));
transform_int4x2_to_int8(
intermediate[6],
reinterpret_cast<uint8_t&>(read_channel[i + 6]));
transform_int4x2_to_int8(
intermediate[7],
reinterpret_cast<uint8_t&>(read_channel[i + 7]));
int frag_idx = i / 8;
dst_frag[0 * 8 + frag_idx] = pack_channel(0);
dst_frag[1 * 8 + frag_idx] = pack_channel(1);
#undef unpack_int4x2
}
}
using Fragment = array_wrapper<SrcType, 64>;
......@@ -429,26 +434,34 @@ struct Translayout<2, 64, SrcType, dtype::Quantized4Asymm,
int* dst_frag = reinterpret_cast<int*>(dst_width);
#pragma unroll
for (int i = 0; i < 64; i += 8) {
#define unpack_int4x2(_idx) \
intermediate[_idx][0] = unpack_integer_4bits<false>( \
reinterpret_cast<uint8_t&>(read_channel[i + _idx]), 0); \
intermediate[_idx][1] = unpack_integer_4bits<false>( \
reinterpret_cast<uint8_t&>(read_channel[i + _idx]), 4);
// clang-format off
unpack_int4x2(0)
unpack_int4x2(1)
unpack_int4x2(2)
unpack_int4x2(3)
unpack_int4x2(4)
unpack_int4x2(5)
unpack_int4x2(6)
unpack_int4x2(7)
// clang-format on
transform_uint4x2_to_int8(
intermediate[0],
reinterpret_cast<uint8_t&>(read_channel[i + 0]));
transform_uint4x2_to_int8(
intermediate[1],
reinterpret_cast<uint8_t&>(read_channel[i + 1]));
transform_uint4x2_to_int8(
intermediate[2],
reinterpret_cast<uint8_t&>(read_channel[i + 2]));
transform_uint4x2_to_int8(
intermediate[3],
reinterpret_cast<uint8_t&>(read_channel[i + 3]));
transform_uint4x2_to_int8(
intermediate[4],
reinterpret_cast<uint8_t&>(read_channel[i + 4]));
transform_uint4x2_to_int8(
intermediate[5],
reinterpret_cast<uint8_t&>(read_channel[i + 5]));
transform_uint4x2_to_int8(
intermediate[6],
reinterpret_cast<uint8_t&>(read_channel[i + 6]));
transform_uint4x2_to_int8(
intermediate[7],
reinterpret_cast<uint8_t&>(read_channel[i + 7]));
int frag_idx = i / 8;
dst_frag[0 * 8 + frag_idx] = pack_channel(0);
dst_frag[1 * 8 + frag_idx] = pack_channel(1);
#undef unpack_int4x2
}
}
using Fragment = array_wrapper<SrcType, 64>;
......@@ -744,6 +757,16 @@ inline __device__ int4 make_zero_pad<int4>(const uint8_t zero_point) {
return {zero_point, zero_point, zero_point, zero_point};
}
template <int size_nbits>
inline __device__ int make_zero(int zero_point);
template <>
inline __device__ int make_zero<4>(int zero_point) {
return transform_int8_to_uint4x8(zero_point, zero_point, zero_point,
zero_point, zero_point, zero_point,
zero_point, zero_point);
}
template <typename DstDtype>
inline __device__ void write_helper(DstDtype* ptr, DstDtype val) {
*ptr = val;
......@@ -1062,11 +1085,11 @@ public:
using AccessType = array_wrapper<Type, pack_size_in_type>;
using Fragment = array_wrapper<Type, elements_in_type>;
MEGDNN_DEVICE TensorIteratorOverChannel()
MEGDNN_HOST TensorIteratorOverChannel()
: pointer{nullptr}, chan_stride_in_elements{0}, channel{0} {}
MEGDNN_DEVICE TensorIteratorOverChannel(Type* pointer_,
int chan_stride_in_elements_,
int channel_, int, int)
MEGDNN_HOST TensorIteratorOverChannel(Type* pointer_,
int chan_stride_in_elements_,
int channel_, int, int)
: pointer{pointer_},
chan_stride_in_elements{chan_stride_in_elements_},
channel{channel_} {}
......@@ -1093,8 +1116,7 @@ public:
(lane_size_in_type / pack_size_in_type) +
j;
bool guard = i < channel;
relayout_format::global_load_with_zero_point<AccessType,
pack_size_in_byte>(
memory::global_load<AccessType, pack_size_in_byte>(
frag_ptr[frag_idx],
reinterpret_cast<void*>(pointer_ +
j * pack_size_in_type),
......@@ -1115,7 +1137,7 @@ public:
(lane_size_in_type / pack_size_in_type) +
j;
bool guard = i < channel;
cutlass::arch::global_store<AccessType, pack_size_in_byte>(
memory::global_store<AccessType, pack_size_in_byte>(
frag_ptr[frag_idx],
reinterpret_cast<void*>(pointer_ +
j * pack_size_in_type),
......@@ -1160,20 +1182,18 @@ public:
using AccessType = array_wrapper<Type, pack_size_in_type>;
using Fragment = array_wrapper<Type, elements_in_type>;
MEGDNN_HOST MEGDNN_DEVICE MaskedTensorIteratorOverChannel()
MEGDNN_HOST MaskedTensorIteratorOverChannel()
: pointer{nullptr},
chan_stride_in_elements{0},
channel{0} {}
MEGDNN_HOST MEGDNN_DEVICE MaskedTensorIteratorOverChannel(
MEGDNN_HOST MaskedTensorIteratorOverChannel(
Type* pointer_, int chan_stride_in_elements_, int channel_,
int bound_, int div_)
: pointer{pointer_},
chan_stride_in_elements{chan_stride_in_elements_},
channel{channel_},
bound{bound_},
div{div_} {
cutlass::find_divisor(mul, shr, div);
}
div{uint32_t(div_)} {}
MEGDNN_DEVICE __forceinline__ void initialize(int c_idx, int hw_idx) {
pointer += (c_idx / pack_size) * chan_stride_in_elements;
......@@ -1187,8 +1207,8 @@ public:
#pragma unroll
for (int j = 0; j < lane_size_in_type / pack_size_in_type; j++) {
int offset = hw_idx + j;
int h, w;
cutlass::fast_divmod(h, w, offset, div, mul, shr);
int h = (int)((uint32_t)(offset) / div);
int w = (int)((uint32_t)(offset) % div);
bool guard = (i < channel) && (w < bound);
int index = (i / pack_size) *
(lane_size_in_type / pack_size_in_type) +
......@@ -1219,8 +1239,7 @@ public:
int mask_index = (frag_idx >> 5);
int mask_shift = (frag_idx & 0x1f);
bool guard = (mask[mask_index] & (1 << mask_shift));
relayout_format::global_load_with_zero_point<AccessType,
pack_size_in_byte>(
memory::global_load<AccessType, pack_size_in_byte>(
frag_ptr[frag_idx],
reinterpret_cast<void*>(pointer_ + stride[j]), guard,
zero_point);
......@@ -1242,7 +1261,7 @@ public:
int mask_index = (frag_idx >> 5);
int mask_shift = (frag_idx & 0x1f);
bool guard = (mask[mask_index] & (1 << mask_shift));
cutlass::arch::global_store<AccessType, pack_size_in_byte>(
memory::global_store<AccessType, pack_size_in_byte>(
frag_ptr[frag_idx],
reinterpret_cast<void*>(pointer_ + stride[j]), guard);
}
......@@ -1260,9 +1279,7 @@ private:
int chan_stride_in_elements;
int channel;
int bound;
int div;
uint32_t mul;
uint32_t shr;
Uint32Fastdiv div;
uint32_t mask[mask_size];
size_t stride[lane_size_in_type / pack_size_in_type];
};
......@@ -1355,8 +1372,7 @@ __global__ void relayout_kern(typename RelayoutProblem_::Param param) {
param.dst_iterator.initialize(c_idx, hw_idx);
typename SrcIterator::Fragment src_frag;
typename DstIterator::Fragment dst_frag;
int zp = relayout_format::make_zero<SrcIterator::size_nbits>(
param.zero_point);
int zp = make_zero<SrcIterator::size_nbits>(param.zero_point);
param.src_iterator.load(src_frag, zp);
RelayoutProblem_::Transpose::trans(
reinterpret_cast<typename SrcIterator::Fragment&>(dst_frag),
......@@ -1456,7 +1472,7 @@ void relayout_format::relayout_format_cuda_nchw_nchwx(
megdnn_assert(src_layout.dtype.is_low_bit());
int n = src.layout[0];
int ic = src.layout[1];
int oc = dst.layout[1] * 64;
int oc = dst.layout[1] * pack_oc;
int h = src.layout[2];
// align to byte
int w = src.layout[3];
......
......@@ -223,10 +223,12 @@ struct TypeCvtOpFromQuantizedToQuantized4bit<
}
__device__ __forceinline__ void operator()(uint32_t idx,
src_vect_type src) {
dst_storage x = apply(
src_storage(unpack_integer_4bits<src_signedness>(src.x, 0)));
dst_storage y = apply(
src_storage(unpack_integer_4bits<src_signedness>(src.x, 4)));
dst_storage x = apply(src_storage(
integer_subbyte::unpack_integer_4bits<src_signedness>(src.x,
0)));
dst_storage y = apply(src_storage(
integer_subbyte::unpack_integer_4bits<src_signedness>(src.x,
4)));
*(dst_vect_type*)(&dest[idx]) =
VectTypeTrait<ctype_dest>::make_vector(x, y);
......
......@@ -21,7 +21,6 @@
#include "cuda.h"
#include "src/cuda/cudnn_with_check.h"
#include "cutlass/cutlass.h"
#include "cutlass/platform/platform.h"
#define cuda_check(_x) \
do { \
......@@ -376,104 +375,6 @@ MEGDNN_DEVICE __forceinline__ static float4 operator+(float4 lval,
lval.w + rval.w);
}
MEGDNN_DEVICE __forceinline__ static int transform_int8_to_int4x8(
int s0, int s1, int s2, int s3, int s4, int s5, int s6, int s7) {
unsigned out;
#if __CUDA_ARCH__ >= 750 && \
((__CUDACC_VER_MAJOR__ > 10) || \
((__CUDACC_VER_MAJOR__ >= 10) && (__CUDACC_VER_MINOR__ >= 2)))
asm volatile(
"{ .reg .u32 r4;"
"cvt.pack.sat.s4.s32.b32 r4, %8, %7, 0;"
"cvt.pack.sat.s4.s32.b32 r4, %6, %5, r4;"
"cvt.pack.sat.s4.s32.b32 r4, %4, %3, r4;"
"cvt.pack.sat.s4.s32.b32 %0, %2, %1, r4;"
"}"
: "=r"(out)
: "r"(s0), "r"(s1), "r"(s2), "r"(s3), "r"(s4), "r"(s5), "r"(s6),
"r"(s7));
#else
#define CVT_SAT_S4_S32(r, bits) \
r = r <= -8 ? -8 : r; \
r = r > 7 ? 7 : r; \
r = (((unsigned)r & 0xf) << bits);
CVT_SAT_S4_S32(s0, 0)
CVT_SAT_S4_S32(s1, 4)
CVT_SAT_S4_S32(s2, 8)
CVT_SAT_S4_S32(s3, 12)
CVT_SAT_S4_S32(s4, 16)
CVT_SAT_S4_S32(s5, 20)
CVT_SAT_S4_S32(s6, 24)
CVT_SAT_S4_S32(s7, 28)
out = s0 + s1 + s2 + s3 + s4 + s5 + s6 + s7;
#undef CVT_SAT_S4_S32
#endif
return reinterpret_cast<int const&>(out);
}
MEGDNN_DEVICE __forceinline__ static int transform_int8_to_uint4x8(
int s0, int s1, int s2, int s3, int s4, int s5, int s6, int s7) {
unsigned out;
#if __CUDA_ARCH__ >= 750 && \
((__CUDACC_VER_MAJOR__ > 10) || \
((__CUDACC_VER_MAJOR__ >= 10) && (__CUDACC_VER_MINOR__ >= 2)))
asm volatile(
"{ .reg .u32 r4;"
"cvt.pack.sat.u4.s32.b32 r4, %8, %7, 0;"
"cvt.pack.sat.u4.s32.b32 r4, %6, %5, r4;"
"cvt.pack.sat.u4.s32.b32 r4, %4, %3, r4;"
"cvt.pack.sat.u4.s32.b32 %0, %2, %1, r4;"
"}"
: "=r"(out)
: "r"(s0), "r"(s1), "r"(s2), "r"(s3), "r"(s4), "r"(s5), "r"(s6),
"r"(s7));
#else
#define CVT_SAT_U4_S32(r, bits) \
r = r <= 0 ? 0 : r; \
r = r > 15 ? 15 : r; \
r = (((unsigned)r & 0xf) << bits);
CVT_SAT_U4_S32(s0, 0)
CVT_SAT_U4_S32(s1, 4)
CVT_SAT_U4_S32(s2, 8)
CVT_SAT_U4_S32(s3, 12)
CVT_SAT_U4_S32(s4, 16)
CVT_SAT_U4_S32(s5, 20)
CVT_SAT_U4_S32(s6, 24)
CVT_SAT_U4_S32(s7, 28)
out = s0 + s1 + s2 + s3 + s4 + s5 + s6 + s7;
#undef CVT_SAT_U4_S32
#endif
return reinterpret_cast<int const&>(out);
}
template <bool signedness, typename T>
MEGDNN_DEVICE __forceinline__ static int unpack_integer_4bits(T storage,
int bits) {
static constexpr int shift = 28;
using type = typename cutlass::platform::conditional<signedness, int,
unsigned>::type;
unsigned intermediate = static_cast<unsigned>(storage);
type result = reinterpret_cast<type&>(intermediate);
return (result << (shift - bits)) >> shift;
}
MEGDNN_DEVICE __forceinline__ static void transform_int4x8_to_int8(
int (&result)[8], const int& source) {
#pragma unroll
for (int i = 0; i < 8; i++) {
result[i] = unpack_integer_4bits<true>(
reinterpret_cast<unsigned const&>(source), (i << 2));
}
}
MEGDNN_DEVICE __forceinline__ static void transform_uint4x8_to_int8(
int (&result)[8], const int& source) {
#pragma unroll
for (int i = 0; i < 8; i++) {
result[i] = unpack_integer_4bits<false>(
reinterpret_cast<unsigned const&>(source), (i << 2));
}
}
#endif
} // namespace cuda
} // namespace megdnn
......
......@@ -348,6 +348,7 @@ void WarpPerspectiveForwardImpl::exec(_megdnn_tensor_in ssrc,
RelayoutFormat::Param trans_param;
trans_param.mode =
RelayoutFormat::Param::Mode::NCHW64_NCHW;
trans_param.oc = sdst.layout[1];
relayout_opr->param() = trans_param;
relayout_opr->exec(dst, sdst, {});
}
......
......@@ -17,10 +17,12 @@
#include "src/common/rounding_converter.cuh"
#include "megdnn/dtype.h"
#include <cstdio>
#include "src/cuda/integer_subbyte_utils.cuh"
using namespace megdnn;
using namespace cuda;
using namespace warp_perspective;
using namespace integer_subbyte;
namespace {
......
......@@ -247,14 +247,13 @@ TEST_F(CUDA, POOLING_FORWARD_NCHW_Q4) {
using Param = param::Pooling;
Checker<Pooling> checker(handle_cuda());
Param param{Param::Mode::MAX, 0, 0, 2, 2, 2, 2};
checker.set_dtype(0, dtype::QuantizedS4(0.1f));
checker.set_dtype(0, dtype::QuantizedS4(3.1415926f));
param.format = Param::Format::NCHW;
checker.set_epsilon(1 + 1e-3);
checker.set_param(param).exec({{20, 64, 22, 33}, {}});
param.mode = Param::Mode::AVERAGE;
checker.set_param(param).exec({{20, 64, 22, 33}, {}});
checker.set_param(param).exec({{20, 96, 22, 33}, {}});
param.mode = Param::Mode::AVERAGE_COUNT_EXCLUDE_PADDING;
checker.set_param(param).exec({{20, 64, 22, 33}, {}});
checker.set_param(param).exec({{20, 24, 22, 33}, {}});
}
TEST_F(CUDA, POOLING_FORWARD_NCHW4) {
......
......@@ -107,7 +107,7 @@ TEST_F(CUDA, QUANTIZED_TYPECVT) {
}
TEST_F(CUDA, QUANTIZED_TYPECVT_4BIT) {
UniformIntRNG int_rng{0, 8};
UniformIntRNG int_rng{-8, 8};
Checker<TypeCvt> checker(handle_cuda());
checker.set_rng(0, &int_rng).set_rng(1, &int_rng);
......
......@@ -627,9 +627,9 @@ TEST_F(CUDA, WARP_PERSPECTIVE_FORWARD_QINT4) {
Checker<WarpPerspectiveForward> checker(handle_cuda());
WarpPerspectiveMatRNG rng;
checker.set_rng(1, &rng);
checker.set_dtype(0, dtype::QuantizedS4(0.1f))
checker.set_dtype(0, dtype::QuantizedS4(1.25f))
.set_dtype(1, dtype::Float32())
.set_dtype(2, dtype::QuantizedS4(0.1f));
.set_dtype(2, dtype::QuantizedS4(1.25f));
for (auto bmode : {WarpPerspective::BorderMode::WRAP,
WarpPerspective::BorderMode::REFLECT,
WarpPerspective::BorderMode::REPLICATE,
......
......@@ -68,7 +68,7 @@ using namespace gopt;
* oprs should not get involved in any actual computing.
*/
MGB_DEFINE_OPR_CLASS(TensorReformatPass::RelayoutPlaceholder,
cg::SingleCNOperatorNodeBase) // {
cg::SingleCNOperatorNodeBase) // {
public:
//! relayout type of this opr
enum class LayoutType {
......@@ -124,14 +124,14 @@ public:
NCHW4_TO_NCHW64, //! <from nchw4 layout to nchw64 layout
};
RelayoutPlaceholder(VarNode * src_var, LayoutType layout_type);
RelayoutPlaceholder(VarNode* src_var, LayoutType layout_type);
/*!
* \param src_var the input var
* \param layout_type tensor layout transform type of this relayout
* placeholder as described in LayoutType
*/
static SymbolVar make(VarNode * src_var, LayoutType layout_type);
static SymbolVar make(VarNode* src_var, LayoutType layout_type);
LayoutType layout_type() const { return m_layout_type; }
......@@ -141,7 +141,6 @@ private:
void init_output_comp_node() override;
const LayoutType m_layout_type;
};
MGB_DYN_TYPE_OBJ_FINAL_IMPL(TensorReformatPass::RelayoutPlaceholder);
TensorReformatPass::RelayoutPlaceholder::RelayoutPlaceholder(
......@@ -3866,8 +3865,12 @@ void PaddingChannelPass::apply(OptState& opt) const {
};
auto extract_subtensor = [](VarNode* inp,
size_t orig_channels) -> VarNode* {
const TensorShape& orig_shape) -> VarNode* {
mgb_assert(inp->shape().ndim == 4);
mgb_assert(inp->shape()[0] == orig_shape[0]);
mgb_assert(inp->shape()[2] == orig_shape[2]);
mgb_assert(inp->shape()[3] == orig_shape[3]);
size_t orig_channels = orig_shape[1];
auto x = SymbolVar(inp);
auto cv = [&x](int v) { return x.make_scalar(v); };
using AIdx = opr::Subtensor::AxisIndexer;
......@@ -4108,8 +4111,7 @@ void PaddingChannelPass::apply(OptState& opt) const {
bool padding_cur_inp =
padding_oprs.count(cur_inp->owner_opr()) > 0;
if (padding_cur_inp) {
size_t orig_channels = cur_inp->shape()[1];
inps[i] = extract_subtensor(inps[i], orig_channels);
inps[i] = extract_subtensor(inps[i], cur_inp->shape());
}
}
return serialization::copy_opr_shallow(*opr, inps, opr->config());
......@@ -4133,8 +4135,7 @@ void PaddingChannelPass::apply(OptState& opt) const {
auto cur_inp = opr->input(i);
bool padding_cur_inp = padding_oprs.count(cur_inp->owner_opr()) > 0;
if (padding_cur_inp) {
size_t orig_channels = cur_inp->shape()[1];
inps[i] = extract_subtensor(inps[i], orig_channels);
inps[i] = extract_subtensor(inps[i], cur_inp->shape());
}
}
return serialization::copy_opr_shallow(*opr, inps, opr->config());
......@@ -4142,6 +4143,8 @@ void PaddingChannelPass::apply(OptState& opt) const {
opr_replace_funcs[opr::Reshape::typeinfo()] = replace_nonpadding_oprs;
opr_replace_funcs[opr::GetVarShape::typeinfo()] = replace_nonpadding_oprs;
opr_replace_funcs[opr::Concat::typeinfo()] = replace_nonpadding_oprs;
opr_replace_funcs[opr::Reduce::typeinfo()] = replace_nonpadding_oprs;
opr_replace_funcs[opr::Subtensor::typeinfo()] = replace_nonpadding_oprs;
auto on_opr = [&opt, &rewriter, &opr_replace_funcs,
&extract_subtensor](OperatorNodeBase* opr) {
......@@ -4169,8 +4172,7 @@ void PaddingChannelPass::apply(OptState& opt) const {
auto dst = out1[i];
if (opt.graph().endpoint_contain(src) &&
!src->shape().eq_shape(dst->shape())) {
size_t orig_channels = src->shape()[1];
dst = extract_subtensor(dst, orig_channels);
dst = extract_subtensor(dst, src->shape());
}
rewriter.replace_var(src, dst, nullptr);
}
......
......@@ -4183,14 +4183,7 @@ TEST(TestGoptInference, PaddingChannels) {
REQUIRE_GPU(1);
auto cn = CompNode::load("gpu0");
cn.activate();
auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop;
auto sm_ver = prop.major * 10 + prop.minor;
if (sm_ver < 61) {
printf("This testcast ignored due to insufficient cuda cap(got: %d, "
"expected: %d)\n",
sm_ver, 61);
return;
}
REQUIRE_CUDA_COMPUTE_CAPABILITY(6, 1);
HostTensorGenerator<dtype::Int8> gen;
auto graph = ComputingGraph::make();
......@@ -4263,15 +4256,8 @@ TEST(TestGoptInference, ConcatAfterPaddingChannels) {
REQUIRE_GPU(1);
auto cn = CompNode::load("gpu0");
cn.activate();
auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop;
auto sm_ver = prop.major * 10 + prop.minor;
if (sm_ver < 61) {
printf("This testcast ignored due to insufficient cuda cap(got: %d, "
"expected: %d)\n",
sm_ver, 61);
return;
}
REQUIRE_CUDA_COMPUTE_CAPABILITY(6, 1);
HostTensorGenerator<dtype::Int8> gen;
auto graph = ComputingGraph::make();
graph->options().graph_opt_level = 0;
......@@ -4332,19 +4318,11 @@ TEST(TestGoptInference, ConcatAfterPaddingChannels) {
MGB_ASSERT_TENSOR_EQ(t1, t2);
}
// FIXME replace cpu with gpu to enable gpu validation
TEST(TestGoptInference, PaddingChannelsWithPooling) {
REQUIRE_GPU(1);
auto cn = CompNode::load("gpu0");
cn.activate();
auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop;
auto sm_ver = prop.major * 10 + prop.minor;
if (sm_ver < 61) {
printf("This testcast ignored due to insufficient cuda cap(got: %d, "
"expected: %d)\n",
sm_ver, 61);
return;
}
REQUIRE_CUDA_COMPUTE_CAPABILITY(6, 1);
HostTensorGenerator<dtype::Int8> gen;
auto graph = ComputingGraph::make();
......@@ -4408,17 +4386,7 @@ TEST(TestGoptInference, PaddingChannelsWithPooling) {
// FIXME replace cpu with gpu to enable gpu validation
TEST(TestGoptInference, PaddingChannelsWithWarpPerspective) {
REQUIRE_GPU(1);
auto cn = CompNode::load("cpu0");
// cn.activate();
// auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop;
// auto sm_ver = prop.major * 10 + prop.minor;
// if (sm_ver < 61) {
// printf("This testcast ignored due to insufficient cuda cap(got: %d, "
// "expected: %d)\n",
// sm_ver, 61);
// return;
// }
HostTensorGenerator<dtype::Int8> gen;
auto graph = ComputingGraph::make();
......@@ -4488,16 +4456,9 @@ TEST(TestGoptInference, PaddingChannelsWithWarpPerspective) {
TEST(TestGoptInference, EnableNCHW64Basic) {
REQUIRE_GPU(1);
auto cn = CompNode::load("cpu0");
// cn.activate();
// auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop;
// auto sm_ver = prop.major * 10 + prop.minor;
// if (sm_ver < 61) {
// printf("This testcast ignored due to insufficient cuda cap(got: %d, "
// "expected: %d)\n",
// sm_ver, 61);
// return;
// }
auto cn = CompNode::load("gpu0");
cn.activate();
REQUIRE_CUDA_COMPUTE_CAPABILITY_EQ(7, 5);
HostTensorGenerator<dtype::Int8> gen;
auto graph = ComputingGraph::make();
......@@ -4517,8 +4478,8 @@ TEST(TestGoptInference, EnableNCHW64Basic) {
};
auto x = mkvar("x", {16, 4, 14, 14}, dtype::QuantizedS8(2.5f)),
w = mkcvar("w", {32, 4, 3, 3}, dtype::QuantizedS8(2.5f)),
b = mkcvar("b", {1, 32, 1, 1}, dtype::QuantizedS32(6.25f));
w = mkcvar("w", {16, 4, 3, 3}, dtype::QuantizedS8(2.5f)),
b = mkcvar("b", {1, 16, 1, 1}, dtype::QuantizedS32(6.25f));
opr::ConvBias::Param param;
param.format = opr::ConvBias::Param::Format::NCHW;
param.nonlineMode = opr::ConvBias::Param::NonlineMode::IDENTITY;
......@@ -4527,7 +4488,7 @@ TEST(TestGoptInference, EnableNCHW64Basic) {
auto y = opr::ConvBias::make(x, w, b, param, {},
OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
auto w1 = mkcvar("w1", {32, 32, 3, 3}, dtype::QuantizedS8(2.5f)),
auto w1 = mkcvar("w1", {32, 16, 3, 3}, dtype::QuantizedS8(2.5f)),
b1 = mkcvar("b1", {1, 32, 1, 1}, dtype::QuantizedS32(6.25f));
auto y1 = opr::ConvBias::make(y, w1, b1, param, {},
OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
......@@ -4541,14 +4502,14 @@ TEST(TestGoptInference, EnableNCHW64Basic) {
auto y3 = opr::ConvBias::make(y2, w3, b3, param, {},
OperatorNodeConfig{dtype::QuantizedS4{40.f}});
y3 = opr::TypeCvt::make(y3, dtype::QuantizedS8{2.5f});
auto w4 = mkcvar("w4", {32, 64, 3, 3}, dtype::QuantizedS8(2.5f)),
b4 = mkcvar("b4", {1, 32, 1, 1}, dtype::QuantizedS32(6.25f));
auto w4 = mkcvar("w4", {16, 64, 3, 3}, dtype::QuantizedS8(2.5f)),
b4 = mkcvar("b4", {1, 16, 1, 1}, dtype::QuantizedS32(6.25f));
auto y4 = opr::ConvBias::make(y3, w4, b4, param, {},
OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
using ElemMultiMode = opr::ElemwiseMultiType::Param::Mode;
auto y5 = opr::ElemwiseMultiType::make(
{y, y4}, {ElemMultiMode::QFUSE_ADD_RELU},
OperatorNodeConfig{dtype::QuantizedS8{1.2f}});
OperatorNodeConfig{dtype::QuantizedS8{1.3f}});
y5 = opr::TypeCvt::make(y5, dtype::Float32());
SymbolVar y5_pad;
unpack_vector(
......@@ -4573,10 +4534,10 @@ TEST(TestGoptInference, EnableNCHW64Basic) {
ASSERT_EQ(o.param().format, Format::_fmt); \
}
CHECK(0, NCHW4);
CHECK(1, NCHW32);
CHECK(1, NCHW4);
CHECK(2, NCHW32);
CHECK(3, NCHW64);
CHECK(4, NCHW32);
CHECK(4, NCHW4);
#undef CHECK
HostTensorND t1, t2;
auto func1 = graph->compile({make_callback_copy(y5, t1)});
......@@ -4588,16 +4549,9 @@ TEST(TestGoptInference, EnableNCHW64Basic) {
TEST(TestGoptInference, EnableNCHW64PaddingChannel) {
REQUIRE_GPU(1);
auto cn = CompNode::load("cpu0");
// cn.activate();
// auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop;
// auto sm_ver = prop.major * 10 + prop.minor;
// if (sm_ver < 61) {
// printf("This testcast ignored due to insufficient cuda cap(got: %d, "
// "expected: %d)\n",
// sm_ver, 61);
// return;
// }
auto cn = CompNode::load("gpu0");
cn.activate();
REQUIRE_CUDA_COMPUTE_CAPABILITY_EQ(7, 5);
HostTensorGenerator<dtype::Int8> gen;
auto graph = ComputingGraph::make();
......@@ -4616,8 +4570,8 @@ TEST(TestGoptInference, EnableNCHW64PaddingChannel) {
dtype);
};
auto x = mkvar("x", {16, 3, 14, 14}, dtype::QuantizedS8(2.5f)),
w = mkcvar("w", {20, 3, 3, 3}, dtype::QuantizedS8(2.5f)),
auto x = mkvar("x", {16, 4, 14, 14}, dtype::QuantizedS8(2.5f)),
w = mkcvar("w", {20, 4, 3, 3}, dtype::QuantizedS8(2.5f)),
b = mkcvar("b", {1, 20, 1, 1}, dtype::QuantizedS32(6.25f));
opr::ConvBias::Param param;
param.format = opr::ConvBias::Param::Format::NCHW;
......@@ -4630,7 +4584,7 @@ TEST(TestGoptInference, EnableNCHW64PaddingChannel) {
opr::Pooling::Param pool;
pool.format = opr::Pooling::Param::Format::NCHW;
y = opr::Pooling::make(y, pool);
auto w1 = mkcvar("w1", {24, 20, 3, 3}, dtype::QuantizedS8(2.5f)),
b1 = mkcvar("b1", {1, 24, 1, 1}, dtype::QuantizedS32(6.25f));
auto y1 = opr::ConvBias::make(y, w1, b1, param, {},
......@@ -4657,11 +4611,12 @@ TEST(TestGoptInference, EnableNCHW64PaddingChannel) {
deconv.format = opr::ConvolutionBackwardData::Param::Format::NCHW;
deconv.stride_h = deconv.stride_w = 2;
deconv.pad_h = deconv.pad_w = 1;
auto w6 = mkcvar("w6", {20, 20, 4, 4}, dtype::QuantizedS8{2.5f});
auto w6 = mkcvar("w6", {20, 64, 4, 4}, dtype::QuantizedS8{2.5f});
auto y6 = opr::ConvolutionBackwardData::make(
w6, y5, deconv, {},
OperatorNodeConfig{dtype::QuantizedS8(2.0f)});
y6 = opr::TypeCvt::make(y6, dtype::QuantizedS4{32.f});
std::shared_ptr<HostTensorND> mat = std::make_shared<HostTensorND>(
cn, TensorShape{16, 3, 3}, dtype::Float32());
warp_perspective_mat_gen(*mat, 16, 14, 14);
......@@ -4676,25 +4631,31 @@ TEST(TestGoptInference, EnableNCHW64PaddingChannel) {
opt.enable_nchw64();
unpack_vector(gopt::optimize_for_inference({y7}, opt), y7_pad);
EXPECT_TRUE(y7.node()->shape().eq_shape(y7_pad.node()->shape()));
SmallVector<cg::OperatorNodeBase*> oprs;
auto cb = [&oprs](cg::OperatorNodeBase* opr) {
if (opr->same_type<opr::ConvBias>()) {
oprs.push_back(opr);
}
};
cg::DepOprIter{cb}.add(y7_pad.node()->owner_opr());
ASSERT_EQ(oprs.size(), 5);
HostTensorND t1, t2;
auto func1 = graph->compile({make_callback_copy(y7, t1)});
func1->execute();
auto func2 = graph->compile({make_callback_copy(y7_pad, t2)});
func2->execute();
MGB_ASSERT_TENSOR_EQ(t1, t2);
using Format = opr::ConvBiasForward::Param::Format;
SmallVector<cg::OperatorNodeBase*> oprs;
auto cb = [&oprs](cg::OperatorNodeBase* opr) {
if (opr->same_type<opr::ConvBias>()) {
oprs.push_back(opr);
}
};
cg::DepOprIter{cb}.add(y7_pad.node()->owner_opr());
ASSERT_EQ(oprs.size(), 5);
#define CHECK(_i, _fmt) \
{ \
const auto& o = oprs[_i]->cast_final<opr::ConvBias>(); \
ASSERT_EQ(o.param().format, Format::_fmt); \
}
CHECK(0, NCHW4);
CHECK(1, NCHW32);
CHECK(2, NCHW32);
CHECK(3, NCHW64);
CHECK(4, NCHW64);
CHECK(0, NCHW4);
CHECK(1, NCHW32);
CHECK(2, NCHW32);
CHECK(3, NCHW64);
CHECK(4, NCHW64);
#undef CHECK
{
const auto& deconv = find_opr<opr::ConvolutionBackwardData>(y7_pad);
......@@ -4702,30 +4663,19 @@ TEST(TestGoptInference, EnableNCHW64PaddingChannel) {
const auto& pool = find_opr<opr::PoolingForward>(y7_pad);
ASSERT_EQ(pool.param().format, Format::NCHW4);
const auto& warp = find_opr<opr::WarpPerspectiveForward>(y7_pad);
ASSERT_EQ(warp.param().format, Format::NCHW4);
ASSERT_EQ(warp.param().format, Format::NCHW64);
}
size_t nr_dimshuffle = find_opr_num<opr::Dimshuffle>(y7_pad);
HostTensorND t1, t2;
auto func1 = graph->compile({make_callback_copy(y7, t1)});
func1->execute();
auto func2 = graph->compile({make_callback_copy(y7_pad, t2)});
func2->execute();
MGB_ASSERT_TENSOR_EQ(t1, t2);
ASSERT_EQ(nr_dimshuffle, 8);
}
TEST(TestGoptInference, EnableNCHW64FuseConvBiasZ) {
REQUIRE_GPU(1);
auto cn = CompNode::load("cpu0");
// cn.activate();
// auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop;
// auto sm_ver = prop.major * 10 + prop.minor;
// if (sm_ver < 61) {
// printf("This testcast ignored due to insufficient cuda cap(got: %d, "
// "expected: %d)\n",
// sm_ver, 61);
// return;
// }
auto cn = CompNode::load("gpu0");
cn.activate();
REQUIRE_CUDA_COMPUTE_CAPABILITY_EQ(7, 5);
HostTensorND t1, t2;
HostTensorGenerator<dtype::Int8> gen;
auto graph = ComputingGraph::make();
graph->options().graph_opt_level = 0;
......@@ -4757,7 +4707,7 @@ TEST(TestGoptInference, EnableNCHW64FuseConvBiasZ) {
auto w1 = mkcvar("w1", {64, 32, 3, 3}, dtype::QuantizedS8(2.5f)),
b1 = mkcvar("b1", {1, 64, 1, 1}, dtype::QuantizedS32(6.25f));
auto y1 = opr::ConvBias::make(y, w1, b1, param, {},
OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
y1 = opr::TypeCvt::make(y1, dtype::QuantizedS4{40.f});
auto w2 = mkcvar("w2", {64, 64, 3, 3}, dtype::QuantizedS4(2.5f)),
b2 = mkcvar("b2", {1, 64, 1, 1}, dtype::QuantizedS32(100.f));
......@@ -4772,6 +4722,9 @@ TEST(TestGoptInference, EnableNCHW64FuseConvBiasZ) {
{y1, y3}, {ElemMultiMode::QFUSE_ADD_RELU},
OperatorNodeConfig{dtype::QuantizedS4{40.f}});
y4 = opr::TypeCvt::make(y4, dtype::Float32());
auto y5 = opr::ConvBias::make(y2, w3, b3, y1, param, {},
OperatorNodeConfig{dtype::QuantizedS4(40.f)});
y5 = opr::TypeCvt::make(y5, dtype::Float32());
SymbolVar y4_pad;
auto opt = gopt::OptimizeForInferenceOptions{};
opt.enable_nchw64();
......@@ -4779,15 +4732,35 @@ TEST(TestGoptInference, EnableNCHW64FuseConvBiasZ) {
EXPECT_TRUE(y4.node()->shape().eq_shape(y4_pad.node()->shape()));
size_t nr_elem_mult_type = find_opr_num<opr::ElemwiseMultiType>(y4_pad);
ASSERT_EQ(nr_elem_mult_type, 0);
// FIXME need impl of elemwise/elemwise_multi_type on CUDA
#if 0
HostTensorND t1, t2;
auto func1 = graph->compile({make_callback_copy(y4, t1)});
func1->execute();
auto func2 = graph->compile({make_callback_copy(y4_pad, t2)});
func2->execute();
auto func = graph->compile({make_callback_copy(y4_pad, t1)});
func->execute();
{
opr::ConvBias::Param param;
param.format = opr::ConvBias::Param::Format::NCHW;
param.nonlineMode = opr::ConvBias::Param::NonlineMode::IDENTITY;
param.stride_h = param.stride_w = 1;
param.pad_h = param.pad_w = 1;
auto y = opr::ConvBias::make(
x, w, b, param, {},
OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
auto y1 = opr::ConvBias::make(
y, w1, b1, param, {},
OperatorNodeConfig{dtype::QuantizedS8(2.5f)});
y1 = opr::TypeCvt::make(y1, dtype::QuantizedS4{40.f});
auto y2 = opr::ConvBias::make(
y1, w2, b2, param, {},
OperatorNodeConfig{dtype::QuantizedS4{40.f}});
param.nonlineMode = opr::ConvBias::Param::NonlineMode::RELU;
auto y3 = opr::ConvBias::make(
y2, w3, b3, y1, param, {},
OperatorNodeConfig{dtype::QuantizedS4(40.f)});
y3 = opr::TypeCvt::make(y3, dtype::Float32());
auto func = graph->compile({make_callback_copy(y3, t2)});
func->execute();
}
MGB_ASSERT_TENSOR_EQ(t1, t2);
#endif
}
#endif
......
......@@ -102,7 +102,8 @@ void WarpPerspectiveForward::outshape_by_symvar_do_get_output_shape(
default:
size_t height_idx = 0;
if (param().format == Param::Format::NCHW ||
param().format == Param::Format::NCHW4) {
param().format == Param::Format::NCHW4 ||
param().format == Param::Format::NCHW64) {
height_idx = 2;
} else {
height_idx = 1;
......
......@@ -2604,11 +2604,21 @@ TEST_F(TestNoWeightPreprocess, NoPreprocess) {
#endif
namespace {
// FIXME change comp node from "cpu0" to "gpu0"
TEST(TestOprDNN, ConvBiasInt4NCHW) {
auto run = [](size_t N, size_t C, size_t H, size_t W, size_t F, size_t S,
size_t P) {
auto cn = CompNode::load("cpu0");
REQUIRE_GPU(1);
auto cn = CompNode::load("gpu0");
cn.activate();
auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop;
auto sm_ver = prop.major * 10 + prop.minor;
if (sm_ver != 75) {
printf("This testcast ignored due to insufficient cuda cap(got: %d, "
"expected: %d)\n",
sm_ver, 75);
return;
}
auto run = [&cn](size_t N, size_t C, size_t H, size_t W, size_t F, size_t S,
size_t P) {
auto graph = ComputingGraph::make();
HostTensorGenerator<dtype::Int8> gen;
......@@ -2671,6 +2681,18 @@ TEST(TestOprDNN, ConvBiasInt4NCHW) {
}
TEST(TestOprDNN, ConvBiasInt4NCHW64) {
REQUIRE_GPU(1);
auto cn = CompNode::load("gpu0");
cn.activate();
auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop;
auto sm_ver = prop.major * 10 + prop.minor;
if (sm_ver != 75) {
printf("This testcast ignored due to insufficient cuda cap(got: %d, "
"expected: %d)\n",
sm_ver, 75);
return;
}
auto nchw2nchw64 = [](SymbolVar x) {
auto y = opr::RelayoutFormat::make(
x, opr::RelayoutFormat::Param::Mode::NCHW_NCHW64);
......@@ -2685,7 +2707,6 @@ TEST(TestOprDNN, ConvBiasInt4NCHW64) {
auto run = [&](size_t N, size_t C, size_t H, size_t W, size_t F, size_t S,
size_t P) {
auto cn = CompNode::load("cpu0");
auto graph = ComputingGraph::make();
HostTensorGenerator<dtype::Int8> gen;
......
......@@ -138,15 +138,19 @@ uint64_t eval_conv_computation(const TensorShape& src_shape,
src_shape[1] / group * 2;
return hybird_nchwx ? computation : computation * 4;
}
if (param.format == Param::Format::NCHW32 ||
param.format == Param::Format::NCHW32_NCHW4) {
return dst_shape.total_nr_elems() * fh * fw * src_shape[1] * 32 /
group * 2;
size_t packed_size;
if (param.format == Param::Format::NCHW64) {
packed_size = 64;
} else if (param.format == Param::Format::NCHW32 ||
param.format == Param::Format::NCHW32_NCHW4) {
packed_size = 32;
} else {
mgb_assert(param.format == Param::Format::NCHW4 ||
param.format == Param::Format::NCHW4_NCHW ||
param.format == Param::Format::NCHW4_NCHW32,
"format should be NCHW4/NCHW4_NCHW/NCHW4_NCHW32");
packed_size = 4;
}
mgb_assert(param.format == Param::Format::NCHW4 ||
param.format == Param::Format::NCHW4_NCHW ||
param.format == Param::Format::NCHW4_NCHW32,
"format should be NCHW4/NCHW4_NCHW/NCHW4_NCHW32");
return dst_shape.total_nr_elems() * fh * fw * src_shape[1] * 4 / group *
2;
};
......
......@@ -390,7 +390,37 @@ bool mgb::check_compute_capability(int major, int minor) {
MGB_CUDA_CHECK(cudaGetDevice(&dev));
cudaDeviceProp prop;
MGB_CUDA_CHECK(cudaGetDeviceProperties(&prop, dev));
return prop.major > major || (prop.major == major && prop.minor >= minor);
bool available = prop.major > major || (prop.major == major && prop.minor >= minor);
if (!available) {
mgb_log_warn(
"This testcase is ignored due to insufficient cuda cap(got: "
"%d.%d, "
"expected: %d.%d)",
prop.major, prop.minor, major, minor);
}
return available;
#else
MGB_MARK_USED_VAR(major);
MGB_MARK_USED_VAR(minor);
return false;
#endif
}
bool mgb::check_compute_capability_eq(int major, int minor) {
#if MGB_CUDA
int dev;
MGB_CUDA_CHECK(cudaGetDevice(&dev));
cudaDeviceProp prop;
MGB_CUDA_CHECK(cudaGetDeviceProperties(&prop, dev));
bool available = prop.major == major && prop.minor == minor;
if (!available) {
mgb_log_warn(
"This testcase is ignored due to insufficient cuda cap(got: "
"%d.%d, "
"expected: %d.%d)",
prop.major, prop.minor, major, minor);
}
return available;
#else
MGB_MARK_USED_VAR(major);
MGB_MARK_USED_VAR(minor);
......
......@@ -504,6 +504,9 @@ bool check_cambricon_device_available(size_t num);
//! check current capability >= major.minor
bool check_compute_capability(int major, int minor);
//! check current capability == major.minor
bool check_compute_capability_eq(int major, int minor);
//! check compnode avaiable
bool check_device_type_avaiable(CompNode::DeviceType device_type);
......@@ -540,6 +543,12 @@ public:
return; \
} while (0)
#define REQUIRE_CUDA_COMPUTE_CAPABILITY_EQ(major, minor) \
do { \
if (!check_compute_capability_eq(major, minor)) \
return; \
} while (0)
//! skip a testcase if amd gpu not available
#define REQUIRE_AMD_GPU(n) do { \
if (!check_amd_gpu_available(n)) \
......