Commit 8fef78d0 authored by Megvii Engine Team

feat(dnn/cuda): add relayout format when width is an odd number

GitOrigin-RevId: f059f1f56dd66c33633118c893027ddd50ac8f1d
Parent 91d61607
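Note on the change: NCHW_NCHW64 stores a tensor as (N, C/64, H, W, 64), padding channels up to a multiple of 64, and with 4-bit dtypes two values share one byte, so an odd W leaves every other row of the packed NCHW source starting at a half-byte offset; that is the case this commit adds a CUDA path for. The sketch below is a hypothetical unpacked reference of the index mapping (the names `nchw_to_nchw64` and `CB` are mine, not MegDNN API), only to pin down what the kernel computes:

```cpp
#include <cstddef>
#include <cstdint>
#include <vector>

// Hypothetical reference: NCHW -> NCHW64 on unpacked 4-bit values (one
// int8 per element), sidestepping the sub-byte addressing the real CUDA
// kernel has to handle when W is odd.
std::vector<int8_t> nchw_to_nchw64(const std::vector<int8_t>& src,
                                   std::size_t N, std::size_t C,
                                   std::size_t H, std::size_t W) {
    const std::size_t CB = (C + 63) / 64;             // 64-channel blocks
    std::vector<int8_t> dst(N * CB * H * W * 64, 0);  // zeros = channel padding
    for (std::size_t n = 0; n < N; ++n)
        for (std::size_t c = 0; c < C; ++c)
            for (std::size_t h = 0; h < H; ++h)
                for (std::size_t w = 0; w < W; ++w) {
                    std::size_t s = ((n * C + c) * H + h) * W + w;
                    std::size_t d =
                            (((n * CB + c / 64) * H + h) * W + w) * 64 + c % 64;
                    dst[d] = src[s];
                }
    return dst;
}
```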
@@ -380,7 +380,7 @@ void RelayoutFormat::deduce_format(TensorFormat src, TensorFormat& dst) {
             break;
     }
-    if (!dst.is_default() &&
+    if (dst.type() == TensorFormat::Type::IMAGE2D_PACK4 &&
         (
          handle()->type() != Handle::HandleType::NAIVE)) {
 #if MEGDNN_ENABLE_MANGLING
......
@@ -87,10 +87,11 @@ public:
         for (size_t i = 0; i < shapes.size(); ++i) {
             DType dt = (m_dtype.find(i) != m_dtype.end() ? m_dtype[i]
                                                          : dtype::Float32());
-            TensorFormat fmt = (m_fmt.find(i) != m_fmt.end()
-                                        ? m_fmt[i]
-                                        : DefaultTensorFormat::make());
-            layouts[i] = TensorLayout(shapes[i], dt, fmt);
+            if (m_fmt.find(i) == m_fmt.end()) {
+                layouts[i] = TensorLayout(shapes[i], dt);
+                layouts[i].init_contiguous_stride();
+            } else
+                layouts[i] = TensorLayout(shapes[i], dt, m_fmt[i]);
         }
         return layouts;
     }
......
@@ -19,7 +19,6 @@ using namespace megdnn;
 using namespace test;
 
 namespace {
-
 template<typename ctype, class Iter>
 ::testing::AssertionResult assert_tensor_eq_with_iter(
         const char *expr0, const char *expr1,
@@ -30,7 +29,7 @@ namespace {
     double error_sum = 0;
     double error_sum_biased = 0;
     for (size_t i = 0; i < nr_elem; ++ i) {
-        ctype iv0 = *it0, iv1 = *it1;
+        ctype iv0 = ctype(*it0), iv1 = ctype(*it1);
         float err = diff(iv0, iv1);
         error_sum += std::abs(err);
         error_sum_biased += err;
@@ -84,12 +83,14 @@ namespace {
         const char *expr0, const char *expr1,
         const TensorND &v0, const TensorND &v1,
         float maxerr, float maxerr_avg, float maxerr_avg_biased) {
-
-    if (v0.layout.is_physical_contiguous() &&
-            v1.layout.is_physical_contiguous()) {
-        return assert_tensor_eq_with_iter<ctype>(
-                expr0, expr1, v0.ptr<ctype>(), v1.ptr<ctype>(), v0.layout,
-                maxerr, maxerr_avg, maxerr_avg_biased);
+    if (!std::is_same<ctype, dt_qint4>::value &&
+        !std::is_same<ctype, dt_quint4>::value) {
+        if (v0.layout.is_physical_contiguous() &&
+            v1.layout.is_physical_contiguous()) {
+            return assert_tensor_eq_with_iter<ctype>(
+                    expr0, expr1, v0.ptr<ctype>(), v1.ptr<ctype>(),
+                    v0.layout, maxerr, maxerr_avg, maxerr_avg_biased);
+        }
     }
     auto it0 = megdnn::tensor_iter_valonly<ctype>(v0).begin(),
@@ -100,56 +101,6 @@ namespace {
             maxerr_avg_biased);
 }
-
-template <typename ITYPE>
-::testing::AssertionResult assert_tensor_eq_with_lowbit4(
-        const char* expr0, const char* expr1,
-        const TensorND& v0, const TensorND& v1,
-        float maxerr, float maxerr_avg) {
-    if (!v0.layout.eq_layout(v1.layout)) {
-        return ::testing::AssertionFailure()
-                << "Layout mismatch for testing equality of lowbit4\n"
-                << "Value of: " << expr1 << "\n"
-                << "  Actual: " << v1.layout.TensorShape::to_string() << "\n"
-                << "Expected: " << expr0 << "\n"
-                << "Which is: " << v0.layout.TensorShape::to_string() << "\n";
-    }
-    auto v0_ptr = static_cast<ITYPE*>(v0.raw_ptr) - v0.layout.span().low_byte;
-    auto v1_ptr = static_cast<ITYPE*>(v1.raw_ptr) - v1.layout.span().low_byte;
-    double error_sum = 0;
-    for (size_t i = 0; i < v0.layout.span().dist_elem(); ++i) {
-        ITYPE iv0 = (v0_ptr[i / 2] << (i ^ 1) * 4);
-        iv0 = iv0 >> 4;
-        ITYPE iv1 = (v1_ptr[i / 2] << (i ^ 1) * 4);
-        iv1 = iv1 >> 4;
-        float err = std::abs(diff(iv0, iv1));
-        error_sum += err;
-        if (!good_float(iv0) || !good_float(iv1) || err >= maxerr) {
-            Index index(v0.layout, i);
-            return ::testing::AssertionFailure()
-                    << "Unequal value\n"
-                    << "Value of: " << expr1 << "\n"
-                    << "  Actual: " << (iv1 + 0) << "\n"
-                    << "Expected: " << expr0 << "\n"
-                    << "Which is: " << (iv0 + 0) << "\n"
-                    << "At index: " << index.to_string() << "/"
-                    << v0.layout.TensorShape::to_string() << "\n"
-                    << "   Dtype: " << v0.layout.dtype.name() << "\n"
-                    << "   error: " << err << "/" << maxerr;
-        }
-    }
-    float error_avg = error_sum / v0.layout.total_nr_elems();
-    if (error_avg > maxerr_avg) {
-        return ::testing::AssertionFailure()
-                << "Average error too high\n"
-                << "Value of: " << expr1 << "\n"
-                << "Expected: " << expr0 << "\n"
-                << "Average error: " << error_avg << "/" << maxerr_avg;
-    }
-    return ::testing::AssertionSuccess();
-}
-
 template<class Impl>
 void memcpy_noncontig(
         void *dst, const void *src, const TensorLayout &layout,
@@ -215,12 +166,7 @@ namespace {
     //! In order to avoid an unnecessary increase in binary size, we just
     //! use QuantizedS16 dtype in winograd_filter_preprocess now.
     cb(::megdnn::dtype::QuantizedS16)
-        case DTypeTrait<dtype::Quantized4Asymm>::enumv:
-            return assert_tensor_eq_with_lowbit4<uint8_t>(expr0, expr1, v0, v1,
-                                                          maxerr, maxerr_avg);
-        case DTypeTrait<dtype::QuantizedS4>::enumv:
-            return assert_tensor_eq_with_lowbit4<int8_t>(expr0, expr1, v0, v1,
-                                                         maxerr, maxerr_avg);
+    MEGDNN_FOREACH_QUANTIZED_LOWBIT_DTYPE(cb)
 #undef cb
         default:
             megdnn_trap();
......
@@ -228,6 +228,14 @@ static inline int diff(dt_qint8 x, dt_qint8 y) {
     return x.as_int8() - y.as_int8();
 }
 
+static inline int diff(dt_qint4 x, dt_qint4 y) {
+    return x.as_int8() - y.as_int8();
+}
+
+static inline int diff(dt_quint4 x, dt_quint4 y) {
+    return x.as_uint8() - y.as_uint8();
+}
+
 inline TensorShape cvt_src_or_dst_nchw2nhwc(const TensorShape& shape) {
     megdnn_assert(shape.ndim == 4);
     auto N = shape[0], C = shape[1], H = shape[2], W = shape[3];
@@ -356,6 +364,15 @@ static inline int operator+(dt_qint16 lhs, int rhs) {
     return lhs.as_int16();
 }
 
+static inline int operator+(dt_quint4 lhs, int rhs) {
+    megdnn_assert(rhs == 0, "unexpected rhs");
+    return lhs.as_uint8();
+}
+
+static inline int operator+(dt_qint4 lhs, int rhs) {
+    megdnn_assert(rhs == 0, "unexpected rhs");
+    return lhs.as_int8();
+}
+
 } // namespace test
 
 static inline bool operator==(const TensorLayout& a, const TensorLayout& b) {
......
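Note on the checker changes above: the bespoke `assert_tensor_eq_with_lowbit4` comparator is deleted, and qint4/quint4 tensors now go through the generic `tensor_iter_valonly` path, which is what the new `diff` and `operator+` overloads enable (the checker prints values through the `v + 0` trick, and `ctype(*it0)` converts the iterator's proxy value). For orientation, here is a minimal standalone sketch of the nibble addressing the removed helper did by hand, low nibble first; `load_int4`/`load_uint4` are illustrative names, not MegDNN functions:

```cpp
#include <cstddef>
#include <cstdint>

// Two 4-bit values per byte, element 0 in bits 0..3 (low nibble first).
int8_t load_int4(const int8_t* base, std::size_t i) {
    int8_t b = base[i / 2];
    if (i & 1)
        return int8_t(b >> 4);           // high nibble; >> on a signed byte
                                         // is arithmetic on mainstream ABIs
    return int8_t(int8_t(b << 4) >> 4);  // low nibble, sign-extended
}

uint8_t load_uint4(const uint8_t* base, std::size_t i) {
    uint8_t b = base[i / 2];
    return (i & 1) ? uint8_t(b >> 4) : uint8_t(b & 0xF);
}
```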
@@ -11,13 +11,14 @@
  */
 #include "megdnn/dtype.h"
 #include "megdnn/oprs.h"
-#include "test/common/benchmarker.h"
+#include "test/cuda/benchmark.h"
 #include "test/common/checker.h"
 #include "test/common/rng.h"
 #include "test/cuda/fixture.h"
 
 using namespace megdnn;
 using namespace test;
 
+#define MEGDNN_WITH_BENCHMARK 1
 TEST_F(CUDA, RELAYOUT_FORMAT) {
     Checker<RelayoutFormat> checker(handle_cuda());
@@ -246,7 +247,7 @@ TEST_F(CUDA, RELAYOUT_FORMAT_NCHW_NCHW64) {
     for (size_t n : {1, 3}) {
         for (size_t c : {64, 128}) {
             for (size_t h : {7, 14, 16, 28}) {
-                for (size_t w : {2, 4, 14, 16}) {
+                for (size_t w : {2, 3, 7, 8, 16, 31}) {
                     checker.set_dtype(0, dtype::QuantizedS4{2.f})
                             .set_dtype(1, dtype::QuantizedS4{2.f})
                             .set_rng(0, &s4)
@@ -286,7 +287,7 @@ TEST_F(CUDA, RELAYOUT_FORMAT_NCHW64_NCHW) {
     for (size_t n : {1, 3}) {
         for (size_t c : {64, 128}) {
             for (size_t h : {7, 14, 16, 28}) {
-                for (size_t w : {2, 4, 14, 16}) {
+                for (size_t w : {2, 3, 4, 7, 14, 16, 17}) {
                     checker.set_dtype(0, dtype::QuantizedS4{2.f})
                             .set_dtype(1, dtype::QuantizedS4{2.f})
                             .set_rng(0, &s4)
@@ -366,6 +367,46 @@ TEST_F(CUDA, BENCHMARK_RELAYOUT_FORMAT) {
         run(shapes, param, default_param);
     }
 }
+
+TEST_F(CUDA, BENCHMARK_RELAYOUT_FORMAT_QS4) {
+    using Param = RelayoutFormat::Param;
+    auto run = [&](const TensorShapeArray& shapes, Param param) {
+        CUBenchmarker<RelayoutFormat> benchmarker(handle_cuda());
+        benchmarker.set_param(param);
+        benchmarker.set_dtype(0, dtype::QuantizedS4{1.19990307f})
+                .set_dtype(1, dtype::QuantizedS4{1.20210322f});
+        for (auto&& shape : shapes) {
+            double memaccess = double(shape.total_nr_elems()) * 1e-6;
+            auto time_ms = benchmarker.execs({shape, {}});
+            printf("execute %s, time %.4f ms, %.4f GB/s\n",
+                   shape.to_string().c_str(), time_ms, memaccess / time_ms);
+        }
+    };
+    {
+        TensorShapeArray shapes = {
+                {1, 64, 56, 56}, {16, 64, 56, 56}, {64, 64, 56, 56},
+                {1, 64, 56, 55}, {16, 64, 56, 55}, {64, 64, 56, 55},
+        };
+        Param param;
+        param.mode = param::RelayoutFormat::Mode::NCHW_NCHW64;
+        run(shapes, param);
+    }
+    {
+        TensorShapeArray shapes = {
+                {64, 1, 56, 56, 64},
+                {1, 32, 7, 7, 64},
+                {16, 32, 7, 7, 64},
+                {64, 32, 7, 7, 64},
+        };
+        Param param;
+        param.mode = param::RelayoutFormat::Mode::NCHW64_NCHW;
+        run(shapes, param);
+    }
+}
+
 #endif
 
 TEST_F(CUDA, RELAYOUT_FORMAT_NCHW4) {
......
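Note on the new benchmark's GB/s figure: a QuantizedS4 element is half a byte, so a relayout moves about one byte per element (a 4-bit read plus a 4-bit write), and `total_nr_elems() * 1e-6 / time_ms` is then bytes * 1e-6 / ms, i.e. GB/s. A quick sanity check of that arithmetic (the 1 ms timing is made up, not a measured result):

```cpp
#include <cstdio>

int main() {
    double elems = 64.0 * 64 * 56 * 56;    // shape {64, 64, 56, 56}
    double bytes = elems * (0.5 + 0.5);    // 4-bit read + 4-bit write
    double time_ms = 1.0;                  // hypothetical kernel time
    double gbps = bytes * 1e-6 / time_ms;  // matches elems * 1e-6 / time_ms
    std::printf("%.2f GB/s\n", gbps);      // ~12.85 for this shape at 1 ms
    return 0;
}
```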