Commit 8fef78d0 authored by Megvii Engine Team

feat(dnn/cuda): add relayout format when width is an odd number

GitOrigin-RevId: f059f1f56dd66c33633118c893027ddd50ac8f1d
Parent 91d61607
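Context for the change: RelayoutFormat's NCHW ↔ NCHW64 modes repack channels in groups of 64 into the innermost dimension. For 4-bit dtypes such as QuantizedS4, two elements share one byte, so an odd spatial width W leaves NCHW rows that end mid-byte; the commit title indicates this is the case the new CUDA path handles, and the test widths below now include odd values (3, 7, 17, 31). A reference sketch of the repacking itself, using plain int elements instead of packed 4-bit ones and illustrative names (not MegDNN's kernel API); C is assumed to be a multiple of 64:

#include <cstddef>
#include <vector>

// dst layout is [N][C/64][H][W][64]: dst[n][c/64][h][w][c%64] = src[n][c][h][w].
std::vector<int> nchw_to_nchw64(const std::vector<int>& src, size_t N,
                                size_t C, size_t H, size_t W) {
    std::vector<int> dst(N * C * H * W);
    for (size_t n = 0; n < N; ++n)
        for (size_t c = 0; c < C; ++c)
            for (size_t h = 0; h < H; ++h)
                for (size_t w = 0; w < W; ++w) {
                    size_t src_idx = ((n * C + c) * H + h) * W + w;
                    size_t dst_idx =
                            (((n * (C / 64) + c / 64) * H + h) * W + w) * 64 +
                            c % 64;
                    dst[dst_idx] = src[src_idx];
                }
    return dst;
}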
@@ -380,7 +380,7 @@ void RelayoutFormat::deduce_format(TensorFormat src, TensorFormat& dst) {
             break;
     }
-    if (!dst.is_default() &&
+    if (dst.type() == TensorFormat::Type::IMAGE2D_PACK4 &&
         (
          handle()->type() != Handle::HandleType::NAIVE)) {
 #if MEGDNN_ENABLE_MANGLING
......
@@ -87,10 +87,11 @@ public:
         for (size_t i = 0; i < shapes.size(); ++i) {
             DType dt = (m_dtype.find(i) != m_dtype.end() ? m_dtype[i]
                                                          : dtype::Float32());
-            TensorFormat fmt = (m_fmt.find(i) != m_fmt.end()
-                                        ? m_fmt[i]
-                                        : DefaultTensorFormat::make());
-            layouts[i] = TensorLayout(shapes[i], dt, fmt);
+            if (m_fmt.find(i) == m_fmt.end()) {
+                layouts[i] = TensorLayout(shapes[i], dt);
+                layouts[i].init_contiguous_stride();
+            } else
+                layouts[i] = TensorLayout(shapes[i], dt, m_fmt[i]);
         }
         return layouts;
     }
......
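The new branch above builds the layout without any TensorFormat object and fills in dense strides explicitly via init_contiguous_stride(); previously even format-less cases went through DefaultTensorFormat::make(). As a stand-alone illustration of what contiguous (row-major) stride initialization computes — a sketch of the idea, not MegDNN's implementation:

#include <cstddef>

// Innermost dimension gets stride 1; each outer stride is the product of all
// inner extents (row-major / C-contiguous).
void init_contiguous_stride(size_t ndim, const size_t* shape,
                            ptrdiff_t* stride) {
    ptrdiff_t s = 1;
    for (size_t i = ndim; i-- > 0;) {
        stride[i] = s;
        s *= ptrdiff_t(shape[i]);
    }
}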
@@ -19,7 +19,6 @@ using namespace megdnn;
 using namespace test;
 
 namespace {
-
 template<typename ctype, class Iter>
 ::testing::AssertionResult assert_tensor_eq_with_iter(
         const char *expr0, const char *expr1,
@@ -30,7 +29,7 @@ namespace {
     double error_sum = 0;
     double error_sum_biased = 0;
     for (size_t i = 0; i < nr_elem; ++ i) {
-        ctype iv0 = *it0, iv1 = *it1;
+        ctype iv0 = ctype(*it0), iv1 = ctype(*it1);
         float err = diff(iv0, iv1);
         error_sum += std::abs(err);
         error_sum_biased += err;
@@ -84,12 +83,14 @@ namespace {
         const char *expr0, const char *expr1,
         const TensorND &v0, const TensorND &v1,
         float maxerr, float maxerr_avg, float maxerr_avg_biased) {
+    if (!std::is_same<ctype, dt_qint4>::value &&
+        !std::is_same<ctype, dt_quint4>::value) {
         if (v0.layout.is_physical_contiguous() &&
             v1.layout.is_physical_contiguous()) {
             return assert_tensor_eq_with_iter<ctype>(
-                    expr0, expr1, v0.ptr<ctype>(), v1.ptr<ctype>(), v0.layout,
-                    maxerr, maxerr_avg, maxerr_avg_biased);
+                    expr0, expr1, v0.ptr<ctype>(), v1.ptr<ctype>(),
+                    v0.layout, maxerr, maxerr_avg, maxerr_avg_biased);
+        }
     }
 
     auto it0 = megdnn::tensor_iter_valonly<ctype>(v0).begin(),
@@ -100,56 +101,6 @@ namespace {
             maxerr_avg_biased);
 }
 
-template <typename ITYPE>
-::testing::AssertionResult assert_tensor_eq_with_lowbit4(
-        const char* expr0, const char* expr1,
-        const TensorND& v0, const TensorND& v1,
-        float maxerr, float maxerr_avg) {
-    if (!v0.layout.eq_layout(v1.layout)) {
-        return ::testing::AssertionFailure()
-                << "Layout mismatch for testing equality of lowbit4\n"
-                << "Value of: " << expr1 << "\n"
-                << " Actual: " << v1.layout.TensorShape::to_string() << "\n"
-                << "Expected: " << expr0 << "\n"
-                << "Which is: " << v0.layout.TensorShape::to_string() << "\n";
-    }
-    auto v0_ptr = static_cast<ITYPE*>(v0.raw_ptr) - v0.layout.span().low_byte;
-    auto v1_ptr = static_cast<ITYPE*>(v1.raw_ptr) - v1.layout.span().low_byte;
-    double error_sum = 0;
-    for (size_t i = 0; i < v0.layout.span().dist_elem(); ++i) {
-        ITYPE iv0 = (v0_ptr[i / 2] << (i ^ 1) * 4);
-        iv0 = iv0 >> 4;
-        ITYPE iv1 = (v1_ptr[i / 2] << (i ^ 1) * 4);
-        iv1 = iv1 >> 4;
-        float err = std::abs(diff(iv0, iv1));
-        error_sum += err;
-        if (!good_float(iv0) || !good_float(iv1) || err >= maxerr) {
-            Index index(v0.layout, i);
-            return ::testing::AssertionFailure()
-                    << "Unequal value\n"
-                    << "Value of: " << expr1 << "\n"
-                    << " Actual: " << (iv1 + 0) << "\n"
-                    << "Expected: " << expr0 << "\n"
-                    << "Which is: " << (iv0 + 0) << "\n"
-                    << "At index: " << index.to_string() << "/"
-                    << v0.layout.TensorShape::to_string() << "\n"
-                    << " Dtype: " << v0.layout.dtype.name() << "\n"
-                    << " error: " << err << "/" << maxerr;
-        }
-    }
-    float error_avg = error_sum / v0.layout.total_nr_elems();
-    if (error_avg > maxerr_avg) {
-        return ::testing::AssertionFailure()
-                << "Average error too high\n"
-                << "Value of: " << expr1 << "\n"
-                << "Expected: " << expr0 << "\n"
-                << "Average error: " << error_avg << "/" << maxerr_avg;
-    }
-    return ::testing::AssertionSuccess();
-}
-
 template<class Impl>
 void memcpy_noncontig(
         void *dst, const void *src, const TensorLayout &layout,
@@ -215,12 +166,7 @@ namespace {
         //! In order to avoid an unnecessary increase in binary size, we just
         //! use QuantizedS16 dtype in winograd_filter_preprocess now.
         cb(::megdnn::dtype::QuantizedS16)
-        case DTypeTrait<dtype::Quantized4Asymm>::enumv:
-            return assert_tensor_eq_with_lowbit4<uint8_t>(expr0, expr1, v0, v1,
-                                                          maxerr, maxerr_avg);
-        case DTypeTrait<dtype::QuantizedS4>::enumv:
-            return assert_tensor_eq_with_lowbit4<int8_t>(expr0, expr1, v0, v1,
-                                                         maxerr, maxerr_avg);
+        MEGDNN_FOREACH_QUANTIZED_LOWBIT_DTYPE(cb)
 #undef cb
         default:
             megdnn_trap();
......
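The fifty deleted lines above were the hand-rolled comparator for packed 4-bit tensors: it unpacked two values per byte by shifting the wanted nibble into the high bits and arithmetic-shifting back down to sign-extend. With the dt_qint4/dt_quint4 diff() and operator+ overloads added in the checker.h hunks below, the generic iterator-based comparison covers these dtypes and the special case goes away. For reference, a minimal sketch of the sign-extending unpack (taking the low nibble as the even-index element is an assumption about packing order):

#include <cstddef>
#include <cstdint>

// Extract the i-th signed 4-bit element from a buffer packing two per byte.
int8_t unpack_qint4(const int8_t* packed, size_t i) {
    int8_t byte = packed[i / 2];
    // Move the wanted nibble into bits 4..7, then arithmetic-shift right by 4
    // to sign-extend it into the range [-8, 7].
    int8_t hi = (i & 1) ? byte : int8_t(byte << 4);
    return int8_t(hi >> 4);
}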
@@ -228,6 +228,14 @@ static inline int diff(dt_qint8 x, dt_qint8 y) {
     return x.as_int8() - y.as_int8();
 }
 
+static inline int diff(dt_qint4 x, dt_qint4 y) {
+    return x.as_int8() - y.as_int8();
+}
+
+static inline int diff(dt_quint4 x, dt_quint4 y) {
+    return x.as_uint8() - y.as_uint8();
+}
+
 inline TensorShape cvt_src_or_dst_nchw2nhwc(const TensorShape& shape) {
     megdnn_assert(shape.ndim == 4);
     auto N = shape[0], C = shape[1], H = shape[2], W = shape[3];
@@ -356,6 +364,15 @@ static inline int operator+(dt_qint16 lhs, int rhs) {
     return lhs.as_int16();
 }
 
+static inline int operator+(dt_quint4 lhs, int rhs) {
+    megdnn_assert(rhs == 0, "unexpected rhs");
+    return lhs.as_uint8();
+}
+
+static inline int operator+(dt_qint4 lhs, int rhs) {
+    megdnn_assert(rhs == 0, "unexpected rhs");
+    return lhs.as_int8();
+}
 } // namespace test
 
 static inline bool operator==(const TensorLayout& a, const TensorLayout& b) {
......
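Two things happen in the checker.h hunks above: diff() gains dt_qint4/dt_quint4 overloads so element-wise error is computed on the underlying integer representation, and operator+(x, 0) overloads are added because the checker streams failure values as (iv0 + 0) — adding zero forces an integral result, so quantized values print as numbers rather than raw chars. The same promotion trick with a plain int8_t, for illustration:

#include <cstdint>
#include <iostream>

int main() {
    int8_t v = 65;
    std::cout << v << "\n";      // int8_t streams as a character: prints "A"
    std::cout << v + 0 << "\n";  // integral promotion to int: prints "65"
    return 0;
}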
@@ -11,13 +11,14 @@
  */
 #include "megdnn/dtype.h"
 #include "megdnn/oprs.h"
-#include "test/common/benchmarker.h"
+#include "test/cuda/benchmark.h"
 #include "test/common/checker.h"
 #include "test/common/rng.h"
 #include "test/cuda/fixture.h"
 
 using namespace megdnn;
 using namespace test;
 
+#define MEGDNN_WITH_BENCHMARK 1
 TEST_F(CUDA, RELAYOUT_FORMAT) {
     Checker<RelayoutFormat> checker(handle_cuda());
@@ -246,7 +247,7 @@ TEST_F(CUDA, RELAYOUT_FORMAT_NCHW_NCHW64) {
     for (size_t n : {1, 3}) {
         for (size_t c : {64, 128}) {
             for (size_t h : {7, 14, 16, 28}) {
-                for (size_t w : {2, 4, 14, 16}) {
+                for (size_t w : {2, 3, 7, 8, 16, 31}) {
                     checker.set_dtype(0, dtype::QuantizedS4{2.f})
                             .set_dtype(1, dtype::QuantizedS4{2.f})
                             .set_rng(0, &s4)
@@ -286,7 +287,7 @@ TEST_F(CUDA, RELAYOUT_FORMAT_NCHW64_NCHW) {
     for (size_t n : {1, 3}) {
         for (size_t c : {64, 128}) {
             for (size_t h : {7, 14, 16, 28}) {
-                for (size_t w : {2, 4, 14, 16}) {
+                for (size_t w : {2, 3, 4, 7, 14, 16, 17}) {
                     checker.set_dtype(0, dtype::QuantizedS4{2.f})
                             .set_dtype(1, dtype::QuantizedS4{2.f})
                             .set_rng(0, &s4)
@@ -366,6 +367,46 @@ TEST_F(CUDA, BENCHMARK_RELAYOUT_FORMAT) {
         run(shapes, param, default_param);
     }
 }
 
+TEST_F(CUDA, BENCHMARK_RELAYOUT_FORMAT_QS4) {
+    using Param = RelayoutFormat::Param;
+    auto run = [&](const TensorShapeArray& shapes, Param param) {
+        CUBenchmarker<RelayoutFormat> benchmarker(handle_cuda());
+        benchmarker.set_param(param);
+        benchmarker.set_dtype(0, dtype::QuantizedS4{1.19990307f})
+                .set_dtype(1, dtype::QuantizedS4{1.20210322f});
+        for (auto&& shape : shapes) {
+            double memaccess = double(shape.total_nr_elems()) * 1e-6;
+            auto time_ms = benchmarker.execs({shape, {}});
+            printf("execute %s, time %.4f ms, %.4f GB/s\n",
+                   shape.to_string().c_str(), time_ms, memaccess / time_ms);
+        }
+    };
+    {
+        TensorShapeArray shapes = {
+                {1, 64, 56, 56},  {16, 64, 56, 56}, {64, 64, 56, 56},
+                {1, 64, 56, 55},  {16, 64, 56, 55}, {64, 64, 56, 55},
+        };
+        Param param;
+        param.mode = param::RelayoutFormat::Mode::NCHW_NCHW64;
+        run(shapes, param);
+    }
+    {
+        TensorShapeArray shapes = {
+                {64, 1, 56, 56, 64},
+                {1, 32, 7, 7, 64},
+                {16, 32, 7, 7, 64},
+                {64, 32, 7, 7, 64},
+        };
+        Param param;
+        param.mode = param::RelayoutFormat::Mode::NCHW64_NCHW;
+        run(shapes, param);
+    }
+}
 #endif
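A note on the figures the new benchmark prints: memaccess is total_nr_elems() × 1e-6, which reads as megabytes under the assumption that reading plus writing one element moves about one byte in total — plausible for QuantizedS4, where input and output each contribute half a byte per element — and MB divided by ms equals GB/s. A unit-check sketch of that reading (an interpretation, not MegDNN code):

// elems * 1e-6 ≈ MB of traffic (0.5 B read + 0.5 B written per 4-bit elem),
// and MB / ms == GB/s, matching the printf above.
double printed_gbps(double total_nr_elems, double time_ms) {
    double mbytes = total_nr_elems * 1e-6;
    return mbytes / time_ms;
}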
TEST_F(CUDA, RELAYOUT_FORMAT_NCHW4) {
......