Commit 3cbe60cc authored by Tomoaki Teshima, committed by Alexander Alekhin

Merge pull request #9753 from tomoaki0705:universalMatmul

* add accuracy tests and performance checks for matmul
  * add performance tests for transform and dotProduct
  * add test Core_TransformLargeTest for the 8u version of transform

* remove raw SSE2/NEON implementation from matmul.cpp
  * use universal intrinsics instead of raw intrinsics
  * remove unused templated function
  * add v_matmuladd, which multiplies by a 3x3 matrix and adds a 3x1 vector (a usage sketch follows below)
  * add v_rotate_left/right to the universal intrinsics
  * suppress intrinsics for some functions and platforms
  * add a pure software implementation of the new universal intrinsics
  * add tests for the new universal intrinsics

* core: prevent memory access past the end of the buffer

* fix perf tests
Parent 2674c6b5
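Editorial usage sketch (not part of this commit): the kind of inner loop the new v_matmuladd intrinsic enables for a 3x3 matrix transform with translation, as exercised by cv::transform. Names and data are illustrative; transform3x3 is a hypothetical helper, not the actual matmul.cpp code.

// Editorial sketch, assuming OpenCV's universal-intrinsics header is available.
#include "opencv2/core/hal/intrin.hpp"

using namespace cv;

static void transform3x3(const float* src, float* dst, int npoints,
                         const float m[12])   // 3x4 row-major matrix: [R | t]
{
    // Load the matrix columns once: lane i of mC holds row i, column C.
    v_float32x4 m0(m[0], m[4], m[8],  0.f);
    v_float32x4 m1(m[1], m[5], m[9],  0.f);
    v_float32x4 m2(m[2], m[6], m[10], 0.f);
    v_float32x4 m3(m[3], m[7], m[11], 0.f);   // translation column

    for (int i = 0; i < npoints; i++, src += 3, dst += 3)
    {
        v_float32x4 p(src[0], src[1], src[2], 0.f);
        v_float32x4 r = v_matmuladd(p, m0, m1, m2, m3);  // r[i] = row_i . p + t[i]
        float buf[4];
        v_store(buf, r);
        dst[0] = buf[0]; dst[1] = buf[1]; dst[2] = buf[2];
    }
}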
......@@ -885,12 +885,59 @@ template<typename _Tp, int n> inline v_reg<_Tp, n> operator shift_op(const v_reg
/** @brief Bitwise shift left
For 16-, 32- and 64-bit integer values. */
OPENCV_HAL_IMPL_SHIFT_OP(<<)
OPENCV_HAL_IMPL_SHIFT_OP(<< )
/** @brief Bitwise shift right
For 16-, 32- and 64-bit integer values. */
OPENCV_HAL_IMPL_SHIFT_OP(>>)
OPENCV_HAL_IMPL_SHIFT_OP(>> )
/** @brief Element shift left within the vector
For all types. */
#define OPENCV_HAL_IMPL_ROTATE_SHIFT_OP(suffix,opA,opB) \
template<int imm, typename _Tp, int n> inline v_reg<_Tp, n> v_rotate_##suffix(const v_reg<_Tp, n>& a) \
{ \
    v_reg<_Tp, n> b; \
    for (int i = 0; i < n; i++) \
    { \
        int sIndex = i opA imm; \
        if (0 <= sIndex && sIndex < n) \
        { \
            b.s[i] = a.s[sIndex]; \
        } \
        else \
        { \
            b.s[i] = 0; \
        } \
    } \
    return b; \
} \
template<int imm, typename _Tp, int n> inline v_reg<_Tp, n> v_rotate_##suffix(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
{ \
    v_reg<_Tp, n> c; \
    for (int i = 0; i < n; i++) \
    { \
        int aIndex = i opA imm; \
        int bIndex = i opA imm opB n; \
        if (0 <= bIndex && bIndex < n) \
        { \
            c.s[i] = b.s[bIndex]; \
        } \
        else if (0 <= aIndex && aIndex < n) \
        { \
            c.s[i] = a.s[aIndex]; \
        } \
        else \
        { \
            c.s[i] = 0; \
        } \
    } \
    return c; \
}
OPENCV_HAL_IMPL_ROTATE_SHIFT_OP(left, -, +)
OPENCV_HAL_IMPL_ROTATE_SHIFT_OP(right, +, -)
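A minimal editorial illustration (not part of the diff) of the lane movement the fallback above implements for 4-lane registers:

// Editorial sketch: expected lane movement of the fallback rotate.
// Vacated lanes are zero-filled, or taken from 'b' in the two-argument form.
static inline void v_rotate_example()
{
    v_uint32x4 a(1, 2, 3, 4), b(10, 20, 30, 40);
    unsigned buf[4];
    v_store(buf, v_rotate_right<1>(a));     // { 2,  3, 4,  0 }
    v_store(buf, v_rotate_left<1>(a));      // { 0,  1, 2,  3 }
    v_store(buf, v_rotate_right<1>(a, b));  // { 2,  3, 4, 10 }
    v_store(buf, v_rotate_left<1>(a, b));   // { 40, 1, 2,  3 }
}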
/** @brief Sum packed values
......@@ -1860,6 +1907,30 @@ inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0,
v.s[0]*m0.s[3] + v.s[1]*m1.s[3] + v.s[2]*m2.s[3] + v.s[3]*m3.s[3]);
}
/** @brief Matrix multiplication and add
Scheme:
@code
{A0 A1 A2 } |V0| |D0|
{B0 B1 B2 } |V1| |D1|
{C0 C1 C2 } x |V2| + |D2|
====================
{R0 R1 R2 R3}, where:
R0 = A0V0 + A1V1 + A2V2 + D0,
R1 = B0V0 + B1V1 + B2V2 + D1
...
@endcode
*/
inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0,
                               const v_float32x4& m1, const v_float32x4& m2,
                               const v_float32x4& m3)
{
    return v_float32x4(v.s[0]*m0.s[0] + v.s[1]*m1.s[0] + v.s[2]*m2.s[0] + m3.s[0],
                       v.s[0]*m0.s[1] + v.s[1]*m1.s[1] + v.s[2]*m2.s[1] + m3.s[1],
                       v.s[0]*m0.s[2] + v.s[1]*m1.s[2] + v.s[2]*m2.s[2] + m3.s[2],
                       v.s[0]*m0.s[3] + v.s[1]*m1.s[3] + v.s[2]*m2.s[3] + m3.s[3]);
}
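A short editorial example (not part of the commit) checking the scheme above on concrete numbers:

// Editorial sketch: v_matmuladd with identity columns and a translation.
// v = {1, 2, 3, *}, m0..m2 = identity columns, m3 = {10, 20, 30, 0};
// result lane i = v0*m0[i] + v1*m1[i] + v2*m2[i] + m3[i]  ->  {11, 22, 33, 0}
static inline v_float32x4 v_matmuladd_example()
{
    v_float32x4 v (1.f,  2.f,  3.f,  0.f);
    v_float32x4 m0(1.f,  0.f,  0.f,  0.f);
    v_float32x4 m1(0.f,  1.f,  0.f,  0.f);
    v_float32x4 m2(0.f,  0.f,  1.f,  0.f);
    v_float32x4 m3(10.f, 20.f, 30.f, 0.f);
    return v_matmuladd(v, m0, m1, m2, m3);   // {11, 22, 33, 0}
}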
//! @}
//! @name Check SIMD support
......
......@@ -407,6 +407,18 @@ inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0,
return v_float32x4(res);
}
inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0,
                               const v_float32x4& m1, const v_float32x4& m2,
                               const v_float32x4& a)
{
    float32x2_t vl = vget_low_f32(v.val), vh = vget_high_f32(v.val);
    float32x4_t res = vmulq_lane_f32(m0.val, vl, 0);
    res = vmlaq_lane_f32(res, m1.val, vl, 1);
    res = vmlaq_lane_f32(res, m2.val, vh, 0);
    res = vaddq_f32(res, a.val);
    return v_float32x4(res);
}
#define OPENCV_HAL_IMPL_NEON_BIN_OP(bin_op, _Tpvec, intrin) \
inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
{ \
......@@ -747,7 +759,15 @@ template<int n> inline _Tpvec v_shl(const _Tpvec& a) \
template<int n> inline _Tpvec v_shr(const _Tpvec& a) \
{ return _Tpvec(vshrq_n_##suffix(a.val, n)); } \
template<int n> inline _Tpvec v_rshr(const _Tpvec& a) \
{ return _Tpvec(vrshrq_n_##suffix(a.val, n)); }
{ return _Tpvec(vrshrq_n_##suffix(a.val, n)); } \
template<int n> inline _Tpvec v_rotate_right(const _Tpvec& a) \
{ return _Tpvec(vextq_##suffix(a.val, vdupq_n_##suffix(0), n)); } \
template<int n> inline _Tpvec v_rotate_left(const _Tpvec& a) \
{ return _Tpvec(vextq_##suffix(vdupq_n_##suffix(0), a.val, _Tpvec::nlanes - n)); } \
template<int n> inline _Tpvec v_rotate_right(const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(vextq_##suffix(a.val, b.val, n)); } \
template<int n> inline _Tpvec v_rotate_left(const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(vextq_##suffix(b.val, a.val, _Tpvec::nlanes - n)); }
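The NEON variants map the rotate onto vext, which extracts a window of lanes from the concatenation of two registers; an editorial sketch of the equivalence for 32-bit lanes (illustration only, not part of the commit):

#include <arm_neon.h>

// vextq_u32(a, b, 1) picks lanes starting at index 1 of the pair
// {a0 a1 a2 a3 b0 b1 b2 b3}, i.e. { a1, a2, a3, b0 } -- v_rotate_right<1>(a, b).
static inline uint32x4_t rotate_right_1_u32(uint32x4_t a, uint32x4_t b)
{
    return vextq_u32(a, b, 1);
}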
OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_uint8x16, u8, schar, s8)
OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_int8x16, s8, schar, s8)
......
......@@ -602,6 +602,16 @@ inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0,
return v_float32x4(_mm_add_ps(_mm_add_ps(v0, v1), _mm_add_ps(v2, v3)));
}
inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0,
                               const v_float32x4& m1, const v_float32x4& m2,
                               const v_float32x4& a)
{
    __m128 v0 = _mm_mul_ps(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE(0, 0, 0, 0)), m0.val);
    __m128 v1 = _mm_mul_ps(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE(1, 1, 1, 1)), m1.val);
    __m128 v2 = _mm_mul_ps(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE(2, 2, 2, 2)), m2.val);
    return v_float32x4(_mm_add_ps(_mm_add_ps(v0, v1), _mm_add_ps(v2, a.val)));
}
#define OPENCV_HAL_IMPL_SSE_BIN_OP(bin_op, _Tpvec, intrin) \
inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
......@@ -1011,6 +1021,29 @@ OPENCV_HAL_IMPL_SSE_SHIFT_OP(v_uint16x8, v_int16x8, epi16, _mm_srai_epi16)
OPENCV_HAL_IMPL_SSE_SHIFT_OP(v_uint32x4, v_int32x4, epi32, _mm_srai_epi32)
OPENCV_HAL_IMPL_SSE_SHIFT_OP(v_uint64x2, v_int64x2, epi64, v_srai_epi64)
template<int imm, typename _Tpvec>
inline _Tpvec v_rotate_right(const _Tpvec &a)
{
    return _Tpvec(_mm_srli_si128(a.val, imm*(sizeof(typename _Tpvec::lane_type))));
}
template<int imm, typename _Tpvec>
inline _Tpvec v_rotate_left(const _Tpvec &a)
{
    return _Tpvec(_mm_slli_si128(a.val, imm*(sizeof(typename _Tpvec::lane_type))));
}
template<int imm, typename _Tpvec>
inline _Tpvec v_rotate_right(const _Tpvec &a, const _Tpvec &b)
{
    const int cWidth = sizeof(typename _Tpvec::lane_type);
    return _Tpvec(_mm_or_si128(_mm_srli_si128(a.val, imm*cWidth), _mm_slli_si128(b.val, (16 - imm*cWidth))));
}
template<int imm, typename _Tpvec>
inline _Tpvec v_rotate_left(const _Tpvec &a, const _Tpvec &b)
{
    const int cWidth = sizeof(typename _Tpvec::lane_type);
    return _Tpvec(_mm_or_si128(_mm_slli_si128(a.val, imm*cWidth), _mm_srli_si128(b.val, (16 - imm*cWidth))));
}
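On SSE the rotate is expressed as whole-register byte shifts; an editorial sketch of the equivalence, spelled out for 32-bit lanes (cWidth = 4) and imm = 1 (illustration only, not part of the commit):

#include <emmintrin.h>  // SSE2

static inline __m128i rotate_right_1_epi32(__m128i a, __m128i b)
{
    __m128i lo = _mm_srli_si128(a, 4);    // { a1, a2, a3,  0 }
    __m128i hi = _mm_slli_si128(b, 12);   // {  0,  0,  0, b0 }
    return _mm_or_si128(lo, hi);          // { a1, a2, a3, b0 }
}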
#define OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(_Tpvec, _Tp) \
inline _Tpvec v_load(const _Tp* ptr) \
{ return _Tpvec(_mm_loadu_si128((const __m128i*)ptr)); } \
......
......@@ -1062,6 +1062,34 @@ OCL_PERF_TEST_P(ScaleAddFixture, ScaleAdd,
SANITY_CHECK(dst, 1e-6);
}
///////////// Transform ////////////////////////

typedef Size_MatType TransformFixture;

OCL_PERF_TEST_P(TransformFixture, Transform,
                ::testing::Combine(OCL_TEST_SIZES,
                ::testing::Values(CV_8UC3, CV_8SC3, CV_16UC3, CV_16SC3, CV_32SC3, CV_32FC3, CV_64FC3)))
{
    const Size_MatType_t params = GetParam();
    const Size srcSize = get<0>(params);
    const int type = get<1>(params);

    checkDeviceMaxMemoryAllocSize(srcSize, type);

    const float transform[] = { 0.5f,           0.f, 0.86602540378f, 128,
                                0.f,            1.f, 0.f,            -64,
                                0.86602540378f, 0.f, 0.5f,            32, };
    Mat mtx(Size(4, 3), CV_32FC1, (void*)transform);

    UMat src(srcSize, type), dst(srcSize, type);
    randu(src, 0, 30);
    declare.in(src).out(dst);

    OCL_TEST_CYCLE() cv::transform(src, dst, mtx);

    SANITY_CHECK(dst, 1e-6, ERROR_RELATIVE);
}
///////////// PSNR ////////////////////////
typedef Size_MatType PSNRFixture;
......
......@@ -96,3 +96,31 @@ PERF_TEST_P(Size_MatType, Mat_Clone_Roi,
SANITY_CHECK(destination, 1);
}
///////////// Transform ////////////////////////

PERF_TEST_P(Size_MatType, Mat_Transform,
            testing::Combine(testing::Values(TYPICAL_MAT_SIZES),
                             testing::Values(CV_8UC3, CV_8SC3, CV_16UC3, CV_16SC3, CV_32SC3, CV_32FC3, CV_64FC3))
            )
{
    const Size_MatType_t params = GetParam();
    const Size srcSize0 = get<0>(params);
    const Size srcSize = Size(1, srcSize0.width*srcSize0.height);
    const int type = get<1>(params);

    const float transform[] = { 0.5f,           0.f, 0.86602540378f, 128,
                                0.f,            1.f, 0.f,            -64,
                                0.86602540378f, 0.f, 0.5f,            32, };
    Mat mtx(Size(4, 3), CV_32FC1, (void*)transform);

    Mat src(srcSize, type), dst(srcSize, type);
    randu(src, 0, 30);
    declare.in(src).out(dst);

    TEST_CYCLE()
    {
        cv::transform(src, dst, mtx);
    }

    SANITY_CHECK(dst, 1e-6, ERROR_RELATIVE);
}
(The diff for this file is collapsed.)
......@@ -33,6 +33,7 @@ TEST(hal_intrin, uint8x16) {
.test_pack_u<1>().test_pack_u<2>().test_pack_u<3>().test_pack_u<8>()
.test_unpack()
.test_extract<0>().test_extract<1>().test_extract<8>().test_extract<15>()
.test_rotate<0>().test_rotate<1>().test_rotate<8>().test_rotate<15>()
;
}
......@@ -54,6 +55,7 @@ TEST(hal_intrin, int8x16) {
.test_pack<1>().test_pack<2>().test_pack<3>().test_pack<8>()
.test_unpack()
.test_extract<0>().test_extract<1>().test_extract<8>().test_extract<15>()
.test_rotate<0>().test_rotate<1>().test_rotate<8>().test_rotate<15>()
;
}
......@@ -81,6 +83,7 @@ TEST(hal_intrin, uint16x8) {
.test_pack_u<1>().test_pack_u<2>().test_pack_u<7>().test_pack_u<16>()
.test_unpack()
.test_extract<0>().test_extract<1>().test_extract<4>().test_extract<7>()
.test_rotate<0>().test_rotate<1>().test_rotate<4>().test_rotate<7>()
;
}
......@@ -107,6 +110,7 @@ TEST(hal_intrin, int16x8) {
.test_pack<1>().test_pack<2>().test_pack<7>().test_pack<16>()
.test_unpack()
.test_extract<0>().test_extract<1>().test_extract<4>().test_extract<7>()
.test_rotate<0>().test_rotate<1>().test_rotate<4>().test_rotate<7>()
;
}
......@@ -132,6 +136,7 @@ TEST(hal_intrin, uint32x4) {
.test_pack<1>().test_pack<2>().test_pack<15>().test_pack<32>()
.test_unpack()
.test_extract<0>().test_extract<1>().test_extract<2>().test_extract<3>()
.test_rotate<0>().test_rotate<1>().test_rotate<2>().test_rotate<3>()
.test_transpose()
;
}
......@@ -155,6 +160,7 @@ TEST(hal_intrin, int32x4) {
.test_pack<1>().test_pack<2>().test_pack<15>().test_pack<32>()
.test_unpack()
.test_extract<0>().test_extract<1>().test_extract<2>().test_extract<3>()
.test_rotate<0>().test_rotate<1>().test_rotate<2>().test_rotate<3>()
.test_float_cvt32()
.test_float_cvt64()
.test_transpose()
......@@ -170,6 +176,7 @@ TEST(hal_intrin, uint64x2) {
.test_shift<1>().test_shift<8>()
.test_logic()
.test_extract<0>().test_extract<1>()
.test_rotate<0>().test_rotate<1>()
;
}
......@@ -180,6 +187,7 @@ TEST(hal_intrin, int64x2) {
.test_shift<1>().test_shift<8>()
.test_logic()
.test_extract<0>().test_extract<1>()
.test_rotate<0>().test_rotate<1>()
;
}
......
......@@ -793,6 +793,30 @@ template<typename R> struct TheTest
return *this;
}
    template<int s>
    TheTest & test_rotate()
    {
        Data<R> dataA, dataB;
        dataB *= 10;
        R a = dataA, b = dataB;

        Data<R> resC = v_rotate_right<s>(a);
        Data<R> resD = v_rotate_right<s>(a, b);

        for (int i = 0; i < R::nlanes; ++i)
        {
            if (i + s >= R::nlanes)
            {
                EXPECT_EQ((LaneType)0, resC[i]);
                EXPECT_EQ(dataB[i - R::nlanes + s], resD[i]);
            }
            else
                EXPECT_EQ(dataA[i + s], resC[i]);
        }
        return *this;
    }
TheTest & test_float_math()
{
typedef typename V_RegTrait128<LaneType>::int_reg Ri;
......@@ -882,6 +906,16 @@ template<typename R> struct TheTest
+ dataV[3] * dataD[i];
EXPECT_DOUBLE_EQ(val, res[i]);
}
        Data<R> resAdd = v_matmuladd(v, a, b, c, d);
        for (int i = 0; i < R::nlanes; ++i)
        {
            LaneType val = dataV[0] * dataA[i]
                         + dataV[1] * dataB[i]
                         + dataV[2] * dataC[i]
                         + dataD[i];
            EXPECT_DOUBLE_EQ(val, resAdd[i]);
        }
return *this;
}
......
......@@ -904,6 +904,60 @@ void Core_TransformTest::prepare_to_validation( int )
cvtest::transform( test_mat[INPUT][0], test_mat[REF_OUTPUT][0], transmat, shift );
}
class Core_TransformLargeTest : public Core_TransformTest
{
public:
    typedef Core_MatrixTest Base;
protected:
    void get_test_array_types_and_sizes(int test_case_idx, vector<vector<Size> >& sizes, vector<vector<int> >& types);
};

void Core_TransformLargeTest::get_test_array_types_and_sizes(int test_case_idx, vector<vector<Size> >& sizes, vector<vector<int> >& types)
{
    RNG& rng = ts->get_rng();
    int bits = cvtest::randInt(rng);
    int depth, dst_cn, mat_cols, mattype;
    Base::get_test_array_types_and_sizes(test_case_idx, sizes, types);
    for (unsigned int j = 0; j < sizes.size(); j++)
    {
        for (unsigned int i = 0; i < sizes[j].size(); i++)
        {
            sizes[j][i].width *= 4;
        }
    }

    mat_cols = CV_MAT_CN(types[INPUT][0]);
    depth = CV_MAT_DEPTH(types[INPUT][0]);
    dst_cn = cvtest::randInt(rng) % 4 + 1;
    types[OUTPUT][0] = types[REF_OUTPUT][0] = CV_MAKETYPE(depth, dst_cn);

    mattype = depth < CV_32S ? CV_32F : depth == CV_64F ? CV_64F : bits & 1 ? CV_32F : CV_64F;
    types[INPUT][1] = mattype;
    types[INPUT][2] = CV_MAKETYPE(mattype, dst_cn);

    scale = 1. / ((cvtest::randInt(rng) % 4) * 50 + 1);

    if (bits & 2)
    {
        sizes[INPUT][2] = Size(0, 0);
        mat_cols += (bits & 4) != 0;
    }
    else if (bits & 4)
        sizes[INPUT][2] = Size(1, 1);
    else
    {
        if (bits & 8)
            sizes[INPUT][2] = Size(dst_cn, 1);
        else
            sizes[INPUT][2] = Size(1, dst_cn);
        types[INPUT][2] &= ~CV_MAT_CN_MASK;
    }
    diagMtx = (bits & 16) != 0;

    sizes[INPUT][1] = Size(mat_cols, dst_cn);
}
///////////////// PerspectiveTransform /////////////////////
......@@ -2691,6 +2745,7 @@ TEST(Core_Invert, accuracy) { Core_InvertTest test; test.safe_run(); }
TEST(Core_Mahalanobis, accuracy) { Core_MahalanobisTest test; test.safe_run(); }
TEST(Core_MulTransposed, accuracy) { Core_MulTransposedTest test; test.safe_run(); }
TEST(Core_Transform, accuracy) { Core_TransformTest test; test.safe_run(); }
TEST(Core_TransformLarge, accuracy) { Core_TransformLargeTest test; test.safe_run(); }
TEST(Core_PerspectiveTransform, accuracy) { Core_PerspectiveTransformTest test; test.safe_run(); }
TEST(Core_Pow, accuracy) { Core_PowTest test; test.safe_run(); }
TEST(Core_SolveLinearSystem, accuracy) { Core_SolveTest test; test.safe_run(); }
......