提交 bf718b08 作者: Tomoaki Teshima

Use universal intrinsics in FAST

上级 6a5298a5
......@@ -42,7 +42,7 @@ The references are:
*/
#include "fast_score.hpp"
#include "opencv2/core/hal/intrin.hpp"
#define VERIFY_CORNERS 0
namespace cv {
......@@ -125,45 +125,48 @@ int cornerScore<16>(const uchar* ptr, const int pixel[], int threshold)
for( k = 0; k < N; k++ )
d[k] = (short)(v - ptr[pixel[k]]);
#if CV_SSE2
__m128i q0 = _mm_set1_epi16(-1000), q1 = _mm_set1_epi16(1000);
for( k = 0; k < 16; k += 8 )
#if CV_SIMD128
if (hasSIMD128())
{
v_int16x8 q0 = v_setall_s16(-1000), q1 = v_setall_s16(1000);
for (k = 0; k < 16; k += 8)
{
__m128i v0 = _mm_loadu_si128((__m128i*)(d+k+1));
__m128i v1 = _mm_loadu_si128((__m128i*)(d+k+2));
__m128i a = _mm_min_epi16(v0, v1);
__m128i b = _mm_max_epi16(v0, v1);
v0 = _mm_loadu_si128((__m128i*)(d+k+3));
a = _mm_min_epi16(a, v0);
b = _mm_max_epi16(b, v0);
v0 = _mm_loadu_si128((__m128i*)(d+k+4));
a = _mm_min_epi16(a, v0);
b = _mm_max_epi16(b, v0);
v0 = _mm_loadu_si128((__m128i*)(d+k+5));
a = _mm_min_epi16(a, v0);
b = _mm_max_epi16(b, v0);
v0 = _mm_loadu_si128((__m128i*)(d+k+6));
a = _mm_min_epi16(a, v0);
b = _mm_max_epi16(b, v0);
v0 = _mm_loadu_si128((__m128i*)(d+k+7));
a = _mm_min_epi16(a, v0);
b = _mm_max_epi16(b, v0);
v0 = _mm_loadu_si128((__m128i*)(d+k+8));
a = _mm_min_epi16(a, v0);
b = _mm_max_epi16(b, v0);
v0 = _mm_loadu_si128((__m128i*)(d+k));
q0 = _mm_max_epi16(q0, _mm_min_epi16(a, v0));
q1 = _mm_min_epi16(q1, _mm_max_epi16(b, v0));
v0 = _mm_loadu_si128((__m128i*)(d+k+9));
q0 = _mm_max_epi16(q0, _mm_min_epi16(a, v0));
q1 = _mm_min_epi16(q1, _mm_max_epi16(b, v0));
v_int16x8 v0 = v_load(d + k + 1);
v_int16x8 v1 = v_load(d + k + 2);
v_int16x8 a = v_min(v0, v1);
v_int16x8 b = v_max(v0, v1);
v0 = v_load(d + k + 3);
a = v_min(a, v0);
b = v_max(b, v0);
v0 = v_load(d + k + 4);
a = v_min(a, v0);
b = v_max(b, v0);
v0 = v_load(d + k + 5);
a = v_min(a, v0);
b = v_max(b, v0);
v0 = v_load(d + k + 6);
a = v_min(a, v0);
b = v_max(b, v0);
v0 = v_load(d + k + 7);
a = v_min(a, v0);
b = v_max(b, v0);
v0 = v_load(d + k + 8);
a = v_min(a, v0);
b = v_max(b, v0);
v0 = v_load(d + k);
q0 = v_max(q0, v_min(a, v0));
q1 = v_min(q1, v_max(b, v0));
v0 = v_load(d + k + 9);
q0 = v_max(q0, v_min(a, v0));
q1 = v_min(q1, v_max(b, v0));
}
q0 = _mm_max_epi16(q0, _mm_sub_epi16(_mm_setzero_si128(), q1));
q0 = _mm_max_epi16(q0, _mm_unpackhi_epi64(q0, q0));
q0 = _mm_max_epi16(q0, _mm_srli_si128(q0, 4));
q0 = _mm_max_epi16(q0, _mm_srli_si128(q0, 2));
threshold = (short)_mm_cvtsi128_si32(q0) - 1;
#else
q0 = v_max(q0, v_setzero_s16() - q1);
threshold = v_reduce_max(q0) - 1;
}
else
#endif
{
int a0 = threshold;
for( k = 0; k < 16; k += 2 )
{
......@@ -197,8 +200,8 @@ int cornerScore<16>(const uchar* ptr, const int pixel[], int threshold)
b0 = std::min(b0, std::max(b, (int)d[k+9]));
}
threshold = -b0-1;
#endif
threshold = -b0 - 1;
}
#if VERIFY_CORNERS
testCorner(ptr, pixel, K, N, threshold);
......@@ -214,44 +217,46 @@ int cornerScore<12>(const uchar* ptr, const int pixel[], int threshold)
short d[N + 4];
for( k = 0; k < N; k++ )
d[k] = (short)(v - ptr[pixel[k]]);
#if CV_SSE2
#if CV_SIMD128
for( k = 0; k < 4; k++ )
d[N+k] = d[k];
#endif
#if CV_SSE2
__m128i q0 = _mm_set1_epi16(-1000), q1 = _mm_set1_epi16(1000);
for( k = 0; k < 16; k += 8 )
#if CV_SIMD128
if (hasSIMD128())
{
__m128i v0 = _mm_loadu_si128((__m128i*)(d+k+1));
__m128i v1 = _mm_loadu_si128((__m128i*)(d+k+2));
__m128i a = _mm_min_epi16(v0, v1);
__m128i b = _mm_max_epi16(v0, v1);
v0 = _mm_loadu_si128((__m128i*)(d+k+3));
a = _mm_min_epi16(a, v0);
b = _mm_max_epi16(b, v0);
v0 = _mm_loadu_si128((__m128i*)(d+k+4));
a = _mm_min_epi16(a, v0);
b = _mm_max_epi16(b, v0);
v0 = _mm_loadu_si128((__m128i*)(d+k+5));
a = _mm_min_epi16(a, v0);
b = _mm_max_epi16(b, v0);
v0 = _mm_loadu_si128((__m128i*)(d+k+6));
a = _mm_min_epi16(a, v0);
b = _mm_max_epi16(b, v0);
v0 = _mm_loadu_si128((__m128i*)(d+k));
q0 = _mm_max_epi16(q0, _mm_min_epi16(a, v0));
q1 = _mm_min_epi16(q1, _mm_max_epi16(b, v0));
v0 = _mm_loadu_si128((__m128i*)(d+k+7));
q0 = _mm_max_epi16(q0, _mm_min_epi16(a, v0));
q1 = _mm_min_epi16(q1, _mm_max_epi16(b, v0));
v_int16x8 q0 = v_setall_s16(-1000), q1 = v_setall_s16(1000);
for (k = 0; k < 16; k += 8)
{
v_int16x8 v0 = v_load(d + k + 1);
v_int16x8 v1 = v_load(d + k + 2);
v_int16x8 a = v_min(v0, v1);
v_int16x8 b = v_max(v0, v1);
v0 = v_load(d + k + 3);
a = v_min(a, v0);
b = v_max(b, v0);
v0 = v_load(d + k + 4);
a = v_min(a, v0);
b = v_max(b, v0);
v0 = v_load(d + k + 5);
a = v_min(a, v0);
b = v_max(b, v0);
v0 = v_load(d + k + 6);
a = v_min(a, v0);
b = v_max(b, v0);
v0 = v_load(d + k);
q0 = v_max(q0, v_min(a, v0));
q1 = v_min(q1, v_max(b, v0));
v0 = v_load(d + k + 7);
q0 = v_max(q0, v_min(a, v0));
q1 = v_min(q1, v_max(b, v0));
}
q0 = v_max(q0, v_setzero_s16() - q1);
threshold = v_reduce_max(q0) - 1;
}
q0 = _mm_max_epi16(q0, _mm_sub_epi16(_mm_setzero_si128(), q1));
q0 = _mm_max_epi16(q0, _mm_unpackhi_epi64(q0, q0));
q0 = _mm_max_epi16(q0, _mm_srli_si128(q0, 4));
q0 = _mm_max_epi16(q0, _mm_srli_si128(q0, 2));
threshold = (short)_mm_cvtsi128_si32(q0) - 1;
#else
else
#endif
{
int a0 = threshold;
for( k = 0; k < 12; k += 2 )
{
......@@ -282,8 +287,7 @@ int cornerScore<12>(const uchar* ptr, const int pixel[], int threshold)
}
threshold = -b0-1;
#endif
}
#if VERIFY_CORNERS
testCorner(ptr, pixel, K, N, threshold);
#endif
......@@ -293,35 +297,37 @@ int cornerScore<12>(const uchar* ptr, const int pixel[], int threshold)
template<>
int cornerScore<8>(const uchar* ptr, const int pixel[], int threshold)
{
const int K = 4, N = K*3 + 1;
const int K = 4, N = K * 3 + 1;
int k, v = ptr[0];
short d[N];
for( k = 0; k < N; k++ )
for (k = 0; k < N; k++)
d[k] = (short)(v - ptr[pixel[k]]);
#if CV_SSE2
__m128i v0 = _mm_loadu_si128((__m128i*)(d+1));
__m128i v1 = _mm_loadu_si128((__m128i*)(d+2));
__m128i a = _mm_min_epi16(v0, v1);
__m128i b = _mm_max_epi16(v0, v1);
v0 = _mm_loadu_si128((__m128i*)(d+3));
a = _mm_min_epi16(a, v0);
b = _mm_max_epi16(b, v0);
v0 = _mm_loadu_si128((__m128i*)(d+4));
a = _mm_min_epi16(a, v0);
b = _mm_max_epi16(b, v0);
v0 = _mm_loadu_si128((__m128i*)(d));
__m128i q0 = _mm_min_epi16(a, v0);
__m128i q1 = _mm_max_epi16(b, v0);
v0 = _mm_loadu_si128((__m128i*)(d+5));
q0 = _mm_max_epi16(q0, _mm_min_epi16(a, v0));
q1 = _mm_min_epi16(q1, _mm_max_epi16(b, v0));
q0 = _mm_max_epi16(q0, _mm_sub_epi16(_mm_setzero_si128(), q1));
q0 = _mm_max_epi16(q0, _mm_unpackhi_epi64(q0, q0));
q0 = _mm_max_epi16(q0, _mm_srli_si128(q0, 4));
q0 = _mm_max_epi16(q0, _mm_srli_si128(q0, 2));
threshold = (short)_mm_cvtsi128_si32(q0) - 1;
#else
#if CV_SIMD128
if (hasSIMD128())
{
v_int16x8 v0 = v_load(d + 1);
v_int16x8 v1 = v_load(d + 2);
v_int16x8 a = v_min(v0, v1);
v_int16x8 b = v_max(v0, v1);
v0 = v_load(d + 3);
a = v_min(a, v0);
b = v_max(b, v0);
v0 = v_load(d + 4);
a = v_min(a, v0);
b = v_max(b, v0);
v0 = v_load(d);
v_int16x8 q0 = v_min(a, v0);
v_int16x8 q1 = v_max(b, v0);
v0 = v_load(d + 5);
q0 = v_max(q0, v_min(a, v0));
q1 = v_min(q1, v_max(b, v0));
q0 = v_max(q0, v_setzero_s16() - q1);
threshold = v_reduce_max(q0) - 1;
}
else
#endif
{
int a0 = threshold;
for( k = 0; k < 8; k += 2 )
{
......@@ -348,7 +354,7 @@ int cornerScore<8>(const uchar* ptr, const int pixel[], int threshold)
}
threshold = -b0-1;
#endif
}
#if VERIFY_CORNERS
testCorner(ptr, pixel, K, N, threshold);
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册