unified norm computing; added generalized Hamming distance

b74116e6 · Vadim Pisarevsky · c1277b61 · b74116e6 · b74116e6 · b74116e6
12 changed files
--- a/modules/core/include/opencv2/core/core.hpp
+++ b/modules/core/include/opencv2/core/core.hpp
@@ -598,6 +598,9 @@ public:
    //! per-element multiplication
    Vec mul(const Vec<_Tp, cn>& v) const;
    
+    //! conjugation (makes sense for complex numbers and quaternions)
+    Vec conj() const;
+    
    /*!
      cross product of the two 3D vectors.
    

--- a/modules/core/include/opencv2/core/operations.hpp
+++ b/modules/core/include/opencv2/core/operations.hpp
@@ -81,7 +81,7 @@
  #define CV_XADD(addr,delta) InterlockedExchangeAdd((long volatile*)(addr), (delta))
 #else

-  template<typename _Tp> static inline _Tp CV_XADD(_Tp* addr, _Tp delta)
+  // Fallback used when no platform intrinsic was selected above: a plain
+  // read-modify-write (no atomic primitive is used here). Returns the value
+  // stored at addr before delta was added.
+  static inline int CV_XADD(int* addr, int delta)
  { int tmp = *addr; *addr += delta; return tmp; }
    
 #endif
@@ -179,7 +179,14 @@ template<> inline int saturate_cast<int>(double v) { return cvRound(v); }
 // we intentionally do not clip negative numbers, to make -1 become 0xffffffff etc.
 template<> inline unsigned saturate_cast<unsigned>(float v){ return cvRound(v); }
 template<> inline unsigned saturate_cast<unsigned>(double v) { return cvRound(v); }
-
+    
+// fast_abs: absolute value with a cheap path per element type.
+// Unsigned 8/16-bit inputs are already non-negative and are returned widened
+// to int; signed 8/16-bit inputs are widened to int before std::abs; int,
+// float and double keep their own type.
+inline int fast_abs(uchar v)
+{
+    return v;
+}
+inline int fast_abs(schar v)
+{
+    const int widened = v;
+    return std::abs(widened);
+}
+inline int fast_abs(ushort v)
+{
+    return v;
+}
+inline int fast_abs(short v)
+{
+    const int widened = v;
+    return std::abs(widened);
+}
+inline int fast_abs(int v)
+{
+    return std::abs(v);
+}
+inline float fast_abs(float v)
+{
+    return std::abs(v);
+}
+inline double fast_abs(double v)
+{
+    return std::abs(v);
+}

 //////////////////////////////// Matx /////////////////////////////////

@@ -891,38 +898,152 @@ Matx<_Tp, n, l> Matx<_Tp, m, n>::solve(const Matx<_Tp, m, l>& rhs, int method) c
    return ok ? x : Matx<_Tp, n, l>::zeros();
 }

+    
+// Squared L2 norm: sum of a[k]*a[k] for k in [0, n), accumulated in _AccTp.
+// Each element is converted to _AccTp before it is squared.
+template<typename _Tp, typename _AccTp> static inline
+_AccTp normL2Sqr(const _Tp* a, int n)
+{
+    _AccTp acc = 0;
+    int k = 0;
+    // main part: four elements per iteration
+    while( k <= n - 4 )
+    {
+        const _AccTp e0 = a[k], e1 = a[k+1], e2 = a[k+2], e3 = a[k+3];
+        acc += e0*e0 + e1*e1 + e2*e2 + e3*e3;
+        k += 4;
+    }
+    // tail: the remaining 0..3 elements
+    while( k < n )
+    {
+        const _AccTp e = a[k];
+        acc += e*e;
+        ++k;
+    }
+    return acc;
+}
+

-template<typename _Tp, int m, int n> static inline
-double norm(const Matx<_Tp, m, n>& M)
+// L1 norm: sum of fast_abs(a[i]) for i in [0, n), accumulated in _AccTp.
+template<typename _Tp, typename _AccTp> static inline
+_AccTp normL1(const _Tp* a, int n)
 {
-    double s = 0;
-    for( int i = 0; i < m*n; i++ )
-        s += (double)M.val[i]*M.val[i];
-    return std::sqrt(s);
+    _AccTp s = 0;
+    int i;
+    // main loop: four elements per iteration, each converted to _AccTp before summing
+    for( i = 0; i <= n - 4; i += 4 )
+    {
+        s += (_AccTp)fast_abs(a[i]) + (_AccTp)fast_abs(a[i+1]) +
+            (_AccTp)fast_abs(a[i+2]) + (_AccTp)fast_abs(a[i+3]);
+    }
+    // tail: the remaining 0..3 elements
+    for( ; i < n; i++ )
+        s += fast_abs(a[i]);
+    return s;
 }

+
+// Infinity norm: the largest fast_abs(a[k]) for k in [0, n), as _AccTp.
+// Returns 0 when the loop does not execute (n <= 0).
+template<typename _Tp, typename _AccTp> static inline
+_AccTp normInf(const _Tp* a, int n)
+{
+    _AccTp best = 0;
+    int k = 0;
+    while( k < n )
+    {
+        const _AccTp cur = (_AccTp)fast_abs(a[k]);
+        best = std::max(best, cur);
+        ++k;
+    }
+    return best;
+}
    
-template<typename _Tp, int m, int n> static inline
-double norm(const Matx<_Tp, m, n>& M, int normType)
+    
+// Squared L2 distance: sum of (a[i] - b[i])^2 for i in [0, n), accumulated in _AccTp.
+// The difference is formed in the element type (after the usual arithmetic
+// promotions) and only then converted to _AccTp.
+template<typename _Tp, typename _AccTp> static inline
+_AccTp normL2Sqr(const _Tp* a, const _Tp* b, int n)
 {
-    if( normType == NORM_INF )
+    _AccTp s = 0;
+    int i;
+    // main loop: four element pairs per iteration
+    for( i = 0; i <= n - 4; i += 4 )
+    {
+        _AccTp v0 = a[i] - b[i], v1 = a[i+1] - b[i+1], v2 = a[i+2] - b[i+2], v3 = a[i+3] - b[i+3];
+        s += v0*v0 + v1*v1 + v2*v2 + v3*v3;
+    }
+    // tail: the remaining 0..3 element pairs
+    for( ; i < n; i++ )
    {
-        _Tp s = 0;
-        for( int i = 0; i < m*n; i++ )
-            s = std::max(s, std::abs(M.val[i]));
-        return s;
+        _AccTp v = a[i] - b[i];
+        s += v*v;
    }
+    return s;
+}
+
+// Exported, non-template distance routines between two arrays of n elements.
+// sum of squared differences of two float arrays
+CV_EXPORTS float normL2Sqr_(const float* a, const float* b, int n);
+// sum of absolute differences of two float arrays
+CV_EXPORTS float normL1_(const float* a, const float* b, int n);
+// sum of absolute differences of two byte arrays
+CV_EXPORTS int normL1_(const uchar* a, const uchar* b, int n);
+// number of differing bits between two byte arrays
+CV_EXPORTS int normHamming(const uchar* a, const uchar* b, int n);
+// generalized Hamming distance; cellSize is expected to be 1, 2 or 4
+CV_EXPORTS int normHamming(const uchar* a, const uchar* b, int n, int cellSize);
    
-    if( normType == NORM_L1 )
+// float specialization of the two-array normL2Sqr: vectors with n >= 8 are
+// handed to the exported normL2Sqr_() routine, shorter ones are summed inline.
+// An explicit specialization must not carry a storage-class specifier, so it
+// is declared 'inline' only (no 'static').
+template<> inline float normL2Sqr(const float* a, const float* b, int n)
+{
+    if( n >= 8 )
+        return normL2Sqr_(a, b, n);
+    float s = 0;
+    for( int i = 0; i < n; i++ )
    {
-        _Tp s = 0;
-        for( int i = 0; i < m*n; i++ )
-            s += std::abs(M.val[i]);
-        return s;
+        float v = a[i] - b[i];
+        s += v*v;
    }
+    return s;
+}
+
    
-    CV_DbgAssert( normType == NORM_L2 );
-    return norm(M);
+// L1 distance: sum of |a[k] - b[k]| for k in [0, n), accumulated in _AccTp.
+// Each difference is converted to _AccTp before std::abs is applied.
+template<typename _Tp, typename _AccTp> static inline
+_AccTp normL1(const _Tp* a, const _Tp* b, int n)
+{
+    _AccTp acc = 0;
+    int k = 0;
+    // main part: four element pairs per iteration
+    while( k <= n - 4 )
+    {
+        const _AccTp d0 = a[k] - b[k], d1 = a[k+1] - b[k+1], d2 = a[k+2] - b[k+2], d3 = a[k+3] - b[k+3];
+        acc += std::abs(d0) + std::abs(d1) + std::abs(d2) + std::abs(d3);
+        k += 4;
+    }
+    // tail: the remaining 0..3 element pairs
+    while( k < n )
+    {
+        const _AccTp d = a[k] - b[k];
+        acc += std::abs(d);
+        ++k;
+    }
+    return acc;
+}
+
+// float specialization of the two-array normL1: vectors with n >= 8 are
+// handed to the exported normL1_() routine, shorter ones are summed inline.
+// An explicit specialization must not carry a storage-class specifier, so it
+// is declared 'inline' only (no 'static').
+template<> inline float normL1(const float* a, const float* b, int n)
+{
+    if( n >= 8 )
+        return normL1_(a, b, n);
+    float s = 0;
+    for( int i = 0; i < n; i++ )
+    {
+        float v = a[i] - b[i];
+        s += std::abs(v);
+    }
+    return s;
+}
+
+// uchar specialization of the two-array normL1: always forwards to the
+// exported byte routine normL1_(). Declared 'inline' only, because an explicit
+// specialization must not carry a storage-class specifier such as 'static'.
+template<> inline int normL1(const uchar* a, const uchar* b, int n)
+{
+    return normL1_(a, b, n);
+}
+
+// Infinity-norm distance: the largest |a[k] - b[k]| for k in [0, n), as _AccTp.
+// Returns 0 when the loop does not execute (n <= 0).
+template<typename _Tp, typename _AccTp> static inline
+_AccTp normInf(const _Tp* a, const _Tp* b, int n)
+{
+    _AccTp best = 0;
+    int k = 0;
+    while( k < n )
+    {
+        const _AccTp diff = a[k] - b[k];
+        best = std::max(best, std::abs(diff));
+        ++k;
+    }
+    return best;
+}
+    
+
+// L2 norm of a small matrix: square root of the sum of squares of all m*n
+// elements, with the sum accumulated in double.
+template<typename _Tp, int m, int n> static inline
+double norm(const Matx<_Tp, m, n>& M)
+{
+    const double sumSq = normL2Sqr<_Tp, double>(M.val, m*n);
+    return std::sqrt(sumSq);
+}
+
+    
+// Norm of a small matrix over all m*n elements, selected by normType:
+// NORM_INF -> normInf, NORM_L1 -> normL1, any other value -> sqrt(normL2Sqr).
+// Accumulation uses DataType<_Tp>::work_type; the result is returned as double.
+template<typename _Tp, int m, int n> static inline
+double norm(const Matx<_Tp, m, n>& M, int normType)
+{
+    // 'typename' is mandatory here: DataType<_Tp>::work_type is a dependent type name
+    return normType == NORM_INF ? (double)normInf<_Tp, typename DataType<_Tp>::work_type>(M.val, m*n) :
+        normType == NORM_L1 ? (double)normL1<_Tp, typename DataType<_Tp>::work_type>(M.val, m*n) :
+        std::sqrt((double)normL2Sqr<_Tp, typename DataType<_Tp>::work_type>(M.val, m*n));
 }
    
    
@@ -1056,7 +1177,37 @@ template<typename _Tp, int cn> inline Vec<_Tp, cn> Vec<_Tp, cn>::mul(const Vec<_
    for( int i = 0; i < cn; i++ ) w.val[i] = saturate_cast<_Tp>(this->val[i]*v.val[i]);
    return w;
 }
+
+// Conjugate of a 2-element vector: keeps element 0, negates element 1.
+template<typename _Tp> Vec<_Tp, 2> conjugate(const Vec<_Tp, 2>& v)
+{
+    typedef Vec<_Tp, 2> result_t;
+    return result_t(v[0], -v[1]);
+}
+
+// Conjugate of a 4-element vector: keeps element 0, negates elements 1..3.
+template<typename _Tp> Vec<_Tp, 4> conjugate(const Vec<_Tp, 4>& v)
+{
+    typedef Vec<_Tp, 4> result_t;
+    return result_t(v[0], -v[1], -v[2], -v[3]);
+}
    
+// Vec::conj() is specialized for 2- and 4-element float/double vectors only;
+// each specialization forwards to the matching conjugate() overload.
+template<> inline Vec<float, 2> Vec<float, 2>::conj() const
+{
+    const Vec<float, 2>& self = *this;
+    return conjugate(self);
+}
+
+template<> inline Vec<double, 2> Vec<double, 2>::conj() const
+{
+    const Vec<double, 2>& self = *this;
+    return conjugate(self);
+}
+
+template<> inline Vec<float, 4> Vec<float, 4>::conj() const
+{
+    const Vec<float, 4>& self = *this;
+    return conjugate(self);
+}
+
+template<> inline Vec<double, 4> Vec<double, 4>::conj() const
+{
+    const Vec<double, 4>& self = *this;
+    return conjugate(self);
+}
+        
 template<typename _Tp, int cn> inline Vec<_Tp, cn> Vec<_Tp, cn>::cross(const Vec<_Tp, cn>& v) const
 {
    CV_Error(CV_StsError, "for arbitrary-size vector there is no cross-product defined");
@@ -1155,7 +1306,33 @@ Vec<_Tp, cn>& operator *= (Vec<_Tp, cn>& a, double alpha)
        a[i] = saturate_cast<_Tp>(a[i]*alpha);
    return a;
 }
+
+// In-place division of every element by an int scalar. The reciprocal is
+// computed once in double; each product goes through saturate_cast<_Tp>.
+template<typename _Tp, int cn> static inline
+Vec<_Tp, cn>& operator /= (Vec<_Tp, cn>& a, int alpha)
+{
+    const double inv = 1./alpha;
+    int k = 0;
+    while( k < cn )
+    {
+        a[k] = saturate_cast<_Tp>(a[k]*inv);
+        ++k;
+    }
+    return a;
+}
    
+// In-place division of every element by a float scalar. The reciprocal is
+// computed once in float; each product goes through saturate_cast<_Tp>.
+template<typename _Tp, int cn> static inline
+Vec<_Tp, cn>& operator /= (Vec<_Tp, cn>& a, float alpha)
+{
+    const float inv = 1.f/alpha;
+    int k = 0;
+    while( k < cn )
+    {
+        a[k] = saturate_cast<_Tp>(a[k]*inv);
+        ++k;
+    }
+    return a;
+}
+
+// In-place division of every element by a double scalar. The reciprocal is
+// computed once in double; each product goes through saturate_cast<_Tp>.
+template<typename _Tp, int cn> static inline
+Vec<_Tp, cn>& operator /= (Vec<_Tp, cn>& a, double alpha)
+{
+    const double inv = 1./alpha;
+    int k = 0;
+    while( k < cn )
+    {
+        a[k] = saturate_cast<_Tp>(a[k]*inv);
+        ++k;
+    }
+    return a;
+}
    
 template<typename _Tp, int cn> static inline Vec<_Tp, cn>
 operator * (const Vec<_Tp, cn>& a, int alpha)
@@ -1193,6 +1370,24 @@ operator * (double alpha, const Vec<_Tp, cn>& a)
    return Vec<_Tp, cn>(a, alpha, Matx_ScaleOp());
 }    

+// Vector divided by an int scalar, expressed as scaling by the reciprocal
+// (computed in double) through the Matx_ScaleOp constructor.
+template<typename _Tp, int cn> static inline Vec<_Tp, cn>
+operator / (const Vec<_Tp, cn>& a, int alpha)
+{
+    const double inv = 1./alpha;
+    return Vec<_Tp, cn>(a, inv, Matx_ScaleOp());
+}
+
+// Vector divided by a float scalar, expressed as scaling by the reciprocal
+// (computed in float) through the Matx_ScaleOp constructor.
+template<typename _Tp, int cn> static inline Vec<_Tp, cn>
+operator / (const Vec<_Tp, cn>& a, float alpha)
+{
+    const float inv = 1.f/alpha;
+    return Vec<_Tp, cn>(a, inv, Matx_ScaleOp());
+}
+
+// Vector divided by a double scalar, expressed as scaling by the reciprocal
+// (computed in double) through the Matx_ScaleOp constructor.
+template<typename _Tp, int cn> static inline Vec<_Tp, cn>
+operator / (const Vec<_Tp, cn>& a, double alpha)
+{
+    const double inv = 1./alpha;
+    return Vec<_Tp, cn>(a, inv, Matx_ScaleOp());
+}
+    
 template<typename _Tp, int cn> static inline Vec<_Tp, cn>
 operator - (const Vec<_Tp, cn>& a)
 {
@@ -1200,6 +1395,20 @@ operator - (const Vec<_Tp, cn>& a)
    for( int i = 0; i < cn; i++ ) t.val[i] = saturate_cast<_Tp>(-a.val[i]);
    return t;
 }
+
+// Product of two 4-element vectors treated as quaternions (element 0 is the
+// scalar part, elements 1..3 the vector part). Every component of the result
+// goes through saturate_cast<_Tp>.
+template<typename _Tp> inline Vec<_Tp, 4> operator * (const Vec<_Tp, 4>& v1, const Vec<_Tp, 4>& v2)
+{
+    typedef Vec<_Tp, 4> quat_t;
+    const _Tp w = saturate_cast<_Tp>(v1[0]*v2[0] - v1[1]*v2[1] - v1[2]*v2[2] - v1[3]*v2[3]);
+    const _Tp x = saturate_cast<_Tp>(v1[0]*v2[1] + v1[1]*v2[0] + v1[2]*v2[3] - v1[3]*v2[2]);
+    const _Tp y = saturate_cast<_Tp>(v1[0]*v2[2] - v1[1]*v2[3] + v1[2]*v2[0] + v1[3]*v2[1]);
+    const _Tp z = saturate_cast<_Tp>(v1[0]*v2[3] + v1[1]*v2[2] - v1[2]*v2[1] + v1[3]*v2[0]);
+    return quat_t(w, x, y, z);
+}
+    
+// In-place quaternion-style product: v1 becomes v1 * v2; returns v1.
+template<typename _Tp> inline Vec<_Tp, 4>& operator *= (Vec<_Tp, 4>& v1, const Vec<_Tp, 4>& v2)
+{
+    const Vec<_Tp, 4> product = v1 * v2;
+    v1 = product;
+    return v1;
+}
    
 template<> inline Vec<float, 3> Vec<float, 3>::cross(const Vec<float, 3>& v) const
 {
@@ -1215,35 +1424,12 @@ template<> inline Vec<double, 3> Vec<double, 3>::cross(const Vec<double, 3>& v)
                     val[0]*v.val[1] - val[1]*v.val[0]);
 }

-template<typename T1, typename T2> static inline
-Vec<T1, 2>& operator += (Vec<T1, 2>& a, const Vec<T2, 2>& b)
+// Returns v scaled by 1/norm(v); when the norm is zero the scale factor is 0,
+// which avoids a division by zero.
+template<typename _Tp, int cn> inline Vec<_Tp, cn> normalize(const Vec<_Tp, cn>& v)
 {
-    a[0] = saturate_cast<T1>(a[0] + b[0]);
-    a[1] = saturate_cast<T1>(a[1] + b[1]);
-    return a;
+    double nv = norm(v);
+    return v * (nv ? 1./nv : 0.);
 }
-
-template<typename T1, typename T2> static inline
-Vec<T1, 3>& operator += (Vec<T1, 3>& a, const Vec<T2, 3>& b)
-{
-    a[0] = saturate_cast<T1>(a[0] + b[0]);
-    a[1] = saturate_cast<T1>(a[1] + b[1]);
-    a[2] = saturate_cast<T1>(a[2] + b[2]);
-    return a;
-}
-
    
-template<typename T1, typename T2> static inline
-Vec<T1, 4>& operator += (Vec<T1, 4>& a, const Vec<T2, 4>& b)
-{
-    a[0] = saturate_cast<T1>(a[0] + b[0]);
-    a[1] = saturate_cast<T1>(a[1] + b[1]);
-    a[2] = saturate_cast<T1>(a[2] + b[2]);
-    a[3] = saturate_cast<T1>(a[3] + b[3]);
-    return a;
-}
-
-        
 template<typename _Tp, typename _T2, int cn> static inline
 VecCommaInitializer<_Tp, cn> operator << (const Vec<_Tp, cn>& vec, _T2 val)
 {
@@ -1898,8 +2084,8 @@ operator * (const Scalar_<_Tp>& a, const Scalar_<_Tp>& b)
 {
    return Scalar_<_Tp>(saturate_cast<_Tp>(a[0]*b[0] - a[1]*b[1] - a[2]*b[2] - a[3]*b[3]),
                        saturate_cast<_Tp>(a[0]*b[1] + a[1]*b[0] + a[2]*b[3] - a[3]*b[2]),
-                        saturate_cast<_Tp>(a[0]*b[2] - a[1]*b[3] + a[2]*b[0] - a[3]*b[1]),
-                        saturate_cast<_Tp>(a[0]*b[3] + a[1]*b[2] - a[2]*b[1] - a[3]*b[0]));
+                        saturate_cast<_Tp>(a[0]*b[2] - a[1]*b[3] + a[2]*b[0] + a[3]*b[1]),
+                        saturate_cast<_Tp>(a[0]*b[3] + a[1]*b[2] - a[2]*b[1] + a[3]*b[0]));
 }
    
 template<typename _Tp> static inline Scalar_<_Tp>&

--- a/modules/core/src/cmdparser.cpp
+++ b/modules/core/src/cmdparser.cpp
@@ -282,7 +282,7 @@ template<typename _Tp>

 		cout << setw(col_p-2) << left << buf;

-		if (buf.length() > col_p-2) 
+		if ((int)buf.length() > col_p-2) 
 		{
 			cout << endl << "  ";
 			cout << setw(col_p-2) << left << " ";
@@ -293,7 +293,7 @@ template<typename _Tp>

 		while (true)
 		{
-			bool tr = (buf.length() > col_d-2) ? true: false;
+			bool tr = ((int)buf.length() > col_d-2) ? true: false;
 			int pos;

 			if (tr)
@@ -301,7 +301,8 @@ template<typename _Tp>
 				pos = buf.find_first_of(' ');
 				while (true)
 				{
-					if (buf.find_first_of(' ', pos + 1 ) < col_d-2 && buf.find_first_of(' ', pos + 1 ) != std::string::npos)
+					if ((int)buf.find_first_of(' ', pos + 1 ) < col_d-2 &&
+                        (int)buf.find_first_of(' ', pos + 1 ) != (int)std::string::npos)
 						pos = buf.find_first_of(' ', pos + 1);
 					else
 						break;

--- a/modules/core/src/matrix.cpp
+++ b/modules/core/src/matrix.cpp
@@ -2161,43 +2161,6 @@ static void generateRandomCenter(const vector<Vec2f>& box, float* center, RNG& r
 }


-static inline float distance(const float* a, const float* b, int n)
-{
-    int j = 0; float d = 0.f;
-#if CV_SSE
-    if( USE_SSE2 )
-    {
-        float CV_DECL_ALIGNED(16) buf[4];
-        __m128 d0 = _mm_setzero_ps(), d1 = _mm_setzero_ps();
-
-        for( ; j <= n - 8; j += 8 )
-        {
-            __m128 t0 = _mm_sub_ps(_mm_loadu_ps(a + j), _mm_loadu_ps(b + j));
-            __m128 t1 = _mm_sub_ps(_mm_loadu_ps(a + j + 4), _mm_loadu_ps(b + j + 4));
-            d0 = _mm_add_ps(d0, _mm_mul_ps(t0, t0));
-            d1 = _mm_add_ps(d1, _mm_mul_ps(t1, t1));
-        }
-        _mm_store_ps(buf, _mm_add_ps(d0, d1));
-        d = buf[0] + buf[1] + buf[2] + buf[3];
-    }
-    else
-#endif
-    {
-        for( ; j <= n - 4; j += 4 )
-        {
-            float t0 = a[j] - b[j], t1 = a[j+1] - b[j+1], t2 = a[j+2] - b[j+2], t3 = a[j+3] - b[j+3];
-            d += t0*t0 + t1*t1 + t2*t2 + t3*t3;
-        }
-    }
-
-    for( ; j < n; j++ )
-    {
-        float t = a[j] - b[j];
-        d += t*t;
-    }
-    return d;
-}
-
 /*
 k-means center initialization using the following algorithm:
 Arthur & Vassilvitskii (2007) k-means++: The Advantages of Careful Seeding
@@ -2218,7 +2181,7 @@ static void generateCentersPP(const Mat& _data, Mat& _out_centers,

    for( i = 0; i < N; i++ )
    {
-        dist[i] = distance(data + step*i, data + step*centers[0], dims);
+        dist[i] = normL2Sqr_(data + step*i, data + step*centers[0], dims);
        sum0 += dist[i];
    }
    
@@ -2236,7 +2199,7 @@ static void generateCentersPP(const Mat& _data, Mat& _out_centers,
            int ci = i;
            for( i = 0; i < N; i++ )
            {
-                tdist2[i] = std::min(distance(data + step*i, data + step*ci, dims), dist[i]);
+                tdist2[i] = std::min(normL2Sqr_(data + step*i, data + step*ci, dims), dist[i]);
                s += tdist2[i];
            }
            
@@ -2434,7 +2397,7 @@ double cv::kmeans( InputArray _data, int K,
                for( k = 0; k < K; k++ )
                {
                    const float* center = centers.ptr<float>(k);
-                    double dist = distance(sample, center, dims);
+                    double dist = normL2Sqr_(sample, center, dims);

                    if( min_dist > dist )
                    {

--- a/modules/core/src/stat.cpp
+++ b/modules/core/src/stat.cpp
@@ -810,15 +810,218 @@ void cv::minMaxLoc( InputArray _img, double* minVal, double* maxVal,
 namespace cv
 {

+// Sum of squared differences between a[0..n-1] and b[0..n-1], accumulated in float.
+// When compiled with CV_SSE and USE_SSE2 is set, the bulk is processed eight
+// floats at a time with unaligned loads; otherwise four at a time in scalar
+// code. Leftover elements are handled one by one at the end.
+float normL2Sqr_(const float* a, const float* b, int n)
+{
+    int j = 0; float d = 0.f;
+#if CV_SSE
+    if( USE_SSE2 )
+    {
+        float CV_DECL_ALIGNED(16) buf[4];
+        // two independent accumulators, four lanes each
+        __m128 d0 = _mm_setzero_ps(), d1 = _mm_setzero_ps();
+        
+        for( ; j <= n - 8; j += 8 )
+        {
+            __m128 t0 = _mm_sub_ps(_mm_loadu_ps(a + j), _mm_loadu_ps(b + j));
+            __m128 t1 = _mm_sub_ps(_mm_loadu_ps(a + j + 4), _mm_loadu_ps(b + j + 4));
+            d0 = _mm_add_ps(d0, _mm_mul_ps(t0, t0));
+            d1 = _mm_add_ps(d1, _mm_mul_ps(t1, t1));
+        }
+        // horizontal sum of the lanes through an aligned scratch buffer
+        _mm_store_ps(buf, _mm_add_ps(d0, d1));
+        d = buf[0] + buf[1] + buf[2] + buf[3];
+    }
+    else
+#endif
+    {
+        for( ; j <= n - 4; j += 4 )
+        {
+            float t0 = a[j] - b[j], t1 = a[j+1] - b[j+1], t2 = a[j+2] - b[j+2], t3 = a[j+3] - b[j+3];
+            d += t0*t0 + t1*t1 + t2*t2 + t3*t3;
+        }
+    }
+    
+    // tail: whatever the vectorized/unrolled loop left over
+    for( ; j < n; j++ )
+    {
+        float t = a[j] - b[j];
+        d += t*t;
+    }
+    return d;
+}
+
+    
+// Sum of absolute differences between a[0..n-1] and b[0..n-1], accumulated in float.
+// When compiled with CV_SSE and USE_SSE2 is set, the bulk is processed eight
+// floats at a time; otherwise four at a time in scalar code. Leftover
+// elements are handled one by one at the end.
+float normL1_(const float* a, const float* b, int n)
+{
+    int j = 0; float d = 0.f;
+#if CV_SSE
+    if( USE_SSE2 )
+    {
+        float CV_DECL_ALIGNED(16) buf[4];
+        // The mask must hold the BIT PATTERN 0x7fffffff (all bits except the sign).
+        // It is therefore stored as int and reinterpreted on load; initializing a
+        // float array with 0x7fffffff would convert the number to 2147483648.0f.
+        static const int CV_DECL_ALIGNED(16) absbuf[4] = {0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff};
+        __m128 d0 = _mm_setzero_ps(), d1 = _mm_setzero_ps();
+        __m128 absmask = _mm_load_ps((const float*)absbuf);
+
+        for( ; j <= n - 8; j += 8 )
+        {
+            __m128 t0 = _mm_sub_ps(_mm_loadu_ps(a + j), _mm_loadu_ps(b + j));
+            __m128 t1 = _mm_sub_ps(_mm_loadu_ps(a + j + 4), _mm_loadu_ps(b + j + 4));
+            // clearing the sign bit yields |t|
+            d0 = _mm_add_ps(d0, _mm_and_ps(t0, absmask));
+            d1 = _mm_add_ps(d1, _mm_and_ps(t1, absmask));
+        }
+        // horizontal sum of the lanes through an aligned scratch buffer
+        _mm_store_ps(buf, _mm_add_ps(d0, d1));
+        d = buf[0] + buf[1] + buf[2] + buf[3];
+    }
+    else
+#endif
+    {
+        for( ; j <= n - 4; j += 4 )
+        {
+            d += std::abs(a[j] - b[j]) + std::abs(a[j+1] - b[j+1]) +
+                std::abs(a[j+2] - b[j+2]) + std::abs(a[j+3] - b[j+3]);
+        }
+    }
+
+    // tail: whatever the vectorized/unrolled loop left over
+    for( ; j < n; j++ )
+        d += std::abs(a[j] - b[j]);
+    return d;
+}
+
+// Sum of absolute differences between the byte arrays a[0..n-1] and b[0..n-1].
+// When compiled with CV_SSE and USE_SSE2 is set, 16 bytes and then 4 bytes at
+// a time are reduced with _mm_sad_epu8; otherwise four bytes at a time in
+// scalar code. Leftover bytes are handled one by one at the end.
+int normL1_(const uchar* a, const uchar* b, int n)
+{
+    int j = 0, d = 0;
+#if CV_SSE
+    if( USE_SSE2 )
+    {
+        __m128i d0 = _mm_setzero_si128();
+        
+        for( ; j <= n - 16; j += 16 )
+        {
+            __m128i t0 = _mm_loadu_si128((const __m128i*)(a + j));
+            __m128i t1 = _mm_loadu_si128((const __m128i*)(b + j));
+            
+            d0 = _mm_add_epi32(d0, _mm_sad_epu8(t0, t1));
+        }
+
+        // 4-byte groups: the bytes are read through an int pointer into the low
+        // lane; the remaining lanes are zero in both operands
+        for( ; j <= n - 4; j += 4 )
+        {
+            __m128i t0 = _mm_cvtsi32_si128(*(const int*)(a + j));
+            __m128i t1 = _mm_cvtsi32_si128(*(const int*)(b + j));
+            
+            d0 = _mm_add_epi32(d0, _mm_sad_epu8(t0, t1));
+        }
+        // combine the partial sums held in the low and high 64-bit halves
+        d = _mm_cvtsi128_si32(_mm_add_epi32(d0, _mm_unpackhi_epi64(d0, d0)));
+    }
+    else
+#endif
+    {
+        for( ; j <= n - 4; j += 4 )
+        {
+            d += std::abs(a[j] - b[j]) + std::abs(a[j+1] - b[j+1]) +
+                std::abs(a[j+2] - b[j+2]) + std::abs(a[j+3] - b[j+3]);
+        }
+    }
+    
+    // tail: whatever the loops above left over
+    for( ; j < n; j++ )
+        d += std::abs(a[j] - b[j]);
+    return d;
+}
+
+// popCountTable[v] = number of set bits in the byte v (256 entries).
+static const uchar popCountTable[] = 
+{
+    0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
+    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
+    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
+    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
+    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
+    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
+    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
+    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
+};
+    
+// popCountTable2[v] = number of non-zero 2-bit cells in the byte v
+// (the byte is viewed as four cells of two bits each; 256 entries).
+static const uchar popCountTable2[] =
+{
+    0, 1, 1, 1, 1, 2, 2, 2, 1, 2, 2, 2, 1, 2, 2, 2, 1, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3,
+    1, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 1, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3,
+    1, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4,
+    2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4,
+    1, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4,
+    2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4,
+    1, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4,
+    2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 2, 3, 3, 3, 3, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4
+};
+    
+// popCountTable4[v] = number of non-zero 4-bit cells (nibbles) in the byte v
+// (0, 1 or 2; 256 entries).
+static const uchar popCountTable4[] =
+{
+    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
+};
+    
+// Hamming distance: number of bits that differ between a[0..n-1] and b[0..n-1].
+// With GCC and CV_NEON, when CPU_HAS_NEON_FEATURE is set, 16 bytes at a time
+// are XOR-ed and bit-counted with NEON intrinsics; otherwise four bytes at a
+// time are looked up in popCountTable. Leftover bytes always go through
+// popCountTable one by one.
+int normHamming(const uchar* a, const uchar* b, int n)
+{
+    int i = 0, result = 0;
+#if defined __GNUC__ && CV_NEON
+    if (CPU_HAS_NEON_FEATURE)
+    {
+        result = 0;  
+        for( ; i <= n - 16; i += 16 )
+        {
+            uint8x16_t A_vec = vld1q_u8 (a + i);
+            uint8x16_t B_vec = vld1q_u8 (b + i);
+            //uint8x16_t veorq_u8 (uint8x16_t, uint8x16_t)
+            uint8x16_t AxorB = veorq_u8 (A_vec, B_vec);
+            
+            // per-byte bit counts, then pairwise widening adds: 8 -> 16 -> 32 -> 64 bit lanes
+            uint8x16_t bitsSet = vcntq_u8 (AxorB);
+            //uint16x8_t vpadalq_u8 (uint16x8_t, uint8x16_t)
+            uint16x8_t bitSet8 = vpaddlq_u8 (bitsSet);
+            uint32x4_t bitSet4 = vpaddlq_u16 (bitSet8);
+            
+            uint64x2_t bitSet2 = vpaddlq_u32 (bitSet4);
+            result += vgetq_lane_u64 (bitSet2,0);
+            result += vgetq_lane_u64 (bitSet2,1);
+        }
+    }
+    else
+#endif
+        // table-driven path (also the only path when NEON support is not compiled in)
+        for( ; i <= n - 4; i += 4 )
+            result += popCountTable[a[i] ^ b[i]] + popCountTable[a[i+1] ^ b[i+1]] +
+                popCountTable[a[i+2] ^ b[i+2]] + popCountTable[a[i+3] ^ b[i+3]];
+    // tail: bytes not consumed by either loop above
+    for( ; i < n; i++ )
+        result += popCountTable[a[i] ^ b[i]];
+    return result;
+}
+    
+// Generalized Hamming distance between a[0..n-1] and b[0..n-1].
+// cellSize == 1 forwards to the plain bit-wise normHamming(); cellSize 2 or 4
+// counts, per XOR-ed byte, the non-zero cells of that many bits via
+// popCountTable2 / popCountTable4. Any other cellSize raises CV_StsBadSize.
+int normHamming(const uchar* a, const uchar* b, int n, int cellSize)
+{
+    const uchar* lut = 0;
+    switch( cellSize )
+    {
+    case 1:
+        return normHamming(a, b, n);
+    case 2:
+        lut = popCountTable2;
+        break;
+    case 4:
+        lut = popCountTable4;
+        break;
+    default:
+        CV_Error( CV_StsBadSize, "bad cell size (not 1, 2 or 4) in normHamming" );
+    }
+    int k = 0, dist = 0;
+    // main part: four bytes per iteration
+    while( k <= n - 4 )
+    {
+        dist += lut[a[k] ^ b[k]] + lut[a[k+1] ^ b[k+1]] +
+            lut[a[k+2] ^ b[k+2]] + lut[a[k+3] ^ b[k+3]];
+        k += 4;
+    }
+    // tail: the remaining 0..3 bytes
+    for( ; k < n; ++k )
+        dist += lut[a[k] ^ b[k]];
+    return dist;
+}
+    
+    
 template<typename T, typename ST> int
 normInf_(const T* src, const uchar* mask, ST* _result, int len, int cn)
 {
    ST result = *_result;
    if( !mask )
    {
-        len *= cn;
-        for( int i = 0; i < len; i++ )
-            result = std::max(result, ST(std::abs(src[i])));
+        result = std::max(result, normInf<T, ST>(src, len*cn));
    }
    else
    {
@@ -826,7 +1029,7 @@ normInf_(const T* src, const uchar* mask, ST* _result, int len, int cn)
            if( mask[i] )
            {
                for( int k = 0; k < cn; k++ )
-                    result = std::max(result, ST(std::abs(src[k])));
+                    result = std::max(result, ST(fast_abs(src[k])));
            }
    }
    *_result = result;
@@ -839,9 +1042,7 @@ normL1_(const T* src, const uchar* mask, ST* _result, int len, int cn)
    ST result = *_result;
    if( !mask )
    {
-        len *= cn;
-        for( int i = 0; i < len; i++ )
-            result += std::abs(src[i]);
+        result += normL1<T, ST>(src, len*cn);
    }
    else
    {
@@ -849,7 +1050,7 @@ normL1_(const T* src, const uchar* mask, ST* _result, int len, int cn)
            if( mask[i] )
            {
                for( int k = 0; k < cn; k++ )
-                    result += std::abs(src[k]);
+                    result += fast_abs(src[k]);
            }
    }
    *_result = result;
@@ -862,12 +1063,7 @@ normL2_(const T* src, const uchar* mask, ST* _result, int len, int cn)
    ST result = *_result;
    if( !mask )
    {
-        len *= cn;
-        for( int i = 0; i < len; i++ )
-        {
-            T v = src[i];
-            result += (ST)v*v;
-        }
+        result += normL2Sqr<T, ST>(src, len*cn);
    }
    else
    {
@@ -891,9 +1087,7 @@ normDiffInf_(const T* src1, const T* src2, const uchar* mask, ST* _result, int l
    ST result = *_result;
    if( !mask )
    {
-        len *= cn;
-        for( int i = 0; i < len; i++ )
-            result = std::max(result, (ST)std::abs(src1[i] - src2[i]));
+        result = std::max(result, normInf<T, ST>(src1, src2, len*cn));
    }
    else
    {
@@ -914,9 +1108,7 @@ normDiffL1_(const T* src1, const T* src2, const uchar* mask, ST* _result, int le
    ST result = *_result;
    if( !mask )
    {
-        len *= cn;
-        for( int i = 0; i < len; i++ )
-            result += std::abs(src1[i] - src2[i]);
+        result += normL1<T, ST>(src1, src2, len*cn);
    }
    else
    {
@@ -937,12 +1129,7 @@ normDiffL2_(const T* src1, const T* src2, const uchar* mask, ST* _result, int le
    ST result = *_result;
    if( !mask )
    {
-        len *= cn;
-        for( int i = 0; i < len; i++ )
-        {
-            ST v = src1[i] - src2[i];
-            result += v*v;
-        }
+        result += normL2Sqr<T, ST>(src1, src2, len*cn);
    }
    else
    {

--- a/modules/features2d/include/opencv2/features2d/features2d.hpp
+++ b/modules/features2d/include/opencv2/features2d/features2d.hpp
@@ -2104,13 +2104,7 @@ struct CV_EXPORTS SL2

    ResultType operator()( const T* a, const T* b, int size ) const
    {
-        ResultType result = ResultType();
-        for( int i = 0; i < size; i++ )
-        {
-            ResultType diff = (ResultType)(a[i] - b[i]);
-            result += diff*diff;
-        }
-        return result;
+        return normL2Sqr<ValueType, ResultType>(a, b, size);
    }
 };

@@ -2125,13 +2119,7 @@ struct CV_EXPORTS L2

    ResultType operator()( const T* a, const T* b, int size ) const
    {
-        ResultType result = ResultType();
-        for( int i = 0; i < size; i++ )
-        {
-            ResultType diff = (ResultType)(a[i] - b[i]);
-            result += diff*diff;
-        }
-        return (ResultType)sqrt((double)result);
+        return (ResultType)sqrt((double)normL2Sqr<ValueType, ResultType>(a, b, size));
    }
 };

@@ -2146,13 +2134,7 @@ struct CV_EXPORTS L1

    ResultType operator()( const T* a, const T* b, int size ) const
    {
-        ResultType result = ResultType();
-        for( int i = 0; i < size; i++ )
-        {
-            ResultType diff = a[i] - b[i];
-            result += (ResultType)fabs( diff );
-        }
-        return result;
+        return normL1<ValueType, ResultType>(a, b, size);
    }
 };

@@ -2160,40 +2142,20 @@ struct CV_EXPORTS L1
 * Hamming distance functor - counts the bit differences between two strings - useful for the Brief descriptor
 * bit count of A exclusive XOR'ed with B
 */
-struct CV_EXPORTS HammingLUT
+struct CV_EXPORTS Hamming
 {
    typedef unsigned char ValueType;
    typedef int ResultType;

    /** this will count the bits in a ^ b
     */
-    ResultType operator()( const unsigned char* a, const unsigned char* b, int size ) const;
-
-    /** \brief given a byte, count the bits using a compile time generated look up table
-     *  \param b the byte to count bits.  The look up table has an entry for all
-     *  values of b, where that entry is the number of bits.
-     *  \return the number of bits in byte b
-     */
-    static unsigned char byteBitsLookUp(unsigned char b);
-};
-
-
-/// Hamming distance functor, this one will try to use gcc's __builtin_popcountl
-/// but will fall back on HammingLUT if not available
-/// bit count of A exclusive XOR'ed with B
-struct CV_EXPORTS Hamming
-{
-    typedef unsigned char ValueType;
-
-    //! important that this is signed as weird behavior happens
-    // in BruteForce if not
-    typedef int ResultType;
-
-    /** this will count the bits in a ^ b, using __builtin_popcountl try compiling with sse4
-    */
-    ResultType operator()(const unsigned char* a, const unsigned char* b, int size) const;
+    ResultType operator()( const unsigned char* a, const unsigned char* b, int size ) const
+    {
+        return normHamming(a, b, size);
+    }
 };

+typedef Hamming HammingLUT;

 /****************************************************************************************\
 *                                      DMatch                                            *

--- a/modules/features2d/src/brief.cpp
+++ b/modules/features2d/src/brief.cpp
@@ -96,46 +96,6 @@ void pixelTests64(const Mat& sum, const std::vector<KeyPoint>& keypoints, Mat& d
 namespace cv
 {

-HammingLUT::ResultType HammingLUT::operator()( const unsigned char* a, const unsigned char* b, int size ) const
-{
-    ResultType result = 0;
-    for (int i = 0; i < size; i++)
-    {
-        result += byteBitsLookUp(a[i] ^ b[i]);
-    }
-    return result;
-}
-
-Hamming::ResultType Hamming::operator()(const unsigned char* a, const unsigned char* b, int size) const
-{
-  ResultType result;
-#if defined __GNUC__ && CV_NEON
-  if (CPU_HAS_NEON_FEATURE)
-  {
-    result = 0;  
-    for (size_t i = 0; i < size; i += 16)
-    {
-      uint8x16_t A_vec = vld1q_u8 (a + i);
-      uint8x16_t B_vec = vld1q_u8 (b + i);
-      //uint8x16_t veorq_u8 (uint8x16_t, uint8x16_t)
-      uint8x16_t AxorB = veorq_u8 (A_vec, B_vec);
-
-      uint8x16_t bitsSet = vcntq_u8 (AxorB);
-      //uint16x8_t vpadalq_u8 (uint16x8_t, uint8x16_t)
-      uint16x8_t bitSet8 = vpaddlq_u8 (bitsSet);
-      uint32x4_t bitSet4 = vpaddlq_u16 (bitSet8);
-
-      uint64x2_t bitSet2 = vpaddlq_u32 (bitSet4);
-      result += vgetq_lane_u64 (bitSet2,0);
-      result += vgetq_lane_u64 (bitSet2,1);
-    }
-  }
-  else
-#endif
-      result = HammingLUT()(a,b,size);
-  return result;
-}
-
 BriefDescriptorExtractor::BriefDescriptorExtractor(int bytes) :
    bytes_(bytes), test_fn_(NULL)
 {
@@ -212,292 +172,4 @@ void BriefDescriptorExtractor::computeImpl(const Mat& image, std::vector<KeyPoin
    test_fn_(sum, keypoints, descriptors);
 }

-/**
- *  \brief template meta programming struct that gives number of bits in a byte
- *  @TODO Maybe unintuitive and should just use python to generate the entries in the LUT
- */
-template<unsigned char b>
-struct ByteBits
-{
-    /**
-     * number of bits in the byte given by the template constant
-     */
-    enum
-    {
-        COUNT = ((b >> 0) & 1) +
-                ((b >> 1) & 1) +
-                ((b >> 2) & 1) +
-                ((b >> 3) & 1) +
-                ((b >> 4) & 1) +
-                ((b >> 5) & 1) +
-                ((b >> 6) & 1) +
-                ((b >> 7) & 1)
-    };
-};
-
-unsigned char HammingLUT::byteBitsLookUp(unsigned char b)
-{
-    static const unsigned char table[256] =
-    {
-        ByteBits<0>::COUNT,
-        ByteBits<1>::COUNT,
-        ByteBits<2>::COUNT,
-        ByteBits<3>::COUNT,
-        ByteBits<4>::COUNT,
-        ByteBits<5>::COUNT,
-        ByteBits<6>::COUNT,
-        ByteBits<7>::COUNT,
-        ByteBits<8>::COUNT,
-        ByteBits<9>::COUNT,
-        ByteBits<10>::COUNT,
-        ByteBits<11>::COUNT,
-        ByteBits<12>::COUNT,
-        ByteBits<13>::COUNT,
-        ByteBits<14>::COUNT,
-        ByteBits<15>::COUNT,
-        ByteBits<16>::COUNT,
-        ByteBits<17>::COUNT,
-        ByteBits<18>::COUNT,
-        ByteBits<19>::COUNT,
-        ByteBits<20>::COUNT,
-        ByteBits<21>::COUNT,
-        ByteBits<22>::COUNT,
-        ByteBits<23>::COUNT,
-        ByteBits<24>::COUNT,
-        ByteBits<25>::COUNT,
-        ByteBits<26>::COUNT,
-        ByteBits<27>::COUNT,
-        ByteBits<28>::COUNT,
-        ByteBits<29>::COUNT,
-        ByteBits<30>::COUNT,
-        ByteBits<31>::COUNT,
-        ByteBits<32>::COUNT,
-        ByteBits<33>::COUNT,
-        ByteBits<34>::COUNT,
-        ByteBits<35>::COUNT,
-        ByteBits<36>::COUNT,
-        ByteBits<37>::COUNT,
-        ByteBits<38>::COUNT,
-        ByteBits<39>::COUNT,
-        ByteBits<40>::COUNT,
-        ByteBits<41>::COUNT,
-        ByteBits<42>::COUNT,
-        ByteBits<43>::COUNT,
-        ByteBits<44>::COUNT,
-        ByteBits<45>::COUNT,
-        ByteBits<46>::COUNT,
-        ByteBits<47>::COUNT,
-        ByteBits<48>::COUNT,
-        ByteBits<49>::COUNT,
-        ByteBits<50>::COUNT,
-        ByteBits<51>::COUNT,
-        ByteBits<52>::COUNT,
-        ByteBits<53>::COUNT,
-        ByteBits<54>::COUNT,
-        ByteBits<55>::COUNT,
-        ByteBits<56>::COUNT,
-        ByteBits<57>::COUNT,
-        ByteBits<58>::COUNT,
-        ByteBits<59>::COUNT,
-        ByteBits<60>::COUNT,
-        ByteBits<61>::COUNT,
-        ByteBits<62>::COUNT,
-        ByteBits<63>::COUNT,
-        ByteBits<64>::COUNT,
-        ByteBits<65>::COUNT,
-        ByteBits<66>::COUNT,
-        ByteBits<67>::COUNT,
-        ByteBits<68>::COUNT,
-        ByteBits<69>::COUNT,
-        ByteBits<70>::COUNT,
-        ByteBits<71>::COUNT,
-        ByteBits<72>::COUNT,
-        ByteBits<73>::COUNT,
-        ByteBits<74>::COUNT,
-        ByteBits<75>::COUNT,
-        ByteBits<76>::COUNT,
-        ByteBits<77>::COUNT,
-        ByteBits<78>::COUNT,
-        ByteBits<79>::COUNT,
-        ByteBits<80>::COUNT,
-        ByteBits<81>::COUNT,
-        ByteBits<82>::COUNT,
-        ByteBits<83>::COUNT,
-        ByteBits<84>::COUNT,
-        ByteBits<85>::COUNT,
-        ByteBits<86>::COUNT,
-        ByteBits<87>::COUNT,
-        ByteBits<88>::COUNT,
-        ByteBits<89>::COUNT,
-        ByteBits<90>::COUNT,
-        ByteBits<91>::COUNT,
-        ByteBits<92>::COUNT,
-        ByteBits<93>::COUNT,
-        ByteBits<94>::COUNT,
-        ByteBits<95>::COUNT,
-        ByteBits<96>::COUNT,
-        ByteBits<97>::COUNT,
-        ByteBits<98>::COUNT,
-        ByteBits<99>::COUNT,
-        ByteBits<100>::COUNT,
-        ByteBits<101>::COUNT,
-        ByteBits<102>::COUNT,
-        ByteBits<103>::COUNT,
-        ByteBits<104>::COUNT,
-        ByteBits<105>::COUNT,
-        ByteBits<106>::COUNT,
-        ByteBits<107>::COUNT,
-        ByteBits<108>::COUNT,
-        ByteBits<109>::COUNT,
-        ByteBits<110>::COUNT,
-        ByteBits<111>::COUNT,
-        ByteBits<112>::COUNT,
-        ByteBits<113>::COUNT,
-        ByteBits<114>::COUNT,
-        ByteBits<115>::COUNT,
-        ByteBits<116>::COUNT,
-        ByteBits<117>::COUNT,
-        ByteBits<118>::COUNT,
-        ByteBits<119>::COUNT,
-        ByteBits<120>::COUNT,
-        ByteBits<121>::COUNT,
-        ByteBits<122>::COUNT,
-        ByteBits<123>::COUNT,
-        ByteBits<124>::COUNT,
-        ByteBits<125>::COUNT,
-        ByteBits<126>::COUNT,
-        ByteBits<127>::COUNT,
-        ByteBits<128>::COUNT,
-        ByteBits<129>::COUNT,
-        ByteBits<130>::COUNT,
-        ByteBits<131>::COUNT,
-        ByteBits<132>::COUNT,
-        ByteBits<133>::COUNT,
-        ByteBits<134>::COUNT,
-        ByteBits<135>::COUNT,
-        ByteBits<136>::COUNT,
-        ByteBits<137>::COUNT,
-        ByteBits<138>::COUNT,
-        ByteBits<139>::COUNT,
-        ByteBits<140>::COUNT,
-        ByteBits<141>::COUNT,
-        ByteBits<142>::COUNT,
-        ByteBits<143>::COUNT,
-        ByteBits<144>::COUNT,
-        ByteBits<145>::COUNT,
-        ByteBits<146>::COUNT,
-        ByteBits<147>::COUNT,
-        ByteBits<148>::COUNT,
-        ByteBits<149>::COUNT,
-        ByteBits<150>::COUNT,
-        ByteBits<151>::COUNT,
-        ByteBits<152>::COUNT,
-        ByteBits<153>::COUNT,
-        ByteBits<154>::COUNT,
-        ByteBits<155>::COUNT,
-        ByteBits<156>::COUNT,
-        ByteBits<157>::COUNT,
-        ByteBits<158>::COUNT,
-        ByteBits<159>::COUNT,
-        ByteBits<160>::COUNT,
-        ByteBits<161>::COUNT,
-        ByteBits<162>::COUNT,
-        ByteBits<163>::COUNT,
-        ByteBits<164>::COUNT,
-        ByteBits<165>::COUNT,
-        ByteBits<166>::COUNT,
-        ByteBits<167>::COUNT,
-        ByteBits<168>::COUNT,
-        ByteBits<169>::COUNT,
-        ByteBits<170>::COUNT,
-        ByteBits<171>::COUNT,
-        ByteBits<172>::COUNT,
-        ByteBits<173>::COUNT,
-        ByteBits<174>::COUNT,
-        ByteBits<175>::COUNT,
-        ByteBits<176>::COUNT,
-        ByteBits<177>::COUNT,
-        ByteBits<178>::COUNT,
-        ByteBits<179>::COUNT,
-        ByteBits<180>::COUNT,
-        ByteBits<181>::COUNT,
-        ByteBits<182>::COUNT,
-        ByteBits<183>::COUNT,
-        ByteBits<184>::COUNT,
-        ByteBits<185>::COUNT,
-        ByteBits<186>::COUNT,
-        ByteBits<187>::COUNT,
-        ByteBits<188>::COUNT,
-        ByteBits<189>::COUNT,
-        ByteBits<190>::COUNT,
-        ByteBits<191>::COUNT,
-        ByteBits<192>::COUNT,
-        ByteBits<193>::COUNT,
-        ByteBits<194>::COUNT,
-        ByteBits<195>::COUNT,
-        ByteBits<196>::COUNT,
-        ByteBits<197>::COUNT,
-        ByteBits<198>::COUNT,
-        ByteBits<199>::COUNT,
-        ByteBits<200>::COUNT,
-        ByteBits<201>::COUNT,
-        ByteBits<202>::COUNT,
-        ByteBits<203>::COUNT,
-        ByteBits<204>::COUNT,
-        ByteBits<205>::COUNT,
-        ByteBits<206>::COUNT,
-        ByteBits<207>::COUNT,
-        ByteBits<208>::COUNT,
-        ByteBits<209>::COUNT,
-        ByteBits<210>::COUNT,
-        ByteBits<211>::COUNT,
-        ByteBits<212>::COUNT,
-        ByteBits<213>::COUNT,
-        ByteBits<214>::COUNT,
-        ByteBits<215>::COUNT,
-        ByteBits<216>::COUNT,
-        ByteBits<217>::COUNT,
-        ByteBits<218>::COUNT,
-        ByteBits<219>::COUNT,
-        ByteBits<220>::COUNT,
-        ByteBits<221>::COUNT,
-        ByteBits<222>::COUNT,
-        ByteBits<223>::COUNT,
-        ByteBits<224>::COUNT,
-        ByteBits<225>::COUNT,
-        ByteBits<226>::COUNT,
-        ByteBits<227>::COUNT,
-        ByteBits<228>::COUNT,
-        ByteBits<229>::COUNT,
-        ByteBits<230>::COUNT,
-        ByteBits<231>::COUNT,
-        ByteBits<232>::COUNT,
-        ByteBits<233>::COUNT,
-        ByteBits<234>::COUNT,
-        ByteBits<235>::COUNT,
-        ByteBits<236>::COUNT,
-        ByteBits<237>::COUNT,
-        ByteBits<238>::COUNT,
-        ByteBits<239>::COUNT,
-        ByteBits<240>::COUNT,
-        ByteBits<241>::COUNT,
-        ByteBits<242>::COUNT,
-        ByteBits<243>::COUNT,
-        ByteBits<244>::COUNT,
-        ByteBits<245>::COUNT,
-        ByteBits<246>::COUNT,
-        ByteBits<247>::COUNT,
-        ByteBits<248>::COUNT,
-        ByteBits<249>::COUNT,
-        ByteBits<250>::COUNT,
-        ByteBits<251>::COUNT,
-        ByteBits<252>::COUNT,
-        ByteBits<253>::COUNT,
-        ByteBits<254>::COUNT,
-        ByteBits<255>::COUNT
-    };
-
-    return table[b];
-}
-
 } // namespace cv
--- a/modules/features2d/src/matchers.cpp
+++ b/modules/features2d/src/matchers.cpp
@@ -342,7 +342,7 @@ Ptr<DescriptorMatcher> DescriptorMatcher::create( const string& descriptorMatche
    }
    else if( !descriptorMatcherType.compare( "BruteForce-HammingLUT") )
    {
-        dm = new BruteForceMatcher<HammingLUT>();
+        dm = new BruteForceMatcher<Hamming>();
    }

    return dm;

--- a/modules/features2d/src/planardetect.cpp
+++ b/modules/features2d/src/planardetect.cpp
@@ -55,7 +55,7 @@ namespace cv
      IEEE Transactions on Pattern Analysis and Machine Intelligence, 15 Jan. 2009.

   2. Vincent Lepetit, Pascal Fua,
-      “Towards Recognizing Feature Points Using Classification Trees,”
+      "Towards Recognizing Feature Points Using Classification Trees,"
      Technical Report IC/2004/74, EPFL, 2004.
 */


--- a/modules/flann/include/opencv2/flann/dist.h
+++ b/modules/flann/include/opencv2/flann/dist.h
@@ -383,89 +383,23 @@ struct HammingLUT
     */
    ResultType operator()(const unsigned char* a, const unsigned char* b, int size) const
    {
+        static const uchar popCountTable[] = // popCountTable[v] == number of set bits in byte value v; 256 entries, one per possible byte
+        {
+            0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
+            1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
+            1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
+            2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
+            1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
+            2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
+            2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
+            3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
+        };
        ResultType result = 0;
        for (int i = 0; i < size; i++) {
-            result += byteBitsLookUp(a[i] ^ b[i]);
+            result += popCountTable[a[i] ^ b[i]]; // a[i] ^ b[i] has a 1 bit exactly where the two bytes differ, so this adds the differing-bit count
        }
        return result;
    }
-
-
-    /** \brief given a byte, count the bits using a look up table
-     *  \param b the byte to count bits.  The look up table has an entry for all
-     *  values of b, where that entry is the number of bits.
-     *  \return the number of bits in byte b
-     */
-    static unsigned char byteBitsLookUp(unsigned char b)
-    {
-        static const unsigned char table[256]  = {
-            /* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2,
-            /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3,
-            /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3,
-            /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4,
-            /* 10 */ 1, /* 11 */ 2, /* 12 */ 2, /* 13 */ 3,
-            /* 14 */ 2, /* 15 */ 3, /* 16 */ 3, /* 17 */ 4,
-            /* 18 */ 2, /* 19 */ 3, /* 1a */ 3, /* 1b */ 4,
-            /* 1c */ 3, /* 1d */ 4, /* 1e */ 4, /* 1f */ 5,
-            /* 20 */ 1, /* 21 */ 2, /* 22 */ 2, /* 23 */ 3,
-            /* 24 */ 2, /* 25 */ 3, /* 26 */ 3, /* 27 */ 4,
-            /* 28 */ 2, /* 29 */ 3, /* 2a */ 3, /* 2b */ 4,
-            /* 2c */ 3, /* 2d */ 4, /* 2e */ 4, /* 2f */ 5,
-            /* 30 */ 2, /* 31 */ 3, /* 32 */ 3, /* 33 */ 4,
-            /* 34 */ 3, /* 35 */ 4, /* 36 */ 4, /* 37 */ 5,
-            /* 38 */ 3, /* 39 */ 4, /* 3a */ 4, /* 3b */ 5,
-            /* 3c */ 4, /* 3d */ 5, /* 3e */ 5, /* 3f */ 6,
-            /* 40 */ 1, /* 41 */ 2, /* 42 */ 2, /* 43 */ 3,
-            /* 44 */ 2, /* 45 */ 3, /* 46 */ 3, /* 47 */ 4,
-            /* 48 */ 2, /* 49 */ 3, /* 4a */ 3, /* 4b */ 4,
-            /* 4c */ 3, /* 4d */ 4, /* 4e */ 4, /* 4f */ 5,
-            /* 50 */ 2, /* 51 */ 3, /* 52 */ 3, /* 53 */ 4,
-            /* 54 */ 3, /* 55 */ 4, /* 56 */ 4, /* 57 */ 5,
-            /* 58 */ 3, /* 59 */ 4, /* 5a */ 4, /* 5b */ 5,
-            /* 5c */ 4, /* 5d */ 5, /* 5e */ 5, /* 5f */ 6,
-            /* 60 */ 2, /* 61 */ 3, /* 62 */ 3, /* 63 */ 4,
-            /* 64 */ 3, /* 65 */ 4, /* 66 */ 4, /* 67 */ 5,
-            /* 68 */ 3, /* 69 */ 4, /* 6a */ 4, /* 6b */ 5,
-            /* 6c */ 4, /* 6d */ 5, /* 6e */ 5, /* 6f */ 6,
-            /* 70 */ 3, /* 71 */ 4, /* 72 */ 4, /* 73 */ 5,
-            /* 74 */ 4, /* 75 */ 5, /* 76 */ 5, /* 77 */ 6,
-            /* 78 */ 4, /* 79 */ 5, /* 7a */ 5, /* 7b */ 6,
-            /* 7c */ 5, /* 7d */ 6, /* 7e */ 6, /* 7f */ 7,
-            /* 80 */ 1, /* 81 */ 2, /* 82 */ 2, /* 83 */ 3,
-            /* 84 */ 2, /* 85 */ 3, /* 86 */ 3, /* 87 */ 4,
-            /* 88 */ 2, /* 89 */ 3, /* 8a */ 3, /* 8b */ 4,
-            /* 8c */ 3, /* 8d */ 4, /* 8e */ 4, /* 8f */ 5,
-            /* 90 */ 2, /* 91 */ 3, /* 92 */ 3, /* 93 */ 4,
-            /* 94 */ 3, /* 95 */ 4, /* 96 */ 4, /* 97 */ 5,
-            /* 98 */ 3, /* 99 */ 4, /* 9a */ 4, /* 9b */ 5,
-            /* 9c */ 4, /* 9d */ 5, /* 9e */ 5, /* 9f */ 6,
-            /* a0 */ 2, /* a1 */ 3, /* a2 */ 3, /* a3 */ 4,
-            /* a4 */ 3, /* a5 */ 4, /* a6 */ 4, /* a7 */ 5,
-            /* a8 */ 3, /* a9 */ 4, /* aa */ 4, /* ab */ 5,
-            /* ac */ 4, /* ad */ 5, /* ae */ 5, /* af */ 6,
-            /* b0 */ 3, /* b1 */ 4, /* b2 */ 4, /* b3 */ 5,
-            /* b4 */ 4, /* b5 */ 5, /* b6 */ 5, /* b7 */ 6,
-            /* b8 */ 4, /* b9 */ 5, /* ba */ 5, /* bb */ 6,
-            /* bc */ 5, /* bd */ 6, /* be */ 6, /* bf */ 7,
-            /* c0 */ 2, /* c1 */ 3, /* c2 */ 3, /* c3 */ 4,
-            /* c4 */ 3, /* c5 */ 4, /* c6 */ 4, /* c7 */ 5,
-            /* c8 */ 3, /* c9 */ 4, /* ca */ 4, /* cb */ 5,
-            /* cc */ 4, /* cd */ 5, /* ce */ 5, /* cf */ 6,
-            /* d0 */ 3, /* d1 */ 4, /* d2 */ 4, /* d3 */ 5,
-            /* d4 */ 4, /* d5 */ 5, /* d6 */ 5, /* d7 */ 6,
-            /* d8 */ 4, /* d9 */ 5, /* da */ 5, /* db */ 6,
-            /* dc */ 5, /* dd */ 6, /* de */ 6, /* df */ 7,
-            /* e0 */ 3, /* e1 */ 4, /* e2 */ 4, /* e3 */ 5,
-            /* e4 */ 4, /* e5 */ 5, /* e6 */ 5, /* e7 */ 6,
-            /* e8 */ 4, /* e9 */ 5, /* ea */ 5, /* eb */ 6,
-            /* ec */ 5, /* ed */ 6, /* ee */ 6, /* ef */ 7,
-            /* f0 */ 4, /* f1 */ 5, /* f2 */ 5, /* f3 */ 6,
-            /* f4 */ 5, /* f5 */ 6, /* f6 */ 6, /* f7 */ 7,
-            /* f8 */ 5, /* f9 */ 6, /* fa */ 6, /* fb */ 7,
-            /* fc */ 6, /* fd */ 7, /* fe */ 7, /* ff */ 8
-        };
-        return table[b];
-    }
 };

 /**

--- a/modules/gpu/include/opencv2/gpu/gpu.hpp
+++ b/modules/gpu/include/opencv2/gpu/gpu.hpp
@@ -1382,12 +1382,6 @@ namespace cv
            explicit BruteForceMatcher_GPU() : BruteForceMatcher_GPU_base(L2Dist) {}
            explicit BruteForceMatcher_GPU(L2<T> /*d*/) : BruteForceMatcher_GPU_base(L2Dist) {}
        };
-        template <> class CV_EXPORTS BruteForceMatcher_GPU< HammingLUT > : public BruteForceMatcher_GPU_base
-        {
-        public:
-            explicit BruteForceMatcher_GPU() : BruteForceMatcher_GPU_base(HammingDist) {}
-            explicit BruteForceMatcher_GPU(HammingLUT /*d*/) : BruteForceMatcher_GPU_base(HammingDist) {}
-        };
        template <> class CV_EXPORTS BruteForceMatcher_GPU< Hamming > : public BruteForceMatcher_GPU_base
        {
        public:

--- a/samples/cpp/brief_match_test.cpp
+++ b/samples/cpp/brief_match_test.cpp
@@ -103,13 +103,7 @@ int main(int argc, const char ** argv)

  cout << "done computing descriptors... took " << t << " seconds" << endl;

-  //Do matching with 2 methods using features2d
-  cout << "matching with BruteForceMatcher<HammingLUT>" << endl;
-  BruteForceMatcher<HammingLUT> matcher;
-  vector<DMatch> matches_lut;
-  float lut_time = (float)match(kpts_1, kpts_2, matcher, desc_1, desc_2, matches_lut);
-  cout << "done BruteForceMatcher<HammingLUT> matching. took " << lut_time << " seconds" << endl;
-
+  //Do matching using features2d
  cout << "matching with BruteForceMatcher<Hamming>" << endl;
  BruteForceMatcher<Hamming> matcher_popcount;
  vector<DMatch> matches_popcount;