Commit dfe4519c authored by Vladislav Sovrasov

Add QR decomposition to HAL

Parent d102ea96
......@@ -457,6 +457,11 @@
title = {ROF and TV-L1 denoising with Primal-Dual algorithm},
url = {http://znah.net/rof-and-tv-l1-denoising-with-primal-dual-algorithm.html}
}
@MISC{VandLec,
author = {Vandenberghe, Lieven},
title = {QR Factorization},
url = {http://www.seas.ucla.edu/~vandenbe/133A/lectures/qr.pdf}
}
@ARTICLE{MHT2011,
author = {Getreuer, Pascal},
title = {Malvar-He-Cutler Linear Image Demosaicking},
......
......@@ -263,6 +263,7 @@ enum { CALIB_USE_INTRINSIC_GUESS = 0x00001,
CALIB_FIX_S1_S2_S3_S4 = 0x10000,
CALIB_TILTED_MODEL = 0x40000,
CALIB_FIX_TAUX_TAUY = 0x80000,
CALIB_USE_QR = 0x100000, //!< use QR instead of SVD decomposition for solving. Faster but potentially less precise
// only for stereo
CALIB_FIX_INTRINSIC = 0x00100,
CALIB_SAME_FOCAL_LENGTH = 0x00200,
......
......@@ -1485,6 +1485,9 @@ static double cvCalibrateCamera2Internal( const CvMat* objectPoints,
if(flags & CALIB_USE_LU) {
solver.solveMethod = DECOMP_LU;
}
else if(flags & CALIB_USE_QR) {
solver.solveMethod = DECOMP_QR;
}
{
double* param = solver.param->data.db;
......
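From the caller's side the new flag is passed like any other calibration flag; internally it only switches the Levenberg-Marquardt solver from SVD to QR. A minimal sketch, assuming objectPoints and imagePoints were filled beforehand from detected calibration patterns (e.g. findChessboardCorners):

```cpp
#include <opencv2/calib3d.hpp>
#include <vector>

// Hypothetical helper: the point lists are assumed to be filled from detected
// calibration patterns before this call.
double calibrateWithQR(const std::vector<std::vector<cv::Point3f> >& objectPoints,
                       const std::vector<std::vector<cv::Point2f> >& imagePoints,
                       cv::Size imageSize)
{
    cv::Mat cameraMatrix = cv::Mat::eye(3, 3, CV_64F);
    cv::Mat distCoeffs = cv::Mat::zeros(8, 1, CV_64F);
    std::vector<cv::Mat> rvecs, tvecs;

    // CALIB_USE_QR makes the internal solver use DECOMP_QR instead of DECOMP_SVD:
    // faster on large parameter vectors, potentially slightly less precise.
    return cv::calibrateCamera(objectPoints, imagePoints, imageSize,
                               cameraMatrix, distCoeffs, rvecs, tvecs,
                               cv::CALIB_USE_QR);
}
```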
......@@ -66,6 +66,8 @@ CV_EXPORTS bool Cholesky32f(float* A, size_t astep, int m, float* b, size_t bste
CV_EXPORTS bool Cholesky64f(double* A, size_t astep, int m, double* b, size_t bstep, int n);
CV_EXPORTS void SVD32f(float* At, size_t astep, float* W, float* U, size_t ustep, float* Vt, size_t vstep, int m, int n, int flags);
CV_EXPORTS void SVD64f(double* At, size_t astep, double* W, double* U, size_t ustep, double* Vt, size_t vstep, int m, int n, int flags);
CV_EXPORTS int QR32f(float* A, size_t astep, int m, int n, int k, float* b, size_t bstep, float* hFactors);
CV_EXPORTS int QR64f(double* A, size_t astep, int m, int n, int k, double* b, size_t bstep, double* hFactors);
CV_EXPORTS void gemm32f(const float* src1, size_t src1_step, const float* src2, size_t src2_step,
float alpha, const float* src3, size_t src3_step, float beta, float* dst, size_t dst_step,
......
......@@ -61,10 +61,12 @@
#include <typeinfo>
#include <limits>
#include <complex>
#include <vector>
#define HAL_GEMM_SMALL_COMPLEX_MATRIX_THRESH 100
#define HAL_GEMM_SMALL_MATRIX_THRESH 100
#define HAL_SVD_SMALL_MATRIX_THRESH 25
#define HAL_QR_SMALL_MATRIX_THRESH 30
#define HAL_LU_SMALL_MATRIX_THRESH 100
#define HAL_CHOLESKY_SMALL_MATRIX_THRESH 100
......@@ -259,6 +261,106 @@ lapack_SVD(fptype* a, size_t a_step, fptype *w, fptype* u, size_t u_step, fptype
return CV_HAL_ERROR_OK;
}
template <typename fptype> static inline int
lapack_QR(fptype* a, size_t a_step, int m, int n, int k, fptype* b, size_t b_step, fptype* dst, int* info)
{
int lda = a_step / sizeof(fptype);
char mode[] = { 'N', '\0' };
if(m < n)
return CV_HAL_ERROR_NOT_IMPLEMENTED;
std::vector<fptype> tmpAMemHolder;
fptype* tmpA;
int ldtmpA;
if (m == n)
{
transpose_square_inplace(a, lda, m);
tmpA = a;
ldtmpA = lda;
}
else
{
tmpAMemHolder.resize(m*n);
tmpA = &tmpAMemHolder.front();
ldtmpA = m;
transpose(a, lda, tmpA, m, m, n);
}
int lwork = -1;
fptype work1 = 0.;
if (b)
{
if (k == 1 && b_step == sizeof(fptype))
{
if (typeid(fptype) == typeid(float))
sgels_(mode, &m, &n, &k, (float*)tmpA, &ldtmpA, (float*)b, &m, (float*)&work1, &lwork, info);
else if (typeid(fptype) == typeid(double))
dgels_(mode, &m, &n, &k, (double*)tmpA, &ldtmpA, (double*)b, &m, (double*)&work1, &lwork, info);
lwork = (int)round(work1); //optimal buffer size
std::vector<fptype> workBufMemHolder(lwork + 1);
fptype* buffer = &workBufMemHolder.front();
if (typeid(fptype) == typeid(float))
sgels_(mode, &m, &n, &k, (float*)tmpA, &ldtmpA, (float*)b, &m, (float*)buffer, &lwork, info);
else if (typeid(fptype) == typeid(double))
dgels_(mode, &m, &n, &k, (double*)tmpA, &ldtmpA, (double*)b, &m, (double*)buffer, &lwork, info);
}
else
{
std::vector<fptype> tmpBMemHolder(m*k);
fptype* tmpB = &tmpBMemHolder.front();
int ldb = b_step / sizeof(fptype);
transpose(b, ldb, tmpB, m, m, k);
if (typeid(fptype) == typeid(float))
sgels_(mode, &m, &n, &k, (float*)tmpA, &ldtmpA, (float*)tmpB, &m, (float*)&work1, &lwork, info);
else if (typeid(fptype) == typeid(double))
dgels_(mode, &m, &n, &k, (double*)tmpA, &ldtmpA, (double*)tmpB, &m, (double*)&work1, &lwork, info);
lwork = (int)round(work1); //optimal buffer size
std::vector<fptype> workBufMemHolder(lwork + 1);
fptype* buffer = &workBufMemHolder.front();
if (typeid(fptype) == typeid(float))
sgels_(mode, &m, &n, &k, (float*)tmpA, &ldtmpA, (float*)tmpB, &m, (float*)buffer, &lwork, info);
else if (typeid(fptype) == typeid(double))
dgels_(mode, &m, &n, &k, (double*)tmpA, &ldtmpA, (double*)tmpB, &m, (double*)buffer, &lwork, info);
transpose(tmpB, m, b, ldb, k, m);
}
}
else
{
if (typeid(fptype) == typeid(float))
sgeqrf_(&m, &n, (float*)tmpA, &ldtmpA, (float*)dst, (float*)&work1, &lwork, info);
else if (typeid(fptype) == typeid(double))
dgeqrf_(&m, &n, (double*)tmpA, &ldtmpA, (double*)dst, (double*)&work1, &lwork, info);
lwork = (int)round(work1); //optimal buffer size
std::vector<fptype> workBufMemHolder(lwork + 1);
fptype* buffer = &workBufMemHolder.front();
if (typeid(fptype) == typeid(float))
sgeqrf_(&m, &n, (float*)tmpA, &ldtmpA, (float*)dst, (float*)buffer, &lwork, info);
else if (typeid(fptype) == typeid(double))
dgeqrf_(&m, &n, (double*)tmpA, &ldtmpA, (double*)dst, (double*)buffer, &lwork, info);
}
if (m == n)
transpose_square_inplace(a, lda, m);
else
transpose(tmpA, m, a, lda, n, m);
if (*info != 0)
*info = 0;
else
*info = 1;
return CV_HAL_ERROR_OK;
}
template <typename fptype> static inline int
lapack_gemm(const fptype *src1, size_t src1_step, const fptype *src2, size_t src2_step, fptype alpha,
const fptype *src3, size_t src3_step, fptype beta, fptype *dst, size_t dst_step, int a_m, int a_n, int d_n, int flags)
......@@ -459,6 +561,20 @@ int lapack_SVD64f(double* a, size_t a_step, double *w, double* u, size_t u_step,
return lapack_SVD(a, a_step, w, u, u_step, vt, v_step, m, n, flags, &info);
}
int lapack_QR32f(float* src1, size_t src1_step, int m, int n, int k, float* src2, size_t src2_step, float* dst, int* info)
{
if (m < HAL_QR_SMALL_MATRIX_THRESH)
return CV_HAL_ERROR_NOT_IMPLEMENTED;
return lapack_QR(src1, src1_step, m, n, k, src2, src2_step, dst, info);
}
int lapack_QR64f(double* src1, size_t src1_step, int m, int n, int k, double* src2, size_t src2_step, double* dst, int* info)
{
if (m < HAL_QR_SMALL_MATRIX_THRESH)
return CV_HAL_ERROR_NOT_IMPLEMENTED;
return lapack_QR(src1, src1_step, m, n, k, src2, src2_step, dst, info);
}
int lapack_gemm32f(const float *src1, size_t src1_step, const float *src2, size_t src2_step, float alpha,
const float *src3, size_t src3_step, float beta, float *dst, size_t dst_step, int m, int n, int k, int flags)
{
......
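lapack_QR first transposes the row-major input into the column-major layout LAPACK expects, then follows the standard two-call workspace-query convention: xGELS/xGEQRF is invoked once with lwork = -1, which only reports the optimal workspace size in work[0], and then again with a buffer of that size. A standalone sketch of the same pattern, assuming the usual Fortran dgels_ symbol from a linked LAPACK:

```cpp
#include <vector>

// Standard LAPACK Fortran entry point: column-major storage, all arguments by pointer.
extern "C" void dgels_(char* trans, int* m, int* n, int* nrhs,
                       double* a, int* lda, double* b, int* ldb,
                       double* work, int* lwork, int* info);

// Least-squares solve min ||A*x - b|| for a column-major m x n matrix A (m >= n).
static int solveWithGels(double* a, int m, int n, double* b)
{
    char trans = 'N';
    int nrhs = 1, lda = m, ldb = m, info = 0;

    // Workspace query: with lwork = -1, dgels_ only writes the optimal size into workQuery.
    int lwork = -1;
    double workQuery = 0.0;
    dgels_(&trans, &m, &n, &nrhs, a, &lda, b, &ldb, &workQuery, &lwork, &info);

    // Allocate the reported amount and do the actual factorization + solve.
    lwork = (int)workQuery;
    std::vector<double> work(lwork);
    dgels_(&trans, &m, &n, &nrhs, a, &lda, b, &ldb, work.data(), &lwork, &info);
    return info; // 0 on success; the first n entries of b then hold the solution
}
```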
......@@ -55,6 +55,8 @@ int lapack_Cholesky32f(float* a, size_t a_step, int m, float* b, size_t b_step,
int lapack_Cholesky64f(double* a, size_t a_step, int m, double* b, size_t b_step, int n, bool* info);
int lapack_SVD32f(float* a, size_t a_step, float* w, float* u, size_t u_step, float* vt, size_t v_step, int m, int n, int flags);
int lapack_SVD64f(double* a, size_t a_step, double* w, double* u, size_t u_step, double* vt, size_t v_step, int m, int n, int flags);
int lapack_QR32f(float* src1, size_t src1_step, int m, int n, int k, float* src2, size_t src2_step, float* dst, int* info);
int lapack_QR64f(double* src1, size_t src1_step, int m, int n, int k, double* src2, size_t src2_step, double* dst, int* info);
int lapack_gemm32f(const float* src1, size_t src1_step, const float* src2, size_t src2_step,
float alpha, const float* src3, size_t src3_step, float beta, float* dst, size_t dst_step,
int m, int n, int k, int flags);
......@@ -83,6 +85,11 @@ int lapack_gemm64fc(const double* src1, size_t src1_step, const double* src2, si
#undef cv_hal_SVD64f
#define cv_hal_SVD64f lapack_SVD64f
#undef cv_hal_QR32f
#define cv_hal_QR32f lapack_QR32f
#undef cv_hal_QR64f
#define cv_hal_QR64f lapack_QR64f
#undef cv_hal_gemm32f
#define cv_hal_gemm32f lapack_gemm32f
#undef cv_hal_gemm64f
......
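The #undef/#define pairs above are the generic OpenCV HAL replacement hook: an external implementation provides a function with the hal_ni_QR64f signature and points cv_hal_QR64f at it. A hedged sketch of what such a vendor header could look like (my_hal_QR64f and the surrounding file are hypothetical, not part of this patch):

```cpp
// Hypothetical custom HAL header, included through OpenCV's HAL replacement mechanism.
#include "opencv2/core/hal/interface.h"   // assumed to provide the CV_HAL_ERROR_* codes

inline int my_hal_QR64f(double* src1, size_t src1_step, int m, int n, int k,
                        double* src2, size_t src2_step, double* dst, int* info)
{
    if (m < n)                        // only tall or square systems are handled
        return CV_HAL_ERROR_NOT_IMPLEMENTED;   // OpenCV falls back to its built-in QRImpl
    // ... vendor-specific QR factorization / solve would go here ...
    *info = 1;                        // nonzero signals a successful decomposition
    return CV_HAL_ERROR_OK;
}

#undef  cv_hal_QR64f
#define cv_hal_QR64f my_hal_QR64f
```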
......@@ -634,6 +634,27 @@ inline int hal_ni_SVD32f(float* src, size_t src_step, float* w, float* u, size_t
inline int hal_ni_SVD64f(double* src, size_t src_step, double* w, double* u, size_t u_step, double* vt, size_t vt_step, int m, int n, int flags) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
//! @}
/**
Performs QR decomposition of an \f$M\times N\f$ (\f$M \geq N\f$) matrix \f$A = Q*R\f$ and solves the matrix equation \f$A*X=B\f$.
@param src1 pointer to the input matrix \f$A\f$ stored in row-major order. After the function finishes, src1 contains the upper-triangular \f$N\times N\f$ matrix \f$R\f$.
The lower triangle of src1 is filled with the vectors of the elementary reflectors. See @cite VandLec and LAPACK's DGEQRF documentation for details.
@param src1_step number of bytes between two consecutive rows of matrix \f$A\f$.
@param m number of rows in matrix \f$A\f$.
@param n number of columns in matrix \f$A\f$.
@param k number of right-hand side vectors in the \f$M\times K\f$ matrix \f$B\f$.
@param src2 pointer to the \f$M\times K\f$ matrix \f$B\f$, the right-hand side of the system \f$A*X=B\f$, stored in row-major order.
If src2 is a null pointer, only the QR decomposition is performed. Otherwise the system is solved and src1 is used as a temporary buffer, so
after the function finishes src2 contains the solution \f$X\f$ of the system \f$A*X=B\f$.
@param src2_step number of bytes between two consecutive rows of matrix \f$B\f$.
@param dst pointer to a contiguous \f$N\times 1\f$ array for the scalar factors of the elementary reflectors. See @cite VandLec for details.
@param info indicates whether the decomposition succeeded. If *info is zero, the decomposition failed.
*/
//! @addtogroup core_hal_interface_decomp_qr QR matrix decomposition
//! @{
inline int hal_ni_QR32f(float* src1, size_t src1_step, int m, int n, int k, float* src2, size_t src2_step, float* dst, int* info) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
inline int hal_ni_QR64f(double* src1, size_t src1_step, int m, int n, int k, double* src2, size_t src2_step, double* dst, int* info) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
//! @}
//! @cond IGNORED
......@@ -643,6 +664,8 @@ inline int hal_ni_SVD64f(double* src, size_t src_step, double* w, double* u, siz
#define cv_hal_Cholesky64f hal_ni_Cholesky64f
#define cv_hal_SVD32f hal_ni_SVD32f
#define cv_hal_SVD64f hal_ni_SVD64f
#define cv_hal_QR32f hal_ni_QR32f
#define cv_hal_QR64f hal_ni_QR64f
//! @endcond
......
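For illustration, the documented behaviour can be exercised through the dispatching wrapper cv::hal::QR64f declared in hal.hpp above: a non-null right-hand side is solved in place, and hFactors may be NULL. A small hedged sketch:

```cpp
#include <opencv2/core.hpp>
#include <opencv2/core/hal/hal.hpp>
#include <iostream>

int main()
{
    // Over-determined 4x2 system A*x = b, row-major as cv::Mat stores it.
    cv::Mat A = (cv::Mat_<double>(4, 2) << 1, 1,
                                           1, 2,
                                           1, 3,
                                           1, 4);
    cv::Mat b = (cv::Mat_<double>(4, 1) << 6, 5, 7, 10);

    // Non-null b: decompose and solve in place; hFactors == NULL uses an internal buffer.
    int ok = cv::hal::QR64f(A.ptr<double>(), A.step, A.rows, A.cols,
                            b.cols, b.ptr<double>(), b.step, NULL);

    // On success the first A.cols rows of b hold the least-squares solution.
    std::cout << "ok=" << ok << "  x=" << b.rowRange(0, A.cols).t() << std::endl;
    return ok ? 0 : 1;
}
```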
......@@ -1238,9 +1238,6 @@ bool cv::solve( InputArray _src, InputArray _src2arg, OutputArray _dst, int meth
return result;
}
if( method == DECOMP_QR )
method = DECOMP_SVD;
int m = src.rows, m_ = m, n = src.cols, nb = _src2.cols;
size_t esz = CV_ELEM_SIZE(type), bufsize = 0;
size_t vstep = alignSize(n*esz, 16);
......@@ -1268,7 +1265,6 @@ bool cv::solve( InputArray _src, InputArray _src2arg, OutputArray _dst, int meth
if( is_normal )
bufsize += n*nb*esz;
if( method == DECOMP_SVD || method == DECOMP_EIG )
bufsize += n*5*esz + n*vstep + nb*sizeof(double) + 32;
......@@ -1321,6 +1317,28 @@ bool cv::solve( InputArray _src, InputArray _src2arg, OutputArray _dst, int meth
else
result = hal::Cholesky64f(a.ptr<double>(), a.step, n, dst.ptr<double>(), dst.step, nb);
}
else if( method == DECOMP_QR )
{
Mat rhsMat;
if( is_normal || m == n )
{
src2.copyTo(dst);
rhsMat = dst;
}
else
{
rhsMat = Mat(m, nb, type);
src2.copyTo(rhsMat);
}
if( type == CV_32F )
result = hal::QR32f(a.ptr<float>(), a.step, a.rows, a.cols, rhsMat.cols, rhsMat.ptr<float>(), rhsMat.step, NULL) != 0;
else
result = hal::QR64f(a.ptr<double>(), a.step, a.rows, a.cols, rhsMat.cols, rhsMat.ptr<double>(), rhsMat.step, NULL) != 0;
if (rhsMat.rows != dst.rows)
rhsMat.rowRange(0, dst.rows).copyTo(dst);
}
else
{
ptr = alignPtr(ptr, 16);
......
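With the old fallback to DECOMP_SVD removed, cv::solve now sends DECOMP_QR through the new HAL path for square systems, for least-squares problems, and in combination with DECOMP_NORMAL. A minimal sketch:

```cpp
#include <opencv2/core.hpp>
#include <iostream>

int main()
{
    // Tall 5x3 system: DECOMP_QR returns the least-squares solution directly.
    cv::Mat A(5, 3, CV_64F), b(5, 1, CV_64F), x;
    cv::randn(A, 0.0, 1.0);
    cv::randn(b, 0.0, 1.0);

    bool ok = cv::solve(A, b, x, cv::DECOMP_QR);
    std::cout << "ok=" << ok << "  residual=" << cv::norm(A * x - b) << std::endl;

    // DECOMP_NORMAL forms A^T*A and A^T*b first, then solves the square normal system.
    cv::solve(A, b, x, cv::DECOMP_QR | cv::DECOMP_NORMAL);
    return 0;
}
```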
......@@ -225,6 +225,129 @@ bool Cholesky64f(double* A, size_t astep, int m, double* b, size_t bstep, int n)
return CholImpl(A, astep, m, b, bstep, n);
}
template<typename _Tp> inline static int
sign(_Tp x)
{
if (x >= (_Tp)0)
return 1;
else
return -1;
}
template<typename _Tp> static inline int
QRImpl(_Tp* A, size_t astep, int m, int n, int k, _Tp* b, size_t bstep, _Tp* hFactors, _Tp eps)
{
astep /= sizeof(_Tp);
bstep /= sizeof(_Tp);
cv::AutoBuffer<_Tp> buffer;
size_t buf_size = m ? m + n : hFactors != NULL;
buffer.allocate(buf_size);
_Tp* vl = buffer;
if (hFactors == NULL)
hFactors = vl + m;
for (int l = 0; l < n; l++)
{
//generate vl
int vlSize = m - l;
_Tp vlNorm = (_Tp)0;
for (int i = 0; i < vlSize; i++)
{
vl[i] = A[(l + i)*astep + l];
vlNorm += vl[i] * vl[i];
}
_Tp tmpV = vl[0];
vl[0] = vl[0] + sign(vl[0])*std::sqrt(vlNorm);
vlNorm = std::sqrt(vlNorm + vl[0] * vl[0] - tmpV*tmpV);
for (int i = 0; i < vlSize; i++)
{
vl[i] /= vlNorm;
}
//multiply A_l*vl
for (int j = l; j < n; j++)
{
_Tp v_lA = (_Tp)0;
for (int i = l; i < m; i++)
{
v_lA += vl[i - l] * A[i*astep + j];
}
for (int i = l; i < m; i++)
{
A[i*astep + j] -= 2 * vl[i - l] * v_lA;
}
}
//save vl and factors
hFactors[l] = vl[0] * vl[0];
for (int i = 1; i < vlSize; i++)
{
A[(l + i)*astep + l] = vl[i] / vl[0];
}
}
if (b)
{
//generate new rhs
for (int l = 0; l < n; l++)
{
//unpack vl
vl[0] = (_Tp)1;
for (int j = 1; j < m - l; j++)
{
vl[j] = A[(j + l)*astep + l];
}
//h_l*x
for (int j = 0; j < k; j++)
{
_Tp v_lB = (_Tp)0;
for (int i = l; i < m; i++)
v_lB += vl[i - l] * b[i*bstep + j];
for (int i = l; i < m; i++)
b[i*bstep + j] -= 2 * vl[i - l] * v_lB * hFactors[l];
}
}
//do back substitution
for (int i = n - 1; i >= 0; i--)
{
for (int j = n - 1; j > i; j--)
{
for (int p = 0; p < k; p++)
b[i*bstep + p] -= b[j*bstep + p] * A[i*astep + j];
}
if (std::abs(A[i*astep + i]) < eps)
return 0;
for (int p = 0; p < k; p++)
b[i*bstep + p] /= A[i*astep + i];
}
}
return 1;
}
int QR32f(float* A, size_t astep, int m, int n, int k, float* b, size_t bstep, float* hFactors)
{
CV_INSTRUMENT_REGION()
int output;
CALL_HAL_RET(QR32f, cv_hal_QR32f, output, A, astep, m, n, k, b, bstep, hFactors);
output = QRImpl(A, astep, m, n, k, b, bstep, hFactors, FLT_EPSILON * 10);
return output;
}
int QR64f(double* A, size_t astep, int m, int n, int k, double* b, size_t bstep, double* hFactors)
{
CV_INSTRUMENT_REGION()
int output;
CALL_HAL_RET(QR64f, cv_hal_QR64f, output, A, astep, m, n, k, b, bstep, hFactors)
output = QRImpl(A, astep, m, n, k, b, bstep, hFactors, DBL_EPSILON * 100);
return output;
}
//=============================================================================
// for compatibility with 3.0
......
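QRImpl is a plain Householder QR: at step l it builds a reflector from the sub-column \f$x_l = A[l{:}m,\, l]\f$ of the partially reduced matrix and applies it to the trailing block and, when a right-hand side is given, to \f$B\f$ as well. In the notation of @cite VandLec the loop computes:

```latex
v_l = x_l + \operatorname{sign}(x_{l,1})\,\lVert x_l \rVert\, e_1, \qquad
H_l = I - 2\,\frac{v_l v_l^{T}}{\lVert v_l \rVert^{2}}, \qquad
H_n \cdots H_2 H_1 A = R
```

The code normalizes \f$v_l\f$ to unit length, stores \f$v_l / v_{l,1}\f$ in the lower triangle of A and \f$v_{l,1}^2\f$ in hFactors[l], which is why the right-hand-side update multiplies by 2 * hFactors[l].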
......@@ -2994,6 +2994,51 @@ TEST(Core_Cholesky, accuracy64f)
for (int i = 0; i < A.rows; i++)
for (int j = i + 1; j < A.cols; j++)
A.at<double>(i, j) = 0.0;
EXPECT_TRUE(norm(refA - A*A.t()) < 10e-5);
EXPECT_LE(norm(refA, A*A.t(), CV_RELATIVE_L2), FLT_EPSILON);
}
TEST(Core_QR_Solver, accuracy64f)
{
int m = 20, n = 18;
Mat A(m, m, CV_64F);
Mat B(m, n, CV_64F);
Mat mean(1, 1, CV_64F);
*mean.ptr<double>() = 10.0;
Mat dev(1, 1, CV_64F);
*dev.ptr<double>() = 10.0;
RNG rng(10);
rng.fill(A, RNG::NORMAL, mean, dev);
rng.fill(B, RNG::NORMAL, mean, dev);
A = A*A.t();
Mat solutionQR;
//solve system with square matrix
solve(A, B, solutionQR, DECOMP_QR);
EXPECT_LE(norm(A*solutionQR, B, CV_RELATIVE_L2), FLT_EPSILON);
A = Mat(m, n, CV_64F);
B = Mat(m, n, CV_64F);
rng.fill(A, RNG::NORMAL, mean, dev);
rng.fill(B, RNG::NORMAL, mean, dev);
//solve normal system
solve(A, B, solutionQR, DECOMP_QR | DECOMP_NORMAL);
EXPECT_LE(norm(A.t()*(A*solutionQR), A.t()*B, CV_RELATIVE_L2), FLT_EPSILON);
//solve overdetermined system as a least squares problem
Mat solutionSVD;
solve(A, B, solutionQR, DECOMP_QR);
solve(A, B, solutionSVD, DECOMP_SVD);
EXPECT_LE(norm(solutionQR, solutionSVD, CV_RELATIVE_L2), FLT_EPSILON);
//solve system with singular matrix
A = Mat(10, 10, CV_64F);
B = Mat(10, 1, CV_64F);
rng.fill(A, RNG::NORMAL, mean, dev);
rng.fill(B, RNG::NORMAL, mean, dev);
for (int i = 0; i < A.cols; i++)
A.at<double>(0, i) = A.at<double>(1, i);
ASSERT_FALSE(solve(A, B, solutionQR, DECOMP_QR));
}
/* End of file. */