Commit dfe4519c authored by Vladislav Sovrasov

Add QR decomposition to HAL

Parent d102ea96
......@@ -457,6 +457,11 @@
title = {ROF and TV-L1 denoising with Primal-Dual algorithm},
url = {http://znah.net/rof-and-tv-l1-denoising-with-primal-dual-algorithm.html}
}
@MISC{VandLec,
author = {Vandenberghe, Lieven},
title = {QR Factorization},
url = {http://www.seas.ucla.edu/~vandenbe/133A/lectures/qr.pdf}
}
@ARTICLE{MHT2011,
author = {Getreuer, Pascal},
title = {Malvar-He-Cutler Linear Image Demosaicking},
......
......@@ -263,6 +263,7 @@ enum { CALIB_USE_INTRINSIC_GUESS = 0x00001,
CALIB_FIX_S1_S2_S3_S4 = 0x10000,
CALIB_TILTED_MODEL = 0x40000,
CALIB_FIX_TAUX_TAUY = 0x80000,
CALIB_USE_QR = 0x100000, //!< use QR instead of SVD decomposition for solving. Faster but potentially less precise
// only for stereo
CALIB_FIX_INTRINSIC = 0x00100,
CALIB_SAME_FOCAL_LENGTH = 0x00200,
......
......@@ -1485,6 +1485,9 @@ static double cvCalibrateCamera2Internal( const CvMat* objectPoints,
if(flags & CALIB_USE_LU) {
solver.solveMethod = DECOMP_LU;
}
else if(flags & CALIB_USE_QR) {
solver.solveMethod = DECOMP_QR;
}
{
double* param = solver.param->data.db;
......
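From the caller's side the new flag is passed like any other calibration flag; internally it only switches the Levenberg-Marquardt solver from SVD to QR. A minimal sketch, assuming objectPoints and imagePoints were filled beforehand from detected calibration patterns (e.g. findChessboardCorners):

```cpp
#include <opencv2/calib3d.hpp>
#include <vector>

// Hypothetical helper: the point lists are assumed to be filled from detected
// calibration patterns before this call.
double calibrateWithQR(const std::vector<std::vector<cv::Point3f> >& objectPoints,
                       const std::vector<std::vector<cv::Point2f> >& imagePoints,
                       cv::Size imageSize)
{
    cv::Mat cameraMatrix = cv::Mat::eye(3, 3, CV_64F);
    cv::Mat distCoeffs = cv::Mat::zeros(8, 1, CV_64F);
    std::vector<cv::Mat> rvecs, tvecs;

    // CALIB_USE_QR makes the internal solver use DECOMP_QR instead of DECOMP_SVD:
    // faster on large parameter vectors, potentially slightly less precise.
    return cv::calibrateCamera(objectPoints, imagePoints, imageSize,
                               cameraMatrix, distCoeffs, rvecs, tvecs,
                               cv::CALIB_USE_QR);
}
```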
......@@ -66,6 +66,8 @@ CV_EXPORTS bool Cholesky32f(float* A, size_t astep, int m, float* b, size_t bste
CV_EXPORTS bool Cholesky64f(double* A, size_t astep, int m, double* b, size_t bstep, int n);
CV_EXPORTS void SVD32f(float* At, size_t astep, float* W, float* U, size_t ustep, float* Vt, size_t vstep, int m, int n, int flags);
CV_EXPORTS void SVD64f(double* At, size_t astep, double* W, double* U, size_t ustep, double* Vt, size_t vstep, int m, int n, int flags);
CV_EXPORTS int QR32f(float* A, size_t astep, int m, int n, int k, float* b, size_t bstep, float* hFactors);
CV_EXPORTS int QR64f(double* A, size_t astep, int m, int n, int k, double* b, size_t bstep, double* hFactors);
CV_EXPORTS void gemm32f(const float* src1, size_t src1_step, const float* src2, size_t src2_step,
float alpha, const float* src3, size_t src3_step, float beta, float* dst, size_t dst_step,
......
......@@ -61,10 +61,12 @@
#include <typeinfo>
#include <limits>
#include <complex>
#include <vector>
#define HAL_GEMM_SMALL_COMPLEX_MATRIX_THRESH 100
#define HAL_GEMM_SMALL_MATRIX_THRESH 100
#define HAL_SVD_SMALL_MATRIX_THRESH 25
#define HAL_QR_SMALL_MATRIX_THRESH 30
#define HAL_LU_SMALL_MATRIX_THRESH 100
#define HAL_CHOLESKY_SMALL_MATRIX_THRESH 100
......@@ -259,6 +261,106 @@ lapack_SVD(fptype* a, size_t a_step, fptype *w, fptype* u, size_t u_step, fptype
return CV_HAL_ERROR_OK;
}
template <typename fptype> static inline int
lapack_QR(fptype* a, size_t a_step, int m, int n, int k, fptype* b, size_t b_step, fptype* dst, int* info)
{
int lda = a_step / sizeof(fptype);
char mode[] = { 'N', '\0' };
if(m < n)
return CV_HAL_ERROR_NOT_IMPLEMENTED;
std::vector<fptype> tmpAMemHolder;
fptype* tmpA;
int ldtmpA;
if (m == n)
{
transpose_square_inplace(a, lda, m);
tmpA = a;
ldtmpA = lda;
}
else
{
tmpAMemHolder.resize(m*n);
tmpA = &tmpAMemHolder.front();
ldtmpA = m;
transpose(a, lda, tmpA, m, m, n);
}
int lwork = -1;
fptype work1 = 0.;
if (b)
{
if (k == 1 && b_step == sizeof(fptype))
{
if (typeid(fptype) == typeid(float))
sgels_(mode, &m, &n, &k, (float*)tmpA, &ldtmpA, (float*)b, &m, (float*)&work1, &lwork, info);
else if (typeid(fptype) == typeid(double))
dgels_(mode, &m, &n, &k, (double*)tmpA, &ldtmpA, (double*)b, &m, (double*)&work1, &lwork, info);
lwork = (int)round(work1); //optimal buffer size
std::vector<fptype> workBufMemHolder(lwork + 1);
fptype* buffer = &workBufMemHolder.front();
if (typeid(fptype) == typeid(float))
sgels_(mode, &m, &n, &k, (float*)tmpA, &ldtmpA, (float*)b, &m, (float*)buffer, &lwork, info);
else if (typeid(fptype) == typeid(double))
dgels_(mode, &m, &n, &k, (double*)tmpA, &ldtmpA, (double*)b, &m, (double*)buffer, &lwork, info);
}
else
{
std::vector<fptype> tmpBMemHolder(m*k);
fptype* tmpB = &tmpBMemHolder.front();
int ldb = b_step / sizeof(fptype);
transpose(b, ldb, tmpB, m, m, k);
if (typeid(fptype) == typeid(float))
sgels_(mode, &m, &n, &k, (float*)tmpA, &ldtmpA, (float*)tmpB, &m, (float*)&work1, &lwork, info);
else if (typeid(fptype) == typeid(double))
dgels_(mode, &m, &n, &k, (double*)tmpA, &ldtmpA, (double*)tmpB, &m, (double*)&work1, &lwork, info);
lwork = (int)round(work1); //optimal buffer size
std::vector<fptype> workBufMemHolder(lwork + 1);
fptype* buffer = &workBufMemHolder.front();
if (typeid(fptype) == typeid(float))
sgels_(mode, &m, &n, &k, (float*)tmpA, &ldtmpA, (float*)tmpB, &m, (float*)buffer, &lwork, info);
else if (typeid(fptype) == typeid(double))
dgels_(mode, &m, &n, &k, (double*)tmpA, &ldtmpA, (double*)tmpB, &m, (double*)buffer, &lwork, info);
transpose(tmpB, m, b, ldb, k, m);
}
}
else
{
if (typeid(fptype) == typeid(float))
sgeqrf_(&m, &n, (float*)tmpA, &ldtmpA, (float*)dst, (float*)&work1, &lwork, info);
else if (typeid(fptype) == typeid(double))
dgeqrf_(&m, &n, (double*)tmpA, &ldtmpA, (double*)dst, (double*)&work1, &lwork, info);
lwork = (int)round(work1); //optimal buffer size
std::vector<fptype> workBufMemHolder(lwork + 1);
fptype* buffer = &workBufMemHolder.front();
if (typeid(fptype) == typeid(float))
sgeqrf_(&m, &n, (float*)tmpA, &ldtmpA, (float*)dst, (float*)buffer, &lwork, info);
else if (typeid(fptype) == typeid(double))
dgeqrf_(&m, &n, (double*)tmpA, &ldtmpA, (double*)dst, (double*)buffer, &lwork, info);
}
if (m == n)
transpose_square_inplace(a, lda, m);
else
transpose(tmpA, m, a, lda, n, m);
if (*info != 0)
*info = 0;
else
*info = 1;
return CV_HAL_ERROR_OK;
}
template <typename fptype> static inline int
lapack_gemm(const fptype *src1, size_t src1_step, const fptype *src2, size_t src2_step, fptype alpha,
const fptype *src3, size_t src3_step, fptype beta, fptype *dst, size_t dst_step, int a_m, int a_n, int d_n, int flags)
......@@ -459,6 +561,20 @@ int lapack_SVD64f(double* a, size_t a_step, double *w, double* u, size_t u_step,
return lapack_SVD(a, a_step, w, u, u_step, vt, v_step, m, n, flags, &info);
}
int lapack_QR32f(float* src1, size_t src1_step, int m, int n, int k, float* src2, size_t src2_step, float* dst, int* info)
{
if (m < HAL_QR_SMALL_MATRIX_THRESH)
return CV_HAL_ERROR_NOT_IMPLEMENTED;
return lapack_QR(src1, src1_step, m, n, k, src2, src2_step, dst, info);
}
int lapack_QR64f(double* src1, size_t src1_step, int m, int n, int k, double* src2, size_t src2_step, double* dst, int* info)
{
if (m < HAL_QR_SMALL_MATRIX_THRESH)
return CV_HAL_ERROR_NOT_IMPLEMENTED;
return lapack_QR(src1, src1_step, m, n, k, src2, src2_step, dst, info);
}
int lapack_gemm32f(const float *src1, size_t src1_step, const float *src2, size_t src2_step, float alpha,
const float *src3, size_t src3_step, float beta, float *dst, size_t dst_step, int m, int n, int k, int flags)
{
......
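lapack_QR first transposes the row-major input into the column-major layout LAPACK expects, then follows the standard two-call workspace-query convention: xGELS/xGEQRF is invoked once with lwork = -1, which only reports the optimal workspace size in work[0], and then again with a buffer of that size. A standalone sketch of the same pattern, assuming the usual Fortran dgels_ symbol from a linked LAPACK:

```cpp
#include <vector>

// Standard LAPACK Fortran entry point: column-major storage, all arguments by pointer.
extern "C" void dgels_(char* trans, int* m, int* n, int* nrhs,
                       double* a, int* lda, double* b, int* ldb,
                       double* work, int* lwork, int* info);

// Least-squares solve min ||A*x - b|| for a column-major m x n matrix A (m >= n).
static int solveWithGels(double* a, int m, int n, double* b)
{
    char trans = 'N';
    int nrhs = 1, lda = m, ldb = m, info = 0;

    // Workspace query: with lwork = -1, dgels_ only writes the optimal size into workQuery.
    int lwork = -1;
    double workQuery = 0.0;
    dgels_(&trans, &m, &n, &nrhs, a, &lda, b, &ldb, &workQuery, &lwork, &info);

    // Allocate the reported amount and do the actual factorization + solve.
    lwork = (int)workQuery;
    std::vector<double> work(lwork);
    dgels_(&trans, &m, &n, &nrhs, a, &lda, b, &ldb, work.data(), &lwork, &info);
    return info; // 0 on success; the first n entries of b then hold the solution
}
```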
......@@ -55,6 +55,8 @@ int lapack_Cholesky32f(float* a, size_t a_step, int m, float* b, size_t b_step,
int lapack_Cholesky64f(double* a, size_t a_step, int m, double* b, size_t b_step, int n, bool* info);
int lapack_SVD32f(float* a, size_t a_step, float* w, float* u, size_t u_step, float* vt, size_t v_step, int m, int n, int flags);
int lapack_SVD64f(double* a, size_t a_step, double* w, double* u, size_t u_step, double* vt, size_t v_step, int m, int n, int flags);
int lapack_QR32f(float* src1, size_t src1_step, int m, int n, int k, float* src2, size_t src2_step, float* dst, int* info);
int lapack_QR64f(double* src1, size_t src1_step, int m, int n, int k, double* src2, size_t src2_step, double* dst, int* info);
int lapack_gemm32f(const float* src1, size_t src1_step, const float* src2, size_t src2_step,
float alpha, const float* src3, size_t src3_step, float beta, float* dst, size_t dst_step,
int m, int n, int k, int flags);
......@@ -83,6 +85,11 @@ int lapack_gemm64fc(const double* src1, size_t src1_step, const double* src2, si
#undef cv_hal_SVD64f
#define cv_hal_SVD64f lapack_SVD64f
#undef cv_hal_QR32f
#define cv_hal_QR32f lapack_QR32f
#undef cv_hal_QR64f
#define cv_hal_QR64f lapack_QR64f
#undef cv_hal_gemm32f
#define cv_hal_gemm32f lapack_gemm32f
#undef cv_hal_gemm64f
......
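The #undef/#define pairs above are the generic OpenCV HAL replacement hook: an external implementation provides a function with the hal_ni_QR64f signature and points cv_hal_QR64f at it. A hedged sketch of what such a vendor header could look like (my_hal_QR64f and the surrounding file are hypothetical, not part of this patch):

```cpp
// Hypothetical custom HAL header, included through OpenCV's HAL replacement mechanism.
#include "opencv2/core/hal/interface.h"   // assumed to provide the CV_HAL_ERROR_* codes

inline int my_hal_QR64f(double* src1, size_t src1_step, int m, int n, int k,
                        double* src2, size_t src2_step, double* dst, int* info)
{
    if (m < n)                        // only tall or square systems are handled
        return CV_HAL_ERROR_NOT_IMPLEMENTED;   // OpenCV falls back to its built-in QRImpl
    // ... vendor-specific QR factorization / solve would go here ...
    *info = 1;                        // nonzero signals a successful decomposition
    return CV_HAL_ERROR_OK;
}

#undef  cv_hal_QR64f
#define cv_hal_QR64f my_hal_QR64f
```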
......@@ -634,6 +634,27 @@ inline int hal_ni_SVD32f(float* src, size_t src_step, float* w, float* u, size_t
inline int hal_ni_SVD64f(double* src, size_t src_step, double* w, double* u, size_t u_step, double* vt, size_t vt_step, int m, int n, int flags) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
//! @}
/**
Performs QR decomposition of an \f$M\times N\f$ (\f$M \geq N\f$) matrix \f$A = Q*R\f$ and solves the matrix equation \f$A*X=B\f$.
@param src1 pointer to the input matrix \f$A\f$ stored in row-major order. After the function finishes, src1 contains the upper-triangular \f$N\times N\f$ matrix \f$R\f$.
The lower triangle of src1 is filled with the vectors of the elementary reflectors. See @cite VandLec and LAPACK's DGEQRF documentation for details.
@param src1_step number of bytes between two consecutive rows of matrix \f$A\f$.
@param m number of rows in matrix \f$A\f$.
@param n number of columns in matrix \f$A\f$.
@param k number of right-hand side vectors in the \f$M\times K\f$ matrix \f$B\f$.
@param src2 pointer to the \f$M\times K\f$ matrix \f$B\f$, the right-hand side of the system \f$A*X=B\f$, stored in row-major order.
If src2 is a null pointer, only the QR decomposition is performed. Otherwise the system is solved and src1 is used as a temporary buffer, so
after the function finishes src2 contains the solution \f$X\f$ of the system \f$A*X=B\f$.
@param src2_step number of bytes between two consecutive rows of matrix \f$B\f$.
@param dst pointer to a contiguous \f$N\times 1\f$ array for the scalar factors of the elementary reflectors. See @cite VandLec for details.
@param info indicates whether the decomposition succeeded. If *info is zero, the decomposition failed.
*/
//! @addtogroup core_hal_interface_decomp_qr QR matrix decomposition
//! @{
inline int hal_ni_QR32f(float* src1, size_t src1_step, int m, int n, int k, float* src2, size_t src2_step, float* dst, int* info) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
inline int hal_ni_QR64f(double* src1, size_t src1_step, int m, int n, int k, double* src2, size_t src2_step, double* dst, int* info) { return CV_HAL_ERROR_NOT_IMPLEMENTED; }
//! @}
//! @cond IGNORED
......@@ -643,6 +664,8 @@ inline int hal_ni_SVD64f(double* src, size_t src_step, double* w, double* u, siz
#define cv_hal_Cholesky64f hal_ni_Cholesky64f
#define cv_hal_SVD32f hal_ni_SVD32f
#define cv_hal_SVD64f hal_ni_SVD64f
#define cv_hal_QR32f hal_ni_QR32f
#define cv_hal_QR64f hal_ni_QR64f
//! @endcond
......
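For illustration, the documented behaviour can be exercised through the dispatching wrapper cv::hal::QR64f declared in hal.hpp above: a non-null right-hand side is solved in place, and hFactors may be NULL. A small hedged sketch:

```cpp
#include <opencv2/core.hpp>
#include <opencv2/core/hal/hal.hpp>
#include <iostream>

int main()
{
    // Over-determined 4x2 system A*x = b, row-major as cv::Mat stores it.
    cv::Mat A = (cv::Mat_<double>(4, 2) << 1, 1,
                                           1, 2,
                                           1, 3,
                                           1, 4);
    cv::Mat b = (cv::Mat_<double>(4, 1) << 6, 5, 7, 10);

    // Non-null b: decompose and solve in place; hFactors == NULL uses an internal buffer.
    int ok = cv::hal::QR64f(A.ptr<double>(), A.step, A.rows, A.cols,
                            b.cols, b.ptr<double>(), b.step, NULL);

    // On success the first A.cols rows of b hold the least-squares solution.
    std::cout << "ok=" << ok << "  x=" << b.rowRange(0, A.cols).t() << std::endl;
    return ok ? 0 : 1;
}
```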
......@@ -1238,9 +1238,6 @@ bool cv::solve( InputArray _src, InputArray _src2arg, OutputArray _dst, int meth
return result;
}
if( method == DECOMP_QR )
method = DECOMP_SVD;
int m = src.rows, m_ = m, n = src.cols, nb = _src2.cols;
size_t esz = CV_ELEM_SIZE(type), bufsize = 0;
size_t vstep = alignSize(n*esz, 16);
......@@ -1268,7 +1265,6 @@ bool cv::solve( InputArray _src, InputArray _src2arg, OutputArray _dst, int meth
if( is_normal )
bufsize += n*nb*esz;
if( method == DECOMP_SVD || method == DECOMP_EIG )
bufsize += n*5*esz + n*vstep + nb*sizeof(double) + 32;
......@@ -1321,6 +1317,28 @@ bool cv::solve( InputArray _src, InputArray _src2arg, OutputArray _dst, int meth
else
result = hal::Cholesky64f(a.ptr<double>(), a.step, n, dst.ptr<double>(), dst.step, nb);
}
else if( method == DECOMP_QR )
{
Mat rhsMat;
if( is_normal || m == n )
{
src2.copyTo(dst);
rhsMat = dst;
}
else
{
rhsMat = Mat(m, nb, type);
src2.copyTo(rhsMat);
}
if( type == CV_32F )
result = hal::QR32f(a.ptr<float>(), a.step, a.rows, a.cols, rhsMat.cols, rhsMat.ptr<float>(), rhsMat.step, NULL) != 0;
else
result = hal::QR64f(a.ptr<double>(), a.step, a.rows, a.cols, rhsMat.cols, rhsMat.ptr<double>(), rhsMat.step, NULL) != 0;
if (rhsMat.rows != dst.rows)
rhsMat.rowRange(0, dst.rows).copyTo(dst);
}
else
{
ptr = alignPtr(ptr, 16);
......
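With the old fallback to DECOMP_SVD removed, cv::solve now sends DECOMP_QR through the new HAL path for square systems, for least-squares problems, and in combination with DECOMP_NORMAL. A minimal sketch:

```cpp
#include <opencv2/core.hpp>
#include <iostream>

int main()
{
    // Tall 5x3 system: DECOMP_QR returns the least-squares solution directly.
    cv::Mat A(5, 3, CV_64F), b(5, 1, CV_64F), x;
    cv::randn(A, 0.0, 1.0);
    cv::randn(b, 0.0, 1.0);

    bool ok = cv::solve(A, b, x, cv::DECOMP_QR);
    std::cout << "ok=" << ok << "  residual=" << cv::norm(A * x - b) << std::endl;

    // DECOMP_NORMAL forms A^T*A and A^T*b first, then solves the square normal system.
    cv::solve(A, b, x, cv::DECOMP_QR | cv::DECOMP_NORMAL);
    return 0;
}
```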
......@@ -225,6 +225,129 @@ bool Cholesky64f(double* A, size_t astep, int m, double* b, size_t bstep, int n)
return CholImpl(A, astep, m, b, bstep, n);
}
template<typename _Tp> inline static int
sign(_Tp x)
{
if (x >= (_Tp)0)
return 1;
else
return -1;
}
template<typename _Tp> static inline int
QRImpl(_Tp* A, size_t astep, int m, int n, int k, _Tp* b, size_t bstep, _Tp* hFactors, _Tp eps)
{
astep /= sizeof(_Tp);
bstep /= sizeof(_Tp);
cv::AutoBuffer<_Tp> buffer;
size_t buf_size = m ? m + n : hFactors != NULL;
buffer.allocate(buf_size);
_Tp* vl = buffer;
if (hFactors == NULL)
hFactors = vl + m;
for (int l = 0; l < n; l++)
{
//generate vl
int vlSize = m - l;
_Tp vlNorm = (_Tp)0;
for (int i = 0; i < vlSize; i++)
{
vl[i] = A[(l + i)*astep + l];
vlNorm += vl[i] * vl[i];
}
_Tp tmpV = vl[0];
vl[0] = vl[0] + sign(vl[0])*std::sqrt(vlNorm);
vlNorm = std::sqrt(vlNorm + vl[0] * vl[0] - tmpV*tmpV);
for (int i = 0; i < vlSize; i++)
{
vl[i] /= vlNorm;
}
//multiply A_l*vl
for (int j = l; j < n; j++)
{
_Tp v_lA = (_Tp)0;
for (int i = l; i < m; i++)
{
v_lA += vl[i - l] * A[i*astep + j];
}
for (int i = l; i < m; i++)
{
A[i*astep + j] -= 2 * vl[i - l] * v_lA;
}
}
//save vl and factors
hFactors[l] = vl[0] * vl[0];
for (int i = 1; i < vlSize; i++)
{
A[(l + i)*astep + l] = vl[i] / vl[0];
}
}
if (b)
{
//generate new rhs
for (int l = 0; l < n; l++)
{
//unpack vl
vl[0] = (_Tp)1;
for (int j = 1; j < m - l; j++)
{
vl[j] = A[(j + l)*astep + l];
}
//h_l*x
for (int j = 0; j < k; j++)
{
_Tp v_lB = (_Tp)0;
for (int i = l; i < m; i++)
v_lB += vl[i - l] * b[i*bstep + j];
for (int i = l; i < m; i++)
b[i*bstep + j] -= 2 * vl[i - l] * v_lB * hFactors[l];
}
}
//do back substitution
for (int i = n - 1; i >= 0; i--)
{
for (int j = n - 1; j > i; j--)
{
for (int p = 0; p < k; p++)
b[i*bstep + p] -= b[j*bstep + p] * A[i*astep + j];
}
if (std::abs(A[i*astep + i]) < eps)
return 0;
for (int p = 0; p < k; p++)
b[i*bstep + p] /= A[i*astep + i];
}
}
return 1;
}
int QR32f(float* A, size_t astep, int m, int n, int k, float* b, size_t bstep, float* hFactors)
{
CV_INSTRUMENT_REGION()
int output;
CALL_HAL_RET(QR32f, cv_hal_QR32f, output, A, astep, m, n, k, b, bstep, hFactors);
output = QRImpl(A, astep, m, n, k, b, bstep, hFactors, FLT_EPSILON * 10);
return output;
}
int QR64f(double* A, size_t astep, int m, int n, int k, double* b, size_t bstep, double* hFactors)
{
CV_INSTRUMENT_REGION()
int output;
CALL_HAL_RET(QR64f, cv_hal_QR64f, output, A, astep, m, n, k, b, bstep, hFactors)
output = QRImpl(A, astep, m, n, k, b, bstep, hFactors, DBL_EPSILON * 100);
return output;
}
//=============================================================================
// for compatibility with 3.0
......
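QRImpl is a plain Householder QR: at step l it builds a reflector from the sub-column \f$x_l = A[l{:}m,\, l]\f$ of the partially reduced matrix and applies it to the trailing block and, when a right-hand side is given, to \f$B\f$ as well. In the notation of @cite VandLec the loop computes:

```latex
v_l = x_l + \operatorname{sign}(x_{l,1})\,\lVert x_l \rVert\, e_1, \qquad
H_l = I - 2\,\frac{v_l v_l^{T}}{\lVert v_l \rVert^{2}}, \qquad
H_n \cdots H_2 H_1 A = R
```

The code normalizes \f$v_l\f$ to unit length, stores \f$v_l / v_{l,1}\f$ in the lower triangle of A and \f$v_{l,1}^2\f$ in hFactors[l], which is why the right-hand-side update multiplies by 2 * hFactors[l].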
......@@ -2994,6 +2994,51 @@ TEST(Core_Cholesky, accuracy64f)
for (int i = 0; i < A.rows; i++)
for (int j = i + 1; j < A.cols; j++)
A.at<double>(i, j) = 0.0;
EXPECT_TRUE(norm(refA - A*A.t()) < 10e-5);
EXPECT_LE(norm(refA, A*A.t(), CV_RELATIVE_L2), FLT_EPSILON);
}
TEST(Core_QR_Solver, accuracy64f)
{
int m = 20, n = 18;
Mat A(m, m, CV_64F);
Mat B(m, n, CV_64F);
Mat mean(1, 1, CV_64F);
*mean.ptr<double>() = 10.0;
Mat dev(1, 1, CV_64F);
*dev.ptr<double>() = 10.0;
RNG rng(10);
rng.fill(A, RNG::NORMAL, mean, dev);
rng.fill(B, RNG::NORMAL, mean, dev);
A = A*A.t();
Mat solutionQR;
//solve system with square matrix
solve(A, B, solutionQR, DECOMP_QR);
EXPECT_LE(norm(A*solutionQR, B, CV_RELATIVE_L2), FLT_EPSILON);
A = Mat(m, n, CV_64F);
B = Mat(m, n, CV_64F);
rng.fill(A, RNG::NORMAL, mean, dev);
rng.fill(B, RNG::NORMAL, mean, dev);
//solve normal system
solve(A, B, solutionQR, DECOMP_QR | DECOMP_NORMAL);
EXPECT_LE(norm(A.t()*(A*solutionQR), A.t()*B, CV_RELATIVE_L2), FLT_EPSILON);
//solve overdetermined system as a least squares problem
Mat solutionSVD;
solve(A, B, solutionQR, DECOMP_QR);
solve(A, B, solutionSVD, DECOMP_SVD);
EXPECT_LE(norm(solutionQR, solutionSVD, CV_RELATIVE_L2), FLT_EPSILON);
//solve system with singular matrix
A = Mat(10, 10, CV_64F);
B = Mat(10, 1, CV_64F);
rng.fill(A, RNG::NORMAL, mean, dev);
rng.fill(B, RNG::NORMAL, mean, dev);
for (int i = 0; i < A.cols; i++)
A.at<double>(0, i) = A.at<double>(1, i);
ASSERT_FALSE(solve(A, B, solutionQR, DECOMP_QR));
}
/* End of file. */