提交 65eb9467 编写于 作者: A Alexander Alekhin

core: rework code locality

- to reduce binaries size of FFmpeg Windows wrapper
- MinGW linker doesn't support -ffunction-sections (used for FFmpeg Windows wrapper)
- move code to improve locality with its used dependencies
- move UMat::dot() to matmul.dispatch.cpp (Mat::dot() is already there)
- move UMat::inv() to lapack.cpp
- move UMat::mul() to arithm.cpp
- move UMat:eye() to matrix_operations.cpp (near setIdentity() implementation)
- move normalize(): convert_scale.cpp => norm.cpp
- move convertAndUnrollScalar(): arithm.cpp => copy.cpp
- move scalarToRawData(): array.cpp => copy.cpp
- move transpose(): matrix_operations.cpp => matrix_transform.cpp
- move flip(), rotate(): copy.cpp => matrix_transform.cpp (rotate90 uses flip and transpose)
- add 'OPENCV_CORE_EXCLUDE_C_API' CMake variable to exclude compilation of C-API functions from the core module
- matrix_wrap.cpp: add compile-time checks for CUDA/OpenGL calls
- the steps above allow to reduce FFmpeg wrapper size for ~1.5Mb (initial size of OpenCV part is about 3Mb)
上级 7bcb51ed
......@@ -145,6 +145,10 @@ ocv_target_link_libraries(${the_module} PRIVATE
"${OPENCV_HAL_LINKER_LIBS}"
)
if(OPENCV_CORE_EXCLUDE_C_API)
ocv_target_compile_definitions(${the_module} PRIVATE "OPENCV_EXCLUDE_C_API=1")
endif()
if(HAVE_HPX)
ocv_target_link_libraries(${the_module} LINK_PRIVATE "${HPX_LIBRARIES}")
endif()
......
......@@ -57,26 +57,6 @@ namespace cv
* logical operations *
\****************************************************************************************/
void convertAndUnrollScalar( const Mat& sc, int buftype, uchar* scbuf, size_t blocksize )
{
int scn = (int)sc.total(), cn = CV_MAT_CN(buftype);
size_t esz = CV_ELEM_SIZE(buftype);
BinaryFunc cvtFn = getConvertFunc(sc.depth(), buftype);
CV_Assert(cvtFn);
cvtFn(sc.ptr(), 1, 0, 1, scbuf, 1, Size(std::min(cn, scn), 1), 0);
// unroll the scalar
if( scn < cn )
{
CV_Assert( scn == 1 );
size_t esz1 = CV_ELEM_SIZE1(buftype);
for( size_t i = esz1; i < esz; i++ )
scbuf[i] = scbuf[i - esz1];
}
for( size_t i = esz; i < blocksize*esz; i++ )
scbuf[i] = scbuf[i - esz];
}
enum { OCL_OP_ADD=0, OCL_OP_SUB=1, OCL_OP_RSUB=2, OCL_OP_ABSDIFF=3, OCL_OP_MUL=4,
OCL_OP_MUL_SCALE=5, OCL_OP_DIV_SCALE=6, OCL_OP_RECIP_SCALE=7, OCL_OP_ADDW=8,
OCL_OP_AND=9, OCL_OP_OR=10, OCL_OP_XOR=11, OCL_OP_NOT=12, OCL_OP_MIN=13, OCL_OP_MAX=14,
......@@ -1002,9 +982,7 @@ static BinaryFuncC* getRecipTab()
return recipTab;
}
}
void cv::multiply(InputArray src1, InputArray src2,
void multiply(InputArray src1, InputArray src2,
OutputArray dst, double scale, int dtype)
{
CV_INSTRUMENT_REGION();
......@@ -1013,7 +991,7 @@ void cv::multiply(InputArray src1, InputArray src2,
true, &scale, std::abs(scale - 1.0) < DBL_EPSILON ? OCL_OP_MUL : OCL_OP_MUL_SCALE);
}
void cv::divide(InputArray src1, InputArray src2,
void divide(InputArray src1, InputArray src2,
OutputArray dst, double scale, int dtype)
{
CV_INSTRUMENT_REGION();
......@@ -1021,7 +999,7 @@ void cv::divide(InputArray src1, InputArray src2,
arithm_op(src1, src2, dst, noArray(), dtype, getDivTab(), true, &scale, OCL_OP_DIV_SCALE);
}
void cv::divide(double scale, InputArray src2,
void divide(double scale, InputArray src2,
OutputArray dst, int dtype)
{
CV_INSTRUMENT_REGION();
......@@ -1029,13 +1007,17 @@ void cv::divide(double scale, InputArray src2,
arithm_op(src2, src2, dst, noArray(), dtype, getRecipTab(), true, &scale, OCL_OP_RECIP_SCALE);
}
UMat UMat::mul(InputArray m, double scale) const
{
UMat dst;
multiply(*this, m, dst, scale);
return dst;
}
/****************************************************************************************\
* addWeighted *
\****************************************************************************************/
namespace cv
{
static BinaryFuncC* getAddWeightedTab()
{
static BinaryFuncC addWeightedTab[] =
......@@ -1849,6 +1831,9 @@ void cv::inRange(InputArray _src, InputArray _lowerb,
}
}
#ifndef OPENCV_EXCLUDE_C_API
/****************************************************************************************\
* Earlier API: cvAdd etc. *
\****************************************************************************************/
......@@ -2111,4 +2096,5 @@ cvMaxS( const void* srcarr1, double value, void* dstarr )
cv::max( src1, value, dst );
}
#endif // OPENCV_EXCLUDE_C_API
/* End of file. */
......@@ -48,6 +48,8 @@
#include "precomp.hpp"
#ifndef OPENCV_EXCLUDE_C_API
#define CV_ORIGIN_TL 0
#define CV_ORIGIN_BL 1
......@@ -3211,53 +3213,6 @@ void DefaultDeleter<CvMatND>::operator ()(CvMatND* obj) const { cvReleaseMatND(&
void DefaultDeleter<CvSparseMat>::operator ()(CvSparseMat* obj) const { cvReleaseSparseMat(&obj); }
void DefaultDeleter<CvMemStorage>::operator ()(CvMemStorage* obj) const { cvReleaseMemStorage(&obj); }
template <typename T> static inline
void scalarToRawData_(const Scalar& s, T * const buf, const int cn, const int unroll_to)
{
int i = 0;
for(; i < cn; i++)
buf[i] = saturate_cast<T>(s.val[i]);
for(; i < unroll_to; i++)
buf[i] = buf[i-cn];
}
void scalarToRawData(const Scalar& s, void* _buf, int type, int unroll_to)
{
CV_INSTRUMENT_REGION();
const int depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
CV_Assert(cn <= 4);
switch(depth)
{
case CV_8U:
scalarToRawData_<uchar>(s, (uchar*)_buf, cn, unroll_to);
break;
case CV_8S:
scalarToRawData_<schar>(s, (schar*)_buf, cn, unroll_to);
break;
case CV_16U:
scalarToRawData_<ushort>(s, (ushort*)_buf, cn, unroll_to);
break;
case CV_16S:
scalarToRawData_<short>(s, (short*)_buf, cn, unroll_to);
break;
case CV_32S:
scalarToRawData_<int>(s, (int*)_buf, cn, unroll_to);
break;
case CV_32F:
scalarToRawData_<float>(s, (float*)_buf, cn, unroll_to);
break;
case CV_64F:
scalarToRawData_<double>(s, (double*)_buf, cn, unroll_to);
break;
case CV_16F:
scalarToRawData_<float16_t>(s, (float16_t*)_buf, cn, unroll_to);
break;
default:
CV_Error(CV_StsUnsupportedFormat,"");
}
}
} // cv::
......@@ -3295,4 +3250,5 @@ void* cvClone( const void* struct_ptr )
}
#endif // OPENCV_EXCLUDE_C_API
/* End of file. */
......@@ -5,6 +5,7 @@
#include "precomp.hpp"
#ifndef OPENCV_EXCLUDE_C_API
CV_IMPL void
cvSplit( const void* srcarr, void* dstarr0, void* dstarr1, void* dstarr2, void* dstarr3 )
......@@ -132,3 +133,5 @@ CV_IMPL void cvNormalize( const CvArr* srcarr, CvArr* dstarr,
CV_Assert( dst.size() == src.size() && src.channels() == dst.channels() );
cv::normalize( src, dst, a, b, norm_type, dst.type(), mask );
}
#endif // OPENCV_EXCLUDE_C_API
......@@ -9,7 +9,6 @@
#include "convert_scale.simd.hpp"
#include "convert_scale.simd_declarations.hpp" // defines CV_CPU_DISPATCH_MODES_ALL=AVX2,...,BASELINE based on CMakeLists.txt content
namespace cv
{
......@@ -117,143 +116,4 @@ void convertScaleAbs(InputArray _src, OutputArray _dst, double alpha, double bet
}
}
//==================================================================================================
#ifdef HAVE_OPENCL
static bool ocl_normalize( InputArray _src, InputOutputArray _dst, InputArray _mask, int dtype,
double scale, double delta )
{
UMat src = _src.getUMat();
if( _mask.empty() )
src.convertTo( _dst, dtype, scale, delta );
else if (src.channels() <= 4)
{
const ocl::Device & dev = ocl::Device::getDefault();
int stype = _src.type(), sdepth = CV_MAT_DEPTH(stype), cn = CV_MAT_CN(stype),
ddepth = CV_MAT_DEPTH(dtype), wdepth = std::max(CV_32F, std::max(sdepth, ddepth)),
rowsPerWI = dev.isIntel() ? 4 : 1;
float fscale = static_cast<float>(scale), fdelta = static_cast<float>(delta);
bool haveScale = std::fabs(scale - 1) > DBL_EPSILON,
haveZeroScale = !(std::fabs(scale) > DBL_EPSILON),
haveDelta = std::fabs(delta) > DBL_EPSILON,
doubleSupport = dev.doubleFPConfig() > 0;
if (!haveScale && !haveDelta && stype == dtype)
{
_src.copyTo(_dst, _mask);
return true;
}
if (haveZeroScale)
{
_dst.setTo(Scalar(delta), _mask);
return true;
}
if ((sdepth == CV_64F || ddepth == CV_64F) && !doubleSupport)
return false;
char cvt[2][40];
String opts = format("-D srcT=%s -D dstT=%s -D convertToWT=%s -D cn=%d -D rowsPerWI=%d"
" -D convertToDT=%s -D workT=%s%s%s%s -D srcT1=%s -D dstT1=%s",
ocl::typeToStr(stype), ocl::typeToStr(dtype),
ocl::convertTypeStr(sdepth, wdepth, cn, cvt[0]), cn,
rowsPerWI, ocl::convertTypeStr(wdepth, ddepth, cn, cvt[1]),
ocl::typeToStr(CV_MAKE_TYPE(wdepth, cn)),
doubleSupport ? " -D DOUBLE_SUPPORT" : "",
haveScale ? " -D HAVE_SCALE" : "",
haveDelta ? " -D HAVE_DELTA" : "",
ocl::typeToStr(sdepth), ocl::typeToStr(ddepth));
ocl::Kernel k("normalizek", ocl::core::normalize_oclsrc, opts);
if (k.empty())
return false;
UMat mask = _mask.getUMat(), dst = _dst.getUMat();
ocl::KernelArg srcarg = ocl::KernelArg::ReadOnlyNoSize(src),
maskarg = ocl::KernelArg::ReadOnlyNoSize(mask),
dstarg = ocl::KernelArg::ReadWrite(dst);
if (haveScale)
{
if (haveDelta)
k.args(srcarg, maskarg, dstarg, fscale, fdelta);
else
k.args(srcarg, maskarg, dstarg, fscale);
}
else
{
if (haveDelta)
k.args(srcarg, maskarg, dstarg, fdelta);
else
k.args(srcarg, maskarg, dstarg);
}
size_t globalsize[2] = { (size_t)src.cols, ((size_t)src.rows + rowsPerWI - 1) / rowsPerWI };
return k.run(2, globalsize, NULL, false);
}
else
{
UMat temp;
src.convertTo( temp, dtype, scale, delta );
temp.copyTo( _dst, _mask );
}
return true;
}
#endif
void normalize(InputArray _src, InputOutputArray _dst, double a, double b,
int norm_type, int rtype, InputArray _mask)
{
CV_INSTRUMENT_REGION();
double scale = 1, shift = 0;
int type = _src.type(), depth = CV_MAT_DEPTH(type);
if( rtype < 0 )
rtype = _dst.fixedType() ? _dst.depth() : depth;
if( norm_type == CV_MINMAX )
{
double smin = 0, smax = 0;
double dmin = MIN( a, b ), dmax = MAX( a, b );
minMaxIdx( _src, &smin, &smax, 0, 0, _mask );
scale = (dmax - dmin)*(smax - smin > DBL_EPSILON ? 1./(smax - smin) : 0);
if( rtype == CV_32F )
{
scale = (float)scale;
shift = (float)dmin - (float)(smin*scale);
}
else
shift = dmin - smin*scale;
}
else if( norm_type == CV_L2 || norm_type == CV_L1 || norm_type == CV_C )
{
scale = norm( _src, norm_type, _mask );
scale = scale > DBL_EPSILON ? a/scale : 0.;
shift = 0;
}
else
CV_Error( CV_StsBadArg, "Unknown/unsupported norm type" );
CV_OCL_RUN(_dst.isUMat(),
ocl_normalize(_src, _dst, _mask, rtype, scale, shift))
Mat src = _src.getMat();
if( _mask.empty() )
src.convertTo( _dst, rtype, scale, shift );
else
{
Mat temp;
src.convertTo( temp, rtype, scale, shift );
temp.copyTo( _dst, _mask );
}
}
} // namespace
......@@ -53,6 +53,72 @@
namespace cv
{
template <typename T> static inline
void scalarToRawData_(const Scalar& s, T * const buf, const int cn, const int unroll_to)
{
int i = 0;
for(; i < cn; i++)
buf[i] = saturate_cast<T>(s.val[i]);
for(; i < unroll_to; i++)
buf[i] = buf[i-cn];
}
void scalarToRawData(const Scalar& s, void* _buf, int type, int unroll_to)
{
CV_INSTRUMENT_REGION();
const int depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
CV_Assert(cn <= 4);
switch(depth)
{
case CV_8U:
scalarToRawData_<uchar>(s, (uchar*)_buf, cn, unroll_to);
break;
case CV_8S:
scalarToRawData_<schar>(s, (schar*)_buf, cn, unroll_to);
break;
case CV_16U:
scalarToRawData_<ushort>(s, (ushort*)_buf, cn, unroll_to);
break;
case CV_16S:
scalarToRawData_<short>(s, (short*)_buf, cn, unroll_to);
break;
case CV_32S:
scalarToRawData_<int>(s, (int*)_buf, cn, unroll_to);
break;
case CV_32F:
scalarToRawData_<float>(s, (float*)_buf, cn, unroll_to);
break;
case CV_64F:
scalarToRawData_<double>(s, (double*)_buf, cn, unroll_to);
break;
case CV_16F:
scalarToRawData_<float16_t>(s, (float16_t*)_buf, cn, unroll_to);
break;
default:
CV_Error(CV_StsUnsupportedFormat,"");
}
}
void convertAndUnrollScalar( const Mat& sc, int buftype, uchar* scbuf, size_t blocksize )
{
int scn = (int)sc.total(), cn = CV_MAT_CN(buftype);
size_t esz = CV_ELEM_SIZE(buftype);
BinaryFunc cvtFn = getConvertFunc(sc.depth(), buftype);
CV_Assert(cvtFn);
cvtFn(sc.ptr(), 1, 0, 1, scbuf, 1, Size(std::min(cn, scn), 1), 0);
// unroll the scalar
if( scn < cn )
{
CV_Assert( scn == 1 );
size_t esz1 = CV_ELEM_SIZE1(buftype);
for( size_t i = esz1; i < esz; i++ )
scbuf[i] = scbuf[i - esz1];
}
for( size_t i = esz; i < blocksize*esz; i++ )
scbuf[i] = scbuf[i - esz];
}
template<typename T> static void
copyMask_(const uchar* _src, size_t sstep, const uchar* mask, size_t mstep, uchar* _dst, size_t dstep, Size size)
{
......@@ -594,490 +660,6 @@ Mat& Mat::setTo(InputArray _value, InputArray _mask)
return *this;
}
#if CV_SIMD128
template<typename V> CV_ALWAYS_INLINE void flipHoriz_single( const uchar* src, size_t sstep, uchar* dst, size_t dstep, Size size, size_t esz )
{
typedef typename V::lane_type T;
int end = (int)(size.width*esz);
int width = (end + 1)/2;
int width_1 = width & -v_uint8x16::nlanes;
int i, j;
#if CV_STRONG_ALIGNMENT
CV_Assert(isAligned<sizeof(T)>(src, dst));
#endif
for( ; size.height--; src += sstep, dst += dstep )
{
for( i = 0, j = end; i < width_1; i += v_uint8x16::nlanes, j -= v_uint8x16::nlanes )
{
V t0, t1;
t0 = v_load((T*)((uchar*)src + i));
t1 = v_load((T*)((uchar*)src + j - v_uint8x16::nlanes));
t0 = v_reverse(t0);
t1 = v_reverse(t1);
v_store((T*)(dst + j - v_uint8x16::nlanes), t0);
v_store((T*)(dst + i), t1);
}
if (isAligned<sizeof(T)>(src, dst))
{
for ( ; i < width; i += sizeof(T), j -= sizeof(T) )
{
T t0, t1;
t0 = *((T*)((uchar*)src + i));
t1 = *((T*)((uchar*)src + j - sizeof(T)));
*((T*)(dst + j - sizeof(T))) = t0;
*((T*)(dst + i)) = t1;
}
}
else
{
for ( ; i < width; i += sizeof(T), j -= sizeof(T) )
{
for (int k = 0; k < (int)sizeof(T); k++)
{
uchar t0, t1;
t0 = *((uchar*)src + i + k);
t1 = *((uchar*)src + j + k - sizeof(T));
*(dst + j + k - sizeof(T)) = t0;
*(dst + i + k) = t1;
}
}
}
}
}
template<typename T1, typename T2> CV_ALWAYS_INLINE void flipHoriz_double( const uchar* src, size_t sstep, uchar* dst, size_t dstep, Size size, size_t esz )
{
int end = (int)(size.width*esz);
int width = (end + 1)/2;
#if CV_STRONG_ALIGNMENT
CV_Assert(isAligned<sizeof(T1)>(src, dst));
CV_Assert(isAligned<sizeof(T2)>(src, dst));
#endif
for( ; size.height--; src += sstep, dst += dstep )
{
for ( int i = 0, j = end; i < width; i += sizeof(T1) + sizeof(T2), j -= sizeof(T1) + sizeof(T2) )
{
T1 t0, t1;
T2 t2, t3;
t0 = *((T1*)((uchar*)src + i));
t2 = *((T2*)((uchar*)src + i + sizeof(T1)));
t1 = *((T1*)((uchar*)src + j - sizeof(T1) - sizeof(T2)));
t3 = *((T2*)((uchar*)src + j - sizeof(T2)));
*((T1*)(dst + j - sizeof(T1) - sizeof(T2))) = t0;
*((T2*)(dst + j - sizeof(T2))) = t2;
*((T1*)(dst + i)) = t1;
*((T2*)(dst + i + sizeof(T1))) = t3;
}
}
}
#endif
static void
flipHoriz( const uchar* src, size_t sstep, uchar* dst, size_t dstep, Size size, size_t esz )
{
#if CV_SIMD
#if CV_STRONG_ALIGNMENT
size_t alignmentMark = ((size_t)src)|((size_t)dst)|sstep|dstep;
#endif
if (esz == 2 * v_uint8x16::nlanes)
{
int end = (int)(size.width*esz);
int width = end/2;
for( ; size.height--; src += sstep, dst += dstep )
{
for( int i = 0, j = end - 2 * v_uint8x16::nlanes; i < width; i += 2 * v_uint8x16::nlanes, j -= 2 * v_uint8x16::nlanes )
{
#if CV_SIMD256
v_uint8x32 t0, t1;
t0 = v256_load((uchar*)src + i);
t1 = v256_load((uchar*)src + j);
v_store(dst + j, t0);
v_store(dst + i, t1);
#else
v_uint8x16 t0, t1, t2, t3;
t0 = v_load((uchar*)src + i);
t1 = v_load((uchar*)src + i + v_uint8x16::nlanes);
t2 = v_load((uchar*)src + j);
t3 = v_load((uchar*)src + j + v_uint8x16::nlanes);
v_store(dst + j, t0);
v_store(dst + j + v_uint8x16::nlanes, t1);
v_store(dst + i, t2);
v_store(dst + i + v_uint8x16::nlanes, t3);
#endif
}
}
}
else if (esz == v_uint8x16::nlanes)
{
int end = (int)(size.width*esz);
int width = end/2;
for( ; size.height--; src += sstep, dst += dstep )
{
for( int i = 0, j = end - v_uint8x16::nlanes; i < width; i += v_uint8x16::nlanes, j -= v_uint8x16::nlanes )
{
v_uint8x16 t0, t1;
t0 = v_load((uchar*)src + i);
t1 = v_load((uchar*)src + j);
v_store(dst + j, t0);
v_store(dst + i, t1);
}
}
}
else if (esz == 8
#if CV_STRONG_ALIGNMENT
&& isAligned<sizeof(uint64)>(alignmentMark)
#endif
)
{
flipHoriz_single<v_uint64x2>(src, sstep, dst, dstep, size, esz);
}
else if (esz == 4
#if CV_STRONG_ALIGNMENT
&& isAligned<sizeof(unsigned)>(alignmentMark)
#endif
)
{
flipHoriz_single<v_uint32x4>(src, sstep, dst, dstep, size, esz);
}
else if (esz == 2
#if CV_STRONG_ALIGNMENT
&& isAligned<sizeof(ushort)>(alignmentMark)
#endif
)
{
flipHoriz_single<v_uint16x8>(src, sstep, dst, dstep, size, esz);
}
else if (esz == 1)
{
flipHoriz_single<v_uint8x16>(src, sstep, dst, dstep, size, esz);
}
else if (esz == 24
#if CV_STRONG_ALIGNMENT
&& isAligned<sizeof(uint64_t)>(alignmentMark)
#endif
)
{
int end = (int)(size.width*esz);
int width = (end + 1)/2;
for( ; size.height--; src += sstep, dst += dstep )
{
for ( int i = 0, j = end; i < width; i += v_uint8x16::nlanes + sizeof(uint64_t), j -= v_uint8x16::nlanes + sizeof(uint64_t) )
{
v_uint8x16 t0, t1;
uint64_t t2, t3;
t0 = v_load((uchar*)src + i);
t2 = *((uint64_t*)((uchar*)src + i + v_uint8x16::nlanes));
t1 = v_load((uchar*)src + j - v_uint8x16::nlanes - sizeof(uint64_t));
t3 = *((uint64_t*)((uchar*)src + j - sizeof(uint64_t)));
v_store(dst + j - v_uint8x16::nlanes - sizeof(uint64_t), t0);
*((uint64_t*)(dst + j - sizeof(uint64_t))) = t2;
v_store(dst + i, t1);
*((uint64_t*)(dst + i + v_uint8x16::nlanes)) = t3;
}
}
}
#if !CV_STRONG_ALIGNMENT
else if (esz == 12)
{
flipHoriz_double<uint64_t,uint>(src, sstep, dst, dstep, size, esz);
}
else if (esz == 6)
{
flipHoriz_double<uint,ushort>(src, sstep, dst, dstep, size, esz);
}
else if (esz == 3)
{
flipHoriz_double<ushort,uchar>(src, sstep, dst, dstep, size, esz);
}
#endif
else
#endif // CV_SIMD
{
int i, j, limit = (int)(((size.width + 1)/2)*esz);
AutoBuffer<int> _tab(size.width*esz);
int* tab = _tab.data();
for( i = 0; i < size.width; i++ )
for( size_t k = 0; k < esz; k++ )
tab[i*esz + k] = (int)((size.width - i - 1)*esz + k);
for( ; size.height--; src += sstep, dst += dstep )
{
for( i = 0; i < limit; i++ )
{
j = tab[i];
uchar t0 = src[i], t1 = src[j];
dst[i] = t1; dst[j] = t0;
}
}
}
}
static void
flipVert( const uchar* src0, size_t sstep, uchar* dst0, size_t dstep, Size size, size_t esz )
{
const uchar* src1 = src0 + (size.height - 1)*sstep;
uchar* dst1 = dst0 + (size.height - 1)*dstep;
size.width *= (int)esz;
for( int y = 0; y < (size.height + 1)/2; y++, src0 += sstep, src1 -= sstep,
dst0 += dstep, dst1 -= dstep )
{
int i = 0;
#if CV_SIMD
#if CV_STRONG_ALIGNMENT
if (isAligned<sizeof(int)>(src0, src1, dst0, dst1))
#endif
{
for (; i <= size.width - CV_SIMD_WIDTH; i += CV_SIMD_WIDTH)
{
v_int32 t0 = vx_load((int*)(src0 + i));
v_int32 t1 = vx_load((int*)(src1 + i));
vx_store((int*)(dst0 + i), t1);
vx_store((int*)(dst1 + i), t0);
}
}
#if CV_STRONG_ALIGNMENT
else
{
for (; i <= size.width - CV_SIMD_WIDTH; i += CV_SIMD_WIDTH)
{
v_uint8 t0 = vx_load(src0 + i);
v_uint8 t1 = vx_load(src1 + i);
vx_store(dst0 + i, t1);
vx_store(dst1 + i, t0);
}
}
#endif
#endif
if (isAligned<sizeof(int)>(src0, src1, dst0, dst1))
{
for( ; i <= size.width - 16; i += 16 )
{
int t0 = ((int*)(src0 + i))[0];
int t1 = ((int*)(src1 + i))[0];
((int*)(dst0 + i))[0] = t1;
((int*)(dst1 + i))[0] = t0;
t0 = ((int*)(src0 + i))[1];
t1 = ((int*)(src1 + i))[1];
((int*)(dst0 + i))[1] = t1;
((int*)(dst1 + i))[1] = t0;
t0 = ((int*)(src0 + i))[2];
t1 = ((int*)(src1 + i))[2];
((int*)(dst0 + i))[2] = t1;
((int*)(dst1 + i))[2] = t0;
t0 = ((int*)(src0 + i))[3];
t1 = ((int*)(src1 + i))[3];
((int*)(dst0 + i))[3] = t1;
((int*)(dst1 + i))[3] = t0;
}
for( ; i <= size.width - 4; i += 4 )
{
int t0 = ((int*)(src0 + i))[0];
int t1 = ((int*)(src1 + i))[0];
((int*)(dst0 + i))[0] = t1;
((int*)(dst1 + i))[0] = t0;
}
}
for( ; i < size.width; i++ )
{
uchar t0 = src0[i];
uchar t1 = src1[i];
dst0[i] = t1;
dst1[i] = t0;
}
}
}
#ifdef HAVE_OPENCL
enum { FLIP_COLS = 1 << 0, FLIP_ROWS = 1 << 1, FLIP_BOTH = FLIP_ROWS | FLIP_COLS };
static bool ocl_flip(InputArray _src, OutputArray _dst, int flipCode )
{
CV_Assert(flipCode >= -1 && flipCode <= 1);
const ocl::Device & dev = ocl::Device::getDefault();
int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type),
flipType, kercn = std::min(ocl::predictOptimalVectorWidth(_src, _dst), 4);
bool doubleSupport = dev.doubleFPConfig() > 0;
if (!doubleSupport && depth == CV_64F)
kercn = cn;
if (cn > 4)
return false;
const char * kernelName;
if (flipCode == 0)
kernelName = "arithm_flip_rows", flipType = FLIP_ROWS;
else if (flipCode > 0)
kernelName = "arithm_flip_cols", flipType = FLIP_COLS;
else
kernelName = "arithm_flip_rows_cols", flipType = FLIP_BOTH;
int pxPerWIy = (dev.isIntel() && (dev.type() & ocl::Device::TYPE_GPU)) ? 4 : 1;
kercn = (cn!=3 || flipType == FLIP_ROWS) ? std::max(kercn, cn) : cn;
ocl::Kernel k(kernelName, ocl::core::flip_oclsrc,
format( "-D T=%s -D T1=%s -D DEPTH=%d -D cn=%d -D PIX_PER_WI_Y=%d -D kercn=%d",
kercn != cn ? ocl::typeToStr(CV_MAKE_TYPE(depth, kercn)) : ocl::vecopTypeToStr(CV_MAKE_TYPE(depth, kercn)),
kercn != cn ? ocl::typeToStr(depth) : ocl::vecopTypeToStr(depth), depth, cn, pxPerWIy, kercn));
if (k.empty())
return false;
Size size = _src.size();
_dst.create(size, type);
UMat src = _src.getUMat(), dst = _dst.getUMat();
int cols = size.width * cn / kercn, rows = size.height;
cols = flipType == FLIP_COLS ? (cols + 1) >> 1 : cols;
rows = flipType & FLIP_ROWS ? (rows + 1) >> 1 : rows;
k.args(ocl::KernelArg::ReadOnlyNoSize(src),
ocl::KernelArg::WriteOnly(dst, cn, kercn), rows, cols);
size_t maxWorkGroupSize = dev.maxWorkGroupSize();
CV_Assert(maxWorkGroupSize % 4 == 0);
size_t globalsize[2] = { (size_t)cols, ((size_t)rows + pxPerWIy - 1) / pxPerWIy },
localsize[2] = { maxWorkGroupSize / 4, 4 };
return k.run(2, globalsize, (flipType == FLIP_COLS) && !dev.isIntel() ? localsize : NULL, false);
}
#endif
#if defined HAVE_IPP
static bool ipp_flip(Mat &src, Mat &dst, int flip_mode)
{
#ifdef HAVE_IPP_IW
CV_INSTRUMENT_REGION_IPP();
// Details: https://github.com/opencv/opencv/issues/12943
if (flip_mode <= 0 /* swap rows */
&& cv::ipp::getIppTopFeatures() != ippCPUID_SSE42
&& (int64_t)(src.total()) * src.elemSize() >= CV_BIG_INT(0x80000000)/*2Gb*/
)
return false;
IppiAxis ippMode;
if(flip_mode < 0)
ippMode = ippAxsBoth;
else if(flip_mode == 0)
ippMode = ippAxsHorizontal;
else
ippMode = ippAxsVertical;
try
{
::ipp::IwiImage iwSrc = ippiGetImage(src);
::ipp::IwiImage iwDst = ippiGetImage(dst);
CV_INSTRUMENT_FUN_IPP(::ipp::iwiMirror, iwSrc, iwDst, ippMode);
}
catch(const ::ipp::IwException &)
{
return false;
}
return true;
#else
CV_UNUSED(src); CV_UNUSED(dst); CV_UNUSED(flip_mode);
return false;
#endif
}
#endif
void flip( InputArray _src, OutputArray _dst, int flip_mode )
{
CV_INSTRUMENT_REGION();
CV_Assert( _src.dims() <= 2 );
Size size = _src.size();
if (flip_mode < 0)
{
if (size.width == 1)
flip_mode = 0;
if (size.height == 1)
flip_mode = 1;
}
if ((size.width == 1 && flip_mode > 0) ||
(size.height == 1 && flip_mode == 0))
{
return _src.copyTo(_dst);
}
CV_OCL_RUN( _dst.isUMat(), ocl_flip(_src, _dst, flip_mode))
Mat src = _src.getMat();
int type = src.type();
_dst.create( size, type );
Mat dst = _dst.getMat();
CV_IPP_RUN_FAST(ipp_flip(src, dst, flip_mode));
size_t esz = CV_ELEM_SIZE(type);
if( flip_mode <= 0 )
flipVert( src.ptr(), src.step, dst.ptr(), dst.step, src.size(), esz );
else
flipHoriz( src.ptr(), src.step, dst.ptr(), dst.step, src.size(), esz );
if( flip_mode < 0 )
flipHoriz( dst.ptr(), dst.step, dst.ptr(), dst.step, dst.size(), esz );
}
void rotate(InputArray _src, OutputArray _dst, int rotateMode)
{
CV_Assert(_src.dims() <= 2);
switch (rotateMode)
{
case ROTATE_90_CLOCKWISE:
transpose(_src, _dst);
flip(_dst, _dst, 1);
break;
case ROTATE_180:
flip(_src, _dst, -1);
break;
case ROTATE_90_COUNTERCLOCKWISE:
transpose(_src, _dst);
flip(_dst, _dst, 0);
break;
default:
break;
}
}
#if defined HAVE_OPENCL && !defined __APPLE__
......@@ -1499,6 +1081,9 @@ void cv::copyMakeBorder( InputArray _src, OutputArray _dst, int top, int bottom,
}
}
#ifndef OPENCV_EXCLUDE_C_API
/* dst = src */
CV_IMPL void
cvCopy( const void* srcarr, void* dstarr, const void* maskarr )
......@@ -1614,4 +1199,5 @@ cvRepeat( const CvArr* srcarr, CvArr* dstarr )
cv::repeat(src, dst.rows/src.rows, dst.cols/src.cols, dst);
}
#endif // OPENCV_EXCLUDE_C_API
/* End of file. */
......@@ -40,6 +40,8 @@
//M*/
#include "precomp.hpp"
#ifndef OPENCV_EXCLUDE_C_API
/* default alignment for dynamic data strucutures, resided in storages. */
#define CV_STRUCT_ALIGN ((int)sizeof(double))
......@@ -3585,4 +3587,5 @@ void seqInsertSlice( CvSeq* seq, int before_index, const CvArr* from_arr )
}
#endif // OPENCV_EXCLUDE_C_API
/* End of file. */
......@@ -4640,6 +4640,9 @@ int cv::getOptimalDFTSize( int size0 )
return optimalDFTSizeTab[b];
}
#ifndef OPENCV_EXCLUDE_C_API
CV_IMPL void
cvDFT( const CvArr* srcarr, CvArr* dstarr, int flags, int nonzero_rows )
{
......@@ -4695,4 +4698,5 @@ cvGetOptimalDFTSize( int size0 )
return cv::getOptimalDFTSize(size0);
}
#endif // OPENCV_EXCLUDE_C_API
/* End of file. */
......@@ -753,8 +753,6 @@ SVBkSb( int m, int n, const double* w, size_t wstep,
(double*)alignPtr(buffer, sizeof(double)), DBL_EPSILON*2 );
}
}
/****************************************************************************************\
* Determinant of the matrix *
\****************************************************************************************/
......@@ -764,7 +762,7 @@ SVBkSb( int m, int n, const double* w, size_t wstep,
m(0,1)*((double)m(1,0)*m(2,2) - (double)m(1,2)*m(2,0)) + \
m(0,2)*((double)m(1,0)*m(2,1) - (double)m(1,1)*m(2,0)))
double cv::determinant( InputArray _mat )
double determinant( InputArray _mat )
{
CV_INSTRUMENT_REGION();
......@@ -842,7 +840,7 @@ double cv::determinant( InputArray _mat )
#define Df( y, x ) ((float*)(dstdata + y*dststep))[x]
#define Dd( y, x ) ((double*)(dstdata + y*dststep))[x]
double cv::invert( InputArray _src, OutputArray _dst, int method )
double invert( InputArray _src, OutputArray _dst, int method )
{
CV_INSTRUMENT_REGION();
......@@ -1069,13 +1067,19 @@ double cv::invert( InputArray _src, OutputArray _dst, int method )
return result;
}
UMat UMat::inv(int method) const
{
UMat m;
invert(*this, m, method);
return m;
}
/****************************************************************************************\
* Solving a linear system *
\****************************************************************************************/
bool cv::solve( InputArray _src, InputArray _src2arg, OutputArray _dst, int method )
bool solve( InputArray _src, InputArray _src2arg, OutputArray _dst, int method )
{
CV_INSTRUMENT_REGION();
......@@ -1374,7 +1378,7 @@ bool cv::solve( InputArray _src, InputArray _src2arg, OutputArray _dst, int meth
/////////////////// finding eigenvalues and eigenvectors of a symmetric matrix ///////////////
bool cv::eigen( InputArray _src, OutputArray _evals, OutputArray _evects )
bool eigen( InputArray _src, OutputArray _evals, OutputArray _evects )
{
CV_INSTRUMENT_REGION();
......@@ -1396,7 +1400,7 @@ bool cv::eigen( InputArray _src, OutputArray _evals, OutputArray _evects )
const bool evecNeeded = _evects.needed();
const int esOptions = evecNeeded ? Eigen::ComputeEigenvectors : Eigen::EigenvaluesOnly;
_evals.create(n, 1, type);
cv::Mat evals = _evals.getMat();
Mat evals = _evals.getMat();
if ( type == CV_64F )
{
Eigen::MatrixXd src_eig, zeros_eig;
......@@ -1448,9 +1452,6 @@ bool cv::eigen( InputArray _src, OutputArray _evals, OutputArray _evects )
#endif
}
namespace cv
{
static void _SVDcompute( InputArray _aarr, OutputArray _w,
OutputArray _u, OutputArray _vt, int flags )
{
......@@ -1598,6 +1599,9 @@ void cv::SVBackSubst(InputArray w, InputArray u, InputArray vt, InputArray rhs,
}
#ifndef OPENCV_EXCLUDE_C_API
CV_IMPL double
cvDet( const CvArr* arr )
{
......@@ -1789,3 +1793,4 @@ cvSVBkSb( const CvArr* warr, const CvArr* uarr,
cv::SVD::backSubst(w, u, v, rhs, dst);
CV_Assert( dst.data == dst0.data );
}
#endif // OPENCV_EXCLUDE_C_API
......@@ -1638,6 +1638,9 @@ void patchNaNs( InputOutputArray _a, double _val )
}
#ifndef OPENCV_EXCLUDE_C_API
CV_IMPL float cvCbrt(float value) { return cv::cubeRoot(value); }
CV_IMPL float cvFastArctan(float y, float x) { return cv::fastAtan2(y, x); }
......@@ -1721,6 +1724,7 @@ CV_IMPL int cvCheckArr( const CvArr* arr, int flags,
return cv::checkRange(cv::cvarrToMat(arr), (flags & CV_CHECK_QUIET) != 0, 0, minVal, maxVal );
}
#endif // OPENCV_EXCLUDE_C_API
/*
Finds real roots of cubic, quadratic or linear equation.
......@@ -2016,6 +2020,8 @@ double cv::solvePoly( InputArray _coeffs0, OutputArray _roots0, int maxIters )
}
#ifndef OPENCV_EXCLUDE_C_API
CV_IMPL int
cvSolveCubic( const CvMat* coeffs, CvMat* roots )
{
......@@ -2035,6 +2041,7 @@ void cvSolvePoly(const CvMat* a, CvMat *r, int maxiter, int)
CV_Assert( _r.data == _r0.data ); // check that the array of roots was not reallocated
}
#endif // OPENCV_EXCLUDE_C_API
// Common constants for dispatched code
......
......@@ -999,8 +999,79 @@ double Mat::dot(InputArray _mat) const
return r;
}
#ifdef HAVE_OPENCL
static bool ocl_dot( InputArray _src1, InputArray _src2, double & res )
{
UMat src1 = _src1.getUMat().reshape(1), src2 = _src2.getUMat().reshape(1);
int type = src1.type(), depth = CV_MAT_DEPTH(type),
kercn = ocl::predictOptimalVectorWidth(src1, src2);
bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;
if ( !doubleSupport && depth == CV_64F )
return false;
int dbsize = ocl::Device::getDefault().maxComputeUnits();
size_t wgs = ocl::Device::getDefault().maxWorkGroupSize();
int ddepth = std::max(CV_32F, depth);
int wgs2_aligned = 1;
while (wgs2_aligned < (int)wgs)
wgs2_aligned <<= 1;
wgs2_aligned >>= 1;
char cvt[40];
ocl::Kernel k("reduce", ocl::core::reduce_oclsrc,
format("-D srcT=%s -D srcT1=%s -D dstT=%s -D dstTK=%s -D ddepth=%d -D convertToDT=%s -D OP_DOT "
"-D WGS=%d -D WGS2_ALIGNED=%d%s%s%s -D kercn=%d",
ocl::typeToStr(CV_MAKE_TYPE(depth, kercn)), ocl::typeToStr(depth),
ocl::typeToStr(ddepth), ocl::typeToStr(CV_MAKE_TYPE(ddepth, kercn)),
ddepth, ocl::convertTypeStr(depth, ddepth, kercn, cvt),
(int)wgs, wgs2_aligned, doubleSupport ? " -D DOUBLE_SUPPORT" : "",
_src1.isContinuous() ? " -D HAVE_SRC_CONT" : "",
_src2.isContinuous() ? " -D HAVE_SRC2_CONT" : "", kercn));
if (k.empty())
return false;
UMat db(1, dbsize, ddepth);
ocl::KernelArg src1arg = ocl::KernelArg::ReadOnlyNoSize(src1),
src2arg = ocl::KernelArg::ReadOnlyNoSize(src2),
dbarg = ocl::KernelArg::PtrWriteOnly(db);
k.args(src1arg, src1.cols, (int)src1.total(), dbsize, dbarg, src2arg);
size_t globalsize = dbsize * wgs;
if (k.run(1, &globalsize, &wgs, true))
{
res = sum(db.getMat(ACCESS_READ))[0];
return true;
}
return false;
}
#endif
double UMat::dot(InputArray m) const
{
CV_INSTRUMENT_REGION();
CV_Assert(m.sameSize(*this) && m.type() == type());
#ifdef HAVE_OPENCL
double r = 0;
CV_OCL_RUN_(dims <= 2, ocl_dot(*this, m, r), r)
#endif
return getMat(ACCESS_READ).dot(m);
}
} // namespace cv::
#ifndef OPENCV_EXCLUDE_C_API
/****************************************************************************************\
* Earlier API *
\****************************************************************************************/
......@@ -1225,4 +1296,6 @@ cvBackProjectPCA( const CvArr* proj_arr, const CvArr* avg_arr,
CV_Assert(dst0.data == dst.data);
}
#endif // OPENCV_EXCLUDE_C_API
/* End of file. */
......@@ -6,6 +6,7 @@
#include "opencv2/core/mat.hpp"
#include "opencv2/core/types_c.h"
#ifndef OPENCV_EXCLUDE_C_API
// glue
CvMatND cvMatND(const cv::Mat& m)
......@@ -360,7 +361,6 @@ cvSort( const CvArr* _src, CvArr* _dst, CvArr* _idx, int flags )
}
}
CV_IMPL int
cvKMeans2( const CvArr* _samples, int cluster_count, CvArr* _labels,
CvTermCriteria termcrit, int attempts, CvRNG*,
......@@ -389,3 +389,5 @@ cvKMeans2( const CvArr* _samples, int cluster_count, CvArr* _labels,
*_compactness = compactness;
return 1;
}
#endif // OPENCV_EXCLUDE_C_API
......@@ -226,6 +226,23 @@ void cv::setIdentity( InputOutputArray _m, const Scalar& s )
}
}
namespace cv {
UMat UMat::eye(int rows, int cols, int type)
{
return UMat::eye(Size(cols, rows), type);
}
UMat UMat::eye(Size size, int type)
{
UMat m(size, type);
setIdentity(m);
return m;
}
} // namespace
//////////////////////////////////////////// trace ///////////////////////////////////////////
cv::Scalar cv::trace( InputArray _m )
......@@ -260,285 +277,6 @@ cv::Scalar cv::trace( InputArray _m )
return cv::sum(m.diag());
}
////////////////////////////////////// transpose /////////////////////////////////////////
namespace cv
{
template<typename T> static void
transpose_( const uchar* src, size_t sstep, uchar* dst, size_t dstep, Size sz )
{
int i=0, j, m = sz.width, n = sz.height;
#if CV_ENABLE_UNROLLED
for(; i <= m - 4; i += 4 )
{
T* d0 = (T*)(dst + dstep*i);
T* d1 = (T*)(dst + dstep*(i+1));
T* d2 = (T*)(dst + dstep*(i+2));
T* d3 = (T*)(dst + dstep*(i+3));
for( j = 0; j <= n - 4; j += 4 )
{
const T* s0 = (const T*)(src + i*sizeof(T) + sstep*j);
const T* s1 = (const T*)(src + i*sizeof(T) + sstep*(j+1));
const T* s2 = (const T*)(src + i*sizeof(T) + sstep*(j+2));
const T* s3 = (const T*)(src + i*sizeof(T) + sstep*(j+3));
d0[j] = s0[0]; d0[j+1] = s1[0]; d0[j+2] = s2[0]; d0[j+3] = s3[0];
d1[j] = s0[1]; d1[j+1] = s1[1]; d1[j+2] = s2[1]; d1[j+3] = s3[1];
d2[j] = s0[2]; d2[j+1] = s1[2]; d2[j+2] = s2[2]; d2[j+3] = s3[2];
d3[j] = s0[3]; d3[j+1] = s1[3]; d3[j+2] = s2[3]; d3[j+3] = s3[3];
}
for( ; j < n; j++ )
{
const T* s0 = (const T*)(src + i*sizeof(T) + j*sstep);
d0[j] = s0[0]; d1[j] = s0[1]; d2[j] = s0[2]; d3[j] = s0[3];
}
}
#endif
for( ; i < m; i++ )
{
T* d0 = (T*)(dst + dstep*i);
j = 0;
#if CV_ENABLE_UNROLLED
for(; j <= n - 4; j += 4 )
{
const T* s0 = (const T*)(src + i*sizeof(T) + sstep*j);
const T* s1 = (const T*)(src + i*sizeof(T) + sstep*(j+1));
const T* s2 = (const T*)(src + i*sizeof(T) + sstep*(j+2));
const T* s3 = (const T*)(src + i*sizeof(T) + sstep*(j+3));
d0[j] = s0[0]; d0[j+1] = s1[0]; d0[j+2] = s2[0]; d0[j+3] = s3[0];
}
#endif
for( ; j < n; j++ )
{
const T* s0 = (const T*)(src + i*sizeof(T) + j*sstep);
d0[j] = s0[0];
}
}
}
template<typename T> static void
transposeI_( uchar* data, size_t step, int n )
{
for( int i = 0; i < n; i++ )
{
T* row = (T*)(data + step*i);
uchar* data1 = data + i*sizeof(T);
for( int j = i+1; j < n; j++ )
std::swap( row[j], *(T*)(data1 + step*j) );
}
}
typedef void (*TransposeFunc)( const uchar* src, size_t sstep, uchar* dst, size_t dstep, Size sz );
typedef void (*TransposeInplaceFunc)( uchar* data, size_t step, int n );
#define DEF_TRANSPOSE_FUNC(suffix, type) \
static void transpose_##suffix( const uchar* src, size_t sstep, uchar* dst, size_t dstep, Size sz ) \
{ transpose_<type>(src, sstep, dst, dstep, sz); } \
\
static void transposeI_##suffix( uchar* data, size_t step, int n ) \
{ transposeI_<type>(data, step, n); }
DEF_TRANSPOSE_FUNC(8u, uchar)
DEF_TRANSPOSE_FUNC(16u, ushort)
DEF_TRANSPOSE_FUNC(8uC3, Vec3b)
DEF_TRANSPOSE_FUNC(32s, int)
DEF_TRANSPOSE_FUNC(16uC3, Vec3s)
DEF_TRANSPOSE_FUNC(32sC2, Vec2i)
DEF_TRANSPOSE_FUNC(32sC3, Vec3i)
DEF_TRANSPOSE_FUNC(32sC4, Vec4i)
DEF_TRANSPOSE_FUNC(32sC6, Vec6i)
DEF_TRANSPOSE_FUNC(32sC8, Vec8i)
static TransposeFunc transposeTab[] =
{
0, transpose_8u, transpose_16u, transpose_8uC3, transpose_32s, 0, transpose_16uC3, 0,
transpose_32sC2, 0, 0, 0, transpose_32sC3, 0, 0, 0, transpose_32sC4,
0, 0, 0, 0, 0, 0, 0, transpose_32sC6, 0, 0, 0, 0, 0, 0, 0, transpose_32sC8
};
static TransposeInplaceFunc transposeInplaceTab[] =
{
0, transposeI_8u, transposeI_16u, transposeI_8uC3, transposeI_32s, 0, transposeI_16uC3, 0,
transposeI_32sC2, 0, 0, 0, transposeI_32sC3, 0, 0, 0, transposeI_32sC4,
0, 0, 0, 0, 0, 0, 0, transposeI_32sC6, 0, 0, 0, 0, 0, 0, 0, transposeI_32sC8
};
#ifdef HAVE_OPENCL
static bool ocl_transpose( InputArray _src, OutputArray _dst )
{
const ocl::Device & dev = ocl::Device::getDefault();
const int TILE_DIM = 32, BLOCK_ROWS = 8;
int type = _src.type(), cn = CV_MAT_CN(type), depth = CV_MAT_DEPTH(type),
rowsPerWI = dev.isIntel() ? 4 : 1;
UMat src = _src.getUMat();
_dst.create(src.cols, src.rows, type);
UMat dst = _dst.getUMat();
String kernelName("transpose");
bool inplace = dst.u == src.u;
if (inplace)
{
CV_Assert(dst.cols == dst.rows);
kernelName += "_inplace";
}
else
{
// check required local memory size
size_t required_local_memory = (size_t) TILE_DIM*(TILE_DIM+1)*CV_ELEM_SIZE(type);
if (required_local_memory > ocl::Device::getDefault().localMemSize())
return false;
}
ocl::Kernel k(kernelName.c_str(), ocl::core::transpose_oclsrc,
format("-D T=%s -D T1=%s -D cn=%d -D TILE_DIM=%d -D BLOCK_ROWS=%d -D rowsPerWI=%d%s",
ocl::memopTypeToStr(type), ocl::memopTypeToStr(depth),
cn, TILE_DIM, BLOCK_ROWS, rowsPerWI, inplace ? " -D INPLACE" : ""));
if (k.empty())
return false;
if (inplace)
k.args(ocl::KernelArg::ReadWriteNoSize(dst), dst.rows);
else
k.args(ocl::KernelArg::ReadOnly(src),
ocl::KernelArg::WriteOnlyNoSize(dst));
size_t localsize[2] = { TILE_DIM, BLOCK_ROWS };
size_t globalsize[2] = { (size_t)src.cols, inplace ? ((size_t)src.rows + rowsPerWI - 1) / rowsPerWI : (divUp((size_t)src.rows, TILE_DIM) * BLOCK_ROWS) };
if (inplace && dev.isIntel())
{
localsize[0] = 16;
localsize[1] = dev.maxWorkGroupSize() / localsize[0];
}
return k.run(2, globalsize, localsize, false);
}
#endif
#ifdef HAVE_IPP
static bool ipp_transpose( Mat &src, Mat &dst )
{
CV_INSTRUMENT_REGION_IPP();
int type = src.type();
typedef IppStatus (CV_STDCALL * IppiTranspose)(const void * pSrc, int srcStep, void * pDst, int dstStep, IppiSize roiSize);
typedef IppStatus (CV_STDCALL * IppiTransposeI)(const void * pSrcDst, int srcDstStep, IppiSize roiSize);
IppiTranspose ippiTranspose = 0;
IppiTransposeI ippiTranspose_I = 0;
if (dst.data == src.data && dst.cols == dst.rows)
{
CV_SUPPRESS_DEPRECATED_START
ippiTranspose_I =
type == CV_8UC1 ? (IppiTransposeI)ippiTranspose_8u_C1IR :
type == CV_8UC3 ? (IppiTransposeI)ippiTranspose_8u_C3IR :
type == CV_8UC4 ? (IppiTransposeI)ippiTranspose_8u_C4IR :
type == CV_16UC1 ? (IppiTransposeI)ippiTranspose_16u_C1IR :
type == CV_16UC3 ? (IppiTransposeI)ippiTranspose_16u_C3IR :
type == CV_16UC4 ? (IppiTransposeI)ippiTranspose_16u_C4IR :
type == CV_16SC1 ? (IppiTransposeI)ippiTranspose_16s_C1IR :
type == CV_16SC3 ? (IppiTransposeI)ippiTranspose_16s_C3IR :
type == CV_16SC4 ? (IppiTransposeI)ippiTranspose_16s_C4IR :
type == CV_32SC1 ? (IppiTransposeI)ippiTranspose_32s_C1IR :
type == CV_32SC3 ? (IppiTransposeI)ippiTranspose_32s_C3IR :
type == CV_32SC4 ? (IppiTransposeI)ippiTranspose_32s_C4IR :
type == CV_32FC1 ? (IppiTransposeI)ippiTranspose_32f_C1IR :
type == CV_32FC3 ? (IppiTransposeI)ippiTranspose_32f_C3IR :
type == CV_32FC4 ? (IppiTransposeI)ippiTranspose_32f_C4IR : 0;
CV_SUPPRESS_DEPRECATED_END
}
else
{
ippiTranspose =
type == CV_8UC1 ? (IppiTranspose)ippiTranspose_8u_C1R :
type == CV_8UC3 ? (IppiTranspose)ippiTranspose_8u_C3R :
type == CV_8UC4 ? (IppiTranspose)ippiTranspose_8u_C4R :
type == CV_16UC1 ? (IppiTranspose)ippiTranspose_16u_C1R :
type == CV_16UC3 ? (IppiTranspose)ippiTranspose_16u_C3R :
type == CV_16UC4 ? (IppiTranspose)ippiTranspose_16u_C4R :
type == CV_16SC1 ? (IppiTranspose)ippiTranspose_16s_C1R :
type == CV_16SC3 ? (IppiTranspose)ippiTranspose_16s_C3R :
type == CV_16SC4 ? (IppiTranspose)ippiTranspose_16s_C4R :
type == CV_32SC1 ? (IppiTranspose)ippiTranspose_32s_C1R :
type == CV_32SC3 ? (IppiTranspose)ippiTranspose_32s_C3R :
type == CV_32SC4 ? (IppiTranspose)ippiTranspose_32s_C4R :
type == CV_32FC1 ? (IppiTranspose)ippiTranspose_32f_C1R :
type == CV_32FC3 ? (IppiTranspose)ippiTranspose_32f_C3R :
type == CV_32FC4 ? (IppiTranspose)ippiTranspose_32f_C4R : 0;
}
IppiSize roiSize = { src.cols, src.rows };
if (ippiTranspose != 0)
{
if (CV_INSTRUMENT_FUN_IPP(ippiTranspose, src.ptr(), (int)src.step, dst.ptr(), (int)dst.step, roiSize) >= 0)
return true;
}
else if (ippiTranspose_I != 0)
{
if (CV_INSTRUMENT_FUN_IPP(ippiTranspose_I, dst.ptr(), (int)dst.step, roiSize) >= 0)
return true;
}
return false;
}
#endif
}
void cv::transpose( InputArray _src, OutputArray _dst )
{
CV_INSTRUMENT_REGION();
int type = _src.type(), esz = CV_ELEM_SIZE(type);
CV_Assert( _src.dims() <= 2 && esz <= 32 );
CV_OCL_RUN(_dst.isUMat(),
ocl_transpose(_src, _dst))
Mat src = _src.getMat();
if( src.empty() )
{
_dst.release();
return;
}
_dst.create(src.cols, src.rows, src.type());
Mat dst = _dst.getMat();
// handle the case of single-column/single-row matrices, stored in STL vectors.
if( src.rows != dst.cols || src.cols != dst.rows )
{
CV_Assert( src.size() == dst.size() && (src.cols == 1 || src.rows == 1) );
src.copyTo(dst);
return;
}
CV_IPP_RUN_FAST(ipp_transpose(src, dst))
if( dst.data == src.data )
{
TransposeInplaceFunc func = transposeInplaceTab[esz];
CV_Assert( func != 0 );
CV_Assert( dst.cols == dst.rows );
func( dst.ptr(), dst.step, dst.rows );
}
else
{
TransposeFunc func = transposeTab[esz];
CV_Assert( func != 0 );
func( src.ptr(), src.step, dst.ptr(), dst.step, src.size() );
}
}
////////////////////////////////////// completeSymm /////////////////////////////////////////
......
此差异已折叠。
......@@ -316,6 +316,7 @@ void _InputArray::getUMatVector(std::vector<UMat>& umv) const
cuda::GpuMat _InputArray::getGpuMat() const
{
#ifdef HAVE_CUDA
_InputArray::KindFlag k = kind();
if (k == CUDA_GPU_MAT)
......@@ -339,14 +340,22 @@ cuda::GpuMat _InputArray::getGpuMat() const
return cuda::GpuMat();
CV_Error(cv::Error::StsNotImplemented, "getGpuMat is available only for cuda::GpuMat and cuda::HostMem");
#else
CV_Error(Error::StsNotImplemented, "CUDA support is not enabled in this OpenCV build (missing HAVE_CUDA)");
#endif
}
void _InputArray::getGpuMatVector(std::vector<cuda::GpuMat>& gpumv) const
{
#ifdef HAVE_CUDA
_InputArray::KindFlag k = kind();
if (k == STD_VECTOR_CUDA_GPU_MAT)
{
gpumv = *(std::vector<cuda::GpuMat>*)obj;
}
#else
CV_UNUSED(gpumv);
CV_Error(Error::StsNotImplemented, "CUDA support is not enabled in this OpenCV build (missing HAVE_CUDA)");
#endif
}
ogl::Buffer _InputArray::getOGlBuffer() const
{
......@@ -453,11 +462,15 @@ Size _InputArray::size(int i) const
if (k == STD_VECTOR_CUDA_GPU_MAT)
{
#ifdef HAVE_CUDA
const std::vector<cuda::GpuMat>& vv = *(const std::vector<cuda::GpuMat>*)obj;
if (i < 0)
return vv.empty() ? Size() : Size((int)vv.size(), 1);
CV_Assert(i < (int)vv.size());
return vv[i].size();
#else
CV_Error(Error::StsNotImplemented, "CUDA support is not enabled in this OpenCV build (missing HAVE_CUDA)");
#endif
}
if( k == STD_VECTOR_UMAT )
......@@ -792,6 +805,7 @@ int _InputArray::type(int i) const
if (k == STD_VECTOR_CUDA_GPU_MAT)
{
#ifdef HAVE_CUDA
const std::vector<cuda::GpuMat>& vv = *(const std::vector<cuda::GpuMat>*)obj;
if (vv.empty())
{
......@@ -800,6 +814,9 @@ int _InputArray::type(int i) const
}
CV_Assert(i < (int)vv.size());
return vv[i >= 0 ? i : 0].type();
#else
CV_Error(Error::StsNotImplemented, "CUDA support is not enabled in this OpenCV build (missing HAVE_CUDA)");
#endif
}
if( k == OPENGL_BUFFER )
......@@ -1161,22 +1178,34 @@ void _OutputArray::create(Size _sz, int mtype, int i, bool allowTransposed, _Out
{
CV_Assert(!fixedSize() || ((cuda::GpuMat*)obj)->size() == _sz);
CV_Assert(!fixedType() || ((cuda::GpuMat*)obj)->type() == mtype);
#ifdef HAVE_CUDA
((cuda::GpuMat*)obj)->create(_sz, mtype);
return;
#else
CV_Error(Error::StsNotImplemented, "CUDA support is not enabled in this OpenCV build (missing HAVE_CUDA)");
#endif
}
if( k == OPENGL_BUFFER && i < 0 && !allowTransposed && fixedDepthMask == 0 )
{
CV_Assert(!fixedSize() || ((ogl::Buffer*)obj)->size() == _sz);
CV_Assert(!fixedType() || ((ogl::Buffer*)obj)->type() == mtype);
#ifdef HAVE_OPENGL
((ogl::Buffer*)obj)->create(_sz, mtype);
return;
#else
CV_Error(Error::StsNotImplemented, "OpenGL support is not enabled in this OpenCV build (missing HAVE_OPENGL)");
#endif
}
if( k == CUDA_HOST_MEM && i < 0 && !allowTransposed && fixedDepthMask == 0 )
{
CV_Assert(!fixedSize() || ((cuda::HostMem*)obj)->size() == _sz);
CV_Assert(!fixedType() || ((cuda::HostMem*)obj)->type() == mtype);
#ifdef HAVE_CUDA
((cuda::HostMem*)obj)->create(_sz, mtype);
return;
#else
CV_Error(Error::StsNotImplemented, "CUDA support is not enabled in this OpenCV build (missing HAVE_CUDA)");
#endif
}
int sizes[] = {_sz.height, _sz.width};
create(2, sizes, mtype, i, allowTransposed, fixedDepthMask);
......@@ -1203,22 +1232,34 @@ void _OutputArray::create(int _rows, int _cols, int mtype, int i, bool allowTran
{
CV_Assert(!fixedSize() || ((cuda::GpuMat*)obj)->size() == Size(_cols, _rows));
CV_Assert(!fixedType() || ((cuda::GpuMat*)obj)->type() == mtype);
#ifdef HAVE_CUDA
((cuda::GpuMat*)obj)->create(_rows, _cols, mtype);
return;
#else
CV_Error(Error::StsNotImplemented, "CUDA support is not enabled in this OpenCV build (missing HAVE_CUDA)");
#endif
}
if( k == OPENGL_BUFFER && i < 0 && !allowTransposed && fixedDepthMask == 0 )
{
CV_Assert(!fixedSize() || ((ogl::Buffer*)obj)->size() == Size(_cols, _rows));
CV_Assert(!fixedType() || ((ogl::Buffer*)obj)->type() == mtype);
#ifdef HAVE_OPENGL
((ogl::Buffer*)obj)->create(_rows, _cols, mtype);
return;
#else
CV_Error(Error::StsNotImplemented, "OpenGL support is not enabled in this OpenCV build (missing HAVE_OPENGL)");
#endif
}
if( k == CUDA_HOST_MEM && i < 0 && !allowTransposed && fixedDepthMask == 0 )
{
CV_Assert(!fixedSize() || ((cuda::HostMem*)obj)->size() == Size(_cols, _rows));
CV_Assert(!fixedType() || ((cuda::HostMem*)obj)->type() == mtype);
#ifdef HAVE_CUDA
((cuda::HostMem*)obj)->create(_rows, _cols, mtype);
return;
#else
CV_Error(Error::StsNotImplemented, "CUDA support is not enabled in this OpenCV build (missing HAVE_CUDA)");
#endif
}
int sizes[] = {_rows, _cols};
create(2, sizes, mtype, i, allowTransposed, fixedDepthMask);
......@@ -1641,20 +1682,32 @@ void _OutputArray::release() const
if( k == CUDA_GPU_MAT )
{
#ifdef HAVE_CUDA
((cuda::GpuMat*)obj)->release();
return;
#else
CV_Error(Error::StsNotImplemented, "CUDA support is not enabled in this OpenCV build (missing HAVE_CUDA)");
#endif
}
if( k == CUDA_HOST_MEM )
{
#ifdef HAVE_CUDA
((cuda::HostMem*)obj)->release();
return;
#else
CV_Error(Error::StsNotImplemented, "CUDA support is not enabled in this OpenCV build (missing HAVE_CUDA)");
#endif
}
if( k == OPENGL_BUFFER )
{
#ifdef HAVE_OPENGL
((ogl::Buffer*)obj)->release();
return;
#else
CV_Error(Error::StsNotImplemented, "OpenGL support is not enabled in this OpenCV build (missing HAVE_OPENGL)");
#endif
}
if( k == NONE )
......@@ -1685,8 +1738,12 @@ void _OutputArray::release() const
}
if (k == STD_VECTOR_CUDA_GPU_MAT)
{
#ifdef HAVE_CUDA
((std::vector<cuda::GpuMat>*)obj)->clear();
return;
#else
CV_Error(Error::StsNotImplemented, "CUDA support is not enabled in this OpenCV build (missing HAVE_CUDA)");
#endif
}
CV_Error(Error::StsNotImplemented, "Unknown/unsupported array type");
}
......@@ -1794,9 +1851,13 @@ void _OutputArray::setTo(const _InputArray& arr, const _InputArray & mask) const
((UMat*)obj)->setTo(arr, mask);
else if( k == CUDA_GPU_MAT )
{
#ifdef HAVE_CUDA
Mat value = arr.getMat();
CV_Assert( checkScalar(value, type(), arr.kind(), _InputArray::CUDA_GPU_MAT) );
((cuda::GpuMat*)obj)->setTo(Scalar(Vec<double, 4>(value.ptr<double>())), mask);
#else
CV_Error(Error::StsNotImplemented, "CUDA support is not enabled in this OpenCV build (missing HAVE_CUDA)");
#endif
}
else
CV_Error(Error::StsNotImplemented, "");
......
......@@ -205,13 +205,10 @@ int normL1_(const uchar* a, const uchar* b, int n)
return d;
}
}} //cv::hal
} //cv::hal
//==================================================================================================
namespace cv
{
template<typename T, typename ST> int
normInf_(const T* src, const uchar* mask, ST* _result, int len, int cn)
{
......@@ -594,12 +591,10 @@ static bool ipp_norm(Mat &src, int normType, Mat &mask, double &result)
CV_UNUSED(src); CV_UNUSED(normType); CV_UNUSED(mask); CV_UNUSED(result);
#endif
return false;
}
#endif
} // cv::
} // ipp_norm()
#endif // HAVE_IPP
double cv::norm( InputArray _src, int normType, InputArray _mask )
double norm( InputArray _src, int normType, InputArray _mask )
{
CV_INSTRUMENT_REGION();
......@@ -792,9 +787,6 @@ double cv::norm( InputArray _src, int normType, InputArray _mask )
//==================================================================================================
#ifdef HAVE_OPENCL
namespace cv {
static bool ocl_norm( InputArray _src1, InputArray _src2, int normType, InputArray _mask, double & result )
{
#ifdef __ANDROID__
......@@ -849,15 +841,10 @@ static bool ocl_norm( InputArray _src1, InputArray _src2, int normType, InputArr
result /= (s2 + DBL_EPSILON);
return true;
}
}
#endif
} // ocl_norm()
#endif // HAVE_OPENCL
#ifdef HAVE_IPP
namespace cv
{
static bool ipp_norm(InputArray _src1, InputArray _src2, int normType, InputArray _mask, double &result)
{
CV_INSTRUMENT_REGION_IPP();
......@@ -1083,12 +1070,11 @@ static bool ipp_norm(InputArray _src1, InputArray _src2, int normType, InputArra
CV_UNUSED(_src1); CV_UNUSED(_src2); CV_UNUSED(normType); CV_UNUSED(_mask); CV_UNUSED(result);
#endif
return false;
}
}
#endif
} // ipp_norm
#endif // HAVE_IPP
double cv::norm( InputArray _src1, InputArray _src2, int normType, InputArray _mask )
double norm( InputArray _src1, InputArray _src2, int normType, InputArray _mask )
{
CV_INSTRUMENT_REGION();
......@@ -1280,12 +1266,12 @@ double cv::norm( InputArray _src1, InputArray _src2, int normType, InputArray _m
return result.d;
}
cv::Hamming::ResultType cv::Hamming::operator()( const unsigned char* a, const unsigned char* b, int size ) const
cv::Hamming::ResultType Hamming::operator()( const unsigned char* a, const unsigned char* b, int size ) const
{
return cv::hal::normHamming(a, b, size);
}
double cv::PSNR(InputArray _src1, InputArray _src2, double R)
double PSNR(InputArray _src1, InputArray _src2, double R)
{
CV_INSTRUMENT_REGION();
......@@ -1295,3 +1281,141 @@ double cv::PSNR(InputArray _src1, InputArray _src2, double R)
double diff = std::sqrt(norm(_src1, _src2, NORM_L2SQR)/(_src1.total()*_src1.channels()));
return 20*log10(R/(diff+DBL_EPSILON));
}
#ifdef HAVE_OPENCL
static bool ocl_normalize( InputArray _src, InputOutputArray _dst, InputArray _mask, int dtype,
double scale, double delta )
{
UMat src = _src.getUMat();
if( _mask.empty() )
src.convertTo( _dst, dtype, scale, delta );
else if (src.channels() <= 4)
{
const ocl::Device & dev = ocl::Device::getDefault();
int stype = _src.type(), sdepth = CV_MAT_DEPTH(stype), cn = CV_MAT_CN(stype),
ddepth = CV_MAT_DEPTH(dtype), wdepth = std::max(CV_32F, std::max(sdepth, ddepth)),
rowsPerWI = dev.isIntel() ? 4 : 1;
float fscale = static_cast<float>(scale), fdelta = static_cast<float>(delta);
bool haveScale = std::fabs(scale - 1) > DBL_EPSILON,
haveZeroScale = !(std::fabs(scale) > DBL_EPSILON),
haveDelta = std::fabs(delta) > DBL_EPSILON,
doubleSupport = dev.doubleFPConfig() > 0;
if (!haveScale && !haveDelta && stype == dtype)
{
_src.copyTo(_dst, _mask);
return true;
}
if (haveZeroScale)
{
_dst.setTo(Scalar(delta), _mask);
return true;
}
if ((sdepth == CV_64F || ddepth == CV_64F) && !doubleSupport)
return false;
char cvt[2][40];
String opts = format("-D srcT=%s -D dstT=%s -D convertToWT=%s -D cn=%d -D rowsPerWI=%d"
" -D convertToDT=%s -D workT=%s%s%s%s -D srcT1=%s -D dstT1=%s",
ocl::typeToStr(stype), ocl::typeToStr(dtype),
ocl::convertTypeStr(sdepth, wdepth, cn, cvt[0]), cn,
rowsPerWI, ocl::convertTypeStr(wdepth, ddepth, cn, cvt[1]),
ocl::typeToStr(CV_MAKE_TYPE(wdepth, cn)),
doubleSupport ? " -D DOUBLE_SUPPORT" : "",
haveScale ? " -D HAVE_SCALE" : "",
haveDelta ? " -D HAVE_DELTA" : "",
ocl::typeToStr(sdepth), ocl::typeToStr(ddepth));
ocl::Kernel k("normalizek", ocl::core::normalize_oclsrc, opts);
if (k.empty())
return false;
UMat mask = _mask.getUMat(), dst = _dst.getUMat();
ocl::KernelArg srcarg = ocl::KernelArg::ReadOnlyNoSize(src),
maskarg = ocl::KernelArg::ReadOnlyNoSize(mask),
dstarg = ocl::KernelArg::ReadWrite(dst);
if (haveScale)
{
if (haveDelta)
k.args(srcarg, maskarg, dstarg, fscale, fdelta);
else
k.args(srcarg, maskarg, dstarg, fscale);
}
else
{
if (haveDelta)
k.args(srcarg, maskarg, dstarg, fdelta);
else
k.args(srcarg, maskarg, dstarg);
}
size_t globalsize[2] = { (size_t)src.cols, ((size_t)src.rows + rowsPerWI - 1) / rowsPerWI };
return k.run(2, globalsize, NULL, false);
}
else
{
UMat temp;
src.convertTo( temp, dtype, scale, delta );
temp.copyTo( _dst, _mask );
}
return true;
} // ocl_normalize
#endif // HAVE_OPENCL
void normalize(InputArray _src, InputOutputArray _dst, double a, double b,
int norm_type, int rtype, InputArray _mask)
{
CV_INSTRUMENT_REGION();
double scale = 1, shift = 0;
int type = _src.type(), depth = CV_MAT_DEPTH(type);
if( rtype < 0 )
rtype = _dst.fixedType() ? _dst.depth() : depth;
if( norm_type == CV_MINMAX )
{
double smin = 0, smax = 0;
double dmin = MIN( a, b ), dmax = MAX( a, b );
minMaxIdx( _src, &smin, &smax, 0, 0, _mask );
scale = (dmax - dmin)*(smax - smin > DBL_EPSILON ? 1./(smax - smin) : 0);
if( rtype == CV_32F )
{
scale = (float)scale;
shift = (float)dmin - (float)(smin*scale);
}
else
shift = dmin - smin*scale;
}
else if( norm_type == CV_L2 || norm_type == CV_L1 || norm_type == CV_C )
{
scale = norm( _src, norm_type, _mask );
scale = scale > DBL_EPSILON ? a/scale : 0.;
shift = 0;
}
else
CV_Error( CV_StsBadArg, "Unknown/unsupported norm type" );
CV_OCL_RUN(_dst.isUMat(),
ocl_normalize(_src, _dst, _mask, rtype, scale, shift))
Mat src = _src.getMat();
if( _mask.empty() )
src.convertTo( _dst, rtype, scale, shift );
else
{
Mat temp;
src.convertTo( temp, rtype, scale, shift );
temp.copyTo( _dst, _mask );
}
}
} // namespace
......@@ -750,6 +750,9 @@ void cv::randShuffle( InputOutputArray _dst, double iterFactor, RNG* _rng )
func( dst, rng, iterFactor );
}
#ifndef OPENCV_EXCLUDE_C_API
CV_IMPL void
cvRandArr( CvRNG* _rng, CvArr* arr, int disttype, CvScalar param1, CvScalar param2 )
{
......@@ -767,6 +770,9 @@ CV_IMPL void cvRandShuffle( CvArr* arr, CvRNG* _rng, double iter_factor )
cv::randShuffle( dst, iter_factor, &rng );
}
#endif // OPENCV_EXCLUDE_C_API
// Mersenne Twister random number generator.
// Inspired by http://www.math.sci.hiroshima-u.ac.jp/~m-mat/MT/MT2002/CODES/mt19937ar.c
......
......@@ -5,6 +5,8 @@
#include "precomp.hpp"
#ifndef OPENCV_EXCLUDE_C_API
CV_IMPL CvScalar cvSum( const CvArr* srcarr )
{
cv::Scalar sum = cv::sum(cv::cvarrToMat(srcarr, false, true, 1));
......@@ -117,3 +119,5 @@ cvNorm( const void* imgA, const void* imgB, int normType, const void* maskarr )
return !maskarr ? cv::norm(a, b, normType) : cv::norm(a, b, normType, mask);
}
#endif // OPENCV_EXCLUDE_C_API
......@@ -1318,88 +1318,6 @@ UMat UMat::t() const
return m;
}
UMat UMat::inv(int method) const
{
UMat m;
invert(*this, m, method);
return m;
}
UMat UMat::mul(InputArray m, double scale) const
{
UMat dst;
multiply(*this, m, dst, scale);
return dst;
}
#ifdef HAVE_OPENCL
static bool ocl_dot( InputArray _src1, InputArray _src2, double & res )
{
UMat src1 = _src1.getUMat().reshape(1), src2 = _src2.getUMat().reshape(1);
int type = src1.type(), depth = CV_MAT_DEPTH(type),
kercn = ocl::predictOptimalVectorWidth(src1, src2);
bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;
if ( !doubleSupport && depth == CV_64F )
return false;
int dbsize = ocl::Device::getDefault().maxComputeUnits();
size_t wgs = ocl::Device::getDefault().maxWorkGroupSize();
int ddepth = std::max(CV_32F, depth);
int wgs2_aligned = 1;
while (wgs2_aligned < (int)wgs)
wgs2_aligned <<= 1;
wgs2_aligned >>= 1;
char cvt[40];
ocl::Kernel k("reduce", ocl::core::reduce_oclsrc,
format("-D srcT=%s -D srcT1=%s -D dstT=%s -D dstTK=%s -D ddepth=%d -D convertToDT=%s -D OP_DOT "
"-D WGS=%d -D WGS2_ALIGNED=%d%s%s%s -D kercn=%d",
ocl::typeToStr(CV_MAKE_TYPE(depth, kercn)), ocl::typeToStr(depth),
ocl::typeToStr(ddepth), ocl::typeToStr(CV_MAKE_TYPE(ddepth, kercn)),
ddepth, ocl::convertTypeStr(depth, ddepth, kercn, cvt),
(int)wgs, wgs2_aligned, doubleSupport ? " -D DOUBLE_SUPPORT" : "",
_src1.isContinuous() ? " -D HAVE_SRC_CONT" : "",
_src2.isContinuous() ? " -D HAVE_SRC2_CONT" : "", kercn));
if (k.empty())
return false;
UMat db(1, dbsize, ddepth);
ocl::KernelArg src1arg = ocl::KernelArg::ReadOnlyNoSize(src1),
src2arg = ocl::KernelArg::ReadOnlyNoSize(src2),
dbarg = ocl::KernelArg::PtrWriteOnly(db);
k.args(src1arg, src1.cols, (int)src1.total(), dbsize, dbarg, src2arg);
size_t globalsize = dbsize * wgs;
if (k.run(1, &globalsize, &wgs, true))
{
res = sum(db.getMat(ACCESS_READ))[0];
return true;
}
return false;
}
#endif
double UMat::dot(InputArray m) const
{
CV_INSTRUMENT_REGION();
CV_Assert(m.sameSize(*this) && m.type() == type());
#ifdef HAVE_OPENCL
double r = 0;
CV_OCL_RUN_(dims <= 2, ocl_dot(*this, m, r), r)
#endif
return getMat(ACCESS_READ).dot(m);
}
UMat UMat::zeros(int rows, int cols, int type)
{
return UMat(rows, cols, type, Scalar::all(0));
......@@ -1430,18 +1348,6 @@ UMat UMat::ones(int ndims, const int* sz, int type)
return UMat(ndims, sz, type, Scalar(1));
}
UMat UMat::eye(int rows, int cols, int type)
{
return UMat::eye(Size(cols, rows), type);
}
UMat UMat::eye(Size size, int type)
{
UMat m(size, type);
setIdentity(m);
return m;
}
}
/* End of file. */
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册