From 00f3ad721797d8013657f63142b2451b76556e9c Mon Sep 17 00:00:00 2001
From: Naba Kumar
Date: Wed, 15 Mar 2017 08:41:54 +0200
Subject: [PATCH] Implement DFT as cv::Algorithm to support concurrent streams

---
Note: this patch was edited by hand after export (non-throwing plan cleanup in
~DFTImpl, doc/message wording, unused test variable removed). The blob ids on
the "index" lines below still refer to the originally exported revision.

 modules/core/include/opencv2/core/base.hpp    |   4 +
 modules/core/src/dxt.cpp                      |   3 +
 .../cudaarithm/include/opencv2/cudaarithm.hpp |  30 +++
 modules/cudaarithm/src/arithm.cpp             | 202 +++++++++++-------
 modules/cudaarithm/test/test_arithm.cpp       |  26 +++
 5 files changed, 184 insertions(+), 81 deletions(-)

diff --git a/modules/core/include/opencv2/core/base.hpp b/modules/core/include/opencv2/core/base.hpp
index b319df6f38..07ca3a51a1 100644
--- a/modules/core/include/opencv2/core/base.hpp
+++ b/modules/core/include/opencv2/core/base.hpp
@@ -239,6 +239,10 @@ enum DftFlags {
         into a real array and inverse transformation is executed, the function treats the input as a
         packed complex-conjugate symmetrical array, and the output will also be a real array). */
     DFT_REAL_OUTPUT    = 32,
+    /** specifies that the input is complex. If this flag is set, the input must have 2 channels.
+        On the other hand, for backward compatibility reasons, if the input has 2 channels, it is
+        already considered complex. */
+    DFT_COMPLEX_INPUT  = 64,
     /** performs an inverse 1D or 2D transform instead of the default forward transform. */
     DCT_INVERSE        = DFT_INVERSE,
     /** performs a forward or inverse transform of every individual row of the input
diff --git a/modules/core/src/dxt.cpp b/modules/core/src/dxt.cpp
index f553c4f31e..e33b105ba6 100644
--- a/modules/core/src/dxt.cpp
+++ b/modules/core/src/dxt.cpp
@@ -3342,6 +3342,9 @@ void cv::dft( InputArray _src0, OutputArray _dst, int flags, int nonzero_rows )
 
     CV_Assert( type == CV_32FC1 || type == CV_32FC2 || type == CV_64FC1 || type == CV_64FC2 );
 
+    // Fail if DFT_COMPLEX_INPUT is specified, but src is not 2 channels.
+    CV_Assert( !((flags & DFT_COMPLEX_INPUT) && src.channels() != 2) );
+
     if( !inv && src.channels() == 1 && (flags & DFT_COMPLEX_OUTPUT) )
         _dst.create( src.size(), CV_MAKETYPE(depth, 2) );
     else if( inv && src.channels() == 2 && (flags & DFT_REAL_OUTPUT) )
diff --git a/modules/cudaarithm/include/opencv2/cudaarithm.hpp b/modules/cudaarithm/include/opencv2/cudaarithm.hpp
index f2ee84543f..a482b49fcf 100644
--- a/modules/cudaarithm/include/opencv2/cudaarithm.hpp
+++ b/modules/cudaarithm/include/opencv2/cudaarithm.hpp
@@ -788,6 +788,7 @@ CV_EXPORTS void mulAndScaleSpectrums(InputArray src1, InputArray src2, OutputArr
 (obtained from dft_size ).
 -   **DFT_INVERSE** inverts DFT. Use for complex-complex cases (real-complex and complex-real
 cases are always forward and inverse, respectively).
+-   **DFT_COMPLEX_INPUT** specifies that the input is complex, with 2 channels.
 -   **DFT_REAL_OUTPUT** specifies the output as real. The source matrix is the result of
 real-complex transform, so the destination matrix must be real.
 @param stream Stream for the asynchronous version.
@@ -813,6 +814,35 @@ instead of the width.
  */
 CV_EXPORTS void dft(InputArray src, OutputArray dst, Size dft_size, int flags=0, Stream& stream = Stream::Null());
 
+/** @brief Base class for DFT operator as a cv::Algorithm. :
+ */
+class CV_EXPORTS DFT : public Algorithm
+{
+public:
+    /** @brief Computes an FFT of a given image.
+
+    @param image Source image. Only CV_32FC1 and CV_32FC2 images are supported for now.
+    @param result Result image.
+    @param stream Stream for the asynchronous version.
+     */
+    virtual void compute(InputArray image, OutputArray result, Stream& stream = Stream::Null()) = 0;
+};
+
+/** @brief Creates implementation for cuda::DFT.
+
+@param dft_size The image size.
+@param flags Optional flags:
+-   **DFT_ROWS** transforms each individual row of the source matrix.
+-   **DFT_SCALE** scales the result: divide it by the number of elements in the transform
+(obtained from dft_size ).
+-   **DFT_INVERSE** inverts DFT. Use for complex-complex cases (real-complex and complex-real
+cases are always forward and inverse, respectively).
+-   **DFT_COMPLEX_INPUT** specifies that the inputs will be complex, with 2 channels.
+-   **DFT_REAL_OUTPUT** specifies the output as real. The source matrix is the result of
+real-complex transform, so the destination matrix must be real.
+ */
+CV_EXPORTS Ptr<DFT> createDFT(Size dft_size, int flags);
+
 /** @brief Base class for convolution (or cross-correlation) operator. :
  */
 class CV_EXPORTS Convolution : public Algorithm
diff --git a/modules/cudaarithm/src/arithm.cpp b/modules/cudaarithm/src/arithm.cpp
index 08de4e4288..01a0169136 100644
--- a/modules/cudaarithm/src/arithm.cpp
+++ b/modules/cudaarithm/src/arithm.cpp
@@ -286,111 +286,151 @@ void cv::cuda::gemm(InputArray _src1, InputArray _src2, double alpha, InputArray
 }
 
 //////////////////////////////////////////////////////////////////////////////
-// dft
+// DFT function
 
 void cv::cuda::dft(InputArray _src, OutputArray _dst, Size dft_size, int flags, Stream& stream)
 {
-#ifndef HAVE_CUFFT
-    (void) _src;
-    (void) _dst;
-    (void) dft_size;
-    (void) flags;
-    (void) stream;
-    throw_no_cuda();
-#else
-    GpuMat src = getInputMat(_src, stream);
+    if (getInputMat(_src, stream).channels() == 2)
+        flags |= DFT_COMPLEX_INPUT;
 
-    CV_Assert( src.type() == CV_32FC1 || src.type() == CV_32FC2 );
+    Ptr<DFT> dft = createDFT(dft_size, flags);
+    dft->compute(_src, _dst, stream);
+}
 
-    // We don't support unpacked output (in the case of real input)
-    CV_Assert( !(flags & DFT_COMPLEX_OUTPUT) );
+//////////////////////////////////////////////////////////////////////////////
+// DFT algorithm
 
-    const bool is_1d_input       = (dft_size.height == 1) || (dft_size.width == 1);
-    const bool is_row_dft        = (flags & DFT_ROWS) != 0;
-    const bool is_scaled_dft     = (flags & DFT_SCALE) != 0;
-    const bool is_inverse        = (flags & DFT_INVERSE) != 0;
-    const bool is_complex_input  = src.channels() == 2;
-    const bool is_complex_output = !(flags & DFT_REAL_OUTPUT);
+#ifdef HAVE_CUFFT
 
-    // We don't support real-to-real transform
-    CV_Assert( is_complex_input || is_complex_output );
+namespace
+{
 
-    // Make sure here we work with the continuous input,
-    // as CUFFT can't handle gaps
-    GpuMat src_cont;
-    if (src.isContinuous())
+    // cuFFT-backed implementation of cuda::DFT. The plan is created once in the
+    // constructor and reused by every compute() call; only the stream binding
+    // is updated per call.
+    class DFTImpl : public DFT
     {
-        src_cont = src;
-    }
-    else
-    {
-        BufferPool pool(stream);
-        src_cont.allocator = pool.getAllocator();
-        createContinuous(src.rows, src.cols, src.type(), src_cont);
-        src.copyTo(src_cont, stream);
-    }
+        Size dft_size, dft_size_opt;
+        bool is_1d_input, is_row_dft, is_scaled_dft, is_inverse, is_complex_input, is_complex_output;
 
-    Size dft_size_opt = dft_size;
-    if (is_1d_input && !is_row_dft)
-    {
-        // If the source matrix is single column handle it as single row
-        dft_size_opt.width = std::max(dft_size.width, dft_size.height);
-        dft_size_opt.height = std::min(dft_size.width, dft_size.height);
-    }
+        cufftType dft_type;
+        cufftHandle plan;
 
-    CV_Assert( dft_size_opt.width > 1 );
+    public:
+        DFTImpl(Size dft_size, int flags)
+            : dft_size(dft_size),
+              dft_size_opt(dft_size),
+              is_1d_input((dft_size.height == 1) || (dft_size.width == 1)),
+              is_row_dft((flags & DFT_ROWS) != 0),
+              is_scaled_dft((flags & DFT_SCALE) != 0),
+              is_inverse((flags & DFT_INVERSE) != 0),
+              is_complex_input((flags & DFT_COMPLEX_INPUT) != 0),
+              is_complex_output(!(flags & DFT_REAL_OUTPUT)),
+              dft_type(!is_complex_input ? CUFFT_R2C : (is_complex_output ? CUFFT_C2C : CUFFT_C2R))
+        {
+            // We don't support unpacked output (in the case of real input)
+            CV_Assert( !(flags & DFT_COMPLEX_OUTPUT) );
 
-    cufftType dft_type = CUFFT_R2C;
-    if (is_complex_input)
-        dft_type = is_complex_output ? CUFFT_C2C : CUFFT_C2R;
+            // We don't support real-to-real transform
+            CV_Assert( is_complex_input || is_complex_output );
 
-    cufftHandle plan;
-    if (is_1d_input || is_row_dft)
-        cufftSafeCall( cufftPlan1d(&plan, dft_size_opt.width, dft_type, dft_size_opt.height) );
-    else
-        cufftSafeCall( cufftPlan2d(&plan, dft_size_opt.height, dft_size_opt.width, dft_type) );
+            if (is_1d_input && !is_row_dft)
+            {
+                // If the source matrix is single column handle it as single row
+                dft_size_opt.width = std::max(dft_size.width, dft_size.height);
+                dft_size_opt.height = std::min(dft_size.width, dft_size.height);
+            }
 
-    cufftSafeCall( cufftSetStream(plan, StreamAccessor::getStream(stream)) );
+            CV_Assert( dft_size_opt.width > 1 );
 
-    if (is_complex_input)
-    {
-        if (is_complex_output)
-        {
-            createContinuous(dft_size, CV_32FC2, _dst);
-            GpuMat dst = _dst.getGpuMat();
+            if (is_1d_input || is_row_dft)
+                cufftSafeCall( cufftPlan1d(&plan, dft_size_opt.width, dft_type, dft_size_opt.height) );
+            else
+                cufftSafeCall( cufftPlan2d(&plan, dft_size_opt.height, dft_size_opt.width, dft_type) );
+        }
 
-            cufftSafeCall(cufftExecC2C(
-                    plan, src_cont.ptr<cufftComplex>(), dst.ptr<cufftComplex>(),
-                    is_inverse ? CUFFT_INVERSE : CUFFT_FORWARD));
+        ~DFTImpl()
+        {
+            // A destructor must not throw, so the cuFFT status is deliberately
+            // ignored here instead of being routed through cufftSafeCall.
+            (void) cufftDestroy(plan);
         }
-        else
+
+        void compute(InputArray _src, OutputArray _dst, Stream& stream)
         {
-            createContinuous(dft_size, CV_32F, _dst);
-            GpuMat dst = _dst.getGpuMat();
+            GpuMat src = getInputMat(_src, stream);
 
-            cufftSafeCall(cufftExecC2R(
-                    plan, src_cont.ptr<cufftComplex>(), dst.ptr<cufftReal>()));
-        }
-    }
-    else
-    {
-        // We could swap dft_size for efficiency. Here we must reflect it
-        if (dft_size == dft_size_opt)
-            createContinuous(Size(dft_size.width / 2 + 1, dft_size.height), CV_32FC2, _dst);
-        else
-            createContinuous(Size(dft_size.width, dft_size.height / 2 + 1), CV_32FC2, _dst);
+            CV_Assert( src.type() == CV_32FC1 || src.type() == CV_32FC2 );
+            CV_Assert( is_complex_input == (src.channels() == 2) );
 
-        GpuMat dst = _dst.getGpuMat();
+            // Make sure here we work with the continuous input,
+            // as CUFFT can't handle gaps
+            GpuMat src_cont;
+            if (src.isContinuous())
+            {
+                src_cont = src;
+            }
+            else
+            {
+                BufferPool pool(stream);
+                src_cont.allocator = pool.getAllocator();
+                createContinuous(src.rows, src.cols, src.type(), src_cont);
+                src.copyTo(src_cont, stream);
+            }
 
-        cufftSafeCall(cufftExecR2C(
-                    plan, src_cont.ptr<cufftReal>(), dst.ptr<cufftComplex>()));
-    }
+            cufftSafeCall( cufftSetStream(plan, StreamAccessor::getStream(stream)) );
 
-    cufftSafeCall( cufftDestroy(plan) );
+            if (is_complex_input)
+            {
+                if (is_complex_output)
+                {
+                    createContinuous(dft_size, CV_32FC2, _dst);
+                    GpuMat dst = _dst.getGpuMat();
+
+                    cufftSafeCall(cufftExecC2C(
+                            plan, src_cont.ptr<cufftComplex>(), dst.ptr<cufftComplex>(),
+                            is_inverse ? CUFFT_INVERSE : CUFFT_FORWARD));
+                }
+                else
+                {
+                    createContinuous(dft_size, CV_32F, _dst);
+                    GpuMat dst = _dst.getGpuMat();
+
+                    cufftSafeCall(cufftExecC2R(
+                            plan, src_cont.ptr<cufftComplex>(), dst.ptr<cufftReal>()));
+                }
+            }
+            else
+            {
+                // We could swap dft_size for efficiency. Here we must reflect it
+                if (dft_size == dft_size_opt)
+                    createContinuous(Size(dft_size.width / 2 + 1, dft_size.height), CV_32FC2, _dst);
+                else
+                    createContinuous(Size(dft_size.width, dft_size.height / 2 + 1), CV_32FC2, _dst);
 
-    if (is_scaled_dft)
-        cuda::multiply(_dst, Scalar::all(1. / dft_size.area()), _dst, 1, -1, stream);
+                GpuMat dst = _dst.getGpuMat();
 
+                cufftSafeCall(cufftExecR2C(
+                        plan, src_cont.ptr<cufftReal>(), dst.ptr<cufftComplex>()));
+            }
+
+            if (is_scaled_dft)
+                cuda::multiply(_dst, Scalar::all(1. / dft_size.area()), _dst, 1, -1, stream);
+        }
+    };
+}
+
+#endif
+
+Ptr<DFT> cv::cuda::createDFT(Size dft_size, int flags)
+{
+#ifndef HAVE_CUFFT
+    (void) dft_size;
+    (void) flags;
+    CV_Error(Error::StsNotImplemented, "The library was built without CUFFT");
+    return Ptr<DFT>();
+#else
+    return makePtr<DFTImpl>(dft_size, flags);
 #endif
 }
 
diff --git a/modules/cudaarithm/test/test_arithm.cpp b/modules/cudaarithm/test/test_arithm.cpp
index 257f5233cc..3e99ed4f3f 100644
--- a/modules/cudaarithm/test/test_arithm.cpp
+++ b/modules/cudaarithm/test/test_arithm.cpp
@@ -250,6 +250,32 @@ CUDA_TEST_P(Dft, C2C)
     }
 }
 
+CUDA_TEST_P(Dft, Algorithm)
+{
+    int cols = randomInt(2, 100);
+    int rows = randomInt(2, 100);
+
+    int flags = 0;
+    cv::Ptr<cv::cuda::DFT> dft = cv::cuda::createDFT(cv::Size(cols, rows), flags);
+
+    for (int i = 0; i < 5; ++i)
+    {
+        SCOPED_TRACE("dft algorithm");
+
+        cv::Mat a = randomMat(cv::Size(cols, rows), CV_32FC2, 0.0, 10.0);
+
+        cv::cuda::GpuMat d_b;
+        dft->compute(loadMat(a), d_b);
+
+        cv::Mat b_gold;
+        cv::dft(a, b_gold, flags);
+
+        ASSERT_EQ(CV_32F, d_b.depth());
+        ASSERT_EQ(2, d_b.channels());
+        EXPECT_MAT_NEAR(b_gold, cv::Mat(d_b), rows * cols * 1e-4);
+    }
+}
+
 namespace
 {
     void testR2CThenC2R(const std::string& hint, int cols, int rows, bool inplace)
-- 
GitLab