From 00f3ad721797d8013657f63142b2451b76556e9c Mon Sep 17 00:00:00 2001
From: Naba Kumar <naba.kumar@gmail.com>
Date: Wed, 15 Mar 2017 08:41:54 +0200
Subject: [PATCH] Implement DFT as cv::Algorithm to support concurrent streams

---
 modules/core/include/opencv2/core/base.hpp    |   4 +
 modules/core/src/dxt.cpp                      |   3 +
 .../cudaarithm/include/opencv2/cudaarithm.hpp |  30 +++
 modules/cudaarithm/src/arithm.cpp             | 197 +++++++++++-------
 modules/cudaarithm/test/test_arithm.cpp       |  27 +++
 5 files changed, 180 insertions(+), 81 deletions(-)

diff --git a/modules/core/include/opencv2/core/base.hpp b/modules/core/include/opencv2/core/base.hpp
index b319df6f38..07ca3a51a1 100644
--- a/modules/core/include/opencv2/core/base.hpp
+++ b/modules/core/include/opencv2/core/base.hpp
@@ -239,6 +239,10 @@ enum DftFlags {
         into a real array and inverse transformation is executed, the function treats the input as a
         packed complex-conjugate symmetrical array, and the output will also be a real array). */
     DFT_REAL_OUTPUT    = 32,
+    /** specifies that the input is complex. If this flag is set, the input must have 2 channels.
+        On the other hand, for backwards compatibility reasons, if the input has 2 channels, it is
+        already considered complex. */
+    DFT_COMPLEX_INPUT  = 64,
     /** performs an inverse 1D or 2D transform instead of the default forward transform. */
     DCT_INVERSE        = DFT_INVERSE,
     /** performs a forward or inverse transform of every individual row of the input
diff --git a/modules/core/src/dxt.cpp b/modules/core/src/dxt.cpp
index f553c4f31e..e33b105ba6 100644
--- a/modules/core/src/dxt.cpp
+++ b/modules/core/src/dxt.cpp
@@ -3342,6 +3342,9 @@ void cv::dft( InputArray _src0, OutputArray _dst, int flags, int nonzero_rows )
 
     CV_Assert( type == CV_32FC1 || type == CV_32FC2 || type == CV_64FC1 || type == CV_64FC2 );
 
+    // Fail if DFT_COMPLEX_INPUT is specified, but src is not 2 channels.
+    CV_Assert( !((flags & DFT_COMPLEX_INPUT) && src.channels() != 2) );
+
     if( !inv && src.channels() == 1 && (flags & DFT_COMPLEX_OUTPUT) )
         _dst.create( src.size(), CV_MAKETYPE(depth, 2) );
     else if( inv && src.channels() == 2 && (flags & DFT_REAL_OUTPUT) )
diff --git a/modules/cudaarithm/include/opencv2/cudaarithm.hpp b/modules/cudaarithm/include/opencv2/cudaarithm.hpp
index f2ee84543f..a482b49fcf 100644
--- a/modules/cudaarithm/include/opencv2/cudaarithm.hpp
+++ b/modules/cudaarithm/include/opencv2/cudaarithm.hpp
@@ -788,6 +788,7 @@ CV_EXPORTS void mulAndScaleSpectrums(InputArray src1, InputArray src2, OutputArr
 (obtained from dft_size ).
 -   **DFT_INVERSE** inverts DFT. Use for complex-complex cases (real-complex and complex-real
 cases are always forward and inverse, respectively).
+-   **DFT_COMPLEX_INPUT** specifies that the input is complex, with 2 channels.
 -   **DFT_REAL_OUTPUT** specifies the output as real. The source matrix is the result of
 real-complex transform, so the destination matrix must be real.
 @param stream Stream for the asynchronous version.
@@ -813,6 +814,35 @@ instead of the width.
  */
 CV_EXPORTS void dft(InputArray src, OutputArray dst, Size dft_size, int flags=0, Stream& stream = Stream::Null());
 
+/** @brief Base class for the DFT operator, exposed as a cv::Algorithm.
+ */
+class CV_EXPORTS DFT : public Algorithm
+{
+public:
+    /** @brief Computes an FFT of a given image.
+
+    @param image Source image: CV_32FC1 (real) or CV_32FC2 (complex, needs DFT_COMPLEX_INPUT).
+    @param result Result image.
+    @param stream Stream for the asynchronous version.
+     */
+    virtual void compute(InputArray image, OutputArray result, Stream& stream = Stream::Null()) = 0;
+};
+
+/** @brief Creates an implementation of cuda::DFT for a fixed transform size and set of flags.
+
+@param dft_size The size of the images that will be transformed.
+@param flags Combination of the following flags (pass 0 for none):
+-   **DFT_ROWS** transforms each individual row of the source matrix.
+-   **DFT_SCALE** scales the result: divide it by the number of elements in the transform
+(obtained from dft_size ).
+-   **DFT_INVERSE** inverts DFT. Use for complex-complex cases (real-complex and complex-real
+cases are always forward and inverse, respectively).
+-   **DFT_COMPLEX_INPUT** specifies that the inputs will be complex, with 2 channels.
+-   **DFT_REAL_OUTPUT** specifies the output as real. The source matrix is the result of
+real-complex transform, so the destination matrix must be real.
+ */
+CV_EXPORTS Ptr<DFT> createDFT(Size dft_size, int flags);
+
 /** @brief Base class for convolution (or cross-correlation) operator. :
  */
 class CV_EXPORTS Convolution : public Algorithm
diff --git a/modules/cudaarithm/src/arithm.cpp b/modules/cudaarithm/src/arithm.cpp
index 08de4e4288..01a0169136 100644
--- a/modules/cudaarithm/src/arithm.cpp
+++ b/modules/cudaarithm/src/arithm.cpp
@@ -286,111 +286,146 @@ void cv::cuda::gemm(InputArray _src1, InputArray _src2, double alpha, InputArray
 }
 
 //////////////////////////////////////////////////////////////////////////////
-// dft
+// DFT function
 
 void cv::cuda::dft(InputArray _src, OutputArray _dst, Size dft_size, int flags, Stream& stream)
 {
-#ifndef HAVE_CUFFT
-    (void) _src;
-    (void) _dst;
-    (void) dft_size;
-    (void) flags;
-    (void) stream;
-    throw_no_cuda();
-#else
-    GpuMat src = getInputMat(_src, stream);
+    if (_src.channels() == 2) // header query only: compute() already calls getInputMat() itself
+        flags |= DFT_COMPLEX_INPUT; // backwards compatibility: 2-channel input means complex
 
-    CV_Assert( src.type() == CV_32FC1 || src.type() == CV_32FC2 );
+    Ptr<DFT> dft = createDFT(dft_size, flags); // one-shot algorithm object for this call only
+    dft->compute(_src, _dst, stream);
+}
 
-    // We don't support unpacked output (in the case of real input)
-    CV_Assert( !(flags & DFT_COMPLEX_OUTPUT) );
+//////////////////////////////////////////////////////////////////////////////
+// DFT algorithm
 
-    const bool is_1d_input       = (dft_size.height == 1) || (dft_size.width == 1);
-    const bool is_row_dft        = (flags & DFT_ROWS) != 0;
-    const bool is_scaled_dft     = (flags & DFT_SCALE) != 0;
-    const bool is_inverse        = (flags & DFT_INVERSE) != 0;
-    const bool is_complex_input  = src.channels() == 2;
-    const bool is_complex_output = !(flags & DFT_REAL_OUTPUT);
+#ifdef HAVE_CUFFT
 
-    // We don't support real-to-real transform
-    CV_Assert( is_complex_input || is_complex_output );
+namespace
+{
 
-    // Make sure here we work with the continuous input,
-    // as CUFFT can't handle gaps
-    GpuMat src_cont;
-    if (src.isContinuous())
+    class DFTImpl : public DFT // CUFFT-backed DFT: the plan is built once and reused by compute()
     {
-        src_cont = src;
-    }
-    else
-    {
-        BufferPool pool(stream);
-        src_cont.allocator = pool.getAllocator();
-        createContinuous(src.rows, src.cols, src.type(), src_cont);
-        src.copyTo(src_cont, stream);
-    }
+        Size dft_size, dft_size_opt; // requested size; size given to CUFFT (single column as a row)
+        bool is_1d_input, is_row_dft, is_scaled_dft, is_inverse, is_complex_input, is_complex_output;
 
-    Size dft_size_opt = dft_size;
-    if (is_1d_input && !is_row_dft)
-    {
-        // If the source matrix is single column handle it as single row
-        dft_size_opt.width = std::max(dft_size.width, dft_size.height);
-        dft_size_opt.height = std::min(dft_size.width, dft_size.height);
-    }
+        cufftType dft_type; // R2C, C2C or C2R, derived from the flags in the constructor
+        cufftHandle plan;   // created in the constructor, released in the destructor
 
-    CV_Assert( dft_size_opt.width > 1 );
+    public:
+        DFTImpl(Size dft_size, int flags) // decodes the flags, validates them and builds the plan
+            : dft_size(dft_size),
+              dft_size_opt(dft_size),
+              is_1d_input((dft_size.height == 1) || (dft_size.width == 1)),
+              is_row_dft((flags & DFT_ROWS) != 0),
+              is_scaled_dft((flags & DFT_SCALE) != 0),
+              is_inverse((flags & DFT_INVERSE) != 0),
+              is_complex_input((flags & DFT_COMPLEX_INPUT) != 0), // compute() checks inputs against it
+              is_complex_output(!(flags & DFT_REAL_OUTPUT)),
+              dft_type(!is_complex_input ? CUFFT_R2C : (is_complex_output ? CUFFT_C2C : CUFFT_C2R))
+        {
+            // We don't support unpacked output (in the case of real input)
+            CV_Assert( !(flags & DFT_COMPLEX_OUTPUT) );
 
-    cufftType dft_type = CUFFT_R2C;
-    if (is_complex_input)
-        dft_type = is_complex_output ? CUFFT_C2C : CUFFT_C2R;
+            // We don't support real-to-real transform
+            CV_Assert( is_complex_input || is_complex_output );
 
-    cufftHandle plan;
-    if (is_1d_input || is_row_dft)
-        cufftSafeCall( cufftPlan1d(&plan, dft_size_opt.width, dft_type, dft_size_opt.height) );
-    else
-        cufftSafeCall( cufftPlan2d(&plan, dft_size_opt.height, dft_size_opt.width, dft_type) );
+            if (is_1d_input && !is_row_dft)
+            {
+                // If the source matrix is single column handle it as single row
+                dft_size_opt.width = std::max(dft_size.width, dft_size.height);
+                dft_size_opt.height = std::min(dft_size.width, dft_size.height);
+            }
 
-    cufftSafeCall( cufftSetStream(plan, StreamAccessor::getStream(stream)) );
+            CV_Assert( dft_size_opt.width > 1 );
 
-    if (is_complex_input)
-    {
-        if (is_complex_output)
-        {
-            createContinuous(dft_size, CV_32FC2, _dst);
-            GpuMat dst = _dst.getGpuMat();
+            if (is_1d_input || is_row_dft) // batched 1D plan: one width-point transform per row
+                cufftSafeCall( cufftPlan1d(&plan, dft_size_opt.width, dft_type, dft_size_opt.height) );
+            else // single 2D plan covering the whole image
+                cufftSafeCall( cufftPlan2d(&plan, dft_size_opt.height, dft_size_opt.width, dft_type) );
+        }
 
-            cufftSafeCall(cufftExecC2C(
-                    plan, src_cont.ptr<cufftComplex>(), dst.ptr<cufftComplex>(),
-                    is_inverse ? CUFFT_INVERSE : CUFFT_FORWARD));
+        ~DFTImpl() // releases the plan; must never raise
+        {
+            cufftDestroy(plan); // status ignored on purpose: an error raised from a destructor would terminate
         }
-        else
+
+        void compute(InputArray _src, OutputArray _dst, Stream& stream) // runs the stored plan on _src
         {
-            createContinuous(dft_size, CV_32F, _dst);
-            GpuMat dst = _dst.getGpuMat();
+            GpuMat src = getInputMat(_src, stream);
 
-            cufftSafeCall(cufftExecC2R(
-                    plan, src_cont.ptr<cufftComplex>(), dst.ptr<cufftReal>()));
-        }
-    }
-    else
-    {
-        // We could swap dft_size for efficiency. Here we must reflect it
-        if (dft_size == dft_size_opt)
-            createContinuous(Size(dft_size.width / 2 + 1, dft_size.height), CV_32FC2, _dst);
-        else
-            createContinuous(Size(dft_size.width, dft_size.height / 2 + 1), CV_32FC2, _dst);
+            CV_Assert( src.type() == CV_32FC1 || src.type() == CV_32FC2 );
+            CV_Assert( is_complex_input == (src.channels() == 2) ); // must match the flag the plan was built with
 
-        GpuMat dst = _dst.getGpuMat();
+            // Make sure here we work with the continuous input,
+            // as CUFFT can't handle gaps
+            GpuMat src_cont;
+            if (src.isContinuous())
+            {
+                src_cont = src;
+            }
+            else
+            {
+                BufferPool pool(stream);
+                src_cont.allocator = pool.getAllocator();
+                createContinuous(src.rows, src.cols, src.type(), src_cont);
+                src.copyTo(src_cont, stream);
+            }
 
-        cufftSafeCall(cufftExecR2C(
-                plan, src_cont.ptr<cufftReal>(), dst.ptr<cufftComplex>()));
-    }
+            cufftSafeCall( cufftSetStream(plan, StreamAccessor::getStream(stream)) ); // plan is reused, so rebind per call
 
-    cufftSafeCall( cufftDestroy(plan) );
+            if (is_complex_input)
+            {
+                if (is_complex_output)
+                {
+                    createContinuous(dft_size, CV_32FC2, _dst);
+                    GpuMat dst = _dst.getGpuMat();
+
+                    cufftSafeCall(cufftExecC2C(
+                            plan, src_cont.ptr<cufftComplex>(), dst.ptr<cufftComplex>(),
+                            is_inverse ? CUFFT_INVERSE : CUFFT_FORWARD));
+                }
+                else
+                {
+                    createContinuous(dft_size, CV_32F, _dst);
+                    GpuMat dst = _dst.getGpuMat();
+
+                    cufftSafeCall(cufftExecC2R(
+                            plan, src_cont.ptr<cufftComplex>(), dst.ptr<cufftReal>()));
+                }
+            }
+            else
+            {
+                // The plan may use a swapped size (single column handled as a row); halve the matching axis
+                if (dft_size == dft_size_opt)
+                    createContinuous(Size(dft_size.width / 2 + 1, dft_size.height), CV_32FC2, _dst);
+                else
+                    createContinuous(Size(dft_size.width, dft_size.height / 2 + 1), CV_32FC2, _dst);
 
-    if (is_scaled_dft)
-        cuda::multiply(_dst, Scalar::all(1. / dft_size.area()), _dst, 1, -1, stream);
+                GpuMat dst = _dst.getGpuMat();
 
+                cufftSafeCall(cufftExecR2C(
+                                  plan, src_cont.ptr<cufftReal>(), dst.ptr<cufftComplex>()));
+            }
+
+            if (is_scaled_dft) // DFT_SCALE: divide the result by the number of elements in dft_size
+                cuda::multiply(_dst, Scalar::all(1. / dft_size.area()), _dst, 1, -1, stream);
+        }
+    };
+}
+
+#endif
+
+Ptr<DFT> cv::cuda::createDFT(Size dft_size, int flags) // factory for the CUFFT-backed DFT algorithm
+{
+#ifndef HAVE_CUFFT
+    (void) dft_size; // parameters are unused in builds without CUFFT
+    (void) flags;
+    CV_Error(Error::StsNotImplemented, "The library was build without CUFFT");
+    return Ptr<DFT>(); // presumably not reached once CV_Error raises; keeps a return on every path
+#else
+    return makePtr<DFTImpl>(dft_size, flags); // the DFTImpl constructor validates flags and builds the plan
 #endif
 }
 
diff --git a/modules/cudaarithm/test/test_arithm.cpp b/modules/cudaarithm/test/test_arithm.cpp
index 257f5233cc..3e99ed4f3f 100644
--- a/modules/cudaarithm/test/test_arithm.cpp
+++ b/modules/cudaarithm/test/test_arithm.cpp
@@ -250,6 +250,33 @@ CUDA_TEST_P(Dft, C2C)
     }
 }
 
+CUDA_TEST_P(Dft, Algorithm)
+{
+    int cols = randomInt(2, 100);
+    int rows = randomInt(2, 100);
+
+    int flags = cv::DFT_COMPLEX_INPUT; // input below is CV_32FC2; compute() asserts the flag matches
+    cv::Ptr<cv::cuda::DFT> dft = cv::cuda::createDFT(cv::Size(cols, rows), flags);
+
+    for (int i = 0; i < 5; ++i)
+    {
+        SCOPED_TRACE("dft algorithm");
+
+        cv::Mat a = randomMat(cv::Size(cols, rows), CV_32FC2, 0.0, 10.0);
+
+        cv::cuda::GpuMat d_b;
+        // The same DFT object is reused on every iteration.
+        dft->compute(loadMat(a), d_b);
+
+        cv::Mat b_gold;
+        cv::dft(a, b_gold, flags);
+
+        ASSERT_EQ(CV_32F, d_b.depth());
+        ASSERT_EQ(2, d_b.channels());
+        EXPECT_MAT_NEAR(b_gold, cv::Mat(d_b), rows * cols * 1e-4);
+    }
+}
+
 namespace
 {
     void testR2CThenC2R(const std::string& hint, int cols, int rows, bool inplace)
-- 
GitLab