refactor cudaoptflow public API:

* use opaque algorithm interfaces
* add stream support

refactor cudaoptflow public API:
* use opaque algorithm interfaces
* add stream support
381216aa · Vladislav Vinogradov · 19c6bbe7 · 381216aa · 381216aa · 381216aa
7 changed files
--- a/modules/cudaoptflow/include/opencv2/cudaoptflow.hpp
+++ b/modules/cudaoptflow/include/opencv2/cudaoptflow.hpp
@@ -61,49 +61,94 @@ namespace cv { namespace cuda {
 //! @addtogroup cudaoptflow
 //! @{

-/** @brief Class computing the optical flow for two images using Brox et al Optical Flow algorithm
-(@cite Brox2004). :
+//
+// Interface
+//
+
+/** @brief Base interface for dense optical flow algorithms.
 */
-class CV_EXPORTS BroxOpticalFlow
+class CV_EXPORTS DenseOpticalFlow : public Algorithm
 {
 public:
-    BroxOpticalFlow(float alpha_, float gamma_, float scale_factor_, int inner_iterations_, int outer_iterations_, int solver_iterations_) :
-        alpha(alpha_), gamma(gamma_), scale_factor(scale_factor_),
-        inner_iterations(inner_iterations_), outer_iterations(outer_iterations_), solver_iterations(solver_iterations_)
-    {
-    }
+    /** @brief Calculates a dense optical flow.

-    //! Compute optical flow
-    //! frame0 - source frame (supports only CV_32FC1 type)
-    //! frame1 - frame to track (with the same size and type as frame0)
-    //! u      - flow horizontal component (along x axis)
-    //! v      - flow vertical component (along y axis)
-    void operator ()(const GpuMat& frame0, const GpuMat& frame1, GpuMat& u, GpuMat& v, Stream& stream = Stream::Null());
+    @param I0 first input image.
+    @param I1 second input image of the same size and the same type as I0.
+    @param flow computed flow image that has the same size as I0 and type CV_32FC2.
+    @param stream Stream for the asynchronous version.
+     */
+    virtual void calc(InputArray I0, InputArray I1, InputOutputArray flow, Stream& stream = Stream::Null()) = 0;
+};

-    //! flow smoothness
-    float alpha;
+/** @brief Base interface for sparse optical flow algorithms.
+ */
+class CV_EXPORTS SparseOpticalFlow : public Algorithm
+{
+public:
+    /** @brief Calculates a sparse optical flow.
+
+    @param prevImg First input image.
+    @param nextImg Second input image of the same size and the same type as prevImg.
+    @param prevPts Vector of 2D points for which the flow needs to be found.
+    @param nextPts Output vector of 2D points containing the calculated new positions of input features in the second image.
+    @param status Output status vector. Each element of the vector is set to 1 if the
+                  flow for the corresponding features has been found. Otherwise, it is set to 0.
+    @param err Optional output vector that contains error response for each point (inverse confidence).
+    @param stream Stream for the asynchronous version.
+     */
+    virtual void calc(InputArray prevImg, InputArray nextImg,
+                      InputArray prevPts, InputOutputArray nextPts,
+                      OutputArray status,
+                      OutputArray err = cv::noArray(),
+                      Stream& stream = Stream::Null()) = 0;
+};

-    //! gradient constancy importance
-    float gamma;
+//
+// BroxOpticalFlow
+//

-    //! pyramid scale factor
-    float scale_factor;
+/** @brief Class computing the optical flow for two images using Brox et al Optical Flow algorithm (@cite Brox2004).
+ */
+class CV_EXPORTS BroxOpticalFlow : public DenseOpticalFlow
+{
+public:
+    virtual double getFlowSmoothness() const = 0;
+    virtual void setFlowSmoothness(double alpha) = 0;
+
+    virtual double getGradientConstancyImportance() const = 0;
+    virtual void setGradientConstancyImportance(double gamma) = 0;
+
+    virtual double getPyramidScaleFactor() const = 0;
+    virtual void setPyramidScaleFactor(double scale_factor) = 0;

    //! number of lagged non-linearity iterations (inner loop)
-    int inner_iterations;
+    virtual int getInnerIterations() const = 0;
+    virtual void setInnerIterations(int inner_iterations) = 0;

    //! number of warping iterations (number of pyramid levels)
-    int outer_iterations;
+    virtual int getOuterIterations() const = 0;
+    virtual void setOuterIterations(int outer_iterations) = 0;

    //! number of linear system solver iterations
-    int solver_iterations;
-
-    GpuMat buf;
+    virtual int getSolverIterations() const = 0;
+    virtual void setSolverIterations(int solver_iterations) = 0;
+
+    static Ptr<BroxOpticalFlow> create(
+            double alpha = 0.197,
+            double gamma = 50.0,
+            double scale_factor = 0.8,
+            int inner_iterations = 5,
+            int outer_iterations = 150,
+            int solver_iterations = 10);
 };

-/** @brief Class used for calculating an optical flow.
+//
+// PyrLKOpticalFlow
+//
+
+/** @brief Class used for calculating a sparse optical flow.

-The class can calculate an optical flow for a sparse feature set or dense optical flow using the
+The class can calculate an optical flow for a sparse feature set using the
 iterative Lucas-Kanade method with pyramids.

 @sa calcOpticalFlowPyrLK
@@ -112,158 +157,116 @@ iterative Lucas-Kanade method with pyramids.
   -   An example of the Lucas Kanade optical flow algorithm can be found at
        opencv_source_code/samples/gpu/pyrlk_optical_flow.cpp
 */
-class CV_EXPORTS PyrLKOpticalFlow
+class CV_EXPORTS SparsePyrLKOpticalFlow : public SparseOpticalFlow
 {
 public:
-    PyrLKOpticalFlow();
-
-    /** @brief Calculate an optical flow for a sparse feature set.
-
-    @param prevImg First 8-bit input image (supports both grayscale and color images).
-    @param nextImg Second input image of the same size and the same type as prevImg .
-    @param prevPts Vector of 2D points for which the flow needs to be found. It must be one row matrix
-    with CV_32FC2 type.
-    @param nextPts Output vector of 2D points (with single-precision floating-point coordinates)
-    containing the calculated new positions of input features in the second image. When useInitialFlow
-    is true, the vector must have the same size as in the input.
-    @param status Output status vector (CV_8UC1 type). Each element of the vector is set to 1 if the
-    flow for the corresponding features has been found. Otherwise, it is set to 0.
-    @param err Output vector (CV_32FC1 type) that contains the difference between patches around the
-    original and moved points or min eigen value if getMinEigenVals is checked. It can be NULL, if not
-    needed.
-
-    @sa calcOpticalFlowPyrLK
-     */
-    void sparse(const GpuMat& prevImg, const GpuMat& nextImg, const GpuMat& prevPts, GpuMat& nextPts,
-        GpuMat& status, GpuMat* err = 0);
-
-    /** @brief Calculate dense optical flow.
-
-    @param prevImg First 8-bit grayscale input image.
-    @param nextImg Second input image of the same size and the same type as prevImg .
-    @param u Horizontal component of the optical flow of the same size as input images, 32-bit
-    floating-point, single-channel
-    @param v Vertical component of the optical flow of the same size as input images, 32-bit
-    floating-point, single-channel
-    @param err Output vector (CV_32FC1 type) that contains the difference between patches around the
-    original and moved points or min eigen value if getMinEigenVals is checked. It can be NULL, if not
-    needed.
-     */
-    void dense(const GpuMat& prevImg, const GpuMat& nextImg, GpuMat& u, GpuMat& v, GpuMat* err = 0);
-
-    /** @brief Releases inner buffers memory.
-    */
-    void releaseMemory();
+    virtual Size getWinSize() const = 0;
+    virtual void setWinSize(Size winSize) = 0;

-    Size winSize;
-    int maxLevel;
-    int iters;
-    bool useInitialFlow;
+    virtual int getMaxLevel() const = 0;
+    virtual void setMaxLevel(int maxLevel) = 0;

-private:
-    std::vector<GpuMat> prevPyr_;
-    std::vector<GpuMat> nextPyr_;
+    virtual int getNumIters() const = 0;
+    virtual void setNumIters(int iters) = 0;

-    GpuMat buf_;
+    virtual bool getUseInitialFlow() const = 0;
+    virtual void setUseInitialFlow(bool useInitialFlow) = 0;

-    GpuMat uPyr_[2];
-    GpuMat vPyr_[2];
+    static Ptr<SparsePyrLKOpticalFlow> create(
+            Size winSize = Size(21, 21),
+            int maxLevel = 3,
+            int iters = 30,
+            bool useInitialFlow = false);
 };

-/** @brief Class computing a dense optical flow using the Gunnar Farneback’s algorithm. :
+/** @brief Class used for calculating a dense optical flow.
+
+The class can calculate a dense optical flow using the
+iterative Lucas-Kanade method with pyramids.
 */
-class CV_EXPORTS FarnebackOpticalFlow
+class CV_EXPORTS DensePyrLKOpticalFlow : public DenseOpticalFlow
 {
 public:
-    FarnebackOpticalFlow()
-    {
-        numLevels = 5;
-        pyrScale = 0.5;
-        fastPyramids = false;
-        winSize = 13;
-        numIters = 10;
-        polyN = 5;
-        polySigma = 1.1;
-        flags = 0;
-    }
-
-    int numLevels;
-    double pyrScale;
-    bool fastPyramids;
-    int winSize;
-    int numIters;
-    int polyN;
-    double polySigma;
-    int flags;
-
-    /** @brief Computes a dense optical flow using the Gunnar Farneback’s algorithm.
-
-    @param frame0 First 8-bit gray-scale input image
-    @param frame1 Second 8-bit gray-scale input image
-    @param flowx Flow horizontal component
-    @param flowy Flow vertical component
-    @param s Stream
-
-    @sa calcOpticalFlowFarneback
-     */
-    void operator ()(const GpuMat &frame0, const GpuMat &frame1, GpuMat &flowx, GpuMat &flowy, Stream &s = Stream::Null());
+    virtual Size getWinSize() const = 0;
+    virtual void setWinSize(Size winSize) = 0;

-    /** @brief Releases unused auxiliary memory buffers.
-     */
-    void releaseMemory()
-    {
-        frames_[0].release();
-        frames_[1].release();
-        pyrLevel_[0].release();
-        pyrLevel_[1].release();
-        M_.release();
-        bufM_.release();
-        R_[0].release();
-        R_[1].release();
-        blurredFrame_[0].release();
-        blurredFrame_[1].release();
-        pyramid0_.clear();
-        pyramid1_.clear();
-    }
-
-private:
-    void prepareGaussian(
-            int n, double sigma, float *g, float *xg, float *xxg,
-            double &ig11, double &ig03, double &ig33, double &ig55);
-
-    void setPolynomialExpansionConsts(int n, double sigma);
-
-    void updateFlow_boxFilter(
-            const GpuMat& R0, const GpuMat& R1, GpuMat& flowx, GpuMat &flowy,
-            GpuMat& M, GpuMat &bufM, int blockSize, bool updateMatrices, Stream streams[]);
-
-    void updateFlow_gaussianBlur(
-            const GpuMat& R0, const GpuMat& R1, GpuMat& flowx, GpuMat& flowy,
-            GpuMat& M, GpuMat &bufM, int blockSize, bool updateMatrices, Stream streams[]);
-
-    GpuMat frames_[2];
-    GpuMat pyrLevel_[2], M_, bufM_, R_[2], blurredFrame_[2];
-    std::vector<GpuMat> pyramid0_, pyramid1_;
+    virtual int getMaxLevel() const = 0;
+    virtual void setMaxLevel(int maxLevel) = 0;
+
+    virtual int getNumIters() const = 0;
+    virtual void setNumIters(int iters) = 0;
+
+    virtual bool getUseInitialFlow() const = 0;
+    virtual void setUseInitialFlow(bool useInitialFlow) = 0;
+
+    static Ptr<DensePyrLKOpticalFlow> create(
+            Size winSize = Size(13, 13),
+            int maxLevel = 3,
+            int iters = 30,
+            bool useInitialFlow = false);
 };

-// Implementation of the Zach, Pock and Bischof Dual TV-L1 Optical Flow method
 //
-// see reference:
-//   [1] C. Zach, T. Pock and H. Bischof, "A Duality Based Approach for Realtime TV-L1 Optical Flow".
-//   [2] Javier Sanchez, Enric Meinhardt-Llopis and Gabriele Facciolo. "TV-L1 Optical Flow Estimation".
-class CV_EXPORTS OpticalFlowDual_TVL1_CUDA
+// FarnebackOpticalFlow
+//
+
+/** @brief Class computing a dense optical flow using the Gunnar Farneback’s algorithm.
+ */
+class CV_EXPORTS FarnebackOpticalFlow : public DenseOpticalFlow
 {
 public:
-    OpticalFlowDual_TVL1_CUDA();
+    virtual int getNumLevels() const = 0;
+    virtual void setNumLevels(int numLevels) = 0;
+
+    virtual double getPyrScale() const = 0;
+    virtual void setPyrScale(double pyrScale) = 0;
+
+    virtual bool getFastPyramids() const = 0;
+    virtual void setFastPyramids(bool fastPyramids) = 0;

-    void operator ()(const GpuMat& I0, const GpuMat& I1, GpuMat& flowx, GpuMat& flowy);
+    virtual int getWinSize() const = 0;
+    virtual void setWinSize(int winSize) = 0;

-    void collectGarbage();
+    virtual int getNumIters() const = 0;
+    virtual void setNumIters(int numIters) = 0;
+
+    virtual int getPolyN() const = 0;
+    virtual void setPolyN(int polyN) = 0;
+
+    virtual double getPolySigma() const = 0;
+    virtual void setPolySigma(double polySigma) = 0;
+
+    virtual int getFlags() const = 0;
+    virtual void setFlags(int flags) = 0;
+
+    static Ptr<FarnebackOpticalFlow> create(
+            int numLevels = 5,
+            double pyrScale = 0.5,
+            bool fastPyramids = false,
+            int winSize = 13,
+            int numIters = 10,
+            int polyN = 5,
+            double polySigma = 1.1,
+            int flags = 0);
+};
+
+//
+// OpticalFlowDual_TVL1
+//

+/** @brief Implementation of the Zach, Pock and Bischof Dual TV-L1 Optical Flow method.
+ *
+ * @sa C. Zach, T. Pock and H. Bischof, "A Duality Based Approach for Realtime TV-L1 Optical Flow".
+ * @sa Javier Sanchez, Enric Meinhardt-Llopis and Gabriele Facciolo. "TV-L1 Optical Flow Estimation".
+ */
+class CV_EXPORTS OpticalFlowDual_TVL1 : public DenseOpticalFlow
+{
+public:
    /**
     * Time step of the numerical scheme.
     */
-    double tau;
+    virtual double getTau() const = 0;
+    virtual void setTau(double tau) = 0;

    /**
     * Weight parameter for the data term, attachment parameter.
@@ -271,7 +274,8 @@ public:
     * The smaller this parameter is, the smoother the solutions we obtain.
     * It depends on the range of motions of the images, so its value should be adapted to each image sequence.
     */
-    double lambda;
+    virtual double getLambda() const = 0;
+    virtual void setLambda(double lambda) = 0;

    /**
     * Weight parameter for (u - v)^2, tightness parameter.
@@ -279,20 +283,23 @@ public:
     * In theory, it should have a small value in order to maintain both parts in correspondence.
     * The method is stable for a large range of values of this parameter.
     */
+    virtual double getTheta() const = 0;
+    virtual void setTheta(double theta) = 0;

-    double gamma;
    /**
-    * parameter used for motion estimation. It adds a variable allowing for illumination variations
-    * Set this parameter to 1. if you have varying illumination.
-    * See: Chambolle et al, A First-Order Primal-Dual Algorithm for Convex Problems with Applications to Imaging
-    * Journal of Mathematical imaging and vision, may 2011 Vol 40 issue 1, pp 120-145
-    */
-    double theta;
+     * parameter used for motion estimation. It adds a variable allowing for illumination variations
+     * Set this parameter to 1. if you have varying illumination.
+     * See: Chambolle et al, A First-Order Primal-Dual Algorithm for Convex Problems with Applications to Imaging
+     * Journal of Mathematical imaging and vision, may 2011 Vol 40 issue 1, pp 120-145
+     */
+    virtual double getGamma() const = 0;
+    virtual void setGamma(double gamma) = 0;

    /**
     * Number of scales used to create the pyramid of images.
     */
-    int nscales;
+    virtual int getNumScales() const = 0;
+    virtual void setNumScales(int nscales) = 0;

    /**
     * Number of warpings per scale.
@@ -300,51 +307,39 @@ public:
     * This is a parameter that assures the stability of the method.
     * It also affects the running time, so it is a compromise between speed and accuracy.
     */
-    int warps;
+    virtual int getNumWarps() const = 0;
+    virtual void setNumWarps(int warps) = 0;

    /**
     * Stopping criterion threshold used in the numerical scheme, which is a trade-off between precision and running time.
     * A small value will yield more accurate solutions at the expense of a slower convergence.
     */
-    double epsilon;
+    virtual double getEpsilon() const = 0;
+    virtual void setEpsilon(double epsilon) = 0;

    /**
     * Stopping criterion iterations number used in the numerical scheme.
     */
-    int iterations;
-
-    double scaleStep;
-
-    bool useInitialFlow;
-
-private:
-    void procOneScale(const GpuMat& I0, const GpuMat& I1, GpuMat& u1, GpuMat& u2, GpuMat& u3);
-
-    std::vector<GpuMat> I0s;
-    std::vector<GpuMat> I1s;
-    std::vector<GpuMat> u1s;
-    std::vector<GpuMat> u2s;
-    std::vector<GpuMat> u3s;
-
-    GpuMat I1x_buf;
-    GpuMat I1y_buf;
-
-    GpuMat I1w_buf;
-    GpuMat I1wx_buf;
-    GpuMat I1wy_buf;
-
-    GpuMat grad_buf;
-    GpuMat rho_c_buf;
-
-    GpuMat p11_buf;
-    GpuMat p12_buf;
-    GpuMat p21_buf;
-    GpuMat p22_buf;
-    GpuMat p31_buf;
-    GpuMat p32_buf;
-
-    GpuMat diff_buf;
-    GpuMat norm_buf;
+    virtual int getNumIterations() const = 0;
+    virtual void setNumIterations(int iterations) = 0;
+
+    virtual double getScaleStep() const = 0;
+    virtual void setScaleStep(double scaleStep) = 0;
+
+    virtual bool getUseInitialFlow() const = 0;
+    virtual void setUseInitialFlow(bool useInitialFlow) = 0;
+
+    static Ptr<OpticalFlowDual_TVL1> create(
+            double tau = 0.25,
+            double lambda = 0.15,
+            double theta = 0.3,
+            int nscales = 5,
+            int warps = 5,
+            double epsilon = 0.01,
+            int iterations = 300,
+            double scaleStep = 0.8,
+            double gamma = 0.0,
+            bool useInitialFlow = false);
 };

 //! @}

--- a/modules/cudaoptflow/src/brox.cpp
+++ b/modules/cudaoptflow/src/brox.cpp
@@ -47,84 +47,148 @@ using namespace cv::cuda;

 #if !defined (HAVE_CUDA) || !defined (HAVE_OPENCV_CUDALEGACY) || defined (CUDA_DISABLER)

-void cv::cuda::BroxOpticalFlow::operator ()(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); }
+Ptr<BroxOpticalFlow> cv::cuda::BroxOpticalFlow::create(double, double, double, int, int, int) { throw_no_cuda(); return Ptr<BroxOpticalFlow>(); }

 #else

-namespace
-{
-    size_t getBufSize(const NCVBroxOpticalFlowDescriptor& desc, const NCVMatrix<Ncv32f>& frame0, const NCVMatrix<Ncv32f>& frame1,
-                      NCVMatrix<Ncv32f>& u, NCVMatrix<Ncv32f>& v, const cudaDeviceProp& devProp)
+namespace {
+
+    class BroxOpticalFlowImpl : public BroxOpticalFlow
    {
-        NCVMemStackAllocator gpuCounter(static_cast<Ncv32u>(devProp.textureAlignment));
+    public:
+        //! Stores the algorithm parameters. The public API passes the real-valued
+        //! parameters as double; they are narrowed explicitly to the float members,
+        //! in the same way the setters of this class do.
+        BroxOpticalFlowImpl(double alpha, double gamma, double scale_factor,
+                            int inner_iterations, int outer_iterations, int solver_iterations) :
+            alpha_(static_cast<float>(alpha)),
+            gamma_(static_cast<float>(gamma)),
+            scale_factor_(static_cast<float>(scale_factor)),
+            inner_iterations_(inner_iterations), outer_iterations_(outer_iterations),
+            solver_iterations_(solver_iterations)
+        {
+        }
+
+        virtual void calc(InputArray I0, InputArray I1, InputOutputArray flow, Stream& stream);
+
+        virtual double getFlowSmoothness() const { return alpha_; }
+        virtual void setFlowSmoothness(double alpha) { alpha_ = static_cast<float>(alpha); }
+
+        virtual double getGradientConstancyImportance() const { return gamma_; }
+        virtual void setGradientConstancyImportance(double gamma) { gamma_ = static_cast<float>(gamma); }
+
+        virtual double getPyramidScaleFactor() const { return scale_factor_; }
+        virtual void setPyramidScaleFactor(double scale_factor) { scale_factor_ = static_cast<float>(scale_factor); }
+
+        //! number of lagged non-linearity iterations (inner loop)
+        virtual int getInnerIterations() const { return inner_iterations_; }
+        virtual void setInnerIterations(int inner_iterations) { inner_iterations_ = inner_iterations; }
+
+        //! number of warping iterations (number of pyramid levels)
+        virtual int getOuterIterations() const { return outer_iterations_; }
+        virtual void setOuterIterations(int outer_iterations) { outer_iterations_ = outer_iterations; }
+
+        //! number of linear system solver iterations
+        virtual int getSolverIterations() const { return solver_iterations_; }
+        virtual void setSolverIterations(int solver_iterations) { solver_iterations_ = solver_iterations; }
+
+    private:
+        //! flow smoothness
+        float alpha_;
+
+        //! gradient constancy importance
+        float gamma_;
+
+        //! pyramid scale factor
+        float scale_factor_;
+
+        //! number of lagged non-linearity iterations (inner loop)
+        int inner_iterations_;
+
+        //! number of warping iterations (number of pyramid levels)
+        int outer_iterations_;
+
+        //! number of linear system solver iterations
+        int solver_iterations_;
+    };
+
+    static size_t getBufSize(const NCVBroxOpticalFlowDescriptor& desc,
+                             const NCVMatrix<Ncv32f>& frame0, const NCVMatrix<Ncv32f>& frame1,
+                             NCVMatrix<Ncv32f>& u, NCVMatrix<Ncv32f>& v,
+                             size_t textureAlignment)
+    {
+        NCVMemStackAllocator gpuCounter(static_cast<Ncv32u>(textureAlignment));

        ncvSafeCall( NCVBroxOpticalFlow(desc, gpuCounter, frame0, frame1, u, v, 0) );

        return gpuCounter.maxSize();
    }
-}

-namespace
-{
-    static void outputHandler(const String &msg) { CV_Error(cv::Error::GpuApiCallError, msg.c_str()); }
-}
+    static void outputHandler(const String &msg)
+    {
+        CV_Error(cv::Error::GpuApiCallError, msg.c_str());
+    }

-void cv::cuda::BroxOpticalFlow::operator ()(const GpuMat& frame0, const GpuMat& frame1, GpuMat& u, GpuMat& v, Stream& s)
-{
-    ncvSetDebugOutputHandler(outputHandler);
+    void BroxOpticalFlowImpl::calc(InputArray _I0, InputArray _I1, InputOutputArray _flow, Stream& stream)
+    {
+        const GpuMat frame0 = _I0.getGpuMat();
+        const GpuMat frame1 = _I1.getGpuMat();

-    CV_Assert(frame0.type() == CV_32FC1);
-    CV_Assert(frame1.size() == frame0.size() && frame1.type() == frame0.type());
+        CV_Assert( frame0.type() == CV_32FC1 );
+        CV_Assert( frame1.size() == frame0.size() && frame1.type() == frame0.type() );

-    u.create(frame0.size(), CV_32FC1);
-    v.create(frame0.size(), CV_32FC1);
+        ncvSetDebugOutputHandler(outputHandler);

-    cudaDeviceProp devProp;
-    cudaSafeCall( cudaGetDeviceProperties(&devProp, getDevice()) );
+        BufferPool pool(stream);
+        GpuMat u = pool.getBuffer(frame0.size(), CV_32FC1);
+        GpuMat v = pool.getBuffer(frame0.size(), CV_32FC1);

-    NCVBroxOpticalFlowDescriptor desc;
+        NCVBroxOpticalFlowDescriptor desc;
+        desc.alpha = alpha_;
+        desc.gamma = gamma_;
+        desc.scale_factor = scale_factor_;
+        desc.number_of_inner_iterations = inner_iterations_;
+        desc.number_of_outer_iterations = outer_iterations_;
+        desc.number_of_solver_iterations = solver_iterations_;

-    desc.alpha = alpha;
-    desc.gamma = gamma;
-    desc.scale_factor = scale_factor;
-    desc.number_of_inner_iterations = inner_iterations;
-    desc.number_of_outer_iterations = outer_iterations;
-    desc.number_of_solver_iterations = solver_iterations;
+        NCVMemSegment frame0MemSeg;
+        frame0MemSeg.begin.memtype = NCVMemoryTypeDevice;
+        frame0MemSeg.begin.ptr = const_cast<uchar*>(frame0.data);
+        frame0MemSeg.size = frame0.step * frame0.rows;

-    NCVMemSegment frame0MemSeg;
-    frame0MemSeg.begin.memtype = NCVMemoryTypeDevice;
-    frame0MemSeg.begin.ptr = const_cast<uchar*>(frame0.data);
-    frame0MemSeg.size = frame0.step * frame0.rows;
+        NCVMemSegment frame1MemSeg;
+        frame1MemSeg.begin.memtype = NCVMemoryTypeDevice;
+        frame1MemSeg.begin.ptr = const_cast<uchar*>(frame1.data);
+        frame1MemSeg.size = frame1.step * frame1.rows;

-    NCVMemSegment frame1MemSeg;
-    frame1MemSeg.begin.memtype = NCVMemoryTypeDevice;
-    frame1MemSeg.begin.ptr = const_cast<uchar*>(frame1.data);
-    frame1MemSeg.size = frame1.step * frame1.rows;
+        NCVMemSegment uMemSeg;
+        uMemSeg.begin.memtype = NCVMemoryTypeDevice;
+        uMemSeg.begin.ptr = u.ptr();
+        uMemSeg.size = u.step * u.rows;

-    NCVMemSegment uMemSeg;
-    uMemSeg.begin.memtype = NCVMemoryTypeDevice;
-    uMemSeg.begin.ptr = u.ptr();
-    uMemSeg.size = u.step * u.rows;
+        NCVMemSegment vMemSeg;
+        vMemSeg.begin.memtype = NCVMemoryTypeDevice;
+        vMemSeg.begin.ptr = v.ptr();
+        vMemSeg.size = v.step * v.rows;

-    NCVMemSegment vMemSeg;
-    vMemSeg.begin.memtype = NCVMemoryTypeDevice;
-    vMemSeg.begin.ptr = v.ptr();
-    vMemSeg.size = v.step * v.rows;
+        DeviceInfo devInfo;
+        size_t textureAlignment = devInfo.textureAlignment();

-    NCVMatrixReuse<Ncv32f> frame0Mat(frame0MemSeg, static_cast<Ncv32u>(devProp.textureAlignment), frame0.cols, frame0.rows, static_cast<Ncv32u>(frame0.step));
-    NCVMatrixReuse<Ncv32f> frame1Mat(frame1MemSeg, static_cast<Ncv32u>(devProp.textureAlignment), frame1.cols, frame1.rows, static_cast<Ncv32u>(frame1.step));
-    NCVMatrixReuse<Ncv32f> uMat(uMemSeg, static_cast<Ncv32u>(devProp.textureAlignment), u.cols, u.rows, static_cast<Ncv32u>(u.step));
-    NCVMatrixReuse<Ncv32f> vMat(vMemSeg, static_cast<Ncv32u>(devProp.textureAlignment), v.cols, v.rows, static_cast<Ncv32u>(v.step));
+        NCVMatrixReuse<Ncv32f> frame0Mat(frame0MemSeg, static_cast<Ncv32u>(textureAlignment), frame0.cols, frame0.rows, static_cast<Ncv32u>(frame0.step));
+        NCVMatrixReuse<Ncv32f> frame1Mat(frame1MemSeg, static_cast<Ncv32u>(textureAlignment), frame1.cols, frame1.rows, static_cast<Ncv32u>(frame1.step));
+        NCVMatrixReuse<Ncv32f> uMat(uMemSeg, static_cast<Ncv32u>(textureAlignment), u.cols, u.rows, static_cast<Ncv32u>(u.step));
+        NCVMatrixReuse<Ncv32f> vMat(vMemSeg, static_cast<Ncv32u>(textureAlignment), v.cols, v.rows, static_cast<Ncv32u>(v.step));

-    cudaStream_t stream = StreamAccessor::getStream(s);
+        size_t bufSize = getBufSize(desc, frame0Mat, frame1Mat, uMat, vMat, textureAlignment);
+        GpuMat buf = pool.getBuffer(1, static_cast<int>(bufSize), CV_8UC1);

-    size_t bufSize = getBufSize(desc, frame0Mat, frame1Mat, uMat, vMat, devProp);
+        NCVMemStackAllocator gpuAllocator(NCVMemoryTypeDevice, bufSize, static_cast<Ncv32u>(textureAlignment), buf.ptr());

-    ensureSizeIsEnough(1, static_cast<int>(bufSize), CV_8UC1, buf);
+        ncvSafeCall( NCVBroxOpticalFlow(desc, gpuAllocator, frame0Mat, frame1Mat, uMat, vMat, StreamAccessor::getStream(stream)) );

-    NCVMemStackAllocator gpuAllocator(NCVMemoryTypeDevice, bufSize, static_cast<Ncv32u>(devProp.textureAlignment), buf.ptr());
+        GpuMat flows[] = {u, v};
+        cuda::merge(flows, 2, _flow, stream);
+    }
+}

-    ncvSafeCall( NCVBroxOpticalFlow(desc, gpuAllocator, frame0Mat, frame1Mat, uMat, vMat, stream) );
+Ptr<BroxOpticalFlow> cv::cuda::BroxOpticalFlow::create(double alpha, double gamma, double scale_factor, int inner_iterations, int outer_iterations, int solver_iterations)
+{
+    return makePtr<BroxOpticalFlowImpl>(alpha, gamma, scale_factor, inner_iterations, outer_iterations, solver_iterations);
 }

 #endif /* HAVE_CUDA */
--- a/modules/cudaoptflow/src/cuda/pyrlk.cu
+++ b/modules/cudaoptflow/src/cuda/pyrlk.cu
@@ -472,16 +472,16 @@ namespace pyrlk
        }
    }

-    void loadConstants(int2 winSize, int iters)
+    void loadConstants(int2 winSize, int iters, cudaStream_t stream)
    {
-        cudaSafeCall( cudaMemcpyToSymbol(c_winSize_x, &winSize.x, sizeof(int)) );
-        cudaSafeCall( cudaMemcpyToSymbol(c_winSize_y, &winSize.y, sizeof(int)) );
+        cudaSafeCall( cudaMemcpyToSymbolAsync(c_winSize_x, &winSize.x, sizeof(int), 0, cudaMemcpyHostToDevice, stream) );
+        cudaSafeCall( cudaMemcpyToSymbolAsync(c_winSize_y, &winSize.y, sizeof(int), 0, cudaMemcpyHostToDevice, stream) );

        int2 halfWin = make_int2((winSize.x - 1) / 2, (winSize.y - 1) / 2);
-        cudaSafeCall( cudaMemcpyToSymbol(c_halfWin_x, &halfWin.x, sizeof(int)) );
-        cudaSafeCall( cudaMemcpyToSymbol(c_halfWin_y, &halfWin.y, sizeof(int)) );
+        cudaSafeCall( cudaMemcpyToSymbolAsync(c_halfWin_x, &halfWin.x, sizeof(int), 0, cudaMemcpyHostToDevice, stream) );
+        cudaSafeCall( cudaMemcpyToSymbolAsync(c_halfWin_y, &halfWin.y, sizeof(int), 0, cudaMemcpyHostToDevice, stream) );

-        cudaSafeCall( cudaMemcpyToSymbol(c_iters, &iters, sizeof(int)) );
+        cudaSafeCall( cudaMemcpyToSymbolAsync(c_iters, &iters, sizeof(int), 0, cudaMemcpyHostToDevice, stream) );
    }

    void sparse1(PtrStepSzf I, PtrStepSzf J, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,

--- a/modules/cudaoptflow/src/cuda/tvl1flow.cu
+++ b/modules/cudaoptflow/src/cuda/tvl1flow.cu
@@ -66,15 +66,16 @@ namespace tvl1flow
        dy(y, x) = 0.5f * (src(::min(y + 1, src.rows - 1), x) - src(::max(y - 1, 0), x));
    }

-    void centeredGradient(PtrStepSzf src, PtrStepSzf dx, PtrStepSzf dy)
+    void centeredGradient(PtrStepSzf src, PtrStepSzf dx, PtrStepSzf dy, cudaStream_t stream)
    {
        const dim3 block(32, 8);
        const dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y));

-        centeredGradientKernel<<<grid, block>>>(src, dx, dy);
+        centeredGradientKernel<<<grid, block, 0, stream>>>(src, dx, dy);
        cudaSafeCall( cudaGetLastError() );

-        cudaSafeCall( cudaDeviceSynchronize() );
+        if (!stream)
+            cudaSafeCall( cudaDeviceSynchronize() );
    }
 }

@@ -164,7 +165,10 @@ namespace tvl1flow
        rho(y, x) = I1wVal - I1wxVal * u1Val - I1wyVal * u2Val - I0Val;
    }

-    void warpBackward(PtrStepSzf I0, PtrStepSzf I1, PtrStepSzf I1x, PtrStepSzf I1y, PtrStepSzf u1, PtrStepSzf u2, PtrStepSzf I1w, PtrStepSzf I1wx, PtrStepSzf I1wy, PtrStepSzf grad, PtrStepSzf rho)
+    void warpBackward(PtrStepSzf I0, PtrStepSzf I1, PtrStepSzf I1x, PtrStepSzf I1y,
+                      PtrStepSzf u1, PtrStepSzf u2, PtrStepSzf I1w, PtrStepSzf I1wx,
+                      PtrStepSzf I1wy, PtrStepSzf grad, PtrStepSzf rho,
+                      cudaStream_t stream)
    {
        const dim3 block(32, 8);
        const dim3 grid(divUp(I0.cols, block.x), divUp(I0.rows, block.y));
@@ -173,10 +177,11 @@ namespace tvl1flow
        bindTexture(&tex_I1x, I1x);
        bindTexture(&tex_I1y, I1y);

-        warpBackwardKernel<<<grid, block>>>(I0, u1, u2, I1w, I1wx, I1wy, grad, rho);
+        warpBackwardKernel<<<grid, block, 0, stream>>>(I0, u1, u2, I1w, I1wx, I1wy, grad, rho);
        cudaSafeCall( cudaGetLastError() );

-        cudaSafeCall( cudaDeviceSynchronize() );
+        if (!stream)
+            cudaSafeCall( cudaDeviceSynchronize() );
    }
 }

@@ -292,15 +297,17 @@ namespace tvl1flow
                   PtrStepSzf grad, PtrStepSzf rho_c,
                   PtrStepSzf p11, PtrStepSzf p12, PtrStepSzf p21, PtrStepSzf p22, PtrStepSzf p31, PtrStepSzf p32,
                   PtrStepSzf u1, PtrStepSzf u2, PtrStepSzf u3, PtrStepSzf error,
-                   float l_t, float theta, float gamma, bool calcError)
+                   float l_t, float theta, float gamma, bool calcError,
+                   cudaStream_t stream)
    {
        const dim3 block(32, 8);
        const dim3 grid(divUp(I1wx.cols, block.x), divUp(I1wx.rows, block.y));

-        estimateUKernel<<<grid, block>>>(I1wx, I1wy, grad, rho_c, p11, p12, p21, p22, p31, p32, u1, u2, u3, error, l_t, theta, gamma, calcError);
+        estimateUKernel<<<grid, block, 0, stream>>>(I1wx, I1wy, grad, rho_c, p11, p12, p21, p22, p31, p32, u1, u2, u3, error, l_t, theta, gamma, calcError);
        cudaSafeCall( cudaGetLastError() );

-        cudaSafeCall( cudaDeviceSynchronize() );
+        if (!stream)
+            cudaSafeCall( cudaDeviceSynchronize() );
    }
 }

@@ -346,15 +353,19 @@ namespace tvl1flow
        }
    }

-    void estimateDualVariables(PtrStepSzf u1, PtrStepSzf u2, PtrStepSzf u3, PtrStepSzf p11, PtrStepSzf p12, PtrStepSzf p21, PtrStepSzf p22, PtrStepSzf p31, PtrStepSzf p32, float taut, float gamma)
+    void estimateDualVariables(PtrStepSzf u1, PtrStepSzf u2, PtrStepSzf u3,
+                               PtrStepSzf p11, PtrStepSzf p12, PtrStepSzf p21, PtrStepSzf p22, PtrStepSzf p31, PtrStepSzf p32,
+                               float taut, float gamma,
+                               cudaStream_t stream) // CUDA stream the kernel is launched on
    {
        const dim3 block(32, 8);
        const dim3 grid(divUp(u1.cols, block.x), divUp(u1.rows, block.y));

-        estimateDualVariablesKernel<<<grid, block>>>(u1, u2, u3, p11, p12, p21, p22, p31, p32, taut, gamma);
+        estimateDualVariablesKernel<<<grid, block, 0, stream>>>(u1, u2, u3, p11, p12, p21, p22, p31, p32, taut, gamma); // 0 bytes of dynamic shared memory
        cudaSafeCall( cudaGetLastError() );

-        cudaSafeCall( cudaDeviceSynchronize() );
+        if (!stream) // host waits for the device only when the null stream (0) was passed
+            cudaSafeCall( cudaDeviceSynchronize() );
    }
 }


--- a/modules/cudaoptflow/src/farneback.cpp
+++ b/modules/cudaoptflow/src/farneback.cpp
--- a/modules/cudaoptflow/src/pyrlk.cpp
+++ b/modules/cudaoptflow/src/pyrlk.cpp
@@ -47,37 +47,54 @@ using namespace cv::cuda;

 #if !defined (HAVE_CUDA) || defined (CUDA_DISABLER)

-cv::cuda::PyrLKOpticalFlow::PyrLKOpticalFlow() { throw_no_cuda(); }
-void cv::cuda::PyrLKOpticalFlow::sparse(const GpuMat&, const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, GpuMat*) { throw_no_cuda(); }
-void cv::cuda::PyrLKOpticalFlow::dense(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, GpuMat*) { throw_no_cuda(); }
-void cv::cuda::PyrLKOpticalFlow::releaseMemory() {}
+Ptr<SparsePyrLKOpticalFlow> cv::cuda::SparsePyrLKOpticalFlow::create(Size, int, int, bool) { throw_no_cuda(); return Ptr<SparsePyrLKOpticalFlow>(); } // stub compiled when HAVE_CUDA is undefined or CUDA_DISABLER is defined
+
+Ptr<DensePyrLKOpticalFlow> cv::cuda::DensePyrLKOpticalFlow::create(Size, int, int, bool) { throw_no_cuda(); return Ptr<DensePyrLKOpticalFlow>(); } // no-CUDA stub; the dummy return now matches the declared return type

 #else /* !defined (HAVE_CUDA) */

 namespace pyrlk
 {
-    void loadConstants(int2 winSize, int iters);
+    void loadConstants(int2 winSize, int iters, cudaStream_t stream); // now receives the stream explicitly

    void sparse1(PtrStepSzf I, PtrStepSzf J, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,
-                 int level, dim3 block, dim3 patch, cudaStream_t stream = 0);
+                 int level, dim3 block, dim3 patch, cudaStream_t stream); // default "= 0" dropped: callers must pass a stream
    void sparse4(PtrStepSz<float4> I, PtrStepSz<float4> J, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,
-                 int level, dim3 block, dim3 patch, cudaStream_t stream = 0);
+                 int level, dim3 block, dim3 patch, cudaStream_t stream); // float4 variant; stream is likewise mandatory

    void dense(PtrStepSzb I, PtrStepSzf J, PtrStepSzf u, PtrStepSzf v, PtrStepSzf prevU, PtrStepSzf prevV,
-               PtrStepSzf err, int2 winSize, cudaStream_t stream = 0);
-}
-
-cv::cuda::PyrLKOpticalFlow::PyrLKOpticalFlow()
-{
-    winSize = Size(21, 21);
-    maxLevel = 3;
-    iters = 30;
-    useInitialFlow = false;
+               PtrStepSzf err, int2 winSize, cudaStream_t stream); // stream is mandatory here as well
 }

 namespace
 {
-    void calcPatchSize(cv::Size winSize, dim3& block, dim3& patch)
+    class PyrLKOpticalFlowBase // parameters + sparse/dense routines; privately inherited by SparsePyrLKOpticalFlowImpl and DensePyrLKOpticalFlowImpl
+    {
+    public:
+        PyrLKOpticalFlowBase(Size winSize, int maxLevel, int iters, bool useInitialFlow);
+
+        void sparse(const GpuMat& prevImg, const GpuMat& nextImg, const GpuMat& prevPts, GpuMat& nextPts,
+            GpuMat& status, GpuMat* err, Stream& stream); // err may be NULL
+
+        void dense(const GpuMat& prevImg, const GpuMat& nextImg, GpuMat& u, GpuMat& v, Stream& stream); // writes the result into u and v
+
+    protected:
+        Size winSize_; // forwarded to calcPatchSize (sparse) and pyrlk::loadConstants
+        int maxLevel_; // pyramids are resized to maxLevel_ + 1 levels
+        int iters_; // forwarded to pyrlk::loadConstants
+        bool useInitialFlow_; // sparse(): when true, the incoming nextPts are used as starting points
+
+    private:
+        std::vector<GpuMat> prevPyr_; // pyramid built from prevImg; member, so it persists between calls
+        std::vector<GpuMat> nextPyr_; // pyramid built from nextImg; member, so it persists between calls
+    };
+
+    PyrLKOpticalFlowBase::PyrLKOpticalFlowBase(Size winSize, int maxLevel, int iters, bool useInitialFlow) : // only stores the parameters
+        winSize_(winSize), maxLevel_(maxLevel), iters_(iters), useInitialFlow_(useInitialFlow)
+    {
+    } // no validation here; sparse()/dense() assert on the values when called
+
+    void calcPatchSize(Size winSize, dim3& block, dim3& patch)
    {
        if (winSize.width > 32 && winSize.width > 2 * winSize.height)
        {
@@ -95,156 +112,239 @@ namespace

        block.z = patch.z = 1;
    }
-}

-void cv::cuda::PyrLKOpticalFlow::sparse(const GpuMat& prevImg, const GpuMat& nextImg, const GpuMat& prevPts, GpuMat& nextPts, GpuMat& status, GpuMat* err)
-{
-    if (prevPts.empty())
+    void PyrLKOpticalFlowBase::sparse(const GpuMat& prevImg, const GpuMat& nextImg, const GpuMat& prevPts, GpuMat& nextPts, GpuMat& status, GpuMat* err, Stream& stream)
    {
-        nextPts.release();
-        status.release();
-        if (err) err->release();
-        return;
-    }
+        if (prevPts.empty()) // no input points: release the outputs and return
+        {
+            nextPts.release();
+            status.release();
+            if (err) err->release();
+            return;
+        }

-    dim3 block, patch;
-    calcPatchSize(winSize, block, patch);
+        dim3 block, patch;
+        calcPatchSize(winSize_, block, patch); // fills block and patch from the window size

-    CV_Assert(prevImg.channels() == 1 || prevImg.channels() == 3 || prevImg.channels() == 4);
-    CV_Assert(prevImg.size() == nextImg.size() && prevImg.type() == nextImg.type());
-    CV_Assert(maxLevel >= 0);
-    CV_Assert(winSize.width > 2 && winSize.height > 2);
-    CV_Assert(patch.x > 0 && patch.x < 6 && patch.y > 0 && patch.y < 6);
-    CV_Assert(prevPts.rows == 1 && prevPts.type() == CV_32FC2);
+        CV_Assert( prevImg.channels() == 1 || prevImg.channels() == 3 || prevImg.channels() == 4 );
+        CV_Assert( prevImg.size() == nextImg.size() && prevImg.type() == nextImg.type() );
+        CV_Assert( maxLevel_ >= 0 );
+        CV_Assert( winSize_.width > 2 && winSize_.height > 2 );
+        CV_Assert( patch.x > 0 && patch.x < 6 && patch.y > 0 && patch.y < 6 );
+        CV_Assert( prevPts.rows == 1 && prevPts.type() == CV_32FC2 ); // points are a single row of 2-channel floats

-    if (useInitialFlow)
-        CV_Assert(nextPts.size() == prevPts.size() && nextPts.type() == CV_32FC2);
-    else
-        ensureSizeIsEnough(1, prevPts.cols, prevPts.type(), nextPts);
+        if (useInitialFlow_) // caller-provided nextPts must match prevPts in size and type
+            CV_Assert( nextPts.size() == prevPts.size() && nextPts.type() == prevPts.type() );
+        else
+            ensureSizeIsEnough(1, prevPts.cols, prevPts.type(), nextPts);

-    GpuMat temp1 = (useInitialFlow ? nextPts : prevPts).reshape(1);
-    GpuMat temp2 = nextPts.reshape(1);
-    cuda::multiply(temp1, Scalar::all(1.0 / (1 << maxLevel) / 2.0), temp2);
+        GpuMat temp1 = (useInitialFlow_ ? nextPts : prevPts).reshape(1); // single-channel view of the starting points
+        GpuMat temp2 = nextPts.reshape(1); // single-channel view of nextPts (destination of the scaling)
+        cuda::multiply(temp1, Scalar::all(1.0 / (1 << maxLevel_) / 2.0), temp2, 1, -1, stream); // scale factor is 1 / 2^(maxLevel_ + 1)

-    ensureSizeIsEnough(1, prevPts.cols, CV_8UC1, status);
-    status.setTo(Scalar::all(1));
+        ensureSizeIsEnough(1, prevPts.cols, CV_8UC1, status);
+        status.setTo(Scalar::all(1), stream); // every status entry starts at 1

-    if (err)
-        ensureSizeIsEnough(1, prevPts.cols, CV_32FC1, *err);
+        if (err) // err is optional
+            ensureSizeIsEnough(1, prevPts.cols, CV_32FC1, *err);

-    // build the image pyramids.
+        // build the image pyramids.

-    prevPyr_.resize(maxLevel + 1);
-    nextPyr_.resize(maxLevel + 1);
+        BufferPool pool(stream); // source of the temporary 4-channel buffer used for 3-channel input

-    int cn = prevImg.channels();
+        prevPyr_.resize(maxLevel_ + 1);
+        nextPyr_.resize(maxLevel_ + 1);

-    if (cn == 1 || cn == 4)
-    {
-        prevImg.convertTo(prevPyr_[0], CV_32F);
-        nextImg.convertTo(nextPyr_[0], CV_32F);
-    }
-    else
-    {
-        cuda::cvtColor(prevImg, buf_, COLOR_BGR2BGRA);
-        buf_.convertTo(prevPyr_[0], CV_32F);
+        int cn = prevImg.channels();
+
+        if (cn == 1 || cn == 4) // 1- and 4-channel inputs are converted to CV_32F directly
+        {
+            prevImg.convertTo(prevPyr_[0], CV_32F, stream);
+            nextImg.convertTo(nextPyr_[0], CV_32F, stream);
+        }
+        else // 3-channel input (per the assert above): expand to 4 channels first
+        {
+            GpuMat buf = pool.getBuffer(prevImg.size(), CV_MAKE_TYPE(prevImg.depth(), 4));

-        cuda::cvtColor(nextImg, buf_, COLOR_BGR2BGRA);
-        buf_.convertTo(nextPyr_[0], CV_32F);
+            cuda::cvtColor(prevImg, buf, COLOR_BGR2BGRA, 0, stream);
+            buf.convertTo(prevPyr_[0], CV_32F, stream);
+
+            cuda::cvtColor(nextImg, buf, COLOR_BGR2BGRA, 0, stream); // buf is reused for the second image
+            buf.convertTo(nextPyr_[0], CV_32F, stream);
+        }
+
+        for (int level = 1; level <= maxLevel_; ++level) // each level is pyrDown of the level before it
+        {
+            cuda::pyrDown(prevPyr_[level - 1], prevPyr_[level], stream);
+            cuda::pyrDown(nextPyr_[level - 1], nextPyr_[level], stream);
+        }
+
+        pyrlk::loadConstants(make_int2(winSize_.width, winSize_.height), iters_, StreamAccessor::getStream(stream));
+
+        for (int level = maxLevel_; level >= 0; level--) // process levels from maxLevel_ down to 0
+        {
+            if (cn == 1)
+            {
+                pyrlk::sparse1(prevPyr_[level], nextPyr_[level],
+                               prevPts.ptr<float2>(), nextPts.ptr<float2>(),
+                               status.ptr(),
+                               level == 0 && err ? err->ptr<float>() : 0, prevPts.cols, // err is only passed at level 0
+                               level, block, patch,
+                               StreamAccessor::getStream(stream));
+            }
+            else // 3- and 4-channel inputs both use the float4 variant
+            {
+                pyrlk::sparse4(prevPyr_[level], nextPyr_[level],
+                               prevPts.ptr<float2>(), nextPts.ptr<float2>(),
+                               status.ptr(),
+                               level == 0 && err ? err->ptr<float>() : 0, prevPts.cols, // err is only passed at level 0
+                               level, block, patch,
+                               StreamAccessor::getStream(stream));
+            }
+        }
    }

-    for (int level = 1; level <= maxLevel; ++level)
+    void PyrLKOpticalFlowBase::dense(const GpuMat& prevImg, const GpuMat& nextImg, GpuMat& u, GpuMat& v, Stream& stream)
    {
-        cuda::pyrDown(prevPyr_[level - 1], prevPyr_[level]);
-        cuda::pyrDown(nextPyr_[level - 1], nextPyr_[level]);
-    }
+        CV_Assert( prevImg.type() == CV_8UC1 ); // only 8-bit single-channel input is accepted
+        CV_Assert( prevImg.size() == nextImg.size() && prevImg.type() == nextImg.type() );
+        CV_Assert( maxLevel_ >= 0 );
+        CV_Assert( winSize_.width > 2 && winSize_.height > 2 );

-    pyrlk::loadConstants(make_int2(winSize.width, winSize.height), iters);
+        // build the image pyramids.

-    for (int level = maxLevel; level >= 0; level--)
-    {
-        if (cn == 1)
+        prevPyr_.resize(maxLevel_ + 1);
+        nextPyr_.resize(maxLevel_ + 1);
+
+        prevPyr_[0] = prevImg; // previous image is used as-is (CV_8UC1)
+        nextImg.convertTo(nextPyr_[0], CV_32F, stream); // only the next image is converted to CV_32F
+
+        for (int level = 1; level <= maxLevel_; ++level) // each level is pyrDown of the level before it
        {
-            pyrlk::sparse1(prevPyr_[level], nextPyr_[level],
-                prevPts.ptr<float2>(), nextPts.ptr<float2>(), status.ptr(), level == 0 && err ? err->ptr<float>() : 0, prevPts.cols,
-                level, block, patch);
+            cuda::pyrDown(prevPyr_[level - 1], prevPyr_[level], stream);
+            cuda::pyrDown(nextPyr_[level - 1], nextPyr_[level], stream);
        }
-        else
+
+        BufferPool pool(stream); // source of the four temporary flow buffers below
+
+        GpuMat uPyr[] = { // two u buffers that alternate roles between levels
+            pool.getBuffer(prevImg.size(), CV_32FC1),
+            pool.getBuffer(prevImg.size(), CV_32FC1),
+        };
+        GpuMat vPyr[] = { // two v buffers that alternate roles between levels
+            pool.getBuffer(prevImg.size(), CV_32FC1),
+            pool.getBuffer(prevImg.size(), CV_32FC1),
+        };
+
+        uPyr[0].setTo(Scalar::all(0), stream); // all four buffers start zeroed
+        vPyr[0].setTo(Scalar::all(0), stream);
+        uPyr[1].setTo(Scalar::all(0), stream);
+        vPyr[1].setTo(Scalar::all(0), stream);
+
+        int2 winSize2i = make_int2(winSize_.width, winSize_.height);
+        pyrlk::loadConstants(winSize2i, iters_, StreamAccessor::getStream(stream));
+
+        int idx = 0; // index of the buffer pair passed as u/v in the next call
+
+        for (int level = maxLevel_; level >= 0; level--) // process levels from maxLevel_ down to 0
        {
-            pyrlk::sparse4(prevPyr_[level], nextPyr_[level],
-                prevPts.ptr<float2>(), nextPts.ptr<float2>(), status.ptr(), level == 0 && err ? err->ptr<float>() : 0, prevPts.cols,
-                level, block, patch);
+            int idx2 = (idx + 1) & 1; // the other buffer index (0 <-> 1)
+
+            pyrlk::dense(prevPyr_[level], nextPyr_[level],
+                         uPyr[idx], vPyr[idx], uPyr[idx2], vPyr[idx2], // passed as u, v, prevU, prevV
+                         PtrStepSzf(), winSize2i, // empty err argument: no error map is requested
+                         StreamAccessor::getStream(stream));
+
+            if (level > 0) // swap roles except after level 0, so idx still names the u/v pair of the final call
+                idx = idx2;
        }
+
+        uPyr[idx].copyTo(u, stream); // copy the final call's u/v buffers to the outputs
+        vPyr[idx].copyTo(v, stream);
    }
-}

-void cv::cuda::PyrLKOpticalFlow::dense(const GpuMat& prevImg, const GpuMat& nextImg, GpuMat& u, GpuMat& v, GpuMat* err)
-{
-    CV_Assert(prevImg.type() == CV_8UC1);
-    CV_Assert(prevImg.size() == nextImg.size() && prevImg.type() == nextImg.type());
-    CV_Assert(maxLevel >= 0);
-    CV_Assert(winSize.width > 2 && winSize.height > 2);
+    class SparsePyrLKOpticalFlowImpl : public SparsePyrLKOpticalFlow, private PyrLKOpticalFlowBase // public interface + private implementation base
+    {
+    public:
+        SparsePyrLKOpticalFlowImpl(Size winSize, int maxLevel, int iters, bool useInitialFlow) :
+            PyrLKOpticalFlowBase(winSize, maxLevel, iters, useInitialFlow) // parameters are stored in the base
+        {
+        }

-    if (err)
-        err->create(prevImg.size(), CV_32FC1);
+        virtual Size getWinSize() const { return winSize_; } // accessors below read/write the base members directly, no validation
+        virtual void setWinSize(Size winSize) { winSize_ = winSize; }

-    // build the image pyramids.
+        virtual int getMaxLevel() const { return maxLevel_; }
+        virtual void setMaxLevel(int maxLevel) { maxLevel_ = maxLevel; }

-    prevPyr_.resize(maxLevel + 1);
-    nextPyr_.resize(maxLevel + 1);
+        virtual int getNumIters() const { return iters_; }
+        virtual void setNumIters(int iters) { iters_ = iters; }

-    prevPyr_[0] = prevImg;
-    nextImg.convertTo(nextPyr_[0], CV_32F);
+        virtual bool getUseInitialFlow() const { return useInitialFlow_; }
+        virtual void setUseInitialFlow(bool useInitialFlow) { useInitialFlow_ = useInitialFlow; }

-    for (int level = 1; level <= maxLevel; ++level)
+        virtual void calc(InputArray _prevImg, InputArray _nextImg, // unwraps the array proxies and forwards to sparse()
+                          InputArray _prevPts, InputOutputArray _nextPts,
+                          OutputArray _status,
+                          OutputArray _err,
+                          Stream& stream)
+        {
+            const GpuMat prevImg = _prevImg.getGpuMat();
+            const GpuMat nextImg = _nextImg.getGpuMat();
+            const GpuMat prevPts = _prevPts.getGpuMat();
+            GpuMat& nextPts = _nextPts.getGpuMatRef(); // outputs are taken by reference so sparse() writes into the caller's matrices
+            GpuMat& status = _status.getGpuMatRef();
+            GpuMat* err = _err.needed() ? &(_err.getGpuMatRef()) : NULL; // NULL when the caller does not need err
+
+            sparse(prevImg, nextImg, prevPts, nextPts, status, err, stream);
+        }
+    };
+
+    class DensePyrLKOpticalFlowImpl : public DensePyrLKOpticalFlow, private PyrLKOpticalFlowBase // public interface + private implementation base
    {
-        cuda::pyrDown(prevPyr_[level - 1], prevPyr_[level]);
-        cuda::pyrDown(nextPyr_[level - 1], nextPyr_[level]);
-    }
+    public:
+        DensePyrLKOpticalFlowImpl(Size winSize, int maxLevel, int iters, bool useInitialFlow) :
+            PyrLKOpticalFlowBase(winSize, maxLevel, iters, useInitialFlow) // parameters are stored in the base
+        {
+        }

-    ensureSizeIsEnough(prevImg.size(), CV_32FC1, uPyr_[0]);
-    ensureSizeIsEnough(prevImg.size(), CV_32FC1, vPyr_[0]);
-    ensureSizeIsEnough(prevImg.size(), CV_32FC1, uPyr_[1]);
-    ensureSizeIsEnough(prevImg.size(), CV_32FC1, vPyr_[1]);
-    uPyr_[0].setTo(Scalar::all(0));
-    vPyr_[0].setTo(Scalar::all(0));
-    uPyr_[1].setTo(Scalar::all(0));
-    vPyr_[1].setTo(Scalar::all(0));
+        virtual Size getWinSize() const { return winSize_; } // accessors below read/write the base members directly, no validation
+        virtual void setWinSize(Size winSize) { winSize_ = winSize; }

-    int2 winSize2i = make_int2(winSize.width, winSize.height);
-    pyrlk::loadConstants(winSize2i, iters);
+        virtual int getMaxLevel() const { return maxLevel_; }
+        virtual void setMaxLevel(int maxLevel) { maxLevel_ = maxLevel; }

-    PtrStepSzf derr = err ? *err : PtrStepSzf();
+        virtual int getNumIters() const { return iters_; }
+        virtual void setNumIters(int iters) { iters_ = iters; }

-    int idx = 0;
+        virtual bool getUseInitialFlow() const { return useInitialFlow_; }
+        virtual void setUseInitialFlow(bool useInitialFlow) { useInitialFlow_ = useInitialFlow; }

-    for (int level = maxLevel; level >= 0; level--)
-    {
-        int idx2 = (idx + 1) & 1;
+        virtual void calc(InputArray _prevImg, InputArray _nextImg, InputOutputArray _flow, Stream& stream) // runs dense() and merges u/v into _flow
+        {
+            const GpuMat prevImg = _prevImg.getGpuMat();
+            const GpuMat nextImg = _nextImg.getGpuMat();

-        pyrlk::dense(prevPyr_[level], nextPyr_[level], uPyr_[idx], vPyr_[idx], uPyr_[idx2], vPyr_[idx2],
-            level == 0 ? derr : PtrStepSzf(), winSize2i);
+            BufferPool pool(stream); // temporary u and v planes come from the pool
+            GpuMat u = pool.getBuffer(prevImg.size(), CV_32FC1);
+            GpuMat v = pool.getBuffer(prevImg.size(), CV_32FC1);

-        if (level > 0)
-            idx = idx2;
-    }
+            dense(prevImg, nextImg, u, v, stream); // fills u and v

-    uPyr_[idx].copyTo(u);
-    vPyr_[idx].copyTo(v);
+            GpuMat flows[] = {u, v}; // channel order of the merged output: u first, then v
+            cuda::merge(flows, 2, _flow, stream);
+        }
+    };
 }

-void cv::cuda::PyrLKOpticalFlow::releaseMemory()
+Ptr<SparsePyrLKOpticalFlow> cv::cuda::SparsePyrLKOpticalFlow::create(Size winSize, int maxLevel, int iters, bool useInitialFlow)
 {
-    prevPyr_.clear();
-    nextPyr_.clear();
-
-    buf_.release();
-
-    uPyr_[0].release();
-    vPyr_[0].release();
+    return makePtr<SparsePyrLKOpticalFlowImpl>(winSize, maxLevel, iters, useInitialFlow); // factory: the concrete Impl type stays local to this file
+}

-    uPyr_[1].release();
-    vPyr_[1].release();
+Ptr<DensePyrLKOpticalFlow> cv::cuda::DensePyrLKOpticalFlow::create(Size winSize, int maxLevel, int iters, bool useInitialFlow)
+{
+    return makePtr<DensePyrLKOpticalFlowImpl>(winSize, maxLevel, iters, useInitialFlow); // factory: the concrete Impl type stays local to this file
 }

 #endif /* !defined (HAVE_CUDA) */
--- a/modules/cudaoptflow/src/tvl1flow.cpp
+++ b/modules/cudaoptflow/src/tvl1flow.cpp