Merge pull request #2807 from vchiluka5:NVIDIA_OPTICAL_FLOW_SDK_2.0_INTEGRATION

1c957bea · Alexander Alekhin · 0b6b8ff9 · 582fe44b · 1c957bea · 1c957bea
9 changed file
--- a/modules/cudaoptflow/CMakeLists.txt
+++ b/modules/cudaoptflow/CMakeLists.txt
@@ -8,21 +8,21 @@ ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4127 /wd4324 /wd4512 -Wundef -Wmissing-d

 ocv_define_module(cudaoptflow opencv_video opencv_optflow opencv_cudaarithm opencv_cudawarping opencv_cudaimgproc OPTIONAL opencv_cudalegacy WRAP python)

-set(NVIDIA_OPTICAL_FLOW_1_0_HEADERS_COMMIT "79c6cee80a2df9a196f20afd6b598a9810964c32")
-set(NVIDIA_OPTICAL_FLOW_1_0_HEADERS_MD5 "ca5acedee6cb45d0ec610a6732de5c15")
-set(NVIDIA_OPTICAL_FLOW_1_0_HEADERS_PATH "${OpenCV_BINARY_DIR}/3rdparty/NVIDIAOpticalFlowSDK_1_0_Headers")
-ocv_download(FILENAME "${NVIDIA_OPTICAL_FLOW_1_0_HEADERS_COMMIT}.zip"
-               HASH ${NVIDIA_OPTICAL_FLOW_1_0_HEADERS_MD5}
+set(NVIDIA_OPTICAL_FLOW_2_0_HEADERS_COMMIT "edb50da3cf849840d680249aa6dbef248ebce2ca")
+set(NVIDIA_OPTICAL_FLOW_2_0_HEADERS_MD5 "a73cd48b18dcc0cc8933b30796074191")
+set(NVIDIA_OPTICAL_FLOW_2_0_HEADERS_PATH "${OpenCV_BINARY_DIR}/3rdparty/NVIDIAOpticalFlowSDK_2_0_Headers")
+ocv_download(FILENAME "${NVIDIA_OPTICAL_FLOW_2_0_HEADERS_COMMIT}.zip"
+               HASH ${NVIDIA_OPTICAL_FLOW_2_0_HEADERS_MD5}
               URL
                 "https://github.com/NVIDIA/NVIDIAOpticalFlowSDK/archive/"
-               DESTINATION_DIR "${NVIDIA_OPTICAL_FLOW_1_0_HEADERS_PATH}"
-               STATUS NVIDIA_OPTICAL_FLOW_1_0_HEADERS_DOWNLOAD_SUCCESS
+               DESTINATION_DIR "${NVIDIA_OPTICAL_FLOW_2_0_HEADERS_PATH}"
+               STATUS NVIDIA_OPTICAL_FLOW_2_0_HEADERS_DOWNLOAD_SUCCESS
               ID "NVIDIA_OPTICAL_FLOW"
               UNPACK RELATIVE_URL)

-if(NOT NVIDIA_OPTICAL_FLOW_1_0_HEADERS_DOWNLOAD_SUCCESS)
-  message(STATUS "Failed to download NVIDIA_Optical_Flow_1_0 Headers")
+if(NOT NVIDIA_OPTICAL_FLOW_2_0_HEADERS_DOWNLOAD_SUCCESS)
+  message(STATUS "Failed to download NVIDIA_Optical_Flow_2_0 Headers")
 else()
  add_definitions(-DHAVE_NVIDIA_OPTFLOW=1)
-  ocv_include_directories(SYSTEM "${NVIDIA_OPTICAL_FLOW_1_0_HEADERS_PATH}/NVIDIAOpticalFlowSDK-${NVIDIA_OPTICAL_FLOW_1_0_HEADERS_COMMIT}")
+  ocv_include_directories(SYSTEM "${NVIDIA_OPTICAL_FLOW_2_0_HEADERS_PATH}/NVIDIAOpticalFlowSDK-${NVIDIA_OPTICAL_FLOW_2_0_HEADERS_COMMIT}")
 endif()
\ No newline at end of file
--- a/modules/cudaoptflow/include/opencv2/cudaoptflow.hpp
+++ b/modules/cudaoptflow/include/opencv2/cudaoptflow.hpp
@@ -392,9 +392,9 @@ public:
 /** @brief Class for computing the optical flow vectors between two images using NVIDIA Optical Flow hardware and Optical Flow SDK 1.0.
 @note
 - A sample application demonstrating the use of NVIDIA Optical Flow can be found at
-opencv_source_code/samples/gpu/nvidia_optical_flow.cpp
+opencv_contrib_source_code/modules/cudaoptflow/samples/nvidia_optical_flow.cpp
 - An example application comparing accuracy and performance of NVIDIA Optical Flow with other optical flow algorithms in OpenCV can be found at
-opencv_source_code/samples/gpu/optical_flow.cpp
+opencv_contrib_source_code/modules/cudaoptflow/samples/optical_flow.cpp
 */

 class CV_EXPORTS_W NvidiaOpticalFlow_1_0 : public NvidiaHWOpticalFlow
@@ -417,18 +417,16 @@ public:
    * using nearest neighbour upsampling method.

    @param flow Buffer of type CV_16FC2 containing flow vectors generated by calc().
-    @param width Width of the input image in pixels for which these flow vectors were generated.
-    @param height Height of the input image in pixels for which these flow vectors were generated.
+    @param imageSize Size of the input image in pixels for which these flow vectors were generated.
    @param gridSize Granularity of the optical flow vectors returned by calc() function. Can be queried using getGridSize().
    @param upsampledFlow Buffer of type CV_32FC2, containing upsampled flow vectors, each flow vector for 1 pixel, in the pitch-linear layout.
    */
-    CV_WRAP virtual void upSampler(InputArray flow, int width, int height,
+    CV_WRAP virtual void upSampler(InputArray flow, cv::Size imageSize,
        int gridSize, InputOutputArray upsampledFlow) = 0;

    /** @brief Instantiate NVIDIA Optical Flow

-    @param width Width of input image in pixels.
-    @param height Height of input image in pixels.
+    @param imageSize Size of input image in pixels.
    @param perfPreset Optional parameter. Refer [NV OF SDK documentation](https://developer.nvidia.com/opticalflow-sdk) for details about presets.
                      Defaults to NV_OF_PERF_LEVEL_SLOW.
    @param enableTemporalHints Optional parameter. Flag to enable temporal hints. When set to true, the hardware uses the flow vectors
@@ -445,10 +443,142 @@ public:
                        If output stream is not set, the execute function will use default stream which is NULL stream;
    */
    CV_WRAP static Ptr<NvidiaOpticalFlow_1_0> create(
-        int width,
-        int height,
+        cv::Size imageSize,
        cv::cuda::NvidiaOpticalFlow_1_0::NVIDIA_OF_PERF_LEVEL perfPreset
-        = cv::cuda::NvidiaOpticalFlow_1_0::NVIDIA_OF_PERF_LEVEL::NV_OF_PERF_LEVEL_SLOW,
+        = cv::cuda::NvidiaOpticalFlow_1_0::NV_OF_PERF_LEVEL_SLOW,
+        bool enableTemporalHints = false,
+        bool enableExternalHints = false,
+        bool enableCostBuffer = false,
+        int gpuId = 0,
+        Stream& inputStream = Stream::Null(),
+        Stream& outputStream = Stream::Null());
+};
+
+/** @brief Class for computing the optical flow vectors between two images using NVIDIA Optical Flow hardware and Optical Flow SDK 2.0.
+@note
+- A sample application demonstrating the use of NVIDIA Optical Flow can be found at
+opencv_contrib_source_code/modules/cudaoptflow/samples/nvidia_optical_flow.cpp
+- An example application comparing accuracy and performance of NVIDIA Optical Flow with other optical flow algorithms in OpenCV can be found at
+opencv_contrib_source_code/modules/cudaoptflow/samples/optical_flow.cpp
+*/
+
+class CV_EXPORTS_W NvidiaOpticalFlow_2_0 : public NvidiaHWOpticalFlow
+{
+public:
+    /**
+    * Supported optical flow performance levels.
+    */
+    enum NVIDIA_OF_PERF_LEVEL
+    {
+        NV_OF_PERF_LEVEL_UNDEFINED,
+        NV_OF_PERF_LEVEL_SLOW = 5,                   /**< Slow perf level results in lowest performance and best quality */
+        NV_OF_PERF_LEVEL_MEDIUM = 10,                /**< Medium perf level results in low performance and medium quality */
+        NV_OF_PERF_LEVEL_FAST = 20,                  /**< Fast perf level results in high performance and low quality */
+        NV_OF_PERF_LEVEL_MAX
+    };
+
+    /**
+    * Supported grid size for output buffer.
+    */
+    enum NVIDIA_OF_OUTPUT_VECTOR_GRID_SIZE
+    {
+        NV_OF_OUTPUT_VECTOR_GRID_SIZE_UNDEFINED,
+        NV_OF_OUTPUT_VECTOR_GRID_SIZE_1 = 1,          /**< Output buffer grid size is 1x1  */
+        NV_OF_OUTPUT_VECTOR_GRID_SIZE_2 = 2,          /**< Output buffer grid size is 2x2  */
+        NV_OF_OUTPUT_VECTOR_GRID_SIZE_4 = 4,          /**< Output buffer grid size is 4x4  */
+        NV_OF_OUTPUT_VECTOR_GRID_SIZE_MAX
+    };
+
+    /**
+    * Supported grid size for hint buffer.
+    */
+    enum NVIDIA_OF_HINT_VECTOR_GRID_SIZE
+    {
+        NV_OF_HINT_VECTOR_GRID_SIZE_UNDEFINED,
+        NV_OF_HINT_VECTOR_GRID_SIZE_1 = 1,            /**< Hint buffer grid size is 1x1.*/
+        NV_OF_HINT_VECTOR_GRID_SIZE_2 = 2,            /**< Hint buffer grid size is 2x2.*/
+        NV_OF_HINT_VECTOR_GRID_SIZE_4 = 4,            /**< Hint buffer grid size is 4x4.*/
+        NV_OF_HINT_VECTOR_GRID_SIZE_8 = 8,            /**< Hint buffer grid size is 8x8.*/
+        NV_OF_HINT_VECTOR_GRID_SIZE_MAX
+    };
+
+    /** @brief convertToFloat() helper function converts the hardware-generated flow vectors to floating point representation (1 flow vector for gridSize).
+    * gridSize can be queried via function getGridSize().
+
+    @param flow Buffer of type CV_16FC2 containing flow vectors generated by calc().
+    @param floatFlow Buffer of type CV_32FC2, containing flow vectors in floating point representation, each flow vector for 1 pixel per gridSize, in the pitch-linear layout.
+    */
+    CV_WRAP virtual void convertToFloat(InputArray flow, InputOutputArray floatFlow) = 0;
+
+    /** @brief Instantiate NVIDIA Optical Flow
+
+    @param imageSize Size of input image in pixels.
+    @param perfPreset Optional parameter. Refer [NV OF SDK documentation](https://developer.nvidia.com/opticalflow-sdk) for details about presets.
+                      Defaults to NV_OF_PERF_LEVEL_SLOW.
+    @param outputGridSize Optional parameter. Refer [NV OF SDK documentation](https://developer.nvidia.com/opticalflow-sdk) for details about output grid sizes.
+                          Defaults to NV_OF_OUTPUT_VECTOR_GRID_SIZE_1.
+    @param hintGridSize Optional parameter. Refer [NV OF SDK documentation](https://developer.nvidia.com/opticalflow-sdk) for details about hint grid sizes.
+                        Defaults to NV_OF_HINT_VECTOR_GRID_SIZE_1.
+    @param enableTemporalHints Optional parameter. Flag to enable temporal hints. When set to true, the hardware uses the flow vectors
+                               generated in previous call to calc() as internal hints for the current call to calc().
+                               Useful when computing flow vectors between successive video frames. Defaults to false.
+    @param enableExternalHints Optional Parameter. Flag to enable passing external hints buffer to calc(). Defaults to false.
+    @param enableCostBuffer Optional Parameter. Flag to enable cost buffer output from calc(). Defaults to false.
+    @param gpuId Optional parameter to select the GPU ID on which the optical flow should be computed. Useful in multi-GPU systems. Defaults to 0.
+    @param inputStream Optical flow algorithm may optionally involve CUDA preprocessing on the input buffers.
+                       The input CUDA stream can be used to pipeline and synchronize the CUDA preprocessing tasks with OF HW engine.
+                       If input stream is not set, the execute function will use the default stream, which is the NULL stream.
+    @param outputStream Optical flow algorithm may optionally involve CUDA post processing on the output flow vectors.
+                        The output CUDA stream can be used to pipeline and synchronize the CUDA post processing tasks with OF HW engine.
+                        If output stream is not set, the execute function will use the default stream, which is the NULL stream.
+    */
+    CV_WRAP static Ptr<NvidiaOpticalFlow_2_0> create(
+        cv::Size imageSize,
+        cv::cuda::NvidiaOpticalFlow_2_0::NVIDIA_OF_PERF_LEVEL perfPreset
+        = cv::cuda::NvidiaOpticalFlow_2_0::NV_OF_PERF_LEVEL_SLOW,
+        cv::cuda::NvidiaOpticalFlow_2_0::NVIDIA_OF_OUTPUT_VECTOR_GRID_SIZE outputGridSize
+        = cv::cuda::NvidiaOpticalFlow_2_0::NV_OF_OUTPUT_VECTOR_GRID_SIZE_1,
+        cv::cuda::NvidiaOpticalFlow_2_0::NVIDIA_OF_HINT_VECTOR_GRID_SIZE hintGridSize
+        = cv::cuda::NvidiaOpticalFlow_2_0::NV_OF_HINT_VECTOR_GRID_SIZE_1,
+        bool enableTemporalHints = false,
+        bool enableExternalHints = false,
+        bool enableCostBuffer = false,
+        int gpuId = 0,
+        Stream& inputStream = Stream::Null(),
+        Stream& outputStream = Stream::Null());
+
+    /**  @brief Instantiate NVIDIA Optical Flow with ROI Feature
+
+    @param imageSize Size of input image in pixels.
+    @param roiData Vector of rectangles, one per region of interest (ROI).
+    @param perfPreset Optional parameter. Refer [NV OF SDK documentation](https://developer.nvidia.com/opticalflow-sdk) for details about presets.
+                      Defaults to NV_OF_PERF_LEVEL_SLOW.
+    @param outputGridSize Optional parameter. Refer [NV OF SDK documentation](https://developer.nvidia.com/opticalflow-sdk) for details about output grid sizes.
+                          Defaults to NV_OF_OUTPUT_VECTOR_GRID_SIZE_1.
+    @param hintGridSize Optional parameter. Refer [NV OF SDK documentation](https://developer.nvidia.com/opticalflow-sdk) for details about hint grid sizes.
+                        Defaults to NV_OF_HINT_VECTOR_GRID_SIZE_1.
+    @param enableTemporalHints Optional parameter. Flag to enable temporal hints. When set to true, the hardware uses the flow vectors
+                               generated in previous call to calc() as internal hints for the current call to calc().
+                               Useful when computing flow vectors between successive video frames. Defaults to false.
+    @param enableExternalHints Optional Parameter. Flag to enable passing external hints buffer to calc(). Defaults to false.
+    @param enableCostBuffer Optional Parameter. Flag to enable cost buffer output from calc(). Defaults to false.
+    @param gpuId Optional parameter to select the GPU ID on which the optical flow should be computed. Useful in multi-GPU systems. Defaults to 0.
+    @param inputStream Optical flow algorithm may optionally involve CUDA preprocessing on the input buffers.
+                       The input CUDA stream can be used to pipeline and synchronize the CUDA preprocessing tasks with OF HW engine.
+                       If input stream is not set, the execute function will use the default stream, which is the NULL stream.
+    @param outputStream Optical flow algorithm may optionally involve CUDA post processing on the output flow vectors.
+                        The output CUDA stream can be used to pipeline and synchronize the CUDA post processing tasks with OF HW engine.
+                        If output stream is not set, the execute function will use the default stream, which is the NULL stream.
+    */
+    CV_WRAP static Ptr<NvidiaOpticalFlow_2_0> create(
+        cv::Size imageSize,
+        std::vector<Rect> roiData,
+        cv::cuda::NvidiaOpticalFlow_2_0::NVIDIA_OF_PERF_LEVEL perfPreset
+        = cv::cuda::NvidiaOpticalFlow_2_0::NV_OF_PERF_LEVEL_SLOW,
+        cv::cuda::NvidiaOpticalFlow_2_0::NVIDIA_OF_OUTPUT_VECTOR_GRID_SIZE outputGridSize
+        = cv::cuda::NvidiaOpticalFlow_2_0::NV_OF_OUTPUT_VECTOR_GRID_SIZE_1,
+        cv::cuda::NvidiaOpticalFlow_2_0::NVIDIA_OF_HINT_VECTOR_GRID_SIZE hintGridSize
+        = cv::cuda::NvidiaOpticalFlow_2_0::NV_OF_HINT_VECTOR_GRID_SIZE_1,
        bool enableTemporalHints = false,
        bool enableExternalHints = false,
        bool enableCostBuffer = false,

--- a/modules/cudaoptflow/misc/python/test/test_nvidiaopticalflow.py
+++ b/modules/cudaoptflow/misc/python/test/test_nvidiaopticalflow.py
+import os
+import cv2 as cv
+import numpy as np
+
+from tests_common import NewOpenCVTests, unittest
+
+class nvidiaopticalflow_test(NewOpenCVTests):
+    """Smoke test for the Python bindings of cv.cuda_NvidiaOpticalFlow_1_0."""
+
+    def setUp(self):
+        """Skip every test of this class when no CUDA device is reported."""
+        super(nvidiaopticalflow_test, self).setUp()
+        if not cv.cuda.getCudaEnabledDeviceCount():
+            self.skipTest("No CUDA-capable device is detected")
+
+    @unittest.skipIf('OPENCV_TEST_DATA_PATH' not in os.environ,
+                        "OPENCV_TEST_DATA_PATH is not defined")
+    def test_calc(self):
+        """Run calc() and upSampler() on two test frames and check that the
+        results are non-empty.
+
+        The test is skipped when creation or execution fails with one of the
+        error codes that this test treats as "not supported in the current
+        environment"; any other cv.error is re-raised so that real failures
+        are reported instead of being hidden.
+        """
+        frame1 = os.environ['OPENCV_TEST_DATA_PATH'] + '/gpu/opticalflow/frame0.png'
+        frame2 = os.environ['OPENCV_TEST_DATA_PATH'] + '/gpu/opticalflow/frame1.png'
+
+        npMat1 = cv.cvtColor(cv.imread(frame1),cv.COLOR_BGR2GRAY)
+        npMat2 = cv.cvtColor(cv.imread(frame2),cv.COLOR_BGR2GRAY)
+
+        cuMat1 = cv.cuda_GpuMat(npMat1)
+        cuMat2 = cv.cuda_GpuMat(npMat2)
+
+        # create() and upSampler() take the image size as one cv::Size argument,
+        # which is passed from Python as a (width, height) tuple. The size is
+        # read from the numpy array (rows, cols) that was uploaded to the GPU.
+        imageSize = (npMat1.shape[1], npMat1.shape[0])
+        try:
+            # 5 is presumably NV_OF_PERF_LEVEL_SLOW -- TODO confirm against the 1_0 enum.
+            nvof = cv.cuda_NvidiaOpticalFlow_1_0.create(imageSize, 5, False, False, False, 0)
+            flow = nvof.calc(cuMat1, cuMat2, None)
+            # NOTE(review): `flow` is used both as an array (`flow.shape`) and as a
+            # sequence (`flow[0]`) below; confirm what calc() returns in Python.
+            self.assertTrue(flow.shape[1] > 0 and flow.shape[0] > 0)
+            flowUpSampled = nvof.upSampler(flow[0], imageSize, nvof.getGridSize(), None)
+            nvof.collectGarbage()
+            # Checked inside the try block so it can never run with
+            # flowUpSampled unbound.
+            self.assertTrue(flowUpSampled.shape[1] > 0 and flowUpSampled.shape[0] > 0)
+        except cv.error as e:
+            if e.code == cv.Error.StsBadFunc or e.code == cv.Error.StsBadArg or e.code == cv.Error.StsNullPtr:
+                self.skipTest("Algorithm is not supported in the current environment")
+            else:
+                raise
+
+if __name__ == '__main__':
+    NewOpenCVTests.bootstrap()
\ No newline at end of file
--- a/modules/cudaoptflow/perf/perf_optflow.cpp
+++ b/modules/cudaoptflow/perf/perf_optflow.cpp
@@ -339,13 +339,8 @@ PERF_TEST_P(ImagePair, NvidiaOpticalFlow_1_0,

    const cv::Mat frame1 = readImage(GetParam().second, cv::IMREAD_GRAYSCALE);
    ASSERT_FALSE(frame1.empty());
-
-    const int width = frame0.size().width;
-    const int height = frame0.size().height;
-    const bool enableTemporalHints = false;
-    const bool enableExternalHints = false;
-    const bool enableCostBuffer = false;
-    const int gpuid = 0;
+    Stream inputStream;
+    Stream outputStream;

    if (PERF_RUN_CUDA())
    {
@@ -355,9 +350,9 @@ PERF_TEST_P(ImagePair, NvidiaOpticalFlow_1_0,
        cv::Ptr<cv::cuda::NvidiaOpticalFlow_1_0> d_nvof;
        try
        {
-            d_nvof = cv::cuda::NvidiaOpticalFlow_1_0::create(width, height,
+            d_nvof = cv::cuda::NvidiaOpticalFlow_1_0::create(frame0.size(),
                cv::cuda::NvidiaOpticalFlow_1_0::NVIDIA_OF_PERF_LEVEL::NV_OF_PERF_LEVEL_FAST,
-                enableTemporalHints, enableExternalHints, enableCostBuffer, gpuid);
+                false, false, false, 0, inputStream, outputStream);
        }
        catch (const cv::Exception& e)
        {
@@ -376,6 +371,63 @@ PERF_TEST_P(ImagePair, NvidiaOpticalFlow_1_0,

        CUDA_SANITY_CHECK(u, 1e-10);
        CUDA_SANITY_CHECK(v, 1e-10);
+
+        d_nvof->collectGarbage();
+    }
+}
+
+//////////////////////////////////////////////////////
+// NvidiaOpticalFlow_2_0
+
+PERF_TEST_P(ImagePair, NvidiaOpticalFlow_2_0,
+    Values<pair_string>(make_pair("gpu/opticalflow/frame0.png", "gpu/opticalflow/frame1.png")))
+{
+    // Shorthand for the class under test.
+    typedef cv::cuda::NvidiaOpticalFlow_2_0 NvOF;
+
+    declare.time(10);
+
+    // Both frames are loaded as grayscale; the test stops if either one is missing.
+    const cv::Mat frame0 = readImage(GetParam().first, cv::IMREAD_GRAYSCALE);
+    ASSERT_FALSE(frame0.empty());
+
+    const cv::Mat frame1 = readImage(GetParam().second, cv::IMREAD_GRAYSCALE);
+    ASSERT_FALSE(frame1.empty());
+
+    Stream inputStream;
+    Stream outputStream;
+
+    if (PERF_RUN_CUDA())
+    {
+        const cv::cuda::GpuMat d_frame0(frame0);
+        const cv::cuda::GpuMat d_frame1(frame1);
+        cv::cuda::GpuMat d_flow;
+        cv::Ptr<NvOF> d_nvof;
+        try
+        {
+            d_nvof = NvOF::create(
+                frame0.size(),
+                NvOF::NV_OF_PERF_LEVEL_FAST,
+                NvOF::NV_OF_OUTPUT_VECTOR_GRID_SIZE_1,
+                NvOF::NV_OF_HINT_VECTOR_GRID_SIZE_1,
+                false,  // enableTemporalHints
+                false,  // enableExternalHints
+                false,  // enableCostBuffer
+                0,      // gpuId
+                inputStream, outputStream);
+        }
+        catch (const cv::Exception& e)
+        {
+            // These three codes are treated as "cannot run here"; anything else
+            // is propagated unchanged.
+            const bool unsupported = e.code == Error::StsBadFunc
+                || e.code == Error::StsBadArg
+                || e.code == Error::StsNullPtr;
+            if (!unsupported)
+                throw;
+            throw SkipTestException("Current configuration is not supported");
+        }
+
+        TEST_CYCLE() d_nvof->calc(d_frame0, d_frame1, d_flow);
+
+        // Separate the two flow components so each one is checked on its own.
+        cv::cuda::GpuMat planes[2];
+        cv::cuda::split(d_flow, planes);
+
+        // The identifiers u and v are handed to the sanity-check macro, so they
+        // are kept as-is (presumably the macro uses the argument name -- verify).
+        cv::cuda::GpuMat u = planes[0];
+        cv::cuda::GpuMat v = planes[1];
+
+        CUDA_SANITY_CHECK(u, 1e-10);
+        CUDA_SANITY_CHECK(v, 1e-10);
+
+        d_nvof->collectGarbage();
     }
 }


--- a/modules/cudaoptflow/samples/nvidia_optical_flow.cpp
+++ b/modules/cudaoptflow/samples/nvidia_optical_flow.cpp
@@ -2,6 +2,7 @@
 #include <iostream>
 #include <fstream>
 #include <iomanip>
+#include <iterator>

 #include "opencv2/core.hpp"
 #include "opencv2/core/utility.hpp"
@@ -11,7 +12,6 @@
 #include "opencv2/cudaarithm.hpp"
 #include "opencv2/video/tracking.hpp"

-using namespace std;
 using namespace cv;
 using namespace cv::cuda;

@@ -131,12 +131,88 @@ static void drawOpticalFlow(const Mat_<float>& flowx, const Mat_<float>& flowy
    }
 }

+/*
+ROI config file format.
+One entry per line, fields separated by whitespace. Keywords are matched
+case-insensitively; blank lines and lines starting with '#' are ignored.
+    numrois <count>
+    roi<N> <x> <y> <width> <height>
+Example:
+numrois 3
+roi0 640 96 1152 192
+roi1 640 64 896 864
+roi2 640 960 256 32
+*/
+/** @brief Parses a Region of Interest config file in the format described above.
+
+@param ROIFileName Path of the config file to read.
+@param roiData Receives the parsed rectangles, appended in file order. It is modified only
+               when the whole file was parsed successfully, so a caller that continues after
+               a failure never sees a partially filled list.
+@return true if the file could be opened, every line was well formed and the number of roi
+        lines equals the value given by numrois; false otherwise (a message is printed).
+*/
+bool parseROI(std::string ROIFileName, std::vector<Rect>& roiData)
+{
+    // The stream closes itself on every return path.
+    std::ifstream roiFile(ROIFileName, std::ios::in);
+    if (!roiFile.is_open())
+    {
+        std::cout << "Unable to open ROI file " << std::endl;
+        return false;
+    }
+
+    std::vector<Rect> parsedRois; // scratch list, committed to roiData only on success
+    int nRois = 0;
+    std::string line;
+    while (std::getline(roiFile, line))
+    {
+        std::istringstream iss(line);
+        std::string keyword;
+        if (!(iss >> keyword)) continue; // if empty line, continue
+
+        // tolower() must not be given a negative value, hence the unsigned char parameter.
+        std::transform(keyword.begin(), keyword.end(), keyword.begin(),
+            [](unsigned char c) { return static_cast<char>(::tolower(c)); });
+
+        if (keyword == "numrois")
+        {
+            // Extraction fails on a missing or non-numeric count instead of reading past
+            // the end of the token list.
+            if (!(iss >> nRois) || nRois < 0)
+            {
+                std::cout << "Invalid numrois entry in roi config file: " << line << std::endl;
+                return false;
+            }
+        }
+        else if (keyword.rfind("roi", 0) == 0)
+        {
+            cv::Rect roi;
+            if (!(iss >> roi.x >> roi.y >> roi.width >> roi.height))
+            {
+                std::cout << "Invalid roi entry in roi config file: " << line << std::endl;
+                return false;
+            }
+            parsedRois.push_back(roi);
+        }
+        else if (keyword[0] == '#')
+        {
+            continue; // comment line
+        }
+        else
+        {
+            std::cout << "Unidentified keyword in roi config file " << keyword << std::endl;
+            return false;
+        }
+    }
+
+    if (static_cast<size_t>(nRois) != parsedRois.size())
+    {
+        std::cout << "NumRois(" << nRois << ")and specified roi rects (" << parsedRois.size() << ")are not matching " << std::endl;
+        return false;
+    }
+
+    roiData.insert(roiData.end(), parsedRois.begin(), parsedRois.end());
+    return true;
+}
+
 int main(int argc, char **argv)
 {
-    std::unordered_map<std::string, NvidiaOpticalFlow_1_0::NVIDIA_OF_PERF_LEVEL> presetMap = {
-        { "slow", NvidiaOpticalFlow_1_0::NVIDIA_OF_PERF_LEVEL::NV_OF_PERF_LEVEL_SLOW },
-        { "medium", NvidiaOpticalFlow_1_0::NVIDIA_OF_PERF_LEVEL::NV_OF_PERF_LEVEL_MEDIUM },
-        { "fast", NvidiaOpticalFlow_1_0::NVIDIA_OF_PERF_LEVEL::NV_OF_PERF_LEVEL_FAST } };
+    std::unordered_map<std::string, NvidiaOpticalFlow_2_0::NVIDIA_OF_PERF_LEVEL> presetMap = {
+        { "slow", NvidiaOpticalFlow_2_0::NVIDIA_OF_PERF_LEVEL::NV_OF_PERF_LEVEL_SLOW },
+        { "medium", NvidiaOpticalFlow_2_0::NVIDIA_OF_PERF_LEVEL::NV_OF_PERF_LEVEL_MEDIUM },
+        { "fast", NvidiaOpticalFlow_2_0::NVIDIA_OF_PERF_LEVEL::NV_OF_PERF_LEVEL_FAST } };
+
+    std::unordered_map<int, NvidiaOpticalFlow_2_0::NVIDIA_OF_OUTPUT_VECTOR_GRID_SIZE> outputGridSize = {
+        { 1, NvidiaOpticalFlow_2_0::NVIDIA_OF_OUTPUT_VECTOR_GRID_SIZE::NV_OF_OUTPUT_VECTOR_GRID_SIZE_1 },
+        { 2, NvidiaOpticalFlow_2_0::NVIDIA_OF_OUTPUT_VECTOR_GRID_SIZE::NV_OF_OUTPUT_VECTOR_GRID_SIZE_2 },
+        { 4, NvidiaOpticalFlow_2_0::NVIDIA_OF_OUTPUT_VECTOR_GRID_SIZE::NV_OF_OUTPUT_VECTOR_GRID_SIZE_4 } };
+
+    std::unordered_map<int, NvidiaOpticalFlow_2_0::NVIDIA_OF_HINT_VECTOR_GRID_SIZE> hintGridSize = {
+        { 1, NvidiaOpticalFlow_2_0::NVIDIA_OF_HINT_VECTOR_GRID_SIZE::NV_OF_HINT_VECTOR_GRID_SIZE_1 },
+        { 2, NvidiaOpticalFlow_2_0::NVIDIA_OF_HINT_VECTOR_GRID_SIZE::NV_OF_HINT_VECTOR_GRID_SIZE_2 },
+        { 4, NvidiaOpticalFlow_2_0::NVIDIA_OF_HINT_VECTOR_GRID_SIZE::NV_OF_HINT_VECTOR_GRID_SIZE_4 },
+        { 8, NvidiaOpticalFlow_2_0::NVIDIA_OF_HINT_VECTOR_GRID_SIZE::NV_OF_HINT_VECTOR_GRID_SIZE_8 } };

    try
    {
@@ -145,7 +221,10 @@ int main(int argc, char **argv)
            "{ r right  | ../data/basketball2.png | specify right image }"
            "{ g gpuid  | 0 | cuda device index}"
            "{ p preset | slow | perf preset for OF algo [ options : slow, medium, fast ]}"
+            "{ og outputGridSize | 1 | Output grid size of OF vector [ options : 1, 2, 4 ]}"
+            "{ hg hintGridSize | 1 | Hint grid size of OF vector [ options : 1, 2, 4, 8 ]}"
            "{ o output | OpenCVNvOF.flo | output flow vector file in middlebury format}"
+            "{ rc roiConfigFile | | Region of Interest config file }"
            "{ th enableTemporalHints | false | Enable temporal hints}"
            "{ eh enableExternalHints | false | Enable external hints}"
            "{ cb enableCostBuffer | false | Enable output cost buffer}"
@@ -159,60 +238,93 @@ int main(int argc, char **argv)
            return 0;
        }

-        string pathL = cmd.get<string>("left");
-        string pathR = cmd.get<string>("right");
-        string preset = cmd.get<string>("preset");
-        string output = cmd.get<string>("output");
+        std::string pathL = cmd.get<std::string>("left");
+        std::string pathR = cmd.get<std::string>("right");
+        std::string preset = cmd.get<std::string>("preset");
+        std::string output = cmd.get<std::string>("output");
+        std::string roiConfiFile = cmd.get<std::string>("roiConfigFile");
        bool enableExternalHints = cmd.get<bool>("enableExternalHints");
        bool enableTemporalHints = cmd.get<bool>("enableTemporalHints");
        bool enableCostBuffer = cmd.get<bool>("enableCostBuffer");
        int gpuId = cmd.get<int>("gpuid");
+        int outputBufferGridSize = cmd.get<int>("outputGridSize");
+        int hintBufferGridSize = cmd.get<int>("hintGridSize");

-        if (pathL.empty()) cout << "Specify left image path\n";
-        if (pathR.empty()) cout << "Specify right image path\n";
-        if (preset.empty()) cout << "Specify perf preset for OpticalFlow algo\n";
+        if (pathL.empty()) std::cout << "Specify left image path" << std::endl;
+        if (pathR.empty()) std::cout << "Specify right image path" << std::endl;
+        if (preset.empty()) std::cout << "Specify perf preset for OpticalFlow algo" << std::endl;
        if (pathL.empty() || pathR.empty()) return 0;

-        auto search = presetMap.find(preset);
-        if (search == presetMap.end())
+        auto p = presetMap.find(preset);
+        if (p == presetMap.end())
        {
            std::cout << "Invalid preset level : " << preset << std::endl;
            return 0;
        }
-        NvidiaOpticalFlow_1_0::NVIDIA_OF_PERF_LEVEL perfPreset = search->second;
+        NvidiaOpticalFlow_2_0::NVIDIA_OF_PERF_LEVEL perfPreset = p->second;
+
+        auto o = outputGridSize.find(outputBufferGridSize);
+        if (o == outputGridSize.end())
+        {
+            std::cout << "Invalid output grid size: " << outputBufferGridSize << std::endl;
+            return 0;
+        }
+        NvidiaOpticalFlow_2_0::NVIDIA_OF_OUTPUT_VECTOR_GRID_SIZE outBufGridSize = o->second;
+
+        NvidiaOpticalFlow_2_0::NVIDIA_OF_HINT_VECTOR_GRID_SIZE hintBufGridSize =
+            NvidiaOpticalFlow_2_0::NV_OF_HINT_VECTOR_GRID_SIZE_UNDEFINED;
+        if (enableExternalHints)
+        {
+            auto h = hintGridSize.find(hintBufferGridSize);
+            if (h == hintGridSize.end())
+            {
+                std::cout << "Invalid hint grid size: " << hintBufferGridSize << std::endl;
+                return 0;
+            }
+            hintBufGridSize = h->second;
+        }
+
+        std::vector<Rect> roiData;
+
+        if (!roiConfiFile.empty())
+        {
+            if (!parseROI(roiConfiFile, roiData))
+            {
+                std::cout << "Wrong Region of Interest config file, proceeding without ROI" << std::endl;
+            }
+        }

        Mat frameL = imread(pathL, IMREAD_GRAYSCALE);
        Mat frameR = imread(pathR, IMREAD_GRAYSCALE);
-        if (frameL.empty()) cout << "Can't open '" << pathL << "'\n";
-        if (frameR.empty()) cout << "Can't open '" << pathR << "'\n";
+        if (frameL.empty()) std::cout << "Can't open '" << pathL << "'" << std::endl;
+        if (frameR.empty()) std::cout << "Can't open '" << pathR << "'" << std::endl;
        if (frameL.empty() || frameR.empty()) return -1;

-        Ptr<NvidiaOpticalFlow_1_0> nvof = NvidiaOpticalFlow_1_0::create(
-            frameL.size().width, frameL.size().height, perfPreset,
+        Ptr<NvidiaOpticalFlow_2_0> nvof = NvidiaOpticalFlow_2_0::create(
+            frameL.size(), roiData, perfPreset, outBufGridSize, hintBufGridSize,
            enableTemporalHints, enableExternalHints, enableCostBuffer, gpuId);

-        Mat flowx, flowy, flowxy, upsampledFlowXY, image;
+        Mat flowx, flowy, flowxy, floatFlow, image;

        nvof->calc(frameL, frameR, flowxy);

-        nvof->upSampler(flowxy, frameL.size().width, frameL.size().height,
-            nvof->getGridSize(), upsampledFlowXY);
+        nvof->convertToFloat(flowxy, floatFlow);

-        if (output.size() != 0)
+        if (!output.empty())
        {
-            if (!writeOpticalFlow(output, upsampledFlowXY))
-                cout << "Failed to save Flow Vector" << endl;
+            if (!writeOpticalFlow(output, floatFlow))
+                std::cout << "Failed to save Flow Vector" << std::endl;
            else
-                cout << "Flow vector saved as '" << output << "'\n";
+                std::cout << "Flow vector saved as '" << output << "'" << std::endl;
        }

        Mat planes[] = { flowx, flowy };
-        split(upsampledFlowXY, planes);
+        split(floatFlow, planes);
        flowx = planes[0]; flowy = planes[1];

        drawOpticalFlow(flowx, flowy, image, 10);

-        imshow("Colorize image",image);
+        imshow("Colorize image", image);
        waitKey(0);
        nvof->collectGarbage();
    }

--- a/modules/cudaoptflow/samples/optical_flow.cpp
+++ b/modules/cudaoptflow/samples/optical_flow.cpp
@@ -183,8 +183,11 @@ int main(int argc, const char* argv[])
    Ptr<cuda::DensePyrLKOpticalFlow> lk = cuda::DensePyrLKOpticalFlow::create(Size(7, 7));
    Ptr<cuda::FarnebackOpticalFlow> farn = cuda::FarnebackOpticalFlow::create();
    Ptr<cuda::OpticalFlowDual_TVL1> tvl1 = cuda::OpticalFlowDual_TVL1::create();
-    Ptr<cuda::NvidiaOpticalFlow_1_0> nvof = cuda::NvidiaOpticalFlow_1_0::create(frame0.size().width, frame0.size().height,
+    Ptr<cuda::NvidiaOpticalFlow_1_0> nvof_1_0 = cuda::NvidiaOpticalFlow_1_0::create(frame0.size(),
        NvidiaOpticalFlow_1_0::NVIDIA_OF_PERF_LEVEL::NV_OF_PERF_LEVEL_FAST, false, false, false, 0, inputStream, outputStream);
+    Ptr<cuda::NvidiaOpticalFlow_2_0> nvof_2_0 = cuda::NvidiaOpticalFlow_2_0::create(frame0.size(),
+        NvidiaOpticalFlow_2_0::NVIDIA_OF_PERF_LEVEL::NV_OF_PERF_LEVEL_FAST, NvidiaOpticalFlow_2_0::NVIDIA_OF_OUTPUT_VECTOR_GRID_SIZE::NV_OF_OUTPUT_VECTOR_GRID_SIZE_1,
+        NvidiaOpticalFlow_2_0::NVIDIA_OF_HINT_VECTOR_GRID_SIZE::NV_OF_HINT_VECTOR_GRID_SIZE_UNDEFINED, false, false, false, 0, inputStream, outputStream);

    {
        GpuMat d_frame0f;
@@ -242,16 +245,32 @@ int main(int argc, const char* argv[])
        //Hence it is expected to be more than what is displayed in the NVIDIA Optical Flow SDK documentation.
        const int64 start = getTickCount();

-        nvof->calc(d_frame0, d_frame1, d_flowxy);
+        nvof_1_0->calc(d_frame0, d_frame1, d_flowxy);

        const double timeSec = (getTickCount() - start) / getTickFrequency();
-        cout << "NVIDIAOpticalFlow : " << timeSec << " sec" << endl;
+        cout << "NVIDIAOpticalFlow_1_0 : " << timeSec << " sec" << endl;

-        nvof->upSampler(d_flowxy, frame0.size().width, frame0.size().height,
-            nvof->getGridSize(), d_flow);
+        nvof_1_0->upSampler(d_flowxy, frame0.size(), nvof_1_0->getGridSize(), d_flow);

-        showFlow("NVIDIAOpticalFlow", d_flow);
-        nvof->collectGarbage();
+        showFlow("NVIDIAOpticalFlow_1_0", d_flow);
+        nvof_1_0->collectGarbage();
+    }
+
+    {
+        //The timing displayed below includes the time taken to copy the input buffers to the OF CUDA input buffers
+        //and to copy the output buffers from the OF CUDA output buffer to the output buffer.
+        //Hence it is expected to be more than what is displayed in the NVIDIA Optical Flow SDK documentation.
+        const int64 start = getTickCount();
+
+        nvof_2_0->calc(d_frame0, d_frame1, d_flowxy);
+
+        const double timeSec = (getTickCount() - start) / getTickFrequency();
+        cout << "NVIDIAOpticalFlow_2_0 : " << timeSec << " sec" << endl;
+
+        nvof_2_0->convertToFloat(d_flowxy, d_flow);
+
+        showFlow("NVIDIAOpticalFlow_2_0", d_flow);
+        nvof_2_0->collectGarbage();
    }

    imshow("Frame 0", frame0);

--- a/modules/cudaoptflow/src/cuda/nvidiaOpticalFlow.cu
+++ b/modules/cudaoptflow/src/cuda/nvidiaOpticalFlow.cu
+//
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+//M*/
+#if !defined CUDA_DISABLER
+
+#include <cuda_runtime.h>
+#include <stdio.h>
+
+// Local fixed-width integer aliases used by the helpers and the kernel in this file.
+// NOTE(review): these assume 8-bit char, 16-bit short and 32-bit int, and could clash with
+// <stdint.h> on a platform where the standard typedefs resolve to other types -- confirm.
+typedef unsigned char   uint8_t;
+typedef unsigned short  uint16_t;
+typedef unsigned int    uint32_t;
+typedef   signed short  int16_t;
+typedef   signed int    int32_t;
+
+// Thread-block dimensions (threads along x and y) used by FlowUpsample() to launch the kernel.
+#define BLOCKDIM_X 32
+#define BLOCKDIM_Y 16
+
+// Dimensions of the per-block shared-memory tile of source flow vectors.
+// Half the block size in each direction is what 2x upsampling needs; 4x upsampling
+// needs fewer entries and reuses the same tile.
+#define SMEM_COLS  ((BLOCKDIM_X)/2)
+#define SMEM_ROWS  ((BLOCKDIM_Y)/2)
+
+namespace cv { namespace cuda { namespace device { namespace optflow_nvidia
+{
+// Returns the symbolic name of a CUDA runtime error code (thin wrapper over cudaGetErrorName()).
+static const char *_cudaGetErrorEnum(cudaError_t error) { return cudaGetErrorName(error); }
+
+/**
+ * Reports a failed CUDA runtime call on stderr and terminates the process.
+ *
+ * @param result  status returned by the CUDA call; any non-zero value is treated as failure.
+ * @param func    source text of the checked expression (supplied by checkCudaErrors()).
+ * @param file    source file of the call site.
+ * @param line    source line of the call site.
+ *
+ * Does nothing when result is zero. On failure the function does not return: it calls
+ * exit(EXIT_FAILURE).
+ */
+template <typename T>
+void check(T result, char const *const func, const char *const file,
+    int const line) {
+    if (result) {
+        // "%d" expects an int, so the code is converted to int rather than to an unsigned type.
+        fprintf(stderr, "CUDA error at %s:%d code=%d(%s) \"%s\" \n", file, line,
+            static_cast<int>(result), _cudaGetErrorEnum(result), func);
+        // The process exits right away; no CUDA clean-up (e.g. cudaDeviceReset()) is performed here.
+        exit(EXIT_FAILURE);
+    }
+}
+// Wraps a CUDA runtime call so that a failure is reported together with its call site.
+#define checkCudaErrors(val) check((val), #val, __FILE__, __LINE__)
+
+/**
+ * Copies one element of type T from a pitched device buffer into a shared tile.
+ *
+ * @param devptr     base address of the source buffer.
+ * @param x0         column of the element to read, in elements of T.
+ * @param y0         row of the element to read.
+ * @param src_w      source width; not used by this helper, kept for interface compatibility.
+ * @param src_h      source height; not used by this helper, kept for interface compatibility.
+ * @param src_pitch  distance between consecutive source rows, in bytes.
+ * @param src        destination tile; the element is stored at src[j][i].
+ * @param i          column inside the tile.
+ * @param j          row inside the tile.
+ *
+ * No bounds checking is done here: the caller must pass coordinates that lie inside the buffer.
+ */
+template <class T>
+static __device__ void ReadDevPtrData(void* devptr, uint32_t x0, uint32_t y0, uint32_t src_w, uint32_t src_h, uint32_t src_pitch,
+                                      T src[][SMEM_COLS], uint32_t i, uint32_t j)
+{
+    // Byte offsets are computed in size_t so that large buffers do not wrap around 32 bits,
+    // and the column offset is derived from sizeof(T) so it is right for any element size.
+    const uint8_t* row = (const uint8_t*)devptr + static_cast<size_t>(y0) * src_pitch;
+    src[j][i] = *(const T*)(row + static_cast<size_t>(x0) * sizeof(T));
+}
+
+
+/**
+ * Nearest-neighbour upsampling of a field of short2 flow vectors.
+ *
+ * Each thread handles one destination element (x, y). The thread whose x and y are both
+ * multiples of nScaleFactor loads the matching source vector (x / nScaleFactor,
+ * y / nScaleFactor) into a shared tile; after the barrier every thread of that
+ * nScaleFactor x nScaleFactor cell writes the shared value to the destination.
+ *
+ * @param srcSurf       not used by this kernel.
+ * @param srcDevPtr     source buffer of short2 vectors.
+ * @param src_w         source width in vectors.
+ * @param src_pitch     source row pitch in bytes.
+ * @param src_h         source height in rows.
+ * @param dstSurf       destination surface, written only when dstDevPtr is NULL.
+ * @param dstDevPtr     destination buffer of short2 vectors, or NULL to write to dstSurf.
+ * @param dst_w         destination width in vectors.
+ * @param dst_pitch     destination row pitch in bytes.
+ * @param dst_h         destination height in rows.
+ * @param nScaleFactor  upsampling factor. With a BLOCKDIM_X x BLOCKDIM_Y thread block the tile
+ *                      indices stay inside SMEM_ROWS x SMEM_COLS only for factors >= 2; the
+ *                      factor is also expected to divide both block dimensions (2 or 4).
+ */
+extern "C"
+__global__ void NearestNeighborFlowKernel(cudaSurfaceObject_t srcSurf, void* srcDevPtr, uint32_t src_w, uint32_t src_pitch, uint32_t src_h,
+                                          cudaSurfaceObject_t dstSurf, void* dstDevPtr, uint32_t dst_w, uint32_t dst_pitch, uint32_t dst_h,
+                                          uint32_t nScaleFactor)
+{
+    const uint32_t x = blockDim.x * blockIdx.x + threadIdx.x;
+    const uint32_t y = blockDim.y * blockIdx.y + threadIdx.y;
+
+    __shared__ short2 src[SMEM_ROWS][SMEM_COLS];
+
+    // Position of this thread's source vector inside the shared tile.
+    const uint32_t i = threadIdx.x / nScaleFactor;
+    const uint32_t j = threadIdx.y / nScaleFactor;
+
+    // Uniform across the launch: with an empty source there is nothing to read or replicate.
+    const bool hasSource = (src_w > 0) && (src_h > 0);
+
+    if (hasSource && (x % nScaleFactor == 0) && (y % nScaleFactor == 0))
+    {
+        // The launch grid is rounded up to whole blocks, so threads of edge blocks can map
+        // beyond the source extent. Clamp to the last valid row/column instead of reading
+        // outside the source buffer.
+        uint32_t x0 = x / nScaleFactor;
+        uint32_t y0 = y / nScaleFactor;
+        if (x0 >= src_w)
+            x0 = src_w - 1;
+        if (y0 >= src_h)
+            y0 = src_h - 1;
+
+        ReadDevPtrData<short2>(srcDevPtr, x0, y0, src_w, src_h, src_pitch, src, i, j);
+    }
+    // All threads reach the barrier; the tile is complete afterwards.
+    __syncthreads();
+
+    if (hasSource && x < dst_w && y < dst_h)
+    {
+        const short2 value = src[j][i];
+        if (dstDevPtr == NULL)
+        {
+            surf2Dwrite<short2>(value, dstSurf, x * sizeof(short2), y, cudaBoundaryModeClamp);
+        }
+        else
+        {
+            *(short2*)((uint8_t*)dstDevPtr + static_cast<size_t>(y) * dst_pitch + static_cast<size_t>(x) * sizeof(short2)) = value;
+        }
+    }
+}
+
+/**
+ * Launches NearestNeighborFlowKernel to upsample a flow field held in device memory.
+ *
+ * @param srcDevPtr     source device buffer.
+ * @param nSrcWidth     source width.
+ * @param nSrcPitch     source row pitch in bytes.
+ * @param nSrcHeight    source height.
+ * @param dstDevPtr     destination device buffer.
+ * @param nDstWidth     destination width.
+ * @param nDstPitch     destination row pitch in bytes.
+ * @param nDstHeight    destination height.
+ * @param nScaleFactor  upsampling factor forwarded to the kernel; must be non-zero.
+ *
+ * The kernel is launched with BLOCKDIM_X x BLOCKDIM_Y threads per block and enough blocks to
+ * cover the destination. Both surface arguments of the kernel are passed as 0, so the
+ * device-pointer path is used. A launch error terminates the process via checkCudaErrors().
+ */
+void FlowUpsample(void* srcDevPtr, uint32_t nSrcWidth, uint32_t nSrcPitch, uint32_t nSrcHeight,
+                  void* dstDevPtr, uint32_t nDstWidth, uint32_t nDstPitch, uint32_t nDstHeight,
+                  uint32_t nScaleFactor)
+{
+    // An empty destination would give a zero-sized grid, which is an invalid launch
+    // configuration; there is nothing to write in that case.
+    if (nDstWidth == 0 || nDstHeight == 0)
+        return;
+
+    // Round the grid up so that partially covered blocks at the right/bottom edge are included.
+    const dim3 block(BLOCKDIM_X, BLOCKDIM_Y);
+    const dim3 grid((nDstWidth + block.x - 1) / block.x, (nDstHeight + block.y - 1) / block.y);
+
+    NearestNeighborFlowKernel<<<grid, block>>>(0, srcDevPtr, nSrcWidth, nSrcPitch, nSrcHeight,
+        0, dstDevPtr, nDstWidth, nDstPitch, nDstHeight,
+        nScaleFactor);
+
+    // Picks up launch-time errors of the kernel above.
+    checkCudaErrors(cudaGetLastError());
+}}}}}
+
+#endif
\ No newline at end of file
--- a/modules/cudaoptflow/src/nvidiaOpticalFlow.cpp
+++ b/modules/cudaoptflow/src/nvidiaOpticalFlow.cpp
--- a/modules/cudaoptflow/test/test_optflow.cpp
+++ b/modules/cudaoptflow/test/test_optflow.cpp
@@ -495,19 +495,11 @@ CUDA_TEST_P(NvidiaOpticalFlow_1_0, Regression)
    cv::Mat frame1 = readImage("opticalflow/frame1.png", cv::IMREAD_GRAYSCALE);
    ASSERT_FALSE(frame1.empty());

-    const int width = frame0.size().width;
-    const int height = frame0.size().height;
-    const bool enableTemporalHints = false;
-    const bool enableExternalHints = false;
-    const bool enableCostBuffer = false;
-    const int gpuid = 0;
-
    cv::Ptr<cv::cuda::NvidiaOpticalFlow_1_0> d_nvof;
    try
    {
-        d_nvof = cv::cuda::NvidiaOpticalFlow_1_0::create(width, height,
-            cv::cuda::NvidiaOpticalFlow_1_0::NVIDIA_OF_PERF_LEVEL::NV_OF_PERF_LEVEL_SLOW,
-            enableTemporalHints, enableExternalHints, enableCostBuffer, gpuid);
+        d_nvof = cv::cuda::NvidiaOpticalFlow_1_0::create(frame0.size(),
+            cv::cuda::NvidiaOpticalFlow_1_0::NVIDIA_OF_PERF_LEVEL::NV_OF_PERF_LEVEL_SLOW);
    }
    catch (const cv::Exception& e)
    {
@@ -519,7 +511,7 @@ CUDA_TEST_P(NvidiaOpticalFlow_1_0, Regression)

    Mat flow, upsampledFlow;
    d_nvof->calc(loadMat(frame0), loadMat(frame1), flow);
-    d_nvof->upSampler(flow, width, height, gridSize, upsampledFlow);
+    d_nvof->upSampler(flow, frame0.size(), gridSize, upsampledFlow);

    std::string fname(cvtest::TS::ptr()->get_data_path());
    fname += "opticalflow/nvofGolden.flo";
@@ -527,6 +519,7 @@ CUDA_TEST_P(NvidiaOpticalFlow_1_0, Regression)
    ASSERT_FALSE(golden.empty());

    EXPECT_MAT_SIMILAR(golden, upsampledFlow, 1e-10);
+    d_nvof->collectGarbage();
 }

 CUDA_TEST_P(NvidiaOpticalFlow_1_0, OpticalFlowNan)
@@ -539,19 +532,11 @@ CUDA_TEST_P(NvidiaOpticalFlow_1_0, OpticalFlowNan)

    cv::Mat r_frame0, r_frame1;

-    const int width = frame0.size().width;
-    const int height = frame0.size().height;
-    const bool enableTemporalHints = false;
-    const bool enableExternalHints = false;
-    const bool enableCostBuffer = false;
-    const int gpuid = 0;
-
    cv::Ptr<cv::cuda::NvidiaOpticalFlow_1_0> d_nvof;
    try
    {
-        d_nvof = cv::cuda::NvidiaOpticalFlow_1_0::create(width, height,
-            cv::cuda::NvidiaOpticalFlow_1_0::NVIDIA_OF_PERF_LEVEL::NV_OF_PERF_LEVEL_SLOW,
-            enableTemporalHints, enableExternalHints, enableCostBuffer, gpuid);
+        d_nvof = cv::cuda::NvidiaOpticalFlow_1_0::create(frame0.size(),
+            cv::cuda::NvidiaOpticalFlow_1_0::NVIDIA_OF_PERF_LEVEL::NV_OF_PERF_LEVEL_SLOW);
    }
    catch (const cv::Exception& e)
    {
@@ -569,9 +554,96 @@ CUDA_TEST_P(NvidiaOpticalFlow_1_0, OpticalFlowNan)

    EXPECT_TRUE(cv::checkRange(flowx));
    EXPECT_TRUE(cv::checkRange(flowy));
+    d_nvof->collectGarbage();
 };

 INSTANTIATE_TEST_CASE_P(CUDA_OptFlow, NvidiaOpticalFlow_1_0, ALL_DEVICES);

+//////////////////////////////////////////////////////
+// NvidiaOpticalFlow_2_0
+
+// Test fixture for the NvidiaOpticalFlow_2_0 cases, parameterised on a cv::cuda::DeviceInfo.
+struct NvidiaOpticalFlow_2_0 : testing::TestWithParam<cv::cuda::DeviceInfo>
+{
+    cv::cuda::DeviceInfo devInfo;
+
+    // Stores the test parameter in devInfo and passes its id to cv::cuda::setDevice().
+    virtual void SetUp()
+    {
+        const cv::cuda::DeviceInfo& selected = GetParam();
+        devInfo = selected;
+
+        cv::cuda::setDevice(devInfo.deviceID());
+    }
+};
+
+// Computes the flow between the two reference frames, converts it to float and compares it
+// with the stored golden flow file.
+CUDA_TEST_P(NvidiaOpticalFlow_2_0, Regression)
+{
+    typedef cv::cuda::NvidiaOpticalFlow_2_0 NvOF;
+
+    cv::Mat frame0 = readImage("opticalflow/frame0.png", cv::IMREAD_GRAYSCALE);
+    ASSERT_FALSE(frame0.empty());
+
+    cv::Mat frame1 = readImage("opticalflow/frame1.png", cv::IMREAD_GRAYSCALE);
+    ASSERT_FALSE(frame1.empty());
+
+    cv::Ptr<NvOF> d_nvof;
+    try
+    {
+        d_nvof = NvOF::create(frame0.size(), NvOF::NVIDIA_OF_PERF_LEVEL::NV_OF_PERF_LEVEL_SLOW);
+    }
+    catch (const cv::Exception& e)
+    {
+        // These error codes are taken to mean the configuration is unsupported: skip, don't fail.
+        switch (e.code)
+        {
+        case Error::StsBadFunc:
+        case Error::StsBadArg:
+        case Error::StsNullPtr:
+            throw SkipTestException("Current configuration is not supported");
+        default:
+            throw;
+        }
+    }
+
+    Mat rawFlow, floatFlow;
+    d_nvof->calc(loadMat(frame0), loadMat(frame1), rawFlow);
+    d_nvof->convertToFloat(rawFlow, floatFlow);
+
+    const std::string goldenPath =
+        std::string(cvtest::TS::ptr()->get_data_path()) + "opticalflow/nvofGolden_2.flo";
+    cv::Mat golden = cv::readOpticalFlow(goldenPath.c_str());
+    ASSERT_FALSE(golden.empty());
+
+    EXPECT_MAT_SIMILAR(golden, floatFlow, 1e-10);
+    d_nvof->collectGarbage();
+}
+
+// Checks that the flow computed for the rubberwhale pair contains only finite values.
+CUDA_TEST_P(NvidiaOpticalFlow_2_0, OpticalFlowNan)
+{
+    cv::Mat frame0 = readImage("opticalflow/rubberwhale1.png", cv::IMREAD_GRAYSCALE);
+    ASSERT_FALSE(frame0.empty());
+
+    cv::Mat frame1 = readImage("opticalflow/rubberwhale2.png", cv::IMREAD_GRAYSCALE);
+    ASSERT_FALSE(frame1.empty());
+
+    cv::Ptr<cv::cuda::NvidiaOpticalFlow_2_0> d_nvof;
+    try
+    {
+        d_nvof = cv::cuda::NvidiaOpticalFlow_2_0::create(frame0.size(),
+            cv::cuda::NvidiaOpticalFlow_2_0::NVIDIA_OF_PERF_LEVEL::NV_OF_PERF_LEVEL_SLOW);
+    }
+    catch (const cv::Exception& e)
+    {
+        // These error codes are taken to mean the configuration is unsupported: skip, don't fail.
+        if (e.code == Error::StsBadFunc || e.code == Error::StsBadArg || e.code == Error::StsNullPtr)
+            throw SkipTestException("Current configuration is not supported");
+        throw;
+    }
+
+    Mat flow, floatFlow;
+    d_nvof->calc(loadMat(frame0), loadMat(frame1), flow);
+
+    // The range check is only meaningful on floating-point data, so the output of calc() is
+    // converted with convertToFloat() first, exactly as the Regression test does.
+    d_nvof->convertToFloat(flow, floatFlow);
+
+    Mat planes[2];
+    split(floatFlow, planes);
+
+    EXPECT_TRUE(cv::checkRange(planes[0]));
+    EXPECT_TRUE(cv::checkRange(planes[1]));
+    d_nvof->collectGarbage();
+}
+
+// Instantiates the NvidiaOpticalFlow_2_0 tests under the CUDA_OptFlow prefix for each entry of ALL_DEVICES.
+INSTANTIATE_TEST_CASE_P(CUDA_OptFlow, NvidiaOpticalFlow_2_0, ALL_DEVICES);
+
 }} // namespace
 #endif // HAVE_CUDA