some optimizations to ocl::pyrDown, PyrLK and Canny

09359982 · bitwangyaoyao · 494ae156 · 09359982 · 09359982 · 09359982
9 changed files
--- a/modules/ocl/src/canny.cpp
+++ b/modules/ocl/src/canny.cpp
@@ -45,6 +45,7 @@

 #include <iomanip>
 #include "precomp.hpp"
+#include "mcwutil.hpp"

 using namespace cv;
 using namespace cv::ocl;
@@ -237,7 +238,7 @@ void canny::calcSobelRowPass_gpu(const oclMat& src, oclMat& dx_buf, oclMat& dy_b

    size_t globalThreads[3] = {cols, rows, 1};
    size_t localThreads[3]  = {16, 16, 1};
-    openCLExecuteKernel(clCxt, &imgproc_canny, kernelName, globalThreads, localThreads, args, -1, -1);
+    openCLExecuteKernel2(clCxt, &imgproc_canny, kernelName, globalThreads, localThreads, args, -1, -1);
 }

 void canny::calcMagnitude_gpu(const oclMat& dx_buf, const oclMat& dy_buf, oclMat& dx, oclMat& dy, oclMat& mag, int rows, int cols, bool L2Grad)
@@ -272,7 +273,7 @@ void canny::calcMagnitude_gpu(const oclMat& dx_buf, const oclMat& dy_buf, oclMat
    {
        strcat(build_options, "-D L2GRAD");
    }
-    openCLExecuteKernel(clCxt, &imgproc_canny, kernelName, globalThreads, localThreads, args, -1, -1, build_options);
+    openCLExecuteKernel2(clCxt, &imgproc_canny, kernelName, globalThreads, localThreads, args, -1, -1, build_options);
 }
 void canny::calcMagnitude_gpu(const oclMat& dx, const oclMat& dy, oclMat& mag, int rows, int cols, bool L2Grad)
 {
@@ -300,7 +301,7 @@ void canny::calcMagnitude_gpu(const oclMat& dx, const oclMat& dy, oclMat& mag, i
    {
        strcat(build_options, "-D L2GRAD");
    }
-    openCLExecuteKernel(clCxt, &imgproc_canny, kernelName, globalThreads, localThreads, args, -1, -1, build_options);
+    openCLExecuteKernel2(clCxt, &imgproc_canny, kernelName, globalThreads, localThreads, args, -1, -1, build_options);
 }

 void canny::calcMap_gpu(oclMat& dx, oclMat& dy, oclMat& mag, oclMat& map, int rows, int cols, float low_thresh, float high_thresh)
@@ -331,7 +332,7 @@ void canny::calcMap_gpu(oclMat& dx, oclMat& dy, oclMat& mag, oclMat& map, int ro
    string kernelName = "calcMap";
    size_t localThreads[3]  = {16, 16, 1};

-    openCLExecuteKernel(clCxt, &imgproc_canny, kernelName, globalThreads, localThreads, args, -1, -1);
+    openCLExecuteKernel2(clCxt, &imgproc_canny, kernelName, globalThreads, localThreads, args, -1, -1);
 }

 void canny::edgesHysteresisLocal_gpu(oclMat& map, oclMat& st1, void * counter, int rows, int cols)
@@ -351,7 +352,7 @@ void canny::edgesHysteresisLocal_gpu(oclMat& map, oclMat& st1, void * counter, i
    size_t globalThreads[3] = {cols, rows, 1};
    size_t localThreads[3]  = {16, 16, 1};

-    openCLExecuteKernel(clCxt, &imgproc_canny, kernelName, globalThreads, localThreads, args, -1, -1);
+    openCLExecuteKernel2(clCxt, &imgproc_canny, kernelName, globalThreads, localThreads, args, -1, -1);
 }

 void canny::edgesHysteresisGlobal_gpu(oclMat& map, oclMat& st1, oclMat& st2, void * counter, int rows, int cols)
@@ -381,7 +382,7 @@ void canny::edgesHysteresisGlobal_gpu(oclMat& map, oclMat& st1, oclMat& st2, voi
        args.push_back( make_pair( sizeof(cl_int), (void *)&map.step));
        args.push_back( make_pair( sizeof(cl_int), (void *)&map.offset));

-        openCLExecuteKernel(clCxt, &imgproc_canny, kernelName, globalThreads, localThreads, args, -1, -1);
+        openCLExecuteKernel2(clCxt, &imgproc_canny, kernelName, globalThreads, localThreads, args, -1, -1, DISABLE);
        openCLSafeCall(clEnqueueReadBuffer(Context::getContext()->impl->clCmdQueue, (cl_mem)counter, 1, 0, sizeof(int), &count, 0, NULL, NULL));
        std::swap(st1, st2);
    }
@@ -406,7 +407,7 @@ void canny::getEdges_gpu(oclMat& map, oclMat& dst, int rows, int cols)
    size_t globalThreads[3] = {cols, rows, 1};
    size_t localThreads[3]  = {16, 16, 1};

-    openCLExecuteKernel(clCxt, &imgproc_canny, kernelName, globalThreads, localThreads, args, -1, -1);
+    openCLExecuteKernel2(clCxt, &imgproc_canny, kernelName, globalThreads, localThreads, args, -1, -1);
 }

 #endif // HAVE_OPENCL
--- a/modules/ocl/src/hog.cpp
+++ b/modules/ocl/src/hog.cpp
@@ -44,7 +44,7 @@
 //M*/

 #include "precomp.hpp"
-
+#include "mcwutil.hpp"
 using namespace cv;
 using namespace cv::ocl;
 using namespace std;
@@ -1613,7 +1613,7 @@ void cv::ocl::device::hog::compute_hists(int nbins, int block_stride_x, int bloc
    args.push_back( make_pair( sizeof(cl_mem), (void *)&block_hists.data));
    args.push_back( make_pair( smem, (void *)NULL));

-    openCLExecuteKernel(clCxt, &objdetect_hog, kernelName, globalThreads, localThreads, args, -1, -1);
+    openCLExecuteKernel2(clCxt, &objdetect_hog, kernelName, globalThreads, localThreads, args, -1, -1);
 }

 void cv::ocl::device::hog::normalize_hists(int nbins, int block_stride_x, int block_stride_y,
@@ -1641,7 +1641,7 @@ void cv::ocl::device::hog::normalize_hists(int nbins, int block_stride_x, int bl
    args.push_back( make_pair( sizeof(cl_float), (void *)&threshold));
    args.push_back( make_pair( nthreads * sizeof(float), (void *)NULL));

-    openCLExecuteKernel(clCxt, &objdetect_hog, kernelName, globalThreads, localThreads, args, -1, -1);
+    openCLExecuteKernel2(clCxt, &objdetect_hog, kernelName, globalThreads, localThreads, args, -1, -1);
 }

 void cv::ocl::device::hog::classify_hists(int win_height, int win_width, int block_stride_y,
@@ -1675,7 +1675,7 @@ void cv::ocl::device::hog::classify_hists(int win_height, int win_width, int blo
    args.push_back( make_pair( sizeof(cl_float), (void *)&threshold));
    args.push_back( make_pair( sizeof(cl_mem), (void *)&labels.data));

-    openCLExecuteKernel(clCxt, &objdetect_hog, kernelName, globalThreads, localThreads, args, -1, -1);
+    openCLExecuteKernel2(clCxt, &objdetect_hog, kernelName, globalThreads, localThreads, args, -1, -1);
 }

 void cv::ocl::device::hog::extract_descrs_by_rows(int win_height, int win_width, int block_stride_y, int block_stride_x,
@@ -1706,7 +1706,7 @@ void cv::ocl::device::hog::extract_descrs_by_rows(int win_height, int win_width,
    args.push_back( make_pair( sizeof(cl_mem), (void *)&block_hists.data));
    args.push_back( make_pair( sizeof(cl_mem), (void *)&descriptors.data));

-    openCLExecuteKernel(clCxt, &objdetect_hog, kernelName, globalThreads, localThreads, args, -1, -1);
+    openCLExecuteKernel2(clCxt, &objdetect_hog, kernelName, globalThreads, localThreads, args, -1, -1);
 }

 void cv::ocl::device::hog::extract_descrs_by_cols(int win_height, int win_width, int block_stride_y, int block_stride_x,
@@ -1738,7 +1738,7 @@ void cv::ocl::device::hog::extract_descrs_by_cols(int win_height, int win_width,
    args.push_back( make_pair( sizeof(cl_mem), (void *)&block_hists.data));
    args.push_back( make_pair( sizeof(cl_mem), (void *)&descriptors.data));

-    openCLExecuteKernel(clCxt, &objdetect_hog, kernelName, globalThreads, localThreads, args, -1, -1);
+    openCLExecuteKernel2(clCxt, &objdetect_hog, kernelName, globalThreads, localThreads, args, -1, -1);
 }

 static inline int divUp(int total, int grain)
@@ -1772,7 +1772,7 @@ void cv::ocl::device::hog::compute_gradients_8UC1(int height, int width, const c
    args.push_back( make_pair( sizeof(cl_char), (void *)&correctGamma));
    args.push_back( make_pair( sizeof(cl_int), (void *)&cnbins));

-    openCLExecuteKernel(clCxt, &objdetect_hog, kernelName, globalThreads, localThreads, args, -1, -1);
+    openCLExecuteKernel2(clCxt, &objdetect_hog, kernelName, globalThreads, localThreads, args, -1, -1);
 }

 void cv::ocl::device::hog::compute_gradients_8UC4(int height, int width, const cv::ocl::oclMat& img,
@@ -1802,7 +1802,7 @@ void cv::ocl::device::hog::compute_gradients_8UC4(int height, int width, const c
    args.push_back( make_pair( sizeof(cl_char), (void *)&correctGamma));
    args.push_back( make_pair( sizeof(cl_int), (void *)&cnbins));

-    openCLExecuteKernel(clCxt, &objdetect_hog, kernelName, globalThreads, localThreads, args, -1, -1);
+    openCLExecuteKernel2(clCxt, &objdetect_hog, kernelName, globalThreads, localThreads, args, -1, -1);
 }

 void cv::ocl::device::hog::resize( const oclMat &src, oclMat &dst, const Size sz)
@@ -1834,7 +1834,7 @@ void cv::ocl::device::hog::resize( const oclMat &src, oclMat &dst, const Size sz
    args.push_back( make_pair(sizeof(cl_float), (void *)&ifx));
    args.push_back( make_pair(sizeof(cl_float), (void *)&ify));

-    openCLExecuteKernel(clCxt, &objdetect_hog, kernelName, globalThreads, localThreads, args, -1, -1);
+    openCLExecuteKernel2(clCxt, &objdetect_hog, kernelName, globalThreads, localThreads, args, -1, -1);
 }

 #endif
--- a/modules/ocl/src/kernels/pyr_down.cl
+++ b/modules/ocl/src/kernels/pyr_down.cl
--- a/modules/ocl/src/kernels/pyrlk.cl
+++ b/modules/ocl/src/kernels/pyrlk.cl
@@ -45,6 +45,25 @@

 //#pragma OPENCL EXTENSION cl_amd_printf : enable

+// dst(y, x) = src1(y, x) * scalar for a float matrix; steps and offsets are in bytes.
+__kernel void arithm_muls_D5 (__global float *src1, int src1_step, int src1_offset,
+                             __global float *dst,  int dst_step,  int dst_offset,
+                             int rows, int cols, int dst_step1, float scalar)
+{
+    const int col = get_global_id(0);
+    const int row = get_global_id(1);
+
+    // Work-items that fall outside the matrix have nothing to do.
+    if (col >= cols || row >= rows)
+        return;
+
+    // Byte addressing: one float is 4 bytes, hence the shift by 2.
+    const int colBytes = col << 2;
+    __global const char *in  = (__global const char *)src1 + mad24(row, src1_step, colBytes + src1_offset);
+    __global char       *out = (__global char *)dst + mad24(row, dst_step, colBytes + dst_offset);
+    *((__global float *)out) = *((__global const float *)in) * scalar;
+}
+

 __kernel void calcSharrDeriv_vertical_C1_D0(__global const uchar* src, int srcStep, int rows, int cols, int cn, __global short* dx_buf, int dx_bufStep, __global short* dy_buf, int dy_bufStep)
 {

--- a/modules/ocl/src/mcwutil.cpp
+++ b/modules/ocl/src/mcwutil.cpp
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Peng Xiao, pengxiao@multicorewareinc.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "mcwutil.hpp"
+
+#if defined (HAVE_OPENCL)
+
+using namespace std;
+
+
+
+namespace cv
+{
+    namespace ocl
+    {
+
+        // Number of `grain`-sized chunks needed to cover `total`, i.e. the quotient
+        // total / grain rounded up (for non-negative total and positive grain).
+        inline int divUp(int total, int grain)
+        {
+            const int padded = total + grain - 1;
+            return padded / grain;
+        }
+
+        // Enqueues a single kernel and lets the caller choose how the command queue is
+        // synchronized afterwards, so more work can be queued after a task is fired.
+        //
+        //   clCxt         context whose impl->clCmdQueue the kernel is enqueued on
+        //   source        program source passed to openCLGetKernelFromSource
+        //   kernelName    base name; "_C<channels>" / "_D<depth>" are appended for values != -1
+        //   globalThreads global work size; when localThreads is not NULL it is rounded up
+        //                 IN PLACE to a multiple of localThreads
+        //   localThreads  work-group size, may be NULL
+        //   args          (size, value pointer) pairs, bound in order with clSetKernelArg
+        //   build_options passed to openCLGetKernelFromSource
+        //   finish_mode   CLFINISH -> clFinish, CLFLUSH -> clFlush, DISABLE -> neither
+        void openCLExecuteKernel_2(Context *clCxt , const char **source, string kernelName, size_t globalThreads[3],
+            size_t localThreads[3],  vector< pair<size_t, const void *> > &args, int channels,
+            int depth, char *build_options, FLUSH_MODE finish_mode)
+        {
+            //construct kernel name
+            //The rule is functionName_Cn_Dn, C represents Channels, D represents DataType Depth, n is an integer number
+            //for example split_C2_D2 is the split kernel with channels = 2 and dataType Depth = 2
+            stringstream idxStr;
+            if(channels != -1)
+                idxStr << "_C" << channels;
+            if(depth != -1)
+                idxStr << "_D" << depth;
+            kernelName += idxStr.str();
+
+            cl_kernel kernel = openCLGetKernelFromSource(clCxt, source, kernelName, build_options);
+
+            if (localThreads != NULL)
+            {
+                // Round every dimension up in size_t arithmetic; going through an int
+                // helper would truncate large work sizes.
+                for (size_t dim = 0; dim < 3; ++dim)
+                {
+                    const size_t groups = (globalThreads[dim] + localThreads[dim] - 1) / localThreads[dim];
+                    globalThreads[dim] = groups * localThreads[dim];
+                }
+
+                size_t blockSize = localThreads[0] * localThreads[1] * localThreads[2];
+                cv::ocl::openCLVerifyKernel(clCxt, kernel, &blockSize, globalThreads, localThreads);
+            }
+
+            for (size_t i = 0; i < args.size(); ++i)
+                openCLSafeCall(clSetKernelArg(kernel, static_cast<cl_uint>(i), args[i].first, args[i].second));
+
+            openCLSafeCall(clEnqueueNDRangeKernel(clCxt->impl->clCmdQueue, kernel, 3, NULL, globalThreads,
+                localThreads, 0, NULL, NULL));
+
+            switch(finish_mode)
+            {
+            case CLFINISH:
+                // clFinish already waits for everything queued so far, so no extra
+                // flush is issued afterwards.
+                clFinish(clCxt->impl->clCmdQueue);
+                break;
+            case CLFLUSH:
+                clFlush(clCxt->impl->clCmdQueue);
+                break;
+            case DISABLE:
+            default:
+                break;
+            }
+            openCLSafeCall(clReleaseKernel(kernel));
+        }
+
+        // Overload without build options: delegates to the build-options overload,
+        // passing a NULL options string.
+        void openCLExecuteKernel2(Context *clCxt , const char **source, string kernelName,
+            size_t globalThreads[3], size_t localThreads[3],
+            vector< pair<size_t, const void *> > &args, int channels, int depth, FLUSH_MODE finish_mode)
+        {
+            char *noBuildOptions = NULL;
+            openCLExecuteKernel2(clCxt, source, kernelName,
+                                 globalThreads, localThreads, args,
+                                 channels, depth, noBuildOptions, finish_mode);
+        }
+        // Overload with explicit build options: forwards every argument unchanged to
+        // openCLExecuteKernel_2.
+        void openCLExecuteKernel2(Context *clCxt , const char **source, string kernelName,
+            size_t globalThreads[3], size_t localThreads[3],
+            vector< pair<size_t, const void *> > &args, int channels, int depth,
+            char *build_options, FLUSH_MODE finish_mode)
+        {
+            openCLExecuteKernel_2(clCxt, source, kernelName,
+                                  globalThreads, localThreads, args,
+                                  channels, depth, build_options, finish_mode);
+        }
+    }//namespace ocl
+
+}//namespace cv
+#endif
\ No newline at end of file
--- a/modules/ocl/src/mcwutil.hpp
+++ b/modules/ocl/src/mcwutil.hpp
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
+// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// @Authors
+//    Peng Xiao, pengxiao@multicorewareinc.com
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors as is and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef _OPENCV_MCWUTIL_
+#define _OPENCV_MCWUTIL_
+
+#include "precomp.hpp"
+
+#if defined (HAVE_OPENCL)
+
+using namespace std;
+
+namespace cv
+{
+    namespace ocl
+    {
+        // Selects what openCLExecuteKernel2 does with the command queue once the
+        // kernel has been enqueued.
+        enum FLUSH_MODE
+        {
+            CLFINISH = 0,   // clFinish is called on the command queue
+            CLFLUSH  = 1,   // clFlush is called on the command queue
+            DISABLE  = 2    // neither call is made
+        };
+        // Enqueues kernel `kernelName` (suffixed with "_C<channels>" / "_D<depth>" for values
+        // other than -1) on the command queue of `clCxt`, without build options.
+        // `finish_mode` selects the queue synchronization afterwards and defaults to DISABLE.
+        void openCLExecuteKernel2(Context *clCxt , const char **source, string kernelName, size_t globalThreads[3],
+            size_t localThreads[3],  vector< pair<size_t, const void *> > &args, int channels, int depth, FLUSH_MODE finish_mode = DISABLE);
+        // Same as the overload above, but `build_options` is passed along when the kernel
+        // is obtained from `source`.
+        void openCLExecuteKernel2(Context *clCxt , const char **source, string kernelName, size_t globalThreads[3],
+            size_t localThreads[3],  vector< pair<size_t, const void *> > &args, int channels,
+            int depth, char *build_options, FLUSH_MODE finish_mode = DISABLE);
+    }//namespace ocl
+
+}//namespace cv
+#endif // HAVE_OPENCL
+#endif //_OPENCV_MCWUTIL_
--- a/modules/ocl/src/pyrdown.cpp
+++ b/modules/ocl/src/pyrdown.cpp
@@ -66,7 +66,6 @@ namespace cv
 //////////////////////////////////////////////////////////////////////////////
 /////////////////////// add subtract multiply divide /////////////////////////
 //////////////////////////////////////////////////////////////////////////////
-template<typename T>
 void pyrdown_run(const oclMat &src, const oclMat &dst)
 {

@@ -95,52 +94,14 @@ void pyrdown_run(const oclMat &src, const oclMat &dst)
    vector<pair<size_t , const void *> > args;
    args.push_back( make_pair( sizeof(cl_mem), (void *)&src.data ));
    args.push_back( make_pair( sizeof(cl_int), (void *)&src.step ));
-    args.push_back( make_pair( sizeof(cl_int), (void *)&src.offset ));
    args.push_back( make_pair( sizeof(cl_int), (void *)&src.rows));
    args.push_back( make_pair( sizeof(cl_int), (void *)&src.cols));
    args.push_back( make_pair( sizeof(cl_mem), (void *)&dst.data ));
    args.push_back( make_pair( sizeof(cl_int), (void *)&dst.step ));
-    args.push_back( make_pair( sizeof(cl_int), (void *)&dst.offset ));
    args.push_back( make_pair( sizeof(cl_int), (void *)&dst.cols));

    openCLExecuteKernel(clCxt, &pyr_down, kernelName, globalThreads, localThreads, args, src.channels(), src.depth());
 }
-void pyrdown_run(const oclMat &src, const oclMat &dst)
-{
-	switch(src.depth())
-	{
-	case 0:
-	    pyrdown_run<unsigned char>(src, dst);
-		break;
-
-	case 1:
-	    pyrdown_run<char>(src, dst);
-		break;
-
-	case 2:
-	    pyrdown_run<unsigned short>(src, dst);
-		break;
-
-	case 3:
-	    pyrdown_run<short>(src, dst);
-		break;
-
-	case 4:
-	    pyrdown_run<int>(src, dst);
-		break;
-
-	case 5:
-	    pyrdown_run<float>(src, dst);
-		break;
-
-	case 6:
-	    pyrdown_run<double>(src, dst);
-		break;
-
-	default:
-		break;
-	}
-}
 //////////////////////////////////////////////////////////////////////////////
 // pyrDown

@@ -148,11 +109,9 @@ void cv::ocl::pyrDown(const oclMat& src, oclMat& dst)
 {
    CV_Assert(src.depth() <= CV_32F && src.channels() <= 4);

-	//src.step = src.rows;
-
    dst.create((src.rows + 1) / 2, (src.cols + 1) / 2, src.type());

-	dst.download_channels = src.download_channels;
+	dst.download_channels=src.download_channels;

    pyrdown_run(src, dst);
 }

--- a/modules/ocl/src/pyrlk.cpp
+++ b/modules/ocl/src/pyrlk.cpp
--- a/modules/ocl/test/test_pyrlk.cpp
+++ b/modules/ocl/test/test_pyrlk.cpp
@@ -118,9 +118,9 @@ TEST_P(Sparse, Mat)
    cv::Mat status_mat(1, d_status.cols, CV_8UC1, (void*)&status[0]);
    d_status.download(status_mat);

-    std::vector<float> err(d_err.cols);
-    cv::Mat err_mat(1, d_err.cols, CV_32FC1, (void*)&err[0]);
-    d_err.download(err_mat);
+    //std::vector<float> err(d_err.cols);
+    //cv::Mat err_mat(1, d_err.cols, CV_32FC1, (void*)&err[0]);
+    //d_err.download(err_mat);

    std::vector<cv::Point2f> nextPts_gold;
    std::vector<unsigned char> status_gold;