Merge pull request #707 from pengx17:2.4_surf

56d62118 · Andrey Kamaev · OpenCV Buildbot · b5dd26e4 · f2ecf4f9 · 56d62118
4 changed file
--- a/modules/nonfree/src/opencl/surf.cl
+++ b/modules/nonfree/src/opencl/surf.cl
@@ -747,21 +747,42 @@ void reduce_32_sum(volatile __local  float * data, volatile float* partial_reduc
 #define op(A, B) (*A)+(B)
    data[tid] = *partial_reduction;
    barrier(CLK_LOCAL_MEM_FENCE);
-
+#ifndef WAVE_SIZE
+#define WAVE_SIZE 1
+#endif
    if (tid < 16)
+    {
        data[tid] = *partial_reduction = op(partial_reduction, data[tid + 16]);
+#if WAVE_SIZE < 16
+    }
    barrier(CLK_LOCAL_MEM_FENCE);
    if (tid < 8)
+    {
+#endif
        data[tid] = *partial_reduction = op(partial_reduction, data[tid + 8 ]);
+#if WAVE_SIZE < 8
+    }
    barrier(CLK_LOCAL_MEM_FENCE);
    if (tid < 4)
+    {
+#endif
        data[tid] = *partial_reduction = op(partial_reduction, data[tid + 4 ]);
+#if WAVE_SIZE < 4
+    }
    barrier(CLK_LOCAL_MEM_FENCE);
    if (tid < 2)
+    {
+#endif
        data[tid] = *partial_reduction = op(partial_reduction, data[tid + 2 ]);
+#if WAVE_SIZE < 2
+    }
    barrier(CLK_LOCAL_MEM_FENCE);
    if (tid < 1)
+    {
+#endif
        data[tid] = *partial_reduction = op(partial_reduction, data[tid + 1 ]);
+    }
+#undef WAVE_SIZE
 #undef op
 }

@@ -1087,44 +1108,67 @@ void reduce_sum25(
    int tid
    )
 {
+#ifndef WAVE_SIZE
+#define WAVE_SIZE 1
+#endif
    // first step is to reduce from 25 to 16
-    if (tid < 9) // use 9 threads
+    if (tid < 9)
    {
        sdata1[tid] += sdata1[tid + 16];
        sdata2[tid] += sdata2[tid + 16];
        sdata3[tid] += sdata3[tid + 16];
        sdata4[tid] += sdata4[tid + 16];
+#if WAVE_SIZE < 16
    }
-
-    // sum (reduce) from 16 to 1 (unrolled - aligned to a half-warp)
+    barrier(CLK_LOCAL_MEM_FENCE);
    if (tid < 8)
    {
+#endif
        sdata1[tid] += sdata1[tid + 8];
-        sdata1[tid] += sdata1[tid + 4];
-        sdata1[tid] += sdata1[tid + 2];
-        sdata1[tid] += sdata1[tid + 1];

        sdata2[tid] += sdata2[tid + 8];
-        sdata2[tid] += sdata2[tid + 4];
-        sdata2[tid] += sdata2[tid + 2];
-        sdata2[tid] += sdata2[tid + 1];

        sdata3[tid] += sdata3[tid + 8];
-        sdata3[tid] += sdata3[tid + 4];
-        sdata3[tid] += sdata3[tid + 2];
-        sdata3[tid] += sdata3[tid + 1];

        sdata4[tid] += sdata4[tid + 8];
+#if WAVE_SIZE < 8
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    if (tid < 4)
+    {
+#endif
+        sdata1[tid] += sdata1[tid + 4];
+        sdata2[tid] += sdata2[tid + 4];
+        sdata3[tid] += sdata3[tid + 4];
        sdata4[tid] += sdata4[tid + 4];
+#if WAVE_SIZE < 4
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    if (tid < 2)
+    {
+#endif
+        sdata1[tid] += sdata1[tid + 2];
+        sdata2[tid] += sdata2[tid + 2];
+        sdata3[tid] += sdata3[tid + 2];
        sdata4[tid] += sdata4[tid + 2];
+#if WAVE_SIZE < 2
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    if (tid < 1)
+    {
+#endif
+        sdata1[tid] += sdata1[tid + 1];
+        sdata2[tid] += sdata2[tid + 1];
+        sdata3[tid] += sdata3[tid + 1];
        sdata4[tid] += sdata4[tid + 1];
    }
+#undef WAVE_SIZE
 }

 __kernel
    void compute_descriptors64(
    IMAGE_INT8 imgTex,
-    volatile __global float * descriptors,
+    __global float * descriptors,
    __global const float * keypoints,
    int descriptors_step,
    int keypoints_step,
@@ -1158,14 +1202,13 @@ __kernel
        sdyabs[tid] = fabs(sdy[tid]); // |dy| array
    }
    barrier(CLK_LOCAL_MEM_FENCE);
-    if (tid < 25)
-    {
+
        reduce_sum25(sdx, sdy, sdxabs, sdyabs, tid);
-    }
+ 
    barrier(CLK_LOCAL_MEM_FENCE);
    if (tid < 25)
    {
-        volatile __global float* descriptors_block = descriptors + descriptors_step * get_group_id(0) + (get_group_id(1) << 2);
+        __global float* descriptors_block = descriptors + descriptors_step * get_group_id(0) + (get_group_id(1) << 2);

        // write dx, dy, |dx|, |dy|
        if (tid == 0)
@@ -1180,7 +1223,7 @@ __kernel
 __kernel
    void compute_descriptors128(
    IMAGE_INT8 imgTex,
-    __global volatile float * descriptors,
+    __global float * descriptors,
    __global float * keypoints,
    int descriptors_step,
    int keypoints_step,
@@ -1229,13 +1272,15 @@ __kernel
            sd2[tid] = sdx[tid];
            sdabs2[tid] = fabs(sdx[tid]);
        }
-        //barrier(CLK_LOCAL_MEM_FENCE);
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);

        reduce_sum25(sd1, sd2, sdabs1, sdabs2, tid);
-        //barrier(CLK_LOCAL_MEM_FENCE);
-
-        volatile __global float* descriptors_block = descriptors + descriptors_step * get_group_id(0) + (get_group_id(1) << 3);
+    barrier(CLK_LOCAL_MEM_FENCE);

+    __global float* descriptors_block = descriptors + descriptors_step * get_group_id(0) + (get_group_id(1) << 3);
+    if (tid < 25)
+    {
        // write dx (dy >= 0), |dx| (dy >= 0), dx (dy < 0), |dx| (dy < 0)
        if (tid == 0)
        {
@@ -1259,11 +1304,14 @@ __kernel
            sd2[tid] = sdy[tid];
            sdabs2[tid] = fabs(sdy[tid]);
        }
-        //barrier(CLK_LOCAL_MEM_FENCE);
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);

        reduce_sum25(sd1, sd2, sdabs1, sdabs2, tid);
-        //barrier(CLK_LOCAL_MEM_FENCE);
+    barrier(CLK_LOCAL_MEM_FENCE);

+    if (tid < 25)
+    {
        // write dy (dx >= 0), |dy| (dx >= 0), dy (dx < 0), |dy| (dx < 0)
        if (tid == 0)
        {
@@ -1274,6 +1322,103 @@ __kernel
        }
    }
 }
+void reduce_sum128(volatile __local  float* smem, int tid)
+{
+#ifndef WAVE_SIZE
+#define WAVE_SIZE 1
+#endif
+    if (tid < 64)
+    {
+        smem[tid] += smem[tid + 64];
+#if WAVE_SIZE < 64
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    if (tid < 32) 
+    {
+#endif
+        smem[tid] += smem[tid + 32];
+#if WAVE_SIZE < 32
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    if (tid < 16) 
+    {
+#endif
+        smem[tid] += smem[tid + 16];
+#if WAVE_SIZE < 16
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    if (tid < 8)
+    {
+#endif
+        smem[tid] += smem[tid + 8];
+#if WAVE_SIZE < 8
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    if (tid < 4)
+    {
+#endif
+        smem[tid] += smem[tid + 4];
+#if WAVE_SIZE < 4
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    if (tid < 2)
+    {
+#endif
+        smem[tid] += smem[tid + 2];
+#if WAVE_SIZE < 2
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    if (tid < 1)
+    {
+#endif
+        smem[tid] += smem[tid + 1];
+    }
+}
+void reduce_sum64(volatile __local  float* smem, int tid)
+{
+#ifndef WAVE_SIZE
+#define WAVE_SIZE 1
+#endif
+    if (tid < 32)
+    {
+        smem[tid] += smem[tid + 32];
+#if WAVE_SIZE < 32
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    if (tid < 16) 
+    {
+#endif
+        smem[tid] += smem[tid + 16];
+#if WAVE_SIZE < 16
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    if (tid < 8)
+    {
+#endif
+        smem[tid] += smem[tid + 8];
+#if WAVE_SIZE < 8
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    if (tid < 4)
+    {
+#endif
+        smem[tid] += smem[tid + 4];
+#if WAVE_SIZE < 4
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    if (tid < 2)
+    {
+#endif
+        smem[tid] += smem[tid + 2];
+#if WAVE_SIZE < 2
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    if (tid < 1)
+    {
+#endif
+        smem[tid] += smem[tid + 1];
+    }
+}

 __kernel
    void normalize_descriptors128(__global float * descriptors, int descriptors_step)
@@ -1288,22 +1433,10 @@ __kernel
    sqDesc[get_local_id(0)] = lookup * lookup;
    barrier(CLK_LOCAL_MEM_FENCE);

-    if (get_local_id(0) < 64)
-        sqDesc[get_local_id(0)] += sqDesc[get_local_id(0) + 64];
+    reduce_sum128(sqDesc, get_local_id(0));
    barrier(CLK_LOCAL_MEM_FENCE);

-    // reduction to get total
-    if (get_local_id(0) < 32)
-    {
-        volatile __local  float* smem = sqDesc;
-
-        smem[get_local_id(0)] += smem[get_local_id(0) + 32];
-        smem[get_local_id(0)] += smem[get_local_id(0) + 16];
-        smem[get_local_id(0)] += smem[get_local_id(0) + 8];
-        smem[get_local_id(0)] += smem[get_local_id(0) + 4];
-        smem[get_local_id(0)] += smem[get_local_id(0) + 2];
-        smem[get_local_id(0)] += smem[get_local_id(0) + 1];
-    }
+

    // compute length (square root)
    volatile __local  float len;
@@ -1329,18 +1462,9 @@ __kernel
    sqDesc[get_local_id(0)] = lookup * lookup;
    barrier(CLK_LOCAL_MEM_FENCE);

-    // reduction to get total
-    if (get_local_id(0) < 32)
-    {
-        volatile __local  float* smem = sqDesc;
-
-        smem[get_local_id(0)] += smem[get_local_id(0) + 32];
-        smem[get_local_id(0)] += smem[get_local_id(0) + 16];
-        smem[get_local_id(0)] += smem[get_local_id(0) + 8];
-        smem[get_local_id(0)] += smem[get_local_id(0) + 4];
-        smem[get_local_id(0)] += smem[get_local_id(0) + 2];
-        smem[get_local_id(0)] += smem[get_local_id(0) + 1];
-    }
+
+    reduce_sum64(sqDesc, get_local_id(0));
+    barrier(CLK_LOCAL_MEM_FENCE);

    // compute length (square root)
    volatile __local  float len;

--- a/modules/nonfree/src/surf.ocl.cpp
+++ b/modules/nonfree/src/surf.ocl.cpp
@@ -43,6 +43,7 @@
 //
 //M*/
 #include "precomp.hpp"
+#include <cstdio>

 #ifdef HAVE_OPENCV_OCL

@@ -57,25 +58,35 @@ namespace cv
        ///////////////////////////OpenCL kernel strings///////////////////////////
        extern const char *surf;

-        const char* noImage2dOption = "-D DISABLE_IMAGE2D";
+        const char noImage2dOption [] = "-D DISABLE_IMAGE2D";

+        static char SURF_OPTIONS [1024] = ""; 
+        static bool USE_IMAGE2d = false;
        static void openCLExecuteKernelSURF(Context *clCxt , const char **source, string kernelName, size_t globalThreads[3],
            size_t localThreads[3],  vector< pair<size_t, const void *> > &args, int channels, int depth)
        {
-            if(support_image2d())
+            char * pSURF_OPTIONS = SURF_OPTIONS;
+            static bool OPTION_INIT = false;
+            if(!OPTION_INIT)
            {
-                openCLExecuteKernel(clCxt, source, kernelName, globalThreads, localThreads, args, channels, depth);
-            }
-            else
-            {
-                openCLExecuteKernel(clCxt, source, kernelName, globalThreads, localThreads, args, channels, depth, noImage2dOption);
+                if( !USE_IMAGE2d )
+                {
+                    strcat(pSURF_OPTIONS, noImage2dOption);
+                    pSURF_OPTIONS += strlen(noImage2dOption);
+                }
+
+                size_t wave_size = 0;
+                queryDeviceInfo(WAVEFRONT_SIZE, &wave_size);
+                std::sprintf(pSURF_OPTIONS, " -D WAVE_SIZE=%d", static_cast<int>(wave_size));
+                OPTION_INIT = true;
            }
+            openCLExecuteKernel(clCxt, source, kernelName, globalThreads, localThreads, args, channels, depth, SURF_OPTIONS);
        }
    }
 }


-static inline int divUp(size_t total, size_t grain)
+static inline size_t divUp(size_t total, size_t grain)
 {
    return (total + grain - 1) / grain;
 }
@@ -152,8 +163,20 @@ public:
        integral(img, surf_.sum);
        if(support_image2d())
        {
-            bindImgTex(img, imgTex);
-            bindImgTex(surf_.sum, sumTex);
+            try
+            {
+                bindImgTex(img, imgTex);
+                bindImgTex(surf_.sum, sumTex);
+                USE_IMAGE2d = true;
+            }
+            catch (const cv::Exception& e)
+            {
+                USE_IMAGE2d = false;
+                if(e.code != CL_IMAGE_FORMAT_NOT_SUPPORTED && e.code != -217)
+                {
+                    throw e;
+                }
+            }
        }

        maskSumTex = 0;

--- a/modules/ocl/include/opencv2/ocl/private/util.hpp
+++ b/modules/ocl/include/opencv2/ocl/private/util.hpp
@@ -123,6 +123,16 @@ namespace cv
        // returns whether the current context supports image2d_t format or not
        bool CV_EXPORTS support_image2d(Context *clCxt = Context::getContext());

+        // the enums are used to query device information
+        // currently only support wavefront size queries
+        enum DEVICE_INFO
+        {
+             WAVEFRONT_SIZE,            //in AMD speak
+             WARP_SIZE = WAVEFRONT_SIZE //in nvidia speak
+        };
+        //info should have been pre-allocated
+        void CV_EXPORTS queryDeviceInfo(DEVICE_INFO info_type, void* info);
+
    }//namespace ocl

 }//namespace cv

--- a/modules/ocl/src/initialization.cpp
+++ b/modules/ocl/src/initialization.cpp
@@ -353,6 +353,51 @@ namespace cv
        {
            return &(Context::getContext()->impl->clCmdQueue);
        }
+
+        void queryDeviceInfo(DEVICE_INFO info_type, void* info)
+        {
+            static Info::Impl* impl = Context::getContext()->impl;
+            switch(info_type)
+            {
+            case WAVEFRONT_SIZE:
+                {
+#ifdef CL_DEVICE_WAVEFRONT_WIDTH_AMD
+                    try
+                    {
+                        openCLSafeCall(clGetDeviceInfo(Context::getContext()->impl->devices[0], 
+                            CL_DEVICE_WAVEFRONT_WIDTH_AMD, sizeof(size_t), info, 0));
+                    }
+                    catch(const cv::Exception&)
+#elif defined (CL_DEVICE_WARP_SIZE_NV)
+                    const int EXT_LEN = 4096 + 1 ;
+                    char extends_set[EXT_LEN];
+                    size_t extends_size;
+                    openCLSafeCall(clGetDeviceInfo(impl->devices[impl->devnum], CL_DEVICE_EXTENSIONS, EXT_LEN, (void *)extends_set, &extends_size));
+                    extends_set[EXT_LEN - 1] = 0;
+                    if(std::string(extends_set).find("cl_nv_device_attribute_query") != std::string::npos)
+                    {
+                        openCLSafeCall(clGetDeviceInfo(Context::getContext()->impl->devices[0], 
+                            CL_DEVICE_WARP_SIZE_NV, sizeof(size_t), info, 0));
+                    }
+                    else
+#endif
+                    {
+                        // if no way left for us to query the warp size, we can get it from kernel group info
+                        static const char * _kernel_string = "__kernel void test_func() {}";
+                        cl_kernel kernel;
+                        kernel = openCLGetKernelFromSource(Context::getContext(), &_kernel_string, "test_func");
+                        openCLSafeCall(clGetKernelWorkGroupInfo(kernel, impl->devices[impl->devnum],
+                            CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, sizeof(size_t), info, NULL));
+                    }
+
+                }
+                break;
+            default:
+                CV_Error(-1, "Invalid device info type");
+                break;
+            }
+        }
+
        void openCLReadBuffer(Context *clCxt, cl_mem dst_buffer, void *host_buffer, size_t size)
        {
            cl_int status;