[*] Fixed two bugs in the reduction functor: out-of-bounds shared memory access...

[*] Fixed two bugs in the reduction functor: out-of-bounds shared memory access and a missing volatile qualifier on GF100 and later GPUs

[*] Fixed two bugs in the reduction functor: out-of-bounds shared memory access...
[*] Fixed two bugs in the reduction functor: out-of-bounds shared memory access and a missing volatile qualifier on GF100 and later GPUs
ebc3043c · Anton Obukhov · 4378f398 · ebc3043c · ebc3043c
Showing 2 changed files with 38 additions and 20 deletions

modules/gpu/src/nvidia/NCVHaarObjectDetection.cu +1 -1

modules/gpu/src/nvidia/core/NCVAlg.hpp +37 -19

File not found.
--- a/modules/gpu/src/nvidia/NCVHaarObjectDetection.cu
+++ b/modules/gpu/src/nvidia/NCVHaarObjectDetection.cu
@@ -451,7 +451,7 @@ __global__ void applyHaarClassifierAnchorParallel(Ncv32u *d_IImg, Ncv32u IImgStr
    }
 }
 template <NcvBool tbCacheTextureIImg,
          NcvBool tbCacheTextureCascade,
          NcvBool tbDoAtomicCompaction>

--- a/modules/gpu/src/nvidia/core/NCVAlg.hpp
+++ b/modules/gpu/src/nvidia/core/NCVAlg.hpp
@@ -64,40 +64,56 @@ static T divUp(T a, T b)
 template<typename T>
 struct functorAddValues
 {
-    static __device__ __inline__ void reduce(T &in1out, T &in2)
+    static __device__ __inline__ void assign(volatile T *dst, volatile T *src)
+    {
+        //Works only for integral types. If you see compiler error here, then you have to specify how to copy your object as a set of integral fields.
+        *dst = *src;
+    }
+    static __device__ __inline__ void reduce(volatile T &in1out, const volatile T &in2)
    {
        in1out += in2;
    }
 };
 template<typename T>
 struct functorMinValues
 {
-    static __device__ __inline__ void reduce(T &in1out, T &in2)
+    static __device__ __inline__ void assign(volatile T *dst, volatile T *src)
    {
+        //Works only for integral types. If you see compiler error here, then you have to specify how to copy your object as a set of integral fields.
+        *dst = *src;
+    }
+    static __device__ __inline__ void reduce(volatile T &in1out, const volatile T &in2)
+    {
        in1out = in1out > in2 ? in2 : in1out;
    }
 };
 template<typename T>
 struct functorMaxValues
 {
-    static __device__ __inline__ void reduce(T &in1out, T &in2)
+    static __device__ __inline__ void assign(volatile T *dst, volatile T *src)
    {
+        //Works only for integral types. If you see compiler error here, then you have to specify how to copy your object as a set of integral fields.
+        *dst = *src;
+    }
+    static __device__ __inline__ void reduce(volatile T &in1out, const volatile T &in2)
+    {
        in1out = in1out > in2 ? in1out : in2;
    }
 };
 template<typename Tdata, class Tfunc, Ncv32u nThreads>
 static __device__ Tdata subReduce(Tdata threadElem)
 {
    Tfunc functor;
-    __shared__ Tdata reduceArr[nThreads];
+    __shared__ Tdata _reduceArr[nThreads];
-    reduceArr[threadIdx.x] = threadElem;
+    volatile Tdata *reduceArr = _reduceArr;
+    functor.assign(reduceArr + threadIdx.x, &threadElem);
    __syncthreads();
    if (nThreads >= 256 && threadIdx.x < 128)
@@ -118,18 +134,20 @@ static __device__ Tdata subReduce(Tdata threadElem)
        {
            functor.reduce(reduceArr[threadIdx.x], reduceArr[threadIdx.x + 32]);
        }
-        if (nThreads >= 32)
+        if (nThreads >= 32 && threadIdx.x < 16)
        {
            functor.reduce(reduceArr[threadIdx.x], reduceArr[threadIdx.x + 16]);
+            functor.reduce(reduceArr[threadIdx.x], reduceArr[threadIdx.x + 8]);
+            functor.reduce(reduceArr[threadIdx.x], reduceArr[threadIdx.x + 4]);
+            functor.reduce(reduceArr[threadIdx.x], reduceArr[threadIdx.x + 2]);
+            functor.reduce(reduceArr[threadIdx.x], reduceArr[threadIdx.x + 1]);
        }
-        functor.reduce(reduceArr[threadIdx.x], reduceArr[threadIdx.x + 8]);
-        functor.reduce(reduceArr[threadIdx.x], reduceArr[threadIdx.x + 4]);
-        functor.reduce(reduceArr[threadIdx.x], reduceArr[threadIdx.x + 2]);
-        functor.reduce(reduceArr[threadIdx.x], reduceArr[threadIdx.x + 1]);
    }
    __syncthreads();
-    return reduceArr[0];
+    Tdata reduceRes;
+    functor.assign(&reduceRes, reduceArr);
+    return reduceRes;
 }