提交 ebc3043c 编写于 作者: A Anton Obukhov

[*] Fixed two bugs in the reduction functor: out-of-bounds shared memory access...

[*] Fixed two bugs in the reduction functor: an out-of-bounds shared memory access, and a missing volatile qualifier that matters on GF100 and later GPUs
上级 4378f398
...@@ -451,7 +451,7 @@ __global__ void applyHaarClassifierAnchorParallel(Ncv32u *d_IImg, Ncv32u IImgStr ...@@ -451,7 +451,7 @@ __global__ void applyHaarClassifierAnchorParallel(Ncv32u *d_IImg, Ncv32u IImgStr
} }
} }
template <NcvBool tbCacheTextureIImg, template <NcvBool tbCacheTextureIImg,
NcvBool tbCacheTextureCascade, NcvBool tbCacheTextureCascade,
NcvBool tbDoAtomicCompaction> NcvBool tbDoAtomicCompaction>
......
...@@ -64,40 +64,56 @@ static T divUp(T a, T b) ...@@ -64,40 +64,56 @@ static T divUp(T a, T b)
/**
 * Reduction functor that combines two values by addition.
 *
 * Exposes the two static operations a reduction routine needs from its
 * functor type (e.g. the Tfunc template argument of subReduce):
 *   - assign(dst, src): copy one element into another;
 *   - reduce(in1out, in2): fold in2 into in1out.
 *
 * All operands are volatile-qualified so that the functor can be applied
 * directly to elements accessed through volatile pointers/references.
 */
template<typename T>
struct functorAddValues
{
    /**
     * Copies *src into *dst.
     *
     * @param dst destination element (written)
     * @param src source element (only read, hence const-qualified)
     */
    static __device__ __inline__ void assign(volatile T *dst, const volatile T *src)
    {
        // Plain assignment between volatile objects compiles for built-in
        // scalar types. If you see a compiler error here, T is most likely a
        // class type: specify how to copy the object as a set of scalar fields.
        *dst = *src;
    }

    /**
     * Accumulates in2 into in1out: in1out = in1out + in2.
     *
     * @param in1out left operand and destination of the result
     * @param in2    right operand (only read)
     */
    static __device__ __inline__ void reduce(volatile T &in1out, const volatile T &in2)
    {
        in1out += in2;
    }
};
/**
 * Reduction functor that keeps the smaller of two values.
 *
 * Exposes the two static operations a reduction routine needs from its
 * functor type (e.g. the Tfunc template argument of subReduce):
 *   - assign(dst, src): copy one element into another;
 *   - reduce(in1out, in2): fold in2 into in1out.
 *
 * All operands are volatile-qualified so that the functor can be applied
 * directly to elements accessed through volatile pointers/references.
 */
template<typename T>
struct functorMinValues
{
    /**
     * Copies *src into *dst.
     *
     * @param dst destination element (written)
     * @param src source element (only read, hence const-qualified)
     */
    static __device__ __inline__ void assign(volatile T *dst, const volatile T *src)
    {
        // Plain assignment between volatile objects compiles for built-in
        // scalar types. If you see a compiler error here, T is most likely a
        // class type: specify how to copy the object as a set of scalar fields.
        *dst = *src;
    }

    /**
     * Stores min(in1out, in2) into in1out.
     *
     * @param in1out left operand and destination of the result
     * @param in2    right operand (only read)
     */
    static __device__ __inline__ void reduce(volatile T &in1out, const volatile T &in2)
    {
        // in1out is kept when the two values compare equal.
        in1out = in1out > in2 ? in2 : in1out;
    }
};
/**
 * Reduction functor that keeps the larger of two values.
 *
 * Exposes the two static operations a reduction routine needs from its
 * functor type (e.g. the Tfunc template argument of subReduce):
 *   - assign(dst, src): copy one element into another;
 *   - reduce(in1out, in2): fold in2 into in1out.
 *
 * All operands are volatile-qualified so that the functor can be applied
 * directly to elements accessed through volatile pointers/references.
 */
template<typename T>
struct functorMaxValues
{
    /**
     * Copies *src into *dst.
     *
     * @param dst destination element (written)
     * @param src source element (only read, hence const-qualified)
     */
    static __device__ __inline__ void assign(volatile T *dst, const volatile T *src)
    {
        // Plain assignment between volatile objects compiles for built-in
        // scalar types. If you see a compiler error here, T is most likely a
        // class type: specify how to copy the object as a set of scalar fields.
        *dst = *src;
    }

    /**
     * Stores max(in1out, in2) into in1out.
     *
     * @param in1out left operand and destination of the result
     * @param in2    right operand (only read)
     */
    static __device__ __inline__ void reduce(volatile T &in1out, const volatile T &in2)
    {
        // in2 is taken when the two values compare equal.
        in1out = in1out > in2 ? in1out : in2;
    }
};
template<typename Tdata, class Tfunc, Ncv32u nThreads> template<typename Tdata, class Tfunc, Ncv32u nThreads>
static __device__ Tdata subReduce(Tdata threadElem) static __device__ Tdata subReduce(Tdata threadElem)
{ {
Tfunc functor; Tfunc functor;
__shared__ Tdata reduceArr[nThreads]; __shared__ Tdata _reduceArr[nThreads];
reduceArr[threadIdx.x] = threadElem; volatile Tdata *reduceArr = _reduceArr;
functor.assign(reduceArr + threadIdx.x, &threadElem);
__syncthreads(); __syncthreads();
if (nThreads >= 256 && threadIdx.x < 128) if (nThreads >= 256 && threadIdx.x < 128)
...@@ -118,18 +134,20 @@ static __device__ Tdata subReduce(Tdata threadElem) ...@@ -118,18 +134,20 @@ static __device__ Tdata subReduce(Tdata threadElem)
{ {
functor.reduce(reduceArr[threadIdx.x], reduceArr[threadIdx.x + 32]); functor.reduce(reduceArr[threadIdx.x], reduceArr[threadIdx.x + 32]);
} }
if (nThreads >= 32) if (nThreads >= 32 && threadIdx.x < 16)
{ {
functor.reduce(reduceArr[threadIdx.x], reduceArr[threadIdx.x + 16]); functor.reduce(reduceArr[threadIdx.x], reduceArr[threadIdx.x + 16]);
functor.reduce(reduceArr[threadIdx.x], reduceArr[threadIdx.x + 8]);
functor.reduce(reduceArr[threadIdx.x], reduceArr[threadIdx.x + 4]);
functor.reduce(reduceArr[threadIdx.x], reduceArr[threadIdx.x + 2]);
functor.reduce(reduceArr[threadIdx.x], reduceArr[threadIdx.x + 1]);
} }
functor.reduce(reduceArr[threadIdx.x], reduceArr[threadIdx.x + 8]);
functor.reduce(reduceArr[threadIdx.x], reduceArr[threadIdx.x + 4]);
functor.reduce(reduceArr[threadIdx.x], reduceArr[threadIdx.x + 2]);
functor.reduce(reduceArr[threadIdx.x], reduceArr[threadIdx.x + 1]);
} }
__syncthreads(); __syncthreads();
return reduceArr[0]; Tdata reduceRes;
functor.assign(&reduceRes, reduceArr);
return reduceRes;
} }
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册