Commit 3396a2ec authored by Anton Obukhov

[~] Refactored device reduction for better reuse (now with functor semantics)

Parent 56531c61
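
A minimal sketch of the functor contract this commit introduces (the kernel and buffer names below are illustrative assumptions, not part of the diff): subReduce<Tdata, Tfunc, nThreads> runs a shared-memory tree reduction over one value per thread, combining pairs through Tfunc::reduce(in1out, in2), so the same skeleton serves any associative operation.

// Sketch only: assumes the NCVAlg.hpp added by this commit is on the include path.
#include "NCVAlg.hpp"

// Mirrors the commit's functorAddValues: combines two elements by summation.
struct functorAddValues
{
    __device__ void reduce(Ncv32f &in1out, Ncv32f &in2)
    {
        in1out += in2;
    }
};

// Hypothetical kernel: each block of nThreads threads reduces its slice of
// d_in to one partial sum; subReduce returns the result to every thread.
template <Ncv32u nThreads>
__global__ void blockSumKernel(const Ncv32f *d_in, Ncv32f *d_out)
{
    Ncv32f myVal = d_in[blockIdx.x * nThreads + threadIdx.x];
    Ncv32f blockSum = subReduce<Ncv32f, functorAddValues, nThreads>(myVal);
    if (threadIdx.x == 0)
    {
        d_out[blockIdx.x] = blockSum;
    }
}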
@@ -59,6 +59,7 @@
#include <cstdio>
#include "NCV.hpp"
#include "NCVAlg.hpp"
#include "NPP_staging/NPP_staging.hpp"
#include "NCVRuntimeTemplates.hpp"
#include "NCVHaarObjectDetection.hpp"
@@ -84,11 +85,6 @@ inline __device__ T warpScanInclusive(T idata, volatile T *s_Data)
pos += K_WARP_SIZE;
s_Data[pos] = idata;
//for(Ncv32u offset = 1; offset < K_WARP_SIZE; offset <<= 1)
//{
// s_Data[pos] += s_Data[pos - offset];
//}
s_Data[pos] += s_Data[pos - 1];
s_Data[pos] += s_Data[pos - 2];
s_Data[pos] += s_Data[pos - 4];
@@ -234,60 +230,6 @@ __device__ Ncv32u getElemIImg(Ncv32u x, Ncv32u *d_IImg)
}
__device__ Ncv32f reduceSpecialization(Ncv32f partialSum)
{
__shared__ volatile Ncv32f reductor[NUM_THREADS_CLASSIFIERPARALLEL];
reductor[threadIdx.x] = partialSum;
__syncthreads();
#if defined CPU_FP_COMPLIANCE
if (!threadIdx.x)
{
Ncv32f sum = 0.0f;
for (int i=0; i<NUM_THREADS_CLASSIFIERPARALLEL; i++)
{
sum += reductor[i];
}
reductor[0] = sum;
}
#else
#if NUM_THREADS_CLASSIFIERPARALLEL_LOG2 >= 8
if (threadIdx.x < 128)
{
reductor[threadIdx.x] += reductor[threadIdx.x + 128];
}
__syncthreads();
#endif
#if NUM_THREADS_CLASSIFIERPARALLEL_LOG2 >= 7
if (threadIdx.x < 64)
{
reductor[threadIdx.x] += reductor[threadIdx.x + 64];
}
__syncthreads();
#endif
if (threadIdx.x < 32)
{
#if NUM_THREADS_CLASSIFIERPARALLEL_LOG2 >= 6
reductor[threadIdx.x] += reductor[threadIdx.x + 32];
#endif
#if NUM_THREADS_CLASSIFIERPARALLEL_LOG2 >= 5
reductor[threadIdx.x] += reductor[threadIdx.x + 16];
#endif
reductor[threadIdx.x] += reductor[threadIdx.x + 8];
reductor[threadIdx.x] += reductor[threadIdx.x + 4];
reductor[threadIdx.x] += reductor[threadIdx.x + 2];
reductor[threadIdx.x] += reductor[threadIdx.x + 1];
}
#endif
__syncthreads();
return reductor[0];
}
__device__ Ncv32u d_outMaskPosition;
@@ -623,7 +565,14 @@ __global__ void applyHaarClassifierClassifierParallel(Ncv32u *d_IImg, Ncv32u IIm
curRootNodeOffset += NUM_THREADS_CLASSIFIERPARALLEL;
}
Ncv32f finalStageSum = reduceSpecialization(curStageSum);
struct functorAddValues
{
__device__ void reduce(Ncv32f &in1out, Ncv32f &in2)
{
in1out += in2;
}
};
Ncv32f finalStageSum = subReduce<Ncv32f, functorAddValues, NUM_THREADS_CLASSIFIERPARALLEL>(curStageSum);
if (finalStageSum < stageThreshold)
{
NCVAlg.hpp (new file)
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2009-2010, NVIDIA Corporation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef _ncv_alg_hpp_
#define _ncv_alg_hpp_
#include "NCV.hpp"
template<typename T>
static T divUp(T a, T b)
{
return (a + b - 1) / b;
}
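// Example (illustrative, not part of the commit): divUp(1000u, 256u) == 4,
// i.e. the number of 256-thread blocks needed to cover 1000 elements.

// subReduce below: block-wide tree reduction. Each thread contributes one
// element; pairs are combined via Tfunc::reduce(), and after the final
// __syncthreads() every thread returns the reduced value in reduceArr[0].
// The unrolled stages cover power-of-two block sizes up to 256 threads.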
template<typename Tdata, class Tfunc, Ncv32u nThreads>
static __device__ Tdata subReduce(Tdata threadElem)
{
Tfunc functor;
__shared__ Tdata reduceArr[nThreads];
reduceArr[threadIdx.x] = threadElem;
__syncthreads();
if (nThreads >= 256 && threadIdx.x < 128)
{
functor.reduce(reduceArr[threadIdx.x], reduceArr[threadIdx.x + 128]);
}
__syncthreads();
if (nThreads >= 128 && threadIdx.x < 64)
{
functor.reduce(reduceArr[threadIdx.x], reduceArr[threadIdx.x + 64]);
}
__syncthreads();
if (threadIdx.x < 32)
{
if (nThreads >= 64)
{
functor.reduce(reduceArr[threadIdx.x], reduceArr[threadIdx.x + 32]);
}
if (nThreads >= 32)
{
functor.reduce(reduceArr[threadIdx.x], reduceArr[threadIdx.x + 16]);
}
functor.reduce(reduceArr[threadIdx.x], reduceArr[threadIdx.x + 8]);
functor.reduce(reduceArr[threadIdx.x], reduceArr[threadIdx.x + 4]);
functor.reduce(reduceArr[threadIdx.x], reduceArr[threadIdx.x + 2]);
functor.reduce(reduceArr[threadIdx.x], reduceArr[threadIdx.x + 1]);
}
__syncthreads();
return reduceArr[0];
}
#endif //_ncv_alg_hpp_
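
Because the reduction skeleton is now parameterized on the functor, other associative operations can reuse it unchanged. A hedged sketch (functorMaxValues is hypothetical, not part of this commit):

// Reuses the same tree reduction to compute a block-wide maximum.
struct functorMaxValues
{
    __device__ void reduce(Ncv32f &in1out, Ncv32f &in2)
    {
        in1out = in1out > in2 ? in1out : in2;
    }
};

// Inside a kernel launched with NUM_THREADS threads per block (a power of
// two covered by subReduce's unrolled stages, e.g. 128 or 256):
//   Ncv32f blockMax = subReduce<Ncv32f, functorMaxValues, NUM_THREADS>(myVal);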