提交 39700c5d 编写于 作者: A Alexey Spizhevoy

Added some gpu::matchTemplate kernels (the remaining parts will follow after the NPP Staging integration)

上级 a81b41fb
......@@ -175,7 +175,7 @@ void multiplyAndNormalizeSpects(int n, float scale, const cufftComplex* a,
__global__ void matchTemplatePreparedKernel_8U_SQDIFF(
int w, int h, const PtrStepf image_sumsq, float templ_sumsq,
int w, int h, const PtrStep_<unsigned long long> image_sqsum, float templ_sqsum,
DevMem2Df result)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
......@@ -183,24 +183,80 @@ __global__ void matchTemplatePreparedKernel_8U_SQDIFF(
if (x < result.cols && y < result.rows)
{
float image_sq = image_sumsq.ptr(y + h)[x + w]
- image_sumsq.ptr(y)[x + w]
- image_sumsq.ptr(y + h)[x]
+ image_sumsq.ptr(y)[x];
float image_sq = (float)(
(image_sqsum.ptr(y + h)[x + w] - image_sqsum.ptr(y)[x + w]) -
(image_sqsum.ptr(y + h)[x] - image_sqsum.ptr(y)[x]));
float ccorr = result.ptr(y)[x];
result.ptr(y)[x] = image_sq - 2.f * ccorr + templ_sumsq;
result.ptr(y)[x] = image_sq - 2.f * ccorr + templ_sqsum;
}
}
// Host-side launcher for matchTemplatePreparedKernel_8U_SQDIFF: sizes a 2D grid
// from the dimensions of `result`, launches the kernel and waits for it.
// NOTE(review): this span appears to come from a rendered diff, so the parameter
// line and the kernel-argument line each occur twice (an `image_sumsq`/`templ_sumsq`
// variant followed by an `image_sqsum`/`templ_sqsum` variant). Presumably only the
// second line of each pair belongs to the current file -- confirm against the
// real source before compiling.
void matchTemplatePrepared_8U_SQDIFF(
int w, int h, const DevMem2Df image_sumsq, float templ_sumsq,
int w, int h, const DevMem2D_<unsigned long long> image_sqsum, float templ_sqsum,
DevMem2Df result)
{
// 32x8 threads per block; the grid extent is derived from result.cols/result.rows
// through divUp (presumably a round-up division so partial tiles are covered).
dim3 threads(32, 8);
dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
matchTemplatePreparedKernel_8U_SQDIFF<<<grid, threads>>>(
w, h, image_sumsq, templ_sumsq, result);
w, h, image_sqsum, templ_sqsum, result);
// Wait for the kernel to finish; the returned status is handed to cudaSafeCall.
cudaSafeCall(cudaThreadSynchronize());
}
// Rewrites `result` in place: each element is read as a cross-correlation value
// and replaced by
//     (image_sq - 2 * ccorr + templ_sqsum) / sqrt(image_sq * templ_sqsum)
// where image_sq is the four-corner difference of `image_sqsum` over the window
// [x, x + w) x [y, y + h).
//
// Launch layout: 2D grid of 2D blocks, one thread per element of `result`;
// threads that fall outside result.cols x result.rows do nothing.
//
// w, h         column/row offsets of the far window corners in image_sqsum
//              (presumably the template width and height -- confirm with caller).
// image_sqsum  table sampled at rows y, y + h and columns x, x + w; assumed to be
//              an integral image of squared pixel values that is large enough
//              for those accesses -- TODO confirm, no bounds check is done here.
// templ_sqsum  scalar used in both the numerator and the normaliser.
// result       in: correlation values, out: normalised scores.
__global__ void matchTemplatePreparedKernel_8U_SQDIFF_NORMED(
        int w, int h, const PtrStep_<unsigned long long> image_sqsum, float templ_sqsum,
        DevMem2Df result)
{
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;

    if (x < result.cols && y < result.rows)
    {
        // Window sum via four corner samples; the subtraction is carried out in
        // unsigned 64-bit arithmetic and converted to float only at the end.
        float image_sq = (float)(
                (image_sqsum.ptr(y + h)[x + w] - image_sqsum.ptr(y)[x + w]) -
                (image_sqsum.ptr(y + h)[x] - image_sqsum.ptr(y)[x]));
        float ccorr = result.ptr(y)[x];

        // rsqrtf(0) is +inf, which would turn the score into inf or NaN when the
        // window or the template has zero energy. The normalisation is undefined
        // there, so emit a finite "poor match" score of 1 instead.
        float denom = image_sq * templ_sqsum;
        result.ptr(y)[x] = denom > 0.f
                ? (image_sq - 2.f * ccorr + templ_sqsum) * rsqrtf(denom)
                : 1.f;
    }
}
// Host-side launcher for matchTemplatePreparedKernel_8U_SQDIFF_NORMED.
//
// Forwards all arguments unchanged to the kernel, using 32x8 thread blocks and
// a grid derived from result.cols / result.rows, then waits for completion.
// Errors from the launch itself and from the synchronisation are both passed
// to cudaSafeCall.
void matchTemplatePrepared_8U_SQDIFF_NORMED(
        int w, int h, const DevMem2D_<unsigned long long> image_sqsum, float templ_sqsum,
        DevMem2Df result)
{
    // An empty result would give a zero-sized grid, which is an invalid launch
    // configuration; there is nothing to compute in that case.
    if (result.cols <= 0 || result.rows <= 0)
        return;

    dim3 threads(32, 8);
    dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));

    matchTemplatePreparedKernel_8U_SQDIFF_NORMED<<<grid, threads>>>(
            w, h, image_sqsum, templ_sqsum, result);

    // A kernel launch returns no status; configuration errors are only visible
    // through cudaGetLastError().
    cudaSafeCall(cudaGetLastError());
    cudaSafeCall(cudaThreadSynchronize());
}
// Scales every element of `result` in place by 1 / sqrt(image_sq * templ_sqsum),
// where image_sq is the four-corner difference of `image_sqsum` over the window
// [x, x + w) x [y, y + h).
//
// Launch layout: 2D grid of 2D blocks, one thread per element of `result`;
// threads that fall outside result.cols x result.rows do nothing.
//
// w, h         column/row offsets of the far window corners in image_sqsum
//              (presumably the template width and height -- confirm with caller).
// image_sqsum  table sampled at rows y, y + h and columns x, x + w; assumed to be
//              an integral image of squared pixel values that is large enough
//              for those accesses -- TODO confirm, no bounds check is done here.
// templ_sqsum  scalar factor of the normaliser.
// result       values to be normalised, overwritten with the scaled values.
__global__ void normalizeKernel_8U(int w, int h, const PtrStep_<unsigned long long> image_sqsum,
                                   float templ_sqsum, DevMem2Df result)
{
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;

    if (x < result.cols && y < result.rows)
    {
        // Window sum via four corner samples, evaluated in unsigned 64-bit
        // arithmetic before the conversion to float.
        float image_sq = (float)(
                (image_sqsum.ptr(y + h)[x + w] - image_sqsum.ptr(y)[x + w]) -
                (image_sqsum.ptr(y + h)[x] - image_sqsum.ptr(y)[x]));

        // rsqrtf(0) is +inf, so a zero-energy window or template would produce
        // inf (or NaN when the stored value is 0). The normalised value is
        // undefined there; store 0 instead.
        float denom = image_sq * templ_sqsum;
        result.ptr(y)[x] = denom > 0.f
                ? result.ptr(y)[x] * rsqrtf(denom)
                : 0.f;
    }
}
// Host-side launcher for normalizeKernel_8U.
//
// Forwards all arguments unchanged to the kernel, using 32x8 thread blocks and
// a grid derived from result.cols / result.rows, then waits for completion.
// Errors from the launch itself and from the synchronisation are both passed
// to cudaSafeCall.
void normalize_8U(int w, int h, const DevMem2D_<unsigned long long> image_sqsum,
                  float templ_sqsum, DevMem2Df result)
{
    // An empty result would give a zero-sized grid, which is an invalid launch
    // configuration; there is nothing to normalise in that case.
    if (result.cols <= 0 || result.rows <= 0)
        return;

    dim3 threads(32, 8);
    dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));

    normalizeKernel_8U<<<grid, threads>>>(w, h, image_sqsum, templ_sqsum, result);

    // A kernel launch returns no status; configuration errors are only visible
    // through cudaGetLastError().
    cudaSafeCall(cudaGetLastError());
    cudaSafeCall(cudaThreadSynchronize());
}
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册