surf.cu 48.6 KB
Newer Older
V
Vladislav Vinogradov 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48
/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
// Copyright (c) 2010, Paul Furgale, Chi Hay Tong
//
// The original code was written by Paul Furgale and Chi Hay Tong 
// and later optimized and prepared for integration into OpenCV by Itseez.
//
//M*/

#include "internal_shared.hpp"
49
#include "opencv2/gpu/device/limits.hpp"
50
#include "opencv2/gpu/device/saturate_cast.hpp"
51 52
#include "opencv2/gpu/device/utility.hpp"
#include "opencv2/gpu/device/functional.hpp"
53
#include "opencv2/gpu/device/filters.hpp"
V
Vladislav Vinogradov 已提交
54

55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95
BEGIN_OPENCV_DEVICE_NAMESPACE

namespace surf {

////////////////////////////////////////////////////////////////////////
// Global parameters

// The maximum number of features (before subpixel interpolation) that memory is reserved for.
__constant__ int c_max_candidates;
// The maximum number of features that memory is reserved for.
__constant__ int c_max_features;
// The image size.
__constant__ int c_img_rows;
__constant__ int c_img_cols;
// The number of layers.
__constant__ int c_nOctaveLayers;
// The hessian threshold.
__constant__ float c_hessianThreshold;

// The current octave.
__constant__ int c_octave;
// The current layer size.
__constant__ int c_layer_rows;
__constant__ int c_layer_cols;

void loadGlobalConstants(int maxCandidates, int maxFeatures, int img_rows, int img_cols, int nOctaveLayers, float hessianThreshold)
{
    cudaSafeCall( cudaMemcpyToSymbol(c_max_candidates, &maxCandidates, sizeof(maxCandidates)) );
    cudaSafeCall( cudaMemcpyToSymbol(c_max_features, &maxFeatures, sizeof(maxFeatures)) );
    cudaSafeCall( cudaMemcpyToSymbol(c_img_rows, &img_rows, sizeof(img_rows)) );
    cudaSafeCall( cudaMemcpyToSymbol(c_img_cols, &img_cols, sizeof(img_cols)) );
    cudaSafeCall( cudaMemcpyToSymbol(c_nOctaveLayers, &nOctaveLayers, sizeof(nOctaveLayers)) );
    cudaSafeCall( cudaMemcpyToSymbol(c_hessianThreshold, &hessianThreshold, sizeof(hessianThreshold)) );
}

void loadOctaveConstants(int octave, int layer_rows, int layer_cols)
{
    cudaSafeCall( cudaMemcpyToSymbol(c_octave, &octave, sizeof(octave)) );
    cudaSafeCall( cudaMemcpyToSymbol(c_layer_rows, &layer_rows, sizeof(layer_rows)) );
    cudaSafeCall( cudaMemcpyToSymbol(c_layer_cols, &layer_cols, sizeof(layer_cols)) );
}
V
Vladislav Vinogradov 已提交
96

97 98
////////////////////////////////////////////////////////////////////////
// Integral image texture
V
Vladislav Vinogradov 已提交
99

100 101 102 103 104
texture<unsigned char, 2, cudaReadModeElementType> imgTex(0, cudaFilterModePoint, cudaAddressModeClamp);
texture<unsigned int, 2, cudaReadModeElementType> sumTex(0, cudaFilterModePoint, cudaAddressModeClamp);
texture<unsigned int, 2, cudaReadModeElementType> maskSumTex(0, cudaFilterModePoint, cudaAddressModeClamp);

void bindImgTex(DevMem2Db img)
V
Vladislav Vinogradov 已提交
105
{
106 107 108 109 110 111 112 113 114 115
    bindTexture(&imgTex, img);
}
void bindSumTex(DevMem2D_<uint> sum)
{
    bindTexture(&sumTex, sum);
}
void bindMaskSumTex(DevMem2D_<uint> maskSum)
{
    bindTexture(&maskSumTex, maskSum);
}
V
Vladislav Vinogradov 已提交
116

117 118 119 120 121 122 123
template <int N> __device__ float icvCalcHaarPatternSum(const float src[][5], int oldSize, int newSize, int y, int x)
{
#if __CUDA_ARCH__ >= 200
    typedef double real_t;        
#else
    typedef float  real_t;
#endif
V
Vladislav Vinogradov 已提交
124

125 126 127
    float ratio = (float)newSize / oldSize;
    
    real_t d = 0;
128

129 130 131 132 133 134 135 136 137 138 139 140 141 142 143
    #pragma unroll
    for (int k = 0; k < N; ++k)
    {
        int dx1 = __float2int_rn(ratio * src[k][0]);
        int dy1 = __float2int_rn(ratio * src[k][1]);
        int dx2 = __float2int_rn(ratio * src[k][2]);
        int dy2 = __float2int_rn(ratio * src[k][3]);

        real_t t = 0;
        t += tex2D(sumTex, x + dx1, y + dy1);
        t -= tex2D(sumTex, x + dx1, y + dy2);
        t -= tex2D(sumTex, x + dx2, y + dy1);
        t += tex2D(sumTex, x + dx2, y + dy2);

        d += t * src[k][4] / ((dx2 - dx1) * (dy2 - dy1));
V
Vladislav Vinogradov 已提交
144
    }
145

146 147
    return (float)d;
}
148

149 150
////////////////////////////////////////////////////////////////////////
// Hessian
V
Vladislav Vinogradov 已提交
151

152 153 154 155 156 157 158 159
__constant__ float c_DX [3][5] = { {0, 2, 3, 7, 1}, {3, 2, 6, 7, -2}, {6, 2, 9, 7, 1} };
__constant__ float c_DY [3][5] = { {2, 0, 7, 3, 1}, {2, 3, 7, 6, -2}, {2, 6, 7, 9, 1} };
__constant__ float c_DXY[4][5] = { {1, 1, 4, 4, 1}, {5, 1, 8, 4, -1}, {1, 5, 4, 8, -1}, {5, 5, 8, 8, 1} };

__host__ __device__ __forceinline__ int calcSize(int octave, int layer)
{
    /* Wavelet size at first layer of first octave. */
    const int HAAR_SIZE0 = 9;
V
Vladislav Vinogradov 已提交
160

161 162 163 164 165
    /* Wavelet size increment between layers. This should be an even number,
     such that the wavelet sizes in an octave are either all even or all odd.
     This ensures that when looking for the neighbours of a sample, the layers
     above and below are aligned correctly. */
    const int HAAR_SIZE_INC = 6;
V
Vladislav Vinogradov 已提交
166

167 168
    return (HAAR_SIZE0 + HAAR_SIZE_INC * layer) << octave;
}
169

170 171 172 173 174 175
__global__ void icvCalcLayerDetAndTrace(PtrStepf det, PtrStepf trace)
{
    // Determine the indices
    const int gridDim_y = gridDim.y / (c_nOctaveLayers + 2);
    const int blockIdx_y = blockIdx.y % gridDim_y;
    const int blockIdx_z = blockIdx.y / gridDim_y;
176

177 178 179
    const int j = threadIdx.x + blockIdx.x * blockDim.x;
    const int i = threadIdx.y + blockIdx_y * blockDim.y;
    const int layer = blockIdx_z;
180

181
    const int size = calcSize(c_octave, layer);
182

183 184
    const int samples_i = 1 + ((c_img_rows - size) >> c_octave);
    const int samples_j = 1 + ((c_img_cols - size) >> c_octave);
185

186 187
    // Ignore pixels where some of the kernel is outside the image
    const int margin = (size >> 1) >> c_octave;
188

189 190 191 192 193
    if (size <= c_img_rows && size <= c_img_cols && i < samples_i && j < samples_j)
    {
        const float dx  = icvCalcHaarPatternSum<3>(c_DX , 9, size, i << c_octave, j << c_octave);
        const float dy  = icvCalcHaarPatternSum<3>(c_DY , 9, size, i << c_octave, j << c_octave);
        const float dxy = icvCalcHaarPatternSum<4>(c_DXY, 9, size, i << c_octave, j << c_octave);
194

195 196
        det.ptr(layer * c_layer_rows + i + margin)[j + margin] = dx * dy - 0.81f * dxy * dxy;
        trace.ptr(layer * c_layer_rows + i + margin)[j + margin] = dx + dy;
197
    }
198
}
199

200 201 202 203 204
void icvCalcLayerDetAndTrace_gpu(const PtrStepf& det, const PtrStepf& trace, int img_rows, int img_cols, int octave, int nOctaveLayers)
{
    const int min_size = calcSize(octave, 0);
    const int max_samples_i = 1 + ((img_rows - min_size) >> octave);
    const int max_samples_j = 1 + ((img_cols - min_size) >> octave);
205

206
    dim3 threads(16, 16);
207

208 209 210
    dim3 grid;
    grid.x = divUp(max_samples_j, threads.x);
    grid.y = divUp(max_samples_i, threads.y) * (nOctaveLayers + 2);
211

212 213
    icvCalcLayerDetAndTrace<<<grid, threads>>>(det, trace);
    cudaSafeCall( cudaGetLastError() );
V
Vladislav Vinogradov 已提交
214

215 216
    cudaSafeCall( cudaDeviceSynchronize() );
}
V
Vladislav Vinogradov 已提交
217

218 219
////////////////////////////////////////////////////////////////////////
// NONMAX
220

221
__constant__ float c_DM[5] = {0, 0, 9, 9, 1};
222

223 224 225
struct WithMask
{
    static __device__ bool check(int sum_i, int sum_j, int size)
226
    {
227 228 229
        float ratio = (float)size / 9.0f;
        
        float d = 0;
230

231 232 233 234
        int dx1 = __float2int_rn(ratio * c_DM[0]);
        int dy1 = __float2int_rn(ratio * c_DM[1]);
        int dx2 = __float2int_rn(ratio * c_DM[2]);
        int dy2 = __float2int_rn(ratio * c_DM[3]);
235

236 237 238 239 240
        float t = 0;
        t += tex2D(maskSumTex, sum_j + dx1, sum_i + dy1);
        t -= tex2D(maskSumTex, sum_j + dx1, sum_i + dy2);
        t -= tex2D(maskSumTex, sum_j + dx2, sum_i + dy1);
        t += tex2D(maskSumTex, sum_j + dx2, sum_i + dy2);
241

242
        d += t * c_DM[4] / ((dx2 - dx1) * (dy2 - dy1));
243

244 245 246
        return (d >= 0.5f);
    }
};
247

248 249 250 251
template <typename Mask>
__global__ void icvFindMaximaInLayer(const PtrStepf det, const PtrStepf trace, int4* maxPosBuffer, unsigned int* maxCounter)
{
    #if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 110
V
Vladislav Vinogradov 已提交
252

253
    extern __shared__ float N9[];
V
Vladislav Vinogradov 已提交
254

255 256 257 258
    // The hidx variables are the indices to the hessian buffer.
    const int gridDim_y = gridDim.y / c_nOctaveLayers;
    const int blockIdx_y = blockIdx.y % gridDim_y;
    const int blockIdx_z = blockIdx.y / gridDim_y;
V
Vladislav Vinogradov 已提交
259

260
    const int layer = blockIdx_z + 1;
261

262
    const int size = calcSize(c_octave, layer);
263

264 265
    // Ignore pixels without a 3x3x3 neighbourhood in the layer above
    const int margin = ((calcSize(c_octave, layer + 1) >> 1) >> c_octave) + 1;
V
Vladislav Vinogradov 已提交
266

267 268
    const int j = threadIdx.x + blockIdx.x * (blockDim.x - 2) + margin - 1;
    const int i = threadIdx.y + blockIdx_y * (blockDim.y - 2) + margin - 1;
V
Vladislav Vinogradov 已提交
269

270 271 272 273 274 275 276 277 278 279 280
    // Is this thread within the hessian buffer?
    const int zoff = blockDim.x * blockDim.y;
    const int localLin = threadIdx.x + threadIdx.y * blockDim.x + zoff;
    N9[localLin - zoff] = det.ptr(c_layer_rows * (layer - 1) + ::min(::max(i, 0), c_img_rows - 1))[::min(::max(j, 0), c_img_cols - 1)];
    N9[localLin       ] = det.ptr(c_layer_rows * (layer    ) + ::min(::max(i, 0), c_img_rows - 1))[::min(::max(j, 0), c_img_cols - 1)];
    N9[localLin + zoff] = det.ptr(c_layer_rows * (layer + 1) + ::min(::max(i, 0), c_img_rows - 1))[::min(::max(j, 0), c_img_cols - 1)];
    __syncthreads();

    if (i < c_layer_rows - margin && j < c_layer_cols - margin && threadIdx.x > 0 && threadIdx.x < blockDim.x - 1 && threadIdx.y > 0 && threadIdx.y < blockDim.y - 1)
    {
        float val0 = N9[localLin];
281

282
        if (val0 > c_hessianThreshold)
V
Vladislav Vinogradov 已提交
283
        {
284 285 286 287 288
            // Coordinates for the start of the wavelet in the sum image. There
            // is some integer division involved, so don't try to simplify this
            // (cancel out sampleStep) without checking the result is the same
            const int sum_i = (i - ((size >> 1) >> c_octave)) << c_octave;
            const int sum_j = (j - ((size >> 1) >> c_octave)) << c_octave;
289

290
            if (Mask::check(sum_i, sum_j, size))
V
Vladislav Vinogradov 已提交
291
            {
292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323
                // Check to see if we have a max (in its 26 neighbours)
                const bool condmax = val0 > N9[localLin - 1 - blockDim.x - zoff]
                &&                   val0 > N9[localLin     - blockDim.x - zoff]
                &&                   val0 > N9[localLin + 1 - blockDim.x - zoff]
                &&                   val0 > N9[localLin - 1              - zoff]
                &&                   val0 > N9[localLin                  - zoff]
                &&                   val0 > N9[localLin + 1              - zoff]
                &&                   val0 > N9[localLin - 1 + blockDim.x - zoff]
                &&                   val0 > N9[localLin     + blockDim.x - zoff]
                &&                   val0 > N9[localLin + 1 + blockDim.x - zoff]

                &&                   val0 > N9[localLin - 1 - blockDim.x]
                &&                   val0 > N9[localLin     - blockDim.x]
                &&                   val0 > N9[localLin + 1 - blockDim.x]
                &&                   val0 > N9[localLin - 1             ]
                &&                   val0 > N9[localLin + 1             ]
                &&                   val0 > N9[localLin - 1 + blockDim.x]
                &&                   val0 > N9[localLin     + blockDim.x]
                &&                   val0 > N9[localLin + 1 + blockDim.x]

                &&                   val0 > N9[localLin - 1 - blockDim.x + zoff]
                &&                   val0 > N9[localLin     - blockDim.x + zoff]
                &&                   val0 > N9[localLin + 1 - blockDim.x + zoff]
                &&                   val0 > N9[localLin - 1              + zoff]
                &&                   val0 > N9[localLin                  + zoff]
                &&                   val0 > N9[localLin + 1              + zoff]
                &&                   val0 > N9[localLin - 1 + blockDim.x + zoff]
                &&                   val0 > N9[localLin     + blockDim.x + zoff]
                &&                   val0 > N9[localLin + 1 + blockDim.x + zoff]
                ;

                if(condmax)
V
Vladislav Vinogradov 已提交
324
                {
325
                    unsigned int ind = atomicInc(maxCounter,(unsigned int) -1);
326

327 328 329
                    if (ind < c_max_candidates)
                    {
                        const int laplacian = (int) copysignf(1.0f, trace.ptr(layer * c_layer_rows + i)[j]);
330

331
                        maxPosBuffer[ind] = make_int4(j, i, layer, laplacian);
332
                    }
V
Vladislav Vinogradov 已提交
333 334
                }
            }
335
        }
V
Vladislav Vinogradov 已提交
336 337
    }

338 339
    #endif
}
340

341 342 343 344 345
void icvFindMaximaInLayer_gpu(const PtrStepf& det, const PtrStepf& trace, int4* maxPosBuffer, unsigned int* maxCounter,
    int img_rows, int img_cols, int octave, bool use_mask, int nOctaveLayers)
{
    const int layer_rows = img_rows >> octave;
    const int layer_cols = img_cols >> octave;
346

347
    const int min_margin = ((calcSize(octave, 2) >> 1) >> octave) + 1;
V
Vladislav Vinogradov 已提交
348

349
    dim3 threads(16, 16);
V
Vladislav Vinogradov 已提交
350

351 352 353
    dim3 grid;
    grid.x = divUp(layer_cols - 2 * min_margin, threads.x - 2);
    grid.y = divUp(layer_rows - 2 * min_margin, threads.y - 2) * nOctaveLayers;
V
Vladislav Vinogradov 已提交
354

355
    const size_t smem_size = threads.x * threads.y * 3 * sizeof(float);
356

357 358 359 360
    if (use_mask)
        icvFindMaximaInLayer<WithMask><<<grid, threads, smem_size>>>(det, trace, maxPosBuffer, maxCounter);
    else
        icvFindMaximaInLayer<WithOutMask><<<grid, threads, smem_size>>>(det, trace, maxPosBuffer, maxCounter);
V
Vladislav Vinogradov 已提交
361

362
    cudaSafeCall( cudaGetLastError() );
V
Vladislav Vinogradov 已提交
363

364 365 366 367 368
    cudaSafeCall( cudaDeviceSynchronize() );
}

////////////////////////////////////////////////////////////////////////
// INTERPOLATION
V
Vladislav Vinogradov 已提交
369

370 371 372 373 374
__global__ void icvInterpolateKeypoint(const PtrStepf det, const int4* maxPosBuffer,
    float* featureX, float* featureY, int* featureLaplacian, float* featureSize, float* featureHessian,
    unsigned int* featureCounter)
{
    #if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 110
375

376
    const int4 maxPos = maxPosBuffer[blockIdx.x];
V
Vladislav Vinogradov 已提交
377

378 379 380
    const int j = maxPos.x - 1 + threadIdx.x;
    const int i = maxPos.y - 1 + threadIdx.y;
    const int layer = maxPos.z - 1 + threadIdx.z;
V
Vladislav Vinogradov 已提交
381

382
    __shared__ float N9[3][3][3];
V
Vladislav Vinogradov 已提交
383

384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421
    N9[threadIdx.z][threadIdx.y][threadIdx.x] = det.ptr(c_layer_rows * layer + i)[j];
    __syncthreads();

    if (threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0)
    {
        __shared__ float dD[3];

        //dx
        dD[0] = -0.5f * (N9[1][1][2] - N9[1][1][0]);
        //dy
        dD[1] = -0.5f * (N9[1][2][1] - N9[1][0][1]);
        //ds
        dD[2] = -0.5f * (N9[2][1][1] - N9[0][1][1]);

        __shared__ float H[3][3];

        //dxx
        H[0][0] = N9[1][1][0] - 2.0f * N9[1][1][1] + N9[1][1][2];
        //dxy
        H[0][1]= 0.25f * (N9[1][2][2] - N9[1][2][0] - N9[1][0][2] + N9[1][0][0]);
        //dxs
        H[0][2]= 0.25f * (N9[2][1][2] - N9[2][1][0] - N9[0][1][2] + N9[0][1][0]);
        //dyx = dxy
        H[1][0] = H[0][1];
        //dyy
        H[1][1] = N9[1][0][1] - 2.0f * N9[1][1][1] + N9[1][2][1];
        //dys
        H[1][2]= 0.25f * (N9[2][2][1] - N9[2][0][1] - N9[0][2][1] + N9[0][0][1]);
        //dsx = dxs
        H[2][0] = H[0][2];
        //dsy = dys
        H[2][1] = H[1][2];
        //dss
        H[2][2] = N9[0][1][1] - 2.0f * N9[1][1][1] + N9[2][1][1];

        __shared__ float x[3];

        if (solve3x3(H, dD, x))
V
Vladislav Vinogradov 已提交
422
        {
423
            if (::fabs(x[0]) <= 1.f && ::fabs(x[1]) <= 1.f && ::fabs(x[2]) <= 1.f)
424
            {
425 426 427
                // if the step is within the interpolation region, perform it
                
                const int size = calcSize(c_octave, maxPos.z);
V
Vladislav Vinogradov 已提交
428

429 430 431 432 433
                const int sum_i = (maxPos.y - ((size >> 1) >> c_octave)) << c_octave;
                const int sum_j = (maxPos.x - ((size >> 1) >> c_octave)) << c_octave;
                
                const float center_i = sum_i + (float)(size - 1) / 2;
                const float center_j = sum_j + (float)(size - 1) / 2;
V
Vladislav Vinogradov 已提交
434

435 436
                const float px = center_j + x[0] * (1 << c_octave);
                const float py = center_i + x[1] * (1 << c_octave);
437

438 439
                const int ds = size - calcSize(c_octave, maxPos.z - 1);
                const float psize = roundf(size + x[2] * ds);
V
Vladislav Vinogradov 已提交
440

441 442 443
                /* The sampling intervals and wavelet sized for selecting an orientation
                 and building the keypoint descriptor are defined relative to 's' */
                const float s = psize * 1.2f / 9.0f;
V
Vladislav Vinogradov 已提交
444

445 446 447 448 449
                /* To find the dominant orientation, the gradients in x and y are
                 sampled in a circle of radius 6s using wavelets of size 4s.
                 We ensure the gradient wavelet size is even to ensure the
                 wavelet pattern is balanced and symmetric around its center */
                const int grad_wav_size = 2 * __float2int_rn(2.0f * s);
V
Vladislav Vinogradov 已提交
450

451 452 453 454 455
                // check when grad_wav_size is too big
                if ((c_img_rows + 1) >= grad_wav_size && (c_img_cols + 1) >= grad_wav_size)
                {
                    // Get a new feature index.
                    unsigned int ind = atomicInc(featureCounter, (unsigned int)-1);
V
Vladislav Vinogradov 已提交
456

457 458 459 460 461 462 463 464 465 466 467 468
                    if (ind < c_max_features)
                    {
                        featureX[ind] = px;
                        featureY[ind] = py;
                        featureLaplacian[ind] = maxPos.w;
                        featureSize[ind] = psize;
                        featureHessian[ind] = N9[1][1][1];
                    }
                } // grad_wav_size check
            } // If the subpixel interpolation worked
        }
    } // If this is thread 0.
V
Vladislav Vinogradov 已提交
469

470 471
    #endif
}
V
Vladislav Vinogradov 已提交
472

473 474 475 476 477 478 479 480
void icvInterpolateKeypoint_gpu(const PtrStepf& det, const int4* maxPosBuffer, unsigned int maxCounter, 
    float* featureX, float* featureY, int* featureLaplacian, float* featureSize, float* featureHessian, 
    unsigned int* featureCounter)
{
    dim3 threads;
    threads.x = 3;
    threads.y = 3;
    threads.z = 3;
V
Vladislav Vinogradov 已提交
481

482 483
    dim3 grid;
    grid.x = maxCounter;
484

485 486
    icvInterpolateKeypoint<<<grid, threads>>>(det, maxPosBuffer, featureX, featureY, featureLaplacian, featureSize, featureHessian, featureCounter);
    cudaSafeCall( cudaGetLastError() );
V
Vladislav Vinogradov 已提交
487

488 489
    cudaSafeCall( cudaDeviceSynchronize() );
}
V
Vladislav Vinogradov 已提交
490

491 492
////////////////////////////////////////////////////////////////////////
// Orientation
493

494 495 496
#define ORI_SEARCH_INC 5
#define ORI_WIN        60
#define ORI_SAMPLES    113
V
Vladislav Vinogradov 已提交
497

498 499 500
__constant__ float c_aptX[ORI_SAMPLES] = {-6, -5, -5, -5, -5, -5, -5, -5, -4, -4, -4, -4, -4, -4, -4, -4, -4, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 6};
__constant__ float c_aptY[ORI_SAMPLES] = {0, -3, -2, -1, 0, 1, 2, 3, -4, -3, -2, -1, 0, 1, 2, 3, 4, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -4, -3, -2, -1, 0, 1, 2, 3, 4, -3, -2, -1, 0, 1, 2, 3, 0};
__constant__ float c_aptW[ORI_SAMPLES] = {0.001455130288377404f, 0.001707611023448408f, 0.002547456417232752f, 0.003238451667129993f, 0.0035081731621176f, 0.003238451667129993f, 0.002547456417232752f, 0.001707611023448408f, 0.002003900473937392f, 0.0035081731621176f, 0.005233579315245152f, 0.00665318313986063f, 0.00720730796456337f, 0.00665318313986063f, 0.005233579315245152f, 0.0035081731621176f, 0.002003900473937392f, 0.001707611023448408f, 0.0035081731621176f, 0.006141661666333675f, 0.009162282571196556f, 0.01164754293859005f, 0.01261763460934162f, 0.01164754293859005f, 0.009162282571196556f, 0.006141661666333675f, 0.0035081731621176f, 0.001707611023448408f, 0.002547456417232752f, 0.005233579315245152f, 0.009162282571196556f, 0.01366852037608624f, 0.01737609319388866f, 0.0188232995569706f, 0.01737609319388866f, 0.01366852037608624f, 0.009162282571196556f, 0.005233579315245152f, 0.002547456417232752f, 0.003238451667129993f, 0.00665318313986063f, 0.01164754293859005f, 0.01737609319388866f, 0.02208934165537357f, 0.02392910048365593f, 0.02208934165537357f, 0.01737609319388866f, 0.01164754293859005f, 0.00665318313986063f, 0.003238451667129993f, 0.001455130288377404f, 0.0035081731621176f, 0.00720730796456337f, 0.01261763460934162f, 0.0188232995569706f, 0.02392910048365593f, 0.02592208795249462f, 0.02392910048365593f, 0.0188232995569706f, 0.01261763460934162f, 0.00720730796456337f, 0.0035081731621176f, 0.001455130288377404f, 0.003238451667129993f, 0.00665318313986063f, 0.01164754293859005f, 0.01737609319388866f, 0.02208934165537357f, 0.02392910048365593f, 0.02208934165537357f, 0.01737609319388866f, 0.01164754293859005f, 0.00665318313986063f, 0.003238451667129993f, 0.002547456417232752f, 0.005233579315245152f, 0.009162282571196556f, 0.01366852037608624f, 0.01737609319388866f, 0.0188232995569706f, 0.01737609319388866f, 0.01366852037608624f, 0.009162282571196556f, 0.005233579315245152f, 0.002547456417232752f, 0.001707611023448408f, 0.0035081731621176f, 0.006141661666333675f, 0.009162282571196556f, 0.01164754293859005f, 0.01261763460934162f, 0.01164754293859005f, 0.009162282571196556f, 0.006141661666333675f, 0.0035081731621176f, 0.001707611023448408f, 0.002003900473937392f, 0.0035081731621176f, 0.005233579315245152f, 0.00665318313986063f, 0.00720730796456337f, 0.00665318313986063f, 0.005233579315245152f, 0.0035081731621176f, 0.002003900473937392f, 0.001707611023448408f, 0.002547456417232752f, 0.003238451667129993f, 0.0035081731621176f, 0.003238451667129993f, 0.002547456417232752f, 0.001707611023448408f, 0.001455130288377404f};
V
Vladislav Vinogradov 已提交
501

502 503
__constant__ float c_NX[2][5] = {{0, 0, 2, 4, -1}, {2, 0, 4, 4, 1}};
__constant__ float c_NY[2][5] = {{0, 0, 4, 2, 1}, {0, 2, 4, 4, -1}};
V
Vladislav Vinogradov 已提交
504

505 506 507
__global__ void icvCalcOrientation(const float* featureX, const float* featureY, const float* featureSize, float* featureDir)
{        
    #if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 110
508

509 510 511
    __shared__ float s_X[128];
    __shared__ float s_Y[128];
    __shared__ float s_angle[128];
512

513
    __shared__ float s_sum[32 * 4];
514

515 516 517
    /* The sampling intervals and wavelet sized for selecting an orientation
     and building the keypoint descriptor are defined relative to 's' */
    const float s = featureSize[blockIdx.x] * 1.2f / 9.0f;
518

519 520 521 522 523
    /* To find the dominant orientation, the gradients in x and y are
     sampled in a circle of radius 6s using wavelets of size 4s.
     We ensure the gradient wavelet size is even to ensure the
     wavelet pattern is balanced and symmetric around its center */
    const int grad_wav_size = 2 * __float2int_rn(2.0f * s);
524

525 526 527 528 529
    // check when grad_wav_size is too big
    if ((c_img_rows + 1) >= grad_wav_size && (c_img_cols + 1) >= grad_wav_size)
    {
        // Calc X, Y, angle and store it to shared memory
        const int tid = threadIdx.y * blockDim.x + threadIdx.x;
530

531 532 533 534 535 536 537
        float X = 0.0f, Y = 0.0f, angle = 0.0f;

        if (tid < ORI_SAMPLES)
        {
            const float margin = (float)(grad_wav_size - 1) / 2.0f;
            const int x = __float2int_rn(featureX[blockIdx.x] + c_aptX[tid] * s - margin);
            const int y = __float2int_rn(featureY[blockIdx.x] + c_aptY[tid] * s - margin);
538

539
            if ((unsigned)y < (unsigned)((c_img_rows + 1) - grad_wav_size) && (unsigned)x < (unsigned)((c_img_cols + 1) - grad_wav_size))
540
            {
541 542 543 544 545 546 547
                X = c_aptW[tid] * icvCalcHaarPatternSum<2>(c_NX, 4, grad_wav_size, y, x);
                Y = c_aptW[tid] * icvCalcHaarPatternSum<2>(c_NY, 4, grad_wav_size, y, x);
            
                angle = atan2f(Y, X);
                if (angle < 0)
                    angle += 2.0f * CV_PI;
                angle *= 180.0f / CV_PI;
548
            }
549 550 551 552 553
        }
        s_X[tid] = X;
        s_Y[tid] = Y;
        s_angle[tid] = angle;
        __syncthreads();
554

555
        float bestx = 0, besty = 0, best_mod = 0;
556

557 558 559 560
        #pragma unroll
        for (int i = 0; i < 18; ++i)
        {
            const int dir = (i * 4 + threadIdx.y) * ORI_SEARCH_INC;
561

562 563 564 565 566 567
            float sumx = 0.0f, sumy = 0.0f;
            int d = ::abs(__float2int_rn(s_angle[threadIdx.x]) - dir);
            if (d < ORI_WIN / 2 || d > 360 - ORI_WIN / 2)
            {
                sumx = s_X[threadIdx.x];
                sumy = s_Y[threadIdx.x];
568
            }
569 570 571 572 573 574 575 576 577 578 579 580 581 582
            d = ::abs(__float2int_rn(s_angle[threadIdx.x + 32]) - dir);
            if (d < ORI_WIN / 2 || d > 360 - ORI_WIN / 2)
            {
                sumx += s_X[threadIdx.x + 32];
                sumy += s_Y[threadIdx.x + 32];
            }
            d = ::abs(__float2int_rn(s_angle[threadIdx.x + 64]) - dir);
            if (d < ORI_WIN / 2 || d > 360 - ORI_WIN / 2)
            {
                sumx += s_X[threadIdx.x + 64];
                sumy += s_Y[threadIdx.x + 64];
            }
            d = ::abs(__float2int_rn(s_angle[threadIdx.x + 96]) - dir);
            if (d < ORI_WIN / 2 || d > 360 - ORI_WIN / 2)
583
            {
584 585 586 587 588
                sumx += s_X[threadIdx.x + 96];
                sumy += s_Y[threadIdx.x + 96];
            }

            float* s_sum_row = s_sum + threadIdx.y * 32;
589

590 591 592 593 594 595 596 597 598
            device::reduce<32>(s_sum_row, sumx, threadIdx.x, plus<volatile float>());
            device::reduce<32>(s_sum_row, sumy, threadIdx.x, plus<volatile float>());

            const float temp_mod = sumx * sumx + sumy * sumy;
            if (temp_mod > best_mod)
            {
                best_mod = temp_mod;
                bestx = sumx;
                besty = sumy;
599
            }
V
Vladislav Vinogradov 已提交
600
        }
601

602 603 604 605 606 607 608
        if (threadIdx.x == 0)
        {
            s_X[threadIdx.y] = bestx;
            s_Y[threadIdx.y] = besty;
            s_angle[threadIdx.y] = best_mod;
        }
        __syncthreads();
V
Vladislav Vinogradov 已提交
609

610 611 612 613 614
        if (threadIdx.x < 2 && threadIdx.y == 0)
        {
            volatile float* v_x = s_X;
            volatile float* v_y = s_Y;
            volatile float* v_mod = s_angle;
615

616 617 618
            bestx = v_x[threadIdx.x];
            besty = v_y[threadIdx.x];
            best_mod = v_mod[threadIdx.x];
V
Vladislav Vinogradov 已提交
619

620 621 622 623 624 625 626 627 628 629 630 631 632 633
            float temp_mod = v_mod[threadIdx.x + 2];
            if (temp_mod > best_mod)
            {
                v_x[threadIdx.x] = bestx = v_x[threadIdx.x + 2];
                v_y[threadIdx.x] = besty = v_y[threadIdx.x + 2];
                v_mod[threadIdx.x] = best_mod = temp_mod;
            }
            temp_mod = v_mod[threadIdx.x + 1];
            if (temp_mod > best_mod)
            {
                v_x[threadIdx.x] = bestx = v_x[threadIdx.x + 1];
                v_y[threadIdx.x] = besty = v_y[threadIdx.x + 1];
            }
        }
V
Vladislav Vinogradov 已提交
634

635 636 637 638 639 640
        if (threadIdx.x == 0 && threadIdx.y == 0 && best_mod != 0)
        {
            float kp_dir = atan2f(besty, bestx);
            if (kp_dir < 0)
                kp_dir += 2.0f * CV_PI;
            kp_dir *= 180.0f / CV_PI;
641

642 643
            featureDir[blockIdx.x] = kp_dir;
        }
V
Vladislav Vinogradov 已提交
644 645
    }

646 647
    #endif
}
V
Vladislav Vinogradov 已提交
648

649 650 651
#undef ORI_SEARCH_INC
#undef ORI_WIN
#undef ORI_SAMPLES
652

653 654 655 656 657
void icvCalcOrientation_gpu(const float* featureX, const float* featureY, const float* featureSize, float* featureDir, int nFeatures) 
{
    dim3 threads;
    threads.x = 32;
    threads.y = 4;
658

659 660
    dim3 grid;
    grid.x = nFeatures;
V
Vladislav Vinogradov 已提交
661

662 663
    icvCalcOrientation<<<grid, threads>>>(featureX, featureY, featureSize, featureDir);
    cudaSafeCall( cudaGetLastError() );
V
Vladislav Vinogradov 已提交
664

665 666
    cudaSafeCall( cudaDeviceSynchronize() );
}
V
Vladislav Vinogradov 已提交
667

668 669 670 671
////////////////////////////////////////////////////////////////////////
// Descriptors

#define PATCH_SZ 20
672

673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699
__constant__ float c_DW[PATCH_SZ * PATCH_SZ] = 
{
    3.695352233989979e-006f, 8.444558261544444e-006f, 1.760426494001877e-005f, 3.34794785885606e-005f, 5.808438800158911e-005f, 9.193058212986216e-005f, 0.0001327334757661447f, 0.0001748319627949968f, 0.0002100782439811155f, 0.0002302826324012131f, 0.0002302826324012131f, 0.0002100782439811155f, 0.0001748319627949968f, 0.0001327334757661447f, 9.193058212986216e-005f, 5.808438800158911e-005f, 3.34794785885606e-005f, 1.760426494001877e-005f, 8.444558261544444e-006f, 3.695352233989979e-006f, 
    8.444558261544444e-006f, 1.929736572492402e-005f, 4.022897701361217e-005f, 7.650675252079964e-005f, 0.0001327334903180599f, 0.0002100782585330308f, 0.0003033203829545528f, 0.0003995231236331165f, 0.0004800673632416874f, 0.0005262381164357066f, 0.0005262381164357066f, 0.0004800673632416874f, 0.0003995231236331165f, 0.0003033203829545528f, 0.0002100782585330308f, 0.0001327334903180599f, 7.650675252079964e-005f, 4.022897701361217e-005f, 1.929736572492402e-005f, 8.444558261544444e-006f, 
    1.760426494001877e-005f, 4.022897701361217e-005f, 8.386484114453197e-005f, 0.0001594926579855382f, 0.0002767078403849155f, 0.0004379475140012801f, 0.0006323281559161842f, 0.0008328808471560478f, 0.001000790391117334f, 0.001097041997127235f, 0.001097041997127235f, 0.001000790391117334f, 0.0008328808471560478f, 0.0006323281559161842f, 0.0004379475140012801f, 0.0002767078403849155f, 0.0001594926579855382f, 8.386484114453197e-005f, 4.022897701361217e-005f, 1.760426494001877e-005f, 
    3.34794785885606e-005f, 7.650675252079964e-005f, 0.0001594926579855382f, 0.0003033203247468919f, 0.0005262380582280457f, 0.0008328807889483869f, 0.001202550483867526f, 0.001583957928232849f, 0.001903285388834775f, 0.002086334861814976f, 0.002086334861814976f, 0.001903285388834775f, 0.001583957928232849f, 0.001202550483867526f, 0.0008328807889483869f, 0.0005262380582280457f, 0.0003033203247468919f, 0.0001594926579855382f, 7.650675252079964e-005f, 3.34794785885606e-005f, 
    5.808438800158911e-005f, 0.0001327334903180599f, 0.0002767078403849155f, 0.0005262380582280457f, 0.0009129836107604206f, 0.001444985857233405f, 0.002086335094645619f, 0.002748048631474376f, 0.00330205773934722f, 0.003619635012000799f, 0.003619635012000799f, 0.00330205773934722f, 0.002748048631474376f, 0.002086335094645619f, 0.001444985857233405f, 0.0009129836107604206f, 0.0005262380582280457f, 0.0002767078403849155f, 0.0001327334903180599f, 5.808438800158911e-005f, 
    9.193058212986216e-005f, 0.0002100782585330308f, 0.0004379475140012801f, 0.0008328807889483869f, 0.001444985857233405f, 0.002286989474669099f, 0.00330205773934722f, 0.004349356517195702f, 0.00522619066759944f, 0.005728822201490402f, 0.005728822201490402f, 0.00522619066759944f, 0.004349356517195702f, 0.00330205773934722f, 0.002286989474669099f, 0.001444985857233405f, 0.0008328807889483869f, 0.0004379475140012801f, 0.0002100782585330308f, 9.193058212986216e-005f, 
    0.0001327334757661447f, 0.0003033203829545528f, 0.0006323281559161842f, 0.001202550483867526f, 0.002086335094645619f, 0.00330205773934722f, 0.004767658654600382f, 0.006279794964939356f, 0.007545807864516974f, 0.008271530270576477f, 0.008271530270576477f, 0.007545807864516974f, 0.006279794964939356f, 0.004767658654600382f, 0.00330205773934722f, 0.002086335094645619f, 0.001202550483867526f, 0.0006323281559161842f, 0.0003033203829545528f, 0.0001327334757661447f, 
    0.0001748319627949968f, 0.0003995231236331165f, 0.0008328808471560478f, 0.001583957928232849f, 0.002748048631474376f, 0.004349356517195702f, 0.006279794964939356f, 0.008271529339253902f, 0.009939077310264111f, 0.01089497376233339f, 0.01089497376233339f, 0.009939077310264111f, 0.008271529339253902f, 0.006279794964939356f, 0.004349356517195702f, 0.002748048631474376f, 0.001583957928232849f, 0.0008328808471560478f, 0.0003995231236331165f, 0.0001748319627949968f, 
    0.0002100782439811155f, 0.0004800673632416874f, 0.001000790391117334f, 0.001903285388834775f, 0.00330205773934722f, 0.00522619066759944f, 0.007545807864516974f, 0.009939077310264111f, 0.01194280479103327f, 0.01309141051024199f, 0.01309141051024199f, 0.01194280479103327f, 0.009939077310264111f, 0.007545807864516974f, 0.00522619066759944f, 0.00330205773934722f, 0.001903285388834775f, 0.001000790391117334f, 0.0004800673632416874f, 0.0002100782439811155f, 
    0.0002302826324012131f, 0.0005262381164357066f, 0.001097041997127235f, 0.002086334861814976f, 0.003619635012000799f, 0.005728822201490402f, 0.008271530270576477f, 0.01089497376233339f, 0.01309141051024199f, 0.01435048412531614f, 0.01435048412531614f, 0.01309141051024199f, 0.01089497376233339f, 0.008271530270576477f, 0.005728822201490402f, 0.003619635012000799f, 0.002086334861814976f, 0.001097041997127235f, 0.0005262381164357066f, 0.0002302826324012131f, 
    0.0002302826324012131f, 0.0005262381164357066f, 0.001097041997127235f, 0.002086334861814976f, 0.003619635012000799f, 0.005728822201490402f, 0.008271530270576477f, 0.01089497376233339f, 0.01309141051024199f, 0.01435048412531614f, 0.01435048412531614f, 0.01309141051024199f, 0.01089497376233339f, 0.008271530270576477f, 0.005728822201490402f, 0.003619635012000799f, 0.002086334861814976f, 0.001097041997127235f, 0.0005262381164357066f, 0.0002302826324012131f, 
    0.0002100782439811155f, 0.0004800673632416874f, 0.001000790391117334f, 0.001903285388834775f, 0.00330205773934722f, 0.00522619066759944f, 0.007545807864516974f, 0.009939077310264111f, 0.01194280479103327f, 0.01309141051024199f, 0.01309141051024199f, 0.01194280479103327f, 0.009939077310264111f, 0.007545807864516974f, 0.00522619066759944f, 0.00330205773934722f, 0.001903285388834775f, 0.001000790391117334f, 0.0004800673632416874f, 0.0002100782439811155f, 
    0.0001748319627949968f, 0.0003995231236331165f, 0.0008328808471560478f, 0.001583957928232849f, 0.002748048631474376f, 0.004349356517195702f, 0.006279794964939356f, 0.008271529339253902f, 0.009939077310264111f, 0.01089497376233339f, 0.01089497376233339f, 0.009939077310264111f, 0.008271529339253902f, 0.006279794964939356f, 0.004349356517195702f, 0.002748048631474376f, 0.001583957928232849f, 0.0008328808471560478f, 0.0003995231236331165f, 0.0001748319627949968f, 
    0.0001327334757661447f, 0.0003033203829545528f, 0.0006323281559161842f, 0.001202550483867526f, 0.002086335094645619f, 0.00330205773934722f, 0.004767658654600382f, 0.006279794964939356f, 0.007545807864516974f, 0.008271530270576477f, 0.008271530270576477f, 0.007545807864516974f, 0.006279794964939356f, 0.004767658654600382f, 0.00330205773934722f, 0.002086335094645619f, 0.001202550483867526f, 0.0006323281559161842f, 0.0003033203829545528f, 0.0001327334757661447f, 
    9.193058212986216e-005f, 0.0002100782585330308f, 0.0004379475140012801f, 0.0008328807889483869f, 0.001444985857233405f, 0.002286989474669099f, 0.00330205773934722f, 0.004349356517195702f, 0.00522619066759944f, 0.005728822201490402f, 0.005728822201490402f, 0.00522619066759944f, 0.004349356517195702f, 0.00330205773934722f, 0.002286989474669099f, 0.001444985857233405f, 0.0008328807889483869f, 0.0004379475140012801f, 0.0002100782585330308f, 9.193058212986216e-005f, 
    5.808438800158911e-005f, 0.0001327334903180599f, 0.0002767078403849155f, 0.0005262380582280457f, 0.0009129836107604206f, 0.001444985857233405f, 0.002086335094645619f, 0.002748048631474376f, 0.00330205773934722f, 0.003619635012000799f, 0.003619635012000799f, 0.00330205773934722f, 0.002748048631474376f, 0.002086335094645619f, 0.001444985857233405f, 0.0009129836107604206f, 0.0005262380582280457f, 0.0002767078403849155f, 0.0001327334903180599f, 5.808438800158911e-005f, 
    3.34794785885606e-005f, 7.650675252079964e-005f, 0.0001594926579855382f, 0.0003033203247468919f, 0.0005262380582280457f, 0.0008328807889483869f, 0.001202550483867526f, 0.001583957928232849f, 0.001903285388834775f, 0.002086334861814976f, 0.002086334861814976f, 0.001903285388834775f, 0.001583957928232849f, 0.001202550483867526f, 0.0008328807889483869f, 0.0005262380582280457f, 0.0003033203247468919f, 0.0001594926579855382f, 7.650675252079964e-005f, 3.34794785885606e-005f, 
    1.760426494001877e-005f, 4.022897701361217e-005f, 8.386484114453197e-005f, 0.0001594926579855382f, 0.0002767078403849155f, 0.0004379475140012801f, 0.0006323281559161842f, 0.0008328808471560478f, 0.001000790391117334f, 0.001097041997127235f, 0.001097041997127235f, 0.001000790391117334f, 0.0008328808471560478f, 0.0006323281559161842f, 0.0004379475140012801f, 0.0002767078403849155f, 0.0001594926579855382f, 8.386484114453197e-005f, 4.022897701361217e-005f, 1.760426494001877e-005f, 
    8.444558261544444e-006f, 1.929736572492402e-005f, 4.022897701361217e-005f, 7.650675252079964e-005f, 0.0001327334903180599f, 0.0002100782585330308f, 0.0003033203829545528f, 0.0003995231236331165f, 0.0004800673632416874f, 0.0005262381164357066f, 0.0005262381164357066f, 0.0004800673632416874f, 0.0003995231236331165f, 0.0003033203829545528f, 0.0002100782585330308f, 0.0001327334903180599f, 7.650675252079964e-005f, 4.022897701361217e-005f, 1.929736572492402e-005f, 8.444558261544444e-006f, 
    3.695352233989979e-006f, 8.444558261544444e-006f, 1.760426494001877e-005f, 3.34794785885606e-005f, 5.808438800158911e-005f, 9.193058212986216e-005f, 0.0001327334757661447f, 0.0001748319627949968f, 0.0002100782439811155f, 0.0002302826324012131f, 0.0002302826324012131f, 0.0002100782439811155f, 0.0001748319627949968f, 0.0001327334757661447f, 9.193058212986216e-005f, 5.808438800158911e-005f, 3.34794785885606e-005f, 1.760426494001877e-005f, 8.444558261544444e-006f, 3.695352233989979e-006f
};

struct WinReader
{
    typedef uchar elem_type;
700

701 702
    __device__ __forceinline__ WinReader(float centerX_, float centerY_, float win_offset_, float cos_dir_, float sin_dir_) : 
        centerX(centerX_), centerY(centerY_), win_offset(win_offset_), cos_dir(cos_dir_), sin_dir(sin_dir_)
703
    {
704
    }
705

706 707 708 709
    __device__ __forceinline__ uchar operator ()(int i, int j) const
    {
        float pixel_x = centerX + (win_offset + j) * cos_dir + (win_offset + i) * sin_dir;
        float pixel_y = centerY - (win_offset + j) * sin_dir + (win_offset + i) * cos_dir;
710

711 712
        return tex2D(imgTex, pixel_x, pixel_y);
    }
713

714 715 716 717 718 719
    float centerX; 
    float centerY;
    float win_offset; 
    float cos_dir; 
    float sin_dir;
};
720

721 722 723 724
__device__ void calc_dx_dy(float s_dx_bin[25], float s_dy_bin[25], 
    const float* featureX, const float* featureY, const float* featureSize, const float* featureDir)
{
    __shared__ float s_PATCH[6][6];
V
Vladislav Vinogradov 已提交
725

726 727 728 729
    const float centerX = featureX[blockIdx.x];
    const float centerY = featureY[blockIdx.x];
    const float size = featureSize[blockIdx.x];
    const float descriptor_dir = featureDir[blockIdx.x] * (float)(CV_PI / 180);
V
Vladislav Vinogradov 已提交
730

731 732 733
    /* The sampling intervals and wavelet sized for selecting an orientation
     and building the keypoint descriptor are defined relative to 's' */
    const float s = size * 1.2f / 9.0f;
734

735 736
    /* Extract a window of pixels around the keypoint of size 20s */
    const int win_size = (int)((PATCH_SZ + 1) * s);
737

738 739 740
    float sin_dir;
    float cos_dir;
    sincosf(descriptor_dir, &sin_dir, &cos_dir);
741

742 743
    /* Nearest neighbour version (faster) */
    const float win_offset = -(float)(win_size - 1) / 2; 
744

745 746 747 748 749 750
    // Compute sampling points
    // since grids are 2D, need to compute xBlock and yBlock indices
    const int xBlock = (blockIdx.y & 3);  // blockIdx.y % 4
    const int yBlock = (blockIdx.y >> 2); // floor(blockIdx.y/4)
    const int xIndex = xBlock * 5 + threadIdx.x;
    const int yIndex = yBlock * 5 + threadIdx.y;
751

752 753
    const float icoo = ((float)yIndex / (PATCH_SZ + 1)) * win_size;
    const float jcoo = ((float)xIndex / (PATCH_SZ + 1)) * win_size;
754

755
    LinearFilter<WinReader> filter(WinReader(centerX, centerY, win_offset, cos_dir, sin_dir));
756

757
    s_PATCH[threadIdx.y][threadIdx.x] = filter(icoo, jcoo);
V
Vladislav Vinogradov 已提交
758

759
    __syncthreads();
V
Vladislav Vinogradov 已提交
760

761
    if (threadIdx.x < 5 && threadIdx.y < 5)
762
    {
763
        const int tid = threadIdx.y * 5 + threadIdx.x;
V
Vladislav Vinogradov 已提交
764

765
        const float dw = c_DW[yIndex * PATCH_SZ + xIndex];
V
Vladislav Vinogradov 已提交
766

767 768
        const float vx = (s_PATCH[threadIdx.y    ][threadIdx.x + 1] - s_PATCH[threadIdx.y][threadIdx.x] + s_PATCH[threadIdx.y + 1][threadIdx.x + 1] - s_PATCH[threadIdx.y + 1][threadIdx.x    ]) * dw;
        const float vy = (s_PATCH[threadIdx.y + 1][threadIdx.x    ] - s_PATCH[threadIdx.y][threadIdx.x] + s_PATCH[threadIdx.y + 1][threadIdx.x + 1] - s_PATCH[threadIdx.y    ][threadIdx.x + 1]) * dw;
769

770 771 772 773
        s_dx_bin[tid] = vx;
        s_dy_bin[tid] = vy;
    }
}
774

775 776 777 778 779 780 781 782 783 784
__device__ void reduce_sum25(volatile float* sdata1, volatile float* sdata2, volatile float* sdata3, volatile float* sdata4, int tid)
{
    // first step is to reduce from 25 to 16
    if (tid < 9) // use 9 threads
    {
        sdata1[tid] += sdata1[tid + 16];
        sdata2[tid] += sdata2[tid + 16];
        sdata3[tid] += sdata3[tid + 16];
        sdata4[tid] += sdata4[tid + 16];
    }
785

786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809
    // sum (reduce) from 16 to 1 (unrolled - aligned to a half-warp)
    if (tid < 8)
    {
        sdata1[tid] += sdata1[tid + 8];
        sdata1[tid] += sdata1[tid + 4];
        sdata1[tid] += sdata1[tid + 2];
        sdata1[tid] += sdata1[tid + 1];

        sdata2[tid] += sdata2[tid + 8];
        sdata2[tid] += sdata2[tid + 4];
        sdata2[tid] += sdata2[tid + 2];
        sdata2[tid] += sdata2[tid + 1];

        sdata3[tid] += sdata3[tid + 8];
        sdata3[tid] += sdata3[tid + 4];
        sdata3[tid] += sdata3[tid + 2];
        sdata3[tid] += sdata3[tid + 1];

        sdata4[tid] += sdata4[tid + 8];
        sdata4[tid] += sdata4[tid + 4];
        sdata4[tid] += sdata4[tid + 2];
        sdata4[tid] += sdata4[tid + 1];
    }
}
810

811 812 813 814 815 816 817
__global__ void compute_descriptors64(PtrStepf descriptors, const float* featureX, const float* featureY, const float* featureSize, const float* featureDir)
{
    // 2 floats (dx,dy) for each thread (5x5 sample points in each sub-region)
    __shared__ float sdx[25];
    __shared__ float sdy[25];
    __shared__ float sdxabs[25];
    __shared__ float sdyabs[25];
818

819 820
    calc_dx_dy(sdx, sdy, featureX, featureY, featureSize, featureDir);
    __syncthreads();
821

822
    const int tid = threadIdx.y * blockDim.x + threadIdx.x;
823

824
    if (tid < 25)
825
    {
826 827 828
        sdxabs[tid] = ::fabs(sdx[tid]); // |dx| array
        sdyabs[tid] = ::fabs(sdy[tid]); // |dy| array
        __syncthreads();
829

830
        reduce_sum25(sdx, sdy, sdxabs, sdyabs, tid);
831 832
        __syncthreads();

833
        float* descriptors_block = descriptors.ptr(blockIdx.x) + (blockIdx.y << 2);
834

835 836
        // write dx, dy, |dx|, |dy|
        if (tid == 0)
837
        {
838 839 840 841 842 843 844
            descriptors_block[0] = sdx[0];
            descriptors_block[1] = sdy[0];
            descriptors_block[2] = sdxabs[0];
            descriptors_block[3] = sdyabs[0];
        }
    }
}
845

846 847 848 849 850
__global__ void compute_descriptors128(PtrStepf descriptors, const float* featureX, const float* featureY, const float* featureSize, const float* featureDir)
{
    // 2 floats (dx,dy) for each thread (5x5 sample points in each sub-region)
    __shared__ float sdx[25];
    __shared__ float sdy[25];
851

852 853 854 855 856
    // sum (reduce) 5x5 area response
    __shared__ float sd1[25];
    __shared__ float sd2[25];
    __shared__ float sdabs1[25];
    __shared__ float sdabs2[25];
857

858 859
    calc_dx_dy(sdx, sdy, featureX, featureY, featureSize, featureDir);
    __syncthreads();
860

861
    const int tid = threadIdx.y * blockDim.x + threadIdx.x;
862

863
    if (tid < 25)
864
    {
865 866 867 868 869 870 871 872 873 874 875 876 877 878 879
        if (sdy[tid] >= 0)
        {
            sd1[tid] = sdx[tid];
            sdabs1[tid] = ::fabs(sdx[tid]);
            sd2[tid] = 0;
            sdabs2[tid] = 0;
        }
        else
        {
            sd1[tid] = 0;
            sdabs1[tid] = 0;
            sd2[tid] = sdx[tid];
            sdabs2[tid] = ::fabs(sdx[tid]);
        }
        __syncthreads();
880

881
        reduce_sum25(sd1, sd2, sdabs1, sdabs2, tid);
882 883
        __syncthreads();

884 885 886 887
        float* descriptors_block = descriptors.ptr(blockIdx.x) + (blockIdx.y << 3);

        // write dx (dy >= 0), |dx| (dy >= 0), dx (dy < 0), |dx| (dy < 0)
        if (tid == 0)
888
        {
889 890 891 892
            descriptors_block[0] = sd1[0];
            descriptors_block[1] = sdabs1[0];
            descriptors_block[2] = sd2[0];
            descriptors_block[3] = sdabs2[0];
893
        }
894
        __syncthreads();
895

896
        if (sdx[tid] >= 0)
897
        {
898 899 900 901
            sd1[tid] = sdy[tid];
            sdabs1[tid] = ::fabs(sdy[tid]);
            sd2[tid] = 0;
            sdabs2[tid] = 0;
902
        }
903 904 905 906 907 908 909 910
        else
        {
            sd1[tid] = 0;
            sdabs1[tid] = 0;
            sd2[tid] = sdy[tid];
            sdabs2[tid] = ::fabs(sdy[tid]);
        }
        __syncthreads();
911

912 913 914 915 916
        reduce_sum25(sd1, sd2, sdabs1, sdabs2, tid);
        __syncthreads();

        // write dy (dx >= 0), |dy| (dx >= 0), dy (dx < 0), |dy| (dx < 0)
        if (tid == 0)
917
        {
918 919 920 921
            descriptors_block[4] = sd1[0];
            descriptors_block[5] = sdabs1[0];
            descriptors_block[6] = sd2[0];
            descriptors_block[7] = sdabs2[0];
922
        }
923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940
    }
}

template <int BLOCK_DIM_X> __global__ void normalize_descriptors(PtrStepf descriptors)
{
    // no need for thread ID
    float* descriptor_base = descriptors.ptr(blockIdx.x);

    // read in the unnormalized descriptor values (squared)
    __shared__ float sqDesc[BLOCK_DIM_X];
    const float lookup = descriptor_base[threadIdx.x];
    sqDesc[threadIdx.x] = lookup * lookup;
    __syncthreads();

    if (BLOCK_DIM_X >= 128)
    {
        if (threadIdx.x < 64)
            sqDesc[threadIdx.x] += sqDesc[threadIdx.x + 64];
941
        __syncthreads();
942
    }
943

944 945 946 947 948 949 950 951 952 953 954
    // reduction to get total
    if (threadIdx.x < 32)
    {
        volatile float* smem = sqDesc;

        smem[threadIdx.x] += smem[threadIdx.x + 32];
        smem[threadIdx.x] += smem[threadIdx.x + 16];
        smem[threadIdx.x] += smem[threadIdx.x + 8];
        smem[threadIdx.x] += smem[threadIdx.x + 4];
        smem[threadIdx.x] += smem[threadIdx.x + 2];
        smem[threadIdx.x] += smem[threadIdx.x + 1];
955 956
    }

957 958 959
    // compute length (square root)
    __shared__ float len;
    if (threadIdx.x == 0)
960
    {
961 962 963
        len = sqrtf(sqDesc[0]);
    }
    __syncthreads();
964

965 966 967
    // normalize and store in output
    descriptor_base[threadIdx.x] = lookup / len;
}
968

969 970 971 972 973 974 975 976 977
void compute_descriptors_gpu(const DevMem2Df& descriptors, 
    const float* featureX, const float* featureY, const float* featureSize, const float* featureDir, int nFeatures)
{
    // compute unnormalized descriptors, then normalize them - odd indexing since grid must be 2D
    
    if (descriptors.cols == 64)
    {
        compute_descriptors64<<<dim3(nFeatures, 16, 1), dim3(6, 6, 1)>>>(descriptors, featureX, featureY, featureSize, featureDir);
        cudaSafeCall( cudaGetLastError() );
978

979
        cudaSafeCall( cudaDeviceSynchronize() );
980

981 982
        normalize_descriptors<64><<<dim3(nFeatures, 1, 1), dim3(64, 1, 1)>>>(descriptors);
        cudaSafeCall( cudaGetLastError() );
983

984 985 986 987 988 989
        cudaSafeCall( cudaDeviceSynchronize() );
    }
    else
    {
        compute_descriptors128<<<dim3(nFeatures, 16, 1), dim3(6, 6, 1)>>>(descriptors, featureX, featureY, featureSize, featureDir);            
        cudaSafeCall( cudaGetLastError() );
990

991 992 993 994 995 996
        cudaSafeCall( cudaDeviceSynchronize() );

        normalize_descriptors<128><<<dim3(nFeatures, 1, 1), dim3(128, 1, 1)>>>(descriptors);            
        cudaSafeCall( cudaGetLastError() );

        cudaSafeCall( cudaDeviceSynchronize() );
V
Vladislav Vinogradov 已提交
997
    }
998 999 1000 1001 1002
}

} // namespace surf

END_OPENCV_DEVICE_NAMESPACE