stereocsbp.cu 35.7 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42
/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

V
minor  
Vladislav Vinogradov 已提交
43
#include "internal_shared.hpp"
44
#include "opencv2/gpu/device/saturate_cast.hpp"
45
#include "opencv2/gpu/device/limits.hpp"
46

47
BEGIN_OPENCV_DEVICE_NAMESPACE
48

49
namespace stereocsbp {
50 51 52 53 54

///////////////////////////////////////////////////////////////
/////////////////////// load constants ////////////////////////
///////////////////////////////////////////////////////////////

55
__constant__ int cndisp;
56

57 58 59 60
__constant__ float cmax_data_term;
__constant__ float cdata_weight;
__constant__ float cmax_disc_term;
__constant__ float cdisc_single_jump;
61

62
__constant__ int cth;
63

64 65 66 67 68
__constant__ size_t cimg_step;
__constant__ size_t cmsg_step1;
__constant__ size_t cmsg_step2;
__constant__ size_t cdisp_step1;
__constant__ size_t cdisp_step2;
69

70 71 72
__constant__ uchar* cleft;
__constant__ uchar* cright;
__constant__ uchar* ctemp;
73

74

75 76 77 78
void load_constants(int ndisp, float max_data_term, float data_weight, float max_disc_term, float disc_single_jump, int min_disp_th,
                    const DevMem2Db& left, const DevMem2Db& right, const DevMem2Db& temp)
{
    cudaSafeCall( cudaMemcpyToSymbol(cndisp, &ndisp, sizeof(int)) );
79

80 81 82 83
    cudaSafeCall( cudaMemcpyToSymbol(cmax_data_term,    &max_data_term,    sizeof(float)) );
    cudaSafeCall( cudaMemcpyToSymbol(cdata_weight,      &data_weight,      sizeof(float)) );
    cudaSafeCall( cudaMemcpyToSymbol(cmax_disc_term,    &max_disc_term,    sizeof(float)) );
    cudaSafeCall( cudaMemcpyToSymbol(cdisc_single_jump, &disc_single_jump, sizeof(float)) );
84

85
    cudaSafeCall( cudaMemcpyToSymbol(cth, &min_disp_th, sizeof(int)) );
86

87
    cudaSafeCall( cudaMemcpyToSymbol(cimg_step, &left.step, sizeof(size_t)) );
88

89 90 91 92
    cudaSafeCall( cudaMemcpyToSymbol(cleft,  &left.data,  sizeof(left.data)) );
    cudaSafeCall( cudaMemcpyToSymbol(cright, &right.data, sizeof(right.data)) );
    cudaSafeCall( cudaMemcpyToSymbol(ctemp, &temp.data, sizeof(temp.data)) );
}
93 94 95 96 97

///////////////////////////////////////////////////////////////
/////////////////////// init data cost ////////////////////////
///////////////////////////////////////////////////////////////

98 99 100 101
template <int channels> struct DataCostPerPixel;
template <> struct DataCostPerPixel<1>
{
    static __device__ __forceinline__ float compute(const uchar* left, const uchar* right)
102
    {
103 104 105 106 107 108
        return fmin(cdata_weight * ::abs((int)*left - *right), cdata_weight * cmax_data_term);
    }
};
template <> struct DataCostPerPixel<3>
{
    static __device__ __forceinline__ float compute(const uchar* left, const uchar* right)
109
    {
110 111 112
        float tb = 0.114f * ::abs((int)left[0] - right[0]);
        float tg = 0.587f * ::abs((int)left[1] - right[1]);
        float tr = 0.299f * ::abs((int)left[2] - right[2]);
113

114 115 116 117 118 119
        return fmin(cdata_weight * (tr + tg + tb), cdata_weight * cmax_data_term);
    }
};
template <> struct DataCostPerPixel<4>
{
    static __device__ __forceinline__ float compute(const uchar* left, const uchar* right)
120
    {
121 122
        uchar4 l = *((const uchar4*)left);
        uchar4 r = *((const uchar4*)right);
123

124 125 126
        float tb = 0.114f * ::abs((int)l.x - r.x);
        float tg = 0.587f * ::abs((int)l.y - r.y);
        float tr = 0.299f * ::abs((int)l.z - r.z);
127

128 129 130
        return fmin(cdata_weight * (tr + tg + tb), cdata_weight * cmax_data_term);
    }
};
131

132 133 134 135 136 137 138
template <typename T>
__global__ void get_first_k_initial_global(T* data_cost_selected_, T *selected_disp_pyr, int h, int w, int nr_plane)
{
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;

    if (y < h && x < w)
139
    {
140 141 142
        T* selected_disparity = selected_disp_pyr + y * cmsg_step1 + x;
        T* data_cost_selected = data_cost_selected_ + y * cmsg_step1 + x;
        T* data_cost = (T*)ctemp + y * cmsg_step1 + x;
143

144
        for(int i = 0; i < nr_plane; i++)
145
        {
146 147 148
            T minimum = device::numeric_limits<T>::max();
            int id = 0;
            for(int d = 0; d < cndisp; d++)
149
            {
150 151
                T cur = data_cost[d * cdisp_step1];
                if(cur < minimum)
152
                {
153 154
                    minimum = cur;
                    id = d;
155 156
                }
            }
157 158 159 160

            data_cost_selected[i  * cdisp_step1] = minimum;
            selected_disparity[i  * cdisp_step1] = id;
            data_cost         [id * cdisp_step1] = numeric_limits<T>::max();
161 162
        }
    }
163
}
164 165


166 167 168 169 170
template <typename T>
__global__ void get_first_k_initial_local(T* data_cost_selected_, T* selected_disp_pyr, int h, int w, int nr_plane)
{
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
171

172 173 174 175 176
    if (y < h && x < w)
    {
        T* selected_disparity = selected_disp_pyr + y * cmsg_step1 + x;
        T* data_cost_selected = data_cost_selected_ + y * cmsg_step1 + x;
        T* data_cost = (T*)ctemp + y * cmsg_step1 + x;
177

178
        int nr_local_minimum = 0;
179

180 181 182
        T prev = data_cost[0 * cdisp_step1];
        T cur  = data_cost[1 * cdisp_step1];
        T next = data_cost[2 * cdisp_step1];
183

184 185 186
        for (int d = 1; d < cndisp - 1 && nr_local_minimum < nr_plane; d++)
        {
            if (cur < prev && cur < next)
187
            {
188 189
                data_cost_selected[nr_local_minimum * cdisp_step1] = cur;
                selected_disparity[nr_local_minimum * cdisp_step1] = d;
190

191
                data_cost[d * cdisp_step1] = numeric_limits<T>::max();
192

193
                nr_local_minimum++;
194
            }
195 196 197 198
            prev = cur;
            cur = next;
            next = data_cost[(d + 1) * cdisp_step1];
        }
199

200 201 202 203
        for (int i = nr_local_minimum; i < nr_plane; i++)
        {
            T minimum = numeric_limits<T>::max();
            int id = 0;
204

205 206 207 208
            for (int d = 0; d < cndisp; d++)
            {
                cur = data_cost[d * cdisp_step1];
                if (cur < minimum)
209
                {
210 211
                    minimum = cur;
                    id = d;
212 213
                }
            }
214 215 216 217
            data_cost_selected[i * cdisp_step1] = minimum;
            selected_disparity[i * cdisp_step1] = id;

            data_cost[id * cdisp_step1] = numeric_limits<T>::max();
218 219
        }
    }
220
}
221

222 223 224 225 226
template <typename T, int channels>
__global__ void init_data_cost(int h, int w, int level)
{
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
227

228 229 230 231
    if (y < h && x < w)
    {
        int y0 = y << level;
        int yt = (y + 1) << level;
232

233 234
        int x0 = x << level;
        int xt = (x + 1) << level;
235

236
        T* data_cost = (T*)ctemp + y * cmsg_step1 + x;
237

238 239 240 241
        for(int d = 0; d < cndisp; ++d)
        {
            float val = 0.0f;
            for(int yi = y0; yi < yt; yi++)
242
            {
243
                for(int xi = x0; xi < xt; xi++)
244
                {
245 246 247 248
                    int xr = xi - d;
                    if(d < cth || xr < 0)
                        val += cdata_weight * cmax_data_term;
                    else
249
                    {
250 251 252 253
                        const uchar* lle = cleft + yi * cimg_step + xi * channels;
                        const uchar* lri = cright + yi * cimg_step + xr * channels;

                        val += DataCostPerPixel<channels>::compute(lle, lri);
254
                    }
255 256
                }
            }
257
            data_cost[cdisp_step1 * d] = saturate_cast<T>(val);
258 259
        }
    }
260
}
261

262 263 264 265 266 267
template <typename T, int winsz, int channels>
__global__ void init_data_cost_reduce(int level, int rows, int cols, int h)
{
    int x_out = blockIdx.x;
    int y_out = blockIdx.y % h;
    int d = (blockIdx.y / h) * blockDim.z + threadIdx.z;
268

269
    int tid = threadIdx.x;
270

271 272 273 274
    if (d < cndisp)
    {
        int x0 = x_out << level;
        int y0 = y_out << level;
275

276
        int len = ::min(y0 + winsz, rows) - y0;
277

278 279 280 281 282 283
        float val = 0.0f;
        if (x0 + tid < cols)
        {
            if (x0 + tid - d < 0 || d < cth)
                val = cdata_weight * cmax_data_term * len;
            else
284
            {
285 286
                const uchar* lle =  cleft + y0 * cimg_step + channels * (x0 + tid    );
                const uchar* lri = cright + y0 * cimg_step + channels * (x0 + tid - d);
287

288 289 290
                for(int y = 0; y < len; ++y)
                {
                    val += DataCostPerPixel<channels>::compute(lle, lri);
291

292 293
                    lle += cimg_step;
                    lri += cimg_step;
294 295
                }
            }
296
        }
297

298 299
        extern __shared__ float smem[];
        float* dline = smem + winsz * threadIdx.z;
300

301
        dline[tid] = val;
302

303
        __syncthreads();
304

305 306
        if (winsz >= 256) { if (tid < 128) { dline[tid] += dline[tid + 128]; } __syncthreads(); }
        if (winsz >= 128) { if (tid <  64) { dline[tid] += dline[tid + 64]; } __syncthreads(); }
307

308
		volatile float* vdline = smem + winsz * threadIdx.z;
309

310 311 312 313 314 315
        if (winsz >= 64) if (tid < 32) vdline[tid] += vdline[tid + 32];
        if (winsz >= 32) if (tid < 16) vdline[tid] += vdline[tid + 16];
        if (winsz >= 16) if (tid <  8) vdline[tid] += vdline[tid + 8];
        if (winsz >=  8) if (tid <  4) vdline[tid] += vdline[tid + 4];
        if (winsz >=  4) if (tid <  2) vdline[tid] += vdline[tid + 2];
        if (winsz >=  2) if (tid <  1) vdline[tid] += vdline[tid + 1];
316

317
        T* data_cost = (T*)ctemp + y_out * cmsg_step1 + x_out;
318

319 320
        if (tid == 0)
            data_cost[cdisp_step1 * d] = saturate_cast<T>(dline[0]);
321
    }
322
}
323

324

325 326 327 328 329
template <typename T>
void init_data_cost_caller_(int /*rows*/, int /*cols*/, int h, int w, int level, int /*ndisp*/, int channels, cudaStream_t stream)
{
    dim3 threads(32, 8, 1);
    dim3 grid(1, 1, 1);
330

331 332
    grid.x = divUp(w, threads.x);
    grid.y = divUp(h, threads.y);
333

334 335 336 337 338 339
    switch (channels)
    {
    case 1: init_data_cost<T, 1><<<grid, threads, 0, stream>>>(h, w, level); break;
    case 3: init_data_cost<T, 3><<<grid, threads, 0, stream>>>(h, w, level); break;
    case 4: init_data_cost<T, 4><<<grid, threads, 0, stream>>>(h, w, level); break;
    default: cv::gpu::error("Unsupported channels count", __FILE__, __LINE__);
340
    }
341
}
342

343 344 345 346 347
template <typename T, int winsz>
void init_data_cost_reduce_caller_(int rows, int cols, int h, int w, int level, int ndisp, int channels, cudaStream_t stream)
{
    const int threadsNum = 256;
    const size_t smem_size = threadsNum * sizeof(float);
348

349 350 351
    dim3 threads(winsz, 1, threadsNum / winsz);
    dim3 grid(w, h, 1);
    grid.y *= divUp(ndisp, threads.z);
352

353 354 355 356 357 358
    switch (channels)
    {
    case 1: init_data_cost_reduce<T, winsz, 1><<<grid, threads, smem_size, stream>>>(level, rows, cols, h); break;
    case 3: init_data_cost_reduce<T, winsz, 3><<<grid, threads, smem_size, stream>>>(level, rows, cols, h); break;
    case 4: init_data_cost_reduce<T, winsz, 4><<<grid, threads, smem_size, stream>>>(level, rows, cols, h); break;
    default: cv::gpu::error("Unsupported channels count", __FILE__, __LINE__);
359
    }
360
}
361

362 363 364 365
template<class T>
void init_data_cost(int rows, int cols, T* disp_selected_pyr, T* data_cost_selected, size_t msg_step,
            int h, int w, int level, int nr_plane, int ndisp, int channels, bool use_local_init_data_cost, cudaStream_t stream)
{
366

367
    typedef void (*InitDataCostCaller)(int cols, int rows, int w, int h, int level, int ndisp, int channels, cudaStream_t stream);
368

369 370 371 372 373 374
    static const InitDataCostCaller init_data_cost_callers[] =
    {
        init_data_cost_caller_<T>, init_data_cost_caller_<T>, init_data_cost_reduce_caller_<T, 4>,
        init_data_cost_reduce_caller_<T, 8>, init_data_cost_reduce_caller_<T, 16>, init_data_cost_reduce_caller_<T, 32>,
        init_data_cost_reduce_caller_<T, 64>, init_data_cost_reduce_caller_<T, 128>, init_data_cost_reduce_caller_<T, 256>
    };
375

376 377 378
    size_t disp_step = msg_step * h;
    cudaSafeCall( cudaMemcpyToSymbol(cdisp_step1, &disp_step, sizeof(size_t)) );
    cudaSafeCall( cudaMemcpyToSymbol(cmsg_step1,  &msg_step,  sizeof(size_t)) );
379

380 381
    init_data_cost_callers[level](rows, cols, h, w, level, ndisp, channels, stream);
    cudaSafeCall( cudaGetLastError() );
382

383 384
    if (stream == 0)
        cudaSafeCall( cudaDeviceSynchronize() );
385

386 387
    dim3 threads(32, 8, 1);
    dim3 grid(1, 1, 1);
388

389 390
    grid.x = divUp(w, threads.x);
    grid.y = divUp(h, threads.y);
A
Anatoly Baksheev 已提交
391

392 393 394 395 396 397
    if (use_local_init_data_cost == true)
        get_first_k_initial_local<<<grid, threads, 0, stream>>> (data_cost_selected, disp_selected_pyr, h, w, nr_plane);
    else
        get_first_k_initial_global<<<grid, threads, 0, stream>>>(data_cost_selected, disp_selected_pyr, h, w, nr_plane);
    
    cudaSafeCall( cudaGetLastError() );
398

399 400 401
    if (stream == 0)
        cudaSafeCall( cudaDeviceSynchronize() );
}
A
Anatoly Baksheev 已提交
402

403 404
template void init_data_cost(int rows, int cols, short* disp_selected_pyr, short* data_cost_selected, size_t msg_step,
            int h, int w, int level, int nr_plane, int ndisp, int channels, bool use_local_init_data_cost, cudaStream_t stream);
A
Anatoly Baksheev 已提交
405

406 407
template void init_data_cost(int rows, int cols, float* disp_selected_pyr, float* data_cost_selected, size_t msg_step,
            int h, int w, int level, int nr_plane, int ndisp, int channels, bool use_local_init_data_cost, cudaStream_t stream);
A
Anatoly Baksheev 已提交
408

409 410 411 412
///////////////////////////////////////////////////////////////
////////////////////// compute data cost //////////////////////
///////////////////////////////////////////////////////////////

413 414 415 416 417
template <typename T, int channels>
__global__ void compute_data_cost(const T* selected_disp_pyr, T* data_cost_, int h, int w, int level, int nr_plane)
{
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
418

419 420 421 422
    if (y < h && x < w)
    {
        int y0 = y << level;
        int yt = (y + 1) << level;
423

424 425
        int x0 = x << level;
        int xt = (x + 1) << level;
426

427 428
        const T* selected_disparity = selected_disp_pyr + y/2 * cmsg_step2 + x/2;
        T* data_cost = data_cost_ + y * cmsg_step1 + x;
429

430 431 432 433
        for(int d = 0; d < nr_plane; d++)
        {
            float val = 0.0f;
            for(int yi = y0; yi < yt; yi++)
434
            {
435
                for(int xi = x0; xi < xt; xi++)
436
                {
437 438 439 440 441 442
                    int sel_disp = selected_disparity[d * cdisp_step2];
                    int xr = xi - sel_disp;

                    if (xr < 0 || sel_disp < cth)
                        val += cdata_weight * cmax_data_term;
                    else
443
                    {
444 445 446 447
                        const uchar* left_x = cleft + yi * cimg_step + xi * channels;
                        const uchar* right_x = cright + yi * cimg_step + xr * channels;

                        val += DataCostPerPixel<channels>::compute(left_x, right_x);
448 449 450
                    }
                }
            }
451
            data_cost[cdisp_step1 * d] = saturate_cast<T>(val);
452 453
        }
    }
454
}
455

456 457 458 459 460 461
template <typename T, int winsz, int channels>
__global__ void compute_data_cost_reduce(const T* selected_disp_pyr, T* data_cost_, int level, int rows, int cols, int h, int nr_plane)
{
    int x_out = blockIdx.x;
    int y_out = blockIdx.y % h;
    int d = (blockIdx.y / h) * blockDim.z + threadIdx.z;
462

463
    int tid = threadIdx.x;
464

465 466
    const T* selected_disparity = selected_disp_pyr + y_out/2 * cmsg_step2 + x_out/2;
    T* data_cost = data_cost_ + y_out * cmsg_step1 + x_out;
467

468 469 470
    if (d < nr_plane)
    {
        int sel_disp = selected_disparity[d * cdisp_step2];
471

472 473
        int x0 = x_out << level;
        int y0 = y_out << level;
474

475
        int len = ::min(y0 + winsz, rows) - y0;
476

477 478 479 480 481 482
        float val = 0.0f;
        if (x0 + tid < cols)
        {
            if (x0 + tid - sel_disp < 0 || sel_disp < cth)
                val = cdata_weight * cmax_data_term * len;
            else
483
            {
484 485
                const uchar* lle =  cleft + y0 * cimg_step + channels * (x0 + tid    );
                const uchar* lri = cright + y0 * cimg_step + channels * (x0 + tid - sel_disp);
486

487 488 489
                for(int y = 0; y < len; ++y)
                {
                    val += DataCostPerPixel<channels>::compute(lle, lri);
490

491 492
                    lle += cimg_step;
                    lri += cimg_step;
493 494
                }
            }
495
        }
496

497 498
        extern __shared__ float smem[];
        float* dline = smem + winsz * threadIdx.z;
499

500
        dline[tid] = val;
501

502
        __syncthreads();
503

504 505
        if (winsz >= 256) { if (tid < 128) { dline[tid] += dline[tid + 128]; } __syncthreads(); }
        if (winsz >= 128) { if (tid <  64) { dline[tid] += dline[tid +  64]; } __syncthreads(); }
506

507
		volatile float* vdline = smem + winsz * threadIdx.z;
508

509 510 511 512 513 514
        if (winsz >= 64) if (tid < 32) vdline[tid] += vdline[tid + 32];
        if (winsz >= 32) if (tid < 16) vdline[tid] += vdline[tid + 16];
        if (winsz >= 16) if (tid <  8) vdline[tid] += vdline[tid + 8];
        if (winsz >=  8) if (tid <  4) vdline[tid] += vdline[tid + 4];
        if (winsz >=  4) if (tid <  2) vdline[tid] += vdline[tid + 2];
        if (winsz >=  2) if (tid <  1) vdline[tid] += vdline[tid + 1];
515

516 517
        if (tid == 0)
            data_cost[cdisp_step1 * d] = saturate_cast<T>(dline[0]);
518
    }
519
}
520

521 522 523 524 525 526
template <typename T>
void compute_data_cost_caller_(const T* disp_selected_pyr, T* data_cost, int /*rows*/, int /*cols*/,
                              int h, int w, int level, int nr_plane, int channels, cudaStream_t stream)
{
    dim3 threads(32, 8, 1);
    dim3 grid(1, 1, 1);
527

528 529
    grid.x = divUp(w, threads.x);
    grid.y = divUp(h, threads.y);
530

531 532 533 534 535 536
    switch(channels)
    {
    case 1: compute_data_cost<T, 1><<<grid, threads, 0, stream>>>(disp_selected_pyr, data_cost, h, w, level, nr_plane); break;
    case 3: compute_data_cost<T, 3><<<grid, threads, 0, stream>>>(disp_selected_pyr, data_cost, h, w, level, nr_plane); break;
    case 4: compute_data_cost<T, 4><<<grid, threads, 0, stream>>>(disp_selected_pyr, data_cost, h, w, level, nr_plane); break;
    default: cv::gpu::error("Unsupported channels count", __FILE__, __LINE__);
537
    }
538
}
539

540 541 542 543 544 545
template <typename T, int winsz>
void compute_data_cost_reduce_caller_(const T* disp_selected_pyr, T* data_cost, int rows, int cols,
                              int h, int w, int level, int nr_plane, int channels, cudaStream_t stream)
{
    const int threadsNum = 256;
    const size_t smem_size = threadsNum * sizeof(float);
546

547 548 549
    dim3 threads(winsz, 1, threadsNum / winsz);
    dim3 grid(w, h, 1);
    grid.y *= divUp(nr_plane, threads.z);
550

551 552 553 554 555 556
    switch (channels)
    {
    case 1: compute_data_cost_reduce<T, winsz, 1><<<grid, threads, smem_size, stream>>>(disp_selected_pyr, data_cost, level, rows, cols, h, nr_plane); break;
    case 3: compute_data_cost_reduce<T, winsz, 3><<<grid, threads, smem_size, stream>>>(disp_selected_pyr, data_cost, level, rows, cols, h, nr_plane); break;
    case 4: compute_data_cost_reduce<T, winsz, 4><<<grid, threads, smem_size, stream>>>(disp_selected_pyr, data_cost, level, rows, cols, h, nr_plane); break;
    default: cv::gpu::error("Unsupported channels count", __FILE__, __LINE__);
557
    }
558
}
559

560 561 562 563 564 565 566 567
template<class T>
void compute_data_cost(const T* disp_selected_pyr, T* data_cost, size_t msg_step1, size_t msg_step2,
                       int rows, int cols, int h, int w, int h2, int level, int nr_plane, int channels, cudaStream_t stream)
{
    typedef void (*ComputeDataCostCaller)(const T* disp_selected_pyr, T* data_cost, int rows, int cols,
        int h, int w, int level, int nr_plane, int channels, cudaStream_t stream);

    static const ComputeDataCostCaller callers[] =
568
    {
569 570 571 572
        compute_data_cost_caller_<T>, compute_data_cost_caller_<T>, compute_data_cost_reduce_caller_<T, 4>,
        compute_data_cost_reduce_caller_<T, 8>, compute_data_cost_reduce_caller_<T, 16>, compute_data_cost_reduce_caller_<T, 32>,
        compute_data_cost_reduce_caller_<T, 64>, compute_data_cost_reduce_caller_<T, 128>, compute_data_cost_reduce_caller_<T, 256>
    };
A
Anatoly Baksheev 已提交
573

574 575 576 577 578 579
    size_t disp_step1 = msg_step1 * h;
    size_t disp_step2 = msg_step2 * h2;
    cudaSafeCall( cudaMemcpyToSymbol(cdisp_step1, &disp_step1, sizeof(size_t)) );
    cudaSafeCall( cudaMemcpyToSymbol(cdisp_step2, &disp_step2, sizeof(size_t)) );
    cudaSafeCall( cudaMemcpyToSymbol(cmsg_step1,  &msg_step1,  sizeof(size_t)) );
    cudaSafeCall( cudaMemcpyToSymbol(cmsg_step2,  &msg_step2,  sizeof(size_t)) );
580

581 582 583 584 585 586
    callers[level](disp_selected_pyr, data_cost, rows, cols, h, w, level, nr_plane, channels, stream);
    cudaSafeCall( cudaGetLastError() );

    if (stream == 0)
        cudaSafeCall( cudaDeviceSynchronize() );
}
A
Anatoly Baksheev 已提交
587

588 589
template void compute_data_cost(const short* disp_selected_pyr, short* data_cost, size_t msg_step1, size_t msg_step2,
                       int rows, int cols, int h, int w, int h2, int level, int nr_plane, int channels, cudaStream_t stream);
590

591 592
template void compute_data_cost(const float* disp_selected_pyr, float* data_cost, size_t msg_step1, size_t msg_step2,
                       int rows, int cols, int h, int w, int h2, int level, int nr_plane, int channels, cudaStream_t stream);
593
     
A
Anatoly Baksheev 已提交
594

595 596 597 598
///////////////////////////////////////////////////////////////
//////////////////////// init message /////////////////////////
///////////////////////////////////////////////////////////////

599
 
600 601 602 603 604 605 606 607
 template <typename T>
__device__ void get_first_k_element_increase(T* u_new, T* d_new, T* l_new, T* r_new,
                                             const T* u_cur, const T* d_cur, const T* l_cur, const T* r_cur,
                                             T* data_cost_selected, T* disparity_selected_new, T* data_cost_new,
                                             const T* data_cost_cur, const T* disparity_selected_cur,
                                             int nr_plane, int nr_plane2)
{
    for(int i = 0; i < nr_plane; i++)
608
    {
609 610 611
        T minimum = numeric_limits<T>::max();
        int id = 0;
        for(int j = 0; j < nr_plane2; j++)
612
        {
613 614
            T cur = data_cost_new[j * cdisp_step1];
            if(cur < minimum)
615
            {
616 617
                minimum = cur;
                id = j;
618
            }
619
        }
620

621 622
        data_cost_selected[i * cdisp_step1] = data_cost_cur[id * cdisp_step1];
        disparity_selected_new[i * cdisp_step1] = disparity_selected_cur[id * cdisp_step2];
623

624 625 626 627
        u_new[i * cdisp_step1] = u_cur[id * cdisp_step2];
        d_new[i * cdisp_step1] = d_cur[id * cdisp_step2];
        l_new[i * cdisp_step1] = l_cur[id * cdisp_step2];
        r_new[i * cdisp_step1] = r_cur[id * cdisp_step2];
628

629
        data_cost_new[id * cdisp_step1] = numeric_limits<T>::max();
630
    }
631 632 633 634 635 636 637 638 639 640 641 642 643
}

template <typename T>
__global__ void init_message(T* u_new_, T* d_new_, T* l_new_, T* r_new_,
                             const T* u_cur_, const T* d_cur_, const T* l_cur_, const T* r_cur_,
                             T* selected_disp_pyr_new, const T* selected_disp_pyr_cur,
                             T* data_cost_selected_, const T* data_cost_,
                             int h, int w, int nr_plane, int h2, int w2, int nr_plane2)
{
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;

    if (y < h && x < w)
644
    {
645 646 647 648
        const T* u_cur = u_cur_ + ::min(h2-1, y/2 + 1) * cmsg_step2 + x/2;
        const T* d_cur = d_cur_ + ::max(0, y/2 - 1)    * cmsg_step2 + x/2;
        const T* l_cur = l_cur_ + y/2                  * cmsg_step2 + ::min(w2-1, x/2 + 1);
        const T* r_cur = r_cur_ + y/2                  * cmsg_step2 + ::max(0, x/2 - 1);
649

650
        T* data_cost_new = (T*)ctemp + y * cmsg_step1 + x;
651

652 653
        const T* disparity_selected_cur = selected_disp_pyr_cur + y/2 * cmsg_step2 + x/2;
        const T* data_cost = data_cost_ + y * cmsg_step1 + x;
654

655 656 657
        for(int d = 0; d < nr_plane2; d++)
        {
            int idx2 = d * cdisp_step2;
658

659 660
            T val  = data_cost[d * cdisp_step1] + u_cur[idx2] + d_cur[idx2] + l_cur[idx2] + r_cur[idx2];
            data_cost_new[d * cdisp_step1] = val;
661
        }
662

663 664
        T* data_cost_selected = data_cost_selected_ + y * cmsg_step1 + x;
        T* disparity_selected_new = selected_disp_pyr_new + y * cmsg_step1 + x;
665

666 667 668 669
        T* u_new = u_new_ + y * cmsg_step1 + x;
        T* d_new = d_new_ + y * cmsg_step1 + x;
        T* l_new = l_new_ + y * cmsg_step1 + x;
        T* r_new = r_new_ + y * cmsg_step1 + x;
670

671 672 673 674
        u_cur = u_cur_ + y/2 * cmsg_step2 + x/2;
        d_cur = d_cur_ + y/2 * cmsg_step2 + x/2;
        l_cur = l_cur_ + y/2 * cmsg_step2 + x/2;
        r_cur = r_cur_ + y/2 * cmsg_step2 + x/2;
675

676 677 678
        get_first_k_element_increase(u_new, d_new, l_new, r_new, u_cur, d_cur, l_cur, r_cur,
                                     data_cost_selected, disparity_selected_new, data_cost_new,
                                     data_cost, disparity_selected_cur, nr_plane, nr_plane2);
679
    }
680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726
}


template<class T>
void init_message(T* u_new, T* d_new, T* l_new, T* r_new,
                  const T* u_cur, const T* d_cur, const T* l_cur, const T* r_cur,
                  T* selected_disp_pyr_new, const T* selected_disp_pyr_cur,
                  T* data_cost_selected, const T* data_cost, size_t msg_step1, size_t msg_step2,
                  int h, int w, int nr_plane, int h2, int w2, int nr_plane2, cudaStream_t stream)
{

    size_t disp_step1 = msg_step1 * h;
    size_t disp_step2 = msg_step2 * h2;
    cudaSafeCall( cudaMemcpyToSymbol(cdisp_step1, &disp_step1, sizeof(size_t)) );
    cudaSafeCall( cudaMemcpyToSymbol(cdisp_step2, &disp_step2, sizeof(size_t)) );
    cudaSafeCall( cudaMemcpyToSymbol(cmsg_step1,   &msg_step1, sizeof(size_t)) );
    cudaSafeCall( cudaMemcpyToSymbol(cmsg_step2,   &msg_step2, sizeof(size_t)) );

    dim3 threads(32, 8, 1);
    dim3 grid(1, 1, 1);

    grid.x = divUp(w, threads.x);
    grid.y = divUp(h, threads.y);

    init_message<<<grid, threads, 0, stream>>>(u_new, d_new, l_new, r_new,
                                               u_cur, d_cur, l_cur, r_cur,
                                               selected_disp_pyr_new, selected_disp_pyr_cur,
                                               data_cost_selected, data_cost,
                                               h, w, nr_plane, h2, w2, nr_plane2);
    cudaSafeCall( cudaGetLastError() );

    if (stream == 0)
        cudaSafeCall( cudaDeviceSynchronize() );
}


template void init_message(short* u_new, short* d_new, short* l_new, short* r_new,
                  const short* u_cur, const short* d_cur, const short* l_cur, const short* r_cur,
                  short* selected_disp_pyr_new, const short* selected_disp_pyr_cur,
                  short* data_cost_selected, const short* data_cost, size_t msg_step1, size_t msg_step2,
                  int h, int w, int nr_plane, int h2, int w2, int nr_plane2, cudaStream_t stream);

template void init_message(float* u_new, float* d_new, float* l_new, float* r_new,
                  const float* u_cur, const float* d_cur, const float* l_cur, const float* r_cur,
                  float* selected_disp_pyr_new, const float* selected_disp_pyr_cur,
                  float* data_cost_selected, const float* data_cost, size_t msg_step1, size_t msg_step2,
                  int h, int w, int nr_plane, int h2, int w2, int nr_plane2, cudaStream_t stream);        
727 728 729 730 731

///////////////////////////////////////////////////////////////
////////////////////  calc all iterations /////////////////////
///////////////////////////////////////////////////////////////

732 733 734 735 736 737 738
template <typename T>
__device__ void message_per_pixel(const T* data, T* msg_dst, const T* msg1, const T* msg2, const T* msg3,
                                  const T* dst_disp, const T* src_disp, int nr_plane, T* temp)
{
    T minimum = numeric_limits<T>::max();

    for(int d = 0; d < nr_plane; d++)
739
    {
740 741
        int idx = d * cdisp_step1;
        T val  = data[idx] + msg1[idx] + msg2[idx] + msg3[idx];
742

743 744
        if(val < minimum)
            minimum = val;
745

746 747
        msg_dst[idx] = val;
    }
748

749 750 751 752 753
    float sum = 0;
    for(int d = 0; d < nr_plane; d++)
    {
        float cost_min = minimum + cmax_disc_term;
        T src_disp_reg = src_disp[d * cdisp_step1];
754

755 756
        for(int d2 = 0; d2 < nr_plane; d2++)
            cost_min = fmin(cost_min, msg_dst[d2 * cdisp_step1] + cdisc_single_jump * ::abs(dst_disp[d2 * cdisp_step1] - src_disp_reg));
757

758 759 760 761
        temp[d * cdisp_step1] = saturate_cast<T>(cost_min);
        sum += cost_min;
    }
    sum /= nr_plane;
762

763 764 765
    for(int d = 0; d < nr_plane; d++)
        msg_dst[d * cdisp_step1] = saturate_cast<T>(temp[d * cdisp_step1] - sum);
}
766

767 768 769 770 771
template <typename T>
__global__ void compute_message(T* u_, T* d_, T* l_, T* r_, const T* data_cost_selected, const T* selected_disp_pyr_cur, int h, int w, int nr_plane, int i)
{
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    int x = ((blockIdx.x * blockDim.x + threadIdx.x) << 1) + ((y + i) & 1);
772

773
    if (y > 0 && y < h - 1 && x > 0 && x < w - 1)
774
    {
775
        const T* data = data_cost_selected + y * cmsg_step1 + x;
776

777 778 779 780
        T* u = u_ + y * cmsg_step1 + x;
        T* d = d_ + y * cmsg_step1 + x;
        T* l = l_ + y * cmsg_step1 + x;
        T* r = r_ + y * cmsg_step1 + x;
781

782
        const T* disp = selected_disp_pyr_cur + y * cmsg_step1 + x;
783

784
        T* temp = (T*)ctemp + y * cmsg_step1 + x;
785

786 787 788 789
        message_per_pixel(data, u, r - 1, u + cmsg_step1, l + 1, disp, disp - cmsg_step1, nr_plane, temp);
        message_per_pixel(data, d, d - cmsg_step1, r - 1, l + 1, disp, disp + cmsg_step1, nr_plane, temp);
        message_per_pixel(data, l, u + cmsg_step1, d - cmsg_step1, l + 1, disp, disp - 1, nr_plane, temp);
        message_per_pixel(data, r, u + cmsg_step1, d - cmsg_step1, r - 1, disp, disp + 1, nr_plane, temp);
790
    }
791
}
792

793

794 795 796 797 798 799 800
template<class T>
void calc_all_iterations(T* u, T* d, T* l, T* r, const T* data_cost_selected,
    const T* selected_disp_pyr_cur, size_t msg_step, int h, int w, int nr_plane, int iters, cudaStream_t stream)
{
    size_t disp_step = msg_step * h;
    cudaSafeCall( cudaMemcpyToSymbol(cdisp_step1, &disp_step, sizeof(size_t)) );
    cudaSafeCall( cudaMemcpyToSymbol(cmsg_step1,  &msg_step,  sizeof(size_t)) );
A
Anatoly Baksheev 已提交
801

802 803
    dim3 threads(32, 8, 1);
    dim3 grid(1, 1, 1);
804

805 806
    grid.x = divUp(w, threads.x << 1);
    grid.y = divUp(h, threads.y);
807

808 809 810 811
    for(int t = 0; t < iters; ++t)
    {
        compute_message<<<grid, threads, 0, stream>>>(u, d, l, r, data_cost_selected, selected_disp_pyr_cur, h, w, nr_plane, t & 1);
        cudaSafeCall( cudaGetLastError() );
812

813 814 815 816
        if (stream == 0)
            cudaSafeCall( cudaDeviceSynchronize() );
    }
};
A
Anatoly Baksheev 已提交
817

818 819 820 821 822
template void calc_all_iterations(short* u, short* d, short* l, short* r, const short* data_cost_selected, const short* selected_disp_pyr_cur, size_t msg_step,
    int h, int w, int nr_plane, int iters, cudaStream_t stream);

template void calc_all_iterations(float* u, float* d, float* l, float* r, const float* data_cost_selected, const float* selected_disp_pyr_cur, size_t msg_step, 
    int h, int w, int nr_plane, int iters, cudaStream_t stream);
A
Anatoly Baksheev 已提交
823

824 825 826 827 828

///////////////////////////////////////////////////////////////
/////////////////////////// output ////////////////////////////
///////////////////////////////////////////////////////////////

829

830 831 832 833 834 835 836 837 838
template <typename T>
__global__ void compute_disp(const T* u_, const T* d_, const T* l_, const T* r_,
                             const T* data_cost_selected, const T* disp_selected_pyr,
                             short* disp, size_t res_step, int cols, int rows, int nr_plane)
{
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;

    if (y > 0 && y < rows - 1 && x > 0 && x < cols - 1)
839
    {
840 841
        const T* data = data_cost_selected + y * cmsg_step1 + x;
        const T* disp_selected = disp_selected_pyr + y * cmsg_step1 + x;
842

843 844 845 846
        const T* u = u_ + (y+1) * cmsg_step1 + (x+0);
        const T* d = d_ + (y-1) * cmsg_step1 + (x+0);
        const T* l = l_ + (y+0) * cmsg_step1 + (x+1);
        const T* r = r_ + (y+0) * cmsg_step1 + (x-1);
847

848 849 850 851 852 853
        int best = 0;
        T best_val = numeric_limits<T>::max();
        for (int i = 0; i < nr_plane; ++i)
        {
            int idx = i * cdisp_step1;
            T val = data[idx]+ u[idx] + d[idx] + l[idx] + r[idx];
854

855
            if (val < best_val)
856
            {
857 858
                best_val = val;
                best = saturate_cast<short>(disp_selected[idx]);
859 860
            }
        }
861
        disp[res_step * y + x] = best;
862
    }
863
}
864

865 866 867 868 869 870 871
template<class T>
void compute_disp(const T* u, const T* d, const T* l, const T* r, const T* data_cost_selected, const T* disp_selected, size_t msg_step,
    const DevMem2D_<short>& disp, int nr_plane, cudaStream_t stream)
{
    size_t disp_step = disp.rows * msg_step;
    cudaSafeCall( cudaMemcpyToSymbol(cdisp_step1, &disp_step, sizeof(size_t)) );
    cudaSafeCall( cudaMemcpyToSymbol(cmsg_step1,  &msg_step,  sizeof(size_t)) );
A
Anatoly Baksheev 已提交
872

873 874
    dim3 threads(32, 8, 1);
    dim3 grid(1, 1, 1);
875

876 877
    grid.x = divUp(disp.cols, threads.x);
    grid.y = divUp(disp.rows, threads.y);
878

879 880 881
    compute_disp<<<grid, threads, 0, stream>>>(u, d, l, r, data_cost_selected, disp_selected,
                                               disp.data, disp.step / disp.elemSize(), disp.cols, disp.rows, nr_plane);
    cudaSafeCall( cudaGetLastError() );
882

883 884 885 886 887 888 889 890 891
    if (stream == 0)
        cudaSafeCall( cudaDeviceSynchronize() );
}

template void compute_disp(const short* u, const short* d, const short* l, const short* r, const short* data_cost_selected, const short* disp_selected, size_t msg_step, 
    const DevMem2D_<short>& disp, int nr_plane, cudaStream_t stream);

template void compute_disp(const float* u, const float* d, const float* l, const float* r, const float* data_cost_selected, const float* disp_selected, size_t msg_step,
    const DevMem2D_<short>& disp, int nr_plane, cudaStream_t stream);
892

893
} // namespace stereocsbp
A
Anatoly Baksheev 已提交
894

895
END_OPENCV_DEVICE_NAMESPACE