added get_first_k_initial_global_init_global_cost in gpu::SCBP

28030952 · Andrey Morozov · 9a669b1c · 28030952 · 28030952 · 28030952
3 changed file
--- a/modules/gpu/include/opencv2/gpu/gpu.hpp
+++ b/modules/gpu/include/opencv2/gpu/gpu.hpp
@@ -486,6 +486,8 @@ namespace cv
            int min_disp_th;

            int msg_type;
+
+            bool use_local_init_data_cost;
        private:
            GpuMat u[2], d[2], l[2], r[2];
            GpuMat disp_selected_pyr[2];

--- a/modules/gpu/src/constantspacebp_gpu.cpp
+++ b/modules/gpu/src/constantspacebp_gpu.cpp
@@ -62,10 +62,10 @@ namespace cv { namespace gpu { namespace csbp
        const DevMem2D& left, const DevMem2D& right, const DevMem2D& temp);

    void init_data_cost(int rows, int cols, short* disp_selected_pyr, short* data_cost_selected,
-        size_t msg_step, int h, int w, int level, int nr_plane, int ndisp, int channels, cudaStream_t stream);
+        size_t msg_step, int h, int w, int level, int nr_plane, int ndisp, int channels, bool use_local_init_data_cost, cudaStream_t stream);

    void init_data_cost(int rows, int cols, float* disp_selected_pyr, float* data_cost_selected,
-        size_t msg_step, int h, int w, int level, int nr_plane, int ndisp, int channels, cudaStream_t stream);
+        size_t msg_step, int h, int w, int level, int nr_plane, int ndisp, int channels, bool use_local_init_data_cost, cudaStream_t stream);

    void compute_data_cost(const short* disp_selected_pyr, short* data_cost, size_t msg_step1, size_t msg_step2,
        int rows, int cols, int h, int w, int h2, int level, int nr_plane, int channels, cudaStream_t stream);
@@ -111,7 +111,7 @@ cv::gpu::StereoConstantSpaceBP::StereoConstantSpaceBP(int ndisp_, int iters_, in
    : ndisp(ndisp_), iters(iters_), levels(levels_), nr_plane(nr_plane_),
      max_data_term(DEFAULT_MAX_DATA_TERM), data_weight(DEFAULT_DATA_WEIGHT),
      max_disc_term(DEFAULT_MAX_DISC_TERM), disc_single_jump(DEFAULT_DISC_SINGLE_JUMP), min_disp_th(0),
-      msg_type(msg_type_)
+      msg_type(msg_type_), use_local_init_data_cost(true)
 {
    CV_Assert(msg_type_ == CV_32F || msg_type_ == CV_16S);
 }
@@ -122,7 +122,7 @@ cv::gpu::StereoConstantSpaceBP::StereoConstantSpaceBP(int ndisp_, int iters_, in
    : ndisp(ndisp_), iters(iters_), levels(levels_), nr_plane(nr_plane_),
      max_data_term(max_data_term_), data_weight(data_weight_),
      max_disc_term(max_disc_term_), disc_single_jump(disc_single_jump_), min_disp_th(min_disp_th_),
-      msg_type(msg_type_)
+      msg_type(msg_type_), use_local_init_data_cost(true)
 {
    CV_Assert(msg_type_ == CV_32F || msg_type_ == CV_16S);
 }
@@ -131,7 +131,7 @@ template<class T>
 static void csbp_operator(StereoConstantSpaceBP& rthis, GpuMat u[2], GpuMat d[2], GpuMat l[2], GpuMat r[2],
                          GpuMat disp_selected_pyr[2], GpuMat& data_cost, GpuMat& data_cost_selected,
                          GpuMat& temp, GpuMat& out, const GpuMat& left, const GpuMat& right, GpuMat& disp,
-                          cudaStream_t stream)
+                          bool use_local_init_data_cost, cudaStream_t stream)
 {
    CV_DbgAssert(0 < rthis.ndisp && 0 < rthis.iters && 0 < rthis.levels && 0 < rthis.nr_plane
        && left.rows == right.rows && left.cols == right.cols && left.type() == right.type());
@@ -225,7 +225,7 @@ static void csbp_operator(StereoConstantSpaceBP& rthis, GpuMat u[2], GpuMat d[2]
        if (i == levels - 1)
        {
            csbp::init_data_cost(left.rows, left.cols, disp_selected_pyr[cur_idx].ptr<T>(), data_cost_selected.ptr<T>(),
-                step_pyr[i], rows_pyr[i], cols_pyr[i], i, nr_plane_pyr[i], rthis.ndisp, left.channels(), stream);
+                step_pyr[i], rows_pyr[i], cols_pyr[i], i, nr_plane_pyr[i], rthis.ndisp, left.channels(), use_local_init_data_cost, stream);
        }
        else
        {
@@ -265,20 +265,20 @@ static void csbp_operator(StereoConstantSpaceBP& rthis, GpuMat u[2], GpuMat d[2]
 typedef void (*csbp_operator_t)(StereoConstantSpaceBP& rthis, GpuMat u[2], GpuMat d[2], GpuMat l[2], GpuMat r[2],
                                     GpuMat disp_selected_pyr[2], GpuMat& data_cost, GpuMat& data_cost_selected,
                                     GpuMat& temp, GpuMat& out, const GpuMat& left, const GpuMat& right, GpuMat& disp,
-                                     cudaStream_t stream);
+                                     bool use_local_init_data_cost, cudaStream_t stream);

 const static csbp_operator_t operators[] = {0, 0, 0, csbp_operator<short>, 0, csbp_operator<float>, 0, 0};

 void cv::gpu::StereoConstantSpaceBP::operator()(const GpuMat& left, const GpuMat& right, GpuMat& disp)
 {
    CV_Assert(msg_type == CV_32F || msg_type == CV_16S);
-    operators[msg_type](*this, u, d, l, r, disp_selected_pyr, data_cost, data_cost_selected, temp, out, left, right, disp, 0);
+    operators[msg_type](*this, u, d, l, r, disp_selected_pyr, data_cost, data_cost_selected, temp, out, left, right, disp, use_local_init_data_cost, 0);
 }

 void cv::gpu::StereoConstantSpaceBP::operator()(const GpuMat& left, const GpuMat& right, GpuMat& disp, Stream& stream)
 {
    CV_Assert(msg_type == CV_32F || msg_type == CV_16S);
-    operators[msg_type](*this, u, d, l, r, disp_selected_pyr, data_cost, data_cost_selected, temp, out, left, right, disp, StreamAccessor::getStream(stream));
+    operators[msg_type](*this, u, d, l, r, disp_selected_pyr, data_cost, data_cost_selected, temp, out, left, right, disp, use_local_init_data_cost,  StreamAccessor::getStream(stream));
 }

 #endif /* !defined (HAVE_CUDA) */
--- a/modules/gpu/src/cuda/constantspacebp.cu
+++ b/modules/gpu/src/cuda/constantspacebp.cu
@@ -146,6 +146,40 @@ namespace csbp_krnls
        }
    };

+    template <typename T>
+    __global__ void get_first_k_initial_global(T* data_cost_selected_, T *selected_disp_pyr, int h, int w, int nr_plane)
+    {
+        int x = blockIdx.x * blockDim.x + threadIdx.x;
+        int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+        if (y < h && x < w)
+        {
+            T* selected_disparity = selected_disp_pyr + y * cmsg_step1 + x;
+            T* data_cost_selected = data_cost_selected_ + y * cmsg_step1 + x;
+            T* data_cost = (T*)ctemp + y * cmsg_step1 + x;
+
+            for(int i = 0; i < nr_plane; i++)
+            {
+                T fmin_ = data_cost[i * cdisp_step1];
+                int id = i;
+                for(int j = 0; j < nr_plane; j++)
+                {
+                    T cur = data_cost[j * cdisp_step1];
+                    if(cur < fmin_)
+                    {
+                        fmin_ = cur;
+                        id = j;
+                    }
+                }
+
+                data_cost_selected[i  * cdisp_step1] = fmin_;
+                selected_disparity[i  * cdisp_step1] = id;
+                data_cost         [id * cdisp_step1] = TypeLimits<T>::max();;
+            }
+        }
+    }
+
+
    template <typename T>
    __global__ void get_first_k_initial_local(T* data_cost_selected_, T* selected_disp_pyr, int h, int w, int nr_plane)
    {
@@ -341,8 +375,8 @@ namespace cv { namespace gpu { namespace csbp
    }

    template<class T>
-    void init_data_cost_tmpl(int rows, int cols, T* disp_selected_pyr, T* data_cost_selected,
-                        size_t msg_step, int h, int w, int level, int nr_plane, int ndisp, int channels, cudaStream_t stream)
+    void init_data_cost_tmpl(int rows, int cols, T* disp_selected_pyr, T* data_cost_selected, size_t msg_step,
+                int h, int w, int level, int nr_plane, int ndisp, int channels, bool use_local_init_data_cost, cudaStream_t stream)
    {

        typedef void (*InitDataCostCaller)(int cols, int rows, int w, int h, int level, int ndisp, int channels, cudaStream_t stream);
@@ -368,21 +402,24 @@ namespace cv { namespace gpu { namespace csbp
        grid.x = divUp(w, threads.x);
        grid.y = divUp(h, threads.y);

-        csbp_krnls::get_first_k_initial_local<<<grid, threads, 0, stream>>>(data_cost_selected, disp_selected_pyr, h, w, nr_plane);        
+        if (use_local_init_data_cost == true)
+            csbp_krnls::get_first_k_initial_local<<<grid, threads, 0, stream>>> (data_cost_selected, disp_selected_pyr, h, w, nr_plane);
+        else
+            csbp_krnls::get_first_k_initial_global<<<grid, threads, 0, stream>>>(data_cost_selected, disp_selected_pyr, h, w, nr_plane);
        if (stream == 0)
            cudaSafeCall( cudaThreadSynchronize() );
    }

      void init_data_cost(int rows, int cols, short* disp_selected_pyr, short* data_cost_selected,
-                        size_t msg_step, int h, int w, int level, int nr_plane, int ndisp, int channels, cudaStream_t stream)
+                        size_t msg_step, int h, int w, int level, int nr_plane, int ndisp, int channels, bool use_local_init_data_cost, cudaStream_t stream)
      {
-          init_data_cost_tmpl(rows, cols, disp_selected_pyr, data_cost_selected, msg_step, h, w, level, nr_plane, ndisp, channels, stream);
+          init_data_cost_tmpl(rows, cols, disp_selected_pyr, data_cost_selected, msg_step, h, w, level, nr_plane, ndisp, channels, use_local_init_data_cost, stream);
      }

      void init_data_cost(int rows, int cols, float* disp_selected_pyr, float* data_cost_selected,
-                        size_t msg_step, int h, int w, int level, int nr_plane, int ndisp, int channels, cudaStream_t stream)
+                        size_t msg_step, int h, int w, int level, int nr_plane, int ndisp, int channels, bool use_local_init_data_cost, cudaStream_t stream)
      {
-          init_data_cost_tmpl(rows, cols, disp_selected_pyr, data_cost_selected, msg_step, h, w, level, nr_plane, ndisp, channels, stream);
+          init_data_cost_tmpl(rows, cols, disp_selected_pyr, data_cost_selected, msg_step, h, w, level, nr_plane, ndisp, channels, use_local_init_data_cost, stream);
      }

 }}}