Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
Greenplum
Opencv
提交
12dc52c2
O
Opencv
项目概览
Greenplum
/
Opencv
11 个月 前同步成功
通知
7
Star
0
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
O
Opencv
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
提交
12dc52c2
编写于
7月 27, 2010
作者:
A
Andrey Morozov
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
implemented asynchronous call for StereoBM()
上级
dc0f3139
变更
4
隐藏空白更改
内联
并排
Showing
4 changed file
with
115 addition
and
98 deletion
+115
-98
modules/gpu/include/opencv2/gpu/gpu.hpp
modules/gpu/include/opencv2/gpu/gpu.hpp
+1
-1
modules/gpu/src/cuda/stereobm.cu
modules/gpu/src/cuda/stereobm.cu
+82
-74
modules/gpu/src/precomp.hpp
modules/gpu/src/precomp.hpp
+6
-5
modules/gpu/src/stereobm_gpu.cpp
modules/gpu/src/stereobm_gpu.cpp
+26
-18
未找到文件。
modules/gpu/include/opencv2/gpu/gpu.hpp
浏览文件 @
12dc52c2
...
...
@@ -349,7 +349,7 @@ namespace cv
void
operator
()
(
const
GpuMat
&
left
,
const
GpuMat
&
right
,
GpuMat
&
disparity
);
//! Acync version
void
operator
()
(
const
GpuMat
&
left
,
const
GpuMat
&
right
,
GpuMat
&
disparity
,
const
CudaStream
&
stream
);
void
operator
()
(
const
GpuMat
&
left
,
const
GpuMat
&
right
,
GpuMat
&
disparity
,
const
CudaStream
&
stream
);
//! Some heuristics that tries to estmate
// if current GPU will be faster then CPU in this algorithm.
...
...
modules/gpu/src/cuda/stereobm.cu
浏览文件 @
12dc52c2
...
...
@@ -55,13 +55,13 @@ using namespace cv::gpu;
#define ROWSperTHREAD 21 // the number of rows a thread will process
namespace
stereobm_gpu
namespace
stereobm_gpu
{
#define BLOCK_W 128 // the thread block width (464)
#define N_DISPARITIES 8
#define STEREO_MIND 0 // The minimum d range to check
#define STEREO_MIND 0 // The minimum d range to check
#define STEREO_DISP_STEP N_DISPARITIES // the d step, must be <= 1 to avoid aliasing
__constant__
unsigned
int
*
cminSSDImage
;
...
...
@@ -71,7 +71,7 @@ __constant__ int cheight;
__device__
int
SQ
(
int
a
)
{
return
a
*
a
;
return
a
*
a
;
}
template
<
int
RADIUS
>
...
...
@@ -82,7 +82,7 @@ __device__ unsigned int CalcSSD(unsigned int *col_ssd_cache, unsigned int *col_s
for
(
int
i
=
1
;
i
<=
RADIUS
;
i
++
)
cache
+=
col_ssd
[
i
];
col_ssd_cache
[
0
]
=
cache
;
__syncthreads
();
...
...
@@ -101,7 +101,7 @@ __device__ uint2 MinSSD(unsigned int *col_ssd_cache, unsigned int *col_ssd)
{
unsigned
int
ssd
[
N_DISPARITIES
];
//See above: #define COL_SSD_SIZE (BLOCK_W + 2 * RADIUS)
//See above: #define COL_SSD_SIZE (BLOCK_W + 2 * RADIUS)
ssd
[
0
]
=
CalcSSD
<
RADIUS
>
(
col_ssd_cache
,
col_ssd
+
0
*
(
BLOCK_W
+
2
*
RADIUS
));
ssd
[
1
]
=
CalcSSD
<
RADIUS
>
(
col_ssd_cache
,
col_ssd
+
1
*
(
BLOCK_W
+
2
*
RADIUS
));
ssd
[
2
]
=
CalcSSD
<
RADIUS
>
(
col_ssd_cache
,
col_ssd
+
2
*
(
BLOCK_W
+
2
*
RADIUS
));
...
...
@@ -146,7 +146,7 @@ __device__ void StepDown(int idx1, int idx2, unsigned char* imageL, unsigned cha
rightPixel1
[
4
]
=
imageR
[
idx1
-
4
];
rightPixel1
[
5
]
=
imageR
[
idx1
-
5
];
rightPixel1
[
6
]
=
imageR
[
idx1
-
6
];
rightPixel2
[
7
]
=
imageR
[
idx2
-
7
];
rightPixel2
[
0
]
=
imageR
[
idx2
-
0
];
rightPixel2
[
1
]
=
imageR
[
idx2
-
1
];
...
...
@@ -155,16 +155,16 @@ __device__ void StepDown(int idx1, int idx2, unsigned char* imageL, unsigned cha
rightPixel2
[
4
]
=
imageR
[
idx2
-
4
];
rightPixel2
[
5
]
=
imageR
[
idx2
-
5
];
rightPixel2
[
6
]
=
imageR
[
idx2
-
6
];
//See above: #define COL_SSD_SIZE (BLOCK_W + 2 * RADIUS)
diff1
=
leftPixel1
-
rightPixel1
[
0
];
diff2
=
leftPixel2
-
rightPixel2
[
0
];
//See above: #define COL_SSD_SIZE (BLOCK_W + 2 * RADIUS)
diff1
=
leftPixel1
-
rightPixel1
[
0
];
diff2
=
leftPixel2
-
rightPixel2
[
0
];
col_ssd
[
0
*
(
BLOCK_W
+
2
*
RADIUS
)]
+=
SQ
(
diff2
)
-
SQ
(
diff1
);
diff1
=
leftPixel1
-
rightPixel1
[
1
];
diff2
=
leftPixel2
-
rightPixel2
[
1
];
col_ssd
[
1
*
(
BLOCK_W
+
2
*
RADIUS
)]
+=
SQ
(
diff2
)
-
SQ
(
diff1
);
diff1
=
leftPixel1
-
rightPixel1
[
2
];
diff2
=
leftPixel2
-
rightPixel2
[
2
];
col_ssd
[
2
*
(
BLOCK_W
+
2
*
RADIUS
)]
+=
SQ
(
diff2
)
-
SQ
(
diff1
);
...
...
@@ -172,19 +172,19 @@ __device__ void StepDown(int idx1, int idx2, unsigned char* imageL, unsigned cha
diff1
=
leftPixel1
-
rightPixel1
[
3
];
diff2
=
leftPixel2
-
rightPixel2
[
3
];
col_ssd
[
3
*
(
BLOCK_W
+
2
*
RADIUS
)]
+=
SQ
(
diff2
)
-
SQ
(
diff1
);
diff1
=
leftPixel1
-
rightPixel1
[
4
];
diff2
=
leftPixel2
-
rightPixel2
[
4
];
diff1
=
leftPixel1
-
rightPixel1
[
4
];
diff2
=
leftPixel2
-
rightPixel2
[
4
];
col_ssd
[
4
*
(
BLOCK_W
+
2
*
RADIUS
)]
+=
SQ
(
diff2
)
-
SQ
(
diff1
);
diff1
=
leftPixel1
-
rightPixel1
[
5
];
diff2
=
leftPixel2
-
rightPixel2
[
5
];
col_ssd
[
5
*
(
BLOCK_W
+
2
*
RADIUS
)]
+=
SQ
(
diff2
)
-
SQ
(
diff1
);
diff1
=
leftPixel1
-
rightPixel1
[
6
];
diff2
=
leftPixel2
-
rightPixel2
[
6
];
col_ssd
[
6
*
(
BLOCK_W
+
2
*
RADIUS
)]
+=
SQ
(
diff2
)
-
SQ
(
diff1
);
diff1
=
leftPixel1
-
rightPixel1
[
7
];
diff2
=
leftPixel2
-
rightPixel2
[
7
];
col_ssd
[
7
*
(
BLOCK_W
+
2
*
RADIUS
)]
+=
SQ
(
diff2
)
-
SQ
(
diff1
);
...
...
@@ -203,7 +203,7 @@ __device__ void InitColSSD(int x_tex, int y_tex, int im_pitch, unsigned char* im
leftPixel1
=
imageL
[
idx
];
idx
=
idx
-
d
;
diffa
[
0
]
+=
SQ
(
leftPixel1
-
imageR
[
idx
-
0
]);
diffa
[
0
]
+=
SQ
(
leftPixel1
-
imageR
[
idx
-
0
]);
diffa
[
1
]
+=
SQ
(
leftPixel1
-
imageR
[
idx
-
1
]);
diffa
[
2
]
+=
SQ
(
leftPixel1
-
imageR
[
idx
-
2
]);
diffa
[
3
]
+=
SQ
(
leftPixel1
-
imageR
[
idx
-
3
]);
...
...
@@ -213,7 +213,7 @@ __device__ void InitColSSD(int x_tex, int y_tex, int im_pitch, unsigned char* im
diffa
[
7
]
+=
SQ
(
leftPixel1
-
imageR
[
idx
-
7
]);
y_tex
+=
1
;
}
}
//See above: #define COL_SSD_SIZE (BLOCK_W + 2 * RADIUS)
col_ssd
[
0
*
(
BLOCK_W
+
2
*
RADIUS
)]
=
diffa
[
0
];
col_ssd
[
1
*
(
BLOCK_W
+
2
*
RADIUS
)]
=
diffa
[
1
];
...
...
@@ -225,11 +225,11 @@ __device__ void InitColSSD(int x_tex, int y_tex, int im_pitch, unsigned char* im
col_ssd
[
7
*
(
BLOCK_W
+
2
*
RADIUS
)]
=
diffa
[
7
];
}
template
<
int
RADIUS
>
template
<
int
RADIUS
>
__global__
void
stereoKernel
(
unsigned
char
*
left
,
unsigned
char
*
right
,
size_t
img_step
,
unsigned
char
*
disp
,
size_t
disp_pitch
,
int
maxdisp
)
{
extern
__shared__
unsigned
int
col_ssd_cache
[];
unsigned
int
*
col_ssd
=
col_ssd_cache
+
BLOCK_W
+
threadIdx
.
x
;
unsigned
int
*
col_ssd
=
col_ssd_cache
+
BLOCK_W
+
threadIdx
.
x
;
unsigned
int
*
col_ssd_extra
=
threadIdx
.
x
<
(
2
*
RADIUS
)
?
col_ssd
+
BLOCK_W
:
0
;
//#define N_DIRTY_PIXELS (2 * RADIUS)
//#define X (blockIdx.x * BLOCK_W + threadIdx.x + STEREO_MAXD)
...
...
@@ -241,13 +241,13 @@ __global__ void stereoKernel(unsigned char *left, unsigned char *right, size_t i
unsigned
int
*
minSSDImage
=
cminSSDImage
+
X
+
Y
*
cminSSD_step
;
unsigned
char
*
disparImage
=
disp
+
X
+
Y
*
disp_pitch
;
/* if (X < cwidth)
{
{
unsigned int *minSSDImage_end = minSSDImage + min(ROWSperTHREAD, cheight - Y) * minssd_step;
for(uint *ptr = minSSDImage; ptr != minSSDImage_end; ptr += minssd_step )
*ptr = 0xFFFFFFFF;
*ptr = 0xFFFFFFFF;
}*/
int
end_row
=
min
(
ROWSperTHREAD
,
cheight
-
Y
);
int
y_tex
;
int
y_tex
;
int
x_tex
=
X
-
RADIUS
;
if
(
x_tex
>=
cwidth
)
...
...
@@ -257,7 +257,7 @@ __global__ void stereoKernel(unsigned char *left, unsigned char *right, size_t i
{
y_tex
=
Y
-
RADIUS
;
InitColSSD
<
RADIUS
>
(
x_tex
,
y_tex
,
img_step
,
left
,
right
,
d
,
col_ssd
);
InitColSSD
<
RADIUS
>
(
x_tex
,
y_tex
,
img_step
,
left
,
right
,
d
,
col_ssd
);
if
(
col_ssd_extra
>
0
)
if
(
x_tex
+
BLOCK_W
<
cwidth
)
...
...
@@ -289,13 +289,13 @@ __global__ void stereoKernel(unsigned char *left, unsigned char *right, size_t i
StepDown
<
RADIUS
>
(
idx1
,
idx2
,
left
+
BLOCK_W
,
right
+
BLOCK_W
,
d
,
col_ssd_extra
);
y_tex
+=
1
;
__syncthreads
();
//before MinSSD function
if
(
X
<
cwidth
-
RADIUS
&&
row
<
cheight
-
RADIUS
-
Y
)
{
int
idx
=
row
*
cminSSD_step
;
uint2
minSSD
=
MinSSD
<
RADIUS
>
(
col_ssd_cache
+
threadIdx
.
x
,
col_ssd
);
{
int
idx
=
row
*
cminSSD_step
;
uint2
minSSD
=
MinSSD
<
RADIUS
>
(
col_ssd_cache
+
threadIdx
.
x
,
col_ssd
);
if
(
minSSD
.
x
<
minSSDImage
[
idx
])
{
disparImage
[
disp_pitch
*
row
]
=
(
unsigned
char
)(
d
+
minSSD
.
y
);
...
...
@@ -310,49 +310,57 @@ __global__ void stereoKernel(unsigned char *left, unsigned char *right, size_t i
namespace
cv
{
namespace
gpu
{
namespace
impl
{
template
<
int
RADIUS
>
void
kernel_caller
(
const
DevMem2D
&
left
,
const
DevMem2D
&
right
,
const
DevMem2D
&
disp
,
int
maxdisp
)
{
template
<
int
RADIUS
>
void
kernel_caller
(
const
DevMem2D
&
left
,
const
DevMem2D
&
right
,
const
DevMem2D
&
disp
,
int
maxdisp
,
const
cudaStream_t
&
stream
)
{
dim3
grid
(
1
,
1
,
1
);
dim3
threads
(
BLOCK_W
,
1
,
1
);
dim3
threads
(
BLOCK_W
,
1
,
1
);
grid
.
x
=
divUp
(
left
.
cols
-
maxdisp
-
2
*
RADIUS
,
BLOCK_W
);
grid
.
y
=
divUp
(
left
.
rows
-
2
*
RADIUS
,
ROWSperTHREAD
);
//See above: #define COL_SSD_SIZE (BLOCK_W + 2 * RADIUS)
size_t
smem_size
=
(
BLOCK_W
+
N_DISPARITIES
*
(
BLOCK_W
+
2
*
RADIUS
))
*
sizeof
(
unsigned
int
);
//See above: #define COL_SSD_SIZE (BLOCK_W + 2 * RADIUS)
size_t
smem_size
=
(
BLOCK_W
+
N_DISPARITIES
*
(
BLOCK_W
+
2
*
RADIUS
))
*
sizeof
(
unsigned
int
);
if
(
stream
==
0
)
{
stereobm_gpu
::
stereoKernel
<
RADIUS
><<<
grid
,
threads
,
smem_size
>>>
(
left
.
ptr
,
right
.
ptr
,
left
.
step
,
disp
.
ptr
,
disp
.
step
,
maxdisp
);
cudaSafeCall
(
cudaThreadSynchronize
()
);
}
else
{
stereobm_gpu
::
stereoKernel
<
RADIUS
><<<
grid
,
threads
,
smem_size
,
stream
>>>
(
left
.
ptr
,
right
.
ptr
,
left
.
step
,
disp
.
ptr
,
disp
.
step
,
maxdisp
);
}
stereobm_gpu
::
stereoKernel
<
RADIUS
><<<
grid
,
threads
,
smem_size
>>>
(
left
.
ptr
,
right
.
ptr
,
left
.
step
,
disp
.
ptr
,
disp
.
step
,
maxdisp
);
cudaSafeCall
(
cudaThreadSynchronize
()
);
};
typedef
void
(
*
kernel_caller_t
)(
const
DevMem2D
&
left
,
const
DevMem2D
&
right
,
const
DevMem2D
&
disp
,
int
maxdisp
);
typedef
void
(
*
kernel_caller_t
)(
const
DevMem2D
&
left
,
const
DevMem2D
&
right
,
const
DevMem2D
&
disp
,
int
maxdisp
,
const
cudaStream_t
&
stream
);
const
static
kernel_caller_t
callers
[]
=
{
0
,
kernel_caller
<
1
>
,
kernel_caller
<
2
>
,
kernel_caller
<
3
>
,
kernel_caller
<
4
>
,
kernel_caller
<
5
>
,
kernel_caller
<
6
>
,
kernel_caller
<
7
>
,
kernel_caller
<
8
>
,
kernel_caller
<
9
>
,
kernel_caller
<
10
>
,
kernel_caller
<
11
>
,
kernel_caller
<
12
>
,
kernel_caller
<
13
>
,
kernel_caller
<
15
>
,
kernel_caller
<
15
>
,
kernel_caller
<
16
>
,
kernel_caller
<
17
>
,
kernel_caller
<
18
>
,
kernel_caller
<
19
>
,
kernel_caller
<
20
>
,
0
,
kernel_caller
<
1
>
,
kernel_caller
<
2
>
,
kernel_caller
<
3
>
,
kernel_caller
<
4
>
,
kernel_caller
<
5
>
,
kernel_caller
<
6
>
,
kernel_caller
<
7
>
,
kernel_caller
<
8
>
,
kernel_caller
<
9
>
,
kernel_caller
<
10
>
,
kernel_caller
<
11
>
,
kernel_caller
<
12
>
,
kernel_caller
<
13
>
,
kernel_caller
<
15
>
,
kernel_caller
<
15
>
,
kernel_caller
<
16
>
,
kernel_caller
<
17
>
,
kernel_caller
<
18
>
,
kernel_caller
<
19
>
,
kernel_caller
<
20
>
,
kernel_caller
<
21
>
,
kernel_caller
<
22
>
,
kernel_caller
<
23
>
,
kernel_caller
<
24
>
,
kernel_caller
<
25
>
//0,0,0, 0,0,0, 0,0,kernel_caller<9>
};
const
int
calles_num
=
sizeof
(
callers
)
/
sizeof
(
callers
[
0
]);
extern
"C"
void
stereoBM_GPU
(
const
DevMem2D
&
left
,
const
DevMem2D
&
right
,
const
DevMem2D
&
disp
,
int
maxdisp
,
int
winsz
,
const
DevMem2D_
<
unsigned
int
>&
minSSD_buf
)
{
extern
"C"
void
stereoBM_GPU
(
const
DevMem2D
&
left
,
const
DevMem2D
&
right
,
const
DevMem2D
&
disp
,
int
maxdisp
,
int
winsz
,
const
DevMem2D_
<
unsigned
int
>&
minSSD_buf
,
const
cudaStream_t
&
stream
)
{
int
winsz2
=
winsz
>>
1
;
if
(
winsz2
==
0
||
winsz2
>=
calles_num
)
cv
::
gpu
::
error
(
"Unsupported window size"
,
__FILE__
,
__LINE__
);
//cudaSafeCall( cudaFuncSetCacheConfig(&stereoKernel, cudaFuncCachePreferL1) );
//cudaSafeCall( cudaFuncSetCacheConfig(&stereoKernel, cudaFuncCachePreferShared) );
//cudaSafeCall( cudaFuncSetCacheConfig(&stereoKernel, cudaFuncCachePreferShared) );
cudaSafeCall
(
cudaMemset2D
(
disp
.
ptr
,
disp
.
step
,
0
,
disp
.
cols
,
disp
.
rows
)
);
cudaSafeCall
(
cudaMemset2D
(
minSSD_buf
.
ptr
,
minSSD_buf
.
step
,
0xFF
,
minSSD_buf
.
cols
*
minSSD_buf
.
elemSize
(),
disp
.
rows
)
);
cudaSafeCall
(
cudaMemset2D
(
minSSD_buf
.
ptr
,
minSSD_buf
.
step
,
0xFF
,
minSSD_buf
.
cols
*
minSSD_buf
.
elemSize
(),
disp
.
rows
)
);
cudaSafeCall
(
cudaMemcpyToSymbol
(
stereobm_gpu
::
cwidth
,
&
left
.
cols
,
sizeof
(
left
.
cols
)
)
);
cudaSafeCall
(
cudaMemcpyToSymbol
(
stereobm_gpu
::
cheight
,
&
left
.
rows
,
sizeof
(
left
.
rows
)
)
);
...
...
@@ -361,7 +369,7 @@ namespace cv { namespace gpu { namespace impl
size_t
minssd_step
=
minSSD_buf
.
step
/
minSSD_buf
.
elemSize
();
cudaSafeCall
(
cudaMemcpyToSymbol
(
stereobm_gpu
::
cminSSD_step
,
&
minssd_step
,
sizeof
(
minssd_step
)
)
);
callers
[
winsz2
](
left
,
right
,
disp
,
maxdisp
);
callers
[
winsz2
](
left
,
right
,
disp
,
maxdisp
,
stream
);
}
}}}
...
...
@@ -381,7 +389,7 @@ extern "C" __global__ void prefilter_kernel(unsigned char *output, size_t step,
if
(
x
<
width
&&
y
<
height
)
{
int
conv
=
(
int
)
tex2D
(
texForSobel
,
x
-
1
,
y
-
1
)
*
(
-
1
)
+
(
int
)
tex2D
(
texForSobel
,
x
+
1
,
y
-
1
)
*
(
1
)
+
int
conv
=
(
int
)
tex2D
(
texForSobel
,
x
-
1
,
y
-
1
)
*
(
-
1
)
+
(
int
)
tex2D
(
texForSobel
,
x
+
1
,
y
-
1
)
*
(
1
)
+
(
int
)
tex2D
(
texForSobel
,
x
-
1
,
y
)
*
(
-
2
)
+
(
int
)
tex2D
(
texForSobel
,
x
+
1
,
y
)
*
(
2
)
+
(
int
)
tex2D
(
texForSobel
,
x
-
1
,
y
+
1
)
*
(
-
1
)
+
(
int
)
tex2D
(
texForSobel
,
x
+
1
,
y
+
1
)
*
(
1
);
...
...
@@ -398,18 +406,18 @@ namespace cv { namespace gpu { namespace impl
extern
"C"
void
prefilter_xsobel
(
const
DevMem2D
&
input
,
const
DevMem2D
&
output
,
int
prefilterCap
)
{
cudaChannelFormatDesc
desc
=
cudaCreateChannelDesc
<
unsigned
char
>
();
cudaSafeCall
(
cudaBindTexture2D
(
0
,
stereobm_gpu
::
texForSobel
,
input
.
ptr
,
desc
,
input
.
cols
,
input
.
rows
,
input
.
step
)
);
cudaSafeCall
(
cudaBindTexture2D
(
0
,
stereobm_gpu
::
texForSobel
,
input
.
ptr
,
desc
,
input
.
cols
,
input
.
rows
,
input
.
step
)
);
dim3
threads
(
16
,
16
,
1
);
dim3
grid
(
1
,
1
,
1
);
grid
.
x
=
divUp
(
input
.
cols
,
threads
.
x
);
grid
.
y
=
divUp
(
input
.
rows
,
threads
.
y
);
grid
.
y
=
divUp
(
input
.
rows
,
threads
.
y
);
stereobm_gpu
::
prefilter_kernel
<<<
grid
,
threads
>>>
(
output
.
ptr
,
output
.
step
,
output
.
cols
,
output
.
rows
,
prefilterCap
);
cudaSafeCall
(
cudaThreadSynchronize
()
);
cudaSafeCall
(
cudaUnbindTexture
(
stereobm_gpu
::
texForSobel
)
);
cudaSafeCall
(
cudaUnbindTexture
(
stereobm_gpu
::
texForSobel
)
);
}
}}}
...
...
@@ -424,8 +432,8 @@ namespace stereobm_gpu
texture
<
unsigned
char
,
2
,
cudaReadModeNormalizedFloat
>
texForTF
;
__device__
float
sobel
(
int
x
,
int
y
)
{
float
conv
=
tex2D
(
texForTF
,
x
-
1
,
y
-
1
)
*
(
-
1
)
+
tex2D
(
texForTF
,
x
+
1
,
y
-
1
)
*
(
1
)
+
{
float
conv
=
tex2D
(
texForTF
,
x
-
1
,
y
-
1
)
*
(
-
1
)
+
tex2D
(
texForTF
,
x
+
1
,
y
-
1
)
*
(
1
)
+
tex2D
(
texForTF
,
x
-
1
,
y
)
*
(
-
2
)
+
tex2D
(
texForTF
,
x
+
1
,
y
)
*
(
2
)
+
tex2D
(
texForTF
,
x
-
1
,
y
+
1
)
*
(
-
1
)
+
tex2D
(
texForTF
,
x
+
1
,
y
+
1
)
*
(
1
);
return
fabs
(
conv
);
...
...
@@ -453,28 +461,28 @@ __device__ float CalcSums(float *cols, float *cols_cache, int winsz)
return
cols
[
0
]
+
cache
+
cache2
;
}
#define RpT (2 * ROWSperTHREAD) // got experimentally
#define RpT (2 * ROWSperTHREAD) // got experimentally
extern
"C"
__global__
void
textureness_kernel
(
unsigned
char
*
disp
,
size_t
disp_step
,
int
winsz
,
float
threshold
,
int
width
,
int
height
)
{
{
int
winsz2
=
winsz
/
2
;
int
n_dirty_pixels
=
(
winsz2
)
*
2
;
extern
__shared__
float
cols_cache
[];
float
*
cols
=
cols_cache
+
blockDim
.
x
+
threadIdx
.
x
;
float
*
cols_extra
=
threadIdx
.
x
<
n_dirty_pixels
?
cols
+
blockDim
.
x
:
0
;
float
*
cols
=
cols_cache
+
blockDim
.
x
+
threadIdx
.
x
;
float
*
cols_extra
=
threadIdx
.
x
<
n_dirty_pixels
?
cols
+
blockDim
.
x
:
0
;
int
x
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
int
x
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
int
beg_row
=
blockIdx
.
y
*
RpT
;
int
end_row
=
min
(
beg_row
+
RpT
,
height
);
if
(
x
<
width
)
{
if
(
x
<
width
)
{
int
y
=
beg_row
;
float
sum
=
0
;
float
sum_extra
=
0
;
float
sum
=
0
;
float
sum_extra
=
0
;
for
(
int
i
=
y
-
winsz2
;
i
<=
y
+
winsz2
;
++
i
)
{
sum
+=
sobel
(
x
-
winsz2
,
i
);
...
...
@@ -486,11 +494,11 @@ extern "C" __global__ void textureness_kernel(unsigned char *disp, size_t disp_s
*
cols_extra
=
sum_extra
;
__syncthreads
();
float
sum_win
=
CalcSums
(
cols
,
cols_cache
+
threadIdx
.
x
,
winsz
)
*
255
;
if
(
sum_win
<
threshold
)
disp
[
y
*
disp_step
+
x
]
=
0
;
__syncthreads
();
for
(
int
y
=
beg_row
+
1
;
y
<
end_row
;
++
y
)
...
...
@@ -505,12 +513,12 @@ extern "C" __global__ void textureness_kernel(unsigned char *disp, size_t disp_s
}
__syncthreads
();
float
sum_win
=
CalcSums
(
cols
,
cols_cache
+
threadIdx
.
x
,
winsz
)
*
255
;
float
sum_win
=
CalcSums
(
cols
,
cols_cache
+
threadIdx
.
x
,
winsz
)
*
255
;
if
(
sum_win
<
threshold
)
disp
[
y
*
disp_step
+
x
]
=
0
;
__syncthreads
();
}
}
}
}
}
...
...
@@ -521,21 +529,21 @@ namespace cv { namespace gpu { namespace impl
{
avgTexturenessThreshold
*=
winsz
*
winsz
;
stereobm_gpu
::
texForTF
.
filterMode
=
cudaFilterModeLinear
;
stereobm_gpu
::
texForTF
.
filterMode
=
cudaFilterModeLinear
;
stereobm_gpu
::
texForTF
.
addressMode
[
0
]
=
cudaAddressModeWrap
;
stereobm_gpu
::
texForTF
.
addressMode
[
1
]
=
cudaAddressModeWrap
;
cudaChannelFormatDesc
desc
=
cudaCreateChannelDesc
<
unsigned
char
>
();
cudaSafeCall
(
cudaBindTexture2D
(
0
,
stereobm_gpu
::
texForTF
,
input
.
ptr
,
desc
,
input
.
cols
,
input
.
rows
,
input
.
step
)
);
cudaSafeCall
(
cudaBindTexture2D
(
0
,
stereobm_gpu
::
texForTF
,
input
.
ptr
,
desc
,
input
.
cols
,
input
.
rows
,
input
.
step
)
);
dim3
threads
(
128
,
1
,
1
);
dim3
grid
(
1
,
1
,
1
);
grid
.
x
=
divUp
(
input
.
cols
,
threads
.
x
);
grid
.
y
=
divUp
(
input
.
rows
,
RpT
);
size_t
smem_size
=
(
threads
.
x
+
threads
.
x
+
(
winsz
/
2
)
*
2
)
*
sizeof
(
float
);
grid
.
y
=
divUp
(
input
.
rows
,
RpT
);
size_t
smem_size
=
(
threads
.
x
+
threads
.
x
+
(
winsz
/
2
)
*
2
)
*
sizeof
(
float
);
stereobm_gpu
::
textureness_kernel
<<<
grid
,
threads
,
smem_size
>>>
(
disp
.
ptr
,
disp
.
step
,
winsz
,
avgTexturenessThreshold
,
disp
.
cols
,
disp
.
rows
);
cudaSafeCall
(
cudaThreadSynchronize
()
);
...
...
modules/gpu/src/precomp.hpp
浏览文件 @
12dc52c2
...
...
@@ -46,25 +46,26 @@
#pragma warning( disable: 4251 4710 4711 4514 4996 )
#endif
#ifdef HAVE_CONFIG_H
#include <cvconfig.h>
#ifdef HAVE_CONFIG_H
#include <cvconfig.h>
#endif
#include <iostream>
#include <limits>
#include "opencv2/gpu/gpu.hpp"
#include "opencv2/gpu/stream_accessor.hpp"
#if defined(HAVE_CUDA)
#include "cuda_shared.hpp"
#include "cuda_runtime_api.h"
#include "cuda_runtime_api.h"
#else
/* defined(HAVE_CUDA) */
static
inline
void
throw_nogpu
()
{
CV_Error
(
CV_GpuNotFound
,
"The library is compilled with no GPU support"
);
}
#endif
/* defined(HAVE_CUDA) */
#endif
/* __OPENCV_PRECOMP_H__ */
#endif
/* __OPENCV_PRECOMP_H__ */
modules/gpu/src/stereobm_gpu.cpp
浏览文件 @
12dc52c2
...
...
@@ -56,25 +56,26 @@ void cv::gpu::StereoBM_GPU::operator() ( const GpuMat&, const GpuMat&, GpuMat&,
#else
/* !defined (HAVE_CUDA) */
namespace
cv
{
namespace
gpu
{
namespace
impl
namespace
cv
{
namespace
gpu
{
namespace
impl
{
extern
"C"
void
stereoBM_GPU
(
const
DevMem2D
&
left
,
const
DevMem2D
&
right
,
const
DevMem2D
&
disp
,
int
ndisp
,
int
winsz
,
const
DevMem2D_
<
uint
>&
minSSD_buf
);
//extern "C" void stereoBM_GPU(const DevMem2D& left, const DevMem2D& right, const DevMem2D& disp, int ndisp, int winsz, const DevMem2D_<uint>& minSSD_buf);
extern
"C"
void
stereoBM_GPU
(
const
DevMem2D
&
left
,
const
DevMem2D
&
right
,
const
DevMem2D
&
disp
,
int
ndisp
,
int
winsz
,
const
DevMem2D_
<
uint
>&
minSSD_buf
,
const
cudaStream_t
&
stream
);
extern
"C"
void
prefilter_xsobel
(
const
DevMem2D
&
input
,
const
DevMem2D
&
output
,
int
prefilterCap
=
31
);
extern
"C"
void
postfilter_textureness
(
const
DevMem2D
&
input
,
int
winsz
,
float
avergeTexThreshold
,
const
DevMem2D
&
disp
);
}
}}
const
float
defaultAvgTexThreshold
=
3
;
cv
::
gpu
::
StereoBM_GPU
::
StereoBM_GPU
()
cv
::
gpu
::
StereoBM_GPU
::
StereoBM_GPU
()
:
preset
(
BASIC_PRESET
),
ndisp
(
DEFAULT_NDISP
),
winSize
(
DEFAULT_WINSZ
),
avergeTexThreshold
(
defaultAvgTexThreshold
)
{}
cv
::
gpu
::
StereoBM_GPU
::
StereoBM_GPU
(
int
preset_
,
int
ndisparities_
,
int
winSize_
)
cv
::
gpu
::
StereoBM_GPU
::
StereoBM_GPU
(
int
preset_
,
int
ndisparities_
,
int
winSize_
)
:
preset
(
preset_
),
ndisp
(
ndisparities_
),
winSize
(
winSize_
),
avergeTexThreshold
(
defaultAvgTexThreshold
)
{
const
int
max_supported_ndisp
=
1
<<
(
sizeof
(
unsigned
char
)
*
8
);
const
int
max_supported_ndisp
=
1
<<
(
sizeof
(
unsigned
char
)
*
8
);
CV_Assert
(
0
<
ndisp
&&
ndisp
<=
max_supported_ndisp
);
CV_Assert
(
ndisp
%
8
==
0
);
CV_Assert
(
winSize
%
2
==
1
);
...
...
@@ -92,12 +93,12 @@ bool cv::gpu::StereoBM_GPU::checkIfGpuCallReasonable()
int
numSM
=
getNumberOfSMs
(
device
);
if
(
major
>
1
||
numSM
>
16
)
return
true
;
return
true
;
return
false
;
}
void
cv
::
gpu
::
StereoBM_GPU
::
operator
()
(
const
GpuMat
&
left
,
const
GpuMat
&
right
,
GpuMat
&
disparity
)
void
stereo_gpu_operator
(
GpuMat
&
minSSD
,
GpuMat
&
leBuf
,
GpuMat
&
riBuf
,
int
preset
,
int
ndisp
,
int
winSize
,
float
avergeTexThreshold
,
const
GpuMat
&
left
,
const
GpuMat
&
right
,
GpuMat
&
disparity
,
const
cudaStream_t
&
stream
)
{
CV_DbgAssert
(
left
.
rows
==
right
.
rows
&&
left
.
cols
==
right
.
cols
);
CV_DbgAssert
(
left
.
type
()
==
CV_8UC1
);
...
...
@@ -109,26 +110,33 @@ void cv::gpu::StereoBM_GPU::operator() ( const GpuMat& left, const GpuMat& right
GpuMat
le_for_bm
=
left
;
GpuMat
ri_for_bm
=
right
;
if
(
preset
==
PREFILTER_XSOBEL
)
if
(
preset
==
StereoBM_GPU
::
PREFILTER_XSOBEL
)
{
leBuf
.
create
(
left
.
size
(),
left
.
type
());
riBuf
.
create
(
right
.
size
(),
right
.
type
());
impl
::
prefilter_xsobel
(
left
,
leBuf
);
impl
::
prefilter_xsobel
(
right
,
riBuf
);
impl
::
prefilter_xsobel
(
right
,
riBuf
);
le_for_bm
=
leBuf
;
ri_for_bm
=
riBuf
;
}
impl
::
stereoBM_GPU
(
le_for_bm
,
ri_for_bm
,
disparity
,
ndisp
,
winSize
,
minSSD
);
}
impl
::
stereoBM_GPU
(
le_for_bm
,
ri_for_bm
,
disparity
,
ndisp
,
winSize
,
minSSD
,
stream
);
if
(
avergeTexThreshold
)
impl
::
postfilter_textureness
(
le_for_bm
,
winSize
,
avergeTexThreshold
,
disparity
);
}
void
cv
::
gpu
::
StereoBM_GPU
::
operator
()
(
const
GpuMat
&
left
,
const
GpuMat
&
right
,
GpuMat
&
disparity
)
{
::
stereo_gpu_operator
(
minSSD
,
leBuf
,
riBuf
,
preset
,
ndisp
,
winSize
,
avergeTexThreshold
,
left
,
right
,
disparity
,
0
);
}
void
cv
::
gpu
::
StereoBM_GPU
::
operator
()
(
const
GpuMat
&
left
,
const
GpuMat
&
right
,
GpuMat
&
disparity
,
const
CudaStream
&
stream
)
{
CV_Assert
(
!
"Not implemented"
);
::
stereo_gpu_operator
(
minSSD
,
leBuf
,
riBuf
,
preset
,
ndisp
,
winSize
,
avergeTexThreshold
,
left
,
right
,
disparity
,
StreamAccessor
::
getStream
(
stream
)
);
}
#endif
/* !defined (HAVE_CUDA) */
\ No newline at end of file
#endif
/* !defined (HAVE_CUDA) */
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录