Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
Greenplum
Opencv
提交
0e339dd1
O
Opencv
项目概览
Greenplum
/
Opencv
大约 1 年 前同步成功
通知
7
Star
0
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
O
Opencv
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
提交
0e339dd1
编写于
11月 12, 2012
作者:
V
Vladislav Vinogradov
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
hog
上级
0ddd16cf
变更
1
隐藏空白更改
内联
并排
Showing
1 changed file
with
69 addition
and
128 deletion
+69
-128
modules/gpu/src/cuda/hog.cu
modules/gpu/src/cuda/hog.cu
+69
-128
未找到文件。
modules/gpu/src/cuda/hog.cu
浏览文件 @
0e339dd1
...
...
@@ -42,7 +42,10 @@
#if !defined CUDA_DISABLER
#include "internal_shared.hpp"
#include "opencv2/gpu/device/common.hpp"
#include "opencv2/gpu/device/reduce.hpp"
#include "opencv2/gpu/device/functional.hpp"
#include "opencv2/gpu/device/warp_shuffle.hpp"
namespace
cv
{
namespace
gpu
{
namespace
device
{
...
...
@@ -226,29 +229,30 @@ namespace cv { namespace gpu { namespace device
template
<
int
size
>
__device__
float
reduce_smem
(
volatile
float
*
smem
)
__device__
float
reduce_smem
(
float
*
smem
,
float
val
)
{
unsigned
int
tid
=
threadIdx
.
x
;
float
sum
=
smem
[
tid
]
;
float
sum
=
val
;
if
(
size
>=
512
)
{
if
(
tid
<
256
)
smem
[
tid
]
=
sum
=
sum
+
smem
[
tid
+
256
];
__syncthreads
();
}
if
(
size
>=
256
)
{
if
(
tid
<
128
)
smem
[
tid
]
=
sum
=
sum
+
smem
[
tid
+
128
];
__syncthreads
();
}
if
(
size
>=
128
)
{
if
(
tid
<
64
)
smem
[
tid
]
=
sum
=
sum
+
smem
[
tid
+
64
];
__syncthreads
();
}
reduce
<
size
>
(
smem
,
sum
,
tid
,
plus
<
float
>
());
if
(
tid
<
32
)
if
(
size
==
32
)
{
if
(
size
>=
64
)
smem
[
tid
]
=
sum
=
sum
+
smem
[
tid
+
32
];
if
(
size
>=
32
)
smem
[
tid
]
=
sum
=
sum
+
smem
[
tid
+
16
];
if
(
size
>=
16
)
smem
[
tid
]
=
sum
=
sum
+
smem
[
tid
+
8
];
if
(
size
>=
8
)
smem
[
tid
]
=
sum
=
sum
+
smem
[
tid
+
4
];
if
(
size
>=
4
)
smem
[
tid
]
=
sum
=
sum
+
smem
[
tid
+
2
];
if
(
size
>=
2
)
smem
[
tid
]
=
sum
=
sum
+
smem
[
tid
+
1
];
#if __CUDA_ARCH__ >= 300
return
shfl
(
sum
,
0
);
#else
return
smem
[
0
];
#endif
}
#if __CUDA_ARCH__ >= 300
if
(
threadIdx
.
x
==
0
)
smem
[
0
]
=
sum
;
#endif
__syncthreads
();
sum
=
smem
[
0
];
return
s
um
;
return
s
mem
[
0
]
;
}
...
...
@@ -272,19 +276,13 @@ namespace cv { namespace gpu { namespace device
if
(
threadIdx
.
x
<
block_hist_size
)
elem
=
hist
[
0
];
squares
[
threadIdx
.
x
]
=
elem
*
elem
;
__syncthreads
();
float
sum
=
reduce_smem
<
nthreads
>
(
squares
);
float
sum
=
reduce_smem
<
nthreads
>
(
squares
,
elem
*
elem
);
float
scale
=
1.0
f
/
(
::
sqrtf
(
sum
)
+
0.1
f
*
block_hist_size
);
elem
=
::
min
(
elem
*
scale
,
threshold
);
__syncthreads
();
squares
[
threadIdx
.
x
]
=
elem
*
elem
;
sum
=
reduce_smem
<
nthreads
>
(
squares
,
elem
*
elem
);
__syncthreads
();
sum
=
reduce_smem
<
nthreads
>
(
squares
);
scale
=
1.0
f
/
(
::
sqrtf
(
sum
)
+
1e-3
f
);
if
(
threadIdx
.
x
<
block_hist_size
)
...
...
@@ -330,65 +328,36 @@ namespace cv { namespace gpu { namespace device
// return confidence values not just positive location
template
<
int
nthreads
,
// Number of threads per one histogram block
int
nblocks
>
// Number of histogram block processed by single GPU thread block
int
nblocks
>
// Number of histogram block processed by single GPU thread block
__global__
void
compute_confidence_hists_kernel_many_blocks
(
const
int
img_win_width
,
const
int
img_block_width
,
const
int
win_block_stride_x
,
const
int
win_block_stride_y
,
const
float
*
block_hists
,
const
float
*
coefs
,
float
free_coef
,
float
threshold
,
float
*
confidences
)
{
const
int
win_x
=
threadIdx
.
z
;
if
(
blockIdx
.
x
*
blockDim
.
z
+
win_x
>=
img_win_width
)
return
;
const
float
*
hist
=
block_hists
+
(
blockIdx
.
y
*
win_block_stride_y
*
img_block_width
+
blockIdx
.
x
*
win_block_stride_x
*
blockDim
.
z
+
win_x
)
*
cblock_hist_size
;
float
product
=
0.
f
;
for
(
int
i
=
threadIdx
.
x
;
i
<
cdescr_size
;
i
+=
nthreads
)
{
int
offset_y
=
i
/
cdescr_width
;
int
offset_x
=
i
-
offset_y
*
cdescr_width
;
product
+=
coefs
[
i
]
*
hist
[
offset_y
*
img_block_width
*
cblock_hist_size
+
offset_x
];
}
__shared__
float
products
[
nthreads
*
nblocks
];
const
int
tid
=
threadIdx
.
z
*
nthreads
+
threadIdx
.
x
;
products
[
tid
]
=
product
;
__syncthreads
();
if
(
nthreads
>=
512
)
{
if
(
threadIdx
.
x
<
256
)
products
[
tid
]
=
product
=
product
+
products
[
tid
+
256
];
__syncthreads
();
}
if
(
nthreads
>=
256
)
{
if
(
threadIdx
.
x
<
128
)
products
[
tid
]
=
product
=
product
+
products
[
tid
+
128
];
__syncthreads
();
}
if
(
nthreads
>=
128
)
{
if
(
threadIdx
.
x
<
64
)
products
[
tid
]
=
product
=
product
+
products
[
tid
+
64
];
__syncthreads
();
}
if
(
threadIdx
.
x
<
32
)
{
volatile
float
*
smem
=
products
;
if
(
nthreads
>=
64
)
smem
[
tid
]
=
product
=
product
+
smem
[
tid
+
32
];
if
(
nthreads
>=
32
)
smem
[
tid
]
=
product
=
product
+
smem
[
tid
+
16
];
if
(
nthreads
>=
16
)
smem
[
tid
]
=
product
=
product
+
smem
[
tid
+
8
];
if
(
nthreads
>=
8
)
smem
[
tid
]
=
product
=
product
+
smem
[
tid
+
4
];
if
(
nthreads
>=
4
)
smem
[
tid
]
=
product
=
product
+
smem
[
tid
+
2
];
if
(
nthreads
>=
2
)
smem
[
tid
]
=
product
=
product
+
smem
[
tid
+
1
];
}
if
(
threadIdx
.
x
==
0
)
confidences
[
blockIdx
.
y
*
img_win_width
+
blockIdx
.
x
*
blockDim
.
z
+
win_x
]
=
(
float
)(
product
+
free_coef
);
const
int
win_x
=
threadIdx
.
z
;
if
(
blockIdx
.
x
*
blockDim
.
z
+
win_x
>=
img_win_width
)
return
;
const
float
*
hist
=
block_hists
+
(
blockIdx
.
y
*
win_block_stride_y
*
img_block_width
+
blockIdx
.
x
*
win_block_stride_x
*
blockDim
.
z
+
win_x
)
*
cblock_hist_size
;
float
product
=
0.
f
;
for
(
int
i
=
threadIdx
.
x
;
i
<
cdescr_size
;
i
+=
nthreads
)
{
int
offset_y
=
i
/
cdescr_width
;
int
offset_x
=
i
-
offset_y
*
cdescr_width
;
product
+=
coefs
[
i
]
*
hist
[
offset_y
*
img_block_width
*
cblock_hist_size
+
offset_x
];
}
__shared__
float
products
[
nthreads
*
nblocks
];
const
int
tid
=
threadIdx
.
z
*
nthreads
+
threadIdx
.
x
;
reduce
<
nthreads
>
(
products
,
product
,
tid
,
plus
<
float
>
());
if
(
threadIdx
.
x
==
0
)
confidences
[
blockIdx
.
y
*
img_win_width
+
blockIdx
.
x
*
blockDim
.
z
+
win_x
]
=
product
+
free_coef
;
}
...
...
@@ -396,32 +365,32 @@ namespace cv { namespace gpu { namespace device
int
win_stride_y
,
int
win_stride_x
,
int
height
,
int
width
,
float
*
block_hists
,
float
*
coefs
,
float
free_coef
,
float
threshold
,
float
*
confidences
)
{
const
int
nthreads
=
256
;
const
int
nblocks
=
1
;
int
win_block_stride_x
=
win_stride_x
/
block_stride_x
;
int
win_block_stride_y
=
win_stride_y
/
block_stride_y
;
int
img_win_width
=
(
width
-
win_width
+
win_stride_x
)
/
win_stride_x
;
int
img_win_height
=
(
height
-
win_height
+
win_stride_y
)
/
win_stride_y
;
dim3
threads
(
nthreads
,
1
,
nblocks
);
dim3
grid
(
divUp
(
img_win_width
,
nblocks
),
img_win_height
);
cudaSafeCall
(
cudaFuncSetCacheConfig
(
compute_confidence_hists_kernel_many_blocks
<
nthreads
,
nblocks
>
,
cudaFuncCachePreferL1
));
int
img_block_width
=
(
width
-
CELLS_PER_BLOCK_X
*
CELL_WIDTH
+
block_stride_x
)
/
block_stride_x
;
compute_confidence_hists_kernel_many_blocks
<
nthreads
,
nblocks
><<<
grid
,
threads
>>>
(
img_win_width
,
img_block_width
,
win_block_stride_x
,
win_block_stride_y
,
block_hists
,
coefs
,
free_coef
,
threshold
,
confidences
);
cudaSafeCall
(
cudaThreadSynchronize
());
const
int
nthreads
=
256
;
const
int
nblocks
=
1
;
int
win_block_stride_x
=
win_stride_x
/
block_stride_x
;
int
win_block_stride_y
=
win_stride_y
/
block_stride_y
;
int
img_win_width
=
(
width
-
win_width
+
win_stride_x
)
/
win_stride_x
;
int
img_win_height
=
(
height
-
win_height
+
win_stride_y
)
/
win_stride_y
;
dim3
threads
(
nthreads
,
1
,
nblocks
);
dim3
grid
(
divUp
(
img_win_width
,
nblocks
),
img_win_height
);
cudaSafeCall
(
cudaFuncSetCacheConfig
(
compute_confidence_hists_kernel_many_blocks
<
nthreads
,
nblocks
>
,
cudaFuncCachePreferL1
));
int
img_block_width
=
(
width
-
CELLS_PER_BLOCK_X
*
CELL_WIDTH
+
block_stride_x
)
/
block_stride_x
;
compute_confidence_hists_kernel_many_blocks
<
nthreads
,
nblocks
><<<
grid
,
threads
>>>
(
img_win_width
,
img_block_width
,
win_block_stride_x
,
win_block_stride_y
,
block_hists
,
coefs
,
free_coef
,
threshold
,
confidences
);
cudaSafeCall
(
cudaThreadSynchronize
());
}
template
<
int
nthreads
,
// Number of threads per one histogram block
int
nblocks
>
// Number of histogram block processed by single GPU thread block
int
nblocks
>
// Number of histogram block processed by single GPU thread block
__global__
void
classify_hists_kernel_many_blocks
(
const
int
img_win_width
,
const
int
img_block_width
,
const
int
win_block_stride_x
,
const
int
win_block_stride_y
,
const
float
*
block_hists
,
const
float
*
coefs
,
...
...
@@ -446,36 +415,8 @@ namespace cv { namespace gpu { namespace device
__shared__
float
products
[
nthreads
*
nblocks
];
const
int
tid
=
threadIdx
.
z
*
nthreads
+
threadIdx
.
x
;
products
[
tid
]
=
product
;
__syncthreads
();
if
(
nthreads
>=
512
)
{
if
(
threadIdx
.
x
<
256
)
products
[
tid
]
=
product
=
product
+
products
[
tid
+
256
];
__syncthreads
();
}
if
(
nthreads
>=
256
)
{
if
(
threadIdx
.
x
<
128
)
products
[
tid
]
=
product
=
product
+
products
[
tid
+
128
];
__syncthreads
();
}
if
(
nthreads
>=
128
)
{
if
(
threadIdx
.
x
<
64
)
products
[
tid
]
=
product
=
product
+
products
[
tid
+
64
];
__syncthreads
();
}
if
(
threadIdx
.
x
<
32
)
{
volatile
float
*
smem
=
products
;
if
(
nthreads
>=
64
)
smem
[
tid
]
=
product
=
product
+
smem
[
tid
+
32
];
if
(
nthreads
>=
32
)
smem
[
tid
]
=
product
=
product
+
smem
[
tid
+
16
];
if
(
nthreads
>=
16
)
smem
[
tid
]
=
product
=
product
+
smem
[
tid
+
8
];
if
(
nthreads
>=
8
)
smem
[
tid
]
=
product
=
product
+
smem
[
tid
+
4
];
if
(
nthreads
>=
4
)
smem
[
tid
]
=
product
=
product
+
smem
[
tid
+
2
];
if
(
nthreads
>=
2
)
smem
[
tid
]
=
product
=
product
+
smem
[
tid
+
1
];
}
reduce
<
nthreads
>
(
products
,
product
,
tid
,
plus
<
float
>
());
if
(
threadIdx
.
x
==
0
)
labels
[
blockIdx
.
y
*
img_win_width
+
blockIdx
.
x
*
blockDim
.
z
+
win_x
]
=
(
product
+
free_coef
>=
threshold
);
...
...
@@ -868,4 +809,4 @@ namespace cv { namespace gpu { namespace device
}}}
// namespace cv { namespace gpu { namespace device
#endif
/* CUDA_DISABLER */
\ No newline at end of file
#endif
/* CUDA_DISABLER */
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录