Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
Greenplum
Opencv
提交
3396a2ec
O
Opencv
项目概览
Greenplum
/
Opencv
大约 1 年 前同步成功
通知
7
Star
0
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
O
Opencv
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
提交
3396a2ec
编写于
11月 04, 2011
作者:
A
Anton Obukhov
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
[~] Refactored device reduction for better reuse (now with functor semantics)
上级
56531c61
变更
2
显示空白变更内容
内联
并排
Showing
2 changed files
with
106 additions
and
60 deletions
+106
-60
modules/gpu/src/nvidia/NCVHaarObjectDetection.cu
modules/gpu/src/nvidia/NCVHaarObjectDetection.cu
+9
-60
modules/gpu/src/nvidia/core/NCVAlg.hpp
modules/gpu/src/nvidia/core/NCVAlg.hpp
+97
-0
未找到文件。
modules/gpu/src/nvidia/NCVHaarObjectDetection.cu
浏览文件 @
3396a2ec
...
...
@@ -59,6 +59,7 @@
#include <cstdio>
#include "NCV.hpp"
#include "NCVAlg.hpp"
#include "NPP_staging/NPP_staging.hpp"
#include "NCVRuntimeTemplates.hpp"
#include "NCVHaarObjectDetection.hpp"
...
...
@@ -84,11 +85,6 @@ inline __device__ T warpScanInclusive(T idata, volatile T *s_Data)
pos
+=
K_WARP_SIZE
;
s_Data
[
pos
]
=
idata
;
//for(Ncv32u offset = 1; offset < K_WARP_SIZE; offset <<= 1)
//{
// s_Data[pos] += s_Data[pos - offset];
//}
s_Data
[
pos
]
+=
s_Data
[
pos
-
1
];
s_Data
[
pos
]
+=
s_Data
[
pos
-
2
];
s_Data
[
pos
]
+=
s_Data
[
pos
-
4
];
...
...
@@ -234,60 +230,6 @@ __device__ Ncv32u getElemIImg(Ncv32u x, Ncv32u *d_IImg)
}
// Block-wide sum reduction of one float per thread, specialized to a block of
// exactly NUM_THREADS_CLASSIFIERPARALLEL threads.
//
// partialSum: this thread's partial stage sum.
// returns:    the sum over all threads of the block (same value in every thread,
//             read back from reductor[0] after the final barrier).
//
// Must be called by ALL threads of the block (contains __syncthreads()).
__device__ Ncv32f reduceSpecialization(Ncv32f partialSum)
{
    // volatile: shared values are re-read from shared memory on every access,
    // which the barrier-free sub-warp steps below depend on.
    __shared__ volatile Ncv32f reductor[NUM_THREADS_CLASSIFIERPARALLEL];
    reductor[threadIdx.x] = partialSum;
    __syncthreads();
#if defined CPU_FP_COMPLIANCE
    // Sequential summation by thread 0 only: reproduces the CPU's left-to-right
    // float accumulation order bit-for-bit (slower, for result compliance).
    if (!threadIdx.x)
    {
        Ncv32f sum = 0.0f;
        for (int i = 0; i < NUM_THREADS_CLASSIFIERPARALLEL; i++)
        {
            sum += reductor[i];
        }
        reductor[0] = sum;
    }
#else
    // Parallel tree reduction; stages above 64 elements are compiled in only
    // when the block is large enough (log2 of the block size gates each stage).
#if NUM_THREADS_CLASSIFIERPARALLEL_LOG2 >= 8
    if (threadIdx.x < 128)
    {
        reductor[threadIdx.x] += reductor[threadIdx.x + 128];
    }
    __syncthreads();
#endif
#if NUM_THREADS_CLASSIFIERPARALLEL_LOG2 >= 7
    if (threadIdx.x < 64)
    {
        reductor[threadIdx.x] += reductor[threadIdx.x + 64];
    }
    __syncthreads();
#endif
    if (threadIdx.x < 32)
    {
        // Sub-warp tail: no __syncthreads() between steps. This relies on
        // pre-Volta implicit warp-synchronous execution plus the volatile
        // declaration above. NOTE(review): on Volta+ (independent thread
        // scheduling) this pattern needs __syncwarp() between steps — confirm
        // target architectures before reuse.
#if NUM_THREADS_CLASSIFIERPARALLEL_LOG2 >= 6
        reductor[threadIdx.x] += reductor[threadIdx.x + 32];
#endif
#if NUM_THREADS_CLASSIFIERPARALLEL_LOG2 >= 5
        reductor[threadIdx.x] += reductor[threadIdx.x + 16];
#endif
        reductor[threadIdx.x] += reductor[threadIdx.x + 8];
        reductor[threadIdx.x] += reductor[threadIdx.x + 4];
        reductor[threadIdx.x] += reductor[threadIdx.x + 2];
        reductor[threadIdx.x] += reductor[threadIdx.x + 1];
    }
#endif
    // All threads wait until reductor[0] holds the final sum before reading it.
    __syncthreads();
    return reductor[0];
}
// Device-global output slot. NOTE(review): read/written outside this span —
// its exact protocol (who writes, when reset) is not visible here; verify
// against the kernels that use it.
__device__ Ncv32u d_outMaskPosition;
...
...
@@ -623,7 +565,14 @@ __global__ void applyHaarClassifierClassifierParallel(Ncv32u *d_IImg, Ncv32u IIm
curRootNodeOffset
+=
NUM_THREADS_CLASSIFIERPARALLEL
;
}
Ncv32f
finalStageSum
=
reduceSpecialization
(
curStageSum
);
struct
functorAddValues
{
__device__
void
reduce
(
Ncv32f
&
in1out
,
Ncv32f
&
in2
)
{
in1out
+=
in2
;
}
};
Ncv32f
finalStageSum
=
subReduce
<
Ncv32f
,
functorAddValues
,
NUM_THREADS_CLASSIFIERPARALLEL
>
(
curStageSum
);
if
(
finalStageSum
<
stageThreshold
)
{
...
...
modules/gpu/src/nvidia/core/NCVAlg.hpp
0 → 100644
浏览文件 @
3396a2ec
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2009-2010, NVIDIA Corporation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef _ncv_alg_hpp_
#define _ncv_alg_hpp_
#include "NCV.hpp"
/// Integer ceiling division: smallest q such that q * b >= a.
/// Intended for non-negative operands (Ncv32u and friends); b must be non-zero.
template <typename T>
static T divUp(T a, T b)
{
    // Bias the numerator so any non-zero remainder bumps the quotient up by one.
    const T biased = a + b - T(1);
    return biased / b;
}
// Generic block-wide reduction with functor semantics (replacement for the
// hard-coded float-sum specialization).
//
// Tdata:      element type held in shared memory.
// Tfunc:      default-constructible functor exposing
//             reduce(Tdata &in1out, Tdata &in2) that folds in2 into in1out.
// nThreads:   block size; also the shared-memory array length.
//             NOTE(review): the unconditional +8..+1 steps below imply nThreads
//             is assumed to be a power of two >= 16 — confirm at call sites.
// threadElem: this thread's element.
// returns:    the reduction over all nThreads elements (same value in every
//             thread, read from reduceArr[0] after the final barrier).
//
// Must be called by ALL threads of the block (contains __syncthreads()).
template <typename Tdata, class Tfunc, Ncv32u nThreads>
static __device__ Tdata subReduce(Tdata threadElem)
{
    Tfunc functor;
    // NOTE(review): unlike the float specialization this header replaces, this
    // array is NOT volatile, and the sub-warp steps below have no barriers.
    // Correctness therefore rests on implicit warp-synchronous execution
    // (pre-Volta assumption) AND on the compiler not caching shared-memory
    // reads in registers across steps — consider __syncwarp() on newer archs.
    __shared__ Tdata reduceArr[nThreads];
    reduceArr[threadIdx.x] = threadElem;
    __syncthreads();
    // nThreads is a compile-time constant, so dead stages are compiled out.
    if (nThreads >= 256 && threadIdx.x < 128)
    {
        functor.reduce(reduceArr[threadIdx.x], reduceArr[threadIdx.x + 128]);
    }
    __syncthreads();
    if (nThreads >= 128 && threadIdx.x < 64)
    {
        functor.reduce(reduceArr[threadIdx.x], reduceArr[threadIdx.x + 64]);
    }
    __syncthreads();
    if (threadIdx.x < 32)
    {
        // Sub-warp tail: all remaining participants are in warp 0; no
        // __syncthreads() between steps (see NOTE above).
        if (nThreads >= 64)
        {
            functor.reduce(reduceArr[threadIdx.x], reduceArr[threadIdx.x + 32]);
        }
        if (nThreads >= 32)
        {
            functor.reduce(reduceArr[threadIdx.x], reduceArr[threadIdx.x + 16]);
        }
        functor.reduce(reduceArr[threadIdx.x], reduceArr[threadIdx.x + 8]);
        functor.reduce(reduceArr[threadIdx.x], reduceArr[threadIdx.x + 4]);
        functor.reduce(reduceArr[threadIdx.x], reduceArr[threadIdx.x + 2]);
        functor.reduce(reduceArr[threadIdx.x], reduceArr[threadIdx.x + 1]);
    }
    // Ensure reduceArr[0] is final before any thread reads it.
    __syncthreads();
    return reduceArr[0];
}
#endif //_ncv_alg_hpp_
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录