Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
Greenplum
Opencv
提交
4988e131
O
Opencv
项目概览
Greenplum
/
Opencv
11 个月 前同步成功
通知
7
Star
0
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
O
Opencv
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
提交
4988e131
编写于
7月 04, 2020
作者:
Y
YashasSamaga
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
transfer output blobs in background
上级
657c8d1c
变更
3
隐藏空白更改
内联
并排
Showing
3 changed file
with
95 addition
and
10 deletion
+95
-10
modules/dnn/src/cuda4dnn/csl/event.hpp
modules/dnn/src/cuda4dnn/csl/event.hpp
+5
-3
modules/dnn/src/dnn.cpp
modules/dnn/src/dnn.cpp
+30
-5
modules/dnn/src/op_cuda.hpp
modules/dnn/src/op_cuda.hpp
+60
-2
未找到文件。
modules/dnn/src/cuda4dnn/csl/event.hpp
浏览文件 @
4988e131
...
...
@@ -33,7 +33,7 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl {
/** if \p create is `true`, a new event will be created; otherwise, an empty event object is created */
Event
(
bool
create
,
bool
timing_event
=
false
)
:
event
{
nullptr
}
{
if
(
create
)
{
unsigned
int
flags
=
cudaEventBlockingSync
|
(
timing_event
?
0
:
cudaEventDisableTiming
);
unsigned
int
flags
=
(
timing_event
?
0
:
cudaEventDisableTiming
);
CUDA4DNN_CHECK_CUDA
(
cudaEventCreateWithFlags
(
&
event
,
flags
));
}
}
...
...
@@ -60,6 +60,7 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl {
/** mark a point in \p stream */
void
record
(
const
Stream
&
stream
)
{
CV_Assert
(
stream
);
CUDA4DNN_CHECK_CUDA
(
cudaEventRecord
(
event
,
stream
.
get
()));
}
...
...
@@ -85,12 +86,13 @@ namespace cv { namespace dnn { namespace cuda4dnn { namespace csl {
};
/** makes a stream wait on an event */
void
StreamWaitOnEvent
(
const
Stream
&
stream
,
const
Event
&
event
)
{
inline
void
StreamWaitOnEvent
(
const
Stream
&
stream
,
const
Event
&
event
)
{
CV_Assert
(
stream
);
CUDA4DNN_CHECK_CUDA
(
cudaStreamWaitEvent
(
stream
.
get
(),
event
.
get
(),
0
));
}
/** returns the time elapsed between two events in milliseconds */
float
TimeElapsedBetweenEvents
(
const
Event
&
start
,
const
Event
&
end
)
{
inline
float
TimeElapsedBetweenEvents
(
const
Event
&
start
,
const
Event
&
end
)
{
float
temp
;
CUDA4DNN_CHECK_CUDA
(
cudaEventElapsedTime
(
&
temp
,
start
.
get
(),
end
.
get
()));
return
temp
;
...
...
modules/dnn/src/dnn.cpp
浏览文件 @
4988e131
...
...
@@ -585,6 +585,13 @@ struct LayerData
std
::
vector
<
Ptr
<
BackendWrapper
>
>
inputBlobsWrappers
;
std
::
vector
<
Ptr
<
BackendWrapper
>
>
internalBlobsWrappers
;
#ifdef HAVE_CUDA
/* output ids which must be transferred to the host in the background
* after the completion of the forward pass of the layer
*/
std
::
vector
<
int
>
cudaD2HBackgroundTransfers
;
#endif
Ptr
<
Layer
>
layerInstance
;
std
::
vector
<
Mat
>
outputBlobs
;
std
::
vector
<
Mat
*>
inputBlobs
;
...
...
@@ -1187,7 +1194,8 @@ struct Net::Impl : public detail::NetImplBase
context
.
cublas_handle
=
cuda4dnn
::
csl
::
cublas
::
Handle
(
context
.
stream
);
context
.
cudnn_handle
=
cuda4dnn
::
csl
::
cudnn
::
Handle
(
context
.
stream
);
cudaInfo
=
std
::
unique_ptr
<
CudaInfo_t
>
(
new
CudaInfo_t
(
std
::
move
(
context
)));
auto
d2h_stream
=
cuda4dnn
::
csl
::
Stream
(
true
);
// stream for background D2H data transfers
cudaInfo
=
std
::
unique_ptr
<
CudaInfo_t
>
(
new
CudaInfo_t
(
std
::
move
(
context
),
std
::
move
(
d2h_stream
)));
}
#endif
}
...
...
@@ -1215,8 +1223,10 @@ struct Net::Impl : public detail::NetImplBase
#ifdef HAVE_CUDA
struct
CudaInfo_t
{
CudaInfo_t
(
cuda4dnn
::
csl
::
CSLContext
ctxt
)
:
context
(
std
::
move
(
ctxt
))
{
}
CudaInfo_t
(
cuda4dnn
::
csl
::
CSLContext
ctxt
,
cuda4dnn
::
csl
::
Stream
d2h_stream_
)
:
context
(
std
::
move
(
ctxt
)),
d2h_stream
(
std
::
move
(
d2h_stream_
))
{
}
cuda4dnn
::
csl
::
CSLContext
context
;
cuda4dnn
::
csl
::
Stream
d2h_stream
;
cuda4dnn
::
csl
::
Workspace
workspace
;
};
...
...
@@ -1290,7 +1300,7 @@ struct Net::Impl : public detail::NetImplBase
if
(
preferableBackend
==
DNN_BACKEND_CUDA
)
{
auto
cudaWrapper
=
wrapper
.
dynamicCast
<
CUDABackendWrapper
>
();
cudaWrapper
->
setStream
(
cudaInfo
->
context
.
stream
);
cudaWrapper
->
setStream
(
cudaInfo
->
context
.
stream
,
cudaInfo
->
d2h_stream
);
}
#endif
backendWrappers
[
data
]
=
wrapper
;
...
...
@@ -1630,7 +1640,7 @@ struct Net::Impl : public detail::NetImplBase
else
if
(
preferableBackend
==
DNN_BACKEND_VKCOM
)
initVkComBackend
();
else
if
(
preferableBackend
==
DNN_BACKEND_CUDA
)
initCUDABackend
();
initCUDABackend
(
blobsToKeep_
);
else
CV_Error
(
Error
::
StsNotImplemented
,
"Unknown backend identifier"
);
}
...
...
@@ -2360,7 +2370,7 @@ struct Net::Impl : public detail::NetImplBase
#endif
}
void
initCUDABackend
()
{
void
initCUDABackend
(
const
std
::
vector
<
LayerPin
>&
blobsToKeep_
)
{
CV_Assert
(
haveCUDA
());
#ifdef HAVE_CUDA
...
...
@@ -2386,6 +2396,15 @@ struct Net::Impl : public detail::NetImplBase
auto
cudaNode
=
node
.
dynamicCast
<
CUDABackendNode
>
();
cudaInfo
->
workspace
.
require
(
cudaNode
->
get_workspace_memory_in_bytes
());
}
if
(
blobsToKeep_
.
size
()
>
1
)
{
for
(
const
auto
&
pin
:
blobsToKeep_
)
{
LayerData
&
ld
=
layers
[
pin
.
lid
];
ld
.
cudaD2HBackgroundTransfers
.
push_back
(
pin
.
oid
);
}
}
#endif
}
...
...
@@ -3120,6 +3139,12 @@ struct Net::Impl : public detail::NetImplBase
CV_Assert
(
!
cudaNode
.
empty
());
cudaNode
->
forward
(
ld
.
inputBlobsWrappers
,
ld
.
outputBlobsWrappers
,
cudaInfo
->
workspace
);
for
(
auto
id
:
ld
.
cudaD2HBackgroundTransfers
)
{
auto
wrapper
=
ld
.
outputBlobsWrappers
[
id
].
dynamicCast
<
CUDABackendWrapper
>
();
wrapper
->
copyToHostInBackground
();
}
#endif
}
else
if
(
preferableBackend
==
DNN_BACKEND_HALIDE
)
...
...
modules/dnn/src/op_cuda.hpp
浏览文件 @
4988e131
...
...
@@ -7,6 +7,7 @@
#ifdef HAVE_CUDA
#include "cuda4dnn/csl/stream.hpp"
#include "cuda4dnn/csl/event.hpp"
#include "cuda4dnn/csl/cublas.hpp"
#include "cuda4dnn/csl/cudnn.hpp"
#include "cuda4dnn/csl/tensor.hpp"
...
...
@@ -206,6 +207,7 @@ namespace cv { namespace dnn {
virtual
~
CUDABackendWrapper
()
{
}
void
copyToHost
()
override
=
0
;
virtual
void
copyToHostInBackground
()
=
0
;
void
setHostDirty
()
override
=
0
;
virtual
void
copyToDevice
()
=
0
;
...
...
@@ -215,7 +217,7 @@ namespace cv { namespace dnn {
virtual
std
::
size_t
getRank
()
const
noexcept
=
0
;
/** @note setting the stream updates the stream for all wrappers which use the same tensor */
virtual
void
setStream
(
cuda4dnn
::
csl
::
Stream
stream
)
noexcept
=
0
;
virtual
void
setStream
(
cuda4dnn
::
csl
::
Stream
stream
,
cuda4dnn
::
csl
::
Stream
h2d_stream
)
noexcept
=
0
;
virtual
void
update
(
const
MatShape
&
shape
,
std
::
size_t
offset
)
=
0
;
};
...
...
@@ -240,6 +242,36 @@ namespace cv { namespace dnn {
cuda4dnn
::
csl
::
memcpy
<
float
>
(
reinterpret_cast
<
float
*>
(
mat
.
data
),
view
.
data
(),
view
.
size
(),
stream
);
}
template
<
class
U
>
void
convert_D2H_background
(
const
cv
::
Mat
&
mat
,
cuda4dnn
::
csl
::
View
<
U
>
view
,
cuda4dnn
::
csl
::
ManagedPtr
<
float
>&
device_temp
,
const
cuda4dnn
::
csl
::
Stream
&
stream
,
const
cuda4dnn
::
csl
::
Stream
&
d2h_stream
,
cuda4dnn
::
csl
::
Event
&
d2h_event
);
template
<
>
inline
void
convert_D2H_background
<
half
>
(
const
cv
::
Mat
&
mat
,
cuda4dnn
::
csl
::
View
<
half
>
view
,
cuda4dnn
::
csl
::
ManagedPtr
<
float
>&
device_temp
,
const
cuda4dnn
::
csl
::
Stream
&
stream
,
const
cuda4dnn
::
csl
::
Stream
&
d2h_stream
,
cuda4dnn
::
csl
::
Event
&
d2h_event
)
{
if
(
device_temp
.
size
()
<
view
.
size
())
device_temp
.
reset
(
view
.
size
());
auto
temp_span
=
cuda4dnn
::
csl
::
Span
<
float
>
(
device_temp
.
get
(),
view
.
size
());
/* The conversion kernel should can be executed in the background stream for better
* performance. We do it in the inference stream to prevent an unexplained performance
* regression on RTX 2080 Ti. Executing conversion kernel in the background stream causes
* everything to slow down (even operations that appear before the background transfer).
*
* TODO: identify the cause and move conversion kernel to the background stream
*/
cuda4dnn
::
kernels
::
fp16_to_fp32
(
stream
,
temp_span
,
view
);
d2h_event
.
record
(
stream
);
// mark position in inference stream
cuda4dnn
::
csl
::
StreamWaitOnEvent
(
d2h_stream
,
d2h_event
);
// don't start transfer until data is available
cuda4dnn
::
csl
::
memcpy
<
float
>
(
reinterpret_cast
<
float
*>
(
mat
.
data
),
temp_span
.
data
(),
view
.
size
(),
d2h_stream
);
}
template
<
>
inline
void
convert_D2H_background
<
float
>
(
const
cv
::
Mat
&
mat
,
cuda4dnn
::
csl
::
View
<
float
>
view
,
cuda4dnn
::
csl
::
ManagedPtr
<
float
>&
device_temp
,
const
cuda4dnn
::
csl
::
Stream
&
stream
,
const
cuda4dnn
::
csl
::
Stream
&
d2h_stream
,
cuda4dnn
::
csl
::
Event
&
d2h_event
)
{
d2h_event
.
record
(
stream
);
cuda4dnn
::
csl
::
StreamWaitOnEvent
(
d2h_stream
,
d2h_event
);
cuda4dnn
::
csl
::
memcpy
<
float
>
(
reinterpret_cast
<
float
*>
(
mat
.
data
),
view
.
data
(),
view
.
size
(),
d2h_stream
);
}
template
<
class
U
>
void
convert_H2D
(
cuda4dnn
::
csl
::
Span
<
U
>
span
,
const
cv
::
Mat
&
mat
,
cuda4dnn
::
csl
::
ManagedPtr
<
float
>&
device_temp
,
const
cuda4dnn
::
csl
::
Stream
&
stream
);
...
...
@@ -349,6 +381,28 @@ namespace cv { namespace dnn {
cuda4dnn
::
detail
::
convert_D2H
<
T
>
(
mat
,
view
,
shared_block
->
device_temp
,
shared_block
->
stream
);
shared_block
->
stream
.
synchronize
();
}
else
if
(
shared_block
->
d2h_event
&&
shared_block
->
d2h_event
.
busy
())
{
/* wait for the background copy to finish */
shared_block
->
d2h_event
.
synchronize
();
}
}
void
copyToHostInBackground
()
override
{
CV_Assert
(
shared_block
->
d2h_stream
);
if
(
shared_block
->
device_dirty
)
{
shared_block
->
host_dirty
=
false
;
shared_block
->
device_dirty
=
false
;
auto
view
=
tensor_view_type
(
shared_block
->
device
.
get
(),
std
::
begin
(
shape
),
std
::
end
(
shape
));
auto
&
mat
=
shared_block
->
host
;
CV_Assert
(
mat
.
isContinuous
());
CV_Assert
(
mat
.
type
()
==
CV_32F
);
if
(
!
shared_block
->
d2h_event
)
shared_block
->
d2h_event
=
cuda4dnn
::
csl
::
Event
(
true
);
cuda4dnn
::
detail
::
convert_D2H_background
<
T
>
(
mat
,
view
,
shared_block
->
device_temp
,
shared_block
->
stream
,
shared_block
->
d2h_stream
,
shared_block
->
d2h_event
);
shared_block
->
d2h_event
.
record
(
shared_block
->
d2h_stream
);
// record position so that we can check status later
}
}
...
...
@@ -383,8 +437,9 @@ namespace cv { namespace dnn {
std
::
size_t
getRank
()
const
noexcept
override
{
return
shape
.
size
();
}
void
setStream
(
cuda4dnn
::
csl
::
Stream
stream
)
noexcept
override
{
void
setStream
(
cuda4dnn
::
csl
::
Stream
stream
,
cuda4dnn
::
csl
::
Stream
d2h_stream
)
noexcept
override
{
shared_block
->
stream
=
std
::
move
(
stream
);
shared_block
->
d2h_stream
=
std
::
move
(
d2h_stream
);
}
void
update
(
const
MatShape
&
shape_
,
std
::
size_t
offset_
)
override
{
...
...
@@ -452,6 +507,9 @@ namespace cv { namespace dnn {
cuda4dnn
::
csl
::
ManagedPtr
<
T
>
device
;
cuda4dnn
::
csl
::
ManagedPtr
<
float
>
device_temp
;
/* use for conversions */
cuda4dnn
::
csl
::
Stream
stream
;
cuda4dnn
::
csl
::
Event
d2h_event
;
cuda4dnn
::
csl
::
Stream
d2h_stream
;
};
std
::
shared_ptr
<
shared_block_type
>
shared_block
;
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录