Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
Crayon鑫
Paddle
提交
83dc6898
P
Paddle
项目概览
Crayon鑫
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
83dc6898
编写于
10月 22, 2018
作者:
T
tensor-tang
浏览文件
操作
浏览文件
下载
差异文件
Merge remote-tracking branch 'ups/develop' into refine/jit/gru
test=develop
上级
640e789d
5d6783f8
变更
13
显示空白变更内容
内联
并排
Showing
13 changed file
with
280 addition
and
208 deletion
+280
-208
cmake/generic.cmake
cmake/generic.cmake
+7
-0
paddle/fluid/framework/details/var_handle.h
paddle/fluid/framework/details/var_handle.h
+2
-0
paddle/fluid/framework/parallel_executor.cc
paddle/fluid/framework/parallel_executor.cc
+6
-0
paddle/fluid/operators/detection/generate_proposals_op.cc
paddle/fluid/operators/detection/generate_proposals_op.cc
+131
-119
paddle/fluid/operators/detection/generate_proposals_op.cu
paddle/fluid/operators/detection/generate_proposals_op.cu
+90
-76
paddle/fluid/operators/distributed/grpc_client.cc
paddle/fluid/operators/distributed/grpc_client.cc
+7
-7
paddle/fluid/operators/distributed/grpc_serde.cc
paddle/fluid/operators/distributed/grpc_serde.cc
+2
-2
paddle/fluid/operators/gather.h
paddle/fluid/operators/gather.h
+2
-4
paddle/fluid/platform/device_context.cc
paddle/fluid/platform/device_context.cc
+10
-0
paddle/fluid/platform/device_context.h
paddle/fluid/platform/device_context.h
+3
-0
paddle/fluid/platform/profiler.cc
paddle/fluid/platform/profiler.cc
+9
-0
paddle/fluid/platform/profiler.h
paddle/fluid/platform/profiler.h
+10
-0
python/paddle/fluid/__init__.py
python/paddle/fluid/__init__.py
+1
-0
未找到文件。
cmake/generic.cmake
浏览文件 @
83dc6898
...
...
@@ -261,6 +261,13 @@ function(cc_library TARGET_NAME)
add_dependencies
(
${
TARGET_NAME
}
mklml
)
target_link_libraries
(
${
TARGET_NAME
}
"-L
${
MKLML_LIB_DIR
}
-liomp5 -Wl,--as-needed"
)
endif
()
# remove link to python, see notes at:
# https://github.com/pybind/pybind11/blob/master/docs/compiling.rst#building-manually
if
(
"
${
cc_library_DEPS
}
;"
MATCHES
"python;"
)
list
(
REMOVE_ITEM cc_library_DEPS python
)
add_dependencies
(
${
TARGET_NAME
}
python
)
target_link_libraries
(
${
TARGET_NAME
}
"-Wl,-undefined,dynamic_lookup"
)
endif
()
target_link_libraries
(
${
TARGET_NAME
}
${
cc_library_DEPS
}
)
add_dependencies
(
${
TARGET_NAME
}
${
cc_library_DEPS
}
)
endif
()
...
...
paddle/fluid/framework/details/var_handle.h
浏览文件 @
83dc6898
...
...
@@ -49,6 +49,8 @@ struct VarHandleBase {
void
AddOutput
(
OpHandleBase
*
out
,
ir
::
Node
*
node
)
{
if
(
pending_ops_
.
find
(
out
)
==
pending_ops_
.
end
())
{
PADDLE_ENFORCE
(
out
!=
nullptr
,
"The output of %s should not be nullptr"
,
this
->
Node
()
->
Name
());
pending_ops_
.
insert
(
out
);
node_
->
outputs
.
push_back
(
node
);
}
...
...
paddle/fluid/framework/parallel_executor.cc
浏览文件 @
83dc6898
...
...
@@ -299,6 +299,12 @@ void ParallelExecutor::FeedAndSplitTensorIntoLocalScopes(
}
ParallelExecutor
::~
ParallelExecutor
()
{
const
auto
dev_ctxs
=
platform
::
DeviceContextPool
::
Instance
().
GetAllDeviceContexts
();
for
(
auto
&
dev_ctx
:
dev_ctxs
)
{
dev_ctx
->
Wait
();
}
if
(
member_
->
own_local_scope_
)
{
for
(
size_t
i
=
1
;
i
<
member_
->
local_scopes_
.
size
();
++
i
)
{
Scope
*
local_scope
=
member_
->
local_scopes_
[
i
];
...
...
paddle/fluid/operators/detection/generate_proposals_op.cc
浏览文件 @
83dc6898
...
...
@@ -12,10 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <cmath>
#include <cstring>
#include <string>
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/
framework/var_type
.h"
#include "paddle/fluid/
operators/detail/safe_ref
.h"
#include "paddle/fluid/operators/gather.h"
#include "paddle/fluid/operators/math/math_function.h"
...
...
@@ -25,21 +27,17 @@ namespace operators {
using
Tensor
=
framework
::
Tensor
;
using
LoDTensor
=
framework
::
LoDTensor
;
struct
AppendProposalsFunctor
{
LoDTensor
*
out_
;
int64_t
offset_
;
Tensor
*
to_add_
;
static
const
double
kBBoxClipDefault
=
std
::
log
(
1000.0
/
16.0
);
AppendProposalsFunctor
(
LoDTensor
*
out
,
int64_t
offset
,
Tensor
*
to_add
)
:
out_
(
out
),
offset_
(
offset
),
to_add_
(
to_add
)
{}
template
<
typename
T
>
void
apply
()
const
{
auto
*
out_data
=
out_
->
data
<
T
>
();
auto
*
to_add_data
=
to_add_
->
data
<
T
>
();
memcpy
(
out_data
+
offset_
,
to_add_data
,
to_add_
->
numel
()
*
sizeof
(
T
));
}
};
static
void
AppendProposals
(
Tensor
*
dst
,
int64_t
offset
,
const
Tensor
&
src
)
{
auto
*
out_data
=
dst
->
data
<
void
>
();
auto
*
to_add_data
=
src
.
data
<
void
>
();
size_t
size_of_t
=
framework
::
SizeOfType
(
src
.
type
());
offset
*=
size_of_t
;
std
::
memcpy
(
reinterpret_cast
<
void
*>
(
reinterpret_cast
<
uintptr_t
>
(
out_data
)
+
offset
),
to_add_data
,
src
.
numel
()
*
size_of_t
);
}
class
GenerateProposalsOp
:
public
framework
::
OperatorWithKernel
{
public:
...
...
@@ -75,8 +73,9 @@ class GenerateProposalsOp : public framework::OperatorWithKernel {
};
template
<
class
T
>
void
BoxCoder
(
const
platform
::
DeviceContext
&
ctx
,
Tensor
*
all_anchors
,
Tensor
*
bbox_deltas
,
Tensor
*
variances
,
Tensor
*
proposals
)
{
static
inline
void
BoxCoder
(
const
platform
::
DeviceContext
&
ctx
,
Tensor
*
all_anchors
,
Tensor
*
bbox_deltas
,
Tensor
*
variances
,
Tensor
*
proposals
)
{
T
*
proposals_data
=
proposals
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
int64_t
row
=
all_anchors
->
dims
()[
0
];
...
...
@@ -108,11 +107,11 @@ void BoxCoder(const platform::DeviceContext &ctx, Tensor *all_anchors,
anchor_center_y
;
bbox_width
=
std
::
exp
(
std
::
min
<
T
>
(
variances_data
[
i
*
len
+
2
]
*
bbox_deltas_data
[
i
*
len
+
2
],
std
::
log
(
1000.0
/
16.0
)
))
*
kBBoxClipDefault
))
*
anchor_width
;
bbox_height
=
std
::
exp
(
std
::
min
<
T
>
(
variances_data
[
i
*
len
+
3
]
*
bbox_deltas_data
[
i
*
len
+
3
],
std
::
log
(
1000.0
/
16.0
)
))
*
kBBoxClipDefault
))
*
anchor_height
;
}
else
{
bbox_center_x
=
...
...
@@ -120,10 +119,10 @@ void BoxCoder(const platform::DeviceContext &ctx, Tensor *all_anchors,
bbox_center_y
=
bbox_deltas_data
[
i
*
len
+
1
]
*
anchor_height
+
anchor_center_y
;
bbox_width
=
std
::
exp
(
std
::
min
<
T
>
(
bbox_deltas_data
[
i
*
len
+
2
],
std
::
log
(
1000.0
/
16.0
)
))
*
kBBoxClipDefault
))
*
anchor_width
;
bbox_height
=
std
::
exp
(
std
::
min
<
T
>
(
bbox_deltas_data
[
i
*
len
+
3
],
std
::
log
(
1000.0
/
16.0
)
))
*
kBBoxClipDefault
))
*
anchor_height
;
}
...
...
@@ -136,30 +135,32 @@ void BoxCoder(const platform::DeviceContext &ctx, Tensor *all_anchors,
}
template
<
class
T
>
void
ClipTiledBoxes
(
const
platform
::
DeviceContext
&
ctx
,
const
Tensor
&
im_info
,
Tensor
*
boxes
)
{
static
inline
void
ClipTiledBoxes
(
const
platform
::
DeviceContext
&
ctx
,
const
Tensor
&
im_info
,
Tensor
*
boxes
)
{
T
*
boxes_data
=
boxes
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
const
T
*
im_info_data
=
im_info
.
data
<
T
>
();
T
zero
(
0
);
for
(
int64_t
i
=
0
;
i
<
boxes
->
numel
();
++
i
)
{
if
(
i
%
4
==
0
)
{
boxes_data
[
i
]
=
std
::
max
(
std
::
min
(
boxes_data
[
i
],
im_info_data
[
1
]
-
1
),
0.0
f
);
std
::
max
(
std
::
min
(
boxes_data
[
i
],
im_info_data
[
1
]
-
1
),
zero
);
}
else
if
(
i
%
4
==
1
)
{
boxes_data
[
i
]
=
std
::
max
(
std
::
min
(
boxes_data
[
i
],
im_info_data
[
0
]
-
1
),
0.0
f
);
std
::
max
(
std
::
min
(
boxes_data
[
i
],
im_info_data
[
0
]
-
1
),
zero
);
}
else
if
(
i
%
4
==
2
)
{
boxes_data
[
i
]
=
std
::
max
(
std
::
min
(
boxes_data
[
i
],
im_info_data
[
1
]
-
1
),
0.0
f
);
std
::
max
(
std
::
min
(
boxes_data
[
i
],
im_info_data
[
1
]
-
1
),
zero
);
}
else
{
boxes_data
[
i
]
=
std
::
max
(
std
::
min
(
boxes_data
[
i
],
im_info_data
[
0
]
-
1
),
0.0
f
);
std
::
max
(
std
::
min
(
boxes_data
[
i
],
im_info_data
[
0
]
-
1
),
zero
);
}
}
}
template
<
class
T
>
void
FilterBoxes
(
const
platform
::
DeviceContext
&
ctx
,
Tensor
*
boxes
,
float
min_size
,
const
Tensor
&
im_info
,
Tensor
*
keep
)
{
static
inline
void
FilterBoxes
(
const
platform
::
DeviceContext
&
ctx
,
Tensor
*
boxes
,
float
min_size
,
const
Tensor
&
im_info
,
Tensor
*
keep
)
{
const
T
*
im_info_data
=
im_info
.
data
<
T
>
();
T
*
boxes_data
=
boxes
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
T
im_scale
=
im_info_data
[
2
];
...
...
@@ -185,24 +186,24 @@ void FilterBoxes(const platform::DeviceContext &ctx, Tensor *boxes,
keep
->
Resize
({
keep_len
});
}
bool
SortScorePairDescend
(
const
std
::
pair
<
float
,
int
>
&
pair1
,
const
std
::
pair
<
float
,
int
>
&
pair2
)
{
return
pair1
.
first
>
pair2
.
first
;
}
template
<
class
T
>
void
GetMaxScoreIndex
(
const
std
::
vector
<
T
>
&
scores
,
std
::
vector
<
std
::
pair
<
T
,
int
>>
*
sorted_indices
)
{
static
inline
std
::
vector
<
std
::
pair
<
T
,
int
>>
GetSortedScoreIndex
(
const
std
::
vector
<
T
>
&
scores
)
{
std
::
vector
<
std
::
pair
<
T
,
int
>>
sorted_indices
;
sorted_indices
.
reserve
(
scores
.
size
());
for
(
size_t
i
=
0
;
i
<
scores
.
size
();
++
i
)
{
sorted_indices
->
push_back
(
std
::
make_pair
(
scores
[
i
],
i
)
);
sorted_indices
.
emplace_back
(
scores
[
i
],
i
);
}
// Sort the score pair according to the scores in descending order
std
::
stable_sort
(
sorted_indices
->
begin
(),
sorted_indices
->
end
(),
SortScorePairDescend
);
std
::
stable_sort
(
sorted_indices
.
begin
(),
sorted_indices
.
end
(),
[](
const
std
::
pair
<
T
,
int
>
&
a
,
const
std
::
pair
<
T
,
int
>
&
b
)
{
return
a
.
first
<
b
.
first
;
});
return
sorted_indices
;
}
template
<
class
T
>
T
BBoxArea
(
const
T
*
box
,
const
bool
normalized
)
{
static
inline
T
BBoxArea
(
const
T
*
box
,
bool
normalized
)
{
if
(
box
[
2
]
<
box
[
0
]
||
box
[
3
]
<
box
[
1
])
{
// If coordinate values are is invalid
// (e.g. xmax < xmin or ymax < ymin), return 0.
...
...
@@ -220,7 +221,7 @@ T BBoxArea(const T *box, const bool normalized) {
}
template
<
class
T
>
T
JaccardOverlap
(
const
T
*
box1
,
const
T
*
box2
,
const
bool
normalized
)
{
static
inline
T
JaccardOverlap
(
const
T
*
box1
,
const
T
*
box2
,
bool
normalized
)
{
if
(
box2
[
0
]
>
box1
[
2
]
||
box2
[
2
]
<
box1
[
0
]
||
box2
[
1
]
>
box1
[
3
]
||
box2
[
3
]
<
box1
[
1
])
{
return
static_cast
<
T
>
(
0.
);
...
...
@@ -229,8 +230,8 @@ T JaccardOverlap(const T *box1, const T *box2, const bool normalized) {
const
T
inter_ymin
=
std
::
max
(
box1
[
1
],
box2
[
1
]);
const
T
inter_xmax
=
std
::
min
(
box1
[
2
],
box2
[
2
]);
const
T
inter_ymax
=
std
::
min
(
box1
[
3
],
box2
[
3
]);
const
T
inter_w
=
std
::
max
(
0.0
f
,
inter_xmax
-
inter_xmin
+
1
);
const
T
inter_h
=
std
::
max
(
0.0
f
,
inter_ymax
-
inter_ymin
+
1
);
const
T
inter_w
=
std
::
max
(
T
(
0
)
,
inter_xmax
-
inter_xmin
+
1
);
const
T
inter_h
=
std
::
max
(
T
(
0
)
,
inter_ymax
-
inter_ymin
+
1
);
const
T
inter_area
=
inter_w
*
inter_h
;
const
T
bbox1_area
=
BBoxArea
<
T
>
(
box1
,
normalized
);
const
T
bbox2_area
=
BBoxArea
<
T
>
(
box2
,
normalized
);
...
...
@@ -238,9 +239,21 @@ T JaccardOverlap(const T *box1, const T *box2, const bool normalized) {
}
}
template
<
typename
T
>
static
inline
Tensor
VectorToTensor
(
const
std
::
vector
<
T
>
&
selected_indices
,
int
selected_num
)
{
Tensor
keep_nms
;
keep_nms
.
Resize
({
selected_num
});
auto
*
keep_data
=
keep_nms
.
mutable_data
<
T
>
(
platform
::
CPUPlace
());
for
(
int
i
=
0
;
i
<
selected_num
;
++
i
)
{
keep_data
[
i
]
=
selected_indices
[
i
];
}
return
keep_nms
;
}
template
<
class
T
>
Tensor
NMS
(
const
platform
::
DeviceContext
&
ctx
,
Tensor
*
bbox
,
Tensor
*
scores
,
const
T
nms_threshold
,
const
float
eta
)
{
static
inline
Tensor
NMS
(
const
platform
::
DeviceContext
&
ctx
,
Tensor
*
bbox
,
Tensor
*
scores
,
T
nms_threshold
,
float
eta
)
{
PADDLE_ENFORCE_NOT_NULL
(
bbox
);
int64_t
num_boxes
=
bbox
->
dims
()[
0
];
// 4: [xmin ymin xmax ymax]
...
...
@@ -248,20 +261,18 @@ Tensor NMS(const platform::DeviceContext &ctx, Tensor *bbox, Tensor *scores,
std
::
vector
<
T
>
scores_data
(
num_boxes
);
std
::
copy_n
(
scores
->
data
<
T
>
(),
num_boxes
,
scores_data
.
begin
());
std
::
vector
<
std
::
pair
<
T
,
int
>>
sorted_indices
;
GetMaxScoreIndex
<
T
>
(
scores_data
,
&
sorted_indices
);
std
::
vector
<
std
::
pair
<
T
,
int
>>
sorted_indices
=
GetSortedScoreIndex
<
T
>
(
scores_data
);
std
::
vector
<
int
>
selected_indices
;
int
selected_num
=
0
;
T
adaptive_threshold
=
nms_threshold
;
const
T
*
bbox_data
=
bbox
->
data
<
T
>
();
bool
flag
;
while
(
sorted_indices
.
size
()
!=
0
)
{
int
idx
=
sorted_indices
.
front
().
second
;
flag
=
true
;
for
(
size_t
k
=
0
;
k
<
selected_indices
.
size
();
++
k
)
{
int
idx
=
sorted_indices
.
back
().
second
;
bool
flag
=
true
;
for
(
int
kept_idx
:
selected_indices
)
{
if
(
flag
)
{
const
int
kept_idx
=
selected_indices
[
k
];
T
overlap
=
JaccardOverlap
<
T
>
(
bbox_data
+
idx
*
box_size
,
bbox_data
+
kept_idx
*
box_size
,
false
);
flag
=
(
overlap
<=
adaptive_threshold
);
...
...
@@ -271,32 +282,29 @@ Tensor NMS(const platform::DeviceContext &ctx, Tensor *bbox, Tensor *scores,
}
if
(
flag
)
{
selected_indices
.
push_back
(
idx
);
selected_num
++
;
++
selected_num
;
}
sorted_indices
.
erase
(
sorted_indices
.
begin
());
sorted_indices
.
erase
(
sorted_indices
.
end
());
if
(
flag
&&
eta
<
1
&&
adaptive_threshold
>
0.5
)
{
adaptive_threshold
*=
eta
;
}
}
Tensor
keep_nms
;
keep_nms
.
Resize
({
selected_num
});
int
*
keep_data
=
keep_nms
.
mutable_data
<
int
>
(
ctx
.
GetPlace
());
for
(
int
i
=
0
;
i
<
selected_num
;
++
i
)
{
keep_data
[
i
]
=
selected_indices
[
i
];
}
return
keep_nms
;
return
VectorToTensor
(
selected_indices
,
selected_num
);
}
template
<
typename
DeviceContext
,
typename
T
>
template
<
typename
T
>
class
GenerateProposalsKernel
:
public
framework
::
OpKernel
<
T
>
{
public:
void
Compute
(
const
framework
::
ExecutionContext
&
context
)
const
override
{
auto
*
scores
=
context
.
Input
<
Tensor
>
(
"Scores"
);
auto
*
bbox_deltas
=
context
.
Input
<
Tensor
>
(
"BboxDeltas"
);
auto
*
im_info
=
context
.
Input
<
Tensor
>
(
"ImInfo"
);
auto
*
anchors
=
context
.
Input
<
Tensor
>
(
"Anchors"
);
auto
*
variances
=
context
.
Input
<
Tensor
>
(
"Variances"
);
auto
anchors
=
detail
::
Ref
(
context
.
Input
<
Tensor
>
(
"Anchors"
),
"Cannot find input Anchors(%s) in scope"
,
context
.
Inputs
(
"Anchors"
)[
0
]);
auto
variances
=
detail
::
Ref
(
context
.
Input
<
Tensor
>
(
"Variances"
),
"Cannot find input Variances(%s) in scope"
,
context
.
Inputs
(
"Variances"
)[
0
]);
auto
*
rpn_rois
=
context
.
Output
<
LoDTensor
>
(
"RpnRois"
);
auto
*
rpn_roi_probs
=
context
.
Output
<
LoDTensor
>
(
"RpnRoiProbs"
);
...
...
@@ -307,15 +315,16 @@ class GenerateProposalsKernel : public framework::OpKernel<T> {
float
min_size
=
context
.
Attr
<
float
>
(
"min_size"
);
float
eta
=
context
.
Attr
<
float
>
(
"eta"
);
auto
&
dev_ctx
=
context
.
template
device_context
<
DeviceContext
>();
auto
&
dev_ctx
=
context
.
template
device_context
<
platform
::
CPUDeviceContext
>();
auto
scores_dim
=
scores
->
dims
();
auto
&
scores_dim
=
scores
->
dims
();
int64_t
num
=
scores_dim
[
0
];
int64_t
c_score
=
scores_dim
[
1
];
int64_t
h_score
=
scores_dim
[
2
];
int64_t
w_score
=
scores_dim
[
3
];
auto
bbox_dim
=
bbox_deltas
->
dims
();
auto
&
bbox_dim
=
bbox_deltas
->
dims
();
int64_t
c_bbox
=
bbox_dim
[
1
];
int64_t
h_bbox
=
bbox_dim
[
2
];
int64_t
w_bbox
=
bbox_dim
[
3
];
...
...
@@ -330,17 +339,17 @@ class GenerateProposalsKernel : public framework::OpKernel<T> {
scores_swap
.
mutable_data
<
T
>
({
num
,
h_score
,
w_score
,
c_score
},
dev_ctx
.
GetPlace
());
math
::
Transpose
<
DeviceContext
,
T
,
4
>
trans
;
math
::
Transpose
<
platform
::
CPU
DeviceContext
,
T
,
4
>
trans
;
std
::
vector
<
int
>
axis
=
{
0
,
2
,
3
,
1
};
trans
(
dev_ctx
,
*
bbox_deltas
,
&
bbox_deltas_swap
,
axis
);
trans
(
dev_ctx
,
*
scores
,
&
scores_swap
,
axis
);
framework
::
LoD
lod
;
std
::
vector
<
size_t
>
lod0
(
1
,
0
);
Tensor
*
anchor
=
const_cast
<
framework
::
Tensor
*>
(
anchors
)
;
anchor
->
Resize
({
anchors
->
numel
()
/
4
,
4
}
);
Tensor
*
var
=
const_cast
<
framework
::
Tensor
*>
(
variances
);
var
->
Resize
({
var
->
numel
()
/
4
,
4
});
lod
.
resize
(
1
);
auto
&
lod0
=
lod
[
0
]
;
lod0
.
push_back
(
0
);
anchors
.
Resize
({
anchors
.
numel
()
/
4
,
4
}
);
var
iances
.
Resize
({
variances
.
numel
()
/
4
,
4
});
int64_t
num_proposals
=
0
;
for
(
int64_t
i
=
0
;
i
<
num
;
++
i
)
{
...
...
@@ -352,24 +361,17 @@ class GenerateProposalsKernel : public framework::OpKernel<T> {
scores_slice
.
Resize
({
h_score
*
w_score
*
c_score
,
1
});
std
::
pair
<
Tensor
,
Tensor
>
tensor_pair
=
ProposalForOneImage
(
dev_ctx
,
im_info_slice
,
*
anchor
,
*
var
,
ProposalForOneImage
(
dev_ctx
,
im_info_slice
,
anchors
,
variances
,
bbox_deltas_slice
,
scores_slice
,
pre_nms_top_n
,
post_nms_top_n
,
nms_thresh
,
min_size
,
eta
);
Tensor
proposals
=
tensor_pair
.
first
;
Tensor
scores
=
tensor_pair
.
second
;
framework
::
VisitDataType
(
framework
::
ToDataType
(
rpn_rois
->
type
()),
AppendProposalsFunctor
(
rpn_rois
,
4
*
num_proposals
,
&
proposals
));
framework
::
VisitDataType
(
framework
::
ToDataType
(
rpn_roi_probs
->
type
()),
AppendProposalsFunctor
(
rpn_roi_probs
,
num_proposals
,
&
scores
));
Tensor
&
proposals
=
tensor_pair
.
first
;
Tensor
&
scores
=
tensor_pair
.
second
;
AppendProposals
(
rpn_rois
,
4
*
num_proposals
,
proposals
);
AppendProposals
(
rpn_roi_probs
,
num_proposals
,
scores
);
num_proposals
+=
proposals
.
dims
()[
0
];
lod0
.
emplace
_back
(
num_proposals
);
lod0
.
push
_back
(
num_proposals
);
}
lod
.
emplace_back
(
lod0
);
rpn_rois
->
set_lod
(
lod
);
rpn_roi_probs
->
set_lod
(
lod
);
rpn_rois
->
Resize
({
num_proposals
,
4
});
...
...
@@ -377,7 +379,7 @@ class GenerateProposalsKernel : public framework::OpKernel<T> {
}
std
::
pair
<
Tensor
,
Tensor
>
ProposalForOneImage
(
const
DeviceContext
&
ctx
,
const
Tensor
&
im_info_slice
,
const
platform
::
CPU
DeviceContext
&
ctx
,
const
Tensor
&
im_info_slice
,
const
Tensor
&
anchors
,
const
Tensor
&
variances
,
const
Tensor
&
bbox_deltas_slice
,
// [M, 4]
const
Tensor
&
scores_slice
,
// [N, 1]
...
...
@@ -392,8 +394,7 @@ class GenerateProposalsKernel : public framework::OpKernel<T> {
for
(
int
i
=
0
;
i
<
scores_slice
.
numel
();
++
i
)
{
index
[
i
]
=
i
;
}
std
::
function
<
bool
(
const
int64_t
&
,
const
int64_t
&
)
>
compare
=
[
scores_data
](
const
int64_t
&
i
,
const
int64_t
&
j
)
{
auto
compare
=
[
scores_data
](
const
int64_t
&
i
,
const
int64_t
&
j
)
{
return
scores_data
[
i
]
>
scores_data
[
j
];
};
...
...
@@ -452,33 +453,45 @@ class GenerateProposalsKernel : public framework::OpKernel<T> {
class
GenerateProposalsOpMaker
:
public
framework
::
OpProtoAndCheckerMaker
{
public:
void
Make
()
override
{
AddInput
(
"Scores"
,
"The scores of anchors should be foreground."
);
AddInput
(
"BboxDeltas"
,
"bbox_deltas."
);
AddInput
(
"ImInfo"
,
"Information for image reshape."
);
AddInput
(
"Anchors"
,
"All anchors."
);
AddInput
(
"Variances"
,
" variances"
);
AddOutput
(
"RpnRois"
,
"Anchors."
);
AddOutput
(
"RpnRoiProbs"
,
"Anchors."
);
AddAttr
<
int
>
(
"pre_nms_topN"
,
"pre_nms_topN"
);
AddAttr
<
int
>
(
"post_nms_topN"
,
"post_nms_topN"
);
AddAttr
<
float
>
(
"nms_thresh"
,
"nms_thres"
);
AddAttr
<
float
>
(
"min_size"
,
"min size"
);
AddInput
(
"Scores"
,
"(Tensor) The scores from conv is in shape (N, A, H, W), "
"N is batch size, A is number of anchors, "
"H and W are height and width of the feature map"
);
AddInput
(
"BboxDeltas"
,
"(Tensor) Bounding box deltas from conv is in "
"shape (N, 4*A, H, W)."
);
AddInput
(
"ImInfo"
,
"(Tensor) Information for image reshape is in shape (N, 3), "
"in format (height, width, scale)"
);
AddInput
(
"Anchors"
,
"(Tensor) Bounding box anchors from anchor_generator_op "
"is in shape (A, H, W, 4)."
);
AddInput
(
"Variances"
,
"(Tensor) Bounding box variances with same shape as `Anchors`."
);
AddOutput
(
"RpnRois"
,
"(LoDTensor), Output proposals with shape (rois_num, 4)."
);
AddOutput
(
"RpnRoiProbs"
,
"(LoDTensor) Scores of proposals with shape (rois_num, 1)."
);
AddAttr
<
int
>
(
"pre_nms_topN"
,
"Number of top scoring RPN proposals to keep before "
"applying NMS."
);
AddAttr
<
int
>
(
"post_nms_topN"
,
"Number of top scoring RPN proposals to keep after "
"applying NMS"
);
AddAttr
<
float
>
(
"nms_thresh"
,
"NMS threshold used on RPN proposals."
);
AddAttr
<
float
>
(
"min_size"
,
"Proposal height and width both need to be greater "
"than this min_size."
);
AddAttr
<
float
>
(
"eta"
,
"The parameter for adaptive NMS."
);
AddComment
(
R"DOC(
Generate Proposals OP
This operator proposes rois according to each box with their probability to be a foreground object and
the box can be calculated by anchors. Bbox_deltais and scores are the output of RPN. Final proposals
could be used to train detection net.
Scores is the probability for each box to be an object. In format of (N, A, H, W) where N is batch size, A is number
of anchors, H and W are height and width of the feature map.
BboxDeltas is the differece between predicted box locatoin and anchor location. In format of (N, 4*A, H, W)
This operator Generate bounding box proposals for Faster RCNN.
The propoasls are generated for a list of images based on image
score 'Scores', bounding box regression result 'BboxDeltas' as
well as predefined bounding box shapes 'anchors'. Greedy
non-maximum suppression is applied to generate the final bounding
boxes.
For generating proposals, this operator transposes and resizes scores and bbox_deltas in size of (H*W*A, 1) and (H*W*A, 4) and
calculate box locations as proposals candidates. Then clip boxes to image and remove predicted boxes with small area.
Finally, apply nms to get final proposals as output.
)DOC"
);
}
};
...
...
@@ -490,6 +503,5 @@ namespace ops = paddle::operators;
REGISTER_OPERATOR
(
generate_proposals
,
ops
::
GenerateProposalsOp
,
ops
::
GenerateProposalsOpMaker
,
paddle
::
framework
::
EmptyGradOpMaker
);
REGISTER_OP_CPU_KERNEL
(
generate_proposals
,
ops
::
GenerateProposalsKernel
<
paddle
::
platform
::
CPUDeviceContext
,
float
>
);
REGISTER_OP_CPU_KERNEL
(
generate_proposals
,
ops
::
GenerateProposalsKernel
<
float
>
,
ops
::
GenerateProposalsKernel
<
double
>
);
paddle/fluid/operators/detection/generate_proposals_op.cu
浏览文件 @
83dc6898
...
...
@@ -16,10 +16,13 @@ limitations under the License. */
#include <string>
#include <vector>
#include "cub/cub.cuh"
#include "paddle/fluid/framework/mixed_vector.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/memory/memory.h"
#include "paddle/fluid/operators/detail/safe_ref.h"
#include "paddle/fluid/operators/gather.cu.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/platform/for_range.h"
namespace
paddle
{
namespace
operators
{
...
...
@@ -36,36 +39,38 @@ namespace {
int
const
kThreadsPerBlock
=
sizeof
(
uint64_t
)
*
8
;
template
<
typename
T
>
__global__
void
RangeInitKernel
(
const
T
start
,
const
T
delta
,
const
int
size
,
T
*
out
)
{
CUDA_1D_KERNEL_LOOP
(
i
,
size
)
{
out
[
i
]
=
start
+
i
*
delta
;
}
}
static
const
double
kBBoxClipDefault
=
std
::
log
(
1000.0
/
16.0
);
struct
RangeInitFunctor
{
int
start_
;
int
delta_
;
int
*
out_
;
__device__
void
operator
()(
size_t
i
)
{
out_
[
i
]
=
start_
+
i
*
delta_
;
}
};
template
<
typename
T
>
void
SortDescending
(
const
platform
::
CUDADeviceContext
&
ctx
,
const
Tensor
&
value
,
Tensor
*
value_out
,
Tensor
*
index_out
)
{
int
num
=
value
.
numel
();
static
void
SortDescending
(
const
platform
::
CUDADeviceContext
&
ctx
,
const
Tensor
&
value
,
Tensor
*
value_out
,
Tensor
*
index_out
)
{
int
num
=
static_cast
<
int
>
(
value
.
numel
());
Tensor
index_in_t
;
int
*
idx_in
=
index_in_t
.
mutable_data
<
int
>
({
num
},
ctx
.
GetPlace
());
int
block
=
512
;
auto
stream
=
ctx
.
stream
(
);
RangeInitKernel
<<<
DIVUP
(
num
,
block
),
block
,
0
,
stream
>>>
(
0
,
1
,
num
,
idx_in
);
platform
::
ForRange
<
platform
::
CUDADeviceContext
>
for_range
(
ctx
,
num
)
;
for_range
(
RangeInitFunctor
{
0
,
1
,
idx_in
}
);
int
*
idx_out
=
index_out
->
mutable_data
<
int
>
({
num
},
ctx
.
GetPlace
());
const
T
*
keys_in
=
value
.
data
<
T
>
();
T
*
keys_out
=
value_out
->
mutable_data
<
T
>
({
num
},
ctx
.
GetPlace
());
// Determine temporary device storage requirements
void
*
d_temp_storage
=
NULL
;
size_t
temp_storage_bytes
=
0
;
cub
::
DeviceRadixSort
::
SortPairsDescending
<
T
,
int
>
(
d_temp_storage
,
temp_storage_bytes
,
keys_in
,
keys_out
,
idx_in
,
idx_out
,
num
);
nullptr
,
temp_storage_bytes
,
keys_in
,
keys_out
,
idx_in
,
idx_out
,
num
);
// Allocate temporary storage
auto
place
=
boost
::
get
<
platform
::
CUDAPlace
>
(
ctx
.
GetPlace
());
d_temp_storage
=
memory
::
Alloc
(
place
,
temp_storage_bytes
);
void
*
d_temp_storage
=
memory
::
Alloc
(
place
,
temp_storage_bytes
);
// Run sorting operation
cub
::
DeviceRadixSort
::
SortPairsDescending
<
T
,
int
>
(
...
...
@@ -76,22 +81,27 @@ void SortDescending(const platform::CUDADeviceContext &ctx, const Tensor &value,
}
template
<
typename
T
>
__device__
__forceinline__
T
Min
(
T
x
,
T
y
)
{
return
x
<
y
?
x
:
y
;
}
template
<
typename
T
>
__device__
__forceinline__
T
Max
(
T
x
,
T
y
)
{
return
x
>
y
?
x
:
y
;
}
template
<
typename
T
>
__global__
void
BoxDecodeAndClipKernel
(
const
T
*
anchor
,
const
T
*
deltas
,
const
T
*
var
,
const
int
*
index
,
const
T
*
im_info
,
const
int
num
,
T
*
proposals
)
{
T
kBBoxClipDefault
=
log
(
1000.0
/
16.0
);
CUDA_1D_KERNEL_LOOP
(
i
,
num
)
{
struct
BoxDecodeAndClipFunctor
{
const
T
*
anchor
;
const
T
*
deltas
;
const
T
*
var
;
const
int
*
index
;
const
T
*
im_info
;
T
*
proposals
;
BoxDecodeAndClipFunctor
(
const
T
*
anchor
,
const
T
*
deltas
,
const
T
*
var
,
const
int
*
index
,
const
T
*
im_info
,
T
*
proposals
)
:
anchor
(
anchor
),
deltas
(
deltas
),
var
(
var
),
index
(
index
),
im_info
(
im_info
),
proposals
(
proposals
)
{}
T
bbox_clip_default
{
static_cast
<
T
>
(
kBBoxClipDefault
)};
__device__
void
operator
()(
size_t
i
)
{
int
k
=
index
[
i
]
*
4
;
T
axmin
=
anchor
[
k
];
T
aymin
=
anchor
[
k
+
1
];
...
...
@@ -108,17 +118,17 @@ __global__ void BoxDecodeAndClipKernel(const T *anchor, const T *deltas,
T
dxmax
=
deltas
[
k
+
2
];
T
dymax
=
deltas
[
k
+
3
];
T
d_cx
=
0.
,
d_cy
=
0.
,
d_w
=
0.
,
d_h
=
0.
;
T
d_cx
,
d_cy
,
d_w
,
d_h
;
if
(
var
)
{
d_cx
=
cx
+
dxmin
*
w
*
var
[
k
];
d_cy
=
cy
+
dymin
*
h
*
var
[
k
+
1
];
d_w
=
exp
(
Min
<
T
>
(
dxmax
*
var
[
k
+
2
],
kBBoxClipD
efault
))
*
w
;
d_h
=
exp
(
Min
<
T
>
(
dymax
*
var
[
k
+
3
],
kBBoxClipD
efault
))
*
h
;
d_w
=
exp
(
Min
(
dxmax
*
var
[
k
+
2
],
bbox_clip_d
efault
))
*
w
;
d_h
=
exp
(
Min
(
dymax
*
var
[
k
+
3
],
bbox_clip_d
efault
))
*
h
;
}
else
{
d_cx
=
cx
+
dxmin
*
w
;
d_cy
=
cy
+
dymin
*
h
;
d_w
=
exp
(
Min
<
T
>
(
dxmax
,
kBBoxClipD
efault
))
*
w
;
d_h
=
exp
(
Min
<
T
>
(
dymax
,
kBBoxClipD
efault
))
*
h
;
d_w
=
exp
(
Min
(
dxmax
,
bbox_clip_d
efault
))
*
w
;
d_h
=
exp
(
Min
(
dymax
,
bbox_clip_d
efault
))
*
h
;
}
T
oxmin
=
d_cx
-
d_w
*
0.5
;
...
...
@@ -126,17 +136,21 @@ __global__ void BoxDecodeAndClipKernel(const T *anchor, const T *deltas,
T
oxmax
=
d_cx
+
d_w
*
0.5
-
1.
;
T
oymax
=
d_cy
+
d_h
*
0.5
-
1.
;
proposals
[
i
*
4
]
=
Max
<
T
>
(
Min
<
T
>
(
oxmin
,
im_info
[
1
]
-
1.
),
0.
);
proposals
[
i
*
4
+
1
]
=
Max
<
T
>
(
Min
<
T
>
(
oymin
,
im_info
[
0
]
-
1.
),
0.
);
proposals
[
i
*
4
+
2
]
=
Max
<
T
>
(
Min
<
T
>
(
oxmax
,
im_info
[
1
]
-
1.
),
0.
);
proposals
[
i
*
4
+
3
]
=
Max
<
T
>
(
Min
<
T
>
(
oymax
,
im_info
[
0
]
-
1.
),
0.
);
proposals
[
i
*
4
]
=
Max
(
Min
(
oxmin
,
im_info
[
1
]
-
1.
),
0.
);
proposals
[
i
*
4
+
1
]
=
Max
(
Min
(
oymin
,
im_info
[
0
]
-
1.
),
0.
);
proposals
[
i
*
4
+
2
]
=
Max
(
Min
(
oxmax
,
im_info
[
1
]
-
1.
),
0.
);
proposals
[
i
*
4
+
3
]
=
Max
(
Min
(
oymax
,
im_info
[
0
]
-
1.
),
0.
);
}
}
__device__
__forceinline__
T
Min
(
T
a
,
T
b
)
const
{
return
a
>
b
?
b
:
a
;
}
__device__
__forceinline__
T
Max
(
T
a
,
T
b
)
const
{
return
a
>
b
?
a
:
b
;
}
};
template
<
typename
T
,
int
BlockSize
>
__global__
void
FilterBBoxes
(
const
T
*
bboxes
,
const
T
*
im_info
,
const
T
min_size
,
const
int
num
,
int
*
keep_
num
,
int
*
keep
)
{
static
__global__
void
FilterBBoxes
(
const
T
*
bboxes
,
const
T
*
im_info
,
const
T
min_size
,
const
int
num
,
int
*
keep_num
,
int
*
keep
)
{
T
im_h
=
im_info
[
0
];
T
im_w
=
im_info
[
1
];
T
im_scale
=
im_info
[
2
];
...
...
@@ -181,7 +195,7 @@ __global__ void FilterBBoxes(const T *bboxes, const T *im_info,
}
}
__device__
inline
float
IoU
(
const
float
*
a
,
const
float
*
b
)
{
static
__device__
inline
float
IoU
(
const
float
*
a
,
const
float
*
b
)
{
float
left
=
max
(
a
[
0
],
b
[
0
]),
right
=
min
(
a
[
2
],
b
[
2
]);
float
top
=
max
(
a
[
1
],
b
[
1
]),
bottom
=
min
(
a
[
3
],
b
[
3
]);
float
width
=
max
(
right
-
left
+
1
,
0.
f
),
height
=
max
(
bottom
-
top
+
1
,
0.
f
);
...
...
@@ -191,7 +205,8 @@ __device__ inline float IoU(const float *a, const float *b) {
return
inter_s
/
(
s_a
+
s_b
-
inter_s
);
}
__global__
void
NMSKernel
(
const
int
n_boxes
,
const
float
nms_overlap_thresh
,
static
__global__
void
NMSKernel
(
const
int
n_boxes
,
const
float
nms_overlap_thresh
,
const
float
*
dev_boxes
,
uint64_t
*
dev_mask
)
{
const
int
row_start
=
blockIdx
.
y
;
const
int
col_start
=
blockIdx
.
x
;
...
...
@@ -234,7 +249,7 @@ __global__ void NMSKernel(const int n_boxes, const float nms_overlap_thresh,
}
template
<
typename
T
>
void
NMS
(
const
platform
::
CUDADeviceContext
&
ctx
,
const
Tensor
&
proposals
,
static
void
NMS
(
const
platform
::
CUDADeviceContext
&
ctx
,
const
Tensor
&
proposals
,
const
Tensor
&
sorted_indices
,
const
T
nms_threshold
,
Tensor
*
keep_out
)
{
int
boxes_num
=
proposals
.
dims
()[
0
];
...
...
@@ -247,13 +262,10 @@ void NMS(const platform::CUDADeviceContext &ctx, const Tensor &proposals,
const
T
*
boxes
=
proposals
.
data
<
T
>
();
auto
place
=
boost
::
get
<
platform
::
CUDAPlace
>
(
ctx
.
GetPlace
());
int
size_bytes
=
boxes_num
*
col_blocks
*
sizeof
(
uint64_t
);
uint64_t
*
d_mask
=
reinterpret_cast
<
uint64_t
*>
(
memory
::
Alloc
(
place
,
size_bytes
));
NMSKernel
<<<
blocks
,
threads
>>>
(
boxes_num
,
nms_threshold
,
boxes
,
d_mask
);
uint64_t
*
h_mask
=
reinterpret_cast
<
uint64_t
*>
(
memory
::
Alloc
(
platform
::
CPUPlace
(),
size_bytes
));
memory
::
Copy
(
platform
::
CPUPlace
(),
h_mask
,
place
,
d_mask
,
size_bytes
,
0
);
framework
::
Vector
<
uint64_t
>
mask
(
boxes_num
*
col_blocks
);
NMSKernel
<<<
blocks
,
threads
>>>
(
boxes_num
,
nms_threshold
,
boxes
,
mask
.
CUDAMutableData
(
boost
::
get
<
platform
::
CUDAPlace
>
(
ctx
.
GetPlace
())));
std
::
vector
<
uint64_t
>
remv
(
col_blocks
);
memset
(
&
remv
[
0
],
0
,
sizeof
(
uint64_t
)
*
col_blocks
);
...
...
@@ -267,7 +279,7 @@ void NMS(const platform::CUDADeviceContext &ctx, const Tensor &proposals,
if
(
!
(
remv
[
nblock
]
&
(
1ULL
<<
inblock
)))
{
++
num_to_keep
;
keep_vec
.
push_back
(
i
);
uint64_t
*
p
=
&
h_
mask
[
0
]
+
i
*
col_blocks
;
uint64_t
*
p
=
&
mask
[
0
]
+
i
*
col_blocks
;
for
(
int
j
=
nblock
;
j
<
col_blocks
;
j
++
)
{
remv
[
j
]
|=
p
[
j
];
}
...
...
@@ -276,12 +288,10 @@ void NMS(const platform::CUDADeviceContext &ctx, const Tensor &proposals,
int
*
keep
=
keep_out
->
mutable_data
<
int
>
({
num_to_keep
},
ctx
.
GetPlace
());
memory
::
Copy
(
place
,
keep
,
platform
::
CPUPlace
(),
keep_vec
.
data
(),
sizeof
(
int
)
*
num_to_keep
,
0
);
memory
::
Free
(
place
,
d_mask
);
memory
::
Free
(
platform
::
CPUPlace
(),
h_mask
);
}
template
<
typename
T
>
std
::
pair
<
Tensor
,
Tensor
>
ProposalForOneImage
(
st
atic
st
d
::
pair
<
Tensor
,
Tensor
>
ProposalForOneImage
(
const
platform
::
CUDADeviceContext
&
ctx
,
const
Tensor
&
im_info
,
const
Tensor
&
anchors
,
const
Tensor
&
variances
,
const
Tensor
&
bbox_deltas
,
// [M, 4]
...
...
@@ -300,18 +310,20 @@ std::pair<Tensor, Tensor> ProposalForOneImage(
// 2. box decode and clipping
Tensor
proposals
;
proposals
.
mutable_data
<
T
>
({
pre_nms_num
,
4
},
ctx
.
GetPlace
());
int
block
=
512
;
auto
stream
=
ctx
.
stream
();
BoxDecodeAndClipKernel
<
T
><<<
DIVUP
(
pre_nms_num
,
block
),
block
,
0
,
stream
>>>
(
{
platform
::
ForRange
<
platform
::
CUDADeviceContext
>
for_range
(
ctx
,
pre_nms_num
);
for_range
(
BoxDecodeAndClipFunctor
<
T
>
{
anchors
.
data
<
T
>
(),
bbox_deltas
.
data
<
T
>
(),
variances
.
data
<
T
>
(),
index_sort
.
data
<
int
>
(),
im_info
.
data
<
T
>
(),
pre_nms_num
,
proposals
.
data
<
T
>
());
index_sort
.
data
<
int
>
(),
im_info
.
data
<
T
>
(),
proposals
.
data
<
T
>
()});
}
// 3. filter
Tensor
keep_index
,
keep_num_t
;
keep_index
.
mutable_data
<
int
>
({
pre_nms_num
},
ctx
.
GetPlace
());
keep_num_t
.
mutable_data
<
int
>
({
1
},
ctx
.
GetPlace
());
min_size
=
std
::
max
(
min_size
,
1.0
f
);
auto
stream
=
ctx
.
stream
();
FilterBBoxes
<
T
,
512
><<<
1
,
512
,
0
,
stream
>>>
(
proposals
.
data
<
T
>
(),
im_info
.
data
<
T
>
(),
min_size
,
pre_nms_num
,
keep_num_t
.
data
<
int
>
(),
keep_index
.
data
<
int
>
());
...
...
@@ -355,8 +367,12 @@ class CUDAGenerateProposalsKernel : public framework::OpKernel<T> {
auto
*
scores
=
context
.
Input
<
Tensor
>
(
"Scores"
);
auto
*
bbox_deltas
=
context
.
Input
<
Tensor
>
(
"BboxDeltas"
);
auto
*
im_info
=
context
.
Input
<
Tensor
>
(
"ImInfo"
);
auto
*
anchors
=
context
.
Input
<
Tensor
>
(
"Anchors"
);
auto
*
variances
=
context
.
Input
<
Tensor
>
(
"Variances"
);
auto
anchors
=
detail
::
Ref
(
context
.
Input
<
Tensor
>
(
"Anchors"
),
"Cannot find input Anchors(%s) in scope"
,
context
.
Inputs
(
"Anchors"
)[
0
]);
auto
variances
=
detail
::
Ref
(
context
.
Input
<
Tensor
>
(
"Variances"
),
"Cannot find input Variances(%s) in scope"
,
context
.
Inputs
(
"Variances"
)[
0
]);
auto
*
rpn_rois
=
context
.
Output
<
LoDTensor
>
(
"RpnRois"
);
auto
*
rpn_roi_probs
=
context
.
Output
<
LoDTensor
>
(
"RpnRoiProbs"
);
...
...
@@ -392,10 +408,8 @@ class CUDAGenerateProposalsKernel : public framework::OpKernel<T> {
trans
(
dev_ctx
,
*
bbox_deltas
,
&
bbox_deltas_swap
,
axis
);
trans
(
dev_ctx
,
*
scores
,
&
scores_swap
,
axis
);
Tensor
*
anchor
=
const_cast
<
framework
::
Tensor
*>
(
anchors
);
anchor
->
Resize
({
anchors
->
numel
()
/
4
,
4
});
Tensor
*
var
=
const_cast
<
framework
::
Tensor
*>
(
variances
);
var
->
Resize
({
var
->
numel
()
/
4
,
4
});
anchors
.
Resize
({
anchors
.
numel
()
/
4
,
4
});
variances
.
Resize
({
variances
.
numel
()
/
4
,
4
});
rpn_rois
->
mutable_data
<
T
>
({
bbox_deltas
->
numel
()
/
4
,
4
},
context
.
GetPlace
());
...
...
@@ -417,12 +431,12 @@ class CUDAGenerateProposalsKernel : public framework::OpKernel<T> {
scores_slice
.
Resize
({
h_score
*
w_score
*
c_score
,
1
});
std
::
pair
<
Tensor
,
Tensor
>
box_score_pair
=
ProposalForOneImage
<
T
>
(
dev_ctx
,
im_info_slice
,
*
anchor
,
*
var
,
ProposalForOneImage
<
T
>
(
dev_ctx
,
im_info_slice
,
anchors
,
variances
,
bbox_deltas_slice
,
scores_slice
,
pre_nms_top_n
,
post_nms_top_n
,
nms_thresh
,
min_size
,
eta
);
Tensor
proposals
=
box_score_pair
.
first
;
Tensor
scores
=
box_score_pair
.
second
;
Tensor
&
proposals
=
box_score_pair
.
first
;
Tensor
&
scores
=
box_score_pair
.
second
;
memory
::
Copy
(
place
,
rpn_rois_data
+
num_proposals
*
4
,
place
,
proposals
.
data
<
T
>
(),
sizeof
(
T
)
*
proposals
.
numel
(),
0
);
...
...
paddle/fluid/operators/distributed/grpc_client.cc
浏览文件 @
83dc6898
...
...
@@ -86,7 +86,7 @@ VarHandlePtr GRPCClient::AsyncSendVar(const std::string& ep,
// stub context
s
->
response_call_back_
=
nullptr
;
platform
::
RecordEvent
record_event
(
method
,
p_ctx
);
platform
::
Record
RPC
Event
record_event
(
method
,
p_ctx
);
auto
call
=
s
->
stub_g_
.
PrepareUnaryCall
(
s
->
context_
.
get
(),
"/sendrecv.SendRecvService/SendVariable"
,
req
,
&
cq_
);
...
...
@@ -143,7 +143,7 @@ VarHandlePtr GRPCClient::AsyncGetVar(const std::string& ep,
// stub context
s
->
response_call_back_
=
ProcGetResponse
;
platform
::
RecordEvent
record_event
(
method
,
p_ctx
);
platform
::
Record
RPC
Event
record_event
(
method
,
p_ctx
);
auto
call
=
s
->
stub_g_
.
PrepareUnaryCall
(
s
->
context_
.
get
(),
"/sendrecv.SendRecvService/GetVariable"
,
buf
,
&
cq_
);
...
...
@@ -191,7 +191,7 @@ VarHandlePtr GRPCClient::AsyncPrefetchVar(const std::string& ep,
// stub context
s
->
response_call_back_
=
ProcGetResponse
;
platform
::
RecordEvent
record_event
(
method
,
p_ctx
);
platform
::
Record
RPC
Event
record_event
(
method
,
p_ctx
);
auto
call
=
s
->
stub_g_
.
PrepareUnaryCall
(
s
->
context_
.
get
(),
"/sendrecv.SendRecvService/PrefetchVariable"
,
req
,
...
...
@@ -221,7 +221,7 @@ VarHandlePtr GRPCClient::AsyncSendBatchBarrier(const std::string& ep,
sendrecv
::
VariableMessage
req
;
req
.
set_varname
(
BATCH_BARRIER_MESSAGE
);
platform
::
RecordEvent
record_event
(
method
,
nullptr
);
platform
::
Record
RPC
Event
record_event
(
method
,
nullptr
);
auto
rpc
=
s
->
stub_
->
AsyncSendVariable
(
s
->
context_
.
get
(),
req
,
&
cq_
);
rpc
->
Finish
(
&
s
->
reply_
,
&
s
->
status_
,
reinterpret_cast
<
void
*>
(
s
));
...
...
@@ -246,7 +246,7 @@ VarHandlePtr GRPCClient::AsyncSendFetchBarrier(const std::string& ep,
sendrecv
::
VariableMessage
req
;
req
.
set_varname
(
FETCH_BARRIER_MESSAGE
);
platform
::
RecordEvent
record_event
(
method
,
nullptr
);
platform
::
Record
RPC
Event
record_event
(
method
,
nullptr
);
auto
rpc
=
s
->
stub_
->
AsyncGetVariable
(
s
->
context_
.
get
(),
req
,
&
cq_
);
rpc
->
Finish
(
&
s
->
reply_
,
&
s
->
status_
,
reinterpret_cast
<
void
*>
(
s
));
...
...
@@ -271,7 +271,7 @@ VarHandlePtr GRPCClient::AsyncSendComplete(const std::string& ep,
sendrecv
::
VariableMessage
req
;
req
.
set_varname
(
COMPLETE_MESSAGE
);
platform
::
RecordEvent
record_event
(
method
,
nullptr
);
platform
::
Record
RPC
Event
record_event
(
method
,
nullptr
);
auto
rpc
=
s
->
stub_
->
AsyncSendVariable
(
s
->
context_
.
get
(),
req
,
&
cq_
);
rpc
->
Finish
(
&
s
->
reply_
,
&
s
->
status_
,
reinterpret_cast
<
void
*>
(
s
));
...
...
@@ -301,7 +301,7 @@ VarHandlePtr GRPCClient::AsyncCheckpointNotify(const std::string& ep,
req
.
set_varname
(
CHECKPOINT_SAVE_MESSAGE
);
req
.
set_out_varname
(
dir
);
platform
::
RecordEvent
record_event
(
method
,
nullptr
);
platform
::
Record
RPC
Event
record_event
(
method
,
nullptr
);
auto
rpc
=
s
->
stub_
->
AsyncCheckpointNotify
(
s
->
context_
.
get
(),
req
,
&
cq_
);
rpc
->
Finish
(
&
s
->
reply_
,
&
s
->
status_
,
reinterpret_cast
<
void
*>
(
s
));
...
...
paddle/fluid/operators/distributed/grpc_serde.cc
浏览文件 @
83dc6898
...
...
@@ -36,7 +36,7 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
const
platform
::
DeviceContext
&
ctx
,
::
grpc
::
ByteBuffer
*
msg
,
const
std
::
string
&
out_name
)
{
platform
::
RecordEvent
record_event
(
"serial"
,
&
ctx
);
platform
::
Record
RPC
Event
record_event
(
"serial"
,
&
ctx
);
// Default DestroyCallback does nothing, When using GPU
// the CPU buffer need to be freed.
DestroyCallback
destroy_callback
=
[](
void
*
backing
)
{};
...
...
@@ -148,7 +148,7 @@ void DeserializeFromByteBuffer(const ::grpc::ByteBuffer& msg,
const
platform
::
DeviceContext
&
ctx
,
const
framework
::
Scope
*
scope
,
framework
::
Variable
**
var
)
{
platform
::
RecordEvent
record_event
(
"deserial"
,
&
ctx
);
platform
::
Record
RPC
Event
record_event
(
"deserial"
,
&
ctx
);
operators
::
distributed
::
GRPCVariableResponse
resp
(
scope
,
&
ctx
);
PADDLE_ENFORCE
(
resp
.
Parse
(
msg
)
==
0
,
"parse bytebuffer to tensor error!"
);
*
var
=
resp
.
GetVar
();
...
...
paddle/fluid/operators/gather.h
浏览文件 @
83dc6898
...
...
@@ -39,11 +39,9 @@ void CPUGather(const platform::DeviceContext& ctx, const Tensor& src,
PADDLE_ENFORCE
(
platform
::
is_cpu_place
(
ctx
.
GetPlace
()));
// check index of shape 1-D
PADDLE_ENFORCE
(
index
.
dims
().
size
()
==
1
);
int
index_size
=
index
.
dims
()[
0
];
int
64_t
index_size
=
index
.
dims
()[
0
];
auto
src_dims
=
src
.
dims
();
framework
::
DDim
output_dims
(
src_dims
);
output_dims
[
0
]
=
index_size
;
const
T
*
p_src
=
src
.
data
<
T
>
();
const
int
*
p_index
=
index
.
data
<
int
>
();
...
...
@@ -55,7 +53,7 @@ void CPUGather(const platform::DeviceContext& ctx, const Tensor& src,
const
size_t
slice_bytes
=
slice_size
*
sizeof
(
T
);
for
(
int
i
=
0
;
i
<
index_size
;
++
i
)
{
for
(
int
64_t
i
=
0
;
i
<
index_size
;
++
i
)
{
int
index_
=
p_index
[
i
];
memcpy
(
p_output
+
i
*
slice_size
,
p_src
+
index_
*
slice_size
,
slice_bytes
);
}
...
...
paddle/fluid/platform/device_context.cc
浏览文件 @
83dc6898
...
...
@@ -35,6 +35,16 @@ platform::DeviceContext* DeviceContextPool::Get(const platform::Place& place) {
return
it
->
second
.
get
();
}
const
std
::
vector
<
const
DeviceContext
*>
DeviceContextPool
::
GetAllDeviceContexts
()
const
{
std
::
vector
<
const
DeviceContext
*>
all_device_ctx
;
all_device_ctx
.
reserve
(
device_contexts_
.
size
());
for
(
auto
&
dev_ctx
:
device_contexts_
)
{
all_device_ctx
.
emplace_back
(
dev_ctx
.
second
.
get
());
}
return
all_device_ctx
;
}
DeviceContextPool
::
DeviceContextPool
(
const
std
::
vector
<
platform
::
Place
>&
places
)
{
PADDLE_ENFORCE_GT
(
places
.
size
(),
0
);
...
...
paddle/fluid/platform/device_context.h
浏览文件 @
83dc6898
...
...
@@ -217,6 +217,9 @@ class DeviceContextPool {
/*! \brief Return handle of single device context. */
platform
::
DeviceContext
*
Get
(
const
platform
::
Place
&
place
);
/*! \brief Return all the device contexts. */
const
std
::
vector
<
const
DeviceContext
*>
GetAllDeviceContexts
()
const
;
template
<
typename
Place
>
const
typename
DefaultDeviceContextType
<
Place
>::
TYPE
*
GetByPlace
(
const
Place
&
place
)
{
...
...
paddle/fluid/platform/profiler.cc
浏览文件 @
83dc6898
...
...
@@ -30,6 +30,8 @@ limitations under the License. */
#include "paddle/fluid/platform/device_tracer.h"
#include "paddle/fluid/string/printf.h"
DEFINE_bool
(
enable_rpc_profiler
,
false
,
"Enable rpc profiler or not."
);
namespace
paddle
{
namespace
platform
{
...
...
@@ -193,6 +195,13 @@ RecordEvent::~RecordEvent() {
PopEvent
(
name_
,
dev_ctx_
);
}
RecordRPCEvent
::
RecordRPCEvent
(
const
std
::
string
&
name
,
const
DeviceContext
*
dev_ctx
)
{
if
(
FLAGS_enable_rpc_profiler
)
{
event_
.
reset
(
new
platform
::
RecordEvent
(
name
,
dev_ctx
));
}
}
RecordBlock
::
RecordBlock
(
int
block_id
)
:
is_enabled_
(
false
),
start_ns_
(
PosixInNsec
())
{
std
::
lock_guard
<
std
::
mutex
>
l
(
profiler_mu
);
...
...
paddle/fluid/platform/profiler.h
浏览文件 @
83dc6898
...
...
@@ -87,6 +87,16 @@ struct RecordEvent {
std
::
string
full_name_
;
};
class
RecordRPCEvent
{
public:
// dev_ctx can be set to nullptr if device is cpu.
RecordRPCEvent
(
const
std
::
string
&
name
,
const
DeviceContext
*
dev_ctx
);
~
RecordRPCEvent
()
{}
private:
std
::
unique_ptr
<
RecordEvent
>
event_
;
};
struct
RecordBlock
{
explicit
RecordBlock
(
int
block_id
);
~
RecordBlock
();
...
...
python/paddle/fluid/__init__.py
浏览文件 @
83dc6898
...
...
@@ -120,6 +120,7 @@ def __bootstrap__():
read_env_flags
.
append
(
'rpc_deadline'
)
read_env_flags
.
append
(
'rpc_server_profile_period'
)
read_env_flags
.
append
(
'rpc_server_profile_path'
)
read_env_flags
.
append
(
'enable_rpc_profiler'
)
if
core
.
is_compiled_with_cuda
():
read_env_flags
+=
[
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录