Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
PaddleDetection
提交
ea2bdd19
P
PaddleDetection
项目概览
PaddlePaddle
/
PaddleDetection
大约 1 年 前同步成功
通知
694
Star
11112
Fork
2696
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
184
列表
看板
标记
里程碑
合并请求
40
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
PaddleDetection
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
184
Issue
184
列表
看板
标记
里程碑
合并请求
40
合并请求
40
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
提交
ea2bdd19
编写于
10月 25, 2018
作者:
T
Tao Luo
浏览文件
操作
浏览文件
下载
差异文件
Merge branch 'develop' into remove_unused_code
上级
f7bbcfa9
9cb8738f
变更
18
显示空白变更内容
内联
并排
Showing
18 changed file
with
475 addition
and
237 deletion
+475
-237
paddle/fluid/inference/api/demo_ci/run.sh
paddle/fluid/inference/api/demo_ci/run.sh
+1
-1
paddle/fluid/inference/tensorrt/convert/pool2d_op.cc
paddle/fluid/inference/tensorrt/convert/pool2d_op.cc
+40
-4
paddle/fluid/inference/tensorrt/convert/test_pool2d_op.cc
paddle/fluid/inference/tensorrt/convert/test_pool2d_op.cc
+10
-6
paddle/fluid/operators/detection/rpn_target_assign_op.cc
paddle/fluid/operators/detection/rpn_target_assign_op.cc
+52
-16
paddle/fluid/operators/fusion_gru_op.cc
paddle/fluid/operators/fusion_gru_op.cc
+50
-98
paddle/fluid/operators/math/CMakeLists.txt
paddle/fluid/operators/math/CMakeLists.txt
+1
-1
paddle/fluid/operators/math/jit_kernel.h
paddle/fluid/operators/math/jit_kernel.h
+9
-0
paddle/fluid/operators/math/jit_kernel_rnn.cc
paddle/fluid/operators/math/jit_kernel_rnn.cc
+178
-55
paddle/fluid/operators/top_k_op.cu
paddle/fluid/operators/top_k_op.cu
+16
-16
paddle/fluid/operators/top_k_op.h
paddle/fluid/operators/top_k_op.h
+1
-4
python/paddle/fluid/layers/detection.py
python/paddle/fluid/layers/detection.py
+11
-5
python/paddle/fluid/tests/test_detection.py
python/paddle/fluid/tests/test_detection.py
+5
-2
python/paddle/fluid/tests/unittests/test_dist_mnist.py
python/paddle/fluid/tests/unittests/test_dist_mnist.py
+2
-1
python/paddle/fluid/tests/unittests/test_dist_se_resnext.py
python/paddle/fluid/tests/unittests/test_dist_se_resnext.py
+2
-1
python/paddle/fluid/tests/unittests/test_dist_simnet_bow.py
python/paddle/fluid/tests/unittests/test_dist_simnet_bow.py
+4
-2
python/paddle/fluid/tests/unittests/test_fusion_gru_op.py
python/paddle/fluid/tests/unittests/test_fusion_gru_op.py
+6
-0
python/paddle/fluid/tests/unittests/test_rpn_target_assign_op.py
...paddle/fluid/tests/unittests/test_rpn_target_assign_op.py
+34
-14
python/paddle/fluid/tests/unittests/test_top_k_op.py
python/paddle/fluid/tests/unittests/test_top_k_op.py
+53
-11
未找到文件。
paddle/fluid/inference/api/demo_ci/run.sh
浏览文件 @
ea2bdd19
...
...
@@ -21,7 +21,7 @@ else
fi
USE_TENSORRT
=
OFF
if
[
[
-d
"
$TENSORRT_INCLUDE_DIR
"
]
-a
[
-d
"
$TENSORRT_LIB_DIR
"
]
]
;
then
if
[
-d
"
$TENSORRT_INCLUDE_DIR
"
-a
-d
"
$TENSORRT_LIB_DIR
"
]
;
then
USE_TENSORRT
=
ON
fi
...
...
paddle/fluid/inference/tensorrt/convert/pool2d_op.cc
浏览文件 @
ea2bdd19
...
...
@@ -42,16 +42,22 @@ class Pool2dOpConverter : public OpConverter {
boost
::
get
<
std
::
vector
<
int
>>
(
op_desc
.
GetAttr
(
"strides"
));
std
::
vector
<
int
>
paddings
=
boost
::
get
<
std
::
vector
<
int
>>
(
op_desc
.
GetAttr
(
"paddings"
));
bool
ceil_mode
=
boost
::
get
<
bool
>
(
op_desc
.
GetAttr
(
"ceil_mode"
));
nvinfer1
::
DimsHW
nv_ksize
(
ksize
[
0
],
ksize
[
1
]);
if
(
global_pooling
==
true
)
{
nvinfer1
::
Dims
input_shape
=
input1
->
getDimensions
();
int
nbDims
=
input_shape
.
nbDims
;
nvinfer1
::
DimsHW
nv_ksize
(
ksize
[
0
],
ksize
[
1
]);
nvinfer1
::
DimsHW
nv_strides
(
strides
[
0
],
strides
[
1
]);
nvinfer1
::
DimsHW
nv_paddings
(
paddings
[
0
],
paddings
[
1
]);
if
(
global_pooling
==
true
)
{
nv_ksize
.
d
[
0
]
=
input_shape
.
d
[
nbDims
-
2
];
nv_ksize
.
d
[
1
]
=
input_shape
.
d
[
nbDims
-
1
];
nv_strides
.
h
()
=
1
;
nv_strides
.
w
()
=
1
;
nv_paddings
.
h
()
=
0
;
nv_paddings
.
w
()
=
0
;
}
const
nvinfer1
::
DimsHW
nv_strides
(
strides
[
0
],
strides
[
1
]);
const
nvinfer1
::
DimsHW
nv_paddings
(
paddings
[
0
],
paddings
[
1
]);
PADDLE_ENFORCE_EQ
(
input1
->
getDimensions
().
nbDims
,
3UL
);
...
...
@@ -64,6 +70,36 @@ class Pool2dOpConverter : public OpConverter {
PADDLE_THROW
(
"TensorRT unsupported pooling type!"
);
}
if
(
ceil_mode
)
{
nvinfer1
::
DimsHW
pre_pad
(
0
,
0
);
nvinfer1
::
DimsHW
post_pad
(
0
,
0
);
int
input_height
=
input_shape
.
d
[
nbDims
-
2
];
int
input_width
=
input_shape
.
d
[
nbDims
-
1
];
int
floor_h_output_size
=
(
input_height
-
ksize
[
0
]
+
2
*
paddings
[
0
])
/
strides
[
0
]
+
1
;
int
ceil_h_output_size
=
(
input_height
-
ksize
[
0
]
+
2
*
paddings
[
0
]
+
strides
[
0
]
-
1
)
/
strides
[
0
]
+
1
;
int
floor_w_output_size
=
(
input_width
-
ksize
[
1
]
+
2
*
paddings
[
1
])
/
strides
[
1
]
+
1
;
int
ceil_w_output_size
=
(
input_width
-
ksize
[
1
]
+
2
*
paddings
[
1
]
+
strides
[
1
]
-
1
)
/
strides
[
1
]
+
1
;
if
(
floor_h_output_size
!=
ceil_h_output_size
)
{
post_pad
.
h
()
=
strides
[
0
]
-
1
;
}
if
(
floor_w_output_size
!=
ceil_w_output_size
)
{
post_pad
.
w
()
=
strides
[
1
]
-
1
;
}
auto
*
layer
=
TRT_ENGINE_ADD_LAYER
(
engine_
,
Padding
,
*
const_cast
<
nvinfer1
::
ITensor
*>
(
input1
),
pre_pad
,
post_pad
);
input1
=
layer
->
getOutput
(
0
);
}
auto
*
layer
=
TRT_ENGINE_ADD_LAYER
(
engine_
,
Pooling
,
*
const_cast
<
nvinfer1
::
ITensor
*>
(
input1
),
nv_pool_type
,
nv_ksize
);
...
...
paddle/fluid/inference/tensorrt/convert/test_pool2d_op.cc
浏览文件 @
ea2bdd19
...
...
@@ -20,18 +20,20 @@ namespace paddle {
namespace
inference
{
namespace
tensorrt
{
void
test_pool2d
(
bool
global_pooling
)
{
void
test_pool2d
(
bool
global_pooling
,
bool
ceil_mode
)
{
framework
::
Scope
scope
;
std
::
unordered_set
<
std
::
string
>
parameters
;
TRTConvertValidation
validator
(
5
,
parameters
,
scope
,
1
<<
15
);
// The ITensor's Dims should not contain the batch size.
// So, the ITensor's Dims of input and output should be C * H * W.
validator
.
DeclInputVar
(
"pool2d-X"
,
nvinfer1
::
Dims3
(
3
,
4
,
4
));
validator
.
DeclInputVar
(
"pool2d-X"
,
nvinfer1
::
Dims3
(
3
,
13
,
1
4
));
if
(
global_pooling
)
validator
.
DeclOutputVar
(
"pool2d-Out"
,
nvinfer1
::
Dims3
(
3
,
1
,
1
));
else
if
(
ceil_mode
)
validator
.
DeclOutputVar
(
"pool2d-Out"
,
nvinfer1
::
Dims3
(
3
,
6
,
7
));
else
validator
.
DeclOutputVar
(
"pool2d-Out"
,
nvinfer1
::
Dims3
(
3
,
2
,
2
));
validator
.
DeclOutputVar
(
"pool2d-Out"
,
nvinfer1
::
Dims3
(
3
,
6
,
6
));
// Prepare Op description
framework
::
OpDesc
desc
;
...
...
@@ -39,7 +41,7 @@ void test_pool2d(bool global_pooling) {
desc
.
SetInput
(
"X"
,
{
"pool2d-X"
});
desc
.
SetOutput
(
"Out"
,
{
"pool2d-Out"
});
std
::
vector
<
int
>
ksize
({
2
,
2
});
std
::
vector
<
int
>
ksize
({
3
,
3
});
std
::
vector
<
int
>
strides
({
2
,
2
});
std
::
vector
<
int
>
paddings
({
0
,
0
});
std
::
string
pooling_t
=
"max"
;
...
...
@@ -49,6 +51,7 @@ void test_pool2d(bool global_pooling) {
desc
.
SetAttr
(
"strides"
,
strides
);
desc
.
SetAttr
(
"paddings"
,
paddings
);
desc
.
SetAttr
(
"global_pooling"
,
global_pooling
);
desc
.
SetAttr
(
"ceil_mode"
,
ceil_mode
);
LOG
(
INFO
)
<<
"set OP"
;
validator
.
SetOp
(
*
desc
.
Proto
());
...
...
@@ -57,9 +60,10 @@ void test_pool2d(bool global_pooling) {
validator
.
Execute
(
3
);
}
TEST
(
Pool2dOpConverter
,
normal
)
{
test_pool2d
(
false
);
}
TEST
(
Pool2dOpConverter
,
normal
)
{
test_pool2d
(
false
,
false
);
}
TEST
(
Pool2dOpConverter
,
test_global_pooling
)
{
test_pool2d
(
true
,
false
);
}
TEST
(
Pool2dOpConverter
,
test_
global_pooling
)
{
test_pool2d
(
true
);
}
TEST
(
Pool2dOpConverter
,
test_
ceil_mode
)
{
test_pool2d
(
false
,
true
);
}
}
// namespace tensorrt
}
// namespace inference
...
...
paddle/fluid/operators/detection/rpn_target_assign_op.cc
浏览文件 @
ea2bdd19
...
...
@@ -52,6 +52,9 @@ class RpnTargetAssignOp : public framework::OperatorWithKernel {
PADDLE_ENFORCE
(
ctx
->
HasOutput
(
"TargetBBox"
),
"Output(TargetBBox) of RpnTargetAssignOp should not be null"
);
PADDLE_ENFORCE
(
ctx
->
HasOutput
(
"BBoxInsideWeight"
),
"Output(BBoxInsideWeight) of RpnTargetAssignOp should not be null"
);
auto
anchor_dims
=
ctx
->
GetInputDim
(
"Anchor"
);
auto
gt_boxes_dims
=
ctx
->
GetInputDim
(
"GtBoxes"
);
...
...
@@ -68,6 +71,7 @@ class RpnTargetAssignOp : public framework::OperatorWithKernel {
ctx
->
SetOutputDim
(
"ScoreIndex"
,
{
-
1
});
ctx
->
SetOutputDim
(
"TargetLabel"
,
{
-
1
,
1
});
ctx
->
SetOutputDim
(
"TargetBBox"
,
{
-
1
,
4
});
ctx
->
SetOutputDim
(
"BBoxInsideWeight"
,
{
-
1
,
4
});
}
protected:
...
...
@@ -169,6 +173,7 @@ void ScoreAssign(const T* anchor_by_gt_overlap_data,
const
float
rpn_positive_overlap
,
const
float
rpn_negative_overlap
,
std
::
vector
<
int
>*
fg_inds
,
std
::
vector
<
int
>*
bg_inds
,
std
::
vector
<
int
>*
tgt_lbl
,
std
::
vector
<
int
>*
fg_fake
,
std
::
vector
<
T
>*
bbox_inside_weight
,
std
::
minstd_rand
engine
,
bool
use_random
)
{
float
epsilon
=
0.00001
;
int
anchor_num
=
anchor_to_gt_max
.
dims
()[
0
];
...
...
@@ -201,12 +206,12 @@ void ScoreAssign(const T* anchor_by_gt_overlap_data,
// Reservoir Sampling
int
fg_num
=
static_cast
<
int
>
(
rpn_fg_fraction
*
rpn_batch_size_per_im
);
ReservoirSampling
(
fg_num
,
&
fg_inds_fake
,
engine
,
use_random
);
fg
_num
=
static_cast
<
int
>
(
fg_inds_fake
.
size
());
for
(
int64_t
i
=
0
;
i
<
fg_num
;
++
i
)
{
int
fg_fake
_num
=
static_cast
<
int
>
(
fg_inds_fake
.
size
());
for
(
int64_t
i
=
0
;
i
<
fg_
fake_
num
;
++
i
)
{
target_label
[
fg_inds_fake
[
i
]]
=
1
;
}
int
bg_num
=
rpn_batch_size_per_im
-
fg_num
;
int
bg_num
=
rpn_batch_size_per_im
-
fg_
fake_
num
;
for
(
int64_t
i
=
0
;
i
<
anchor_num
;
++
i
)
{
if
(
anchor_to_gt_max_data
[
i
]
<
rpn_negative_overlap
)
{
bg_inds_fake
.
push_back
(
i
);
...
...
@@ -214,12 +219,28 @@ void ScoreAssign(const T* anchor_by_gt_overlap_data,
}
ReservoirSampling
(
bg_num
,
&
bg_inds_fake
,
engine
,
use_random
);
bg_num
=
static_cast
<
int
>
(
bg_inds_fake
.
size
());
int
fake_num
=
0
;
for
(
int64_t
i
=
0
;
i
<
bg_num
;
++
i
)
{
// fg fake found
if
(
target_label
[
bg_inds_fake
[
i
]]
==
1
)
{
fake_num
++
;
fg_fake
->
emplace_back
(
fg_inds_fake
[
0
]);
for
(
int
j
=
0
;
j
<
4
;
++
j
)
{
bbox_inside_weight
->
emplace_back
(
T
(
0.
));
}
}
target_label
[
bg_inds_fake
[
i
]]
=
0
;
}
for
(
int64_t
i
=
0
;
i
<
(
fg_fake_num
-
fake_num
)
*
4
;
++
i
)
{
bbox_inside_weight
->
emplace_back
(
T
(
1.
));
}
for
(
int64_t
i
=
0
;
i
<
anchor_num
;
++
i
)
{
if
(
target_label
[
i
]
==
1
)
fg_inds
->
emplace_back
(
i
);
if
(
target_label
[
i
]
==
1
)
{
fg_inds
->
emplace_back
(
i
);
fg_fake
->
emplace_back
(
i
);
}
if
(
target_label
[
i
]
==
0
)
bg_inds
->
emplace_back
(
i
);
}
fg_num
=
fg_inds
->
size
();
...
...
@@ -248,7 +269,8 @@ std::vector<Tensor> SampleRpnFgBgGt(const platform::CPUDeviceContext& ctx,
std
::
vector
<
int
>
bg_inds
;
std
::
vector
<
int
>
gt_inds
;
std
::
vector
<
int
>
tgt_lbl
;
std
::
vector
<
int
>
fg_fake
;
std
::
vector
<
T
>
bbox_inside_weight
;
// Calculate the max IoU between anchors and gt boxes
// Map from anchor to gt box that has highest overlap
auto
place
=
ctx
.
GetPlace
();
...
...
@@ -275,32 +297,37 @@ std::vector<Tensor> SampleRpnFgBgGt(const platform::CPUDeviceContext& ctx,
// Follow the Faster RCNN's implementation
ScoreAssign
(
anchor_by_gt_overlap_data
,
anchor_to_gt_max
,
gt_to_anchor_max
,
rpn_batch_size_per_im
,
rpn_fg_fraction
,
rpn_positive_overlap
,
rpn_negative_overlap
,
&
fg_inds
,
&
bg_inds
,
&
tgt_lbl
,
engin
e
,
use_random
);
rpn_negative_overlap
,
&
fg_inds
,
&
bg_inds
,
&
tgt_lbl
,
&
fg_fak
e
,
&
bbox_inside_weight
,
engine
,
use_random
);
int
fg_num
=
fg_inds
.
size
();
int
bg_num
=
bg_inds
.
size
();
gt_inds
.
reserve
(
fg_num
);
for
(
int
i
=
0
;
i
<
fg_num
;
++
i
)
{
gt_inds
.
emplace_back
(
argmax
[
fg_inds
[
i
]]);
int
fg_fake_num
=
fg_fake
.
size
();
gt_inds
.
reserve
(
fg_fake_num
);
for
(
int
i
=
0
;
i
<
fg_fake_num
;
++
i
)
{
gt_inds
.
emplace_back
(
argmax
[
fg_fake
[
i
]]);
}
Tensor
loc_index_t
,
score_index_t
,
tgt_lbl_t
,
gt_inds_t
;
int
*
loc_index_data
=
loc_index_t
.
mutable_data
<
int
>
({
fg_num
},
place
);
Tensor
loc_index_t
,
score_index_t
,
tgt_lbl_t
,
gt_inds_t
,
bbox_inside_weight_t
;
int
*
loc_index_data
=
loc_index_t
.
mutable_data
<
int
>
({
fg_fake_num
},
place
);
int
*
score_index_data
=
score_index_t
.
mutable_data
<
int
>
({
fg_num
+
bg_num
},
place
);
int
*
tgt_lbl_data
=
tgt_lbl_t
.
mutable_data
<
int
>
({
fg_num
+
bg_num
},
place
);
int
*
gt_inds_data
=
gt_inds_t
.
mutable_data
<
int
>
({
fg_num
},
place
);
std
::
copy
(
fg_inds
.
begin
(),
fg_inds
.
end
(),
loc_index_data
);
int
*
gt_inds_data
=
gt_inds_t
.
mutable_data
<
int
>
({
fg_fake_num
},
place
);
T
*
bbox_inside_weight_data
=
bbox_inside_weight_t
.
mutable_data
<
T
>
({
fg_fake_num
,
4
},
place
);
std
::
copy
(
fg_fake
.
begin
(),
fg_fake
.
end
(),
loc_index_data
);
std
::
copy
(
fg_inds
.
begin
(),
fg_inds
.
end
(),
score_index_data
);
std
::
copy
(
bg_inds
.
begin
(),
bg_inds
.
end
(),
score_index_data
+
fg_num
);
std
::
copy
(
tgt_lbl
.
begin
(),
tgt_lbl
.
end
(),
tgt_lbl_data
);
std
::
copy
(
gt_inds
.
begin
(),
gt_inds
.
end
(),
gt_inds_data
);
std
::
copy
(
bbox_inside_weight
.
begin
(),
bbox_inside_weight
.
end
(),
bbox_inside_weight_data
);
std
::
vector
<
Tensor
>
loc_score_tgtlbl_gt
;
loc_score_tgtlbl_gt
.
emplace_back
(
loc_index_t
);
loc_score_tgtlbl_gt
.
emplace_back
(
score_index_t
);
loc_score_tgtlbl_gt
.
emplace_back
(
tgt_lbl_t
);
loc_score_tgtlbl_gt
.
emplace_back
(
gt_inds_t
);
loc_score_tgtlbl_gt
.
emplace_back
(
bbox_inside_weight_t
);
return
loc_score_tgtlbl_gt
;
}
...
...
@@ -318,6 +345,7 @@ class RpnTargetAssignKernel : public framework::OpKernel<T> {
auto
*
score_index
=
context
.
Output
<
LoDTensor
>
(
"ScoreIndex"
);
auto
*
tgt_bbox
=
context
.
Output
<
LoDTensor
>
(
"TargetBBox"
);
auto
*
tgt_lbl
=
context
.
Output
<
LoDTensor
>
(
"TargetLabel"
);
auto
*
bbox_inside_weight
=
context
.
Output
<
LoDTensor
>
(
"BBoxInsideWeight"
);
PADDLE_ENFORCE_EQ
(
gt_boxes
->
lod
().
size
(),
1UL
,
"RpnTargetAssignOp gt_boxes needs 1 level of LoD"
);
...
...
@@ -340,7 +368,7 @@ class RpnTargetAssignKernel : public framework::OpKernel<T> {
score_index
->
mutable_data
<
int
>
({
max_num
},
place
);
tgt_bbox
->
mutable_data
<
T
>
({
max_num
,
4
},
place
);
tgt_lbl
->
mutable_data
<
int
>
({
max_num
,
1
},
place
);
bbox_inside_weight
->
mutable_data
<
T
>
({
max_num
,
4
},
place
);
auto
&
dev_ctx
=
context
.
device_context
<
platform
::
CPUDeviceContext
>
();
std
::
random_device
rnd
;
...
...
@@ -394,6 +422,7 @@ class RpnTargetAssignKernel : public framework::OpKernel<T> {
Tensor
sampled_score_index
=
loc_score_tgtlbl_gt
[
1
];
Tensor
sampled_tgtlbl
=
loc_score_tgtlbl_gt
[
2
];
Tensor
sampled_gt_index
=
loc_score_tgtlbl_gt
[
3
];
Tensor
sampled_bbox_inside_weight
=
loc_score_tgtlbl_gt
[
4
];
int
loc_num
=
sampled_loc_index
.
dims
()[
0
];
int
score_num
=
sampled_score_index
.
dims
()[
0
];
...
...
@@ -432,6 +461,8 @@ class RpnTargetAssignKernel : public framework::OpKernel<T> {
AppendRpns
<
int
>
(
score_index
,
total_score_num
,
&
sampled_score_index_unmap
);
AppendRpns
<
T
>
(
tgt_bbox
,
total_loc_num
*
4
,
&
sampled_tgt_bbox
);
AppendRpns
<
int
>
(
tgt_lbl
,
total_score_num
,
&
sampled_tgtlbl
);
AppendRpns
<
T
>
(
bbox_inside_weight
,
total_loc_num
*
4
,
&
sampled_bbox_inside_weight
);
total_loc_num
+=
loc_num
;
total_score_num
+=
score_num
;
...
...
@@ -448,10 +479,12 @@ class RpnTargetAssignKernel : public framework::OpKernel<T> {
score_index
->
set_lod
(
loc_score
);
tgt_bbox
->
set_lod
(
lod_loc
);
tgt_lbl
->
set_lod
(
loc_score
);
bbox_inside_weight
->
set_lod
(
lod_loc
);
loc_index
->
Resize
({
total_loc_num
});
score_index
->
Resize
({
total_score_num
});
tgt_bbox
->
Resize
({
total_loc_num
,
4
});
tgt_lbl
->
Resize
({
total_score_num
,
1
});
bbox_inside_weight
->
Resize
({
total_loc_num
,
4
});
}
};
...
...
@@ -514,6 +547,9 @@ class RpnTargetAssignOpMaker : public framework::OpProtoAndCheckerMaker {
"TargetLabel"
,
"(Tensor<int>), The target labels of each anchor with shape "
"[F + B, 1], F and B are sampled foreground and backgroud number."
);
AddOutput
(
"BBoxInsideWeight"
,
"(Tensor), The bbox inside weight with shape "
"[F, 4], F is the sampled foreground number."
);
AddComment
(
R"DOC(
This operator can be, for a given set of ground truth bboxes and the
anchors, to assign classification and regression targets to each prediction.
...
...
paddle/fluid/operators/fusion_gru_op.cc
浏览文件 @
ea2bdd19
...
...
@@ -16,10 +16,9 @@ limitations under the License. */
#include <cstring> // for memcpy
#include <string>
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/cpu_vec.h"
#include "paddle/fluid/operators/math/fc_compute.h"
#include "paddle/fluid/operators/math/jit_kernel.h"
#include "paddle/fluid/operators/math/sequence2batch.h"
#include "paddle/fluid/platform/cpu_info.h"
namespace
paddle
{
namespace
operators
{
...
...
@@ -174,58 +173,44 @@ class FusionGRUKernel : public framework::OpKernel<T> {
}
}
#define INIT_VEC_FUNC \
std::function<void(const int, const T *, T *)> act_gate, act_state; \
std::function<void(const int, const T*, const T*, const T*, T*)> cross; \
auto& act_gate_str = ctx.Attr<std::string>("gate_activation"); \
auto& act_state_str = ctx.Attr<std::string>("activation"); \
if (platform::jit::MayIUse(platform::jit::avx)) { \
math::VecActivations<T, platform::jit::avx> act_functor; \
act_gate = act_functor(act_gate_str); \
act_state = act_functor(act_state_str); \
cross = math::vec_cross<T, platform::jit::avx>; \
} else { \
math::VecActivations<T, platform::jit::isa_any> act_functor; \
act_gate = act_functor(act_gate_str); \
act_state = act_functor(act_state_str); \
cross = math::vec_cross<T, platform::jit::isa_any>; \
}
#define INIT_BASE_INPUT_OUTPUT \
auto* h0 = ctx.Input<Tensor>("H0"); \
auto* wx = ctx.Input<Tensor>("WeightX"); \
#define INIT_BASE_DEFINES \
auto* x = ctx.Input<LoDTensor>("X"); \
auto* wh = ctx.Input<Tensor>("WeightH"); \
auto* bias = ctx.Input<Tensor>("Bias"); \
auto* xx = ctx.Output<LoDTensor>("XX"); \
auto* hidden_out = ctx.Output<LoDTensor>("Hidden"); \
bool is_reverse = ctx.Attr<bool>("is_reverse");
#define INIT_BASE_SIZES \
auto x_lod = x->lod(); \
auto x_dims = x->dims();
/* T x M*/
\
auto wh_dims = wh->dims();
/* D x 3D*/
\
const int total_T = x_dims[0]; \
const int D3 = wh_dims[1]
#define INIT_OTHER_DEFINES \
auto* h0 = ctx.Input<Tensor>("H0"); \
auto* wx = ctx.Input<Tensor>("WeightX"); \
auto* bias = ctx.Input<Tensor>("Bias"); \
auto* hidden_out = ctx.Output<LoDTensor>("Hidden"); \
bool is_reverse = ctx.Attr<bool>("is_reverse"); \
const int M = x_dims[1]; \
const int D = wh_dims[0]; \
const int D3 = wh_dims[1]; \
const int D2 = D * 2;
const int D2 = D * 2; \
const auto& ker = math::jitkernel::KernelPool::Instance() \
.template Get<math::jitkernel::GRUKernel<T>, \
const std::string&, const std::string&>( \
ctx.Attr<std::string>("gate_activation"), \
ctx.Attr<std::string>("activation"), D); \
const T* x_data = x->data<T>(); \
const T* wx_data = wx->data<T>(); \
const T* wh_data = wh->data<T>(); \
auto place = ctx.GetPlace(); \
T* xx_data = xx->mutable_data<T>(place)
void
SeqCompute
(
const
framework
::
ExecutionContext
&
ctx
)
const
{
using
DeviceContext
=
paddle
::
platform
::
CPUDeviceContext
;
auto
*
x
=
ctx
.
Input
<
LoDTensor
>
(
"X"
);
INIT_BASE_INPUT_OUTPUT
INIT_BASE_SIZES
INIT_VEC_FUNC
auto
x_lod
=
x
->
lod
();
INIT_BASE_DEFINES
;
INIT_OTHER_DEFINES
;
const
int
N
=
x_lod
[
0
].
size
()
-
1
;
const
T
*
x_data
=
x
->
data
<
T
>
();
const
T
*
h0_data
=
h0
?
h0
->
data
<
T
>
()
:
nullptr
;
const
T
*
wx_data
=
wx
->
data
<
T
>
();
const
T
*
wh_data
=
wh
->
data
<
T
>
();
const
T
*
wh_state_data
=
wh_data
+
D
*
D2
;
T
*
xx_data
=
xx
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
T
*
hidden_out_data
=
hidden_out
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
T
*
hidden_out_data
=
hidden_out
->
mutable_data
<
T
>
(
place
);
auto
blas
=
math
::
GetBlas
<
DeviceContext
,
T
>
(
ctx
);
math
::
FCCompute
<
DeviceContext
,
T
>
(
blas
,
total_T
,
D3
,
M
,
x_data
,
wx_data
,
xx_data
,
...
...
@@ -252,14 +237,7 @@ class FusionGRUKernel : public framework::OpKernel<T> {
if
(
h0_data
)
{
prev_hidden_data
=
h0_data
+
bid
*
D
;
}
else
{
// W: {W_update, W_reset; W_state}
// update gate
act_gate
(
D
,
xx_data
,
xx_data
);
// state gate
act_state
(
D
,
xx_data
+
D2
,
xx_data
+
D2
);
// out = a*b
blas
.
VMUL
(
D
,
xx_data
,
xx_data
+
D2
,
hidden_out_data
);
// save prev
ker
->
ComputeH1
(
xx_data
,
hidden_out_data
);
prev_hidden_data
=
hidden_out_data
;
tstart
=
1
;
move_step
();
...
...
@@ -269,17 +247,12 @@ class FusionGRUKernel : public framework::OpKernel<T> {
blas
.
GEMM
(
CblasNoTrans
,
CblasNoTrans
,
1
,
D2
,
D
,
static_cast
<
T
>
(
1
),
prev_hidden_data
,
D
,
wh_data
,
D2
,
static_cast
<
T
>
(
1
),
xx_data
,
D3
);
act_gate
(
D2
,
xx_data
,
xx_data
);
// rt = rt*ht_1 inplace result
blas
.
VMUL
(
D
,
prev_hidden_data
,
xx_data
+
D
,
hidden_out_data
);
ker
->
ComputeHtPart1
(
xx_data
,
prev_hidden_data
,
hidden_out_data
);
// gemm rt * Ws
blas
.
GEMM
(
CblasNoTrans
,
CblasNoTrans
,
1
,
D
,
D
,
static_cast
<
T
>
(
1
),
hidden_out_data
,
D
,
wh_state_data
,
D
,
static_cast
<
T
>
(
1
),
xx_data
+
D2
,
D3
);
act_state
(
D
,
xx_data
+
D2
,
xx_data
+
D2
);
// out = zt*ht~ + (1-zt)*ht_1
cross
(
D
,
xx_data
,
xx_data
+
D2
,
prev_hidden_data
,
hidden_out_data
);
ker
->
ComputeHtPart2
(
xx_data
,
prev_hidden_data
,
hidden_out_data
);
// save prev
prev_hidden_data
=
hidden_out_data
;
move_step
();
...
...
@@ -289,28 +262,19 @@ class FusionGRUKernel : public framework::OpKernel<T> {
void
BatchCompute
(
const
framework
::
ExecutionContext
&
ctx
)
const
{
using
DeviceContext
=
paddle
::
platform
::
CPUDeviceContext
;
auto
*
x
=
ctx
.
Input
<
LoDTensor
>
(
"X"
);
INIT_BASE_INPUT_OUTPUT
INIT_BASE_SIZES
if
(
x
->
lod
()[
0
].
size
()
==
2
)
{
INIT_BASE_DEFINES
;
if
(
x_lod
[
0
].
size
()
==
2
)
{
xx
->
Resize
({
total_T
,
D3
});
SeqCompute
(
ctx
);
return
;
}
INIT_VEC_FUNC
INIT_OTHER_DEFINES
;
auto
*
reordered_h0
=
ctx
.
Output
<
Tensor
>
(
"ReorderedH0"
);
auto
*
batched_input
=
ctx
.
Output
<
LoDTensor
>
(
"BatchedInput"
);
auto
*
batched_out
=
ctx
.
Output
<
LoDTensor
>
(
"BatchedOut"
);
const
T
*
x_data
=
x
->
data
<
T
>
();
const
T
*
wx_data
=
wx
->
data
<
T
>
();
const
T
*
wh_data
=
wh
->
data
<
T
>
();
T
*
xx_data
=
xx
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
T
*
batched_input_data
=
batched_input
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
T
*
batched_out_data
=
batched_out
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
hidden_out
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
T
*
batched_input_data
=
batched_input
->
mutable_data
<
T
>
(
place
);
T
*
batched_out_data
=
batched_out
->
mutable_data
<
T
>
(
place
);
hidden_out
->
mutable_data
<
T
>
(
place
);
auto
&
dev_ctx
=
ctx
.
template
device_context
<
DeviceContext
>();
auto
blas
=
math
::
GetBlas
<
DeviceContext
,
T
>
(
dev_ctx
);
math
::
LoDTensor2BatchFunctor
<
DeviceContext
,
T
>
to_batch
;
...
...
@@ -336,7 +300,7 @@ class FusionGRUKernel : public framework::OpKernel<T> {
T
*
prev_hidden_data
=
nullptr
;
if
(
h0
)
{
// reorder h0
T
*
reordered_h0_data
=
reordered_h0
->
mutable_data
<
T
>
(
ctx
.
GetPlace
()
);
T
*
reordered_h0_data
=
reordered_h0
->
mutable_data
<
T
>
(
place
);
const
T
*
h0_data
=
h0
->
data
<
T
>
();
prev_hidden_data
=
reordered_h0_data
;
size_t
sz
=
sizeof
(
T
)
*
D
;
...
...
@@ -350,12 +314,7 @@ class FusionGRUKernel : public framework::OpKernel<T> {
T
*
cur_out_data
=
batched_out_data
;
// W: {W_update, W_reset; W_state}
for
(
int
i
=
0
;
i
<
max_bs
;
++
i
)
{
// update gate
act_gate
(
D
,
cur_in_data
,
cur_in_data
);
// state gate
act_state
(
D
,
cur_in_data
+
D2
,
cur_in_data
+
D2
);
// out = a*b
blas
.
VMUL
(
D
,
cur_in_data
,
cur_in_data
+
D2
,
cur_out_data
);
ker
->
ComputeH1
(
cur_in_data
,
cur_out_data
);
// add offset
cur_in_data
+=
D3
;
cur_out_data
+=
D
;
...
...
@@ -380,10 +339,8 @@ class FusionGRUKernel : public framework::OpKernel<T> {
T
*
cur_out_data
=
batched_out_data
;
T
*
cur_prev_hidden_data
=
prev_hidden_data
;
for
(
int
i
=
0
;
i
<
cur_bs
;
++
i
)
{
act_gate
(
D2
,
cur_batched_data
,
cur_batched_data
);
// rt = rt*ht_1 inplace result
blas
.
VMUL
(
D
,
cur_prev_hidden_data
,
cur_batched_data
+
D
,
cur_out_data
);
ker
->
ComputeHtPart1
(
cur_batched_data
,
cur_prev_hidden_data
,
cur_out_data
);
cur_batched_data
+=
D3
;
cur_prev_hidden_data
+=
D
;
cur_out_data
+=
D
;
...
...
@@ -397,12 +354,8 @@ class FusionGRUKernel : public framework::OpKernel<T> {
cur_prev_hidden_data
=
prev_hidden_data
;
for
(
int
i
=
0
;
i
<
cur_bs
;
++
i
)
{
// ht~ = act_state(...)
act_state
(
D
,
cur_batched_data
+
D2
,
cur_batched_data
+
D2
);
// out = zt*ht~ + (1-zt)*ht_1
cross
(
D
,
cur_batched_data
,
cur_batched_data
+
D2
,
cur_prev_hidden_data
,
ker
->
ComputeHtPart2
(
cur_batched_data
,
cur_prev_hidden_data
,
cur_out_data
);
cur_batched_data
+=
D3
;
cur_prev_hidden_data
+=
D
;
cur_out_data
+=
D
;
...
...
@@ -416,9 +369,8 @@ class FusionGRUKernel : public framework::OpKernel<T> {
batched_out
->
set_lod
(
batched_lod
);
to_seq
(
dev_ctx
,
*
batched_out
,
hidden_out
);
}
#undef INIT_VEC_FUNC
#undef INIT_BASE_SIZES
#undef INIT_BASE_INPUT_OUTPUT
#undef INIT_OTHER_DEFINES
#undef INIT_BASE_DEFINES
};
}
// namespace operators
...
...
paddle/fluid/operators/math/CMakeLists.txt
浏览文件 @
ea2bdd19
...
...
@@ -75,6 +75,6 @@ endif()
cc_test
(
concat_test SRCS concat_test.cc DEPS concat_and_split
)
cc_test
(
cpu_vec_test SRCS cpu_vec_test.cc DEPS blas cpu_info
)
cc_library
(
jit_kernel
SRCS jit_kernel.cc jit_kernel_blas.cc jit_kernel_exp.cc jit_kernel_
lstm
.cc
SRCS jit_kernel.cc jit_kernel_blas.cc jit_kernel_exp.cc jit_kernel_
rnn
.cc
DEPS cpu_info cblas
)
cc_test
(
jit_kernel_test SRCS jit_kernel_test.cc DEPS jit_kernel
)
paddle/fluid/operators/math/jit_kernel.h
浏览文件 @
ea2bdd19
...
...
@@ -142,6 +142,15 @@ class LSTMKernel : public Kernel {
const
T
*
wp_data
=
nullptr
)
const
=
0
;
};
template
<
typename
T
>
class
GRUKernel
:
public
Kernel
{
public:
// compute h1 without h0
virtual
void
ComputeH1
(
T
*
gates
,
T
*
ht
)
const
=
0
;
virtual
void
ComputeHtPart1
(
T
*
gates
,
const
T
*
ht_1
,
T
*
ht
)
const
=
0
;
virtual
void
ComputeHtPart2
(
T
*
gates
,
const
T
*
ht_1
,
T
*
ht
)
const
=
0
;
};
}
// namespace jitkernel
}
// namespace math
}
// namespace operators
...
...
paddle/fluid/operators/math/jit_kernel_
lstm
.cc
→
paddle/fluid/operators/math/jit_kernel_
rnn
.cc
浏览文件 @
ea2bdd19
...
...
@@ -136,6 +136,21 @@ static std::shared_ptr<const VActKernel<T>> GetActKernel(
return
nullptr
;
}
template
<
jit
::
cpu_isa_t
isa
>
static
std
::
unique_ptr
<
AVXAct
>
GetAVXAct
(
const
std
::
string
&
type
)
{
if
(
type
==
"sigmoid"
)
{
return
std
::
unique_ptr
<
AVXAct
>
(
new
AVXActImpl
<
kSigmoid
,
isa
>
());
}
else
if
(
type
==
"relu"
)
{
return
std
::
unique_ptr
<
AVXAct
>
(
new
AVXActImpl
<
kRelu
,
isa
>
());
}
else
if
(
type
==
"tanh"
)
{
return
std
::
unique_ptr
<
AVXAct
>
(
new
AVXActImpl
<
kTanh
,
isa
>
());
}
else
if
(
type
==
"identity"
||
type
==
""
)
{
return
std
::
unique_ptr
<
AVXAct
>
(
new
AVXActImpl
<
kIdentity
,
isa
>
());
}
PADDLE_THROW
(
"Not support type: %s"
,
type
);
return
nullptr
;
}
/* LSTM JitKernel */
template
<
typename
T
,
jit
::
cpu_isa_t
isa
,
jit_block
>
class
LSTMKernelImpl
:
public
LSTMKernel
<
T
>
{
...
...
@@ -198,21 +213,9 @@ class LSTMKernelImpl : public LSTMKernel<T> {
const std::string& act_gate, const std::string& act_cand, \
const std::string& act_cell, int d) \
: LSTMKernel<float>() { \
auto GetAVXAct = [&](const std::string& type) -> std::unique_ptr<AVXAct> { \
if (type == "sigmoid") { \
return std::unique_ptr<AVXAct>(new AVXActImpl<kSigmoid, isa>()); \
} else if (type == "relu") { \
return std::unique_ptr<AVXAct>(new AVXActImpl<kRelu, isa>()); \
} else if (type == "tanh") { \
return std::unique_ptr<AVXAct>(new AVXActImpl<kTanh, isa>()); \
} else if (type == "identity" || type == "") { \
return std::unique_ptr<AVXAct>(new AVXActImpl<kIdentity, isa>()); \
} \
PADDLE_THROW("Not support type: %s", type); \
}; \
avx_act_gate_ = GetAVXAct(act_gate); \
avx_act_cand_ = GetAVXAct(act_cand); \
avx_act_cell_ = GetAVXAct(act_cell); \
avx_act_gate_ = GetAVXAct<isa>(act_gate); \
avx_act_cand_ = GetAVXAct<isa>(act_cand); \
avx_act_cell_ = GetAVXAct<isa>(act_cell); \
} \
template <> \
void LSTMKernelImpl<float, isa, kEQ8>::ComputeCtHt( \
...
...
@@ -354,6 +357,126 @@ REGISTER_JITKERNEL_ARGS(lstm, LSTMKernel, JITKERNEL_DECLARE_LSTM,
#undef JITKERNEL_DECLARE_LSTM
#undef JITKERNEL_KEY_LSTM
#undef JITKERNEL_NEW_LSTM_IMPL
/* GRU JitKernel */
template
<
typename
T
,
jit
::
cpu_isa_t
isa
,
jit_block
>
class
GRUKernelImpl
:
public
GRUKernel
<
T
>
{
public:
explicit
GRUKernelImpl
(
const
std
::
string
&
act_gate
,
const
std
::
string
&
act_state
,
int
d
)
:
GRUKernel
<
T
>
()
{
d_
=
d
;
d2_
=
d
*
2
;
act_gate_d2_
=
GetActKernel
<
T
>
(
act_gate
,
d2_
);
act_gate_d_
=
GetActKernel
<
T
>
(
act_gate
,
d
);
act_state_d_
=
GetActKernel
<
T
>
(
act_state
,
d
);
vmul_d_
=
KernelPool
::
Instance
().
template
Get
<
VMulKernel
<
T
>
>
(
d
);
}
void
ComputeH1
(
T
*
gates
,
T
*
ht
)
const
override
{
act_gate_d_
->
Compute
(
gates
,
gates
);
act_state_d_
->
Compute
(
gates
+
d2_
,
gates
+
d2_
);
vmul_d_
->
Compute
(
gates
,
gates
+
d2_
,
ht
);
}
void
ComputeHtPart1
(
T
*
gates
,
const
T
*
ht_1
,
T
*
ht
)
const
override
{
// W: {W_update, W_reset; W_state}
act_gate_d2_
->
Compute
(
gates
,
gates
);
vmul_d_
->
Compute
(
ht_1
,
gates
+
d_
,
ht
);
}
void
ComputeHtPart2
(
T
*
gates
,
const
T
*
ht_1
,
T
*
ht
)
const
override
{
T
*
y
=
gates
+
d2_
;
act_state_d_
->
Compute
(
y
,
y
);
// out = zt*ht~ + (1-zt)*ht_1
for
(
int
i
=
0
;
i
<
d_
;
++
i
)
{
ht
[
i
]
=
gates
[
i
]
*
y
[
i
]
+
(
static_cast
<
T
>
(
1
)
-
gates
[
i
])
*
ht_1
[
i
];
}
}
private:
int
d_
,
d2_
;
std
::
shared_ptr
<
const
VActKernel
<
T
>>
act_gate_d2_
,
act_gate_d_
,
act_state_d_
;
std
::
shared_ptr
<
const
VMulKernel
<
T
>>
vmul_d_
;
#ifdef __AVX__
std
::
unique_ptr
<
const
AVXAct
>
avx_act_gate_
,
avx_act_state_
;
#endif
};
#define INTRI8_FLOAT(isa) \
template <> \
GRUKernelImpl<float, isa, kEQ8>::GRUKernelImpl( \
const std::string& act_gate, const std::string& act_state, int d) \
: GRUKernel<float>() { \
avx_act_gate_ = GetAVXAct<isa>(act_gate); \
avx_act_state_ = GetAVXAct<isa>(act_state); \
} \
template <> \
void GRUKernelImpl<float, isa, kEQ8>::ComputeH1(float* gates, float* ht) \
const { \
__m256 u, s; \
/* W: {W_update, W_reset; W_state} */
\
u = _mm256_loadu_ps(gates); \
s = _mm256_loadu_ps(gates + 16); \
s = _mm256_mul_ps(avx_act_gate_->Compute(u), avx_act_state_->Compute(s)); \
_mm256_storeu_ps(ht, s); \
} \
template <> \
void GRUKernelImpl<float, isa, kEQ8>::ComputeHtPart1( \
float* gates, const float* ht_1, float* ht) const { \
/* not exactly equal the any implementation */
\
__m256 r, ht0; \
r = _mm256_loadu_ps(gates + 8); \
ht0 = _mm256_loadu_ps(ht_1); \
r = _mm256_mul_ps(avx_act_gate_->Compute(r), ht0); \
_mm256_storeu_ps(ht, r); \
} \
template <> \
void GRUKernelImpl<float, isa, kEQ8>::ComputeHtPart2( \
float* gates, const float* ht_1, float* ht) const { \
/* not exactly equal the any implementation */
\
__m256 u, s, ht0; \
u = _mm256_loadu_ps(gates); \
s = _mm256_loadu_ps(gates + 16); \
ht0 = _mm256_loadu_ps(ht_1); \
u = avx_act_gate_->Compute(u); \
s = _mm256_mul_ps(u, avx_act_state_->Compute(s)); \
u = _mm256_sub_ps(_mm256_set1_ps(1.f), u); \
u = _mm256_mul_ps(u, ht0); \
u = _mm256_add_ps(s, u); \
_mm256_storeu_ps(ht, u); \
}
#ifdef __AVX__
INTRI8_FLOAT
(
jit
::
avx
);
#endif
#ifdef __AVX2__
INTRI8_FLOAT
(
jit
::
avx2
);
#endif
#ifdef __AVX512F__
INTRI8_FLOAT
(
jit
::
avx512f
);
#endif
#define JITKERNEL_DECLARE_GRU(ker_class, ker_dtype) \
template <> \
std::shared_ptr<const GRUKernel<ker_dtype>> KernelPool::Get< \
GRUKernel<ker_dtype>, const std::string&, const std::string&, int>( \
const std::string& act_gate, const std::string& act_state, int d)
#define JITKERNEL_KEY_GRU(ker_key, dtype_key) \
#ker_key #dtype_key + std::to_string(d) + act_gate + act_state
#define JITKERNEL_NEW_GRU_IMPL(ker, dtype, isa, k) \
p = std::dynamic_pointer_cast<ker<dtype>>( \
std::make_shared<ker##Impl<dtype, isa, k>>(act_gate, act_state, d));
REGISTER_JITKERNEL_ARGS
(
gru
,
GRUKernel
,
JITKERNEL_DECLARE_GRU
,
JITKERNEL_KEY_GRU
,
JITKERNEL_NEW_GRU_IMPL
);
#undef INTRI8_FLOAT
#undef JITKERNEL_NEW_GRU_IMPL
#undef JITKERNEL_KEY_GRU
#undef JITKERNEL_DECLARE_GRU
}
// namespace jitkernel
}
// namespace math
}
// namespace operators
...
...
paddle/fluid/operators/top_k_op.cu
浏览文件 @
ea2bdd19
...
...
@@ -262,31 +262,31 @@ __global__ void KeMatrixTopK(T* output, int output_stride, int64_t* indices,
const
T
*
src
,
int
lds
,
int
dim
,
int
k
,
int
grid_dim
,
int
num
)
{
__shared__
Pair
<
T
>
sh_topk
[
BlockSize
];
__shared__
int
maxid
[
BlockSize
/
2
];
const
int
tid
=
threadIdx
.
x
;
const
int
warp
=
threadIdx
.
x
/
32
;
const
int
bid
=
blockIdx
.
x
;
for
(
int
i
=
bid
;
i
<
num
;
i
+=
grid_dim
)
{
output
+=
i
*
output_stride
;
indices
+=
i
*
k
;
int
top_num
=
k
;
__shared__
int
maxid
[
BlockSize
/
2
];
T
*
out
=
output
+
i
*
output_stride
;
int64_t
*
inds
=
indices
+
i
*
k
;
Pair
<
T
>
topk
[
MaxLength
];
int
beam
=
MaxLength
;
Pair
<
T
>
max
;
bool
is_empty
=
false
;
bool
firststep
=
true
;
for
(
int
k
=
0
;
k
<
MaxLength
;
k
++
)
{
topk
[
k
].
set
(
-
INFINITY
,
-
1
);
for
(
int
j
=
0
;
j
<
MaxLength
;
j
++
)
{
topk
[
j
].
set
(
-
INFINITY
,
-
1
);
}
while
(
k
)
{
while
(
top_num
)
{
ThreadGetTopK
<
T
,
MaxLength
,
BlockSize
>
(
topk
,
&
beam
,
k
,
src
+
i
*
lds
,
&
firststep
,
&
is_empty
,
&
max
,
dim
,
tid
);
sh_topk
[
tid
]
=
topk
[
0
];
BlockReduce
<
T
,
MaxLength
,
BlockSize
>
(
sh_topk
,
maxid
,
topk
,
&
out
put
,
&
indices
,
&
beam
,
&
k
,
tid
,
warp
);
BlockReduce
<
T
,
MaxLength
,
BlockSize
>
(
sh_topk
,
maxid
,
topk
,
&
out
,
&
inds
,
&
beam
,
&
top_num
,
tid
,
warp
);
}
}
}
...
...
@@ -327,13 +327,15 @@ class TopkOpCUDAKernel : public framework::OpKernel<T> {
size_t
k
=
static_cast
<
int
>
(
ctx
.
Attr
<
int
>
(
"k"
));
const
T
*
input_data
=
input
->
data
<
T
>
();
T
*
output_data
=
output
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
// FIXME(typhoonzero): data is always converted to type T?
int64_t
*
indices_data
=
indices
->
mutable_data
<
int64_t
>
(
ctx
.
GetPlace
());
size_t
input_height
=
input
->
dims
()[
0
];
size_t
input_width
=
input
->
dims
()[
1
];
framework
::
DDim
inputdims
=
input
->
dims
();
const
size_t
input_height
=
framework
::
product
(
framework
::
slice_ddim
(
inputdims
,
0
,
inputdims
.
size
()
-
1
));
const
size_t
input_width
=
inputdims
[
inputdims
.
size
()
-
1
];
if
(
k
>
input_width
)
k
=
input_width
;
// NOTE: pass lds and dim same to input width.
...
...
@@ -342,14 +344,12 @@ class TopkOpCUDAKernel : public framework::OpKernel<T> {
const
int
kMaxHeight
=
2048
;
int
gridx
=
input_height
<
kMaxHeight
?
input_height
:
kMaxHeight
;
auto
&
dev_ctx
=
ctx
.
cuda_device_context
();
switch
(
GetDesiredBlockDim
(
input_width
))
{
FIXED_BLOCK_DIM
(
KeMatrixTopK
<
T
,
5
,
kBlockDim
><<<
gridx
,
kBlockDim
,
0
,
dev_ctx
.
stream
()
>>>
(
output_data
,
output
->
dims
()[
1
],
indices_data
,
input_data
,
input_width
,
input_width
,
static_cast
<
int
>
(
k
),
gridx
,
input_height
));
output_data
,
k
,
indices_data
,
input_data
,
input_width
,
input_width
,
static_cast
<
int
>
(
k
),
gridx
,
input_height
));
default:
PADDLE_THROW
(
"Error"
);
}
...
...
paddle/fluid/operators/top_k_op.h
浏览文件 @
ea2bdd19
...
...
@@ -34,7 +34,6 @@ class TopkKernel : public framework::OpKernel<T> {
public:
void
Compute
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
// Get the top k elements of each row of input tensor
// FIXME: only deal with matrix(2d tensor).
auto
*
input
=
ctx
.
Input
<
Tensor
>
(
"X"
);
auto
*
output
=
ctx
.
Output
<
Tensor
>
(
"Out"
);
auto
*
indices
=
ctx
.
Output
<
Tensor
>
(
"Indices"
);
...
...
@@ -44,8 +43,6 @@ class TopkKernel : public framework::OpKernel<T> {
T
*
output_data
=
output
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
int64_t
*
indices_data
=
indices
->
mutable_data
<
int64_t
>
(
ctx
.
GetPlace
());
auto
eg_input
=
EigenMatrix
<
T
>::
From
(
*
input
);
// reshape input to a flattern matrix(like flat_inner_dims)
framework
::
DDim
inputdims
=
input
->
dims
();
const
size_t
row
=
framework
::
product
(
...
...
@@ -53,7 +50,7 @@ class TopkKernel : public framework::OpKernel<T> {
const
size_t
col
=
inputdims
[
inputdims
.
size
()
-
1
];
Eigen
::
DSizes
<
int
,
2
>
flat2dims
(
row
,
col
);
// NOTE: eigen shape doesn't affect paddle tensor.
eg_input
.
reshape
(
flat2dims
);
auto
eg_input
=
EigenMatrix
<
T
>::
Reshape
(
*
input
,
inputdims
.
size
()
-
1
);
#ifdef PADDLE_WITH_MKLML
#pragma omp parallel for
...
...
python/paddle/fluid/layers/detection.py
浏览文件 @
ea2bdd19
...
...
@@ -116,8 +116,8 @@ def rpn_target_assign(bbox_pred,
Returns:
tuple:
A tuple(predicted_scores, predicted_location, target_label,
target_bbox
) is returned. The predicted_scores and
predicted_location is the predicted result of the RPN.
target_bbox
, bbox_inside_weight) is returned. The predicted_scores
and
predicted_location is the predicted result of the RPN.
The target_label and target_bbox is the ground truth,
respectively. The predicted_location is a 2D Tensor with shape
[F, 4], and the shape of target_bbox is same as the shape of
...
...
@@ -126,6 +126,8 @@ def rpn_target_assign(bbox_pred,
[F + B, 1], and the shape of target_label is same as the shape
of the predicted_scores, B is the number of the background
anchors, the F and B is depends on the input of this operator.
Bbox_inside_weight represents whether the predicted loc is fake_fg
or not and the shape is [F, 4].
Examples:
.. code-block:: python
...
...
@@ -138,7 +140,7 @@ def rpn_target_assign(bbox_pred,
append_batch_size=False, dtype='float32')
gt_boxes = layers.data(name='gt_boxes', shape=[10, 4],
append_batch_size=False, dtype='float32')
loc_pred, score_pred, loc_target, score_target =
loc_pred, score_pred, loc_target, score_target
, bbox_inside_weight
=
fluid.layers.rpn_target_assign(bbox_pred=bbox_pred,
cls_logits=cls_logits,
anchor_box=anchor_box,
...
...
@@ -152,6 +154,8 @@ def rpn_target_assign(bbox_pred,
target_label
=
helper
.
create_variable_for_type_inference
(
dtype
=
'int32'
)
target_bbox
=
helper
.
create_variable_for_type_inference
(
dtype
=
anchor_box
.
dtype
)
bbox_inside_weight
=
helper
.
create_variable_for_type_inference
(
dtype
=
anchor_box
.
dtype
)
helper
.
append_op
(
type
=
"rpn_target_assign"
,
inputs
=
{
...
...
@@ -164,7 +168,8 @@ def rpn_target_assign(bbox_pred,
'LocationIndex'
:
loc_index
,
'ScoreIndex'
:
score_index
,
'TargetLabel'
:
target_label
,
'TargetBBox'
:
target_bbox
'TargetBBox'
:
target_bbox
,
'BBoxInsideWeight'
:
bbox_inside_weight
},
attrs
=
{
'rpn_batch_size_per_im'
:
rpn_batch_size_per_im
,
...
...
@@ -179,13 +184,14 @@ def rpn_target_assign(bbox_pred,
score_index
.
stop_gradient
=
True
target_label
.
stop_gradient
=
True
target_bbox
.
stop_gradient
=
True
bbox_inside_weight
.
stop_gradient
=
True
cls_logits
=
nn
.
reshape
(
x
=
cls_logits
,
shape
=
(
-
1
,
1
))
bbox_pred
=
nn
.
reshape
(
x
=
bbox_pred
,
shape
=
(
-
1
,
4
))
predicted_cls_logits
=
nn
.
gather
(
cls_logits
,
score_index
)
predicted_bbox_pred
=
nn
.
gather
(
bbox_pred
,
loc_index
)
return
predicted_cls_logits
,
predicted_bbox_pred
,
target_label
,
target_bbox
return
predicted_cls_logits
,
predicted_bbox_pred
,
target_label
,
target_bbox
,
bbox_inside_weight
def
detection_output
(
loc
,
...
...
python/paddle/fluid/tests/test_detection.py
浏览文件 @
ea2bdd19
...
...
@@ -301,7 +301,7 @@ class TestRpnTargetAssign(unittest.TestCase):
dtype
=
'float32'
,
lod_level
=
1
,
append_batch_size
=
False
)
pred_scores
,
pred_loc
,
tgt_lbl
,
tgt_bbox
=
layers
.
rpn_target_assign
(
pred_scores
,
pred_loc
,
tgt_lbl
,
tgt_bbox
,
bbox_inside_weight
=
layers
.
rpn_target_assign
(
bbox_pred
=
bbox_pred
,
cls_logits
=
cls_logits
,
anchor_box
=
anchor_box
,
...
...
@@ -313,15 +313,18 @@ class TestRpnTargetAssign(unittest.TestCase):
rpn_straddle_thresh
=
0.0
,
rpn_fg_fraction
=
0.5
,
rpn_positive_overlap
=
0.7
,
rpn_negative_overlap
=
0.3
)
rpn_negative_overlap
=
0.3
,
use_random
=
False
)
self
.
assertIsNotNone
(
pred_scores
)
self
.
assertIsNotNone
(
pred_loc
)
self
.
assertIsNotNone
(
tgt_lbl
)
self
.
assertIsNotNone
(
tgt_bbox
)
self
.
assertIsNotNone
(
bbox_inside_weight
)
assert
pred_scores
.
shape
[
1
]
==
1
assert
pred_loc
.
shape
[
1
]
==
4
assert
pred_loc
.
shape
[
1
]
==
tgt_bbox
.
shape
[
1
]
print
(
str
(
program
))
class
TestGenerateProposals
(
unittest
.
TestCase
):
...
...
python/paddle/fluid/tests/unittests/test_dist_mnist.py
浏览文件 @
ea2bdd19
...
...
@@ -40,7 +40,8 @@ class TestDistMnistAsync(TestDistBase):
self
.
_sync_mode
=
False
self
.
_use_reduce
=
False
def
test_dist_train
(
self
):
# FIXME(typhoonzero): fix async mode test later
def
no_test_dist_train
(
self
):
self
.
check_with_place
(
"dist_mnist.py"
,
delta
=
200
)
...
...
python/paddle/fluid/tests/unittests/test_dist_se_resnext.py
浏览文件 @
ea2bdd19
...
...
@@ -40,7 +40,8 @@ class TestDistSeResneXt2x2Async(TestDistBase):
self
.
_sync_mode
=
False
self
.
_use_reader_alloc
=
False
def
test_dist_train
(
self
):
#FIXME(typhoonzero): fix async mode later
def
no_test_dist_train
(
self
):
self
.
check_with_place
(
"dist_se_resnext.py"
,
delta
=
100
)
...
...
python/paddle/fluid/tests/unittests/test_dist_simnet_bow.py
浏览文件 @
ea2bdd19
...
...
@@ -42,7 +42,8 @@ class TestDistSimnetBow2x2DenseAsync(TestDistBase):
self
.
_sync_mode
=
False
self
.
_enforce_place
=
"CPU"
def
test_simnet_bow
(
self
):
#FIXME(typhoonzero): fix async tests later
def
no_test_simnet_bow
(
self
):
need_envs
=
{
"IS_DISTRIBUTED"
:
'0'
,
"IS_SPARSE"
:
'0'
,
...
...
@@ -78,7 +79,8 @@ class TestDistSimnetBow2x2SparseAsync(TestDistBase):
self
.
_sync_mode
=
False
self
.
_enforce_place
=
"CPU"
def
test_simnet_bow
(
self
):
#FIXME(typhoonzero): fix async tests later
def
no_test_simnet_bow
(
self
):
need_envs
=
{
"IS_DISTRIBUTED"
:
'0'
,
"IS_SPARSE"
:
'1'
,
...
...
python/paddle/fluid/tests/unittests/test_fusion_gru_op.py
浏览文件 @
ea2bdd19
...
...
@@ -125,6 +125,12 @@ class TestFusionGRUOpMD2(TestFusionGRUOp):
self
.
D
=
8
class
TestFusionGRUOpMD3
(
TestFusionGRUOp
):
def
set_confs
(
self
):
self
.
M
=
17
self
.
D
=
15
class
TestFusionGRUOpBS1
(
TestFusionGRUOp
):
def
set_confs
(
self
):
self
.
lod
=
[[
3
]]
...
...
python/paddle/fluid/tests/unittests/test_rpn_target_assign_op.py
浏览文件 @
ea2bdd19
...
...
@@ -50,8 +50,10 @@ def rpn_target_assign(anchor_by_gt_overlap,
fg_inds
,
size
=
(
len
(
fg_inds
)
-
num_fg
),
replace
=
False
)
else
:
disable_inds
=
fg_inds
[
num_fg
:]
labels
[
disable_inds
]
=
-
1
fg_inds
=
np
.
where
(
labels
==
1
)[
0
]
bbox_inside_weight
=
np
.
zeros
((
len
(
fg_inds
),
4
),
dtype
=
np
.
float32
)
num_bg
=
rpn_batch_size_per_im
-
np
.
sum
(
labels
==
1
)
bg_inds
=
np
.
where
(
anchor_to_gt_max
<
rpn_negative_overlap
)[
0
]
...
...
@@ -59,18 +61,27 @@ def rpn_target_assign(anchor_by_gt_overlap,
enable_inds
=
bg_inds
[
np
.
random
.
randint
(
len
(
bg_inds
),
size
=
num_bg
)]
else
:
enable_inds
=
bg_inds
[:
num_bg
]
fg_fake_inds
=
np
.
array
([],
np
.
int32
)
fg_value
=
np
.
array
([
fg_inds
[
0
]],
np
.
int32
)
fake_num
=
0
for
bg_id
in
enable_inds
:
if
bg_id
in
fg_inds
:
fake_num
+=
1
fg_fake_inds
=
np
.
hstack
([
fg_fake_inds
,
fg_value
])
labels
[
enable_inds
]
=
0
bbox_inside_weight
[
fake_num
:,
:]
=
1
fg_inds
=
np
.
where
(
labels
==
1
)[
0
]
bg_inds
=
np
.
where
(
labels
==
0
)[
0
]
loc_index
=
fg_inds
score_index
=
np
.
hstack
((
fg_inds
,
bg_inds
))
loc_index
=
np
.
hstack
([
fg_fake_inds
,
fg_inds
])
score_index
=
np
.
hstack
([
fg_inds
,
bg_inds
])
labels
=
labels
[
score_index
]
assert
not
np
.
any
(
labels
==
-
1
),
"Wrong labels with -1"
gt_inds
=
anchor_to_gt_argmax
[
fg_inds
]
gt_inds
=
anchor_to_gt_argmax
[
loc_index
]
return
loc_index
,
score_index
,
labels
,
gt_inds
return
loc_index
,
score_index
,
labels
,
gt_inds
,
bbox_inside_weight
def
get_anchor
(
n
,
c
,
h
,
w
):
...
...
@@ -123,9 +134,12 @@ def rpn_target_assign_in_python(all_anchors,
gt_boxes_slice
=
gt_boxes_slice
[
not_crowd_inds
]
iou
=
_bbox_overlaps
(
inside_anchors
,
gt_boxes_slice
)
loc_inds
,
score_inds
,
labels
,
gt_inds
=
rpn_target_assign
(
iou
,
rpn_batch_size_per_im
,
rpn_positive_overlap
,
rpn_negative_overlap
,
rpn_fg_fraction
,
use_random
)
loc_inds
,
score_inds
,
labels
,
gt_inds
,
bbox_inside_weight
=
\
rpn_target_assign
(
iou
,
rpn_batch_size_per_im
,
rpn_positive_overlap
,
rpn_negative_overlap
,
rpn_fg_fraction
,
use_random
)
# unmap to all anchor
loc_inds
=
inds_inside
[
loc_inds
]
score_inds
=
inds_inside
[
score_inds
]
...
...
@@ -139,6 +153,7 @@ def rpn_target_assign_in_python(all_anchors,
score_indexes
=
score_inds
tgt_labels
=
labels
tgt_bboxes
=
box_deltas
bbox_inside_weights
=
bbox_inside_weight
else
:
loc_indexes
=
np
.
concatenate
(
[
loc_indexes
,
loc_inds
+
i
*
anchor_num
])
...
...
@@ -146,8 +161,10 @@ def rpn_target_assign_in_python(all_anchors,
[
score_indexes
,
score_inds
+
i
*
anchor_num
])
tgt_labels
=
np
.
concatenate
([
tgt_labels
,
labels
])
tgt_bboxes
=
np
.
vstack
([
tgt_bboxes
,
box_deltas
])
bbox_inside_weights
=
np
.
vstack
([
bbox_inside_weights
,
\
bbox_inside_weight
])
return
loc_indexes
,
score_indexes
,
tgt_bboxes
,
tgt_labels
return
loc_indexes
,
score_indexes
,
tgt_bboxes
,
tgt_labels
,
bbox_inside_weights
class
TestRpnTargetAssignOp
(
OpTest
):
...
...
@@ -182,9 +199,11 @@ class TestRpnTargetAssignOp(OpTest):
rpn_fg_fraction
=
0.5
use_random
=
False
loc_index
,
score_index
,
tgt_bbox
,
labels
=
rpn_target_assign_in_python
(
all_anchors
,
gt_boxes
,
is_crowd
,
im_info
,
lod
,
rpn_straddle_thresh
,
rpn_batch_size_per_im
,
rpn_positive_overlap
,
rpn_negative_overlap
,
loc_index
,
score_index
,
tgt_bbox
,
labels
,
bbox_inside_weights
=
\
rpn_target_assign_in_python
(
all_anchors
,
gt_boxes
,
is_crowd
,
im_info
,
lod
,
rpn_straddle_thresh
,
rpn_batch_size_per_im
,
rpn_positive_overlap
,
rpn_negative_overlap
,
rpn_fg_fraction
,
use_random
)
labels
=
labels
[:,
np
.
newaxis
]
...
...
@@ -207,7 +226,8 @@ class TestRpnTargetAssignOp(OpTest):
'LocationIndex'
:
loc_index
.
astype
(
'int32'
),
'ScoreIndex'
:
score_index
.
astype
(
'int32'
),
'TargetBBox'
:
tgt_bbox
.
astype
(
'float32'
),
'TargetLabel'
:
labels
.
astype
(
'int32'
)
'TargetLabel'
:
labels
.
astype
(
'int32'
),
'BBoxInsideWeight'
:
bbox_inside_weights
.
astype
(
'float32'
)
}
def
test_check_output
(
self
):
...
...
python/paddle/fluid/tests/unittests/test_top_k_op.py
浏览文件 @
ea2bdd19
...
...
@@ -21,22 +21,27 @@ from op_test import OpTest
class
TestTopkOp
(
OpTest
):
def
setUp
(
self
):
self
.
set_args
()
self
.
op_type
=
"top_k"
k
=
1
input
=
np
.
random
.
random
((
32
,
84
)).
astype
(
"float32"
)
output
=
np
.
ndarray
((
32
,
k
))
indices
=
np
.
ndarray
((
32
,
k
)).
astype
(
"int64"
)
k
=
self
.
top_k
input
=
np
.
random
.
random
((
self
.
row
,
k
)).
astype
(
"float32"
)
output
=
np
.
ndarray
((
self
.
row
,
k
))
indices
=
np
.
ndarray
((
self
.
row
,
k
)).
astype
(
"int64"
)
self
.
inputs
=
{
'X'
:
input
}
self
.
attrs
=
{
'k'
:
k
}
for
rowid
in
range
(
32
):
for
rowid
in
range
(
self
.
row
):
row
=
input
[
rowid
]
output
[
rowid
]
=
np
.
sort
(
row
)[
-
k
:
]
indices
[
rowid
]
=
row
.
argsort
()[
-
k
:
]
output
[
rowid
]
=
np
.
sort
(
row
)[
::
-
1
][:
k
]
indices
[
rowid
]
=
row
.
argsort
()[
::
-
1
][:
k
]
self
.
outputs
=
{
'Out'
:
output
,
'Indices'
:
indices
}
def
set_args
(
self
):
self
.
row
=
32
self
.
top_k
=
1
def
test_check_output
(
self
):
self
.
check_output
()
...
...
@@ -50,14 +55,39 @@ class TestTopkOp3d(OpTest):
output
=
np
.
ndarray
((
64
,
k
))
indices
=
np
.
ndarray
((
64
,
k
)).
astype
(
"int64"
)
# FIXME: should use 'X': input for a 3d input
self
.
inputs
=
{
'X'
:
input_flat_2d
}
self
.
inputs
=
{
'X'
:
input
}
self
.
attrs
=
{
'k'
:
k
}
for
rowid
in
range
(
64
):
row
=
input_flat_2d
[
rowid
]
output
[
rowid
]
=
np
.
sort
(
row
)[
-
k
:]
indices
[
rowid
]
=
row
.
argsort
()[
-
k
:]
output
[
rowid
]
=
np
.
sort
(
row
)[::
-
1
][:
k
]
indices
[
rowid
]
=
row
.
argsort
()[::
-
1
][:
k
]
self
.
outputs
=
{
'Out'
:
output
.
reshape
((
32
,
2
,
k
)),
'Indices'
:
indices
.
reshape
((
32
,
2
,
k
))
}
def
test_check_output
(
self
):
self
.
check_output
()
class
TestTopkOp2
(
OpTest
):
def
setUp
(
self
):
self
.
op_type
=
"top_k"
k
=
1
m
=
2056
input
=
np
.
random
.
random
((
m
,
84
)).
astype
(
"float32"
)
output
=
np
.
ndarray
((
m
,
k
))
indices
=
np
.
ndarray
((
m
,
k
)).
astype
(
"int64"
)
self
.
inputs
=
{
'X'
:
input
}
self
.
attrs
=
{
'k'
:
k
}
for
rowid
in
range
(
m
):
row
=
input
[
rowid
]
output
[
rowid
]
=
-
np
.
sort
(
-
row
)[:
k
]
indices
[
rowid
]
=
(
-
row
).
argsort
()[:
k
]
self
.
outputs
=
{
'Out'
:
output
,
'Indices'
:
indices
}
...
...
@@ -65,5 +95,17 @@ class TestTopkOp3d(OpTest):
self
.
check_output
()
class
TestTopkOp3
(
TestTopkOp
):
def
set_args
(
self
):
self
.
row
=
2056
self
.
top_k
=
3
class
TestTopkOp4
(
TestTopkOp
):
def
set_args
(
self
):
self
.
row
=
40000
self
.
top_k
=
1
if
__name__
==
"__main__"
:
unittest
.
main
()
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录