Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
Paddle-Lite
提交
7f6e8c61
P
Paddle-Lite
项目概览
PaddlePaddle
/
Paddle-Lite
通知
331
Star
4
Fork
1
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
271
列表
看板
标记
里程碑
合并请求
78
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle-Lite
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
271
Issue
271
列表
看板
标记
里程碑
合并请求
78
合并请求
78
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
7f6e8c61
编写于
8月 22, 2019
作者:
qnqinan
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
update proposal and psroipool kernel file in FPGA V2 track
上级
411b24e3
变更
6
隐藏空白更改
内联
并排
Showing
6 changed file
with
98 addition
and
127 deletion
+98
-127
mobile/src/fpga/V2/api.cpp
mobile/src/fpga/V2/api.cpp
+2
-2
mobile/src/operators/kernel/fpga/V2/proposal_kernel.cpp
mobile/src/operators/kernel/fpga/V2/proposal_kernel.cpp
+41
-92
mobile/src/operators/kernel/fpga/V2/psroi_pool_kernel.cpp
mobile/src/operators/kernel/fpga/V2/psroi_pool_kernel.cpp
+15
-28
mobile/src/operators/kernel/fpga/V2/reshape2_kernel.cpp
mobile/src/operators/kernel/fpga/V2/reshape2_kernel.cpp
+35
-1
mobile/src/operators/kernel/fpga/V2/sigmoid_kernel.cpp
mobile/src/operators/kernel/fpga/V2/sigmoid_kernel.cpp
+3
-4
mobile/src/operators/kernel/fpga/V2/softmax_kernel.cpp
mobile/src/operators/kernel/fpga/V2/softmax_kernel.cpp
+2
-0
未找到文件。
mobile/src/fpga/V2/api.cpp
浏览文件 @
7f6e8c61
...
...
@@ -359,7 +359,7 @@ void expand_conv_arg(ConvArgs *arg) {
if
(((
res_win
%
2
)
!=
0
)
&&
(
res_win
!=
1
))
{
res_win
=
res_win
-
1
;
}
PADDLE_MOBILE_ENFORCE
(
res_win
>=
2
,
"window too bigger than fpga volume"
);
//
PADDLE_MOBILE_ENFORCE(res_win >= 2, "window too bigger than fpga volume");
res_fit
=
res_win
;
auto
block_num
=
(
output_width
+
res_fit
-
1
)
/
res_fit
;
...
...
@@ -885,7 +885,7 @@ void fill_dwconv_arg(struct DWconvArgs *arg, framework::Tensor *input,
int
padding_h
,
int
padding_w
,
float
*
bias_ptr
)
{
auto
filter_ptr
=
filter
->
data
<
int16_t
>
();
auto
input_ptr
=
input
->
data
<
int8_t
>
();
auto
output_ptr
=
out
->
mutable_
data
<
int8_t
>
();
auto
output_ptr
=
out
->
data
<
int8_t
>
();
arg
->
sub_conv_num
=
1
;
arg
->
relu_enabled
=
relu_enabled
;
// arg->output.activation.activation_type = activation_enable;
...
...
mobile/src/operators/kernel/fpga/V2/proposal_kernel.cpp
浏览文件 @
7f6e8c61
...
...
@@ -30,16 +30,12 @@ bool ProposalKernel<FPGA, float>::Init(ProposalParam<FPGA> *param) {
int64_t
batch
=
param
->
scores_
->
dims
()[
0
];
auto
total
=
post_nms_top_n
*
batch
;
param
->
rpn_rois_
->
mutable_data
<
float
>
({
total
,
4
});
param
->
rpn_probs_
->
mutable_data
<
floa
t
>
({
total
,
1
});
param
->
rpn_probs_
->
mutable_data
<
int8_
t
>
({
total
,
1
});
param
->
float_bbox
=
std
::
make_shared
<
Tensor
>
();
param
->
float_bbox
->
Resize
(
param
->
bbox_deltas_
->
dims
());
param
->
float_bbox
->
init
(
type_id
<
float
>
().
hash_code
());
fpga
::
format_fp32_ofm
(
param
->
float_bbox
.
get
());
param
->
float_score
=
std
::
make_shared
<
Tensor
>
();
param
->
float_score
->
Resize
(
param
->
scores_
->
dims
());
param
->
float_score
->
init
(
type_id
<
float
>
().
hash_code
());
fpga
::
format_fp32_ofm
(
param
->
float_score
.
get
());
auto
input
=
param
->
scores_
;
param
->
score_index_
=
std
::
make_shared
<
Tensor
>
();
...
...
@@ -87,8 +83,8 @@ void AppendProposals(Tensor *dst, int64_t offset, const Tensor &src) {
}
template
<
class
T
>
static
inline
void
BoxCoder
(
Tensor
*
all_anchors
,
Tensor
*
bbox_deltas
,
Tensor
*
variances
,
Tensor
*
proposals
)
{
static
inline
void
BoxCoder
(
Tensor
*
all_anchors
,
Tensor
*
bbox_deltas
,
Tensor
*
proposals
)
{
T
*
proposals_data
=
proposals
->
mutable_data
<
T
>
();
int64_t
row
=
all_anchors
->
dims
()[
0
];
...
...
@@ -96,10 +92,6 @@ static inline void BoxCoder(Tensor *all_anchors, Tensor *bbox_deltas,
auto
*
bbox_deltas_data
=
bbox_deltas
->
data
<
T
>
();
auto
*
anchor_data
=
all_anchors
->
data
<
T
>
();
const
T
*
variances_data
=
nullptr
;
if
(
variances
)
{
variances_data
=
variances
->
data
<
T
>
();
}
for
(
int64_t
i
=
0
;
i
<
row
;
++
i
)
{
T
anchor_width
=
anchor_data
[
i
*
len
+
2
]
-
anchor_data
[
i
*
len
]
+
1.0
;
...
...
@@ -244,10 +236,10 @@ static inline Tensor NMS(Tensor *bbox, Tensor *scores, T nms_threshold,
// 4: [xmin ymin xmax ymax]
int64_t
box_size
=
bbox
->
dims
()[
1
];
std
::
vector
<
T
>
scores_data
(
num_boxes
);
std
::
copy_n
(
scores
->
data
<
T
>
(),
num_boxes
,
scores_data
.
begin
());
std
::
vector
<
std
::
pair
<
T
,
int
>>
sorted_indices
=
GetSortedScoreIndex
<
T
>
(
scores_data
);
std
::
vector
<
int8_t
>
scores_data
(
num_boxes
);
std
::
copy_n
(
scores
->
data
<
int8_t
>
(),
num_boxes
,
scores_data
.
begin
());
std
::
vector
<
std
::
pair
<
int8_t
,
int
>>
sorted_indices
=
GetSortedScoreIndex
<
int8_t
>
(
scores_data
);
std
::
vector
<
int
>
selected_indices
;
int
selected_num
=
0
;
...
...
@@ -284,8 +276,7 @@ std::pair<Tensor, Tensor> ProposalForOneImage(
const
Tensor
&
scores_slice
,
// [N, 1]
const
Tensor
&
score_index
,
int
pre_nms_top_n
,
int
post_nms_top_n
,
float
nms_thresh
,
float
min_size
,
float
eta
)
{
auto
*
scores_data
=
scores_slice
.
data
<
T
>
();
auto
*
scores_data
=
scores_slice
.
data
<
int8_t
>
();
// Sort index
Tensor
index_t
;
index_t
.
Resize
({
scores_slice
.
numel
()});
...
...
@@ -306,17 +297,17 @@ std::pair<Tensor, Tensor> ProposalForOneImage(
}
Tensor
scores_sel
,
bbox_sel
,
anchor_sel
,
var_sel
;
scores_sel
.
mutable_data
<
T
>
({
index_t
.
numel
(),
1
});
scores_sel
.
mutable_data
<
int8_t
>
({
index_t
.
numel
(),
1
});
bbox_sel
.
mutable_data
<
T
>
({
index_t
.
numel
(),
4
});
anchor_sel
.
mutable_data
<
T
>
({
index_t
.
numel
(),
4
});
var_sel
.
mutable_data
<
T
>
({
index_t
.
numel
(),
4
});
CPUGather
<
T
>
(
scores_slice
,
index_t
,
&
scores_sel
);
CPUGather
<
int8_t
>
(
scores_slice
,
index_t
,
&
scores_sel
);
CPUGather
<
T
>
(
bbox_deltas_slice
,
index_t
,
&
bbox_sel
);
CPUGather
<
T
>
(
anchors
,
index_t
,
&
anchor_sel
);
Tensor
proposals
;
proposals
.
mutable_data
<
T
>
({
index_t
.
numel
(),
4
});
BoxCoder
<
T
>
(
&
anchor_sel
,
&
bbox_sel
,
nullptr
,
&
proposals
);
BoxCoder
<
T
>
(
&
anchor_sel
,
&
bbox_sel
,
&
proposals
);
ClipTiledBoxes
<
T
>
(
im_info_slice
,
&
proposals
);
...
...
@@ -325,10 +316,10 @@ std::pair<Tensor, Tensor> ProposalForOneImage(
Tensor
scores_filter
;
bbox_sel
.
mutable_data
<
T
>
({
keep
.
numel
(),
4
});
scores_filter
.
mutable_data
<
T
>
({
keep
.
numel
(),
1
});
scores_filter
.
mutable_data
<
int8_t
>
({
keep
.
numel
(),
1
});
CPUGather
<
T
>
(
proposals
,
keep
,
&
bbox_sel
);
CPUGather
<
T
>
(
scores_sel
,
keep
,
&
scores_filter
);
CPUGather
<
int8_t
>
(
scores_sel
,
keep
,
&
scores_filter
);
if
(
nms_thresh
<=
0
)
{
return
std
::
make_pair
(
bbox_sel
,
scores_filter
);
}
...
...
@@ -341,10 +332,10 @@ std::pair<Tensor, Tensor> ProposalForOneImage(
}
proposals
.
mutable_data
<
T
>
({
keep_nms
.
numel
(),
4
});
// original
scores_sel
.
mutable_data
<
T
>
({
keep_nms
.
numel
(),
1
});
// original
scores_sel
.
mutable_data
<
int8_t
>
({
keep_nms
.
numel
(),
1
});
// original
CPUGather
<
T
>
(
bbox_sel
,
keep_nms
,
&
proposals
);
CPUGather
<
T
>
(
scores_filter
,
keep_nms
,
&
scores_sel
);
CPUGather
<
int8_t
>
(
scores_filter
,
keep_nms
,
&
scores_sel
);
return
std
::
make_pair
(
proposals
,
scores_sel
);
}
...
...
@@ -368,69 +359,41 @@ void ProposalKernel<FPGA, float>::Compute(const ProposalParam<FPGA> &param) {
bbox_height
=
(
uint32_t
)(
input_bbox
->
dims
()[
2
]);
bbox_width
=
(
uint32_t
)(
input_bbox
->
dims
()[
3
]);
std
::
shared_ptr
<
Tensor
>
score_tmp
=
std
::
make_shared
<
Tensor
>
();
score_tmp
->
Resize
(
param
.
scores_
->
dims
());
score_tmp
->
mutable_data
<
int8_t
>
();
std
::
shared_ptr
<
Tensor
>
bbox_tmp
=
std
::
make_shared
<
Tensor
>
();
bbox_tmp
->
Resize
(
param
.
bbox_deltas_
->
dims
());
bbox_tmp
->
mutable_data
<
int8_t
>
();
auto
score_tmp_data
=
score_tmp
->
data
<
int8_t
>
();
auto
bbox_tmp_data
=
bbox_tmp
->
data
<
int8_t
>
();
int64_t
amount_per_side
=
score_width
*
score_height
;
int
idx
=
0
;
int
alignedCW
=
fpga
::
align_to_x
(
score_width
*
score_channels
,
IMAGE_ALIGNMENT
);
int
unalignedCW
=
score_width
*
score_channels
;
fpga
::
fpga_invalidate
(
input_score_data
,
score_height
*
alignedCW
*
sizeof
(
int8_t
));
for
(
int
h
=
0
;
h
<
score_height
;
h
++
)
{
for
(
int
w
=
0
;
w
<
score_width
;
w
++
)
{
for
(
int
c
=
0
;
c
<
score_channels
;
c
++
)
{
if
(
alignedCW
==
unalignedCW
)
{
*
(
score_tmp_data
+
c
*
amount_per_side
+
score_width
*
h
+
w
)
=
(
*
(
input_score_data
++
));
}
else
{
idx
=
h
*
alignedCW
+
w
*
score_channels
+
c
;
*
(
score_tmp_data
+
c
*
amount_per_side
+
score_width
*
h
+
w
)
=
input_score_data
[
idx
];
}
Tensor
score_tensor
=
*
input_score
;
for
(
int
h
=
0
;
h
<
score_height
;
h
++
){
for
(
int
w
=
0
;
w
<
score_width
;
w
++
){
for
(
int
c
=
0
;
c
<
score_channels
;
++
c
)
{
int
dstidx
=
h
*
unalignedCW
+
w
*
score_channels
+
c
;
int
srcidx
=
h
*
alignedCW
+
w
*
score_channels
+
c
;
score_tensor
.
data
<
int8_t
>
()[
dstidx
]
=
input_score_data
[
srcidx
];
}
}
}
amount_per_side
=
bbox_width
*
bbox_height
;
alignedCW
=
fpga
::
align_to_x
(
bbox_width
*
bbox_channels
,
IMAGE_ALIGNMENT
);
unalignedCW
=
bbox_width
*
bbox_channels
;
fpga
::
fpga_invalidate
(
input_bbox_data
,
bbox_height
*
alignedCW
*
sizeof
(
int8_t
));
for
(
int
h
=
0
;
h
<
bbox_height
;
h
++
)
{
for
(
int
w
=
0
;
w
<
bbox_width
;
w
++
)
{
for
(
int
c
=
0
;
c
<
bbox_channels
;
c
++
)
{
if
(
alignedCW
==
unalignedCW
)
{
*
(
bbox_tmp_data
+
c
*
amount_per_side
+
bbox_width
*
h
+
w
)
=
(
*
(
input_bbox_data
++
));
}
else
{
idx
=
h
*
alignedCW
+
w
*
bbox_channels
+
c
;
*
(
bbox_tmp_data
+
c
*
amount_per_side
+
bbox_width
*
h
+
w
)
=
input_bbox_data
[
idx
];
}
}
}
}
auto
score_tensor
=
param
.
float_score
.
get
();
for
(
int
i
=
0
;
i
<
score_height
*
score_width
*
score_channels
;
i
++
)
{
score_tensor
->
data
<
float
>
()[
i
]
=
score_tmp_data
[
i
]
/
127.0
*
input_score
->
scale
[
0
];
}
auto
bbox_tensor
=
param
.
float_bbox
.
get
();
for
(
int
i
=
0
;
i
<
bbox_height
*
bbox_width
*
bbox_channels
;
i
++
)
{
bbox_tensor
->
data
<
float
>
()[
i
]
=
bbox_tmp_data
[
i
]
/
127.0
*
input_bbox
->
scale
[
0
];
for
(
int
h
=
0
;
h
<
bbox_height
;
h
++
){
for
(
int
w
=
0
;
w
<
bbox_width
;
w
++
){
for
(
int
c
=
0
;
c
<
bbox_channels
;
++
c
)
{
int
dstidx
=
h
*
unalignedCW
+
w
*
bbox_channels
+
c
;
int
srcidx
=
h
*
alignedCW
+
w
*
bbox_channels
+
c
;
bbox_tensor
->
data
<
float
>
()[
dstidx
]
=
((
int
)(
input_bbox_data
[
srcidx
]))
/
127.0
*
input_bbox
->
scale
[
0
];
}
}
}
auto
*
scores
=
param
.
float_score
.
get
();
auto
*
bbox_deltas
=
param
.
float_bbox
.
get
();
auto
*
im_info
=
param
.
im_info_
;
auto
anchors
=
*
param
.
anchors_
;
auto
variances
=
*
param
.
variances_
;
...
...
@@ -447,37 +410,23 @@ void ProposalKernel<FPGA, float>::Compute(const ProposalParam<FPGA> &param) {
float
min_size
=
param
.
min_size_
;
float
eta
=
param
.
eta_
;
auto
&
scores_dim
=
scores
->
dims
();
int64_t
num
=
scores_dim
[
0
];
int64_t
c_score
=
scores_dim
[
1
];
int64_t
h_score
=
scores_dim
[
2
];
int64_t
w_score
=
scores_dim
[
3
];
auto
&
bbox_dim
=
bbox_deltas
->
dims
();
int64_t
c_bbox
=
bbox_dim
[
1
];
int64_t
h_bbox
=
bbox_dim
[
2
];
int64_t
w_bbox
=
bbox_dim
[
3
];
//
rpn_rois
->
mutable_data
<
float
>
({
bbox_deltas
->
numel
(),
4
});
rpn_roi_probs
->
mutable_data
<
float
>
({
scores
->
numel
(),
1
});
rpn_rois
->
mutable_data
<
float
>
({
bbox_tensor
->
numel
()
/
4
,
4
});
rpn_roi_probs
->
mutable_data
<
int8_t
>
({
input_score
->
numel
()
/
4
,
1
});
framework
::
LoD
lod
;
lod
.
resize
(
1
);
auto
&
lod0
=
lod
[
0
];
lod0
.
push_back
(
0
);
anchors
.
Resize
({
anchors
.
numel
(),
4
});
variances
.
Resize
({
variances
.
numel
(),
4
});
anchors
.
Resize
({
anchors
.
numel
()
/
4
,
4
});
variances
.
Resize
({
variances
.
numel
()
/
4
,
4
});
int64_t
num_proposals
=
0
;
for
(
int64_t
i
=
0
;
i
<
num
;
++
i
)
{
for
(
int64_t
i
=
0
;
i
<
score_n
;
++
i
)
{
Tensor
im_info_slice
=
im_info
->
Slice
(
i
,
i
+
1
);
Tensor
bbox_deltas_slice
=
(
*
bbox_tensor
).
Slice
(
i
,
i
+
1
);
Tensor
scores_slice
=
(
*
score_tensor
).
Slice
(
i
,
i
+
1
);
bbox_deltas_slice
.
Resize
({
h_bbox
*
w_bbox
*
c_bbox
,
4
});
scores_slice
.
Resize
({
h_score
*
w_score
*
c_score
,
1
});
Tensor
scores_slice
=
score_tensor
.
Slice
(
i
,
i
+
1
);
bbox_deltas_slice
.
Resize
({
bbox_height
*
bbox_width
*
bbox_channels
/
4
,
4
});
scores_slice
.
Resize
({
score_height
*
score_width
*
score_channels
,
1
});
std
::
pair
<
Tensor
,
Tensor
>
tensor_pair
=
ProposalForOneImage
<
float
>
(
im_info_slice
,
anchors
,
variances
,
bbox_deltas_slice
,
scores_slice
,
score_index
,
pre_nms_top_n
,
post_nms_top_n
,
nms_thresh
,
min_size
,
eta
);
...
...
mobile/src/operators/kernel/fpga/V2/psroi_pool_kernel.cpp
浏览文件 @
7f6e8c61
...
...
@@ -44,14 +44,14 @@ bool PSRoiPoolKernel<FPGA, float>::Init(PSRoiPoolParam<FPGA>* param) {
}
template
<
typename
Dtype
>
void
PSROIPoolingForward
(
const
Dtype
*
bottom_data
,
const
int
height
,
void
PSROIPoolingForward
(
const
int8_t
*
bottom_data
,
const
int
height
,
const
int
width
,
const
int
input_channel
,
Dtype
*
top_data
,
const
int
pooled_height
,
const
int
pooled_width
,
const
int
output_channel
,
const
Dtype
*
bottom_rois
,
const
Dtype
Bin_size_h
,
const
Dtype
Bin_size_w
,
const
Dtype
roi_start_h
,
const
Dtype
roi_start_w
,
const
int
pw
,
const
int
ph
,
const
int
roi_batch_ind
)
{
float
scale
,
const
int
roi_batch_ind
)
{
int
hstart
=
floor
(
static_cast
<
Dtype
>
(
ph
)
*
Bin_size_h
+
roi_start_h
);
int
wstart
=
floor
(
static_cast
<
Dtype
>
(
pw
)
*
Bin_size_w
+
roi_start_w
);
int
hend
=
ceil
(
static_cast
<
Dtype
>
(
ph
+
1
)
*
Bin_size_h
+
roi_start_h
);
...
...
@@ -64,11 +64,12 @@ void PSROIPoolingForward(const Dtype* bottom_data, const int height,
wend
=
std
::
min
(
std
::
max
(
wend
,
0
),
width
);
bool
is_empty
=
(
hend
<=
hstart
)
||
(
wend
<=
wstart
);
float
sum_pixels_c
[
output_channel
]
=
{
0
};
float
pixels_c
[
output_channel
]
=
{
0
};
float
avg_pixels_c
[
output_channel
]
=
{
0
};
int
sum_pixels_c
[
output_channel
]
=
{
0
};
int8_t
pixels_c
[
output_channel
]
=
{
0
};
if
(
!
is_empty
)
{
Dtype
bin_area
=
(
hend
-
hstart
)
*
(
wend
-
wstart
);
float
rec_bin_area
=
1
/
bin_area
;
float
scale_fuse
=
scale
/
bin_area
;
for
(
int
h
=
hstart
;
h
<
hend
;
++
h
)
{
for
(
int
w
=
wstart
;
w
<
wend
;
++
w
)
{
...
...
@@ -86,27 +87,21 @@ void PSROIPoolingForward(const Dtype* bottom_data, const int height,
}
}
for
(
int
output_c
=
0
;
output_c
<
output_channel
;
output_c
++
)
{
sum_pixels_c
[
output_c
]
*=
rec_bin_area
;
avg_pixels_c
[
output_c
]
=
sum_pixels_c
[
output_c
]
*
scale_fuse
;
}
}
int
output_index_base
=
(
ph
*
pooled_width
+
pw
)
*
output_channel
;
top_data
+=
output_index_base
;
memcpy
(
top_data
,
sum
_pixels_c
,
output_channel
*
4
);
memcpy
(
top_data
,
avg
_pixels_c
,
output_channel
*
4
);
}
template
<
>
void
PSRoiPoolKernel
<
FPGA
,
float
>::
Compute
(
const
PSRoiPoolParam
<
FPGA
>&
param
)
{
auto
input_tensor
=
param
.
input_x_
;
auto
input_data
=
input_tensor
->
data
<
int8_t
>
();
auto
Si
=
input_tensor
->
scale
[
0
];
auto
float_input_tensor
=
param
.
float_input
.
get
();
auto
float_input_data
=
float_input_tensor
->
data
<
float
>
();
for
(
int
i
=
0
;
i
<
float_input_tensor
->
numel
();
i
++
)
{
float_input_data
[
i
]
=
input_data
[
i
]
/
127.0
*
Si
;
}
auto
*
in
=
float_input_tensor
;
auto
scale
=
input_tensor
->
scale
[
0
]
/
127.0
;
fpga
::
fpga_invalidate
(
input_data
,
input_tensor
->
numel
()
*
sizeof
(
int8_t
));
auto
*
rois
=
param
.
input_rois_
;
auto
*
out
=
param
.
output_
;
...
...
@@ -115,22 +110,19 @@ void PSRoiPoolKernel<FPGA, float>::Compute(const PSRoiPoolParam<FPGA>& param) {
auto
spatial_scale
=
param
.
spatial_scale_
;
auto
output_channels
=
param
.
output_channels_
;
auto
in_dims
=
in
->
dims
();
auto
in_dims
=
in
put_tensor
->
dims
();
int
batch_size
=
in_dims
[
0
];
int
input_channels
=
in_dims
[
1
];
int
height
=
in_dims
[
2
];
int
width
=
in_dims
[
3
];
int
rois_num
=
rois
->
dims
()[
0
];
auto
data_nhwc
=
in
->
mutable_data
<
float
>
();
framework
::
DDim
dims_out_new
=
framework
::
make_ddim
(
{
rois_num
,
(
param
.
output_
)
->
dims
()[
1
],
(((
param
.
output_
)
->
dims
()[
2
])),
(
param
.
output_
)
->
dims
()[
3
]});
(
param
.
output_
)
->
Resize
(
dims_out_new
);
const
float
*
input_data_tmp
=
data_nhwc
;
// in->data<float>();
framework
::
Tensor
rois_batch_id_list
;
rois_batch_id_list
.
Resize
({
rois_num
});
auto
rois_batch_id_data
=
rois_batch_id_list
.
mutable_data
<
int
>
();
...
...
@@ -151,12 +143,7 @@ void PSRoiPoolKernel<FPGA, float>::Compute(const PSRoiPoolParam<FPGA>& param) {
"the channels of input X should equal the product of "
"output_channels x pooled_height x pooled_width"
);
// calculate batch id index for each roi according to LoD
for
(
int
n
=
0
;
n
<
rois_batch_size
;
++
n
)
{
for
(
size_t
i
=
rois_lod
[
n
];
i
<
rois_lod
[
n
+
1
];
++
i
)
{
rois_batch_id_data
[
i
]
=
n
;
}
}
auto
output_data
=
out
->
mutable_data
<
float
>
();
auto
input_rois
=
rois
->
data
<
float
>
();
...
...
@@ -187,10 +174,10 @@ void PSRoiPoolKernel<FPGA, float>::Compute(const PSRoiPoolParam<FPGA>& param) {
for
(
int
ph
=
0
;
ph
<
pooled_height
;
ph
++
)
{
for
(
int
pw
=
0
;
pw
<
pooled_width
;
pw
++
)
{
PSROIPoolingForward
<
float
>
(
input_data
_tmp
,
height
,
width
,
input_channels
,
offset_output_data
,
input_data
,
height
,
width
,
input_channels
,
offset_output_data
,
pooled_height
,
pooled_width
,
output_channels
,
input_rois
,
bin_size_h
,
bin_size_w
,
roi_start_h
,
roi_start_w
,
pw
,
ph
,
roi_batch_ind
);
bin_size_h
,
bin_size_w
,
roi_start_h
,
roi_start_w
,
pw
,
ph
,
scale
,
roi_batch_ind
);
}
}
}
...
...
mobile/src/operators/kernel/fpga/V2/reshape2_kernel.cpp
浏览文件 @
7f6e8c61
...
...
@@ -25,6 +25,7 @@ bool Reshape2Kernel<FPGA, float>::Init(Reshape2Param<FPGA> *param) {
auto
input
=
const_cast
<
LoDTensor
*>
(
param
->
InputX
());
auto
output
=
param
->
Out
();
auto
shape
=
param
->
Shape
();
output
->
scale
[
0
]
=
input
->
scale
[
0
];
auto
num_in
=
framework
::
product
(
input
->
dims
());
auto
num_shape
=
framework
::
product
(
framework
::
make_ddim
(
shape
));
...
...
@@ -92,6 +93,29 @@ void reshape(LoDTensor *input, LoDTensor *output) {
fpga
::
fpga_flush
(
output_ptr
,
Hr
*
WCr_align
*
sizeof
(
int8_t
));
}
// Compares two shapes dimension by dimension.
//
// Returns true when every leading dimension shared by both shapes is equal
// and every extra trailing dimension of the longer shape is exactly 1;
// returns false otherwise. When both shapes have the same number of
// dimensions this reduces to an element-wise equality check.
static inline bool reshape2_judge(const framework::DDim input_dims,
                                  const framework::DDim output_dims) {
  const int in_rank = input_dims.size();
  const int out_rank = output_dims.size();
  const bool input_is_longer = in_rank > out_rank;

  // Copy of whichever shape has more dimensions (output wins on a tie).
  auto longer_dims = input_is_longer ? input_dims : output_dims;
  const int common_rank = input_is_longer ? out_rank : in_rank;

  // The dimensions present in both shapes have to match exactly.
  for (int i = 0; i < common_rank; ++i) {
    if (input_dims[i] != output_dims[i]) {
      return false;
    }
  }

  // Whatever remains in the longer shape may only be size-1 dimensions.
  for (int i = common_rank; i < longer_dims.size(); ++i) {
    if (longer_dims[i] != 1) {
      return false;
    }
  }

  return true;
}
template
<
>
void
Reshape2Kernel
<
FPGA
,
float
>::
Compute
(
const
Reshape2Param
<
FPGA
>
&
param
)
{
auto
input
=
const_cast
<
LoDTensor
*>
(
param
.
InputX
());
...
...
@@ -109,7 +133,17 @@ void Reshape2Kernel<FPGA, float>::Compute(const Reshape2Param<FPGA> &param) {
}
}
output
->
Resize
(
framework
::
make_ddim
(
shape
));
if
(
output
->
dims
()
==
input
->
dims
())
{
auto
input_dims
=
input
->
dims
();
auto
output_dims
=
output
->
dims
();
bool
dims_flags
=
input_dims
==
output_dims
;
bool
dims_flag2
=
true
;
if
(
!
dims_flags
){
dims_flag2
=
reshape2_judge
(
input_dims
,
output_dims
);
}
if
(
dims_flags
||
dims_flag2
)
{
DLOG
<<
"No need to reshape"
;
output
->
ShareDataWith
(
*
input
);
framework
::
LoD
lod
=
input
->
lod
();
...
...
mobile/src/operators/kernel/fpga/V2/sigmoid_kernel.cpp
浏览文件 @
7f6e8c61
...
...
@@ -21,12 +21,11 @@ namespace operators {
template
<
>
bool
SigmoidKernel
<
FPGA
,
float
>::
Init
(
SigmoidParam
<
FPGA
>
*
param
)
{
auto
input
=
const_cast
<
LoDTensor
*>
(
param
->
InputX
());
auto
input_ptr
=
input
->
data
<
int8_t
>
();
paddle_mobile
::
fpga
::
ActivationType
activation_enable
=
paddle_mobile
::
fpga
::
SIGMOID
;
int16_t
leaky_relu_negative_slope
=
fpga
::
fp32_2_fp16
(
input
->
scale
[
0
]
/
127.0
);
int16_t
leaky_relu_negative_slope
=
0
;
auto
input
=
const_cast
<
LoDTensor
*>
(
param
->
InputX
());
auto
input_ptr
=
input
->
data
<
int8_t
>
();
auto
out
=
param
->
Out
();
fpga
::
format_ofm
(
out
);
...
...
mobile/src/operators/kernel/fpga/V2/softmax_kernel.cpp
浏览文件 @
7f6e8c61
...
...
@@ -81,6 +81,7 @@ void SoftmaxKernel<FPGA, float>::Compute(const SoftmaxParam<FPGA> &param) {
auto
w
=
1
;
auto
c
=
1
;
if
(
dims
.
size
()
==
4
)
{
n
=
dims
[
0
];
h
=
dims
[
1
];
w
=
dims
[
2
];
c
=
dims
[
3
];
...
...
@@ -90,6 +91,7 @@ void SoftmaxKernel<FPGA, float>::Compute(const SoftmaxParam<FPGA> &param) {
h
=
1
;
}
}
else
if
(
dims
.
size
()
==
2
)
{
n
=
dims
[
0
];
c
=
dims
[
1
];
}
if
((
c
==
2
)
&&
(
in_x
->
type
()
==
type_id
<
int8_t
>
()))
{
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录