Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
Paddle-Lite
提交
4eb5680a
P
Paddle-Lite
项目概览
PaddlePaddle
/
Paddle-Lite
通知
331
Star
4
Fork
1
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
271
列表
看板
标记
里程碑
合并请求
78
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle-Lite
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
271
Issue
271
列表
看板
标记
里程碑
合并请求
78
合并请求
78
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
4eb5680a
编写于
3月 20, 2019
作者:
Z
zhangyang0701
提交者:
GitHub
3月 20, 2019
浏览文件
操作
浏览文件
下载
差异文件
Merge branch 'develop' into backup
上级
71e83cb3
8c2d98f1
变更
4
隐藏空白更改
内联
并排
Showing
4 changed file
with
253 addition
and
253 deletion
+253
-253
src/operators/kernel/fpga/V1/proposal_kernel.cpp
src/operators/kernel/fpga/V1/proposal_kernel.cpp
+9
-7
src/operators/kernel/fpga/V1/psroi_pool_kernel.cpp
src/operators/kernel/fpga/V1/psroi_pool_kernel.cpp
+109
-86
test/fpga/test_marker.cpp
test/fpga/test_marker.cpp
+78
-120
test/fpga/test_marker_api.cpp
test/fpga/test_marker_api.cpp
+57
-40
未找到文件。
src/operators/kernel/fpga/V1/proposal_kernel.cpp
浏览文件 @
4eb5680a
...
@@ -300,7 +300,7 @@ static inline T JaccardOverlap(const T *box1, const T *box2, bool normalized) {
...
@@ -300,7 +300,7 @@ static inline T JaccardOverlap(const T *box1, const T *box2, bool normalized) {
template
<
class
T
>
template
<
class
T
>
static
inline
Tensor
NMS
(
Tensor
*
bbox
,
Tensor
*
scores
,
T
nms_threshold
,
static
inline
Tensor
NMS
(
Tensor
*
bbox
,
Tensor
*
scores
,
T
nms_threshold
,
float
eta
)
{
float
eta
,
int
post_nms_num
=
100
)
{
int64_t
num_boxes
=
bbox
->
dims
()[
0
];
int64_t
num_boxes
=
bbox
->
dims
()[
0
];
// 4: [xmin ymin xmax ymax]
// 4: [xmin ymin xmax ymax]
int64_t
box_size
=
bbox
->
dims
()[
1
];
int64_t
box_size
=
bbox
->
dims
()[
1
];
...
@@ -314,7 +314,7 @@ static inline Tensor NMS(Tensor *bbox, Tensor *scores, T nms_threshold,
...
@@ -314,7 +314,7 @@ static inline Tensor NMS(Tensor *bbox, Tensor *scores, T nms_threshold,
int
selected_num
=
0
;
int
selected_num
=
0
;
T
adaptive_threshold
=
nms_threshold
;
T
adaptive_threshold
=
nms_threshold
;
const
T
*
bbox_data
=
bbox
->
data
<
T
>
();
const
T
*
bbox_data
=
bbox
->
data
<
T
>
();
while
(
sorted_indices
.
size
()
!=
0
)
{
while
(
(
sorted_indices
.
size
()
!=
0
)
&&
(
selected_num
<
post_nms_num
)
)
{
int
idx
=
sorted_indices
.
back
().
second
;
int
idx
=
sorted_indices
.
back
().
second
;
bool
flag
=
true
;
bool
flag
=
true
;
for
(
int
kept_idx
:
selected_indices
)
{
for
(
int
kept_idx
:
selected_indices
)
{
...
@@ -397,17 +397,19 @@ std::pair<Tensor, Tensor> ProposalForOneImage(
...
@@ -397,17 +397,19 @@ std::pair<Tensor, Tensor> ProposalForOneImage(
return
std
::
make_pair
(
bbox_sel
,
scores_filter
);
return
std
::
make_pair
(
bbox_sel
,
scores_filter
);
}
}
Tensor
keep_nms
=
NMS
<
T
>
(
&
bbox_sel
,
&
scores_filter
,
nms_thresh
,
eta
);
// Tensor keep_nms = NMS<T>(&bbox_sel, &scores_filter, nms_thresh, eta);
Tensor
keep_nms
=
NMS
<
T
>
(
&
bbox_sel
,
&
scores_filter
,
nms_thresh
,
eta
,
post_nms_top_n
);
if
(
post_nms_top_n
>
0
&&
post_nms_top_n
<
keep_nms
.
numel
())
{
if
(
post_nms_top_n
>
0
&&
post_nms_top_n
<
keep_nms
.
numel
())
{
keep_nms
.
Resize
({
post_nms_top_n
});
keep_nms
.
Resize
({
post_nms_top_n
});
}
}
// proposals.mutable_data<T>({keep_nms.numel(), 4});//
original
proposals
.
mutable_data
<
T
>
({
keep_nms
.
numel
(),
4
});
//
original
// scores_sel.mutable_data<T>({keep_nms.numel(), 1});//
original
scores_sel
.
mutable_data
<
T
>
({
keep_nms
.
numel
(),
1
});
//
original
proposals
.
mutable_data
<
T
>
({
post_nms_top_n
,
4
});
// wong
//
proposals.mutable_data<T>({post_nms_top_n, 4}); // wong
scores_sel
.
mutable_data
<
T
>
({
post_nms_top_n
,
1
});
// wong
//
scores_sel.mutable_data<T>({post_nms_top_n, 1}); // wong
CPUGather
<
T
>
(
bbox_sel
,
keep_nms
,
&
proposals
);
CPUGather
<
T
>
(
bbox_sel
,
keep_nms
,
&
proposals
);
CPUGather
<
T
>
(
scores_filter
,
keep_nms
,
&
scores_sel
);
CPUGather
<
T
>
(
scores_filter
,
keep_nms
,
&
scores_sel
);
return
std
::
make_pair
(
proposals
,
scores_sel
);
return
std
::
make_pair
(
proposals
,
scores_sel
);
...
...
src/operators/kernel/fpga/V1/psroi_pool_kernel.cpp
浏览文件 @
4eb5680a
...
@@ -15,7 +15,6 @@ limitations under the License. */
...
@@ -15,7 +15,6 @@ limitations under the License. */
#ifdef PSROI_POOL_OP
#ifdef PSROI_POOL_OP
#include <cmath>
#include <cmath>
#include <memory>
#include <vector>
#include <vector>
#include "operators/kernel/detection_kernel.h"
#include "operators/kernel/detection_kernel.h"
...
@@ -72,16 +71,72 @@ bool PSRoiPoolKernel<FPGA, float>::Init(PSRoiPoolParam<FPGA>* param) {
...
@@ -72,16 +71,72 @@ bool PSRoiPoolKernel<FPGA, float>::Init(PSRoiPoolParam<FPGA>* param) {
return
true
;
return
true
;
}
}
/*
template <typename Dtype>
void PSROIPoolingForward(
const Dtype* bottom_data,
const int height, const int width, const int input_channel,
Dtype* top_data,
const int pooled_height, const int pooled_width, const int output_channel,
const Dtype* bottom_rois,
const Dtype Bin_size_h, const Dtype Bin_size_w, const Dtype roi_start_h,
const Dtype roi_start_w, const int pw, const int ph, const int roi_batch_ind)
{
int hstart = floor(static_cast<Dtype>(ph) * Bin_size_h + roi_start_h);
int wstart = floor(static_cast<Dtype>(pw)* Bin_size_w + roi_start_w);
int hend = ceil(static_cast<Dtype>(ph + 1) * Bin_size_h + roi_start_h);
int wend = ceil(static_cast<Dtype>(pw + 1) * Bin_size_w + roi_start_w);
hstart = std::min(std::max(hstart, 0), height);
hend = std::min(std::max(hend, 0), height);
wstart = std::min(std::max(wstart, 0), width);
wend = std::min(std::max(wend, 0), width);
bool is_empty = (hend <= hstart) || (wend <= wstart);
float32x4_t sum_pixels_low_c= vdupq_n_f32(0);
float32x4_t sum_pixels_high_c= vdupq_n_f32(0);
if(!is_empty){
Dtype bin_area = (hend - hstart) * (wend - wstart);
float rev_bin_area = 1 / bin_area;
float32x4_t q_bin_area = vdupq_n_f32(rev_bin_area);
//static_cast<float>(bin_area) float pixels_c[output_channel];
for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) {
int pixel_offset = (h * width + w) * input_channel;
for(int output_c = 0; output_c < output_channel; output_c++){
int input_channel_offset = output_c * pooled_height *
pooled_width; int input_bias = pixel_offset + input_channel_offset + ph *
pooled_width + pw; pixels_c[output_c] = bottom_data[input_bias];
}
float32x4_t pixel_low_c = vld1q_f32(pixels_c);
float32x4_t pixel_high_c = vld1q_f32(pixels_c + 4);
sum_pixels_low_c = vaddq_f32(sum_pixels_low_c, pixel_low_c);
sum_pixels_high_c = vaddq_f32(sum_pixels_high_c, pixel_high_c);
}
}
sum_pixels_low_c = vmulq_f32(sum_pixels_low_c, q_bin_area);
sum_pixels_high_c = vmulq_f32(sum_pixels_high_c, q_bin_area);
}
int output_index_base = (ph * pooled_width + pw) * output_channel;
top_data += output_index_base;
vst1q_f32(top_data, sum_pixels_low_c);
top_data += 4;
vst1q_f32(top_data, sum_pixels_high_c);
}*/
template
<
typename
Dtype
>
template
<
typename
Dtype
>
void
PSROIPooling
(
const
Dtype
*
bottom_data
,
const
int
channels
,
void
PSROIPoolingForward
(
const
Dtype
*
bottom_data
,
const
int
height
,
const
int
height
,
const
int
width
,
const
int
pooled_height
,
const
int
width
,
const
int
input_channel
,
const
int
pooled_width
,
const
Dtype
*
bottom_rois
,
Dtype
*
top_data
,
const
int
pooled_height
,
const
int
output_dim
,
const
int
group_size
,
Dtype
*
top_data
,
const
int
pooled_width
,
const
int
output_channel
,
int
index
,
int
nid
,
const
Dtype
Bin_size_h
,
const
Dtype
*
bottom_rois
,
const
Dtype
Bin_size_h
,
const
Dtype
Bin_size_w
,
const
Dtype
roi_start_h
,
const
Dtype
Bin_size_w
,
const
Dtype
roi_start_h
,
const
Dtype
roi_start_w
,
const
int
ctop
,
const
int
ph
,
const
Dtype
roi_start_w
,
const
int
pw
,
const
int
ph
,
const
int
roi_batch_ind
)
{
const
int
roi_batch_ind
)
{
int
pw
=
index
;
int
hstart
=
floor
(
static_cast
<
Dtype
>
(
ph
)
*
Bin_size_h
+
roi_start_h
);
int
hstart
=
floor
(
static_cast
<
Dtype
>
(
ph
)
*
Bin_size_h
+
roi_start_h
);
int
wstart
=
floor
(
static_cast
<
Dtype
>
(
pw
)
*
Bin_size_w
+
roi_start_w
);
int
wstart
=
floor
(
static_cast
<
Dtype
>
(
pw
)
*
Bin_size_w
+
roi_start_w
);
int
hend
=
ceil
(
static_cast
<
Dtype
>
(
ph
+
1
)
*
Bin_size_h
+
roi_start_h
);
int
hend
=
ceil
(
static_cast
<
Dtype
>
(
ph
+
1
)
*
Bin_size_h
+
roi_start_h
);
...
@@ -94,60 +149,35 @@ void PSROIPooling(const Dtype* bottom_data, const int channels,
...
@@ -94,60 +149,35 @@ void PSROIPooling(const Dtype* bottom_data, const int channels,
wend
=
std
::
min
(
std
::
max
(
wend
,
0
),
width
);
wend
=
std
::
min
(
std
::
max
(
wend
,
0
),
width
);
bool
is_empty
=
(
hend
<=
hstart
)
||
(
wend
<=
wstart
);
bool
is_empty
=
(
hend
<=
hstart
)
||
(
wend
<=
wstart
);
int
c
=
(
ctop
*
group_size
+
ph
)
*
group_size
+
pw
;
float
sum_pixels_c
[
output_channel
]
=
{
0
};
float
pixels_c
[
output_channel
]
=
{
0
};
Dtype
bin_area
=
(
hend
-
hstart
)
*
(
wend
-
wstart
);
if
(
!
is_empty
)
{
bottom_data
+=
(
roi_batch_ind
*
channels
+
c
)
*
height
*
width
;
Dtype
bin_area
=
(
hend
-
hstart
)
*
(
wend
-
wstart
);
Dtype
out_sum
=
0
;
float
rec_bin_area
=
1
/
bin_area
;
for
(
int
h
=
hstart
;
h
<
hend
;
++
h
)
{
for
(
int
w
=
wstart
;
w
<
wend
;
++
w
)
{
for
(
int
h
=
hstart
;
h
<
hend
;
++
h
)
{
int
bottom_index
=
h
*
width
+
w
;
for
(
int
w
=
wstart
;
w
<
wend
;
++
w
)
{
out_sum
+=
bottom_data
[
bottom_index
];
int
pixel_offset
=
(
h
*
width
+
w
)
*
input_channel
;
}
for
(
int
output_c
=
0
;
output_c
<
output_channel
;
output_c
++
)
{
}
int
input_channel_offset
=
output_c
*
pooled_height
*
pooled_width
;
int
input_bias
=
top_data
[
nid
+
index
]
=
is_empty
?
0.
:
out_sum
/
bin_area
;
pixel_offset
+
input_channel_offset
+
ph
*
pooled_width
+
pw
;
}
pixels_c
[
output_c
]
=
bottom_data
[
input_bias
];
void
convert_to_chw
(
float
**
data_in
,
int
channel
,
int
height
,
int
width
,
int
num
)
{
float
*
data_in_tmp
=
*
data_in
;
float
*
data_tmp
=
reinterpret_cast
<
float
*>
(
fpga
::
fpga_malloc
(
channel
*
height
*
width
*
sizeof
(
float
)));
// NOLINT
int64_t
amount_per_side
=
width
*
height
;
for
(
int
n
=
0
;
n
<
num
;
n
++
)
{
for
(
int
h
=
0
;
h
<
height
;
h
++
)
{
for
(
int
w
=
0
;
w
<
width
;
w
++
)
{
for
(
int
c
=
0
;
c
<
channel
;
c
++
)
{
*
(
data_tmp
+
n
*
height
*
width
*
channel
+
c
*
amount_per_side
+
width
*
h
+
w
)
=
*
((
*
data_in
)
++
);
}
}
}
}
}
*
data_in
=
data_tmp
;
fpga
::
fpga_free
(
data_in_tmp
);
}
void
convert_to_hwc
(
float
**
data_in
,
int
channel
,
int
height
,
int
width
,
for
(
int
output_c
=
0
;
output_c
<
output_channel
;
output_c
++
)
{
int
num
)
{
sum_pixels_c
[
output_c
]
+=
pixels_c
[
output_c
];
float
*
data_in_tmp
=
*
data_in
;
float
*
data_tmp
=
reinterpret_cast
<
float
*>
(
fpga
::
fpga_malloc
(
num
*
channel
*
height
*
width
*
sizeof
(
float
)));
int64_t
amount_per_row
=
width
*
channel
;
for
(
int
n
=
0
;
n
<
num
;
n
++
)
{
for
(
int
c
=
0
;
c
<
channel
;
c
++
)
{
for
(
int
h
=
0
;
h
<
height
;
h
++
)
{
int64_t
offset_height
=
h
*
amount_per_row
;
for
(
int
w
=
0
;
w
<
width
;
w
++
)
{
*
(
data_tmp
+
n
*
channel
*
height
*
width
+
offset_height
+
w
*
channel
+
c
)
=
*
((
*
data_in
)
++
);
}
}
}
}
}
}
for
(
int
output_c
=
0
;
output_c
<
output_channel
;
output_c
++
)
{
sum_pixels_c
[
output_c
]
*=
rec_bin_area
;
}
}
}
*
data_in
=
data_tmp
;
fpga
::
fpga_free
(
data_in_tmp
);
int
output_index_base
=
(
ph
*
pooled_width
+
pw
)
*
output_channel
;
top_data
+=
output_index_base
;
memcpy
(
top_data
,
sum_pixels_c
,
output_channel
*
4
);
}
}
template
<
>
template
<
>
...
@@ -174,14 +204,15 @@ void PSRoiPoolKernel<FPGA, float>::Compute(const PSRoiPoolParam<FPGA>& param) {
...
@@ -174,14 +204,15 @@ void PSRoiPoolKernel<FPGA, float>::Compute(const PSRoiPoolParam<FPGA>& param) {
int
rois_num
=
rois
->
dims
()[
0
];
int
rois_num
=
rois
->
dims
()[
0
];
auto
data_nhwc
=
in
->
mutable_data
<
float
>
();
auto
data_nhwc
=
in
->
mutable_data
<
float
>
();
fpga
::
image
::
convert_to_chw
(
&
data_nhwc
,
input_channels
,
height
,
width
,
1
);
// fpga::image::convert_to_chw(&data_nhwc, input_channels, height, width);
framework
::
DDim
dims_out_new
=
framework
::
make_ddim
(
framework
::
DDim
dims_out_new
=
framework
::
make_ddim
(
{
rois_num
,
(
param
.
output_
)
->
dims
()[
1
],
(((
param
.
output_
)
->
dims
()[
2
])),
{
rois_num
,
(
param
.
output_
)
->
dims
()[
1
],
(((
param
.
output_
)
->
dims
()[
2
])),
(
param
.
output_
)
->
dims
()[
3
]});
(
param
.
output_
)
->
dims
()[
3
]});
(
param
.
output_
)
->
Resize
(
dims_out_new
);
(
param
.
output_
)
->
Resize
(
dims_out_new
);
float
*
input_data
=
data_nhwc
;
// in->data<float>();
const
float
*
input_data
=
data_nhwc
;
// in->data<float>();
// shared_ptr<float> input_data(data_nhwc);
framework
::
Tensor
rois_batch_id_list
;
framework
::
Tensor
rois_batch_id_list
;
rois_batch_id_list
.
Resize
({
rois_num
});
rois_batch_id_list
.
Resize
({
rois_num
});
auto
rois_batch_id_data
=
rois_batch_id_list
.
mutable_data
<
int
>
();
auto
rois_batch_id_data
=
rois_batch_id_list
.
mutable_data
<
int
>
();
...
@@ -203,18 +234,19 @@ void PSRoiPoolKernel<FPGA, float>::Compute(const PSRoiPoolParam<FPGA>& param) {
...
@@ -203,18 +234,19 @@ void PSRoiPoolKernel<FPGA, float>::Compute(const PSRoiPoolParam<FPGA>& param) {
"output_channels x pooled_height x pooled_width"
);
"output_channels x pooled_height x pooled_width"
);
// calculate batch id index for each roi according to LoD
// calculate batch id index for each roi according to LoD
//
for (int n = 0; n < rois_batch_size; ++n) {
for
(
int
n
=
0
;
n
<
rois_batch_size
;
++
n
)
{
//
for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) {
for
(
size_t
i
=
rois_lod
[
n
];
i
<
rois_lod
[
n
+
1
];
++
i
)
{
//
rois_batch_id_data[i] = n;
rois_batch_id_data
[
i
]
=
n
;
//
}
}
//
}
}
auto
output_data
=
out
->
mutable_data
<
float
>
();
auto
output_data
=
out
->
mutable_data
<
float
>
();
auto
input_rois
=
rois
->
data
<
float
>
();
auto
input_rois
=
rois
->
data
<
float
>
();
// calculate psroipooling, parallel processing can be implemented per ROI
for
(
int
n
=
0
;
n
<
rois_num
;
++
n
)
{
for
(
int
n
=
0
;
n
<
rois_num
;
++
n
)
{
// [start, end) interval for spatial sampling
auto
offset_input_rois
=
input_rois
+
n
*
4
;
auto
offset_input_rois
=
input_rois
+
n
*
4
;
auto
offset_output_data
=
output_data
+
pooled_height
*
pooled_width
*
output_channels
*
n
;
auto
roi_start_w
=
auto
roi_start_w
=
static_cast
<
float
>
(
round
(
offset_input_rois
[
0
]))
*
spatial_scale
;
static_cast
<
float
>
(
round
(
offset_input_rois
[
0
]))
*
spatial_scale
;
auto
roi_start_h
=
auto
roi_start_h
=
...
@@ -232,27 +264,18 @@ void PSRoiPoolKernel<FPGA, float>::Compute(const PSRoiPoolParam<FPGA>& param) {
...
@@ -232,27 +264,18 @@ void PSRoiPoolKernel<FPGA, float>::Compute(const PSRoiPoolParam<FPGA>& param) {
auto
bin_size_h
=
roi_height
/
static_cast
<
float
>
(
pooled_height
);
auto
bin_size_h
=
roi_height
/
static_cast
<
float
>
(
pooled_height
);
auto
bin_size_w
=
roi_width
/
static_cast
<
float
>
(
pooled_width
);
auto
bin_size_w
=
roi_width
/
static_cast
<
float
>
(
pooled_width
);
int
roi_batch_ind
=
0
;
// rois_batch_id_data[n];
int
roi_batch_ind
=
rois_batch_id_data
[
n
];
// std::cout << "roi_batch_ind: " << roi_batch_ind << std::endl;
for
(
int
c
=
0
;
c
<
output_channels
;
++
c
)
{
for
(
int
ph
=
0
;
ph
<
pooled_height
;
ph
++
)
{
for
(
int
ph
=
0
;
ph
<
pooled_height
;
ph
++
)
{
for
(
int
pw
=
0
;
pw
<
pooled_width
;
pw
++
)
{
int
index
=
pooled_width
;
PSROIPoolingForward
<
float
>
(
input_data
,
height
,
width
,
input_channels
,
int
nid
=
n
*
output_channels
*
pooled_height
*
pooled_width
+
offset_output_data
,
pooled_height
,
c
*
pooled_width
*
pooled_height
+
ph
*
pooled_width
;
pooled_width
,
output_channels
,
input_rois
,
for
(
int
idx
=
0
;
idx
<
index
;
idx
++
)
{
bin_size_h
,
bin_size_w
,
roi_start_h
,
PSROIPooling
<
float
>
(
input_data
,
input_channels
,
height
,
width
,
roi_start_w
,
pw
,
ph
,
roi_batch_ind
);
pooled_height
,
pooled_width
,
input_rois
,
output_channels
,
pooled_height
,
output_data
,
idx
,
nid
,
bin_size_h
,
bin_size_w
,
roi_start_h
,
roi_start_w
,
c
,
ph
,
roi_batch_ind
);
}
}
}
}
}
}
}
fpga
::
fpga_free
(
input_data
);
fpga
::
image
::
convert_to_hwc
(
&
output_data
,
output_channels
,
pooled_height
,
pooled_width
,
rois_num
);
out
->
reset_data_ptr
(
output_data
);
}
}
}
// namespace operators
}
// namespace operators
...
...
test/fpga/test_marker.cpp
浏览文件 @
4eb5680a
...
@@ -12,17 +12,29 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
...
@@ -12,17 +12,29 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
See the License for the specific language governing permissions and
limitations under the License. */
limitations under the License. */
#include <iostream>
#ifndef PADDLE_MOBILE_FPGA
#define PADDLE_MOBILE_FPGA
#endif
#include "../test_helper.h"
#include "../test_helper.h"
#include "../test_include.h"
#include "../test_include.h"
#ifdef PADDLE_MOBILE_FPGA_V1
#ifdef PADDLE_MOBILE_FPGA_V1
#include "fpga/V1/api.h"
#include "fpga/V1/api.h"
#endif
#endif
#ifdef PADDLE_MOBILE_FPGA_V2
#ifdef PADDLE_MOBILE_FPGA_V2
#include "fpga/V2/api.h"
#include "fpga/V2/api.h"
#endif
#endif
#include <string>
#include <fstream>
#include <iostream>
#include "../../src/io/paddle_inference_api.h"
using
namespace
paddle_mobile
;
// NOLINT
using
namespace
paddle_mobile
::
fpga
;
// NOLINT
static
const
char
*
g_image
=
"../models/marker/marker1/image.bin"
;
static
const
char
*
g_model
=
"../models/marker/marker1/model"
;
static
const
char
*
g_param
=
"../models/marker/marker1/params"
;
void
readStream
(
std
::
string
filename
,
char
*
buf
)
{
void
readStream
(
std
::
string
filename
,
char
*
buf
)
{
std
::
ifstream
in
;
std
::
ifstream
in
;
...
@@ -36,132 +48,78 @@ void readStream(std::string filename, char *buf) {
...
@@ -36,132 +48,78 @@ void readStream(std::string filename, char *buf) {
auto
length
=
in
.
tellg
();
// report location (this is the length)
auto
length
=
in
.
tellg
();
// report location (this is the length)
in
.
seekg
(
0
,
std
::
ios
::
beg
);
// go back to the beginning
in
.
seekg
(
0
,
std
::
ios
::
beg
);
// go back to the beginning
in
.
read
(
buf
,
length
);
in
.
read
(
buf
,
length
);
DLOG
<<
length
;
in
.
close
();
in
.
close
();
}
}
void
convert_to_chw
(
int16_t
**
data_in
,
int
channel
,
int
height
,
int
width
,
PaddleMobileConfig
GetConfig
()
{
int
num
,
int16_t
*
data_tmp
)
{
PaddleMobileConfig
config
;
int64_t
amount_per_side
=
width
*
height
;
config
.
precision
=
PaddleMobileConfig
::
FP32
;
for
(
int
n
=
0
;
n
<
num
;
n
++
)
{
config
.
device
=
PaddleMobileConfig
::
kFPGA
;
for
(
int
h
=
0
;
h
<
height
;
h
++
)
{
config
.
prog_file
=
g_model
;
for
(
int
w
=
0
;
w
<
width
;
w
++
)
{
config
.
param_file
=
g_param
;
for
(
int
c
=
0
;
c
<
channel
;
c
++
)
{
config
.
thread_num
=
1
;
*
(
data_tmp
+
n
*
amount_per_side
*
channel
+
c
*
amount_per_side
+
config
.
batch_size
=
1
;
width
*
h
+
w
)
=
*
((
*
data_in
)
++
);
config
.
optimize
=
true
;
}
config
.
lod_mode
=
true
;
}
config
.
quantification
=
false
;
}
return
config
;
}
}
}
void
dump_stride_half
(
std
::
string
filename
,
Tensor
input_tensor
,
int
main
()
{
const
int
dumpnum
,
bool
use_chw
)
{
open_device
();
// bool use_chw = true;
if
(
input_tensor
.
dims
().
size
()
!=
4
)
return
;
PaddleMobileConfig
config
=
GetConfig
();
int
c
=
(
input_tensor
.
dims
())[
1
];
auto
predictor
=
int
h
=
(
input_tensor
.
dims
())[
2
];
CreatePaddlePredictor
<
PaddleMobileConfig
,
int
w
=
(
input_tensor
.
dims
())[
3
];
PaddleEngineKind
::
kPaddleMobile
>
(
config
);
int
n
=
(
input_tensor
.
dims
())[
0
];
auto
data_ptr
=
input_tensor
.
get_data
();
std
::
cout
<<
"Finishing loading model"
<<
std
::
endl
;
auto
*
data_ptr_16
=
reinterpret_cast
<
half
*>
(
data_ptr
);
auto
data_tmp
=
data_ptr_16
;
float
img_info
[
3
]
=
{
432
,
1280
,
1.0
f
};
if
(
use_chw
)
{
int
img_length
=
432
*
1280
*
3
;
data_tmp
=
auto
img
=
reinterpret_cast
<
float
*>
(
fpga_malloc
(
img_length
*
sizeof
(
float
)));
reinterpret_cast
<
half
*>
(
malloc
(
n
*
c
*
h
*
w
*
sizeof
(
int16_t
)));
readStream
(
g_image
,
reinterpret_cast
<
char
*>
(
img
));
convert_to_chw
(
&
data_ptr_16
,
c
,
h
,
w
,
n
,
data_tmp
);
}
std
::
cout
<<
"Finishing initializing data"
<<
std
::
endl
;
std
::
ofstream
out
(
filename
.
c_str
());
struct
PaddleTensor
t_img_info
,
t_img
;
float
result
=
0
;
t_img
.
dtypeid
=
typeid
(
float
);
int
stride
=
input_tensor
.
numel
()
/
dumpnum
;
t_img_info
.
layout
=
LAYOUT_HWC
;
stride
=
stride
>
0
?
stride
:
1
;
t_img_info
.
shape
=
std
::
vector
<
int
>
({
1
,
3
});
for
(
int
i
=
0
;
i
<
input_tensor
.
numel
();
i
+=
stride
)
{
t_img_info
.
name
=
"Image information"
;
result
=
paddle_mobile
::
fpga
::
fp16_2_fp32
(
data_tmp
[
i
]);
t_img_info
.
data
.
Reset
(
img_info
,
3
*
sizeof
(
float
));
out
<<
result
<<
std
::
endl
;
}
t_img
.
dtypeid
=
typeid
(
float
);
out
.
close
();
t_img
.
layout
=
LAYOUT_HWC
;
if
(
data_tmp
!=
data_ptr_16
)
{
t_img
.
shape
=
std
::
vector
<
int
>
({
1
,
432
,
1280
,
3
});
free
(
data_tmp
);
t_img
.
name
=
"Image information"
;
t_img
.
data
.
Reset
(
img
,
img_length
*
sizeof
(
float
));
predictor
->
FeedPaddleTensors
({
t_img_info
,
t_img
});
std
::
cout
<<
"Finishing feeding data "
<<
std
::
endl
;
predictor
->
Predict_From_To
(
0
,
-
1
);
std
::
cout
<<
"Finishing predicting "
<<
std
::
endl
;
std
::
vector
<
PaddleTensor
>
v
;
// No need to initialize v
predictor
->
FetchPaddleTensors
(
&
v
);
// Old data in v will be cleared
for
(
int
i
=
0
;
i
<
v
.
size
();
++
i
)
{
auto
p
=
reinterpret_cast
<
float
*>
(
v
[
i
].
data
.
data
());
int
len
=
v
[
i
].
data
.
length
();
float
result
=
0.0
f
;
std
::
string
str
=
"fetch"
+
std
::
to_string
(
i
);
fpga
::
savefile
<
float
>
(
str
,
p
,
len
,
result
);
}
}
}
void
dump_stride_float
(
std
::
string
filename
,
Tensor
input_tensor
,
std
::
cout
<<
"Finish getting vector values"
<<
std
::
endl
;
const
int
dumpnum
)
{
auto
data_ptr
=
reinterpret_cast
<
float
*>
(
input_tensor
.
get_data
());
std
::
ofstream
out
(
filename
.
c_str
());
float
result
=
0
;
int
stride
=
input_tensor
.
numel
()
/
dumpnum
;
stride
=
stride
>
0
?
stride
:
1
;
for
(
int
i
=
0
;
i
<
input_tensor
.
numel
();
i
+=
stride
)
{
result
=
data_ptr
[
i
];
out
<<
result
<<
std
::
endl
;
}
out
.
close
();
}
void
dump_stride
(
std
::
string
filename
,
Tensor
input_tensor
,
const
int
dumpnum
,
////////////////////////////////////////////////////
bool
use_chw
)
{
static
int
i
=
0
;
if
(
input_tensor
.
numel
()
==
0
)
{
return
;
}
if
(
input_tensor
.
type
()
==
typeid
(
float
))
{
DLOG
<<
"op: "
<<
i
++
<<
", float data "
<<
input_tensor
.
numel
();
dump_stride_float
(
filename
,
input_tensor
,
dumpnum
);
}
else
{
DLOG
<<
"op: "
<<
i
++
<<
", half data "
<<
input_tensor
.
numel
();
dump_stride_half
(
filename
,
input_tensor
,
dumpnum
,
use_chw
);
}
DLOG
<<
"dump input address: "
<<
input_tensor
.
get_data
();
}
static
const
char
*
g_marker_combine
=
"../models/marker/model"
;
// PaddleTensor tensor;
static
const
char
*
g_image_src_float
=
"../models/marker/model/input_0.bin"
;
// predictor->GetPaddleTensor("fetch2", &tensor);
int
main
()
{
// for (int i = 0; i < post_nms; i++) {
paddle_mobile
::
fpga
::
open_device
();
// auto p = reinterpret_cast<float *>(tensor.data.data());
paddle_mobile
::
PaddleMobile
<
paddle_mobile
::
FPGA
>
paddle_mobile
;
// std::cout << p[+i] << std::endl;
// }
// if (paddle_mobile.Load(std::string(g_rfcn_combine) + "/model",
// std::string(g_rfcn_combine) + "/params", true, false,
// 1, true)) {
if
(
paddle_mobile
.
Load
(
std
::
string
(
g_marker_combine
),
true
))
{
float
img_info
[
3
]
=
{
720
,
1280
,
800.0
f
/
960.0
f
};
auto
img
=
reinterpret_cast
<
float
*>
(
fpga
::
fpga_malloc
(
720
*
1280
*
3
*
sizeof
(
float
)));
readStream
(
g_image_src_float
,
reinterpret_cast
<
char
*>
(
img
));
std
::
vector
<
void
*>
v
(
3
,
nullptr
);
paddle_mobile
.
FeedData
({
img
});
paddle_mobile
.
Predict_To
(
-
1
);
for
(
int
i
=
47
;
i
<
52
;
i
++
)
{
auto
tensor_ptr
=
paddle_mobile
.
FetchResult
(
i
);
std
::
string
saveName
=
"marker_"
+
std
::
to_string
(
i
);
// if(i != 58)
paddle_mobile
::
fpga
::
fpga_invalidate
((
*
tensor_ptr
).
get_data
(),
tensor_ptr
->
numel
()
*
sizeof
(
float
));
// tensor_ptr->numel() * sizeof(float));
dump_stride
(
saveName
,
(
*
tensor_ptr
),
tensor_ptr
->
numel
(),
true
);
// 20);//tensor_ptr->numel());
/* float result = 0;
std::string str = "softmax_input_data";
float* data =
static_cast<float*>(fpga::fpga_malloc(tensor_ptr->numel() *
sizeof(float))); str = "softmax_output_data"; auto output_ptr =
static_cast<half*>((*tensor_ptr).get_data()); for (int idx = 0; idx <
tensor_ptr->numel(); ++idx)
{
data[idx] = fpga::fp16_2_fp32(output_ptr[idx]);
}
fpga::savefile<float>(str,data, tensor_ptr->numel(), result ); */
}
// paddle_mobile.GetResults(&v);
DLOG
<<
"Computation done"
;
fpga
::
fpga_free
(
img
);
}
return
0
;
return
0
;
}
}
test/fpga/test_marker_api.cpp
浏览文件 @
4eb5680a
...
@@ -15,12 +15,15 @@ limitations under the License. */
...
@@ -15,12 +15,15 @@ limitations under the License. */
#ifndef PADDLE_MOBILE_FPGA
#ifndef PADDLE_MOBILE_FPGA
#define PADDLE_MOBILE_FPGA
#define PADDLE_MOBILE_FPGA
#endif
#endif
#include <sys/time.h>
#include <time.h>
#include <fstream>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <iostream>
#include "../../src/io/paddle_inference_api.h"
#include "../../src/io/paddle_inference_api.h"
using
namespace
paddle_mobile
;
using
namespace
paddle_mobile
;
// NOLINT
using
namespace
paddle_mobile
::
fpga
;
using
namespace
paddle_mobile
::
fpga
;
// NOLINT
static
const
char
*
g_image
=
"../models/marker/model/image.bin"
;
static
const
char
*
g_image
=
"../models/marker/model/image.bin"
;
static
const
char
*
g_model
=
"../models/marker/model/model"
;
static
const
char
*
g_model
=
"../models/marker/model/model"
;
...
@@ -136,44 +139,6 @@ PaddleMobileConfig GetConfig1() {
...
@@ -136,44 +139,6 @@ PaddleMobileConfig GetConfig1() {
int
main
()
{
int
main
()
{
open_device
();
open_device
();
PaddleMobileConfig
config1
=
GetConfig1
();
auto
predictor1
=
CreatePaddlePredictor
<
PaddleMobileConfig
,
PaddleEngineKind
::
kPaddleMobile
>
(
config1
);
std
::
cout
<<
"Finishing loading model"
<<
std
::
endl
;
for
(
int
i
=
0
;
i
<
1
;
++
i
)
{
int
img_length1
=
144
*
14
*
14
;
auto
img1
=
reinterpret_cast
<
float
*>
(
fpga_malloc
(
img_length1
*
sizeof
(
float
)));
readStream
(
g_image1
,
reinterpret_cast
<
char
*>
(
img1
));
std
::
cout
<<
"Finishing initializing data"
<<
std
::
endl
;
struct
PaddleTensor
t_img1
;
t_img1
.
dtypeid
=
typeid
(
float
);
t_img1
.
layout
=
LAYOUT_HWC
;
t_img1
.
shape
=
std
::
vector
<
int
>
({
1
,
14
,
14
,
144
});
t_img1
.
name
=
"Image information"
;
t_img1
.
data
.
Reset
(
img1
,
img_length1
*
sizeof
(
float
));
predictor1
->
FeedPaddleTensors
({
t_img1
});
std
::
cout
<<
"Finishing feeding data "
<<
std
::
endl
;
predictor1
->
Predict_From_To
(
0
,
-
1
);
std
::
cout
<<
"Finishing predicting "
<<
std
::
endl
;
std
::
vector
<
paddle_mobile
::
PaddleTensor
>
v1
;
// No need to initialize v
predictor1
->
FetchPaddleTensors
(
&
v1
);
// Old data in v will be cleared
std
::
cout
<<
"Output number is "
<<
v1
.
size
()
<<
std
::
endl
;
for
(
int
fetchNum
=
0
;
fetchNum
<
v1
.
size
();
fetchNum
++
)
{
std
::
string
dumpName
=
"marker2_api_fetch_"
+
std
::
to_string
(
fetchNum
);
dump_stride
(
dumpName
,
v1
[
fetchNum
]);
}
}
/////////////////////////////////////
PaddleMobileConfig
config
=
GetConfig
();
PaddleMobileConfig
config
=
GetConfig
();
auto
predictor
=
auto
predictor
=
CreatePaddlePredictor
<
PaddleMobileConfig
,
CreatePaddlePredictor
<
PaddleMobileConfig
,
...
@@ -207,7 +172,16 @@ int main() {
...
@@ -207,7 +172,16 @@ int main() {
std
::
cout
<<
"Finishing feeding data "
<<
std
::
endl
;
std
::
cout
<<
"Finishing feeding data "
<<
std
::
endl
;
timeval
start11
,
end11
;
long
dif_sec
,
dif_usec
;
// NOLINT
gettimeofday
(
&
start11
,
NULL
);
predictor
->
Predict_From_To
(
0
,
-
1
);
predictor
->
Predict_From_To
(
0
,
-
1
);
gettimeofday
(
&
end11
,
NULL
);
dif_sec
=
end11
.
tv_sec
-
start11
.
tv_sec
;
dif_usec
=
end11
.
tv_usec
-
start11
.
tv_usec
;
std
::
cout
<<
"marker1 total"
<<
" cost time: "
<<
(
dif_sec
*
1000000
+
dif_usec
)
<<
" us"
<<
std
::
endl
;
std
::
cout
<<
"Finishing predicting "
<<
std
::
endl
;
std
::
cout
<<
"Finishing predicting "
<<
std
::
endl
;
std
::
vector
<
paddle_mobile
::
PaddleTensor
>
v
;
// No need to initialize v
std
::
vector
<
paddle_mobile
::
PaddleTensor
>
v
;
// No need to initialize v
...
@@ -217,5 +191,48 @@ int main() {
...
@@ -217,5 +191,48 @@ int main() {
std
::
string
dumpName
=
"marker_api_fetch_"
+
std
::
to_string
(
fetchNum
);
std
::
string
dumpName
=
"marker_api_fetch_"
+
std
::
to_string
(
fetchNum
);
dump_stride
(
dumpName
,
v
[
fetchNum
]);
dump_stride
(
dumpName
,
v
[
fetchNum
]);
}
}
PaddleMobileConfig
config1
=
GetConfig1
();
auto
predictor1
=
CreatePaddlePredictor
<
PaddleMobileConfig
,
PaddleEngineKind
::
kPaddleMobile
>
(
config1
);
std
::
cout
<<
"Finishing loading model"
<<
std
::
endl
;
for
(
int
i
=
0
;
i
<
1
;
++
i
)
{
int
img_length1
=
144
*
14
*
14
;
auto
img1
=
reinterpret_cast
<
float
*>
(
fpga_malloc
(
img_length1
*
sizeof
(
float
)));
readStream
(
g_image1
,
reinterpret_cast
<
char
*>
(
img1
));
std
::
cout
<<
"Finishing initializing data"
<<
std
::
endl
;
struct
PaddleTensor
t_img1
;
t_img1
.
dtypeid
=
typeid
(
float
);
t_img1
.
layout
=
LAYOUT_HWC
;
t_img1
.
shape
=
std
::
vector
<
int
>
({
1
,
14
,
14
,
144
});
t_img1
.
name
=
"Image information"
;
t_img1
.
data
.
Reset
(
img1
,
img_length1
*
sizeof
(
float
));
predictor1
->
FeedPaddleTensors
({
t_img1
});
std
::
cout
<<
"Finishing feeding data "
<<
std
::
endl
;
gettimeofday
(
&
start11
,
NULL
);
predictor1
->
Predict_From_To
(
0
,
-
1
);
gettimeofday
(
&
end11
,
NULL
);
dif_sec
=
end11
.
tv_sec
-
start11
.
tv_sec
;
dif_usec
=
end11
.
tv_usec
-
start11
.
tv_usec
;
std
::
cout
<<
"marker2 total"
<<
" cost time: "
<<
(
dif_sec
*
1000000
+
dif_usec
)
<<
" us"
<<
std
::
endl
;
std
::
cout
<<
"Finishing predicting "
<<
std
::
endl
;
std
::
vector
<
paddle_mobile
::
PaddleTensor
>
v1
;
// No need to initialize v
predictor1
->
FetchPaddleTensors
(
&
v1
);
// Old data in v will be cleared
std
::
cout
<<
"Output number is "
<<
v1
.
size
()
<<
std
::
endl
;
for
(
int
fetchNum
=
0
;
fetchNum
<
v1
.
size
();
fetchNum
++
)
{
std
::
string
dumpName
=
"marker2_api_fetch_"
+
std
::
to_string
(
fetchNum
);
dump_stride
(
dumpName
,
v1
[
fetchNum
]);
}
}
return
0
;
return
0
;
}
}
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录