Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
Paddle-Lite
提交
00aa83e5
P
Paddle-Lite
项目概览
PaddlePaddle
/
Paddle-Lite
通知
331
Star
4
Fork
1
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
271
列表
看板
标记
里程碑
合并请求
78
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle-Lite
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
271
Issue
271
列表
看板
标记
里程碑
合并请求
78
合并请求
78
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
00aa83e5
编写于
5月 20, 2020
作者:
C
chonwhite
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
modified nms
上级
f28625a4
变更
13
隐藏空白更改
内联
并排
Showing
13 changed file
with
271 addition
and
69 deletion
+271
-69
lite/backends/fpga/KD/debugger.hpp
lite/backends/fpga/KD/debugger.hpp
+19
-18
lite/backends/fpga/KD/llapi/zynqmp_api.cpp
lite/backends/fpga/KD/llapi/zynqmp_api.cpp
+1
-2
lite/backends/fpga/KD/pes/prior_box_pe.cpp
lite/backends/fpga/KD/pes/prior_box_pe.cpp
+7
-3
lite/backends/fpga/KD/tensor.hpp
lite/backends/fpga/KD/tensor.hpp
+8
-2
lite/backends/fpga/lite_tensor.h
lite/backends/fpga/lite_tensor.h
+26
-12
lite/core/mir/fusion/conv_activation_fuse_pass.cc
lite/core/mir/fusion/conv_activation_fuse_pass.cc
+1
-1
lite/kernels/fpga/CMakeLists.txt
lite/kernels/fpga/CMakeLists.txt
+4
-3
lite/kernels/fpga/io_copy_compute.cc
lite/kernels/fpga/io_copy_compute.cc
+24
-13
lite/kernels/fpga/multiclass_nms_compute.cc
lite/kernels/fpga/multiclass_nms_compute.cc
+33
-14
lite/kernels/fpga/prior_box_compute.cc
lite/kernels/fpga/prior_box_compute.cc
+24
-0
lite/kernels/fpga/split_compute.cc
lite/kernels/fpga/split_compute.cc
+70
-0
lite/kernels/fpga/split_compute.h
lite/kernels/fpga/split_compute.h
+43
-0
lite/kernels/fpga/transpose_compute.cc
lite/kernels/fpga/transpose_compute.cc
+11
-1
未找到文件。
lite/backends/fpga/KD/debugger.hpp
浏览文件 @
00aa83e5
...
@@ -56,24 +56,25 @@ class Debugger {
...
@@ -56,24 +56,25 @@ class Debugger {
std
::
unordered_map
<
std
::
string
,
bool
>
op_config
;
std
::
unordered_map
<
std
::
string
,
bool
>
op_config
;
std
::
unordered_map
<
std
::
string
,
float
>
tick_tock_map
;
std
::
unordered_map
<
std
::
string
,
float
>
tick_tock_map
;
Debugger
()
{
Debugger
()
{
// op_config["concat"] = true;
op_config
[
"concat"
]
=
true
;
// op_config["pooling"] = true;
op_config
[
"pooling"
]
=
true
;
// op_config["conv"] = true;
op_config
[
"conv"
]
=
true
;
// op_config["dropout"] = true;
op_config
[
"dropout"
]
=
true
;
// op_config["dwconv"] = true;
op_config
[
"dwconv"
]
=
true
;
// op_config["ew_add"] = true;
op_config
[
"ew_add"
]
=
true
;
// op_config["ew_mul"] = true;
op_config
[
"ew_mul"
]
=
true
;
// op_config["crop"] = true;
op_config
[
"crop"
]
=
true
;
// op_config["feed"] = true;
op_config
[
"feed"
]
=
true
;
// op_config["fc"] = true;
op_config
[
"fetch"
]
=
true
;
// op_config["mul"] = true;
op_config
[
"fc"
]
=
true
;
// op_config["fetch"] = true;
op_config
[
"mul"
]
=
true
;
// op_config["boxes"] = true;
op_config
[
"boxes"
]
=
true
;
// op_config["scores"] = true;
op_config
[
"scores"
]
=
true
;
// op_config["nms"] = true;
op_config
[
"nms"
]
=
true
;
// op_config["pb_boxes"] = true;
op_config
[
"pb_boxes"
]
=
true
;
// op_config["pb_variances"] = true;
op_config
[
"pb_variances"
]
=
true
;
// op_config["softmax"] = true;
op_config
[
"softmax"
]
=
true
;
op_config
[
"split"
]
=
true
;
}
}
};
};
...
...
lite/backends/fpga/KD/llapi/zynqmp_api.cpp
浏览文件 @
00aa83e5
...
@@ -61,7 +61,6 @@ void reset_device() {
...
@@ -61,7 +61,6 @@ void reset_device() {
// memory management;
// memory management;
void
*
fpga_malloc
(
size_t
size
)
{
void
*
fpga_malloc
(
size_t
size
)
{
#ifdef PADDLE_MOBILE_OS_LINUX
#ifdef PADDLE_MOBILE_OS_LINUX
void
*
ptr
=
reinterpret_cast
<
void
*>
(
void
*
ptr
=
reinterpret_cast
<
void
*>
(
...
@@ -205,7 +204,7 @@ int get_device_info(const struct DeviceInfo &args) {
...
@@ -205,7 +204,7 @@ int get_device_info(const struct DeviceInfo &args) {
int
perform_bypass
(
const
struct
BypassArgs
&
args
)
{
int
perform_bypass
(
const
struct
BypassArgs
&
args
)
{
int
ret
=
-
1
;
int
ret
=
-
1
;
int
size
=
args
.
image
.
channels
*
args
.
image
.
width
*
args
.
image
.
height
;
int
size
=
args
.
image
.
channels
*
args
.
image
.
width
*
args
.
image
.
height
;
int
max_size
=
1
<<
2
2
;
int
max_size
=
1
<<
2
0
;
float
times
=
1.0
*
size
/
max_size
;
float
times
=
1.0
*
size
/
max_size
;
int
count
=
static_cast
<
int
>
(
times
);
int
count
=
static_cast
<
int
>
(
times
);
...
...
lite/backends/fpga/KD/pes/prior_box_pe.cpp
浏览文件 @
00aa83e5
...
@@ -241,10 +241,13 @@ void PriorBoxPE::compute_prior_box() {
...
@@ -241,10 +241,13 @@ void PriorBoxPE::compute_prior_box() {
}
}
boxes
.
flush
();
boxes
.
flush
();
boxes
.
syncToCPU
();
//
boxes.syncToCPU();
variances
.
flush
();
variances
.
flush
();
output_boxes
->
copyFrom
(
&
boxes
);
output_boxes
->
copyFrom
(
&
boxes
);
output_variances
->
copyFrom
(
&
variances
);
output_variances
->
copyFrom
(
&
variances
);
output_boxes
->
invalidate
();
output_variances
->
invalidate
();
}
}
void
PriorBoxPE
::
apply
()
{}
void
PriorBoxPE
::
apply
()
{}
...
@@ -253,8 +256,9 @@ bool PriorBoxPE::dispatch() {
...
@@ -253,8 +256,9 @@ bool PriorBoxPE::dispatch() {
if
(
cachedBoxes_
==
nullptr
)
{
if
(
cachedBoxes_
==
nullptr
)
{
cachedBoxes_
=
new
Tensor
();
cachedBoxes_
=
new
Tensor
();
cachedVariances_
=
new
Tensor
();
cachedVariances_
=
new
Tensor
();
cachedBoxes_
->
mutableData
<
float
>
(
FP32
,
param_
.
outputBoxes
->
shape
());
cachedBoxes_
->
mutableData
<
float16
>
(
FP16
,
param_
.
outputBoxes
->
shape
());
cachedVariances_
->
mutableData
<
float
>
(
FP32
,
param_
.
outputVariances
->
shape
());
cachedVariances_
->
mutableData
<
float16
>
(
FP16
,
param_
.
outputVariances
->
shape
());
cachedBoxes_
->
setDataLocation
(
CPU
);
cachedBoxes_
->
setDataLocation
(
CPU
);
cachedVariances_
->
setDataLocation
(
CPU
);
cachedVariances_
->
setDataLocation
(
CPU
);
compute_prior_box
();
compute_prior_box
();
...
...
lite/backends/fpga/KD/tensor.hpp
浏览文件 @
00aa83e5
...
@@ -389,11 +389,17 @@ class Tensor {
...
@@ -389,11 +389,17 @@ class Tensor {
float
value
=
0
;
float
value
=
0
;
if
(
dataType_
==
FP32
)
{
if
(
dataType_
==
FP32
)
{
value
=
data
<
float
>
()[
i
];
value
=
data
<
float
>
()[
i
];
}
else
if
(
dataType_
==
FP16
)
{
}
if
(
dataType_
==
FP16
)
{
value
=
half_to_float
(
data
<
float16
>
()[
i
]);
value
=
half_to_float
(
data
<
float16
>
()[
i
]);
}
else
{
}
if
(
dataType_
==
INT8
)
{
value
=
data
<
int8_t
>
()[
i
];
value
=
data
<
int8_t
>
()[
i
];
}
}
if
(
dataType_
==
INT32
)
{
value
=
data
<
int32_t
>
()[
i
];
}
ofs
<<
value
<<
std
::
endl
;
ofs
<<
value
<<
std
::
endl
;
}
}
ofs
.
close
();
ofs
.
close
();
...
...
lite/backends/fpga/lite_tensor.h
浏览文件 @
00aa83e5
...
@@ -81,8 +81,7 @@ class DDimLite {
...
@@ -81,8 +81,7 @@ class DDimLite {
return
!
(
a
==
b
);
return
!
(
a
==
b
);
}
}
~
DDimLite
()
{
~
DDimLite
()
{}
}
private:
private:
std
::
vector
<
value_type
>
data_
;
std
::
vector
<
value_type
>
data_
;
...
@@ -112,9 +111,7 @@ class TensorLite {
...
@@ -112,9 +111,7 @@ class TensorLite {
return
zynq_tensor_
->
data
<
R
>
()
+
offset_
;
return
zynq_tensor_
->
data
<
R
>
()
+
offset_
;
}
}
void
Resize
(
const
DDimLite
&
ddim
)
{
void
Resize
(
const
DDimLite
&
ddim
)
{
dims_
=
ddim
;
}
dims_
=
ddim
;
}
void
Resize
(
const
std
::
vector
<
int64_t
>
&
x
)
{
dims_
=
DDimLite
(
x
);
}
void
Resize
(
const
std
::
vector
<
int64_t
>
&
x
)
{
dims_
=
DDimLite
(
x
);
}
const
DDimLite
&
dims
()
const
{
return
dims_
;
}
const
DDimLite
&
dims
()
const
{
return
dims_
;
}
...
@@ -212,6 +209,28 @@ class TensorLite {
...
@@ -212,6 +209,28 @@ class TensorLite {
void
mutable_data_internal
();
void
mutable_data_internal
();
};
};
template
<
typename
T
>
zynqmp
::
DataType
get_date_type
()
{
zynqmp
::
DataType
data_type
=
zynqmp
::
FP32
;
if
(
typeid
(
T
)
==
typeid
(
float
))
{
data_type
=
zynqmp
::
FP32
;
}
if
(
typeid
(
T
)
==
typeid
(
zynqmp
::
float16
))
{
data_type
=
zynqmp
::
FP16
;
}
if
(
typeid
(
T
)
==
typeid
(
int
))
{
data_type
=
zynqmp
::
INT32
;
}
if
(
typeid
(
T
)
==
typeid
(
int32_t
))
{
data_type
=
zynqmp
::
INT32
;
}
if
(
typeid
(
T
)
==
typeid
(
int8_t
))
{
data_type
=
zynqmp
::
INT8
;
}
return
data_type
;
}
template
<
typename
T
,
typename
R
>
template
<
typename
T
,
typename
R
>
R
*
TensorLite
::
mutable_data
()
{
R
*
TensorLite
::
mutable_data
()
{
std
::
vector
<
int
>
v
;
std
::
vector
<
int
>
v
;
...
@@ -237,13 +256,8 @@ R *TensorLite::mutable_data() {
...
@@ -237,13 +256,8 @@ R *TensorLite::mutable_data() {
break
;
break
;
}
}
zynqmp
::
Shape
input_shape
(
layout_type
,
v
);
zynqmp
::
Shape
input_shape
(
layout_type
,
v
);
zynqmp
::
DataType
data_type
=
zynqmp
::
FP32
;
zynqmp
::
DataType
data_type
=
get_date_type
<
T
>
();
if
(
typeid
(
T
)
==
typeid
(
float
))
{
data_type
=
zynqmp
::
FP32
;
}
if
(
typeid
(
T
)
==
typeid
(
zynqmp
::
float16
))
{
data_type
=
zynqmp
::
FP16
;
}
if
(
zynq_tensor_
.
get
()
==
nullptr
)
{
if
(
zynq_tensor_
.
get
()
==
nullptr
)
{
zynq_tensor_
.
reset
(
new
zynqmp
::
Tensor
());
zynq_tensor_
.
reset
(
new
zynqmp
::
Tensor
());
}
}
...
...
lite/core/mir/fusion/conv_activation_fuse_pass.cc
浏览文件 @
00aa83e5
...
@@ -25,7 +25,7 @@ namespace mir {
...
@@ -25,7 +25,7 @@ namespace mir {
void
ConvActivationFusePass
::
Apply
(
const
std
::
unique_ptr
<
SSAGraph
>&
graph
)
{
void
ConvActivationFusePass
::
Apply
(
const
std
::
unique_ptr
<
SSAGraph
>&
graph
)
{
std
::
vector
<
std
::
string
>
act_types
{
"relu"
};
std
::
vector
<
std
::
string
>
act_types
{
"relu"
};
for
(
auto
&
place
:
graph
->
valid_places
())
{
for
(
auto
&
place
:
graph
->
valid_places
())
{
if
(
place
.
target
==
TARGET
(
kCUDA
))
{
if
(
place
.
target
==
TARGET
(
kCUDA
)
||
place
.
target
==
TARGET
(
kFPGA
)
)
{
act_types
.
push_back
(
"leaky_relu"
);
act_types
.
push_back
(
"leaky_relu"
);
break
;
break
;
}
}
...
...
lite/kernels/fpga/CMakeLists.txt
浏览文件 @
00aa83e5
...
@@ -8,7 +8,7 @@ set(fpga_deps fpga_target_wrapper kernel_fpga)
...
@@ -8,7 +8,7 @@ set(fpga_deps fpga_target_wrapper kernel_fpga)
add_kernel
(
activation_compute_fpga FPGA basic SRCS activation_compute.cc DEPS
${
fpga_deps
}
)
add_kernel
(
activation_compute_fpga FPGA basic SRCS activation_compute.cc DEPS
${
fpga_deps
}
)
# add_kernel(box_coder_compute_fpga FPGA basic SRCS box_coder_compute.cc DEPS ${fpga_deps})
# add_kernel(box_coder_compute_fpga FPGA basic SRCS box_coder_compute.cc DEPS ${fpga_deps})
#
add_kernel(concat_compute_fpga FPGA basic SRCS concat_compute.cc DEPS ${fpga_deps})
add_kernel
(
concat_compute_fpga FPGA basic SRCS concat_compute.cc DEPS
${
fpga_deps
}
)
add_kernel
(
conv_compute_fpga FPGA basic SRCS conv_compute.cc DEPS
${
fpga_deps
}
)
add_kernel
(
conv_compute_fpga FPGA basic SRCS conv_compute.cc DEPS
${
fpga_deps
}
)
# add_kernel(density_prior_box_compute_fpga FPGA basic SRCS density_prior_box_compute.cc DEPS ${fpga_deps})
# add_kernel(density_prior_box_compute_fpga FPGA basic SRCS density_prior_box_compute.cc DEPS ${fpga_deps})
...
@@ -28,8 +28,9 @@ add_kernel(prior_box_compute_fpga FPGA basic SRCS prior_box_compute.cc DEPS ${fp
...
@@ -28,8 +28,9 @@ add_kernel(prior_box_compute_fpga FPGA basic SRCS prior_box_compute.cc DEPS ${fp
add_kernel
(
reshape_compute_fpga FPGA basic SRCS reshape_compute.cc DEPS
${
fpga_deps
}
reshape_op
)
add_kernel
(
reshape_compute_fpga FPGA basic SRCS reshape_compute.cc DEPS
${
fpga_deps
}
reshape_op
)
# add_kernel(sequence_pool_compute_fpga FPGA basic SRCS sequence_pool_compute.cc DEPS ${fpga_deps})
# add_kernel(sequence_pool_compute_fpga FPGA basic SRCS sequence_pool_compute.cc DEPS ${fpga_deps})
add_kernel
(
scale_compute_fpga FPGA basic SRCS scale_compute.cc DEPS
${
fpga_deps
}
)
add_kernel
(
scale_compute_fpga FPGA basic SRCS scale_compute.cc DEPS
${
fpga_deps
}
)
# add_kernel(softmax_compute_fpga FPGA basic SRCS softmax_compute.cc DEPS ${fpga_deps})
add_kernel
(
softmax_compute_fpga FPGA basic SRCS softmax_compute.cc DEPS
${
fpga_deps
}
)
# add_kernel(transpose_compute_fpga FPGA basic SRCS transpose_compute.cc DEPS ${fpga_deps})
add_kernel
(
split_compute_fpga FPGA basic SRCS split_compute.cc DEPS
${
fpga_deps
}
)
add_kernel
(
transpose_compute_fpga FPGA basic SRCS transpose_compute.cc DEPS
${
fpga_deps
}
)
add_kernel
(
io_copy_compute_fpga FPGA basic SRCS io_copy_compute.cc DEPS
${
fpga_deps
}
)
add_kernel
(
io_copy_compute_fpga FPGA basic SRCS io_copy_compute.cc DEPS
${
fpga_deps
}
)
add_kernel
(
calib_compute_fpga FPGA basic SRCS calib_compute.cc DEPS
${
fpga_deps
}
)
add_kernel
(
calib_compute_fpga FPGA basic SRCS calib_compute.cc DEPS
${
fpga_deps
}
)
...
...
lite/kernels/fpga/io_copy_compute.cc
浏览文件 @
00aa83e5
...
@@ -45,21 +45,32 @@ class IoCopyHostToFpgaCompute
...
@@ -45,21 +45,32 @@ class IoCopyHostToFpgaCompute
auto
&
param
=
Param
<
operators
::
IoCopyParam
>
();
auto
&
param
=
Param
<
operators
::
IoCopyParam
>
();
CHECK
(
param
.
x
->
target
()
==
TARGET
(
kHost
)
||
CHECK
(
param
.
x
->
target
()
==
TARGET
(
kHost
)
||
param
.
x
->
target
()
==
TARGET
(
kFPGA
));
param
.
x
->
target
()
==
TARGET
(
kFPGA
));
param
.
y
->
mutable_data
<
float16
>
();
param
.
x
->
ZynqTensor
()
->
flush
();
if
(
param
.
x
->
ZynqTensor
()
->
aligned
()
&&
param
.
x
->
ZynqTensor
()
->
shape
().
shouldAlign
())
{
if
(
param
.
x
->
ZynqTensor
()
->
dataType
()
==
zynqmp
::
INT32
)
{
zynqmp
::
Tensor
tempTensor
;
param
.
y
->
mutable_data
<
int
>
();
tempTensor
.
mutableData
<
float16
>
(
zynqmp
::
FP16
,
param
.
x
->
ZynqTensor
()
->
shape
());
tempTensor
.
copyFrom
(
param
.
x
->
ZynqTensor
());
tempTensor
.
setAligned
(
true
);
tempTensor
.
unalignImage
();
param
.
y
->
ZynqTensor
()
->
copyFrom
(
&
tempTensor
);
}
else
{
param
.
y
->
ZynqTensor
()
->
copyFrom
(
param
.
x
->
ZynqTensor
());
param
.
y
->
ZynqTensor
()
->
copyFrom
(
param
.
x
->
ZynqTensor
());
return
;
}
}
param
.
y
->
ZynqTensor
()
->
invalidate
();
param
.
y
->
ZynqTensor
()
->
copyScaleFrom
(
param
.
x
->
ZynqTensor
());
if
(
param
.
x
->
ZynqTensor
()
->
dataType
()
==
zynqmp
::
FP32
)
{
param
.
y
->
mutable_data
<
float16
>
();
if
(
param
.
x
->
ZynqTensor
()
->
aligned
()
&&
param
.
x
->
ZynqTensor
()
->
shape
().
shouldAlign
())
{
zynqmp
::
Tensor
tempTensor
;
tempTensor
.
mutableData
<
float16
>
(
zynqmp
::
FP16
,
param
.
x
->
ZynqTensor
()
->
shape
());
tempTensor
.
copyFrom
(
param
.
x
->
ZynqTensor
());
tempTensor
.
setAligned
(
true
);
tempTensor
.
unalignImage
();
param
.
y
->
ZynqTensor
()
->
copyFrom
(
&
tempTensor
);
}
else
{
param
.
y
->
ZynqTensor
()
->
copyFrom
(
param
.
x
->
ZynqTensor
());
}
param
.
y
->
ZynqTensor
()
->
invalidate
();
param
.
y
->
ZynqTensor
()
->
copyScaleFrom
(
param
.
x
->
ZynqTensor
());
}
auto
out_lod
=
param
.
y
->
mutable_lod
();
auto
out_lod
=
param
.
y
->
mutable_lod
();
*
out_lod
=
param
.
x
->
lod
();
*
out_lod
=
param
.
x
->
lod
();
}
}
...
...
lite/kernels/fpga/multiclass_nms_compute.cc
浏览文件 @
00aa83e5
...
@@ -318,14 +318,29 @@ void MultiClassOutput(const Tensor& scores,
...
@@ -318,14 +318,29 @@ void MultiClassOutput(const Tensor& scores,
void
MulticlassNmsCompute
::
Run
()
{
void
MulticlassNmsCompute
::
Run
()
{
auto
&
param
=
Param
<
operators
::
MulticlassNmsParam
>
();
auto
&
param
=
Param
<
operators
::
MulticlassNmsParam
>
();
auto
*
boxes
=
param
.
bboxes
;
auto
*
boxes
_in
=
param
.
bboxes
;
auto
*
scores
=
param
.
scores
;
auto
*
scores
_in
=
param
.
scores
;
auto
*
outs
=
param
.
out
;
auto
*
outs
=
param
.
out
;
outs
->
mutable_data
<
float
>
();
outs
->
mutable_data
<
float
>
();
auto
score_dims
=
scores
->
dims
();
auto
score_dims
=
boxes_in
->
dims
();
auto
score_size
=
score_dims
.
size
();
auto
score_size
=
score_dims
.
size
();
Tensor
boxes_float
;
Tensor
scores_float
;
boxes_float
.
Resize
(
boxes_in
->
dims
());
scores_float
.
Resize
(
scores_in
->
dims
());
boxes_float
.
mutable_data
<
float
>
();
scores_float
.
mutable_data
<
float
>
();
boxes_float
.
ZynqTensor
()
->
copyFrom
(
boxes_in
->
ZynqTensor
());
scores_float
.
ZynqTensor
()
->
copyFrom
(
scores_in
->
ZynqTensor
());
Tensor
*
boxes
=
&
boxes_float
;
Tensor
*
scores
=
&
scores_float
;
auto
box_dims
=
boxes
->
dims
();
auto
box_dims
=
boxes
->
dims
();
int64_t
box_dim
=
boxes
->
dims
()[
2
];
int64_t
box_dim
=
boxes
->
dims
()[
2
];
...
@@ -383,6 +398,7 @@ void MulticlassNmsCompute::Run() {
...
@@ -383,6 +398,7 @@ void MulticlassNmsCompute::Run() {
MultiClassOutput
<
float
>
(
MultiClassOutput
<
float
>
(
scores_slice
,
boxes_slice
,
all_indices
[
i
],
score_dims
.
size
(),
&
out
);
scores_slice
,
boxes_slice
,
all_indices
[
i
],
score_dims
.
size
(),
&
out
);
outs
->
ZynqTensor
()
->
copyFrom
(
out
.
ZynqTensor
());
outs
->
ZynqTensor
()
->
copyFrom
(
out
.
ZynqTensor
());
out
.
ZynqTensor
()
->
saveToFile
(
"nms_oo"
,
true
);
}
}
outs
->
Resize
({
static_cast
<
int64_t
>
(
e
-
s
),
out_dim
});
outs
->
Resize
({
static_cast
<
int64_t
>
(
e
-
s
),
out_dim
});
}
}
...
@@ -402,16 +418,16 @@ void MulticlassNmsCompute::Run() {
...
@@ -402,16 +418,16 @@ void MulticlassNmsCompute::Run() {
}
// namespace lite
}
// namespace lite
}
// namespace paddle
}
// namespace paddle
REGISTER_LITE_KERNEL
(
multiclass_nms
,
//
REGISTER_LITE_KERNEL(multiclass_nms,
kFPGA
,
//
kFPGA,
kFP16
,
//
kFP16,
kNHWC
,
//
kNHWC,
paddle
::
lite
::
kernels
::
fpga
::
MulticlassNmsCompute
,
//
paddle::lite::kernels::fpga::MulticlassNmsCompute,
def
)
//
def)
.
BindInput
(
"BBoxes"
,
{
LiteType
::
GetTensorTy
(
TARGET
(
kHost
))})
//
.BindInput("BBoxes", {LiteType::GetTensorTy(TARGET(kHost))})
.
BindInput
(
"Scores"
,
{
LiteType
::
GetTensorTy
(
TARGET
(
kHost
))})
//
.BindInput("Scores", {LiteType::GetTensorTy(TARGET(kHost))})
.
BindOutput
(
"Out"
,
{
LiteType
::
GetTensorTy
(
TARGET
(
kHost
))})
//
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kHost))})
.
Finalize
();
//
.Finalize();
REGISTER_LITE_KERNEL
(
multiclass_nms
,
REGISTER_LITE_KERNEL
(
multiclass_nms
,
kFPGA
,
kFPGA
,
...
@@ -427,5 +443,8 @@ REGISTER_LITE_KERNEL(multiclass_nms,
...
@@ -427,5 +443,8 @@ REGISTER_LITE_KERNEL(multiclass_nms,
{
LiteType
::
GetTensorTy
(
TARGET
(
kFPGA
),
{
LiteType
::
GetTensorTy
(
TARGET
(
kFPGA
),
PRECISION
(
kFP16
),
PRECISION
(
kFP16
),
DATALAYOUT
(
kNHWC
))})
DATALAYOUT
(
kNHWC
))})
.
BindOutput
(
"Out"
,
{
LiteType
::
GetTensorTy
(
TARGET
(
kHost
))})
.
BindOutput
(
"Out"
,
{
LiteType
::
GetTensorTy
(
TARGET
(
kFPGA
),
PRECISION
(
kFloat
),
DATALAYOUT
(
kNHWC
))})
.
Finalize
();
.
Finalize
();
lite/kernels/fpga/prior_box_compute.cc
浏览文件 @
00aa83e5
...
@@ -131,3 +131,27 @@ REGISTER_LITE_KERNEL(prior_box,
...
@@ -131,3 +131,27 @@ REGISTER_LITE_KERNEL(prior_box,
.
BindOutput
(
"Boxes"
,
{
LiteType
::
GetTensorTy
(
TARGET
(
kARM
))})
.
BindOutput
(
"Boxes"
,
{
LiteType
::
GetTensorTy
(
TARGET
(
kARM
))})
.
BindOutput
(
"Variances"
,
{
LiteType
::
GetTensorTy
(
TARGET
(
kARM
))})
.
BindOutput
(
"Variances"
,
{
LiteType
::
GetTensorTy
(
TARGET
(
kARM
))})
.
Finalize
();
.
Finalize
();
// REGISTER_LITE_KERNEL(prior_box,
// kFPGA,
// kFP16,
// kNHWC,
// paddle::lite::kernels::fpga::PriorBoxCompute,
// def)
// .BindInput("Input",
// {LiteType::GetTensorTy(TARGET(kFPGA),
// PRECISION(kFP16),
// DATALAYOUT(kNHWC))})
// .BindInput("Image",
// {LiteType::GetTensorTy(TARGET(kFPGA),
// PRECISION(kFP16),
// DATALAYOUT(kNHWC))})
// .BindOutput("Boxes",
// {LiteType::GetTensorTy(TARGET(kFPGA),
// PRECISION(kFP16),
// DATALAYOUT(kNHWC))})
// .BindOutput("Variances",
// {LiteType::GetTensorTy(TARGET(kFPGA),
// PRECISION(kFP16),
// DATALAYOUT(kNHWC))})
// .Finalize();
lite/kernels/fpga/split_compute.cc
0 → 100644
浏览文件 @
00aa83e5
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/fpga/split_compute.h"
#include <vector>
#include "lite/backends/arm/math/funcs.h"
namespace
paddle
{
namespace
lite
{
namespace
kernels
{
namespace
fpga
{
void
SplitCompute
::
PrepareForRun
()
{
auto
&
param
=
Param
<
operators
::
SplitParam
>
();
zynqmp
::
SplitParam
&
split_param
=
pe_
.
param
();
split_param
.
input
=
param
.
x
->
ZynqTensor
();
auto
&
dout
=
param
.
output
;
for
(
int
i
=
0
;
i
<
dout
.
size
();
i
++
)
{
dout
[
i
]
->
mutable_data
<
zynqmp
::
float16
>
();
split_param
.
outputs
.
push_back
(
dout
[
i
]
->
ZynqTensor
());
}
pe_
.
init
();
pe_
.
apply
();
}
void
SplitCompute
::
Run
()
{
zynqmp
::
SplitParam
&
split_param
=
pe_
.
param
();
pe_
.
dispatch
();
#ifdef FPGA_PRINT_TENSOR
auto
&
dout
=
param
.
output
;
for
(
int
i
=
0
;
i
<
dout
.
size
();
i
++
)
{
Debugger
::
get_instance
().
registerOutput
(
"split"
,
split_param
.
outputs
[
0
]);
}
#endif
}
}
// namespace fpga
}
// namespace kernels
}
// namespace lite
}
// namespace paddle
REGISTER_LITE_KERNEL
(
split
,
kFPGA
,
kFP16
,
kNHWC
,
paddle
::
lite
::
kernels
::
fpga
::
SplitCompute
,
def
)
.
BindInput
(
"X"
,
{
LiteType
::
GetTensorTy
(
TARGET
(
kFPGA
),
PRECISION
(
kFP16
),
DATALAYOUT
(
kNHWC
))})
.
BindInput
(
"AxisTensor"
,
{
LiteType
::
GetTensorTy
(
TARGET
(
kARM
),
PRECISION
(
kInt32
))})
.
BindInput
(
"SectionsTensorList"
,
{
LiteType
::
GetTensorTy
(
TARGET
(
kARM
),
PRECISION
(
kInt32
))})
.
BindOutput
(
"Out"
,
{
LiteType
::
GetTensorTy
(
TARGET
(
kFPGA
),
PRECISION
(
kFP16
),
DATALAYOUT
(
kNHWC
))})
.
Finalize
();
lite/kernels/fpga/split_compute.h
0 → 100644
浏览文件 @
00aa83e5
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <algorithm>
#include "lite/core/kernel.h"
#include "lite/core/op_registry.h"
#include "lite/backends/fpga/KD/float16.hpp"
#include "lite/backends/fpga/KD/pes/split_pe.hpp"
namespace
paddle
{
namespace
lite
{
namespace
kernels
{
namespace
fpga
{
class
SplitCompute
:
public
KernelLite
<
TARGET
(
kFPGA
),
PRECISION
(
kFP16
),
DATALAYOUT
(
kNHWC
)
>
{
public:
void
PrepareForRun
()
override
;
void
Run
()
override
;
virtual
~
SplitCompute
()
=
default
;
private:
zynqmp
::
SplitPE
pe_
;
};
}
// namespace fpga
}
// namespace kernels
}
// namespace lite
}
// namespace paddle
lite/kernels/fpga/transpose_compute.cc
浏览文件 @
00aa83e5
...
@@ -81,7 +81,17 @@ void transposeCompute(operators::TransposeParam param) {
...
@@ -81,7 +81,17 @@ void transposeCompute(operators::TransposeParam param) {
}
}
// Transpose
// Transpose
void
TransposeCompute
::
Run
()
{
auto
&
param
=
this
->
Param
<
param_t
>
();
}
void
TransposeCompute
::
Run
()
{
auto
&
param
=
this
->
Param
<
param_t
>
();
param
.
output
->
mutable_data
<
zynqmp
::
float16
>
();
param
.
x
->
ZynqTensor
()
->
invalidate
();
param
.
x
->
ZynqTensor
()
->
unalignImage
();
if
(
param
.
x
->
dims
().
size
()
!=
4
)
{
transposeCompute
(
param
);
}
else
{
param
.
output
->
ZynqTensor
()
->
copyFrom
(
param
.
x
->
ZynqTensor
());
}
}
// Transpose2
// Transpose2
void
Transpose2Compute
::
Run
()
{
void
Transpose2Compute
::
Run
()
{
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录