Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
Paddle-Lite
提交
a59d6fab
P
Paddle-Lite
项目概览
PaddlePaddle
/
Paddle-Lite
通知
332
Star
4
Fork
1
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
271
列表
看板
标记
里程碑
合并请求
78
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle-Lite
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
271
Issue
271
列表
看板
标记
里程碑
合并请求
78
合并请求
78
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
a59d6fab
编写于
6月 19, 2020
作者:
C
chonwhite
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
arm & fpga kernel works together
上级
2b32484a
变更
41
隐藏空白更改
内联
并排
Showing
41 changed file
with
1235 addition
and
289 deletion
+1235
-289
lite/backends/fpga/KD/debugger.hpp
lite/backends/fpga/KD/debugger.hpp
+1
-0
lite/backends/fpga/KD/dispatch/action.hpp
lite/backends/fpga/KD/dispatch/action.hpp
+36
-0
lite/backends/fpga/KD/dispatch/transaction.hpp
lite/backends/fpga/KD/dispatch/transaction.hpp
+40
-0
lite/backends/fpga/KD/dispatch/transaction_manager.hpp
lite/backends/fpga/KD/dispatch/transaction_manager.hpp
+47
-0
lite/backends/fpga/KD/llapi/filter.cpp
lite/backends/fpga/KD/llapi/filter.cpp
+2
-2
lite/backends/fpga/KD/pes/conv_process.hpp
lite/backends/fpga/KD/pes/conv_process.hpp
+6
-8
lite/backends/fpga/KD/pes/input_pe.hpp
lite/backends/fpga/KD/pes/input_pe.hpp
+3
-1
lite/backends/fpga/KD/pes/norm_pe.hpp
lite/backends/fpga/KD/pes/norm_pe.hpp
+1
-0
lite/backends/fpga/KD/pes/output_pe.hpp
lite/backends/fpga/KD/pes/output_pe.hpp
+2
-2
lite/backends/fpga/KD/pes/prior_box_pe.cpp
lite/backends/fpga/KD/pes/prior_box_pe.cpp
+4
-3
lite/backends/fpga/KD/pes/prior_box_pe.hpp
lite/backends/fpga/KD/pes/prior_box_pe.hpp
+7
-0
lite/backends/fpga/KD/pes/resize_pe.hpp
lite/backends/fpga/KD/pes/resize_pe.hpp
+35
-1
lite/backends/fpga/KD/pes/scale_pe.hpp
lite/backends/fpga/KD/pes/scale_pe.hpp
+23
-16
lite/backends/fpga/KD/pes/softmax_pe.cpp
lite/backends/fpga/KD/pes/softmax_pe.cpp
+1
-0
lite/backends/fpga/KD/pes/split_pe.hpp
lite/backends/fpga/KD/pes/split_pe.hpp
+1
-1
lite/backends/fpga/KD/tensor.hpp
lite/backends/fpga/KD/tensor.hpp
+25
-10
lite/backends/fpga/lite_tensor.cc
lite/backends/fpga/lite_tensor.cc
+1
-0
lite/backends/fpga/lite_tensor.h
lite/backends/fpga/lite_tensor.h
+26
-1
lite/core/mir/kernel_place_correct_pass.h
lite/core/mir/kernel_place_correct_pass.h
+142
-1
lite/core/mir/static_kernel_pick_pass.h
lite/core/mir/static_kernel_pick_pass.h
+17
-0
lite/core/mir/type_precision_cast_pass.cc
lite/core/mir/type_precision_cast_pass.cc
+10
-0
lite/core/mir/type_target_cast_pass.cc
lite/core/mir/type_target_cast_pass.cc
+10
-1
lite/kernels/arm/concat_compute.cc
lite/kernels/arm/concat_compute.cc
+17
-15
lite/kernels/fpga/CMakeLists.txt
lite/kernels/fpga/CMakeLists.txt
+2
-0
lite/kernels/fpga/calib_compute.cc
lite/kernels/fpga/calib_compute.cc
+28
-1
lite/kernels/fpga/calib_compute.h
lite/kernels/fpga/calib_compute.h
+12
-0
lite/kernels/fpga/concat_compute.cc
lite/kernels/fpga/concat_compute.cc
+2
-1
lite/kernels/fpga/conv_compute.cc
lite/kernels/fpga/conv_compute.cc
+11
-0
lite/kernels/fpga/elementwise_compute.cc
lite/kernels/fpga/elementwise_compute.cc
+46
-6
lite/kernels/fpga/fetch_compute.cc
lite/kernels/fpga/fetch_compute.cc
+12
-12
lite/kernels/fpga/interpolate_compute.cc
lite/kernels/fpga/interpolate_compute.cc
+282
-0
lite/kernels/fpga/interpolate_compute.h
lite/kernels/fpga/interpolate_compute.h
+50
-0
lite/kernels/fpga/io_copy_compute.cc
lite/kernels/fpga/io_copy_compute.cc
+109
-87
lite/kernels/fpga/multiclass_nms_compute.cc
lite/kernels/fpga/multiclass_nms_compute.cc
+101
-70
lite/kernels/fpga/prior_box_compute.cc
lite/kernels/fpga/prior_box_compute.cc
+2
-1
lite/kernels/fpga/reshape_compute.cc
lite/kernels/fpga/reshape_compute.cc
+65
-32
lite/kernels/fpga/reshape_compute.h
lite/kernels/fpga/reshape_compute.h
+8
-0
lite/kernels/fpga/scale_compute.cc
lite/kernels/fpga/scale_compute.cc
+2
-2
lite/kernels/fpga/scale_compute.h
lite/kernels/fpga/scale_compute.h
+2
-0
lite/kernels/fpga/softmax_compute.cc
lite/kernels/fpga/softmax_compute.cc
+20
-5
lite/kernels/fpga/transpose_compute.cc
lite/kernels/fpga/transpose_compute.cc
+24
-10
未找到文件。
lite/backends/fpga/KD/debugger.hpp
浏览文件 @
a59d6fab
...
...
@@ -73,6 +73,7 @@ class Debugger {
op_config
[
"nms"
]
=
true
;
op_config
[
"pb_boxes"
]
=
true
;
op_config
[
"pb_variances"
]
=
true
;
op_config
[
"reshape"
]
=
true
;
op_config
[
"softmax"
]
=
true
;
op_config
[
"split"
]
=
true
;
}
...
...
lite/backends/fpga/KD/dispatch/action.hpp
0 → 100644
浏览文件 @
a59d6fab
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
namespace
paddle
{
namespace
zynqmp
{
// One unit of work inside a Transaction.
// Both scale hooks are currently empty placeholders: they accept a pointer
// but neither read from nor write to it.
class Action {
 public:
  // Placeholder with an empty body; `scale` is left untouched.
  void readScale(float* scale) {}

  // Placeholder with an empty body; `scale` is left untouched.
  void writeScale(float* scale) {}

 private:
  // Both fields start at -1 and are not modified by any code in this class.
  int id_ = -1;
  int scaleIndex_ = -1;
};  // the terminating ';' is required after a class definition
}
}
\ No newline at end of file
lite/backends/fpga/KD/dispatch/transaction.hpp
0 → 100644
浏览文件 @
a59d6fab
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "lite/backends/fpga/KD/dispatch/action.hpp"
#include <algorithm>
#include <vector>
namespace
paddle
{
namespace
zynqmp
{
class
Transaction
{
public:
void
appendAction
(
Action
*
action
)
{
actions_
.
push_back
(
action
);
};
void
startTraction
()
{
};
private:
std
::
std
::
vector
<
Action
*>
actions_
;
int
id_
=
-
1
;
}
}
}
\ No newline at end of file
lite/backends/fpga/KD/dispatch/transaction_manager.hpp
0 → 100644
浏览文件 @
a59d6fab
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <vector>
namespace
paddle
{
namespace
zynqmp
{
class
TransactionManager
{
public:
static
TransactionManager
&
get_instance
()
{
static
TransactionManager
s_instance
;
return
s_instance
;
}
Transaction
*
getTransaction
()
{
if
(
currentTransaction_
==
nullptr
)
{
currentTransaction_
=
new
Transaction
();
transactions_
.
push_back
(
currentTransaction_
);
}
return
currentTransaction_
;
};
void
endTransaction
()
{
currentTransaction_
=
nullptr
;
}
private:
Transaction
*
currentTransaction_
=
nullptr
;
std
::
vector
<
Transaction
*>
transactions_
;
}
}
}
\ No newline at end of file
lite/backends/fpga/KD/llapi/filter.cpp
浏览文件 @
a59d6fab
...
...
@@ -240,8 +240,8 @@ int8_t* format_filter(float* data_in,
for
(
int
n
=
0
;
n
<
num
;
n
++
)
{
float
*
filter_start
=
data_in
+
n
*
chw
;
int8_t
*
quantized_start
=
quantized_data
+
n
*
chw
;
//
float f_max = find_max(filter_start, chw);
float
f_max
=
max
;
float
f_max
=
find_max
(
filter_start
,
chw
);
//
float f_max = max;
quantize
(
filter_start
,
quantized_start
,
chw
,
f_max
);
filter_max
.
push_back
(
f_max
);
}
...
...
lite/backends/fpga/KD/pes/conv_process.hpp
浏览文件 @
a59d6fab
...
...
@@ -264,10 +264,10 @@ inline void format_filter(Tensor* filter,
quantized_filter
->
flush
();
fpga_free
(
quantized_data
);
//
for (size_t i = 0; i < max_values.size(); i++) {
// //
scales.push_back(max_values[i] / max_value);
//
scales.push_back(1.0f);
//
}
for
(
size_t
i
=
0
;
i
<
max_values
.
size
();
i
++
)
{
scales
.
push_back
(
max_values
[
i
]
/
max_value
);
//
scales.push_back(1.0f);
}
// filter->saveToFile("filter.txt");
// std::ofstream ofs;
...
...
@@ -374,17 +374,15 @@ inline void split_filter_num(const ConvParam& c_param) {
std
::
vector
<
float
>
v
;
// TODO(chonwhite) change variable name;
format_filter
(
&
new_filter
,
&
(
conv_param
->
filter
),
param
.
groups
,
v
,
max
);
conv_param
->
filter
.
setDataType
(
INT8
);
Tensor
scale
;
Tensor
bias
;
int
chnnnel_start
=
i
*
filter_num_per_div
;
Shape
s_shape
(
NC
,
{
1
,
filter_num
});
float
*
scale_data
=
scale
.
mutableData
<
float
>
(
FP32
,
s_shape
);
float
*
bias_data
=
bias
.
mutableData
<
float
>
(
FP32
,
s_shape
);
for
(
int
n
=
0
;
n
<
filter_num
;
n
++
)
{
scale_data
[
n
]
=
param
.
scale
()
->
data
<
float
>
()[
n
+
chnnnel_start
];
scale_data
[
n
]
=
param
.
scale
()
->
data
<
float
>
()[
n
+
chnnnel_start
]
*
v
[
n
]
;
}
for
(
int
n
=
0
;
n
<
filter_num
;
n
++
)
{
bias_data
[
n
]
=
param
.
bias
()
->
data
<
float
>
()[
n
+
chnnnel_start
];
...
...
@@ -513,7 +511,7 @@ inline void pack_channel_filter(const ConvParam& c_param) {
float
*
scale_data
=
scale
.
mutableData
<
float
>
(
FP32
,
s_shape
);
float
*
bias_data
=
bias
.
mutableData
<
float
>
(
FP32
,
s_shape
);
for
(
int
n
=
0
;
n
<
filter_current_pack
;
n
++
)
{
scale_data
[
n
]
=
param
.
scale
()
->
data
<
float
>
()[
n
+
chnnnel_start
];
scale_data
[
n
]
=
param
.
scale
()
->
data
<
float
>
()[
n
+
chnnnel_start
]
*
v
[
n
]
;
}
for
(
int
n
=
0
;
n
<
filter_current_pack
;
n
++
)
{
bias_data
[
n
]
=
param
.
bias
()
->
data
<
float
>
()[
n
+
chnnnel_start
];
...
...
lite/backends/fpga/KD/pes/input_pe.hpp
浏览文件 @
a59d6fab
...
...
@@ -41,7 +41,9 @@ class InputPE : public PE {
src
=
&
half_tensor
;
}
output
->
mutableData
<
void
>
();
src
->
alignImage
(
output
,
true
);
src
->
alignImage
();
output
->
copyFrom
(
src
);
// src->alignImage(output, true);
return
true
;
}
...
...
lite/backends/fpga/KD/pes/norm_pe.hpp
浏览文件 @
a59d6fab
...
...
@@ -103,6 +103,7 @@ class NormPE : public PE {
float_out
.
flush
();
// float_out.saveToFile("normalize_", true);
param_
.
output
->
copyFrom
(
&
float_out
);
param_
.
output
->
flush
();
}
bool
dispatch
()
{
...
...
lite/backends/fpga/KD/pes/output_pe.hpp
浏览文件 @
a59d6fab
...
...
@@ -56,8 +56,8 @@ class OutputPE : public PE {
fpga_reset
();
auto
max
=
fpga_get_memory_size_max
();
std
::
cout
<<
"PL ===== Max: ===== :: "
<<
max
<<
std
::
endl
;
//
auto max = fpga_get_memory_size_max();
//
std::cout << "PL ===== Max: ===== :: " << max << std::endl;
return
true
;
}
...
...
lite/backends/fpga/KD/pes/prior_box_pe.cpp
浏览文件 @
a59d6fab
...
...
@@ -241,7 +241,7 @@ void PriorBoxPE::compute_prior_box() {
}
boxes
.
flush
();
boxes
.
syncToCPU
();
//
boxes.syncToCPU();
variances
.
flush
();
output_boxes
->
copyFrom
(
&
boxes
);
output_variances
->
copyFrom
(
&
variances
);
...
...
@@ -261,11 +261,12 @@ bool PriorBoxPE::dispatch() {
}
param_
.
outputBoxes
->
copyFrom
(
this
->
cachedBoxes_
);
param_
.
outputVariances
->
copyFrom
(
this
->
cachedVariances_
);
param_
.
outputBoxes
->
flush
();
param_
.
outputBoxes
->
syncToCPU
();
//
param_.outputBoxes->syncToCPU();
param_
.
outputVariances
->
flush
();
return
true
;
}
}
// namespace zynqmp
...
...
lite/backends/fpga/KD/pes/prior_box_pe.hpp
浏览文件 @
a59d6fab
...
...
@@ -35,6 +35,13 @@ class PriorBoxPE : public PE {
PriorBoxParam
&
param
()
{
return
param_
;
}
// Frees the cached tensors. Both deletes are skipped when cachedBoxes_ is
// null, so cachedVariances_ is only released together with cachedBoxes_.
~PriorBoxPE() {
  if (cachedBoxes_ == nullptr) {
    return;
  }
  delete cachedBoxes_;
  delete cachedVariances_;
}
private:
PriorBoxParam
param_
;
Tensor
*
cachedBoxes_
=
nullptr
;
...
...
lite/backends/fpga/KD/pes/resize.hpp
→
lite/backends/fpga/KD/pes/resize
_pe
.hpp
浏览文件 @
a59d6fab
...
...
@@ -73,9 +73,43 @@ class ResizePE : public PE {
scale
[
0
]
=
max
/
127.0
;
scale
[
1
]
=
127.0
/
max
;
}
void
cpu_compute
()
{
Shape
&
in_shape
=
param_
.
input
->
shape
();
Shape
&
out_shape
=
param_
.
output
->
shape
();
int
channel
=
in_shape
.
channel
();
int
in_height
=
in_shape
.
height
();
int
in_width
=
in_shape
.
width
();
int
out_width
=
out_shape
.
width
();
int
factor
=
out_shape
.
width
()
/
in_shape
.
width
();
param_
.
input
->
syncToCPU
();
for
(
int
h
=
0
;
h
<
in_height
;
h
++
)
{
for
(
int
w
=
0
;
w
<
in_width
;
w
++
)
{
int
src_index
=
in_width
*
channel
*
h
+
w
*
channel
;
float16
*
src
=
param_
.
input
->
data
<
float16
>
()
+
src_index
;
// std::cout << "src_index:" << src_index << std::endl;
for
(
int
v
=
0
;
v
<
factor
;
v
++
)
{
for
(
int
i
=
0
;
i
<
factor
;
i
++
)
{
int
dst_index
=
out_width
*
channel
*
h
*
factor
+
out_width
*
channel
*
v
+
w
*
channel
*
factor
+
channel
*
i
;
float16
*
dst
=
param_
.
output
->
data
<
float16
>
()
+
dst_index
;
memcpy
(
dst
,
src
,
channel
*
sizeof
(
float16
));
// std::cout << "dst_index:" << dst_index << std::endl;
}
}
}
}
param_
.
output
->
flush
();
param_
.
output
->
copyScaleFrom
(
param_
.
input
);
}
bool
dispatch
()
{
bool
ret
=
compute_fpga_resize
(
args_
)
==
0
;
cpu_compute
();
// bool ret = compute_fpga_resize(args_) == 0;
return
true
;
}
...
...
lite/backends/fpga/KD/pes/scale_pe.hpp
浏览文件 @
a59d6fab
...
...
@@ -141,22 +141,26 @@ class ScalePE : public PE {
Tensor
*
output
=
param_
.
output
;
Tensor
float_input
;
float
*
image_addr
=
float_input
.
mutableData
<
float
>
(
FP32
,
input
->
shape
());
input
->
syncToCPU
();
// input->syncToCPU();
// input->invalidate();
float_input
.
copyFrom
(
input
);
float16
*
data_out
=
output
->
data
<
float16
>
();
float
*
scale_data
=
param_
.
scale
->
data
<
float
>
();
float
16
*
scale_data
=
param_
.
scale
->
data
<
float16
>
();
int
wh
=
input
->
shape
().
width
()
*
input
->
shape
().
height
();
float16
*
in_data
=
input
->
data
<
float16
>
();
float
max
=
0
;
for
(
int
i
=
0
;
i
<
wh
;
i
++
)
{
for
(
int
c
=
0
;
c
<
input
->
shape
().
channel
();
c
++
)
{
int
index
=
i
*
input
->
shape
().
channel
()
+
c
;
float
value
=
half_to_float
(
in_data
[
index
])
*
scale_data
[
c
];
float
x
=
image_addr
[
index
];
float
y
=
half_to_float
(
scale_data
[
c
]);
float
value
=
x
*
y
;
// std::cout << " x = " << std::to_string(x) << " y = " << std::to_string(y) << " v = " << std::to_string(value) << std::endl;
// float value = half_to_float(in_data[index]) * 19.3598f;
data_out
[
index
]
=
float_to_half
(
value
);
if
(
value
<
0
)
{
...
...
@@ -167,24 +171,27 @@ class ScalePE : public PE {
}
}
}
// exit(-1);
output
->
flush
();
output
->
scale
()[
0
]
=
max
/
127.0
f
;
output
->
scale
()[
1
]
=
127.0
f
/
max
;
}
bool
dispatch
()
{
if
(
param_
.
scale
->
dataType
()
==
FP16
)
{
DepthwiseConvParam
&
dw_param
=
dw_pe_
.
param
();
memcpy
(
dw_param
.
quantizedFilter
()
->
mutableData
<
float16
>
(),
param_
.
scale
->
data
<
float16
>
(),
param_
.
scale
->
shape
().
numel
()
*
sizeof
(
float16
));
dw_param
.
quantizedFilter
()
->
scale
()[
0
]
=
param_
.
scale
->
scale
()[
0
];
dw_param
.
quantizedFilter
()
->
scale
()[
1
]
=
param_
.
scale
->
scale
()[
1
];
dw_param
.
quantizedFilter
()
->
flush
();
}
param_
.
input
->
syncToDevice
();
return
dw_pe_
.
dispatch
();
// if (param_.scale->dataType() == FP16) {
// DepthwiseConvParam& dw_param = dw_pe_.param();
// memcpy(dw_param.quantizedFilter()->mutableData<float16>(),
// param_.scale->data<float16>(),
// param_.scale->shape().numel() * sizeof(float16));
// dw_param.quantizedFilter()->scale()[0] = param_.scale->scale()[0];
// dw_param.quantizedFilter()->scale()[1] = param_.scale->scale()[1];
// dw_param.quantizedFilter()->flush();
// }
// param_.input->syncToDevice();
// return dw_pe_.dispatch();
cpu_compute
();
return
true
;
}
ScaleParam
&
param
()
{
return
param_
;
}
...
...
lite/backends/fpga/KD/pes/softmax_pe.cpp
浏览文件 @
a59d6fab
...
...
@@ -154,6 +154,7 @@ bool SoftmaxPE::dispatch() {
float_output
.
flush
();
output
->
copyFrom
(
&
float_output
);
output
->
flush
();
return
true
;
}
...
...
lite/backends/fpga/KD/pes/split_pe.hpp
浏览文件 @
a59d6fab
...
...
@@ -105,7 +105,7 @@ class SplitPE : public PE {
in_stride
,
out_stride
[
axis
]);
input_offset
+=
out_stride
[
axis
];
//
out->flush();
out
->
flush
();
}
return
true
;
}
...
...
lite/backends/fpga/KD/tensor.hpp
浏览文件 @
a59d6fab
...
...
@@ -266,22 +266,25 @@ class Tensor {
return
;
}
BypassArgs
args
;
args
.
input_data_type
=
src
->
dataType_
==
FP32
?
DATA_TYPE_FP32
:
DATA_TYPE_FP16
;
args
.
input_data_type
=
src
->
dataType_
==
FP32
?
DATA_TYPE_FP32
:
DATA_TYPE_FP16
;
args
.
output_data_type
=
dataType_
==
FP32
?
DATA_TYPE_FP32
:
DATA_TYPE_FP16
;
args
.
input_layout_type
=
LAYOUT_HWC
;
args
.
output_layout_type
=
LAYOUT_HWC
;
args
.
image
=
{.
address
=
src
->
data
<
void
>
(),
.
scale_address
=
src
->
scale
(),
.
channels
=
(
uint32_t
)
src
->
shape
().
numel
(),
.
width
=
1
,
.
height
=
1
,
.
pad_width
=
0u
,
.
pad_height
=
0u
};
args
.
image
=
{
.
address
=
src
->
data
<
void
>
(),
.
scale_address
=
src
->
scale
(),
.
channels
=
(
uint32_t
)
src
->
shape
().
numel
(),
.
width
=
1
,
.
height
=
1
,
.
pad_width
=
0U
,
.
pad_height
=
0U
};
ImageOutputArgs
output
=
{
.
address
=
data
<
void
>
(),
.
scale_address
=
scale
(),
.
address
=
data
<
void
>
(),
.
scale_address
=
scale
(),
};
args
.
output
=
output
;
size_t
aligned_remainder
=
src
->
shape
().
numel
()
%
16
;
if
(
aligned_remainder
>
0
)
{
...
...
@@ -380,6 +383,10 @@ class Tensor {
}
void
save_file_with_name
(
std
::
string
path
)
{
// std::cout << "saving file: " << path << std::endl;
void
*
add
=
(
void
*
)
this
;
// printf("tensor @: %p data: %p \n", (void *)add, (void*)data<void>());
// return;
std
::
ofstream
ofs
;
ofs
.
open
(
path
);
ofs
<<
scale
()[
0
]
<<
" / "
<<
scale
()[
1
]
<<
std
::
endl
;
...
...
@@ -399,8 +406,15 @@ class Tensor {
if
(
dataType_
==
INT32
)
{
value
=
data
<
int32_t
>
()[
i
];
}
if
(
i
<
10
)
{
std
::
cout
<<
value
<<
","
;
}
ofs
<<
value
<<
std
::
endl
;
}
usleep
(
30000
);
ofs
.
close
();
}
...
...
@@ -451,6 +465,7 @@ class Tensor {
value
=
half_to_float
(
tensor
.
data
<
float16
>
()[
i
]);
}
os
<<
value
<<
" "
;
}
os
<<
"
\n
"
;
return
os
;
...
...
lite/backends/fpga/lite_tensor.cc
浏览文件 @
a59d6fab
...
...
@@ -102,6 +102,7 @@ void TensorLite::CopyDataFrom(const TensorLite &other) {
Resize
(
other
.
dims
());
auto
shape
=
other
.
zynq_tensor_
->
shape
();
zynq_tensor_
->
mutableData
<
void
>
(
zynq_tensor_
->
dataType
(),
shape
);
precision_
=
other
.
precision_
;
// this->ZynqTensor()->copyFrom(other.ZynqTensor());
memcpy
(
this
->
ZynqTensor
()
->
data
<
void
>
(),
...
...
lite/backends/fpga/lite_tensor.h
浏览文件 @
a59d6fab
...
...
@@ -109,6 +109,7 @@ class TensorLite {
template
<
typename
T
,
typename
R
=
T
>
const
R
*
data
()
const
{
return
zynq_tensor_
->
data
<
R
>
()
+
offset_
;
// return zynq_tensor_->data<R>();
}
void
Resize
(
const
DDimLite
&
ddim
)
{
dims_
=
ddim
;
}
...
...
@@ -198,7 +199,8 @@ class TensorLite {
// set values of precision_ and persistable_ after updating it.
// If your tensor is just a temp tensor, such as activations,
// you can ignore these two attributes.
PrecisionType
precision_
{
PrecisionType
::
kUnk
};
// PrecisionType precision_{PrecisionType::kUnk};
PrecisionType
precision_
{
PrecisionType
::
kFloat
};
bool
persistable_
{
false
};
DDimLite
dims_
;
...
...
@@ -235,6 +237,28 @@ zynqmp::DataType get_date_type() {
return
data_type
;
}
// Maps the element type T to the corresponding PrecisionType:
//   float -> kFloat, zynqmp::float16 -> kFP16, int / int32_t -> kInt32,
//   int8_t -> kInt8, anything else -> kUnk.
// int8_t is tested first and float last, so if two of the listed types ever
// name the same type, the entry listed later in the table above wins.
template <typename T>
PrecisionType get_precistion_type() {
  const std::type_info& requested = typeid(T);

  if (requested == typeid(int8_t)) {
    return PrecisionType::kInt8;
  }
  if (requested == typeid(int32_t)) {
    return PrecisionType::kInt32;
  }
  if (requested == typeid(int)) {
    return PrecisionType::kInt32;
  }
  if (requested == typeid(zynqmp::float16)) {
    return PrecisionType::kFP16;
  }
  if (requested == typeid(float)) {
    return PrecisionType::kFloat;
  }
  return PrecisionType::kUnk;
}
template
<
typename
T
,
typename
R
>
R
*
TensorLite
::
mutable_data
()
{
std
::
vector
<
int
>
v
;
...
...
@@ -261,6 +285,7 @@ R *TensorLite::mutable_data() {
}
zynqmp
::
Shape
input_shape
(
layout_type
,
v
);
zynqmp
::
DataType
data_type
=
get_date_type
<
T
>
();
precision_
=
get_precistion_type
<
T
>
();
if
(
zynq_tensor_
.
get
()
==
nullptr
)
{
zynq_tensor_
.
reset
(
new
zynqmp
::
Tensor
());
...
...
lite/core/mir/kernel_place_correct_pass.h
浏览文件 @
a59d6fab
...
...
@@ -50,6 +50,7 @@ class KernelPlaceCorrectPass : public DebugPass {
VLOG
(
4
)
<<
"lite_with_targets['kFPGA']:"
<<
lite_with_targets
[
"kFPGA"
];
VLOG
(
3
)
<<
"param-type-registry:
\n
"
<<
ParamTypeRegistry
::
Global
();
// std::cout << ""
for
(
auto
&
x
:
graph
->
StmtTopologicalOrder
())
{
auto
&
inst
=
x
->
AsStmt
();
// The IoCopyOp is a tool operator, it won't support the type inference.
...
...
@@ -77,6 +78,80 @@ class KernelPlaceCorrectPass : public DebugPass {
bool
need_correct_place
=
true
;
auto
in
=
x
->
inlinks
.
front
();
auto
out
=
x
->
outlinks
.
front
();
auto
p
=
in
->
AsArg
().
type
->
precision
();
std
::
string
node_name
=
out
->
AsArg
().
name
;
std
::
string
arg_name
=
get_argname
(
node_name
,
inst
.
op_info
()
->
outputs
());
auto
op_type
=
inst
.
op_type
();
if
(
op_type
==
"reshape"
||
op_type
==
"reshape2"
)
{
for
(
auto
*
x_in
:
x
->
inlinks
)
{
std
::
string
in_name
=
get_argname
(
x_in
->
AsArg
().
name
,
inst
.
op_info
()
->
inputs
());
// std::cout << "name: " << x_in->AsArg().name << std::endl;
// std::cout << "in_name: " << in_name << std::endl;
if
(
in_name
==
"X"
)
{
in
=
x_in
;
std
::
cout
<<
"found input
\n
"
;
// exit(-1);
}
}
p
=
in
->
AsArg
().
type
->
precision
();
if
(
p
!=
PrecisionType
::
kFP16
)
{
// std::cout << "found an arm ............... : " << inst.kernels().size() << std::endl;
// std::cout << "tt:" << TargetRepr(inst.kernels()[0]->target()) << std::endl;
UpdateTarget
(
inst
,
TargetType
::
kHost
);
UpdateTensor
(
inst
,
in
,
out
,
TargetType
::
kHost
);
}
}
if
(
inst
.
op_type
()
==
"fetch"
)
{
UpdateTarget
(
inst
,
TargetType
::
kFPGA
);
}
if
(
inst
.
op_type
()
==
"split"
||
inst
.
op_type
()
==
"transpose"
)
{
if
(
p
!=
PrecisionType
::
kFP16
)
{
UpdateTarget
(
inst
,
TargetType
::
kARM
);
for
(
auto
*
x_out
:
x
->
outlinks
)
{
UpdateTensor
(
inst
,
in
,
x_out
,
TargetType
::
kARM
);
}
}
}
if
(
inst
.
op_type
()
==
"concat"
)
{
std
::
cout
<<
"concat target:"
<<
TargetRepr
(
inst
.
kernels
()[
0
]
->
target
())
<<
std
::
endl
;
std
::
cout
<<
"concat p:"
<<
PrecisionToStr
(
inst
.
kernels
()[
0
]
->
precision
())
<<
std
::
endl
;
if
(
p
!=
PrecisionType
::
kFP16
)
{
UpdateTarget
(
inst
,
TargetType
::
kARM
);
UpdateTensor
(
inst
,
in
,
out
,
TargetType
::
kARM
);
}
}
// if (inst.op_type() == "elementwise_mul") {
// for (auto* x_in : x->inlinks) {
// std::string in_name = get_argname(x_in->AsArg().name, inst.op_info()->inputs());
// std::cout << "name: " << x_in->AsArg().name << std::endl;
// std::cout << "in_name: " << in_name << std::endl;
// if (in_name == "Y") {
// in = x_in;
// std::cout << "found y \n";
// // exit(-1);
// }
// }
// if ( p != PrecisionType::kFP16) {
// UpdateTarget(inst, TargetType::kARM);
// UpdateTensor(inst, in, out, TargetType::kARM);
// }
// }
std
::
vector
<
TargetType
>
in_types
;
std
::
vector
<
TargetType
>
out_types
;
for
(
auto
*
x_in
:
x
->
inlinks
)
{
...
...
@@ -88,6 +163,21 @@ class KernelPlaceCorrectPass : public DebugPass {
<<
"-- node name:"
<<
node_name
;
auto
type
=
inst
.
picked_kernel
().
GetInputDeclType
(
arg_name
);
// std::cout << arg_name <<" is weight:: " << std::to_string(x_in->AsArg().is_weight)
// << " is persist: " << std::to_string(x_in->AsArg().is_persist) << std::endl;
// std::cout << " type: "<< inst.op_type() << std::endl;
if
(
!
x_in
->
AsArg
().
is_weight
)
{
auto
p
=
x_in
->
AsArg
().
type
->
precision
();
auto
t
=
x_in
->
AsArg
().
type
->
target
();
auto
l
=
x_in
->
AsArg
().
type
->
layout
();
// std::cout << "p:" << PrecisionToStr(p) << std::endl;
// std::cout << "t:" << TargetRepr(t) << std::endl;
// std::cout << "layout:" << DataLayoutToStr(l) << std::endl;
}
if
(
!
x_in
->
AsArg
().
type
)
{
need_correct_place
&=
false
;
}
else
{
...
...
@@ -129,18 +219,69 @@ class KernelPlaceCorrectPass : public DebugPass {
need_correct_place
&=
(
io_target_same
&&
(
in_types
[
0
]
!=
this_type
));
if
(
need_correct_place
)
{
// update this kernel's valid place;
UpdateTarget
(
inst
,
in_types
[
0
]);
//
UpdateTarget(inst, in_types[0]);
}
}
}
// Re-targets `inst`: takes a copy of its current place, overrides the target
// with `new_target`, and resets the statement's kernels to that single place.
// For kARM and kHost the precision is additionally forced to kFloat and the
// layout to kNCHW; other targets keep the precision/layout of the old place.
void UpdateTarget(mir::Node::Stmt& inst, TargetType new_target) {  // NOLINT
  auto place = inst.place();
  place.target = new_target;

  const bool is_arm_or_host = new_target == TargetType::kARM ||
                              new_target == TargetType::kHost;
  if (is_arm_or_host) {
    place.precision = PrecisionType::kFloat;
    place.layout = DataLayoutType::kNCHW;
  }

  std::vector<Place> places(1, place);
  inst.ResetKernels(places);
}
// Overwrites the tensor type recorded on `out`.
// By default the new type combines the target and layout declared by the
// picked kernel for the argument slot that `in` feeds with the precision
// currently recorded on `in`. When `new_target` is kARM or kHost, the type
// becomes (new_target, kFloat, kNCHW) instead.
// NOTE(review): `in->AsArg().type` and the declared input type are
// dereferenced without a null check — confirm callers guarantee both.
void UpdateTensor(mir::Node::Stmt& inst,
                  Node* in,
                  Node* out,
                  TargetType new_target = TargetType::kUnk) {
  // Returns the name of the argument slot whose variable list contains
  // `var_name`, or "" when no slot does.
  const auto find_slot =
      [](const std::string& var_name,
         const std::map<std::string, std::vector<std::string>>& slots)
      -> std::string {
    for (const auto& slot : slots) {
      const auto& vars = slot.second;
      if (std::find(vars.begin(), vars.end(), var_name) != vars.end()) {
        return slot.first;
      }
    }
    return "";
  };

  const std::string in_name =
      find_slot(in->AsArg().name, inst.op_info()->inputs());
  auto type = inst.picked_kernel().GetInputDeclType(in_name);

  auto tmp_ptype = in->AsArg().type->precision();
  auto tmp_target = type->target();
  auto tmp_layout = type->layout();

  // kARM and kHost share the same precision/layout override.
  if (new_target == TargetType::kARM || new_target == TargetType::kHost) {
    tmp_target = new_target;
    tmp_ptype = PrecisionType::kFloat;
    tmp_layout = DataLayoutType::kNCHW;
  }

  out->AsArg().type = LiteType::GetTensorTy(tmp_target, tmp_ptype, tmp_layout);
}
};
...
...
lite/core/mir/static_kernel_pick_pass.h
浏览文件 @
a59d6fab
...
...
@@ -144,6 +144,23 @@ class StaticKernelPickPass : public mir::StmtPass {
}
}
if
(
kernel
.
target
()
==
TARGET
(
kFPGA
))
{
final_score
=
4000
;
bool
in_match
=
true
;
for
(
size_t
i
=
0
;
i
<
in_names
.
size
();
++
i
)
{
std
::
string
tmp
;
CHECK
(
instruct
.
op_info
()
->
GetInputArgname
(
in_names
[
i
],
&
tmp
));
if
(
in_types
.
count
(
in_names
[
i
])
&&
in_types
.
at
(
in_names
[
i
])
!=
kernel
.
GetInputDeclType
(
tmp
)
->
precision
())
{
in_match
=
false
;
}
}
if
(
in_match
)
{
final_score
=
5000
;
}
}
VLOG
(
4
)
<<
"[score(final)]:"
<<
final_score
;
VLOG
(
2
)
<<
"-------- pick summary for "
<<
instruct
.
op_type
()
<<
" --------"
;
...
...
lite/core/mir/type_precision_cast_pass.cc
浏览文件 @
a59d6fab
...
...
@@ -134,6 +134,12 @@ void PrecisionCastPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
// Start from inputs of the graph, those should have place set.
std
::
list
<
Node
*>
nodes
;
for
(
auto
&
node
:
graph
->
StmtTopologicalOrder
())
{
// if (node->IsStmt()) {
// auto& s = node->AsStmt();
// std::cout << "type_precision type:" << s.op_type() << std::endl;
// }
// type_precision_cast_pass
nodes
.
push_back
(
node
);
}
...
...
@@ -231,6 +237,10 @@ void PrecisionCastPass::AddCastInst(
// create Op and kernels.
bool
in_persist
=
in
->
AsArg
().
is_weight
||
in
->
AsArg
().
is_persist
;
std
::
string
cast_type
=
in_persist
?
"calib_once"
:
"calib"
;
// TODO
cast_type
=
"calib"
;
cast_op_output_arg
->
AsArg
().
is_persist
=
in_persist
;
auto
cast_op
=
LiteOpRegistry
::
Global
().
Create
(
cast_type
);
CHECK
(
cast_op
)
<<
"create op ["
<<
cast_op
<<
"] failed"
;
...
...
lite/core/mir/type_target_cast_pass.cc
浏览文件 @
a59d6fab
...
...
@@ -32,6 +32,12 @@ void TypeTargetTransformPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
// Start from inputs of the graph, those should have place set.
std
::
list
<
Node
*>
nodes
;
for
(
auto
&
node
:
graph
->
StmtTopologicalOrder
())
{
// if (node->IsStmt()) {
// auto& s = node->AsStmt();
// // std::cout << "type_target type:" << s.op_type() << std::endl;
// }else {
// // std::cout << "type_target not a statement \n";
// }
nodes
.
push_back
(
node
);
}
...
...
@@ -47,6 +53,7 @@ void TypeTargetTransformPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
ComplementInputs
(
graph
.
get
(),
node
,
in
,
&
copied_nodes
);
}
}
}
void
TypeTargetTransformPass
::
ComplementInputs
(
...
...
@@ -127,7 +134,8 @@ void TypeTargetTransformPass::AddIoCopyInst(
auto
*
io_copy_inst
=
graph
->
NewInstructNode
();
bool
in_persist
=
in
->
AsArg
().
is_weight
||
in
->
AsArg
().
is_persist
;
std
::
string
io_copy_type
=
in_persist
?
"io_copy_once"
:
"io_copy"
;
// std::string io_copy_type = in_persist ? "io_copy_once" : "io_copy";
std
::
string
io_copy_type
=
"io_copy"
;
io_copy_output_arg
->
AsArg
().
is_persist
=
in_persist
;
// create Op and kernels.
auto
io_copy_op
=
LiteOpRegistry
::
Global
().
Create
(
io_copy_type
);
...
...
@@ -147,6 +155,7 @@ void TypeTargetTransformPass::AddIoCopyInst(
// fix(MyPandaShaoxiang): select kernel that input_dcl_type same as in.type
bool
is_found
=
false
;
std
::
vector
<
std
::
unique_ptr
<
KernelBase
>>
selected_kernels
;
std
::
cout
<<
"kernels:"
<<
std
::
to_string
(
kernels
.
size
())
<<
std
::
endl
;
for
(
auto
&
kernel
:
kernels
)
{
const
Type
*
in_arg_ty
=
kernel
->
GetInputDeclType
(
"Input"
);
const
Type
*
out_arg_ty
=
kernel
->
GetOutputDeclType
(
"Out"
);
...
...
lite/kernels/arm/concat_compute.cc
浏览文件 @
a59d6fab
...
...
@@ -64,6 +64,7 @@ void ConcatCompute::Run() {
auto
&
param
=
Param
<
operators
::
ConcatParam
>
();
std
::
vector
<
lite
::
Tensor
*>
inputs
=
param
.
x
;
CHECK_GE
(
inputs
.
size
(),
1
);
// std::cout << "concat size:" << std::to_string(inputs.size()) << std::endl;
auto
*
out
=
param
.
output
;
int
axis
=
param
.
axis
;
auto
*
axis_tensor
=
param
.
axis_tensor
;
...
...
@@ -72,21 +73,22 @@ void ConcatCompute::Run() {
axis
=
axis_tensor_data
[
0
];
}
switch
(
inputs
.
front
()
->
precision
())
{
case
PRECISION
(
kFloat
):
ConcatFunc
<
float
>
(
inputs
,
axis
,
out
);
break
;
case
PRECISION
(
kInt32
):
ConcatFunc
<
int32_t
>
(
inputs
,
axis
,
out
);
break
;
case
PRECISION
(
kInt64
):
ConcatFunc
<
int64_t
>
(
inputs
,
axis
,
out
);
break
;
default:
LOG
(
FATAL
)
<<
"Concat does not implement for the "
<<
"input type:"
<<
static_cast
<
int
>
(
inputs
.
front
()
->
precision
());
}
ConcatFunc
<
float
>
(
inputs
,
axis
,
out
);
// switch (inputs.front()->precision()) {
// case PRECISION(kFloat):
// ConcatFunc<float>(inputs, axis, out);
// break;
// case PRECISION(kInt32):
// ConcatFunc<int32_t>(inputs, axis, out);
// break;
// case PRECISION(kInt64):
// ConcatFunc<int64_t>(inputs, axis, out);
// break;
// default:
// LOG(FATAL) << "Concat does not implement for the "
// << "input type:"
// << static_cast<int>(inputs.front()->precision());
// }
}
}
// namespace arm
...
...
lite/kernels/fpga/CMakeLists.txt
浏览文件 @
a59d6fab
...
...
@@ -17,6 +17,8 @@ add_kernel(conv_compute_fpga FPGA basic SRCS conv_compute.cc DEPS ${fpga_deps})
add_kernel
(
dropout_compute_fpga FPGA basic SRCS dropout_compute.cc DEPS
${
fpga_deps
}
)
add_kernel
(
elementwise_compute_fpga FPGA basic SRCS elementwise_compute.cc DEPS
${
fpga_deps
}
)
add_kernel
(
interpolate_compute_fpga FPGA basic SRCS interpolate_compute.cc DEPS
${
fpga_deps
}
)
add_kernel
(
fc_compute_fpga FPGA basic SRCS fc_compute.cc DEPS
${
fpga_deps
}
)
add_kernel
(
gru_compute_fpga FPGA extra SRCS gru_compute.cc DEPS
${
fpga_deps
}
)
...
...
lite/kernels/fpga/calib_compute.cc
浏览文件 @
a59d6fab
...
...
@@ -44,6 +44,17 @@ void CalibComputeFP16ToFp32::Run() {
return
;
}
// Converts the float input tensor into the int output tensor element by
// element and forwards the LoD information.
//
// The previous body fetched the input/output pointers but never wrote any
// element, leaving the freshly allocated int buffer uninitialized.
void CalibComputeFloat2Int::Run() {
  auto& param = this->Param<operators::CalibParam>();
  const auto* din = param.input->data<float>();
  auto* dout = param.output->mutable_data<int>();

  // Truncating conversion, one value per input element.
  // NOTE(review): assumes the output dims already match the input dims
  // (set during shape inference) — confirm against the calib op.
  const int64_t count = param.input->dims().production();
  for (int64_t i = 0; i < count; ++i) {
    dout[i] = static_cast<int>(din[i]);
  }

  auto out_lod = param.output->mutable_lod();
  *out_lod = param.input->lod();
}
}
// namespace fpga
}
// namespace kernels
}
// namespace lite
...
...
@@ -65,12 +76,28 @@ REGISTER_LITE_KERNEL(calib,
DATALAYOUT
(
kNHWC
))})
.
Finalize
();
REGISTER_LITE_KERNEL
(
calib
,
kFPGA
,
kFP16
,
kNHWC
,
paddle
::
lite
::
kernels
::
fpga
::
CalibComputeFloat2Int
,
float_2_int_fpga
)
.
BindInput
(
"Input"
,
{
LiteType
::
GetTensorTy
(
TARGET
(
kARM
),
PRECISION
(
kFloat
),
DATALAYOUT
(
kNCHW
))})
.
BindOutput
(
"Out"
,
{
LiteType
::
GetTensorTy
(
TARGET
(
kARM
),
PRECISION
(
kInt32
),
DATALAYOUT
(
kNCHW
))})
.
Finalize
();
REGISTER_LITE_KERNEL
(
calib
,
kFPGA
,
kFP16
,
kNHWC
,
paddle
::
lite
::
kernels
::
fpga
::
CalibComputeFP16ToFp32
,
f
p16_to_fp32
_fpga
)
f
loat_to_int
_fpga
)
.
BindInput
(
"Input"
,
{
LiteType
::
GetTensorTy
(
TARGET
(
kFPGA
),
PRECISION
(
kFP16
),
...
...
lite/kernels/fpga/calib_compute.h
浏览文件 @
a59d6fab
...
...
@@ -45,6 +45,18 @@ class CalibComputeFP16ToFp32
private:
};
// Calib kernel declared for the FPGA / FP16 / NHWC kernel slot. Its Run()
// (defined out of line) reads the input as float and allocates the output
// as int.
class CalibComputeFloat2Int
    : public KernelLite<TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC)> {
 public:
  using param_t = operators::CalibParam;

  void Run() override;

  ~CalibComputeFloat2Int() override {}
};
}
// namespace fpga
}
// namespace kernels
}
// namespace lite
...
...
lite/kernels/fpga/concat_compute.cc
浏览文件 @
a59d6fab
...
...
@@ -47,7 +47,8 @@ void ConcatCompute::Run() {
pe_
.
dispatch
();
#ifdef FPGA_PRINT_TENSOR
zynqmp
::
ConcatParam
&
concat_param
=
pe_
.
param
();
Debugger
::
get_instance
().
registerOutput
(
"concat"
,
concat_param
.
output
);
concat_param
.
output
->
flush
();
// Debugger::get_instance().registerOutput("concat", concat_param.output);
#endif
}
...
...
lite/kernels/fpga/conv_compute.cc
浏览文件 @
a59d6fab
...
...
@@ -51,6 +51,11 @@ void ConvCompute::PrepareForRun() {
conv_param
.
activeParam
.
type
=
zynqmp
::
TYPE_RELU
;
}
if
(
param
.
activation_param
.
Leaky_relu_alpha
>
0.001
)
{
conv_param
.
activeParam
.
type
=
zynqmp
::
TYPE_LEAKY_RELU
;
conv_param
.
activeParam
.
leaky_relu_factor
=
param
.
activation_param
.
Leaky_relu_alpha
;
}
dw_conv_pe_
.
init
();
dw_conv_pe_
.
apply
();
}
else
{
...
...
@@ -72,9 +77,15 @@ void ConvCompute::PrepareForRun() {
conv_param
.
activeParam
.
type
=
zynqmp
::
TYPE_RELU
;
}
if
(
param
.
activation_param
.
Leaky_relu_alpha
>
0.001
)
{
conv_param
.
activeParam
.
type
=
zynqmp
::
TYPE_LEAKY_RELU
;
conv_param
.
activeParam
.
leaky_relu_factor
=
param
.
activation_param
.
Leaky_relu_alpha
;
}
conv_pe_
.
init
();
conv_pe_
.
apply
();
}
// std::cout << "Leaky_relu_alpha:" << param.activation_param.Leaky_relu_alpha << std::endl;
}
void
ConvCompute
::
Run
()
{
...
...
lite/kernels/fpga/elementwise_compute.cc
浏览文件 @
a59d6fab
...
...
@@ -88,13 +88,33 @@ void ElementwiseMulCompute::PrepareForRun() {
scale_
.
mutableData
<
zynqmp
::
float16
>
(
zynqmp
::
FP16
,
shape
);
zynqmp
::
float16
*
bias_data
=
bias_
.
mutableData
<
zynqmp
::
float16
>
(
zynqmp
::
FP16
,
shape
);
float
scale_value
=
param
.
Y
->
data
<
float
>
()[
0
];
zynqmp
::
float16
scale_value
=
0
;
if
(
param
.
Y
->
ZynqTensor
()
->
dataType
()
==
zynqmp
::
FP32
)
{
scale_value
=
zynqmp
::
float_to_half
(
param
.
Y
->
data
<
float
>
()[
0
]);
// std::cout << "FP32 \n";
}
else
{
scale_value
=
param
.
Y
->
data
<
zynqmp
::
float16
>
()[
0
];
// std::cout << "FP16 \n";
}
// std::cout << "channel:" << channel << std::endl;
// std::cout << "production:" << param.Y->dims().production() << std::endl;
// std::cout << "scale_value:" << std::to_string(zynqmp::half_to_float(scale_value)) << std::endl;
// exit(-1);
for
(
int
i
=
0
;
i
<
channel
;
i
++
)
{
if
(
param
.
Y
->
dims
().
production
()
!=
1
)
{
scale_value
=
param
.
Y
->
ZynqTensor
()
->
data
<
float
>
()[
i
];
// scale_value = param.Y->ZynqTensor()->data<zynqmp::float16>()[i];
if
(
param
.
Y
->
ZynqTensor
()
->
dataType
()
==
zynqmp
::
FP32
)
{
scale_value
=
zynqmp
::
float_to_half
(
param
.
Y
->
data
<
float
>
()[
i
]);
}
else
{
scale_value
=
param
.
Y
->
data
<
zynqmp
::
float16
>
()[
i
];
}
}
scale_data
[
i
]
=
zynqmp
::
float_to_half
(
scale_value
);
// std::cout << "scale_value:" << std::to_string(zynqmp::half_to_float(scale_value)) << std::endl;
// exit(-1);
scale_data
[
i
]
=
scale_value
;
bias_data
[
i
]
=
zero_
;
}
...
...
@@ -104,15 +124,17 @@ void ElementwiseMulCompute::PrepareForRun() {
void
ElementwiseMulCompute
::
Run
()
{
auto
&
param
=
Param
<
operators
::
ElementwiseParam
>
();
// std::cout << "param.Y :" << param.Y->persistable() << std::endl;
if
(
!
param
.
Y
->
persistable
())
{
// TODO
scale_
.
copyFrom
(
param
.
Y
->
ZynqTensor
());
scale_
.
invalidate
();
scale_
.
flush
();
//TODO
}
pe_
.
dispatch
();
#ifdef FPGA_PRINT_TENSOR
zynqmp
::
ScaleParam
&
scale_param
=
pe_
.
param
();
Debugger
::
get_instance
().
registerOutput
(
"ew_mul_in"
,
scale_param
.
input
);
Debugger
::
get_instance
().
registerOutput
(
"ew_mul"
,
scale_param
.
output
);
//
Debugger::get_instance().registerOutput("ew_mul_in", scale_param.input);
//
Debugger::get_instance().registerOutput("ew_mul", scale_param.output);
#endif
}
...
...
@@ -181,3 +203,21 @@ REGISTER_LITE_KERNEL(elementwise_mul,
PRECISION
(
kFP16
),
DATALAYOUT
(
kNHWC
))})
.
Finalize
();
// Additional elementwise_mul registration (alias "ew_mul_y_arm"): X and Out
// are FPGA / FP16 / NHWC tensors, while Y is bound to an ARM-target tensor.
REGISTER_LITE_KERNEL(elementwise_mul,
                     kFPGA,
                     kFP16,
                     kNHWC,
                     paddle::lite::kernels::fpga::ElementwiseMulCompute,
                     ew_mul_y_arm)
    .BindInput("X",
               {LiteType::GetTensorTy(TARGET(kFPGA),
                                      PRECISION(kFP16),
                                      DATALAYOUT(kNHWC))})
    .BindInput("Y", {LiteType::GetTensorTy(TARGET(kARM))})
    .BindOutput("Out",
                {LiteType::GetTensorTy(TARGET(kFPGA),
                                       PRECISION(kFP16),
                                       DATALAYOUT(kNHWC))})
    .Finalize();
\ No newline at end of file
lite/kernels/fpga/fetch_compute.cc
浏览文件 @
a59d6fab
...
...
@@ -64,18 +64,18 @@ void FetchCompute::Run() {
}
// namespace lite
}
// namespace paddle
REGISTER_LITE_KERNEL
(
fetch
,
kFPGA
,
kFP16
,
kNHWC
,
paddle
::
lite
::
kernels
::
fpga
::
FetchCompute
,
fpga_host
)
.
BindInput
(
"X"
,
{
LiteType
::
GetTensorTy
(
TARGET
(
kFPGA
),
PRECISION
(
kAny
),
DATALAYOUT
(
kAny
))})
.
BindOutput
(
"Out"
,
{
LiteType
::
GetTensorTy
(
TARGET
(
kHost
))})
.
Finalize
();
//
REGISTER_LITE_KERNEL(fetch,
//
kFPGA,
//
kFP16,
//
kNHWC,
//
paddle::lite::kernels::fpga::FetchCompute,
//
fpga_host)
//
.BindInput("X",
//
{LiteType::GetTensorTy(TARGET(kFPGA),
// PRECISION(kFP16
),
// DATALAYOUT(kNHWC
))})
//
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kHost))})
//
.Finalize();
REGISTER_LITE_KERNEL
(
fetch
,
kFPGA
,
...
...
lite/kernels/fpga/interpolate_compute.cc
0 → 100644
浏览文件 @
a59d6fab
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/fpga/interpolate_compute.h"
#include <string>
#include <vector>
#include "lite/core/op_registry.h"
#include "lite/core/tensor.h"
namespace
paddle
{
namespace
lite
{
namespace
kernels
{
namespace
fpga
{
using
float16
=
zynqmp
::
float16
;
// Bilinear interpolation entry point.
//
// Currently a no-op: the only content of the body is a disabled call into
// lite::arm::math::interpolate, so the output tensor is never written here.
void BilinearInterpCompute::Run() {
  // Disabled implementation, kept for reference:
  //
  //   auto& param = Param<operators::InterpolateParam>();
  //   lite::Tensor* X = param.X;
  //   lite::Tensor* OutSize = param.OutSize;
  //   auto SizeTensor = param.SizeTensor;
  //   auto Scale = param.Scale;
  //   lite::Tensor* Out = param.Out;
  //   float scale = param.scale;
  //   int out_w = param.out_w;
  //   int out_h = param.out_h;
  //   bool align_corners = param.align_corners;
  //   std::string interp_method = "Bilinear";
  //   lite::arm::math::interpolate(X,
  //                                OutSize,
  //                                SizeTensor,
  //                                Scale,
  //                                Out,
  //                                out_h,
  //                                out_w,
  //                                scale,
  //                                align_corners,
  //                                interp_method);
}
// Nearest-neighbour resize of a single image stored as [h][w][c]
// (c consecutive float16 values per pixel).
//
// src        : input image, h_in * w_in * c elements.
// w_in, h_in : input width / height in pixels.
// c          : number of values copied per pixel.
// dst        : output image, h_out * w_out * c elements.
// w_out, h_out : output width / height in pixels.
// scale_x, scale_y : unused; the sampling ratios are recomputed from the
//                    input/output extents below. Kept for the signature.
// with_align : when true, corner pixels are aligned ((in-1)/(out-1) ratio)
//              and the source index is rounded; otherwise the ratio is
//              in/out and the source index is truncated.
void nearest_interp(const float16* src,
                    int w_in,
                    int h_in,
                    int c,
                    float16* dst,
                    int w_out,
                    int h_out,
                    float scale_x,
                    float scale_y,
                    bool with_align) {
  // With aligned corners an output extent of 1 would divide by zero; use a
  // zero ratio so that the single output pixel samples input index 0.
  float scale_w_new = 0.f;
  float scale_h_new = 0.f;
  if (with_align) {
    if (w_out > 1) scale_w_new = static_cast<float>(w_in - 1) / (w_out - 1);
    if (h_out > 1) scale_h_new = static_cast<float>(h_in - 1) / (h_out - 1);
  } else {
    scale_w_new = static_cast<float>(w_in) / (w_out);
    scale_h_new = static_cast<float>(h_in) / (h_out);
  }

  // Both modes share the same copy loop; only the rounding offset differs.
  const double rounding = with_align ? 0.5 : 0.0;
  const size_t pixel_bytes = c * sizeof(float16);

  for (int h = 0; h < h_out; ++h) {
    // Each output row holds w_out pixels of c values. The non-aligned path
    // previously omitted the "* c" factor, making rows overlap for c > 1.
    float16* dst_p = dst + h * w_out * c;
    const int near_y = static_cast<int>(scale_h_new * h + rounding);
    for (int w = 0; w < w_out; ++w) {
      const int near_x = static_cast<int>(scale_w_new * w + rounding);
      const float16* src_n = src + (near_y * w_in + near_x) * c;
      memcpy(dst_p, src_n, pixel_bytes);
      dst_p += c;
    }
  }
}
void
NearestInterpCompute
::
PrepareForRun
()
{
auto
&
param
=
Param
<
operators
::
InterpolateParam
>
();
lite
::
Tensor
*
X
=
param
.
X
;
lite
::
Tensor
*
OutSize
=
param
.
OutSize
;
lite
::
Tensor
*
Out
=
param
.
Out
;
Out
->
mutable_data
<
float16
>
();
zynqmp
::
ResizeParam
&
norm_param
=
pe_
.
param
();
norm_param
.
input
=
X
->
ZynqTensor
();
norm_param
.
output
=
Out
->
ZynqTensor
();
pe_
.
init
();
pe_
.
apply
();
}
// TODO (carried over from the original; no detail was given).
//
// Builds a shape vector by taking the first int32 value stored in each
// tensor of list_new_shape_tensor, in order.
inline std::vector<int> get_new_shape(
    std::vector<const lite::Tensor*> list_new_shape_tensor) {
  std::vector<int> vec_new_shape;
  for (const lite::Tensor* shape_tensor : list_new_shape_tensor) {
    const int32_t* values = shape_tensor->data<int32_t>();
    vec_new_shape.push_back(static_cast<int32_t>(values[0]));
  }
  return vec_new_shape;
}
// Copies every element of new_data_tensor (dims().production() values of
// type T) into a std::vector<T>.
//
// The unused temporary lite::Tensor that used to be constructed on every
// call has been removed, and the vector is now built directly from the
// element range instead of being default-constructed and reassigned.
template <typename T>
inline std::vector<T> get_new_data_from_tensor(const Tensor* new_data_tensor) {
  const auto* new_data = new_data_tensor->data<T>();
  return std::vector<T>(new_data,
                        new_data + new_data_tensor->dims().production());
}
// Resizes X into Out with nearest-neighbour sampling, one image per batch
// entry.
//
// Output size selection, as implemented below:
//   * if SizeTensor is non-empty, its first two entries give height/width;
//   * otherwise `scale` (overridden by Scale[0] when Scale is non-null)
//     derives the size from the input when positive, and OutSize, when
//     non-null, overrides it with its first two values.
//
// X->dims() is read as {num, channels, height, width}; Out is resized to
// the same num/channels with the selected height/width.
// interpolate_type is accepted but not consulted.
void interpolate(lite::Tensor* X,
                 lite::Tensor* OutSize,
                 std::vector<const lite::Tensor*> SizeTensor,
                 lite::Tensor* Scale,
                 lite::Tensor* Out,
                 int out_height,
                 int out_width,
                 float scale,
                 bool with_align,
                 std::string interpolate_type) {
  int in_h = X->dims()[2];
  int in_w = X->dims()[3];

  if (SizeTensor.size() > 0) {
    auto new_size = get_new_shape(SizeTensor);
    out_height = new_size[0];
    out_width = new_size[1];
  } else {
    auto scale_tensor = Scale;
    if (scale_tensor != nullptr) {
      auto scale_data = get_new_data_from_tensor<float>(scale_tensor);
      scale = scale_data[0];
    }
    if (scale > 0) {
      out_height = static_cast<int>(in_h * scale);
      out_width = static_cast<int>(in_w * scale);
    }
    auto out_size = OutSize;
    if (out_size != nullptr) {
      auto out_size_data = get_new_data_from_tensor<int>(out_size);
      out_height = out_size_data[0];
      out_width = out_size_data[1];
    }
  }

  float height_scale = scale;
  float width_scale = scale;
  if (out_width > 0 && out_height > 0) {
    // Convert before dividing: the previous integer division truncated the
    // ratio (e.g. to 0 whenever the output was smaller than the input).
    height_scale = static_cast<float>(out_height) / in_h;
    width_scale = static_cast<float>(out_width) / in_w;
  }

  int num_cout = X->dims()[0];
  int c_cout = X->dims()[1];
  Out->Resize({num_cout, c_cout, out_height, out_width});

  float16* dout = Out->mutable_data<float16>();
  const float16* din = X->data<float16>();

  int out_num = Out->dims()[0];
  int out_c = Out->dims()[1];
  int count = out_num;
  int out_h = Out->dims()[2];
  int out_w = Out->dims()[3];

  // nearest_interp addresses h * w * c elements per image, so the stride
  // between batch entries must include the channel count. The previous
  // stride (h * w only) made images overlap for batch > 1 and c > 1.
  int image_in = in_h * in_w * out_c;
  int image_out = out_h * out_w * out_c;

  for (int i = 0; i < count; ++i) {
    nearest_interp(din + image_in * i,
                   in_w,
                   in_h,
                   out_c,
                   dout + image_out * i,
                   out_w,
                   out_h,
                   1.f / width_scale,
                   1.f / height_scale,
                   with_align);
  }
}
// Runs nearest-neighbour interpolation on the CPU path: invalidates the
// input's cache, calls interpolate(), then flushes the output and copies the
// input's scale onto it.
//
// The tensor dumps ("n_in" / "n_out") used to be written on every call; they
// are debugging output and are now compiled in only when FPGA_PRINT_TENSOR
// is defined, matching the guard used around debug output in the other FPGA
// kernels.
// NOTE(review): assumes saveToFile has no side effect the normal path
// depends on — confirm against zynqmp::Tensor.
void NearestInterpCompute::Run() {
  auto& param = Param<operators::InterpolateParam>();
  lite::Tensor* X = param.X;
  lite::Tensor* OutSize = param.OutSize;
  auto SizeTensor = param.SizeTensor;
  auto Scale = param.Scale;
  lite::Tensor* Out = param.Out;
  float scale = param.scale;
  int out_w = param.out_w;
  int out_h = param.out_h;
  bool align_corners = param.align_corners;
  std::string interp_method = "";

  X->ZynqTensor()->invalidate();  // TODO (carried over from the original)
#ifdef FPGA_PRINT_TENSOR
  X->ZynqTensor()->saveToFile("n_in", true);
#endif

  interpolate(X,
              OutSize,
              SizeTensor,
              Scale,
              Out,
              out_h,
              out_w,
              scale,
              align_corners,
              interp_method);

  Out->ZynqTensor()->flush();
  Out->ZynqTensor()->copyScaleFrom(X->ZynqTensor());
#ifdef FPGA_PRINT_TENSOR
  Out->ZynqTensor()->saveToFile("n_out", true);
#endif
}
}
/* namespace fpga */
}
/* namespace kernels */
}
/* namespace lite */
}
/* namespace paddle */
// Registers BilinearInterpCompute as the FPGA / FP16 / NHWC kernel for the
// bilinear_interp op. X and Out are FPGA FP16 NHWC tensors; the optional
// OutSize / SizeTensor (int32) and Scale inputs are bound to ARM tensors.
REGISTER_LITE_KERNEL(bilinear_interp,
                     kFPGA,
                     kFP16,
                     kNHWC,
                     paddle::lite::kernels::fpga::BilinearInterpCompute,
                     def)
    .BindInput("X",
               {LiteType::GetTensorTy(TARGET(kFPGA),
                                      PRECISION(kFP16),
                                      DATALAYOUT(kNHWC))})
    .BindInput("OutSize",
               {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))})
    .BindInput("SizeTensor",
               {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))})
    .BindInput("Scale", {LiteType::GetTensorTy(TARGET(kARM))})
    .BindOutput("Out",
                {LiteType::GetTensorTy(TARGET(kFPGA),
                                       PRECISION(kFP16),
                                       DATALAYOUT(kNHWC))})
    .Finalize();
// Registers NearestInterpCompute as the FPGA / FP16 / NHWC kernel for the
// nearest_interp op. X and Out are FPGA FP16 NHWC tensors; the optional
// OutSize / SizeTensor (int32) and Scale inputs are bound to ARM tensors.
REGISTER_LITE_KERNEL(nearest_interp,
                     kFPGA,
                     kFP16,
                     kNHWC,
                     paddle::lite::kernels::fpga::NearestInterpCompute,
                     def)
    .BindInput("X",
               {LiteType::GetTensorTy(TARGET(kFPGA),
                                      PRECISION(kFP16),
                                      DATALAYOUT(kNHWC))})
    .BindInput("OutSize",
               {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))})
    .BindInput("SizeTensor",
               {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))})
    .BindInput("Scale", {LiteType::GetTensorTy(TARGET(kARM))})
    .BindOutput("Out",
                {LiteType::GetTensorTy(TARGET(kFPGA),
                                       PRECISION(kFP16),
                                       DATALAYOUT(kNHWC))})
    .Finalize();
lite/kernels/fpga/interpolate_compute.h
0 → 100644
浏览文件 @
a59d6fab
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include "lite/core/kernel.h"
#include "lite/core/op_registry.h"
#include "lite/backends/fpga/KD/pes/resize_pe.hpp"
namespace
paddle
{
namespace
lite
{
namespace
kernels
{
namespace
fpga
{
// FPGA (FP16, NHWC) kernel class for bilinear interpolation. Only Run() is
// overridden; it is defined out of line.
class BilinearInterpCompute
    : public KernelLite<TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC)> {
 public:
  void Run() override;

  virtual ~BilinearInterpCompute() = default;
};
// FPGA (FP16, NHWC) kernel class for nearest-neighbour interpolation.
// Owns a zynqmp::ResizePE processing element as pe_.
class NearestInterpCompute
    : public KernelLite<TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC)> {
 public:
  void PrepareForRun() override;
  void Run() override;

  virtual ~NearestInterpCompute() = default;

 private:
  zynqmp::ResizePE pe_;
};
}
/* namespace fpga */
}
/* namespace kernels */
}
/* namespace lite */
}
/* namespace paddle */
lite/kernels/fpga/io_copy_compute.cc
浏览文件 @
a59d6fab
...
...
@@ -25,10 +25,17 @@ namespace fpga {
using
float16
=
zynqmp
::
float16
;
// Propagates tensor metadata from the io_copy input (param.x) to its output
// (param.y): the persistable flag, the LoD, and the Zynq tensor scale.
void copy_properties(operators::IoCopyParam& param) {
  auto* source = param.x;
  auto* target = param.y;

  target->set_persistable(source->persistable());

  auto target_lod = target->mutable_lod();
  *target_lod = source->lod();

  target->ZynqTensor()->copyScaleFrom(source->ZynqTensor());
}
/*
* This kernel copies a tensor from host to FPGA space.
*/
class
IoCopyHost
ToFpga
Compute
class
IoCopyHost
CHWToFpgaHWC
Compute
:
public
KernelLite
<
TARGET
(
kFPGA
),
PRECISION
(
kAny
),
DATALAYOUT
(
kAny
)
>
{
public:
void
Run
()
override
{
...
...
@@ -37,52 +44,33 @@ class IoCopyHostToFpgaCompute
param
.
x
->
target
()
==
TARGET
(
kFPGA
));
param
.
x
->
ZynqTensor
()
->
flush
();
if
(
param
.
x
->
ZynqTensor
()
->
dataType
()
==
zynqmp
::
INT32
)
{
param
.
y
->
mutable_data
<
int
>
();
param
.
y
->
ZynqTensor
()
->
copyFrom
(
param
.
x
->
ZynqTensor
());
param
.
y
->
ZynqTensor
()
->
flush
();
copy_properties
(
param
);
return
;
}
if
(
param
.
x
->
ZynqTensor
()
->
dataType
()
==
zynqmp
::
FP32
)
{
param
.
y
->
mutable_data
<
float16
>
();
if
(
param
.
x
->
ZynqTensor
()
->
aligned
()
&&
param
.
x
->
ZynqTensor
()
->
shape
().
shouldAlign
())
{
zynqmp
::
Tensor
tempTensor
;
tempTensor
.
mutableData
<
float16
>
(
zynqmp
::
FP16
,
param
.
x
->
ZynqTensor
()
->
shape
());
tempTensor
.
copyFrom
(
param
.
x
->
ZynqTensor
());
tempTensor
.
setAligned
(
true
);
tempTensor
.
unalignImage
();
param
.
y
->
ZynqTensor
()
->
copyFrom
(
&
tempTensor
);
}
else
{
param
.
y
->
ZynqTensor
()
->
copyFrom
(
param
.
x
->
ZynqTensor
());
}
param
.
y
->
ZynqTensor
()
->
invalidate
();
param
.
y
->
ZynqTensor
()
->
copyScaleFrom
(
param
.
x
->
ZynqTensor
());
param
.
y
->
mutable_data
<
float16
>
();
param
.
y
->
ZynqTensor
()
->
setDataLocation
(
zynqmp
::
Device
);
if
(
param
.
x
->
ZynqTensor
()
->
aligned
()
&&
param
.
x
->
ZynqTensor
()
->
shape
().
shouldAlign
())
{
zynqmp
::
Tensor
tempTensor
;
tempTensor
.
mutableData
<
float16
>
(
zynqmp
::
FP16
,
param
.
x
->
ZynqTensor
()
->
shape
());
tempTensor
.
copyFrom
(
param
.
x
->
ZynqTensor
());
tempTensor
.
setAligned
(
true
);
tempTensor
.
unalignImage
();
tempTensor
.
flush
();
param
.
y
->
ZynqTensor
()
->
copyFrom
(
&
tempTensor
);
}
else
{
param
.
y
->
ZynqTensor
()
->
copyFrom
(
param
.
x
->
ZynqTensor
());
}
auto
out_lod
=
param
.
y
->
mutable_lod
();
*
out_lod
=
param
.
x
->
lod
();
}
std
::
unique_ptr
<
type_infer_handler_t
>
GetTypeInferHandler
()
override
{
std
::
unique_ptr
<
type_infer_handler_t
>
res
(
new
type_infer_handler_t
);
*
res
=
[](
const
std
::
map
<
std
::
string
,
const
Type
*>&
inputs
,
const
std
::
string
&
out
)
->
const
Type
*
{
CHECK
(
!
inputs
.
empty
());
auto
*
type
=
inputs
.
at
(
"Input"
);
CHECK
(
type
->
target
()
==
TARGET
(
kHost
));
auto
out_place
=
type
->
place
();
out_place
.
target
=
TARGET
(
kFPGA
);
auto
*
out_type
=
Type
::
Get
(
type
->
id
(),
out_place
.
target
,
out_place
.
precision
,
out_place
.
layout
,
out_place
.
device
);
return
out_type
;
};
return
res
;
copy_properties
(
param
);
param
.
y
->
ZynqTensor
()
->
invalidate
();
}
std
::
string
doc
()
const
override
{
return
"Copy IO from HOST to FPGA"
;
}
...
...
@@ -98,10 +86,11 @@ class IoCopyFpgaToHostCompute
auto
&
param
=
Param
<
operators
::
IoCopyParam
>
();
CHECK
(
param
.
x
->
target
()
==
TARGET
(
kHost
)
||
param
.
x
->
target
()
==
TARGET
(
kFPGA
));
param
.
x
->
ZynqTensor
()
->
syncToDevice
();
param
.
y
->
mutable_data
<
float
>
();
param
.
y
->
ZynqTensor
()
->
setDataType
(
zynqmp
::
FP32
);
param
.
x
->
ZynqTensor
()
->
syncToDevice
(
);
param
.
y
->
ZynqTensor
()
->
setDataLocation
(
zynqmp
::
CPU
);
if
(
param
.
x
->
ZynqTensor
()
->
aligned
()
&&
param
.
x
->
ZynqTensor
()
->
shape
().
shouldAlign
())
{
...
...
@@ -115,10 +104,9 @@ class IoCopyFpgaToHostCompute
}
else
{
param
.
y
->
ZynqTensor
()
->
copyFrom
(
param
.
x
->
ZynqTensor
());
}
param
.
y
->
ZynqTensor
()
->
copyScaleFrom
(
param
.
x
->
ZynqTensor
());
param
.
y
->
ZynqTensor
()
->
flush
();
auto
out_lod
=
param
.
y
->
mutable_lod
();
*
out_lod
=
param
.
x
->
lod
();
param
.
y
->
ZynqTensor
()
->
invalidate
();
copy_properties
(
param
);
}
std
::
string
doc
()
const
override
{
return
"Copy IO from FPGA to HOST"
;
}
};
...
...
@@ -153,14 +141,16 @@ class IoCopyFpgaToHostCHWCompute
CHECK
(
param
.
x
->
target
()
==
TARGET
(
kHost
)
||
param
.
x
->
target
()
==
TARGET
(
kFPGA
));
Tensor
hwc
;
Tensor
hwc
;
hwc
.
Resize
(
param
.
y
->
dims
());
float
*
hwc_data
=
hwc
.
mutable_data
<
float
>
();
float
*
chw_data
=
param
.
y
->
mutable_data
<
float
>
();
param
.
y
->
ZynqTensor
()
->
setDataType
(
zynqmp
::
FP32
);
param
.
x
->
ZynqTensor
()
->
syncToDevice
();
hwc
.
ZynqTensor
()
->
setDataLocation
(
zynqmp
::
CPU
);
param
.
y
->
ZynqTensor
()
->
setDataLocation
(
zynqmp
::
CPU
);
if
(
param
.
x
->
ZynqTensor
()
->
aligned
()
&&
param
.
x
->
ZynqTensor
()
->
shape
().
shouldAlign
())
{
zynqmp
::
Tensor
tempTensor
;
...
...
@@ -168,10 +158,30 @@ class IoCopyFpgaToHostCHWCompute
param
.
x
->
ZynqTensor
()
->
shape
());
tempTensor
.
copyFrom
(
param
.
x
->
ZynqTensor
());
tempTensor
.
setAligned
(
true
);
// tempTensor.saveToFile("temp_1", true);
tempTensor
.
unalignImage
();
// tempTensor.saveToFile("temp_2", true);
hwc
.
ZynqTensor
()
->
copyFrom
(
&
tempTensor
);
}
else
{
hwc
.
ZynqTensor
()
->
copyFrom
(
param
.
x
->
ZynqTensor
());
// hwc.ZynqTensor()->copyFrom(param.x->ZynqTensor());
float16
*
in_data
=
param
.
x
->
ZynqTensor
()
->
data
<
float16
>
();
// float* f_data =
param
.
x
->
ZynqTensor
()
->
flush
();
float
max
=
0
;
for
(
int
i
=
0
;
i
<
param
.
x
->
dims
().
production
();
i
++
)
{
float
value
=
zynqmp
::
half_to_float
(
in_data
[
i
]);
hwc_data
[
i
]
=
value
;
if
(
value
<
0
)
{
value
=
-
value
;
}
if
(
value
>
max
)
{
max
=
value
;
}
}
param
.
x
->
ZynqTensor
()
->
scale
()[
0
]
=
max
/
127
;
param
.
x
->
ZynqTensor
()
->
scale
()[
1
]
=
127
/
max
;
}
int
num
=
1
;
...
...
@@ -188,10 +198,15 @@ class IoCopyFpgaToHostCHWCompute
dims
.
height
(),
dims
.
width
());
param
.
y
->
ZynqTensor
()
->
copyScaleFrom
(
param
.
x
->
ZynqTensor
());
//
param.y->ZynqTensor()->copyScaleFrom(param.x->ZynqTensor());
param
.
y
->
ZynqTensor
()
->
flush
();
auto
out_lod
=
param
.
y
->
mutable_lod
();
*
out_lod
=
param
.
x
->
lod
();
copy_properties
(
param
);
param
.
x
->
ZynqTensor
()
->
invalidate
();
param
.
x
->
ZynqTensor
()
->
flush
();
// hwc.ZynqTensor()->saveToFile("hwc", true);
// param.x->ZynqTensor()->saveToFile("io2_x", true);
// param.y->ZynqTensor()->saveToFile("io2_y", true);
}
std
::
string
doc
()
const
override
{
return
"Copy IO from FPGA to HOST"
;
}
};
...
...
@@ -201,52 +216,36 @@ class IoCopyFpgaToHostCHWCompute
}
// namespace lite
}
// namespace paddle
// REGISTER_LITE_KERNEL(io_copy,
// kFPGA,
// kAny,
// kAny,
// paddle::lite::kernels::fpga::IoCopyHostToFpgaCompute,
// host_to_device)
// .BindInput("Input",
// {LiteType::GetTensorTy(TARGET(kHost),
// PRECISION(kAny),
// DATALAYOUT(kAny))})
// .BindOutput("Out",
// {LiteType::GetTensorTy(TARGET(kFPGA),
// PRECISION(kAny),
// DATALAYOUT(kAny))})
// .Finalize();
REGISTER_LITE_KERNEL
(
io_copy
,
kFPGA
,
kAny
,
kAny
,
paddle
::
lite
::
kernels
::
fpga
::
IoCopyHost
ToFpga
Compute
,
host_to_device
_any_any
)
paddle
::
lite
::
kernels
::
fpga
::
IoCopyHost
CHWToFpgaHWC
Compute
,
host_to_device
)
.
BindInput
(
"Input"
,
{
LiteType
::
GetTensorTy
(
TARGET
(
kHost
),
PRECISION
(
kAny
),
DATALAYOUT
(
kAny
),
-
1
)})
{
LiteType
::
GetTensorTy
(
TARGET
(
kHost
),
PRECISION
(
kInt32
),
DATALAYOUT
(
kAny
))})
.
BindOutput
(
"Out"
,
{
LiteType
::
GetTensorTy
(
TARGET
(
kFPGA
),
PRECISION
(
kAny
),
DATALAYOUT
(
kAny
))})
.
Finalize
();
REGISTER_LITE_KERNEL
(
io_copy
,
kFPGA
,
kAny
,
kAny
,
paddle
::
lite
::
kernels
::
fpga
::
IoCopyHostCHWToFpgaHWCCompute
,
host_float_chw_to_device_fp16_hwc
)
.
BindInput
(
"Input"
,
{
LiteType
::
GetTensorTy
(
TARGET
(
kHost
),
PRECISION
(
kFloat
),
DATALAYOUT
(
kNCHW
))})
.
BindOutput
(
"Out"
,
{
LiteType
::
GetTensorTy
(
TARGET
(
kFPGA
),
PRECISION
(
kFP16
),
DATALAYOUT
(
kNHWC
))})
.
Finalize
();
// REGISTER_LITE_KERNEL(io_copy,
// kFPGA,
// kAny,
// kAny,
// paddle::lite::kernels::fpga::IoCopyFpgaToHostCompute,
// device_to_host)
// .BindInput("Input",
// {LiteType::GetTensorTy(TARGET(kFPGA),
// PRECISION(kFP16),
// DATALAYOUT(kNHWC))})
// .BindOutput("Out",
// {LiteType::GetTensorTy(TARGET(kHost),
// PRECISION(kFloat),
// DATALAYOUT(kNHWC))})
// .Finalize();
REGISTER_LITE_KERNEL
(
io_copy
,
kFPGA
,
...
...
@@ -311,3 +310,26 @@ REGISTER_LITE_KERNEL(io_copy,
// PRECISION(kAny),
// DATALAYOUT(kAny))})
// .Finalize();
// ==========================================================
// std::unique_ptr<type_infer_handler_t> GetTypeInferHandler() override {
// std::unique_ptr<type_infer_handler_t> res(new type_infer_handler_t);
// *res = [](const std::map<std::string, const Type*>& inputs,
// const std::string& out) -> const Type* {
// CHECK(!inputs.empty());
// auto* type = inputs.at("Input");
// CHECK(type->target() == TARGET(kHost));
// auto out_place = type->place();
// out_place.target = TARGET(kFPGA);
// auto* out_type = Type::Get(type->id(),
// out_place.target,
// out_place.precision,
// out_place.layout,
// out_place.device);
// return out_type;
// };
// return res;
// }
\ No newline at end of file
lite/kernels/fpga/multiclass_nms_compute.cc
浏览文件 @
a59d6fab
...
...
@@ -94,6 +94,7 @@ T PolyIoU(const T* box1,
const
size_t
box_size
,
const
bool
normalized
)
{
LOG
(
FATAL
)
<<
"PolyIoU not implement."
;
return
*
box1
;
}
template
<
class
T
>
...
...
@@ -128,34 +129,44 @@ void NMSFast(const Tensor& bbox,
std
::
vector
<
int
>*
selected_indices
,
const
bool
normalized
)
{
// The total boxes for each instance.
// std::cout << "1\n";
int64_t
num_boxes
=
bbox
.
dims
()[
0
];
// std::cout << "1,1\n";
// 4: [xmin ymin xmax ymax]
// 8: [x1 y1 x2 y2 x3 y3 x4 y4]
// 16, 24, or 32: [x1 y1 x2 y2 ... xn yn], n = 8, 12 or 16
int64_t
box_size
=
bbox
.
dims
()[
1
];
// std::cout << "1,2\n";
std
::
vector
<
T
>
scores_data
(
num_boxes
);
std
::
copy_n
(
scores
.
data
<
T
>
(),
num_boxes
,
scores_data
.
begin
());
// std::cout << "1,3\n";
std
::
vector
<
std
::
pair
<
T
,
int
>>
sorted_indices
;
// std::cout << "1,4\n";
GetMaxScoreIndex
(
scores_data
,
score_threshold
,
top_k
,
&
sorted_indices
);
// std::cout << "2\n";
selected_indices
->
clear
();
T
adaptive_threshold
=
nms_threshold
;
const
T
*
bbox_data
=
bbox
.
data
<
T
>
();
// std::cout << "3\n";
while
(
sorted_indices
.
size
()
!=
0
)
{
const
int
idx
=
sorted_indices
.
front
().
second
;
// std::cout << "4\n";
bool
keep
=
true
;
for
(
size_t
k
=
0
;
k
<
selected_indices
->
size
();
++
k
)
{
// std::cout << "5\n";
if
(
keep
)
{
const
int
kept_idx
=
(
*
selected_indices
)[
k
];
T
overlap
=
T
(
0.
);
// std::cout << "6\n";
// 4: [xmin ymin xmax ymax]
if
(
box_size
==
4
)
{
overlap
=
JaccardOverlap
<
T
>
(
bbox_data
+
idx
*
box_size
,
bbox_data
+
kept_idx
*
box_size
,
normalized
);
}
// std::cout << "7\n";
// 8: [x1 y1 x2 y2 x3 y3 x4 y4] or 16, 24, 32
if
(
box_size
==
8
||
box_size
==
16
||
box_size
==
24
||
box_size
==
32
)
{
...
...
@@ -168,10 +179,13 @@ void NMSFast(const Tensor& bbox,
}
else
{
break
;
}
// std::cout << "8\n";
}
// std::cout << "9\n";
if
(
keep
)
{
selected_indices
->
push_back
(
idx
);
}
// std::cout << "10\n";
sorted_indices
.
erase
(
sorted_indices
.
begin
());
if
(
keep
&&
eta
<
1
&&
adaptive_threshold
>
0.5
)
{
adaptive_threshold
*=
eta
;
...
...
@@ -195,21 +209,25 @@ void MultiClassNMS(const operators::MulticlassNmsParam& param,
T
score_threshold
=
static_cast
<
T
>
(
param
.
score_threshold
);
int
num_det
=
0
;
int64_t
class_num
=
scores_size
==
3
?
scores
.
dims
()[
0
]
:
scores
.
dims
()[
1
];
int64_t
class_num
=
scores_size
==
3
?
scores
.
dims
()[
0
]
:
scores
.
dims
()[
1
];
Tensor
bbox_slice
,
score_slice
;
for
(
int64_t
c
=
0
;
c
<
class_num
;
++
c
)
{
Tensor
bbox_slice
,
score_slice
;
if
(
c
==
background_label
)
continue
;
// std::cout << "------ 1 \n";
if
(
scores_size
==
3
)
{
// std::cout << "------ scores_size = 3 \n";
scores
.
Slice
<
T
>
(
score_slice
,
c
,
c
+
1
);
bbox_slice
=
bboxes
;
//
bbox_slice = bboxes;
}
else
{
// std::cout << "------ scores_size != 3 \n";
score_slice
.
Resize
({
scores
.
dims
()[
0
],
1
});
bbox_slice
.
Resize
({
scores
.
dims
()[
0
],
4
});
SliceOneClass
<
T
>
(
scores
,
c
,
&
score_slice
);
SliceOneClass
<
T
>
(
bboxes
,
c
,
&
bbox_slice
);
}
NMSFast
(
bboxes
,
NMSFast
(
bboxes
,
// TODO
score_slice
,
score_threshold
,
nms_threshold
,
...
...
@@ -226,8 +244,6 @@ void MultiClassNMS(const operators::MulticlassNmsParam& param,
*
num_nmsed_out
=
num_det
;
const
T
*
scores_data
=
scores
.
data
<
T
>
();
if
(
keep_top_k
>
-
1
&&
num_det
>
keep_top_k
)
{
Tensor
score_slice
;
const
T
*
sdata
;
std
::
vector
<
std
::
pair
<
float
,
std
::
pair
<
int
,
int
>>>
score_index_pairs
;
for
(
const
auto
&
it
:
*
indices
)
{
...
...
@@ -275,7 +291,9 @@ void MultiClassOutput(const Tensor& scores,
const
Tensor
&
bboxes
,
const
std
::
map
<
int
,
std
::
vector
<
int
>>&
selected_indices
,
const
int
scores_size
,
Tensor
*
outs
)
{
Tensor
*
outs
,
int
*
oindices
=
nullptr
,
const
int
offset
=
0
)
{
int64_t
class_num
=
scores
.
dims
()[
1
];
int64_t
predict_dim
=
scores
.
dims
()[
1
];
int64_t
box_size
=
bboxes
.
dims
()[
1
];
...
...
@@ -305,9 +323,15 @@ void MultiClassOutput(const Tensor& scores,
if
(
scores_size
==
3
)
{
bdata
=
bboxes_data
+
idx
*
box_size
;
odata
[
count
*
out_dim
+
1
]
=
sdata
[
idx
];
// score
if
(
oindices
!=
nullptr
)
{
oindices
[
count
]
=
offset
+
idx
;
}
}
else
{
bdata
=
bbox
.
data
<
T
>
()
+
idx
*
box_size
;
odata
[
count
*
out_dim
+
1
]
=
*
(
scores_data
+
idx
*
class_num
+
label
);
if
(
oindices
!=
nullptr
)
{
oindices
[
count
]
=
offset
+
idx
*
class_num
+
label
;
}
}
// xmin, ymin, xmax, ymax or multi-points coordinates
std
::
memcpy
(
odata
+
count
*
out_dim
+
2
,
bdata
,
box_size
*
sizeof
(
T
));
...
...
@@ -318,36 +342,18 @@ void MultiClassOutput(const Tensor& scores,
void
MulticlassNmsCompute
::
Run
()
{
auto
&
param
=
Param
<
operators
::
MulticlassNmsParam
>
();
auto
*
boxes
_in
=
param
.
bboxes
;
auto
*
scores
_in
=
param
.
scores
;
auto
*
boxes
=
param
.
bboxes
;
auto
*
scores
=
param
.
scores
;
auto
*
outs
=
param
.
out
;
outs
->
mutable_data
<
float
>
()
;
auto
score_dims
=
boxes_in
->
dims
();
bool
return_index
=
param
.
index
?
true
:
false
;
auto
*
index
=
param
.
index
;
auto
score_dims
=
scores
->
dims
();
auto
score_size
=
score_dims
.
size
();
Tensor
boxes_float
;
Tensor
scores_float
;
boxes_float
.
Resize
(
boxes_in
->
dims
());
scores_float
.
Resize
(
scores_in
->
dims
());
boxes_float
.
mutable_data
<
float
>
();
scores_float
.
mutable_data
<
float
>
();
boxes_float
.
ZynqTensor
()
->
copyFrom
(
boxes_in
->
ZynqTensor
());
scores_float
.
ZynqTensor
()
->
copyFrom
(
scores_in
->
ZynqTensor
());
Tensor
*
boxes
=
&
boxes_float
;
Tensor
*
scores
=
&
scores_float
;
auto
box_dims
=
boxes
->
dims
();
int64_t
box_dim
=
boxes
->
dims
()[
2
];
std
::
vector
<
std
::
map
<
int
,
std
::
vector
<
int
>>>
all_indices
;
std
::
vector
<
uint64_t
>
batch_starts
=
{
0
};
int64_t
batch_size
=
score_dims
[
0
];
int64_t
box_dim
=
boxes
->
dims
()[
2
];
int64_t
out_dim
=
box_dim
+
2
;
int
num_nmsed_out
=
0
;
Tensor
boxes_slice
,
scores_slice
;
...
...
@@ -372,79 +378,104 @@ void MulticlassNmsCompute::Run() {
uint64_t
num_kept
=
batch_starts
.
back
();
if
(
num_kept
==
0
)
{
outs
->
Resize
({
1
,
1
});
float
*
od
=
outs
->
mutable_data
<
float
>
();
od
[
0
]
=
-
1
;
batch_starts
=
{
0
,
1
};
if
(
return_index
)
{
outs
->
Resize
({
0
,
out_dim
});
index
->
Resize
({
0
,
1
});
}
else
{
outs
->
Resize
({
1
,
1
});
float
*
od
=
outs
->
mutable_data
<
float
>
();
od
[
0
]
=
-
1
;
batch_starts
=
{
0
,
1
};
}
}
else
{
outs
->
Resize
({
static_cast
<
int64_t
>
(
num_kept
),
out_dim
});
outs
->
mutable_data
<
float
>
();
int
offset
=
0
;
int
*
oindices
=
nullptr
;
for
(
int
i
=
0
;
i
<
n
;
++
i
)
{
if
(
score_size
==
3
)
{
scores
->
Slice
<
float
>
(
scores_slice
,
i
,
i
+
1
);
boxes
->
Slice
<
float
>
(
boxes_slice
,
i
,
i
+
1
);
scores_slice
.
Resize
({
score_dims
[
1
],
score_dims
[
2
]});
boxes_slice
.
Resize
({
score_dims
[
2
],
box_dim
});
if
(
return_index
)
{
offset
=
i
*
score_dims
[
2
];
}
}
else
{
auto
boxes_lod
=
boxes
->
lod
().
back
();
scores
->
Slice
<
float
>
(
scores_slice
,
boxes_lod
[
i
],
boxes_lod
[
i
+
1
]);
boxes
->
Slice
<
float
>
(
boxes_slice
,
boxes_lod
[
i
],
boxes_lod
[
i
+
1
]);
if
(
return_index
)
{
offset
=
boxes_lod
[
i
]
*
score_dims
[
1
];
}
}
int64_t
s
=
static_cast
<
int64_t
>
(
batch_starts
[
i
]);
int64_t
e
=
static_cast
<
int64_t
>
(
batch_starts
[
i
+
1
]);
if
(
e
>
s
)
{
Tensor
out
;
outs
->
Slice
<
float
>
(
out
,
s
,
e
);
MultiClassOutput
<
float
>
(
scores_slice
,
boxes_slice
,
all_indices
[
i
],
score_dims
.
size
(),
&
out
);
if
(
return_index
)
{
index
->
Resize
({
static_cast
<
int64_t
>
(
num_kept
),
1
});
int
*
output_idx
=
index
->
mutable_data
<
int
>
();
oindices
=
output_idx
+
s
;
}
MultiClassOutput
<
float
>
(
scores_slice
,
boxes_slice
,
all_indices
[
i
],
score_dims
.
size
(),
&
out
,
oindices
,
offset
);
// out.ZynqTensor()->saveToFile("nms_o", true);
outs
->
ZynqTensor
()
->
copyFrom
(
out
.
ZynqTensor
());
out
.
ZynqTensor
()
->
saveToFile
(
"nms_oo"
,
true
);
out
s
->
ZynqTensor
()
->
flush
(
);
}
outs
->
Resize
({
static_cast
<
int64_t
>
(
e
-
s
),
out_dim
});
}
}
LoD
lod
;
lod
.
emplace_back
(
batch_starts
);
if
(
return_index
)
{
index
->
set_lod
(
lod
);
}
outs
->
set_lod
(
lod
);
#ifdef FPGA_PRINT_TENSOR
Debugger
::
get_instance
().
registerOutput
(
"boxes"
,
boxes
->
ZynqTensor
());
Debugger
::
get_instance
().
registerOutput
(
"scores"
,
scores
->
ZynqTensor
());
Debugger
::
get_instance
().
registerOutput
(
"nms"
,
outs
->
ZynqTensor
());
#endif
// boxes->ZynqTensor()->saveToFile("boxes", true);
// scores->ZynqTensor()->saveToFile("scores", true);
// outs->ZynqTensor()->saveToFile("nms", true);
}
}
// namespace fpga
}
// namespace kernels
}
// namespace lite
}
// namespace paddle
// REGISTER_LITE_KERNEL(multiclass_nms,
// kFPGA,
// kFP16,
// kNHWC,
// paddle::lite::kernels::fpga::MulticlassNmsCompute,
// def)
// .BindInput("BBoxes", {LiteType::GetTensorTy(TARGET(kHost))})
// .BindInput("Scores", {LiteType::GetTensorTy(TARGET(kHost))})
// .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kHost))})
// .Finalize();
REGISTER_LITE_KERNEL
(
multiclass_nms
,
kFPGA
,
kFP16
,
kNHWC
,
paddle
::
lite
::
kernels
::
fpga
::
MulticlassNmsCompute
,
def2
)
.
BindInput
(
"BBoxes"
,
{
LiteType
::
GetTensorTy
(
TARGET
(
kFPGA
),
PRECISION
(
kFP16
),
DATALAYOUT
(
kNHWC
))})
.
BindInput
(
"Scores"
,
{
LiteType
::
GetTensorTy
(
TARGET
(
kFPGA
),
PRECISION
(
kFP16
),
DATALAYOUT
(
kNHWC
))})
.
BindOutput
(
"Out"
,
{
LiteType
::
GetTensorTy
(
TARGET
(
kFPGA
),
PRECISION
(
kFloat
),
DATALAYOUT
(
kNHWC
))})
def
)
.
BindInput
(
"BBoxes"
,
{
LiteType
::
GetTensorTy
(
TARGET
(
kARM
))})
.
BindInput
(
"Scores"
,
{
LiteType
::
GetTensorTy
(
TARGET
(
kARM
))})
.
BindOutput
(
"Out"
,
{
LiteType
::
GetTensorTy
(
TARGET
(
kARM
))})
.
Finalize
();
// REGISTER_LITE_KERNEL(multiclass_nms,
// kFPGA,
// kFP16,
// kNHWC,
// paddle::lite::kernels::fpga::MulticlassNmsCompute,
// def2)
// .BindInput("BBoxes",
// {LiteType::GetTensorTy(TARGET(kFPGA),
// PRECISION(kFP16),
// DATALAYOUT(kNHWC))})
// .BindInput("Scores",
// {LiteType::GetTensorTy(TARGET(kFPGA),
// PRECISION(kFP16),
// DATALAYOUT(kNHWC))})
// .BindOutput("Out",
// {LiteType::GetTensorTy(TARGET(kFPGA),
// PRECISION(kFloat),
// DATALAYOUT(kNHWC))})
// .Finalize();
lite/kernels/fpga/prior_box_compute.cc
浏览文件 @
a59d6fab
...
...
@@ -64,7 +64,7 @@ void PriorBoxCompute::PrepareForRun() {
float
offset
=
param
.
offset
;
std
::
vector
<
float
>
aspect_ratios_vec
;
ExpandAspectRatios
(
aspect_ratio
,
is_flip
,
&
aspect_ratios_vec
);
size_
t
prior_num
=
aspect_ratios_vec
.
size
()
*
min_size
.
size
();
in
t
prior_num
=
aspect_ratios_vec
.
size
()
*
min_size
.
size
();
prior_num
+=
max_size
.
size
();
std
::
vector
<
std
::
string
>
order
=
param
.
order
;
bool
min_max_aspect_ratios_order
=
param
.
min_max_aspect_ratios_order
;
...
...
@@ -78,6 +78,7 @@ void PriorBoxCompute::PrepareForRun() {
param
.
boxes
->
mutable_data
<
float
>
();
param
.
variances
->
mutable_data
<
float
>
();
zynqmp
::
PriorBoxParam
&
priobox_param
=
pe_
.
param
();
priobox_param
.
input
=
param
.
input
->
ZynqTensor
();
priobox_param
.
image
=
param
.
image
->
ZynqTensor
();
...
...
lite/kernels/fpga/reshape_compute.cc
浏览文件 @
a59d6fab
...
...
@@ -23,31 +23,64 @@ namespace fpga {
using
float16
=
zynqmp
::
float16
;
void
ReshapeCompute
::
Run
()
{
void
FlattenCompute
::
Run
()
{
auto
&
param
=
Param
<
operators
::
ReshapeParam
>
();
param
.
output
->
mutable_data
<
float16
>
();
auto
x
=
param
.
x
;
// auto actual_shape = param.actual_shape;
Tensor
*
actual_shape
=
nullptr
;
// TODO(chonwhite) change it.
auto
output
=
param
.
output
;
bool
inplace
=
param
.
inplace
;
auto
x_dims
=
x
->
dims
();
output
->
mutable_data
<
float16
>
();
auto
output_dims
=
output
->
dims
();
if
(
actual_shape
)
{
auto
actual_shape_dims
=
actual_shape
->
dims
();
auto
*
actual_shape_data
=
actual_shape
->
data
<
int
>
();
auto
shape
=
std
::
vector
<
int
>
(
actual_shape_data
,
actual_shape_data
+
actual_shape_dims
.
production
());
// output_dims = lite::operators::ValidateShape(shape, x_dims); //TODO
output
->
Resize
(
output_dims
);
if
(
param
.
inplace
)
{
output
->
ShareDataWith
(
*
x
);
}
else
{
// output->CopyDataFrom(*x);
}
// if (inplace) {
// output->ShareDataWith(*x);
// } else {
// output->CopyDataFrom(*x);
// }
x
->
ZynqTensor
()
->
unalignImage
();
// x->ZynqTensor()->saveToFile("fi", true);
output
->
ZynqTensor
()
->
copyFrom
(
x
->
ZynqTensor
());
// output->ZynqTensor()->saveToFile("fo", true);
output
->
ZynqTensor
()
->
flush
();
output
->
ZynqTensor
()
->
setAligned
(
x
->
ZynqTensor
()
->
aligned
());
output
->
Resize
(
output_dims
);
#ifdef FPGA_PRINT_TENSOR
Debugger
::
get_instance
().
registerOutput
(
"flatten"
,
output
->
ZynqTensor
());
#endif
}
void
ReshapeCompute
::
Run
()
{
auto
&
param
=
Param
<
operators
::
ReshapeParam
>
();
auto
x
=
param
.
x
;
auto
output
=
param
.
output
;
auto
output_dims
=
output
->
dims
();
x
->
ZynqTensor
()
->
unalignImage
();
// x->ZynqTensor()->saveToFile("ri", true);
output
->
Resize
(
output_dims
);
output
->
mutable_data
<
float16
>
();
if
(
param
.
inplace
)
{
output
->
ShareDataWith
(
*
x
);
}
else
{
// output->CopyDataFrom(*x);
}
output
->
ZynqTensor
()
->
copyFrom
(
x
->
ZynqTensor
());
// output->ZynqTensor()->saveToFile("ro", true);
output
->
ZynqTensor
()
->
flush
();
output
->
ZynqTensor
()
->
setAligned
(
x
->
ZynqTensor
()
->
aligned
());
#ifdef FPGA_PRINT_TENSOR
Debugger
::
get_instance
().
registerOutput
(
"reshape"
,
output
->
ZynqTensor
());
#endif
}
}
// namespace fpga
...
...
@@ -66,9 +99,9 @@ REGISTER_LITE_KERNEL(reshape,
PRECISION
(
kFP16
),
DATALAYOUT
(
kNHWC
))})
.
BindInput
(
"Shape"
,
{
LiteType
::
GetTensorTy
(
TARGET
(
k
FPGA
),
PRECISION
(
k
FP16
),
DATALAYOUT
(
k
NHWC
))})
{
LiteType
::
GetTensorTy
(
TARGET
(
k
Host
),
PRECISION
(
k
Any
),
DATALAYOUT
(
k
Any
))})
.
BindOutput
(
"Out"
,
{
LiteType
::
GetTensorTy
(
TARGET
(
kFPGA
),
PRECISION
(
kFP16
),
...
...
@@ -86,9 +119,9 @@ REGISTER_LITE_KERNEL(reshape2,
PRECISION
(
kFP16
),
DATALAYOUT
(
kNHWC
))})
.
BindInput
(
"Shape"
,
{
LiteType
::
GetTensorTy
(
TARGET
(
k
FPGA
),
PRECISION
(
k
FP16
),
DATALAYOUT
(
k
NHWC
))})
{
LiteType
::
GetTensorTy
(
TARGET
(
k
Host
),
PRECISION
(
k
Any
),
DATALAYOUT
(
k
Any
))})
.
BindOutput
(
"Out"
,
{
LiteType
::
GetTensorTy
(
TARGET
(
kFPGA
),
PRECISION
(
kFP16
),
...
...
@@ -103,16 +136,16 @@ REGISTER_LITE_KERNEL(flatten,
kFPGA
,
kFP16
,
kNHWC
,
paddle
::
lite
::
kernels
::
fpga
::
Reshape
Compute
,
paddle
::
lite
::
kernels
::
fpga
::
Flatten
Compute
,
def
)
.
BindInput
(
"X"
,
{
LiteType
::
GetTensorTy
(
TARGET
(
kFPGA
),
PRECISION
(
kFP16
),
DATALAYOUT
(
kNHWC
))})
.
BindInput
(
"Shape"
,
{
LiteType
::
GetTensorTy
(
TARGET
(
k
FPGA
),
PRECISION
(
k
FP16
),
DATALAYOUT
(
k
NHWC
))})
{
LiteType
::
GetTensorTy
(
TARGET
(
k
Host
),
PRECISION
(
k
Any
),
DATALAYOUT
(
k
Any
))})
.
BindOutput
(
"Out"
,
{
LiteType
::
GetTensorTy
(
TARGET
(
kFPGA
),
PRECISION
(
kFP16
),
...
...
@@ -123,16 +156,16 @@ REGISTER_LITE_KERNEL(flatten2,
kFPGA
,
kFP16
,
kNHWC
,
paddle
::
lite
::
kernels
::
fpga
::
Reshape
Compute
,
paddle
::
lite
::
kernels
::
fpga
::
Flatten
Compute
,
def
)
.
BindInput
(
"X"
,
{
LiteType
::
GetTensorTy
(
TARGET
(
kFPGA
),
PRECISION
(
kFP16
),
DATALAYOUT
(
kNHWC
))})
.
BindInput
(
"Shape"
,
{
LiteType
::
GetTensorTy
(
TARGET
(
kFPGA
),
PRECISION
(
k
FP16
),
DATALAYOUT
(
k
NHWC
))})
{
LiteType
::
GetTensorTy
(
TARGET
(
kHost
),
PRECISION
(
k
Any
),
DATALAYOUT
(
k
Any
))})
.
BindOutput
(
"Out"
,
{
LiteType
::
GetTensorTy
(
TARGET
(
kFPGA
),
PRECISION
(
kFP16
),
...
...
lite/kernels/fpga/reshape_compute.h
浏览文件 @
a59d6fab
...
...
@@ -30,6 +30,14 @@ class ReshapeCompute
virtual
~
ReshapeCompute
()
=
default
;
};
class
FlattenCompute
:
public
KernelLite
<
TARGET
(
kFPGA
),
PRECISION
(
kFP16
),
DATALAYOUT
(
kNHWC
)
>
{
public:
void
Run
()
override
;
virtual
~
FlattenCompute
()
=
default
;
};
class
ReshapeComputeFpgaToHost
:
public
KernelLite
<
TARGET
(
kFPGA
),
PRECISION
(
kFP16
),
DATALAYOUT
(
kNHWC
)
>
{
public:
...
...
lite/kernels/fpga/scale_compute.cc
浏览文件 @
a59d6fab
...
...
@@ -29,8 +29,8 @@ void ScaleCompute::PrepareForRun() {
scale_param
.
output
=
param
.
output
->
ZynqTensor
();
int
channel
=
scale_param
.
input
->
shape
().
channel
();
zynqmp
::
Tensor
*
scale
=
new
zynqmp
::
Tensor
()
;
zynqmp
::
Tensor
*
bias
=
new
zynqmp
::
Tensor
()
;
zynqmp
::
Tensor
*
scale
=
&
scale_
;
zynqmp
::
Tensor
*
bias
=
&
bias_
;
zynqmp
::
Shape
shape
(
zynqmp
::
N
,
{
channel
});
float
*
scale_data
=
scale
->
mutableData
<
float
>
(
zynqmp
::
FP32
,
shape
);
float
*
bias_data
=
bias
->
mutableData
<
float
>
(
zynqmp
::
FP32
,
shape
);
...
...
lite/kernels/fpga/scale_compute.h
浏览文件 @
a59d6fab
...
...
@@ -37,6 +37,8 @@ class ScaleCompute
private:
zynqmp
::
ScalePE
pe_
;
zynqmp
::
Tensor
scale_
;
zynqmp
::
Tensor
bias_
;
};
}
// namespace fpga
...
...
lite/kernels/fpga/softmax_compute.cc
浏览文件 @
a59d6fab
...
...
@@ -26,7 +26,8 @@ void SoftmaxCompute::PrepareForRun() {
zynqmp
::
SoftmaxParam
&
softmax_param
=
pe_
.
param
();
auto
&
param
=
Param
<
operators
::
SoftmaxParam
>
();
param
.
output
->
mutable_data
<
float16
>
();
// param.output->mutable_data<float16>();
param
.
output
->
mutable_data
<
float
>
();
softmax_param
.
input
=
param
.
x
->
ZynqTensor
();
softmax_param
.
output
=
param
.
output
->
ZynqTensor
();
pe_
.
init
();
...
...
@@ -34,9 +35,13 @@ void SoftmaxCompute::PrepareForRun() {
}
void
SoftmaxCompute
::
Run
()
{
zynqmp
::
SoftmaxParam
&
softmax_param
=
pe_
.
param
();
// softmax_param.input->saveToFile("softmax_in", true);
pe_
.
dispatch
();
softmax_param
.
output
->
flush
();
// softmax_param.output->saveToFile("softmax", true);
#ifdef FPGA_PRINT_TENSOR
zynqmp
::
SoftmaxParam
&
softmax_param
=
pe_
.
param
();
Debugger
::
get_instance
().
registerOutput
(
"softmax"
,
softmax_param
.
output
);
#endif
}
...
...
@@ -57,7 +62,17 @@ REGISTER_LITE_KERNEL(softmax,
PRECISION
(
kFP16
),
DATALAYOUT
(
kNHWC
))})
.
BindOutput
(
"Out"
,
{
LiteType
::
GetTensorTy
(
TARGET
(
kFPGA
),
PRECISION
(
kFP16
),
DATALAYOUT
(
kNHWC
))})
{
LiteType
::
GetTensorTy
(
TARGET
(
kARM
))})
.
Finalize
();
// .BindOutput("Out",
// {LiteType::GetTensorTy(TARGET(kFPGA),
// PRECISION(kFP16),
// DATALAYOUT(kNHWC))})
\ No newline at end of file
lite/kernels/fpga/transpose_compute.cc
浏览文件 @
a59d6fab
...
...
@@ -34,17 +34,17 @@ void transposeCompute(operators::TransposeParam param) {
input_x
->
ZynqTensor
()
->
invalidate
();
input_x
->
ZynqTensor
()
->
unalignImage
();
Tensor
float_input
;
float_input
.
Resize
(
input_x_dims
);
float_input
.
mutable_data
<
float
>
();
float_input
.
ZynqTensor
()
->
copyFrom
(
input_x
->
ZynqTensor
());
//
Tensor float_input;
//
float_input.Resize(input_x_dims);
//
float_input.mutable_data<float>();
//
float_input.ZynqTensor()->copyFrom(input_x->ZynqTensor());
const
auto
*
input_x_data
=
float_input
.
data
<
float
>
();
const
auto
*
input_x_data
=
input_x
->
data
<
float16
>
();
auto
*
out
=
param
.
output
;
const
auto
axis
=
param
.
axis
;
auto
*
out_data
=
out
->
mutable_data
<
float
>
();
auto
*
out_data
=
out
->
mutable_data
<
float
16
>
();
size_t
ndim
=
axis
.
size
();
std
::
vector
<
int
>
xdim
(
ndim
);
...
...
@@ -84,10 +84,11 @@ void transposeCompute(operators::TransposeParam param) {
void
TransposeCompute
::
Run
()
{
auto
&
param
=
this
->
Param
<
param_t
>
();
param
.
output
->
mutable_data
<
zynqmp
::
float16
>
();
param
.
x
->
ZynqTensor
()
->
invalidate
();
//
param.x->ZynqTensor()->invalidate();
param
.
x
->
ZynqTensor
()
->
unalignImage
();
if
(
param
.
x
->
dims
().
size
()
!=
4
)
{
transposeCompute
(
param
);
param
.
output
->
ZynqTensor
()
->
setAligned
(
param
.
x
->
ZynqTensor
()
->
aligned
());
}
else
{
param
.
output
->
ZynqTensor
()
->
copyFrom
(
param
.
x
->
ZynqTensor
());
}
...
...
@@ -96,14 +97,25 @@ void TransposeCompute::Run() {
// Transpose2
void
Transpose2Compute
::
Run
()
{
auto
&
param
=
this
->
Param
<
param_t
>
();
param
.
output
->
mutable_data
<
float
>
();
param
.
x
->
ZynqTensor
()
->
invalidate
();
param
.
output
->
mutable_data
<
float16
>
();
// param.x->ZynqTensor()->syncToCPU();
// param.x->ZynqTensor()->saveToFile("t_in", true);
param
.
x
->
ZynqTensor
()
->
unalignImage
();
// param.x->ZynqTensor()->saveToFile("t_unaligned", true);
param
.
x
->
ZynqTensor
()
->
flush
();
param
.
x
->
ZynqTensor
()
->
invalidate
();
if
(
param
.
x
->
dims
().
size
()
!=
4
)
{
transposeCompute
(
param
);
param
.
output
->
ZynqTensor
()
->
setAligned
(
param
.
x
->
ZynqTensor
()
->
aligned
());
}
else
{
param
.
output
->
ZynqTensor
()
->
copyFrom
(
param
.
x
->
ZynqTensor
());
}
// param.output->ZynqTensor()->copyFrom(param.x->ZynqTensor());
param
.
output
->
ZynqTensor
()
->
flush
();
// param.output->ZynqTensor()->saveToFile("Transpose2", true);
}
}
// namespace fpga
...
...
@@ -139,6 +151,8 @@ REGISTER_LITE_KERNEL(transpose2,
{
LiteType
::
GetTensorTy
(
TARGET
(
kFPGA
),
PRECISION
(
kFP16
),
DATALAYOUT
(
kNHWC
))})
.
BindOutput
(
"Out"
,
{
LiteType
::
GetTensorTy
(
TARGET
(
kARM
))})
.
BindOutput
(
"Out"
,
{
LiteType
::
GetTensorTy
(
TARGET
(
kFPGA
),
PRECISION
(
kFP16
),
DATALAYOUT
(
kNHWC
))})
.
BindOutput
(
"XShape"
,
{
LiteType
::
GetTensorTy
(
TARGET
(
kARM
))})
.
Finalize
();
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录