PaddlePaddle / Paddle-Lite — commit 945aa36f
Authored Dec 09, 2019 by chonwhite
Parent: c6f500fd

removed some comments

Showing 33 changed files with 107 additions and 913 deletions (+107 −913)
lite/api/paddle_use_kernels.h                       +1    -0
lite/api/test_ssd_fpga.cc                           +10   -10
lite/backends/fpga/KD/debugger.hpp                  +3    -4
lite/backends/fpga/KD/llapi/bias_scale.cpp          +0    -2
lite/backends/fpga/KD/llapi/zynqmp_api.cpp          +4    -6
lite/backends/fpga/KD/pes/conv_pe.hpp               +0    -3
lite/backends/fpga/KD/pes/conv_process.hpp          +1    -1
lite/backends/fpga/KD/pes/fully_connected_pe.hpp    +0    -14
lite/backends/fpga/KD/tensor.hpp                    +1    -1
lite/backends/fpga/lite_tensor.cc                   +0    -17
lite/backends/fpga/lite_tensor.h                    +0    -16
lite/core/optimizer.h                               +2    -1
lite/kernels/arm/prior_box_compute.cc               +12   -0
lite/kernels/fpga/beam_search_decode_compute.cc     +0    -296
lite/kernels/fpga/beam_search_decode_compute.h      +0    -39
lite/kernels/fpga/box_coder_compute.cc              +0    -60
lite/kernels/fpga/box_coder_compute.h               +0    -37
lite/kernels/fpga/calib_compute.cc                  +0    -13
lite/kernels/fpga/concat_compute.cc                 +6    -2
lite/kernels/fpga/conv_compute.cc                   +8    -13
lite/kernels/fpga/dropout_compute.cc                +6    -7
lite/kernels/fpga/elementwise_compute.cc            +12   -24
lite/kernels/fpga/fc_compute.cc                     +4    -2
lite/kernels/fpga/feed_compute.cc                   +5    -11
lite/kernels/fpga/fetch_compute.cc                  +9    -5
lite/kernels/fpga/mul_compute.cc                    +8    -43
lite/kernels/fpga/multiclass_nms_compute.cc         +4    -16
lite/kernels/fpga/norm_compute.cc                   +5    -17
lite/kernels/fpga/pooling_compute.cc                +6    -3
lite/kernels/fpga/while_compute.cc                  +0    -58
lite/kernels/fpga/while_compute.h                   +0    -84
lite/kernels/fpga/write_to_array_compute.cc         +0    -65
lite/kernels/fpga/write_to_array_compute.h          +0    -43
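Most of the kernel edits below follow one pattern: ad-hoc saveToFile/std::cout debugging inside each FPGA kernel's Run() is replaced by a single registration call into the Debugger helper, compiled in only when FPGA_PRINT_TENSOR is defined. A minimal sketch of that pattern for a generic kernel — the kernel name SomeCompute, the op name "some_op", the param type zynqmp::SomeParam, and the pe_ member are illustrative placeholders, not identifiers from this commit:

// Sketch only: debug output is routed through the Debugger registry instead
// of being written to files inline. Names below are placeholders.
void SomeCompute::Run() {
  pe_.dispatch();  // run the pre-configured processing element as before
#ifdef FPGA_PRINT_TENSOR
  zynqmp::SomeParam& out_param = pe_.param();
  // The Debugger decides centrally whether/how to dump the registered tensor.
  Debugger::get_instance().registerOutput("some_op", out_param.output);
#endif
}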
lite/api/paddle_use_kernels.h
@@ -15,6 +15,7 @@ USE_LITE_KERNEL(floor, kARM, kFloat, kNCHW, def);
 USE_LITE_KERNEL(hard_sigmoid, kARM, kFloat, kNCHW, def);
 USE_LITE_KERNEL(rsqrt, kARM, kFloat, kNCHW, def);
 USE_LITE_KERNEL(prior_box, kARM, kFloat, kNCHW, def);
+USE_LITE_KERNEL(prior_box_fpga, kARM, kFloat, kNCHW, def);
 USE_LITE_KERNEL(affine_channel, kARM, kFloat, kNCHW, def);
 USE_LITE_KERNEL(logical_xor, kARM, kFloat, kNCHW, def);
 USE_LITE_KERNEL(logical_and, kARM, kFloat, kNCHW, def);
lite/api/test_ssd_fpga.cc
@@ -135,16 +135,16 @@ TEST(ResNet50, test) {
   //   std::cout << ":" << out1->data<float>()[i] << std::endl;
   // }

-  // std::string file = "output/" + FLAGS_input_file.substr(6);
-  // std::cout << "file:::" << file << std::endl;
-  // std::ofstream ofs;
-  // ofs.open(file);
-  // for (int i = 0; i < out->dims().production(); i++) {
-  //   float value = out->data<float>()[i];
-  //   ofs << value << std::endl;
-  // }
-  // ofs.close();
+  std::string file = "output/" + FLAGS_input_file.substr(6);
+  std::cout << "file:::" << file << std::endl;
+  std::ofstream ofs;
+  ofs.open(file);
+  for (int i = 0; i < out->dims().production(); i++) {
+    float value = out->data<float>()[i];
+    ofs << value << std::endl;
+  }
+  ofs.close();

   LOG(INFO) << "================== Speed Report ===================";
 }
lite/backends/fpga/KD/debugger.hpp
@@ -5,6 +5,8 @@
 namespace paddle {
 namespace lite {

+#define FPGA_PRINT_TENSOR
+
 class Debugger {
  public:
   static Debugger& get_instance() {
@@ -12,7 +14,7 @@ class Debugger {
     return s_instance;
   }

-  void registerOutput(std::string op_type, Tensor* tensor) {
+  void registerOutput(std::string op_type, zynqmp::Tensor* tensor) {
     // tensor->printScale();
     // tensor->saveToFile(op_type, true);
   }
@@ -101,8 +103,6 @@ inline void save_float(float* data, const std::string& name, int len) {
 }

 inline void save_tensor(lite::Tensor* t, const std::string& name, bool convert = true) {
   float* data = const_cast<float*>(t->data<float>());
   float* dst = new float[t->numel()];
   if (convert) {
@@ -111,7 +111,6 @@ inline void save_tensor(lite::Tensor* t, const std::string& name, bool convert = true) {
   }
   save_float(data, name, t->numel());
   delete[] dst;
 }
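Only fragments of the Debugger class survive in these hunks. For orientation, a compact, self-contained sketch of the same function-local-static singleton shape — everything other than the Debugger / get_instance / registerOutput names (in particular the local `static Debugger s_instance;`) is an assumption, and the real class carries more state than shown:

// Hedged sketch of the singleton used by debugger.hpp; note that in this
// commit registerOutput is effectively a no-op with the dump calls commented.
class Debugger {
 public:
  static Debugger& get_instance() {
    static Debugger s_instance;  // assumed: constructed once, on first use
    return s_instance;
  }
  void registerOutput(std::string op_type, zynqmp::Tensor* tensor) {
    // tensor->printScale();
    // tensor->saveToFile(op_type, true);
  }
};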
lite/backends/fpga/KD/llapi/bias_scale.cpp
@@ -86,10 +86,8 @@ void format_bias_array(float **bias_array, int num) {
       (int16_t *)fpga_malloc(num_after_align * sizeof(int16_t));  // NOLINT
   memset(ptr_aligned, 0, num_after_align * sizeof(int16_t));
-  std::cout << "bias::" << std::endl;
   for (int i = 0; i < num_before_align; i++) {
-    float value = ptr_aligned[i];
-    std::cout << "@:" << i << " = " << value << std::endl;
     ptr_aligned[i] = fp32_2_fp16(ptr_unaligned[i]);
   }
   *bias_array = (float *)ptr_aligned;  // NOLINT
lite/backends/fpga/KD/llapi/zynqmp_api.cpp
@@ -28,7 +28,7 @@ limitations under the License. */
 namespace paddle {
 namespace zynqmp {

-#define PADDLE_MOBILE_OS_LINUX
+#define PADDLE_OS_LINUX

 static int fd = -1;
 static const char *device_path = "/dev/fpgadrv0";
@@ -38,7 +38,7 @@ static size_t memory_size_max = 0;
 static size_t memory_size = 0;

 static inline int do_ioctl(uint64_t req, const void *arg) {
-#ifdef PADDLE_MOBILE_OS_LINUX
+#ifdef PADDLE_OS_LINUX
   return ioctl(fd, req, arg);
 #else
   return -1;
@@ -46,11 +46,9 @@ static inline int do_ioctl(uint64_t req, const void *arg) {
 }

 int open_device() {
-  // std::cout << "open_device" << std::endl;
   if (fd == -1) {
     fd = open(device_path, O_RDWR);
   }
-  // std::cout << "open_device fd:" << fd << std::endl;
   return fd;
 }
@@ -68,7 +66,7 @@ void *fpga_malloc(size_t size) {
 #ifdef ENABLE_DEBUG
   // std::cout << "fpga_malloc:" << size << std::endl;
 #endif
-#ifdef PADDLE_MOBILE_OS_LINUX
+#ifdef PADDLE_OS_LINUX
   void *ptr = reinterpret_cast<void *>(
       mmap64(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0));
   if (ptr == NULL) {
@@ -113,7 +111,7 @@ void fpga_free(void *ptr) {
   memory_size -= size;
-#ifdef PADDLE_MOBILE_OS_LINUX
+#ifdef PADDLE_OS_LINUX
   munmap(ptr, size);
 #else
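For context, a minimal usage sketch of the low-level API touched by this rename — call order only; the buffer size and error handling are illustrative, not taken from the diff:

// Illustrative call order for the zynqmp low-level driver API shown above.
if (open_device() < 0) {
  // stub builds without PADDLE_OS_LINUX leave fd at -1, so this branch fires
}
void *buf = fpga_malloc(4096);  // mmap-backed allocation on Linux builds
// ... use buf ...
fpga_free(buf);                 // munmap plus bookkeeping of memory_size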
lite/backends/fpga/KD/pes/conv_pe.hpp
@@ -64,14 +64,11 @@ class ConvPE : public PE {
     if (!use_cpu_) {
       // param_.filter->releaseData();
     }
     // exit(-1);
   }

   void cpu_conv_hwc() {
     Tensor* input = param_.input;
     Tensor* output = param_.output;
     input->syncToCPU();
lite/backends/fpga/KD/pes/conv_process.hpp
@@ -324,7 +324,7 @@ inline void split_filter_num(const ConvParam& c_param) {
   Shape s_shape(N, {filter_num});
   float* scale_data = scale.mutableData<float>(FP32, s_shape);
   float* bias_data = bias.mutableData<float>(FP32, s_shape);
-  std::cout << "v size: " << v.size() << std::endl;
+  // std::cout << "v size: " << v.size() << std::endl;
   for (int n = 0; n < filter_num; n++) {
     scale_data[n] = param.scale()->data<float>()[n + chnnnel_start] * v[n];
     // scale_data[n] = param.scale()->data<float>()[n + chnnnel_start];
lite/backends/fpga/KD/pes/fully_connected_pe.hpp
@@ -94,21 +94,7 @@ class FullyConnectedPE : public PE {
   }

   bool dispatch() {
-    // return
     return convPE_.dispatch();
-    // convPE_.dispatch();
-    // if (num_ == 1) {
-    //   return true;
-    // }
-    // Tensor* output = param_.output;
-    // int size = output->shape().numel() * sizeof(floa16);
-    // memcpy(output->data<void>(), tempOut_->data<void>(), size);
-    // for (int i = 1;i < num_;i ++) {
-    //   memcpy(output->data<void>(), tempOut_->data<void>(), size);
-    // }
-    // return true;
   }

   FullyConnectedParam& param() { return param_; }
lite/backends/fpga/KD/tensor.hpp
@@ -395,7 +395,7 @@ class Tensor {
   }

   void save_file_with_name(std::string path) {
-    // return;
+    return;
     invalidate();
     // usleep(20000);
     // return;
lite/backends/fpga/lite_tensor.cc
@@ -92,34 +92,17 @@ void *TensorLite::mutable_data(TargetType target, size_t memory_size) {
 }

 void TensorLite::CopyDataFrom(const TensorLite &other) {
-  // std::cout << "1\n";
   dims_ = other.dims_;
-  // std::cout << "2\n";
   target_ = other.target_;
-  // std::cout << "3\n";
   lod_ = other.lod_;
   auto dt = zynq_tensor_->dataType();
-  // std::cout << "4\n";
-  // std::cout << "dt:" << dt << std::endl;
   auto shape = other.zynq_tensor_->shape();
   Resize(other.dims());
-  // mutable_data<float>();
   zynq_tensor_->mutableData<void>(zynq_tensor_->dataType(), shape);
-  // std::cout << "copy Data From: \n";
-  // std::cout << "ss" << (void*)(other.ZynqTensor()) << "\n";
   this->ZynqTensor()->copyFrom(other.ZynqTensor());
-  // set_lod(other.lod());
 }

 // template <typename T>
 // void TensorLite::mutable_data_internal() {
 // }

 }  // namespace lite
 }  // namespace paddle
lite/backends/fpga/lite_tensor.h
@@ -293,23 +293,7 @@ void TensorLite::Slice(TensorLite& dst, int64_t begin, int64_t end) const {
   int64_t base = numel() / dims_[0];

   T* src_data = const_cast<T*>(data<T>());
-  std::cout << "end:" << end << " begin:" << begin << std::endl;
-  std::cout << "base:" << base << std::endl;
-  std::cout << "production:" << dst_dims.production() << std::endl;
   memcpy(dst_data,
          src_data + static_cast<size_t>(begin * dst_dims.production()),
          dst_dims.production() * sizeof(T));
-  // dst.ZynqTensor()->saveToFile("_slice", true);
-  // if (dims_[0] == 1) {
-  //   dst-
-  //   return;
-  // } else {
-  //   // dst.offset_ = offset_ + static_cast<size_t>(begin * base) * sizeof(T);
-  //   return dst;
-  // }
 }

 template <typename TensorT>
lite/core/optimizer.h  (file mode 100644 → 100755)
@@ -109,7 +109,8 @@ class Optimizer {
           "runtime_context_assign_pass",
           "argument_type_display_pass",
-          "memory_optimize_pass"}};
+          // "memory_optimize_pass"
+      }};
       RunPasses(passes_local);
     } else {
       RunPasses(passes);
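Commenting "memory_optimize_pass" out of passes_local means the memory optimization pass is simply never scheduled in this configuration; the surrounding logic is unchanged. Roughly, the tail of that list now reads as below (the leading entries are omitted and assumed unchanged from the context shown in the hunk):

// Sketch of the tail of the optimizer's local pass list after this commit.
std::vector<std::string> passes_local{{
    "runtime_context_assign_pass",
    "argument_type_display_pass",
    // "memory_optimize_pass"  // disabled here
}};
RunPasses(passes_local);  // passes run in the order listed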
lite/kernels/arm/prior_box_compute.cc
@@ -98,6 +98,18 @@ REGISTER_LITE_KERNEL(prior_box,
                      kNCHW,
                      paddle::lite::kernels::arm::PriorBoxCompute,
                      def)
     .BindInput("Input", {LiteType::GetTensorTy(TARGET(kARM))})
     .BindInput("Image", {LiteType::GetTensorTy(TARGET(kARM))})
     .BindOutput("Boxes", {LiteType::GetTensorTy(TARGET(kARM))})
     .BindOutput("Variances", {LiteType::GetTensorTy(TARGET(kARM))})
     .Finalize();
+
+REGISTER_LITE_KERNEL(prior_box_fpga,
+                     kARM,
+                     kFloat,
+                     kNCHW,
+                     paddle::lite::kernels::arm::PriorBoxCompute,
+                     def)
+    .BindInput("Input",
+               {LiteType::GetTensorTy(
+                   TARGET(kFPGA), PRECISION(kAny), DATALAYOUT(kAny))})
+    .BindInput("Image",
+               {LiteType::GetTensorTy(
...
lite/kernels/fpga/beam_search_decode_compute.cc  (deleted, file mode 100755 → 0)
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/arm/beam_search_decode_compute.h"
#include <algorithm>
#include <vector>
#include "lite/api/paddle_place.h"
#include "lite/backends/arm/math/funcs.h"
#include "lite/core/op_registry.h"
#include "lite/core/tensor.h"
#include "lite/core/type_system.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace arm {

using LoDTensor = lite::Tensor;
using LoDTensorArray = std::vector<lite::Tensor>;

// all the lod have 2 levels.
// The first is source level, the second is sentence level.
// source level describe how many prefixes (branchs) for each source sentece
// (beam). sentence level describe how these candidates belong to the prefixes.
const size_t kSourceLevel = 0;
const size_t kSentenceLevel = 1;

template <typename T>
struct Sentence {
  std::vector<float> word_ids;
  std::vector<T> scores;
};

template <typename T>
using SentenceVector = std::vector<Sentence<T>>;

template <typename T>
struct BeamSearchDecoder {
  BeamSearchDecoder(size_t beam_size, int end_id)
      : beam_size_(beam_size), end_id_(end_id) {}

  /**
   * convert the result sentence_vector for each source sentence into two
   * LodTensor.
   * One is all candidate sentences with word id, one is all candidate
   * sentences with word score.
   * Param:
   *  sentence_vector_list: sentence_vector for each source sentence.
   *  id_tensor: result LoDTensor for sentences of id.
   *  score_tensor: result LoDTensor for sentences of score.
   *  reverse: whether ids of sentence in sentence_vector_list is reversed
   *  sort_by_score: whether to sort hypotheses of each sentence by scores.
   */
  void ConvertSentenceVectorToLodTensor(
      std::vector<SentenceVector<T>> sentence_vector_list,
      LoDTensor* id_tensor,
      LoDTensor* score_tensor,
      bool reverse = true,
      bool sort_by_score = true) const {
    size_t src_num = sentence_vector_list.size();
    CHECK_GT(src_num, 0) << "src_num should not be 0";

    std::vector<uint64_t> source_level_lod = {0};
    std::vector<uint64_t> sentence_level_lod = {0};
    std::vector<float> id_data;
    std::vector<T> score_data;

    for (size_t src_idx = 0; src_idx < src_num; ++src_idx) {
      if (sort_by_score) {
        sort(sentence_vector_list[src_idx].begin(),
             sentence_vector_list[src_idx].end(),
             [reverse](const Sentence<T>& a, const Sentence<T>& b) {
               if (reverse)
                 return a.scores.front() > b.scores.front();
               else
                 return a.scores.back() > b.scores.back();
             });
      }
      for (Sentence<T>& sentence : sentence_vector_list[src_idx]) {
        if (reverse) {
          id_data.insert(id_data.end(),
                         sentence.word_ids.rbegin(),
                         sentence.word_ids.rend());
          score_data.insert(score_data.end(),
                            sentence.scores.rbegin(),
                            sentence.scores.rend());
        } else {
          id_data.insert(
              id_data.end(), sentence.word_ids.begin(), sentence.word_ids.end());
          score_data.insert(
              score_data.end(), sentence.scores.begin(), sentence.scores.end());
        }
        sentence_level_lod.push_back(sentence_level_lod.back() +
                                     sentence.word_ids.size());
      }
      source_level_lod.push_back(source_level_lod.back() +
                                 sentence_vector_list[src_idx].size());
    }

    LoD lod;
    lod.push_back(source_level_lod);
    lod.push_back(sentence_level_lod);

    *(id_tensor->mutable_lod()) = lod;
    id_tensor->Resize({static_cast<int64_t>(id_data.size())});
    auto id_ptr = id_tensor->mutable_data<float>();
    TargetCopy(
        TARGET(kARM), id_ptr, id_data.data(), id_data.size() * sizeof(float));

    *(score_tensor->mutable_lod()) = lod;
    score_tensor->Resize({static_cast<int64_t>(score_data.size())});
    auto score_ptr = score_tensor->mutable_data<T>();
    TargetCopy(TARGET(kARM),
               score_ptr,
               score_data.data(),
               score_data.size() * sizeof(T));
  }

  /**
   * Gather the hypotheses for each source sentence by backtrace though the
   * LoDTensorArray step_ids whose lods reserve the path in the tree.
   */
  void Backtrace(const LoDTensorArray& step_ids,
                 const LoDTensorArray& step_scores,
                 LoDTensor* id_tensor,
                 LoDTensor* score_tensor) const {
    CHECK(!step_ids.empty()) << "step num should be larger than 0";
    CHECK_EQ(step_ids.size(), step_scores.size())
        << "step_ids and step_scores should be the same";
    const size_t step_num = step_ids.size();
    const size_t src_num = step_ids.at(0).lod().at(kSourceLevel).size() - 1;
    std::vector<SentenceVector<T>> sentence_vector_list(
        src_num, SentenceVector<T>(beam_size_));
    std::vector<std::vector<size_t>> prefix_idx_vector_list(src_num);

    for (int step_id = step_num - 1; step_id >= 0; --step_id) {
      auto& cur_ids = step_ids.at(step_id);
      auto& cur_scores = step_scores.at(step_id);
      for (size_t src_idx = 0; src_idx < src_num; ++src_idx) {
        // for each source sentence
        auto& sentence_vector = sentence_vector_list.at(src_idx);
        auto& prefix_idx_vector = prefix_idx_vector_list.at(src_idx);
        size_t src_prefix_start = cur_ids.lod().at(kSourceLevel)[src_idx];
        size_t src_prefix_end = cur_ids.lod().at(kSourceLevel)[src_idx + 1];
        if (prefix_idx_vector.empty()) {  // be finished and pruned at this step
                                          // or the last time step
          for (size_t prefix_idx = src_prefix_start;
               prefix_idx < src_prefix_end;
               ++prefix_idx) {
            size_t candidate_start =
                cur_ids.lod().at(kSentenceLevel)[prefix_idx];
            size_t candidate_end =
                cur_ids.lod().at(kSentenceLevel)[prefix_idx + 1];
            for (size_t candidate_idx = candidate_start;
                 candidate_idx < candidate_end;
                 ++candidate_idx) {
              prefix_idx_vector.push_back(prefix_idx);
              size_t idx = prefix_idx_vector.size() - 1;
              auto cur_id = cur_ids.data<float>()[candidate_idx];
              auto cur_score = cur_scores.data<T>()[candidate_idx];
              sentence_vector.at(idx).word_ids.push_back(cur_id);
              sentence_vector.at(idx).scores.push_back(cur_score);
            }
          }
        } else {  // use prefix_idx_vector to backtrace
          size_t src_candidate_start =
              cur_ids.lod().at(kSentenceLevel)[src_prefix_start];
          size_t prefix_idx = src_prefix_start;
          size_t candidate_num =
              cur_ids.lod().at(kSentenceLevel)[prefix_idx + 1] -
              cur_ids.lod().at(kSentenceLevel)[prefix_idx];
          for (size_t idx = 0; idx < prefix_idx_vector.size(); ++idx) {
            auto candidate_idx = prefix_idx_vector.at(idx);
            auto cur_id = cur_ids.data<float>()[candidate_idx];
            auto cur_score = cur_scores.data<T>()[candidate_idx];
            if (cur_id != end_id_ ||
                sentence_vector.at(idx).word_ids.empty()) {
              // to skip redundant end tokens
              sentence_vector.at(idx).word_ids.push_back(cur_id);
              sentence_vector.at(idx).scores.push_back(cur_score);
            }
            while (src_candidate_start + candidate_num <= candidate_idx) {
              // search the corresponding prefix
              prefix_idx++;
              candidate_num +=
                  cur_ids.lod().at(kSentenceLevel)[prefix_idx + 1] -
                  cur_ids.lod().at(kSentenceLevel)[prefix_idx];
            }
            prefix_idx_vector.at(idx) = prefix_idx;
          }
        }
      }
    }

    ConvertSentenceVectorToLodTensor(
        sentence_vector_list, id_tensor, score_tensor, true, true);
  }

  size_t beam_size_;
  int end_id_;
};

struct BeamSearchDecodeFunctor {
  BeamSearchDecodeFunctor(const LoDTensorArray& step_ids,
                          const LoDTensorArray& step_scores,
                          LoDTensor* id_tensor,
                          LoDTensor* score_tensor,
                          size_t beam_size,
                          int end_id)
      : beam_size_(beam_size),
        end_id_(end_id),
        step_ids_(step_ids),
        step_scores_(step_scores),
        id_tensor_(id_tensor),
        score_tensor_(score_tensor) {}

  template <typename T>
  void apply() const {
    BeamSearchDecoder<T> beam_search_decoder(beam_size_, end_id_);
    beam_search_decoder.Backtrace(
        step_ids_, step_scores_, id_tensor_, score_tensor_);
  }

  size_t beam_size_;
  int end_id_;
  const LoDTensorArray& step_ids_;
  const LoDTensorArray& step_scores_;
  LoDTensor* id_tensor_;
  LoDTensor* score_tensor_;
};

template <>
void BeamSearchDecodeFunctor::apply<bool>() const {
  LOG(FATAL) << "beam search decode op does not support bool!";
}

void BeamSearchDecodeCompute::Run() {
  auto& param = this->Param<param_t>();
  auto& ctx = this->ctx_->template As<ARMContext>();

  // inputs
  auto ids = param.ids;
  auto scores = param.scores;
  // outputs
  auto sentence_ids = param.sentence_ids;
  auto sentence_scores = param.sentence_scores;

  const size_t step_num = ids->size();
  CHECK_GT(step_num, 0UL) << "beam search steps should be larger than 0";
  const size_t source_num = ids->at(0).lod().at(0).size() - 1;
  CHECK_GT(source_num, 0UL) << "source num should be larger than 0";

  for (size_t i = 0; i < step_num; ++i) {
    CHECK_EQ(ids->at(i).lod().size(), 2UL) << "Level of LodTensor should be 2";
  }

  //! fixme
  // only support float score now
  BeamSearchDecodeFunctor func(*ids,
                               *scores,
                               sentence_ids,
                               sentence_scores,
                               param.beam_size,
                               param.end_id);

  func.apply<float>();
}

}  // namespace arm
}  // namespace kernels
}  // namespace lite
}  // namespace paddle

REGISTER_LITE_KERNEL(beam_search_decode,
                     kARM,
                     kFloat,
                     kNCHW,
                     paddle::lite::kernels::arm::BeamSearchDecodeCompute,
                     def)
    .BindInput("Ids", {LiteType::GetTensorListTy(TARGET(kARM))})
    .BindInput("Scores", {LiteType::GetTensorListTy(TARGET(kARM))})
    .BindOutput("SentenceIds", {LiteType::GetTensorTy(TARGET(kARM))})
    .BindOutput("SentenceScores", {LiteType::GetTensorTy(TARGET(kARM))})
    .Finalize();
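The comment near the top of the deleted file describes the two-level LoD the decoder walks: level 0 (kSourceLevel) segments candidates by source sentence, level 1 (kSentenceLevel) segments the flattened word ids by candidate. A small worked example of that layout, with purely illustrative values:

// Illustrative only: 2 source sentences; the first kept 2 candidate prefixes,
// the second kept 1. Candidate k's word ids live in [lod[1][k], lod[1][k+1]).
LoD lod;
lod.push_back({0, 2, 3});     // kSourceLevel: candidates per source sentence
lod.push_back({0, 4, 7, 9});  // kSentenceLevel: word ids per candidate
// id_tensor then holds 9 word ids laid out candidate by candidate, which is
// the layout ConvertSentenceVectorToLodTensor builds above.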
lite/kernels/fpga/beam_search_decode_compute.h  (deleted, file mode 100755 → 0)
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <algorithm>
#include "lite/core/kernel.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace fpga {

class BeamSearchDecodeCompute
    : public KernelLite<TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC)> {
 public:
  using param_t = operators::BeamSearchDecodeParam;

  BeamSearchDecodeCompute() = default;

  void Run() override;

  virtual ~BeamSearchDecodeCompute() = default;
};

}  // namespace fpga
}  // namespace kernels
}  // namespace lite
}  // namespace paddle
lite/kernels/fpga/box_coder_compute.cc  (deleted, file mode 100755 → 0)
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/fpga/box_coder_compute.h"
#include <string>
#include <vector>
#include "lite/backends/arm/math/funcs.h"
#include "lite/backends/fpga/KD/float16.hpp"
namespace paddle {
namespace lite {
namespace kernels {
namespace fpga {

using float16 = zynqmp::float16;

void BoxCoderCompute::Run() {
  auto& param = Param<operators::ReshapeParam>();
  param.output->mutable_data<float16>();
}

}  // namespace fpga
}  // namespace kernels
}  // namespace lite
}  // namespace paddle

REGISTER_LITE_KERNEL(box_coder,
                     kFPGA,
                     kFP16,
                     kNHWC,
                     paddle::lite::kernels::fpga::BoxCoderCompute,
                     def)
    .BindInput("PriorBox",
               {LiteType::GetTensorTy(
                   TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC))})
    .BindInput("PriorBoxVar",
               {LiteType::GetTensorTy(
                   TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC))})
    .BindInput("TargetBox",
               {LiteType::GetTensorTy(
                   TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC))})
    .BindOutput("OutputBox",
                {LiteType::GetTensorTy(
                    TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC))})
    .Finalize();
lite/kernels/fpga/box_coder_compute.h  (deleted, file mode 100755 → 0)
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "lite/core/kernel.h"
#include "lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace fpga {

class BoxCoderCompute
    : public KernelLite<TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC)> {
 public:
  using param_t = operators::BoxCoderParam;

  void Run() override;

  virtual ~BoxCoderCompute() = default;
};

}  // namespace fpga
}  // namespace kernels
}  // namespace lite
}  // namespace paddle
lite/kernels/fpga/calib_compute.cc
@@ -33,13 +33,6 @@ void CalibComputeFp32ToFP16::Run() {
   const auto* din = param.input->data<float>();
   param.output->mutable_data<float16>();
   param.output->ZynqTensor()->copyFrom(param.input->ZynqTensor());
-  // for (int i = 0; i < param.input->numel(); ++i) {
-  //   dout[i] = zynqmp::float_to_half(din[i]);
-  // }
-  param.input->ZynqTensor()->saveToFile("calib_input.txt");
-  param.output->ZynqTensor()->saveToFile("ouput_31.txt");
-  param.output->ZynqTensor()->printScale("calib");
   auto out_lod = param.output->mutable_lod();
   *out_lod = param.input->lod();
   return;
@@ -53,13 +46,7 @@ void CalibComputeFP16ToFp32::Run() {
   auto& param = this->Param<operators::CalibParam>();
   const auto* din = param.input->data<float16>();
   auto* dout = param.output->mutable_data<float>();
-  // for (int i = 0; i < param.input->numel(); ++i) {
-  //   dout[i] = zynqmp::half_to_float(din[i]);
-  // }
   param.output->ZynqTensor()->copyFrom(param.input->ZynqTensor());
-  param.output->ZynqTensor()->saveToFile("ouput_13.txt");
   auto out_lod = param.output->mutable_lod();
   *out_lod = param.input->lod();
   return;
lite/kernels/fpga/concat_compute.cc
@@ -12,13 +12,15 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

+#include "lite/kernels/fpga/concat_compute.h"
 #include <string>
 #include <vector>
-#include "lite/kernels/fpga/concat_compute.h"
 #include "lite/core/op_registry.h"
 #include "lite/core/tensor.h"
 #include "lite/core/type_system.h"
+#include "lite/backends/fpga/KD/debugger.hpp"

 namespace paddle {
 namespace lite {
 namespace kernels {
@@ -43,8 +45,10 @@ void ConcatCompute::PrepareForRun() {
 void ConcatCompute::Run() {
   pe_.dispatch();
 #ifdef FPGA_PRINT_TENSOR
   zynqmp::ConcatParam& concat_param = pe_.param();
-  concat_param.output->saveToFile("concat", true);
+  Debugger::get_instance().registerOutput("concat", concat_param.output);
 #endif
 }

 }  // namespace fpga
lite/kernels/fpga/conv_compute.cc  (file mode 100644 → 100755)
@@ -12,10 +12,13 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

+#include "lite/kernels/fpga/conv_compute.h"
 #include <vector>
-#include "lite/kernels/fpga/conv_compute.h"
 #include "lite/core/op_registry.h"
 #include "lite/core/type_system.h"
+#include "lite/backends/fpga/KD/debugger.hpp"

 namespace paddle {
 namespace lite {
 namespace kernels {
@@ -60,14 +63,9 @@ void ConvCompute::PrepareForRun() {
     fill_scale_bias_const(&conv_param);
     if (param.bias != nullptr) {
       conv_param.bias()->copyFrom(param.bias->ZynqTensor());
-      std::cout << "copy bias\n";
     }
     conv_param.relu.enabled = param.fuse_relu;
-    // conv_param.filter->saveToFile("filter", true);
-    // conv_param.bias()->saveToFile("bias", true);
-    // conv_param.scale()->saveToFile("scale", true);
     conv_pe_.init();
     conv_pe_.apply();
   }
@@ -75,18 +73,15 @@ void ConvCompute::PrepareForRun() {
 void ConvCompute::Run() {
   auto& param = this->Param<param_t>();
-  // std::cout << "in:" << param.x->ZynqTensor()->data<void>() << std::endl;
   if (param.x->ZynqTensor()->shape().channel() != 1 &&
       param.groups == param.x->ZynqTensor()->shape().channel()) {
     dw_conv_pe_.dispatch();
     // param.output->ZynqTensor()->saveToFile("dw", true);
   } else {
-    zynqmp::ConvParam& conv_param = conv_pe_.param();
     conv_pe_.dispatch();
     // conv_param.input->saveToFile("_conv_in", true);
-    conv_param.output->printScale("conv");
-    param.output->ZynqTensor()->saveToFile("_conv", true);
     // conv_param.output->saveToFile("_conv_param", true);
+#ifdef FPGA_PRINT_TENSOR
+    zynqmp::ConvParam& conv_param = conv_pe_.param();
+    Debugger::get_instance().registerOutput("conv", conv_param.output);
+#endif
   }
 }
lite/kernels/fpga/dropout_compute.cc
@@ -12,11 +12,11 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

+#include "lite/kernels/fpga/dropout_compute.h"
 #include <string>
-#include "lite/kernels/fpga/dropout_compute.h"
 #include "lite/backends/fpga/KD/float16.hpp"
-// #include "lite/backends/arm/math/funcs.h"
+#include "lite/backends/fpga/KD/debugger.hpp"

 namespace paddle {
 namespace lite {
@@ -54,12 +54,11 @@ void DropoutCompute::PrepareForRun() {
 }

 void DropoutCompute::Run() {
-  auto& param = Param<operators::DropoutParam>();
-  zynqmp::ScaleParam& scale_param = pe_.param();
-  // scale_param.input->saveToFile("drop_in.txt");
   pe_.dispatch();
-  // scale_param.output->saveToFile("drop_out.txt");
-  // std::cout << "prob:" << param.dropout_prob << std::endl;
+#ifdef FPGA_PRINT_TENSOR
+  zynqmp::ScaleParam& scale_param = pe_.param();
+  Debugger::get_instance().registerOutput("dropout", scale_param.output);
+#endif
 }

 }  // namespace fpga
lite/kernels/fpga/elementwise_compute.cc
@@ -12,9 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

+#include "lite/kernels/fpga/elementwise_compute.h"
 #include <string>
-#include "lite/kernels/fpga/elementwise_compute.h"
 #include "lite/backends/arm/math/funcs.h"
+#include "lite/backends/fpga/KD/debugger.hpp"

 namespace paddle {
 namespace lite {
@@ -39,8 +40,10 @@ void ElementwiseAddCompute::PrepareForRun() {
 }

 void ElementwiseAddCompute::Run() {
   pe_.dispatch();
+#ifdef FPGA_PRINT_TENSOR
+  zynqmp::ElementwiseAddParam& ew_param = pe_.param();
+  // ew_param.output->saveToFile("ew", true);
+  Debugger::get_instance().registerOutput("ew_add", ew_param.output);
+#endif
 }

 void ElementwiseAddActivationCompute::PrepareForRun() {
@@ -59,6 +62,10 @@ void ElementwiseAddActivationCompute::PrepareForRun() {
 }

 void ElementwiseAddActivationCompute::Run() {
   pe_.dispatch();
+#ifdef FPGA_PRINT_TENSOR
+  zynqmp::ElementwiseAddParam& ew_param = pe_.param();
+  Debugger::get_instance().registerOutput("ew_add", ew_param.output);
+#endif
 }

 void ElementwiseMulCompute::PrepareForRun() {
@@ -66,14 +73,8 @@ void ElementwiseMulCompute::PrepareForRun() {
   auto& param = Param<operators::ElementwiseParam>();
   param.Out->mutable_data<float16>();
   scale_param.input = param.X->ZynqTensor();
   scale_param.output = param.Out->ZynqTensor();
-  // param.Y->ZynqTensor()->saveToFile("scale_y", true);
-  std::cout << "y_production:" << param.Y->dims().production() << std::endl;
-  // exit(-1);
   scale_param.relu.enabled = false;
@@ -85,39 +86,26 @@ void ElementwiseMulCompute::PrepareForRun() {
   zynqmp::Shape shape(zynqmp::N, {channel});
   float* scale_data = scale->mutableData<float>(zynqmp::FP32, shape);
   float* bias_data = bias->mutableData<float>(zynqmp::FP32, shape);
   float scale_value = param.Y->data<float>()[0];
-  std::cout << "scale_value:" << scale_value << std::endl;
-  std::cout << "channel:" << channel << std::endl;
-  std::cout << "data_type:" << param.Y->ZynqTensor()->dataType() << std::endl;
-  // exit(-1);

   for (int i = 0; i < channel; ++i) {
     if (param.Y->dims().production() != 1) {
       scale_value = param.Y->ZynqTensor()->data<float>()[i];
     }
     scale_data[i] = scale_value;
     bias_data[i] = 0;
   }

   pe_.init();
   pe_.apply();
   // scale_param.input->saveToFile("scale_input", true);
   // scale_param.scale->saveToFile("scale_scale", true);
-  param.Y->ZynqTensor()->saveToFile("ew_y", true);
-  // exit(-1);
 }

 void ElementwiseMulCompute::Run() {
   pe_.dispatch();
+#ifdef FPGA_PRINT_TENSOR
+  zynqmp::ScaleParam& scale_param = pe_.param();
+  // scale_param.output->saveToFile("ew_mul", true);
+  // exit(-1);
+  Debugger::get_instance().registerOutput("ew_mul", scale_param.output);
+#endif
 }

 }  // namespace fpga
lite/kernels/fpga/fc_compute.cc
@@ -15,6 +15,7 @@
 #include "lite/kernels/fpga/fc_compute.h"
 #include "lite/core/op_registry.h"
 #include "lite/core/type_system.h"
+#include "lite/backends/fpga/KD/debugger.hpp"

 namespace paddle {
 namespace lite {
@@ -30,7 +31,6 @@ void FcCompute::PrepareForRun() {
   zynqmp::FullyConnectedParam& fc_param = pe_.param();

   param.output->mutable_data<float16>();
   fc_param.input = param.input->ZynqTensor();
   fc_param.output = param.output->ZynqTensor();
   fc_param.filter = param.w->ZynqTensor();
@@ -42,8 +42,10 @@ void FcCompute::PrepareForRun() {
 void FcCompute::Run() {
   pe_.dispatch();
+#ifdef FPGA_PRINT_TENSOR
+  zynqmp::FullyConnectedParam& fc_param = pe_.param();
+  // fc_param.output->saveToFile("fc", true);
+  Debugger::get_instance().registerOutput("fc", fc_param.output);
+#endif
 }

 }  // namespace fpga
lite/kernels/fpga/feed_compute.cc
@@ -15,6 +15,7 @@
 #include "lite/kernels/fpga/feed_compute.h"
 #include "lite/core/op_registry.h"
 #include "lite/core/type_system.h"
+#include "lite/backends/fpga/KD/debugger.hpp"

 namespace paddle {
 namespace lite {
@@ -37,24 +38,17 @@ void FeedCompute::PrepareForRun() {
 }

 void FeedCompute::Run() {
-  std::cout << "================= FeedCompute =================\n";
   auto& param = this->Param<param_t>();
   Tensor& x = param.feed_list->at(param.col);
-  zynqmp::InputParam& feed_param = pe_.param();
-  if (x.dims().production() == 7590) {
-    feed_param.input->readFromFile("position_encoding.data");
-    feed_param.input->saveToFile("read.txt");
-  }
   pe_.dispatch();
   auto out_lod = param.out->mutable_lod();
   *out_lod = x.lod();
-  feed_param.input->saveToFile("feed_in.txt");
-  feed_param.output->saveToFile("feed.txt");
+#ifdef FPGA_PRINT_TENSOR
+  zynqmp::InputParam& feed_param = pe_.param();
+  Debugger::get_instance().registerOutput("feed", feed_param.output);
+#endif
 }

 }  // namespace fpga
lite/kernels/fpga/fetch_compute.cc
@@ -14,6 +14,7 @@
 #include "lite/kernels/fpga/fetch_compute.h"
 #include "lite/core/op_registry.h"
 #include "lite/core/type_system.h"
+#include "lite/backends/fpga/KD/debugger.hpp"

 namespace paddle {
 namespace lite {
@@ -25,7 +26,7 @@ using float16 = zynqmp::float16;
 void FetchCompute::PrepareForRun() {
   auto& param = this->Param<param_t>();
   // ====================================================
-  zynqmp::OutputParam& conv_param = pe_.param();
+  zynqmp::OutputParam& fetch_param = pe_.param();

   auto fetch_list = param.fetch_list;
   if (fetch_list->size() <= static_cast<size_t>(param.col)) {
     fetch_list->resize(param.col + 1);
@@ -34,8 +35,8 @@ void FetchCompute::PrepareForRun() {
   out.Resize(param.input->dims());
   out.mutable_data<float>();
-  conv_param.input = param.input->ZynqTensor();
-  conv_param.output = out.ZynqTensor();
+  fetch_param.input = param.input->ZynqTensor();
+  fetch_param.output = out.ZynqTensor();

   pe_.init();
   pe_.apply();
@@ -44,8 +45,11 @@ void FetchCompute::PrepareForRun() {
 void FetchCompute::Run() {
   pe_.dispatch();
   auto& param = this->Param<param_t>();
-  zynqmp::OutputParam& conv_param = pe_.param();
-  conv_param.output->saveToFile("fetch", true);
+#ifdef FPGA_PRINT_TENSOR
+  zynqmp::OutputParam& fetch_param = pe_.param();
+  Debugger::get_instance().registerOutput("fetch", fetch_param.output);
+#endif
 }

 }  // namespace fpga
lite/kernels/fpga/mul_compute.cc
@@ -12,12 +12,13 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

+#include "lite/kernels/fpga/mul_compute.h"
 #include <vector>
-// #include "lite/backends/arm/math/funcs.h"
-#include "lite/kernels/fpga/mul_compute.h"
 #include "lite/core/op_registry.h"
 #include "lite/core/type_system.h"
+#include "lite/backends/fpga/KD/debugger.hpp"

 namespace paddle {
 namespace lite {
 namespace kernels {
@@ -37,7 +38,6 @@ void MulCompute::PrepareForRun() {
   fc_param.output = param.output->ZynqTensor();
   fc_param.filter = param.y->ZynqTensor();
-  // fc_param.bias = param.bias->ZynqTensor();
   fc_param.bias = &bias_;
   int channel = fc_param.filter->shape().channel();
@@ -59,15 +59,7 @@ void mul(MulCompute* k) {
   int fn = param.y->dims()[1];
-  std::cout << "num: " << num << std::endl;
-  std::cout << "channel: " << channel << std::endl;
-  std::cout << "fn: " << fn << std::endl;
-  param.y->ZynqTensor()->saveToFile("filter.txt");
   float16* out_data = param.output->mutable_data<float16>();
   // int si = 0;
   int g_index = 0;
   for (int n = 0; n < 1; n++) {
@@ -77,12 +69,10 @@ void mul(MulCompute* k) {
     for (int c = 0; c < channel; c++) {
       float value = zynqmp::half_to_float(param.x->data<float16>()[si]);
       int index = c * fn + on;
       // std::cout << "index: " << index << std::endl;
       float weight = param.y->data<float>()[index];
       sum += value * weight;
       si++;
     }
-    std::cout << sum << "\n";
     out_data[g_index] = zynqmp::float_to_half(sum);
     g_index++;
   }
@@ -91,37 +81,12 @@ void mul(MulCompute* k) {
 void MulCompute::Run() {
   // auto& param = Param<param_t>();
-  zynqmp::FullyConnectedParam& fc_param = pe_.param();
-  std::cout << "1\n";
   // fc_param.input->readFromFile("arm_8_im_in.data");
   // fc_param.input->flush();
-  float16* data_in = fc_param.input->data<float16>();
   // float16 one = zynqmp::float_to_half(1.0f);
   // for (int i = 0; i < fc_param.input->shape().alignedElementCount(); i++) {
   //   data_in[i] = one;
   // }
   // fc_param.input->scale()[0] = 1.0 / 127;
   // fc_param.input->scale()[1] = 127;
   pe_.dispatch();
-  // std::cout << "2\n";
-  // fc_param.input->printScale("mul");
-  // std::cout << "3\n";
-  fc_param.input->saveToFile("mul_in.txt");
-  // std::cout << "4\n";
-  // mul(this);
-  // std::cout << "5\n";
-  fc_param.output->saveToFile("mul_out.txt");
-  // exit(-1);
   // exit(-1);
   // fc_param.output->saveToFile("mul.txt");
   // Tensor* output = const_cast<Tensor*>(param.output);
   // const auto* x_data = param.x->data<float>();
   // param.y->mutable_data<float16>();
   // param.output->mutable_data<float16>();
+#ifdef FPGA_PRINT_TENSOR
+  zynqmp::FullyConnectedParam& fc_param = pe_.param();
+  Debugger::get_instance().registerOutput("mul", fc_param.output);
+#endif
 }

 }  // namespace fpga
lite/kernels/fpga/multiclass_nms_compute.cc
@@ -195,17 +195,13 @@ void MultiClassNMS(const operators::MulticlassNmsParam& param,
   T score_threshold = static_cast<T>(param.score_threshold);

   int num_det = 0;

   int64_t class_num = scores_size == 3 ? scores.dims()[0] : scores.dims()[1];
-  // scores.ZynqTensor()->saveToFile("nms_scores", true);
   for (int64_t c = 0; c < class_num; ++c) {
     Tensor bbox_slice, score_slice;
     if (c == background_label) continue;
     if (scores_size == 3) {
       scores.Slice<T>(score_slice, c, c + 1);
-      // score_slice.ZynqTensor()->saveToFile("nms_slice", true);
       bbox_slice = bboxes;
     } else {
       score_slice.Resize({scores.dims()[0], 1});
@@ -387,27 +383,19 @@ void MulticlassNmsCompute::Run() {
       if (e > s) {
         Tensor out;
         outs->Slice<float>(out, s, e);
         // scores_slice.ZynqTensor()->saveToFile("scores_slice", true);
         MultiClassOutput<float>(
             scores_slice, boxes_slice, all_indices[i], score_dims.size(), &out);
-        out.ZynqTensor()->saveToFile("out", true);
         outs->ZynqTensor()->copyFrom(out.ZynqTensor());
       }
     }
   }

-  // save_tensor(param.scores, "_scores.txt", false);
-  // save_tensor(param.bboxes, "_bboxes.txt", false);
-  boxes->ZynqTensor()->saveToFile("_boxes", true);
-  scores->ZynqTensor()->saveToFile("_scores", true);
-  outs->ZynqTensor()->saveToFile("_outs", true);
   LoD lod;
   lod.emplace_back(batch_starts);
   outs->set_lod(lod);
+#ifdef FPGA_PRINT_TENSOR
+  Debugger::get_instance().registerOutput("nms", outs->ZynqTensor());
+#endif
 }

 }  // namespace host
 }  // namespace kernels
lite/kernels/fpga/norm_compute.cc
@@ -13,8 +13,7 @@
 // limitations under the License.

 #include "lite/kernels/fpga/norm_compute.h"
-// #include "lite/backends/arm/math/funcs.h"
+#include "lite/backends/fpga/KD/debugger.hpp"

 namespace paddle {
 namespace lite {
@@ -27,7 +26,6 @@ void NormCompute::PrepareForRun() {
   auto& param = this->Param<operators::NormParam>();
   param.Out->mutable_data<float16>();

   zynqmp::NormParam& norm_param = pe_.param();
   norm_param.input = param.X->ZynqTensor();
   norm_param.output = param.Out->ZynqTensor();
@@ -39,20 +37,10 @@ void NormCompute::PrepareForRun() {
 void NormCompute::Run() {
   pe_.dispatch();
-  pe_.param().output->saveToFile("norm.txt", true);
-  // auto& ctx = this->ctx_->template As<ARMContext>();
-  // auto& param = this->Param<operators::NormParam>();
-  // auto input_dims = param.X->dims();
-  // int dim_size = param.X->dims().size();
-  // auto axis = (param.axis < 0) ? param.axis + dim_size : param.axis;
-  // const auto* x_data = param.X->data<float>();
-  // auto* o_data = param.Out->mutable_data<float>();
-  // int pre_n = input_dims.count(0, axis);
-  // int post_n = input_dims.count(axis + 1, dim_size);
-  // int n = input_dims[axis];
-  // lite::arm::math::norm(x_data, pre_n, n, post_n, param.epsilon, o_data, &ctx);
+#ifdef FPGA_PRINT_TENSOR
+  zynqmp::NormParam& norm_param = pe_.param();
+  Debugger::get_instance().registerOutput("norm", norm_param.output);
+#endif
 }

 }  // namespace fpga
lite/kernels/fpga/pooling_compute.cc
@@ -12,12 +12,14 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

+#include "lite/kernels/fpga/pooling_compute.h"
 #include <string>
 #include <vector>
-#include "lite/kernels/fpga/pooling_compute.h"
 #include "lite/core/op_registry.h"
 #include "lite/core/type_system.h"
+#include "lite/backends/fpga/KD/debugger.hpp"

 namespace paddle {
 namespace lite {
 namespace kernels {
@@ -47,9 +49,10 @@ void PoolCompute::PrepareForRun() {
 void PoolCompute::Run() {
   pe_.dispatch();
 #ifdef FPGA_PRINT_TENSOR
   zynqmp::PoolingParam& pool_param = pe_.param();
-  pool_param.output->printScale("pooling");
-  pool_param.output->saveToFile("pool", true);
+  Debugger::get_instance().registerOutput("pooling", pool_param.output);
 #endif
 }

 }  // namespace fpga
lite/kernels/fpga/while_compute.cc  (deleted, file mode 100755 → 0)
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/fpga/while_compute.h"
#include <memory>
#include <string>
#include <vector>
#include "lite/backends/arm/math/funcs.h"
#include "lite/core/tensor.h"
#include "lite/core/type_system.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace fpga {

void WhileCompute::PrepareForRun() {
  auto &param = Param<operators::WhileParam>();
  auto cur_scope = param.scope;
  executor_ =
      std::make_shared<StepExecutor>(param.sub_block, cur_scope, place());
}

void WhileCompute::Run() {
  auto &param = Param<operators::WhileParam>();
  while (param.cond->data<bool>()[0]) {
    executor_->Run();
  }
}

}  // namespace fpga
}  // namespace kernels
}  // namespace lite
}  // namespace paddle

REGISTER_LITE_KERNEL(while,
                     kFPGA,
                     kFP16,
                     kNHWC,
                     paddle::lite::kernels::fpga::WhileCompute,
                     def)
    .BindInput("X",
               {LiteType::GetTensorTy(
                   TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC))})
    .BindInput("Condition",
               {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kBool))})
    .BindOutput("Out",
                {LiteType::GetTensorTy(
                    TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC))})
    .BindOutput("StepScopes", {LiteType::GetTensorTy(TARGET(kARM))})
    .Finalize();
lite/kernels/fpga/while_compute.h  (deleted, file mode 100755 → 0)
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <algorithm>
#include <memory>
#include <vector>
#include "lite/core/kernel.h"
#include "lite/core/op_registry.h"
#include "lite/operators/while_op.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace fpga {

class StepExecutor {
  typedef std::shared_ptr<OpLite> OpPtr;

 public:
  StepExecutor(cpp::BlockDesc *block, Scope *scope, Place place)
      : scope_(scope), place_(place) {
    int32_t op_size = block->OpsSize();
    for (int32_t i = 0; i < op_size; ++i) {
      auto &op_desc = *block->template GetOp<cpp::OpDesc>(i);
      auto op_type = op_desc.Type();
      auto op_handler = lite::LiteOpRegistry::Global().Create(op_desc.Type());
      VLOG(LOG_INFO) << "while: creating Op [" << op_type << "]";
      op_handler->Attach(op_desc, scope);

      auto hostplace = place_;
      hostplace.target = TARGET(kHost);
      auto kernels = op_handler->CreateKernels({place_, hostplace});
      CHECK_GT(kernels.size(), 0) << "cannot create kernel";
      op_handler->AttachKernel(kernels[0].get());
      op_handler->SetKernel(kernels);
      ops_of_block_.push_back(op_handler);
    }
  }

  void Run() {
    for (auto &op_handler : ops_of_block_) {
      // VLOG(4) << op_handler->op_info()->Repr();
      op_handler->InferShape();
      // VLOG(4) << "while: infered shape";
      op_handler->Run();
    }
  }

 private:
  Scope *scope_;
  Place place_;
  std::vector<OpPtr> ops_of_block_;
};

class WhileCompute
    : public KernelLite<TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC)> {
 public:
  using param_t = operators::WhileParam;

  void Run() override;
  void PrepareForRun() override;

  virtual ~WhileCompute() = default;

 private:
  std::shared_ptr<StepExecutor> executor_;
};

}  // namespace fpga
}  // namespace kernels
}  // namespace lite
}  // namespace paddle
lite/kernels/fpga/write_to_array_compute.cc  (deleted, file mode 100755 → 0)
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/fpga/write_to_array_compute.h"
#include "lite/backends/arm/math/funcs.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace fpga {

void WriteToArrayCompute::PrepareForRun() {}

void WriteToArrayCompute::Run() {
  auto &ctx = this->ctx_->template As<ARMContext>();
  auto &param = this->Param<operators::WriteToArrayParam>();
  CHECK_EQ(param.I->numel(), 1) << "input2 should have only one element";

  const auto *x_data = param.X->data<float>();
  int id = param.I->data<int>()[0];
  int id_test = param.I->data<int64_t>()[0];
  if (id >= param.Out->size()) {
    for (int i = param.Out->size(); i < id + 1; i++) {
      lite::Tensor tmp;
      param.Out->push_back(tmp);
    }
  }

  (*param.Out)[id].Resize(param.X->dims());
  auto out_lod = (*param.Out)[id].mutable_lod();
  *out_lod = param.X->lod();
  auto *o_data = (*param.Out)[id].mutable_data<float>(TARGET(kHost));
  int input_size = param.X->numel();
  memcpy(o_data, x_data, sizeof(float) * input_size);
}

}  // namespace fpga
}  // namespace kernels
}  // namespace lite
}  // namespace paddle

REGISTER_LITE_KERNEL(write_to_array,
                     kFPGA,
                     kFP16,
                     kNHWC,
                     paddle::lite::kernels::fpga::WriteToArrayCompute,
                     def)
    .BindInput("X",
               {LiteType::GetTensorTy(
                   TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC))})
    .BindInput("I", {LiteType::GetTensorTy(TARGET(kARM))})
    .BindOutput("Out",
                {LiteType::GetTensorTy(
                    TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC))})
    .Finalize();
lite/kernels/fpga/write_to_array_compute.h  (deleted, file mode 100755 → 0)
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <stdint.h>
#include "lite/backends/arm/math/type_trans.h"
#include "lite/core/kernel.h"
#include "lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace fpga {

class WriteToArrayCompute
    : public KernelLite<TARGET(kFPGA), PRECISION(kFP16), DATALAYOUT(kNHWC)> {
 public:
  using param_t = operators::WriteToArrayParam;

  void PrepareForRun() override;

  void Run() override;

  ~WriteToArrayCompute() {}

 private:
};

}  // namespace fpga
}  // namespace kernels
}  // namespace lite
}  // namespace paddle