Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
Paddle-Lite
提交
84ebc523
P
Paddle-Lite
项目概览
PaddlePaddle
/
Paddle-Lite
通知
331
Star
4
Fork
1
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
271
列表
看板
标记
里程碑
合并请求
78
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle-Lite
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
271
Issue
271
列表
看板
标记
里程碑
合并请求
78
合并请求
78
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
84ebc523
编写于
9月 26, 2018
作者:
L
liuruilong
浏览文件
操作
浏览文件
下载
差异文件
Merge remote-tracking branch 'upstream/develop' into develop
上级
3555b293
3b9d9819
变更
18
展开全部
显示空白变更内容
内联
并排
Showing
18 changed file
with
666 addition
and
387 deletion
+666
-387
python/tools/imagetools/imagetools.py
python/tools/imagetools/imagetools.py
+61
-0
python/tools/imagetools/img2nchw.py
python/tools/imagetools/img2nchw.py
+69
-0
python/tools/imagetools/img2nhwc.py
python/tools/imagetools/img2nhwc.py
+34
-0
python/tools/imagetools/numpy2binary.py
python/tools/imagetools/numpy2binary.py
+47
-0
python/tools/mdl2fluid/model_combine.py
python/tools/mdl2fluid/model_combine.py
+19
-0
python/tools/mdl2fluid/swicher.py
python/tools/mdl2fluid/swicher.py
+29
-6
src/fpga/api.cpp
src/fpga/api.cpp
+36
-31
src/io/executor.cpp
src/io/executor.cpp
+11
-1
src/operators/feed_op.h
src/operators/feed_op.h
+1
-1
src/operators/kernel/central-arm-func/conv_add_arm_func.h
src/operators/kernel/central-arm-func/conv_add_arm_func.h
+6
-3
src/operators/kernel/fpga/softmax_kernel.cpp
src/operators/kernel/fpga/softmax_kernel.cpp
+2
-1
src/operators/math/depthwise_conv_3x3.cpp
src/operators/math/depthwise_conv_3x3.cpp
+97
-0
src/operators/math/depthwise_conv_3x3.h
src/operators/math/depthwise_conv_3x3.h
+3
-0
src/operators/op_param.h
src/operators/op_param.h
+121
-335
test/CMakeLists.txt
test/CMakeLists.txt
+28
-9
test/fpga/test_resnet50.cpp
test/fpga/test_resnet50.cpp
+39
-0
test/net/test_yolo_combined.cpp
test/net/test_yolo_combined.cpp
+60
-0
test/test_helper.h
test/test_helper.h
+3
-0
未找到文件。
python/tools/imagetools/imagetools.py
0 → 100644
浏览文件 @
84ebc523
# coding=utf-8
import
cv2
from
array
import
array
def
resize_take_rgbs
(
path
,
shape_h_w
):
print
'--------------resize_take_rgbs-----------------begin'
image
=
cv2
.
imread
(
path
)
# print image.shape
cv2
.
imshow
(
"before"
,
image
)
print_rgb
(
image
[
0
,
0
])
# image len may be for .just check it
# image.resize(shape_h_w)
image
=
cv2
.
resize
(
image
,
(
shape_h_w
[
0
],
shape_h_w
[
1
]))
cv2
.
imshow
(
"after"
,
image
)
print
image
.
shape
height
=
shape_h_w
[
0
]
width
=
shape_h_w
[
1
]
rs_
=
[]
gs_
=
[]
bs_
=
[]
for
h
in
range
(
0
,
height
):
for
w
in
range
(
0
,
width
):
bs_
.
append
(
image
[
h
,
w
,
0
])
gs_
.
append
(
image
[
h
,
w
,
1
])
rs_
.
append
(
image
[
h
,
w
,
2
])
# print image[2, 2, 0]/255.
print
len
(
bs_
)
print
len
(
gs_
)
print
len
(
rs_
)
print
'--------------resize_take_rgbs-----------------end'
return
bs_
,
gs_
,
rs_
def
print_rgb
((
b
,
g
,
r
)):
print
"像素 - R:%d,G:%d,B:%d"
%
(
r
,
g
,
b
)
# 显示像素值
#
# image[0, 0] = (100, 150, 200) # 更改位置(0,0)处的像素
#
# (b, g, r) = image[0, 0] # 再次读取(0,0)像素
# print "位置(0,0)处的像素 - 红:%d,绿:%d,蓝:%d" % (r, g, b) # 显示更改后的像素值
#
# corner = image[0:100, 0:100] # 读取像素块
# cv2.imshow("Corner", corner) # 显示读取的像素块
#
# image[0:100, 0:100] = (0, 255, 0); # 更改读取的像素块
#
# cv2.imshow("Updated", image) # 显示图像
#
# cv2.waitKey(0) # 程序暂停
def
save_to_file
(
to_file_name
,
array
):
to_file
=
open
(
to_file_name
,
"wb"
)
array
.
tofile
(
to_file
)
to_file
.
close
()
python/tools/imagetools/img2nchw.py
0 → 100644
浏览文件 @
84ebc523
# coding=utf-8
import
cv2
from
array
import
array
import
imagetools
as
tools
from
enum
import
Enum
class
ChannelType
(
Enum
):
RGB
=
0
,
BGR
=
1
def
combine_bgrs_nchw
(
bgrs
,
means_b_g_r
,
scale
,
channel_type
=
ChannelType
.
BGR
):
print
'--------------combine_bgrs_nchw-----------------begin'
print
"scale: %f"
%
scale
print
means_b_g_r
# print len(bgrs)
bs
=
bgrs
[
0
]
gs
=
bgrs
[
1
]
rs
=
bgrs
[
2
]
assert
len
(
bs
)
==
len
(
gs
)
==
len
(
rs
)
print
len
(
bs
)
bgrs_float_array
=
array
(
'f'
)
if
channel_type
==
ChannelType
.
BGR
:
print
'bgr'
for
i
in
range
(
0
,
len
(
bs
)):
bgrs_float_array
.
append
((
bs
[
i
]
-
means_b_g_r
[
0
])
*
scale
)
# b
for
i
in
range
(
0
,
len
(
gs
)):
bgrs_float_array
.
append
((
gs
[
i
]
-
means_b_g_r
[
1
])
*
scale
)
# g
for
i
in
range
(
0
,
len
(
rs
)):
bgrs_float_array
.
append
((
rs
[
i
]
-
means_b_g_r
[
2
])
*
scale
)
# r
elif
channel_type
==
ChannelType
.
RGB
:
print
'rgb'
for
i
in
range
(
0
,
len
(
rs
)):
bgrs_float_array
.
append
((
rs
[
i
]
-
means_b_g_r
[
2
])
*
scale
)
# r
for
i
in
range
(
0
,
len
(
gs
)):
bgrs_float_array
.
append
((
gs
[
i
]
-
means_b_g_r
[
1
])
*
scale
)
# g
for
i
in
range
(
0
,
len
(
bs
)):
bgrs_float_array
.
append
((
bs
[
i
]
-
means_b_g_r
[
0
])
*
scale
)
# b
print
len
(
bgrs_float_array
)
print
'------------------'
print
bgrs_float_array
[
0
]
print
bgrs_float_array
[
416
*
416
*
2
+
416
*
2
+
2
]
# for i in range(0, 9):
# print'bs %d' % i
# print bs[i] / 255.
print
bs
[
416
*
2
+
2
]
/
255.
print
'--------------combine_bgrs_nchw-----------------end'
return
bgrs_float_array
# bgrs = tools.resize_take_rgbs('banana.jpeg', (224, 224, 3))
# array = combine_bgrs_nchw(bgrs, (103.94, 116.78, 123.68), 0.017, array,ChannelType.BGR)
# tools.save_to_file('banana_1_3_224_224_nchw_float')
# cv2.waitKey(0)
bgrs
=
tools
.
resize_take_rgbs
(
'datas/newyolo.jpg'
,
(
416
,
416
,
3
))
array
=
combine_bgrs_nchw
(
bgrs
,
(
0
,
0
,
0
),
1.
/
255
,
ChannelType
.
RGB
)
tools
.
save_to_file
(
'datas/desktop_1_3_416_416_nchw_float'
,
array
)
python/tools/imagetools/img2nhwc.py
0 → 100644
浏览文件 @
84ebc523
# coding=utf-8
import
cv2
from
array
import
array
import
imagetools
as
tools
def
combine_bgrs_nhwc
(
bgrs
,
means_b_g_r
,
scale
):
print
"scale: %f"
%
scale
print
means_b_g_r
# print len(bgrs)
bs
=
bgrs
[
0
]
gs
=
bgrs
[
1
]
rs
=
bgrs
[
2
]
assert
len
(
bs
)
==
len
(
gs
)
==
len
(
rs
)
# print len(bs)
bgrs_float_array
=
array
(
'f'
)
for
i
in
range
(
0
,
len
(
bs
)):
bgrs_float_array
.
append
((
rs
[
i
]
-
means_b_g_r
[
2
])
*
scale
)
# r
bgrs_float_array
.
append
((
gs
[
i
]
-
means_b_g_r
[
1
])
*
scale
)
# g
bgrs_float_array
.
append
((
bs
[
i
]
-
means_b_g_r
[
0
])
*
scale
)
# b
print
len
(
bgrs_float_array
)
print
'------------------'
print
bgrs_float_array
[
0
]
print
bgrs_float_array
[
999
]
return
bgrs_float_array
bgrs
=
tools
.
resize_take_rgbs
(
'newyolo_1.jpg'
,
(
416
,
416
,
3
))
array
=
combine_bgrs_nhwc
(
bgrs
,
(
0
,
0
,
0
),
1.0
/
255
)
tools
.
save_to_file
(
'desktop_1_3_416_416_nhwc_float'
,
array
)
cv2
.
waitKey
(
0
)
python/tools/imagetools/numpy2binary.py
0 → 100644
浏览文件 @
84ebc523
# coding=utf-8
# 这个脚本是可以将numpy合并到二进制
import
cv2
import
numpy
as
np
import
imagetools
as
tools
from
array
import
array
#
# image = cv2.imread(path)
# print image.shape
#
# print_rgb(image[0, 0])
# # image len may be for .just check it
# image.resize(shape_h_w)
data
=
np
.
fromfile
(
'datas/img.res'
)
print
data
.
size
print
data
[
0
]
data
.
reshape
(
1
,
3
,
416
,
416
)
out_array
=
array
(
'f'
)
print
'--------------------'
print
data
.
size
print
data
[
0
]
print
'如果是nhwc --------'
# rgb rgb rgb rgb rgb
print
data
[
416
*
3
*
2
+
3
*
2
+
2
]
# print data[2]
print
'如果是nchw --------'
# rgb rgb rgb rgb rgb
print
data
[
416
*
416
*
2
+
416
*
2
+
2
]
# print data[2]
# 明明是nchw
for
i
in
range
(
0
,
data
.
size
):
out_array
.
append
(
data
[
i
])
print
len
(
out_array
)
print
out_array
[
416
*
416
*
2
+
416
*
2
+
2
]
tools
.
save_to_file
(
'datas/in_put_1_3_416_416_2'
,
out_array
)
python/tools/mdl2fluid/model_combine.py
0 → 100644
浏览文件 @
84ebc523
# coding=utf-8
import
os
path
=
"yolo_v2_tofile_source/"
# 文件夹目录
to_file_path
=
"yolo_v2_tofile_combined/params"
files
=
os
.
listdir
(
path
)
# 得到文件夹下的所有文件名称
files
.
sort
(
cmp
=
None
,
key
=
str
.
lower
)
to_file
=
open
(
to_file_path
,
"wb"
)
for
file
in
files
:
# 遍历文件夹
if
not
os
.
path
.
isdir
(
file
):
# 判断是否是文件夹,不是文件夹才打开
f
=
open
(
path
+
"/"
+
file
)
# 打开文件
name
=
f
.
name
print
'name: '
+
name
from_file
=
open
(
name
,
"rb"
)
to_file
.
write
(
from_file
.
read
())
from_file
.
close
()
to_file
.
close
()
python/tools/mdl2fluid/swicher.py
浏览文件 @
84ebc523
...
...
@@ -66,7 +66,7 @@ class Swichter:
def
read_head
(
self
,
head_file
):
from_file
=
open
(
head_file
,
"rb"
)
read
=
from_file
.
read
(
2
0
)
read
=
from_file
.
read
(
2
4
)
# print read
from_file
.
close
()
# print read
...
...
@@ -84,9 +84,32 @@ class Swichter:
to_file
.
close
()
pass
def
copy_padding_add_head
(
self
,
from_file_name
,
to_file_name
,
tmp_file_name
,
padding
):
print
'padding = %d'
%
padding
from_file
=
open
(
from_file_name
,
"rb"
)
# print len(from_file.read())
from_file
.
seek
(
padding
,
0
)
read
=
from_file
.
read
()
print
len
(
read
)
to_file
=
open
(
to_file_name
,
"wb"
)
# tmp_file = open(tmp_file_name, "wb")
head
=
self
.
read_head
(
'/Users/xiebaiyuan/PaddleProject/paddle-mobile/python/tools/mdl2fluid/yolo/conv1_biases'
)
to_file
.
write
(
head
)
to_file
.
write
(
read
)
from_file
.
close
()
to_file
.
close
()
pass
# Swichter().nhwc2nchw_one_slice_add_head(
# '/Users/xiebaiyuan/PaddleProject/paddle-mobile/python/tools/mdl2fluid/multiobjects/float32s_nhwc/conv1_0.bin',
# '/Users/xiebaiyuan/PaddleProject/paddle-mobile/python/tools/mdl2fluid/multiobjects/float32s_nchw_with_head/conv1_0',
# '/Users/xiebaiyuan/PaddleProject/paddle-mobile/python/tools/mdl2fluid/multiobjects/float32s_nchw/.tmp',
# 32,
# 3, 3, 3)
# Swichter().read_head('/Users/xiebaiyuan/PaddleProject/paddle-mobile/python/tools/mdl2fluid/yolo/conv1_biases')
# Swichter().nhwc2nchw_one_slice(
# '/Users/xiebaiyuan/PaddleProject/paddle-mobile/python/tools/mdl2fluid/multiobjects/float32s_nhwc/conv5_6_dw_0.bin',
# '/Users/xiebaiyuan/PaddleProject/paddle-mobile/python/tools/mdl2fluid/multiobjects/float32s_nchw/conv5_6_dw_0', 1,
# 512, 3, 3)
Swichter
().
read_head
(
'/Users/xiebaiyuan/PaddleProject/paddle-mobile/python/tools/mdl2fluid/yolo/conv1_biases'
)
# Swichter().copy_add_head('datas/model.0.0.weight', 'datas/conv1_0', '')
src/fpga/api.cpp
浏览文件 @
84ebc523
...
...
@@ -29,9 +29,7 @@ namespace fpga {
static
int
fd
=
-
1
;
static
const
char
*
device_path
=
"/dev/fpgadrv0"
;
#ifdef PADDLE_MOBILE_OS_LINUX
static
std
::
map
<
void
*
,
size_t
>
memory_map
;
#endif
static
inline
int
do_ioctl
(
int
req
,
const
void
*
arg
)
{
#ifdef PADDLE_MOBILE_OS_LINUX
...
...
@@ -53,32 +51,38 @@ int open_device() {
// memory management;
void
*
fpga_malloc
(
size_t
size
)
{
static
uint64_t
counter
=
0
;
counter
+=
size
;
DLOG
<<
size
<<
" bytes allocated. Total "
<<
counter
<<
" bytes"
;
#ifdef PADDLE_MOBILE_OS_LINUX
auto
ptr
=
mmap64
(
nullptr
,
size
,
PROT_READ
|
PROT_WRITE
,
MAP_SHARED
,
fd
,
0
);
memory_map
.
insert
(
std
::
make_pair
(
ptr
,
size
));
return
ptr
;
#else
return
malloc
(
size
);
auto
ptr
=
malloc
(
size
);
#endif
counter
+=
size
;
memory_map
.
insert
(
std
::
make_pair
(
ptr
,
size
));
DLOG
<<
"Address: "
<<
ptr
<<
", "
<<
size
<<
" bytes allocated. Total "
<<
counter
<<
" bytes"
;
return
ptr
;
}
void
fpga_free
(
void
*
ptr
)
{
#ifdef PADDLE_MOBILE_OS_LINUX
static
uint64_t
counter
=
0
;
size_t
size
=
0
;
auto
iter
=
memory_map
.
find
(
ptr
);
// std::map<void *, size_t>::iterator
if
(
iter
!=
memory_map
.
end
())
{
size
=
iter
->
second
;
munmap
(
ptr
,
size
);
memory_map
.
erase
(
iter
);
}
counter
+=
size
;
DLOG
<<
size
<<
" bytes freed. Total "
<<
counter
<<
" bytes"
;
#ifdef PADDLE_MOBILE_OS_LINUX
munmap
(
ptr
,
size
);
#else
free
(
ptr
);
#endif
counter
+=
size
;
DLOG
<<
"Address: "
<<
ptr
<<
", "
<<
size
<<
" bytes freed. Total "
<<
counter
<<
" bytes"
;
}
else
{
DLOG
<<
"Invalid pointer"
;
}
}
void
fpga_copy
(
void
*
dest
,
const
void
*
src
,
size_t
num
)
{
...
...
@@ -211,7 +215,8 @@ int PerformBypass(const struct BypassArgs &args) {
int
ComputeFPGAConcat
(
const
struct
ConcatArgs
&
args
)
{
#ifdef FPGA_TEST_MODE
DLOG
<<
"=============ComputeFpgaConcat==========="
;
DLOG
<<
" out_address:"
<<
args
.
image_out
DLOG
<<
" Image_num: "
<<
args
.
image_num
<<
" out_address:"
<<
args
.
image_out
<<
" out_scale_address:"
<<
args
.
scale_out
;
DLOG
<<
" image_height:"
<<
args
.
height
<<
" image_width:"
<<
args
.
width
;
for
(
int
i
=
0
;
i
<
args
.
image_num
;
i
++
)
{
...
...
@@ -235,7 +240,7 @@ void format_image(framework::Tensor *image_tensor) {
auto
channel
=
dims
[
1
],
height
=
dims
[
2
],
width
=
dims
[
3
];
auto
data_ptr
=
image_tensor
->
data
<
float
>
();
size_t
memory_size
=
channel
*
height
*
width
*
sizeof
(
float
);
float
*
new_data
=
(
float
*
)
fpga_malloc
(
memory_size
);
auto
new_data
=
(
float
*
)
fpga_malloc
(
memory_size
);
fpga_copy
(
new_data
,
data_ptr
,
memory_size
);
image
::
format_image
(
&
new_data
,
channel
,
height
,
width
);
image_tensor
->
reset_data_ptr
(
new_data
);
...
...
@@ -346,12 +351,12 @@ void fill_conv_arg(struct WrapperConvArgs *arg, framework::Tensor *input,
auto
out_ptr
=
out
->
data
<
float
>
();
arg
->
group_num
=
(
uint32_t
)
group_num
;
arg
->
split_num
=
(
uint32_t
)
fpga
::
get_plit_num
(
filter
);
// Either group_num or split_num = 1;
arg
->
split_num
=
group_num
==
1
?
(
uint32_t
)
get_plit_num
(
filter
)
:
1
;
arg
->
filter_num
=
(
uint32_t
)
filter
->
dims
()[
0
];
arg
->
output
.
address
=
out_ptr
;
arg
->
output
.
scale_address
=
out
->
scale
;
arg
->
conv_args
=
(
fpga
::
ConvArgs
*
)
fpga
::
fpga_malloc
(
arg
->
split_num
*
sizeof
(
fpga
::
ConvArgs
));
arg
->
conv_args
=
(
ConvArgs
*
)
fpga_malloc
(
arg
->
split_num
*
sizeof
(
ConvArgs
));
arg
->
concat_arg
.
image_num
=
arg
->
split_num
;
arg
->
concat_arg
.
image_out
=
out_ptr
;
...
...
@@ -360,15 +365,14 @@ void fill_conv_arg(struct WrapperConvArgs *arg, framework::Tensor *input,
arg
->
concat_arg
.
width
=
(
uint32_t
)
filter
->
dims
()[
3
];
int
n
=
arg
->
split_num
;
arg
->
concat_arg
.
images_in
=
(
half
**
)
fpga
::
fpga_malloc
(
n
*
sizeof
(
int
*
));
arg
->
concat_arg
.
scales_in
=
(
float
**
)
fpga
::
fpga_malloc
(
n
*
sizeof
(
float
*
));
arg
->
concat_arg
.
channel_num
=
(
uint32_t
*
)
fpga
::
fpga_malloc
(
n
*
sizeof
(
uint32_t
));
arg
->
concat_arg
.
images_in
=
(
half
**
)
fpga_malloc
(
n
*
sizeof
(
int
*
));
arg
->
concat_arg
.
scales_in
=
(
float
**
)
fpga_malloc
(
n
*
sizeof
(
float
*
));
arg
->
concat_arg
.
channel_num
=
(
uint32_t
*
)
fpga_malloc
(
n
*
sizeof
(
uint32_t
));
arg
->
concat_arg
.
image_out
=
out_ptr
;
auto
channel
=
(
int
)
out
->
dims
()[
1
];
int
filter_num_per_div
=
fpga
::
get_filter_num_per_div
(
filter
,
group_num
);
int
element_num
=
fpga
::
get_aligned_filter_element_num
(
int
filter_num_per_div
=
get_filter_num_per_div
(
filter
,
group_num
);
int
element_num
=
get_aligned_filter_element_num
(
filter
->
dims
()[
1
]
*
filter
->
dims
()[
2
]
*
filter
->
dims
()[
3
]);
for
(
int
i
=
0
;
i
<
n
;
i
++
)
{
...
...
@@ -390,16 +394,17 @@ void fill_conv_arg(struct WrapperConvArgs *arg, framework::Tensor *input,
&
((
int8_t
*
)
filter_ptr
)[
i
*
element_num
*
filter_num_per_div
];
arg
->
conv_args
[
i
].
sb_address
=
&
bs_ptr
[
i
*
filter_num_per_div
*
2
];
arg
->
conv_args
[
i
].
filter_num
=
(
uint32_t
)(
i
==
n
-
1
?
fpga
::
get_aligned_filter_num
(
channel
-
(
n
-
1
)
*
filter_num_per_div
)
(
uint32_t
)(
i
==
n
-
1
?
channel
-
(
n
-
1
)
*
filter_num_per_div
:
filter_num_per_div
);
if
(
n
>
1
)
{
arg
->
conv_args
[
i
].
output
.
scale_address
=
(
float
*
)
fpga
::
fpga_malloc
(
2
*
sizeof
(
float
));
arg
->
conv_args
[
i
].
output
.
address
=
fpga
::
fpga_malloc
(
input
->
dims
()[
2
]
*
input
->
dims
()[
3
]
*
arg
->
conv_args
[
i
].
filter_num
*
sizeof
(
half
));
(
float
*
)
fpga_malloc
(
2
*
sizeof
(
float
));
arg
->
conv_args
[
i
].
output
.
address
=
fpga_malloc
(
input
->
dims
()[
2
]
*
align_to_x
(
input
->
dims
()[
3
]
*
arg
->
conv_args
[
i
].
filter_num
,
IMAGE_ALIGNMENT
)
*
sizeof
(
half
));
}
else
{
...
...
@@ -408,7 +413,7 @@ void fill_conv_arg(struct WrapperConvArgs *arg, framework::Tensor *input,
}
arg
->
concat_arg
.
images_in
[
i
]
=
(
half
*
)
arg
->
conv_args
[
i
].
output
.
address
;
arg
->
concat_arg
.
scales_in
[
i
]
=
(
float
*
)
arg
->
conv_args
[
i
].
sb
_address
;
arg
->
concat_arg
.
scales_in
[
i
]
=
arg
->
conv_args
[
i
].
output
.
scale
_address
;
arg
->
concat_arg
.
channel_num
[
i
]
=
arg
->
conv_args
[
i
].
filter_num
;
}
}
...
...
src/io/executor.cpp
浏览文件 @
84ebc523
...
...
@@ -79,7 +79,7 @@ Executor<Dtype, P>::Executor(const framework::Program<Dtype> p, int batch_size,
std
::
vector
<
std
::
shared_ptr
<
framework
::
OpDesc
>>
ops
=
block_desc
->
Ops
();
for
(
int
j
=
0
;
j
<
ops
.
size
();
++
j
)
{
std
::
shared_ptr
<
framework
::
OpDesc
>
op
=
ops
[
j
];
DLOG
<<
"create op: "
<<
op
->
Type
();
DLOG
<<
"create op: "
<<
j
<<
" "
<<
op
->
Type
();
auto
op_base
=
framework
::
OpRegistry
<
Dtype
>::
CreateOp
(
op
->
Type
(),
op
->
GetInputs
(),
op
->
GetOutputs
(),
op
->
GetAttrMap
(),
program_
.
scope
);
...
...
@@ -103,7 +103,9 @@ Executor<Dtype, P>::Executor(const framework::Program<Dtype> p, int batch_size,
std
::
shared_ptr
<
framework
::
BlockDesc
>
to_predict_block
=
to_predict_program_
->
Block
(
0
);
auto
&
ops
=
ops_of_block_
[
*
to_predict_block
.
get
()];
int
i
=
0
;
for
(
const
auto
&
op
:
ops
)
{
DLOG
<<
"Init op: "
<<
i
++
<<
" "
<<
op
->
Type
();
op
->
Init
();
}
}
...
...
@@ -231,6 +233,13 @@ void Executor<Dtype, P>::InitMemory() {
Get_binary_data
(
program_
.
model_path
+
"/"
+
var_desc
->
Name
());
char
*
data
=
origin_data
;
LoadMemory
(
*
var_desc
,
tensor
,
&
data
);
// DLOG << "----- " << var_desc->Name();
// DLOG << "----- " << tensor->dims();
// float *pDouble = tensor->template data<float>();
// for (int i = 0; i < tensor->numel() && i < 30; ++i) {
// std::cout << pDouble[i] << std::endl;
// }
delete
origin_data
;
}
else
{
if
(
var_desc
->
Type
()
==
framework
::
VARTYPE_TYPE_LOD_TENSOR
)
{
...
...
@@ -695,6 +704,7 @@ void Executor<Dtype, P>::Predict_From_To(int start, int end) {
clock_gettime
(
CLOCK_MONOTONIC
,
&
ts
);
profile
[
i
].
runBegin
=
(
uint64_t
)
ts
.
tv_sec
*
1e9
+
ts
.
tv_nsec
;
#endif
DLOG
<<
"Running op: "
<<
i
<<
" "
<<
ops
[
i
]
->
Type
();
ops
[
i
]
->
Run
();
#ifdef PADDLE_MOBILE_PROFILE
...
...
src/operators/feed_op.h
浏览文件 @
84ebc523
...
...
@@ -53,7 +53,7 @@ class FeedOp : public framework::OperatorBase<DeviceType> {
auto
input_ptr
=
input
->
data
<
float
>
();
fpga
::
format_image
(
input
);
Tensor
*
output
=
param_
.
Out
();
auto
output_ptr
=
output
->
data
<
half
>
();
auto
output_ptr
=
output
->
data
<
float
>
();
fpga
::
BypassArgs
args
=
{
fpga
::
DATA_TYPE_FP32
};
...
...
src/operators/kernel/central-arm-func/conv_add_arm_func.h
浏览文件 @
84ebc523
...
...
@@ -129,10 +129,13 @@ void ConvAddCompute(const FusionConvAddParam<CPU> ¶m) {
// param.Paddings(),
// param.Filter(), param.Bias(),
// param.Output(), false);
math
::
DepthwiseConv3x3s2p1v2
(
param
.
Input
(),
param
.
Filter
(),
param
.
Output
(),
if
(
param
.
Paddings
()[
0
]
==
0
)
{
math
::
DepthwiseConv3x3s2p0
(
param
.
Input
(),
param
.
Filter
(),
param
.
Output
(),
*
param
.
Bias
(),
true
);
}
else
{
math
::
DepthwiseConv3x3s2p1v2
(
param
.
Input
(),
param
.
Filter
(),
param
.
Output
(),
*
param
.
Bias
(),
true
);
}
}
else
{
ConvAddBasic
(
param
);
}
...
...
src/operators/kernel/fpga/softmax_kernel.cpp
浏览文件 @
84ebc523
...
...
@@ -26,7 +26,8 @@ template <>
bool
SoftmaxKernel
<
FPGA
,
float
>::
Init
(
SoftmaxParam
<
FPGA
>
*
param
)
{
auto
input
=
const_cast
<
Tensor
*>
(
param
->
InputX
());
auto
input_ptr
=
input
->
data
<
float
>
();
auto
float_input
=
new
Tensor
(
*
input
);
auto
float_input
=
new
Tensor
;
float_input
->
mutable_data
<
float
>
(
input
->
dims
());
fpga
::
format_fp32_ofm
(
float_input
);
fpga
::
BypassArgs
args
=
{
fpga
::
DATA_TYPE_FP16
};
...
...
src/operators/math/depthwise_conv_3x3.cpp
浏览文件 @
84ebc523
...
...
@@ -1881,6 +1881,103 @@ void DepthwiseConvAddBNRelu3x3s2p1v2(const Tensor *input, const Tensor *filter,
#endif
}
void
DepthwiseConv3x3s2p0
(
const
Tensor
*
input
,
const
Tensor
*
filter
,
Tensor
*
output
,
Tensor
bias
,
bool
if_bias
)
{
#if __ARM_NEON
const
int
batch_size
=
static_cast
<
int
>
(
input
->
dims
()[
0
]);
const
int
input_channel
=
static_cast
<
int
>
(
input
->
dims
()[
1
]);
const
int
input_height
=
static_cast
<
int
>
(
input
->
dims
()[
2
]);
const
int
input_width
=
static_cast
<
int
>
(
input
->
dims
()[
3
]);
const
int
output_height
=
static_cast
<
int
>
(
output
->
dims
()[
2
]);
const
int
output_width
=
static_cast
<
int
>
(
output
->
dims
()[
3
]);
const
int
inhxw
=
input_height
*
input_width
;
const
int
outhxw
=
output_height
*
output_width
;
float32x4_t
zero
=
vdupq_n_f32
(
0.0
);
for
(
int
b
=
0
;
b
<
batch_size
;
b
++
)
{
#pragma omp parallel for
for
(
int
c
=
0
;
c
<
input_channel
;
c
++
)
{
const
float
*
filter_data
=
filter
->
data
<
float
>
()
+
c
*
9
;
const
float
*
input_data
=
input
->
data
<
float
>
()
+
c
*
inhxw
;
const
float
*
bias_data
=
bias
.
data
<
float
>
()
+
c
;
float
*
output_data
=
output
->
data
<
float
>
()
+
c
*
outhxw
;
float
w00
=
filter_data
[
0
];
float
w01
=
filter_data
[
1
];
float
w02
=
filter_data
[
2
];
float
w10
=
filter_data
[
3
];
float
w11
=
filter_data
[
4
];
float
w12
=
filter_data
[
5
];
float
w20
=
filter_data
[
6
];
float
w21
=
filter_data
[
7
];
float
w22
=
filter_data
[
8
];
float32x4_t
biasv
=
vld1q_dup_f32
(
bias_data
);
for
(
int
i
=
0
;
i
<
output_height
;
i
+=
1
)
{
for
(
int
m
=
0
;
m
<
output_width
-
2
;
m
+=
3
)
{
float
*
output_ptr
=
output_data
+
i
*
output_width
+
m
;
float32x4x2_t
input_buff_top
{},
input_buff_mid
{},
input_buff_bottom
{};
float32x4_t
in0
,
in1
,
in2
,
in3
,
in4
,
in5
,
tmp0
,
tmp1
,
tmp2
,
tmp3
,
tmp4
,
tmp5
,
out0
;
input_buff_top
=
vld2q_f32
(
input_data
+
(
2
*
i
)
*
input_width
+
(
2
*
m
));
input_buff_mid
=
vld2q_f32
(
input_data
+
(
2
*
i
+
1
)
*
input_width
+
(
2
*
m
));
input_buff_bottom
=
vld2q_f32
(
input_data
+
(
2
*
i
+
2
)
*
input_width
+
(
2
*
m
));
in0
=
input_buff_top
.
val
[
0
];
tmp0
=
input_buff_top
.
val
[
1
];
tmp1
=
vextq_f32
(
in0
,
zero
,
1
);
in2
=
input_buff_mid
.
val
[
0
];
tmp2
=
input_buff_mid
.
val
[
1
];
tmp3
=
vextq_f32
(
in2
,
zero
,
1
);
in4
=
input_buff_bottom
.
val
[
0
];
tmp4
=
input_buff_bottom
.
val
[
1
];
tmp5
=
vextq_f32
(
in4
,
zero
,
1
);
out0
=
vmulq_n_f32
(
in0
,
w00
);
out0
=
vmlaq_n_f32
(
out0
,
tmp0
,
w01
);
out0
=
vmlaq_n_f32
(
out0
,
tmp1
,
w02
);
out0
=
vmlaq_n_f32
(
out0
,
in2
,
w10
);
out0
=
vmlaq_n_f32
(
out0
,
tmp2
,
w11
);
out0
=
vmlaq_n_f32
(
out0
,
tmp3
,
w12
);
out0
=
vmlaq_n_f32
(
out0
,
in4
,
w20
);
out0
=
vmlaq_n_f32
(
out0
,
tmp4
,
w21
);
out0
=
vmlaq_n_f32
(
out0
,
tmp5
,
w22
);
out0
=
vaddq_f32
(
out0
,
biasv
);
vst1q_lane_f32
(
output_ptr
,
out0
,
0
);
vst1q_lane_f32
(
output_ptr
+
1
,
out0
,
1
);
vst1q_lane_f32
(
output_ptr
+
2
,
out0
,
2
);
}
int
m
;
for
(
m
=
0
;
m
<
output_width
-
2
;
m
+=
3
)
{
}
for
(
int
j
=
m
;
j
<
output_width
;
j
++
)
{
output_data
[
i
*
output_width
+
j
]
=
input_data
[(
2
*
i
-
1
)
*
input_width
+
2
*
j
-
1
]
*
w00
+
input_data
[(
2
*
i
-
1
)
*
input_width
+
2
*
j
]
*
w01
+
input_data
[(
2
*
i
-
1
)
*
input_width
+
2
*
j
+
1
]
*
w02
+
input_data
[(
2
*
i
)
*
input_width
+
2
*
j
-
1
]
*
w10
+
input_data
[(
2
*
i
)
*
input_width
+
2
*
j
]
*
w11
+
input_data
[(
2
*
i
)
*
input_width
+
2
*
j
+
1
]
*
w12
+
input_data
[(
2
*
i
+
1
)
*
input_width
+
2
*
j
-
1
]
*
w20
+
input_data
[(
2
*
i
+
1
)
*
input_width
+
2
*
j
]
*
w21
+
input_data
[(
2
*
i
+
1
)
*
input_width
+
2
*
j
+
1
]
*
w22
;
output_data
[
i
*
output_width
+
j
]
+=
*
bias_data
;
}
}
}
}
#endif
}
}
// namespace math
}
// namespace operators
}
// namespace paddle_mobile
src/operators/math/depthwise_conv_3x3.h
浏览文件 @
84ebc523
...
...
@@ -43,6 +43,9 @@ void DepthwiseConv3x3s2p1v2(const Tensor *input, const Tensor *filter,
void
DepthwiseConvAddBNRelu3x3s2p1v2
(
const
Tensor
*
input
,
const
Tensor
*
filter
,
Tensor
*
output
,
const
Tensor
*
new_scale
,
const
Tensor
*
new_bias
,
bool
if_relu
);
void
DepthwiseConv3x3s2p0
(
const
Tensor
*
input
,
const
Tensor
*
filter
,
Tensor
*
output
,
Tensor
bias
,
bool
if_bias
);
}
// namespace math
}
// namespace operators
}
// namespace paddle_mobile
src/operators/op_param.h
浏览文件 @
84ebc523
此差异已折叠。
点击以展开。
test/CMakeLists.txt
浏览文件 @
84ebc523
...
...
@@ -18,6 +18,9 @@ elseif ("yolo" IN_LIST NET)
# gen test
ADD_EXECUTABLE
(
test-yolo net/test_yolo.cpp test_helper.h test_include.h executor_for_test.h
)
target_link_libraries
(
test-yolo paddle-mobile
)
# gen test
ADD_EXECUTABLE
(
test_yolo_combined net/test_yolo_combined.cpp test_helper.h test_include.h executor_for_test.h
)
target_link_libraries
(
test_yolo_combined paddle-mobile
)
elseif
(
"squeezenet"
IN_LIST NET
)
# gen test
ADD_EXECUTABLE
(
test-squeezenet net/test_squeezenet.cpp test_helper.h test_include.h executor_for_test.h
)
...
...
@@ -30,6 +33,27 @@ elseif("FPGAnets" IN_LIST NET)
ADD_EXECUTABLE
(
test-resnet net/test_resnet.cpp test_helper.h test_include.h executor_for_test.h
)
target_link_libraries
(
test-resnet paddle-mobile
)
ADD_EXECUTABLE
(
test-resnet50 fpga/test_resnet50.cpp test_helper.h test_include.h executor_for_test.h
)
target_link_libraries
(
test-resnet50 paddle-mobile
)
ADD_EXECUTABLE
(
test-fpga-EW fpga/test_fpga_EW.cpp test_helper.h test_include.h executor_for_test.h
)
target_link_libraries
(
test-fpga-EW paddle-mobile
)
ADD_EXECUTABLE
(
test-fpga-conv fpga/test_fpga_conv.cpp test_helper.h test_include.h executor_for_test.h
)
target_link_libraries
(
test-fpga-conv paddle-mobile
)
ADD_EXECUTABLE
(
test-fpga-pooling fpga/test_fpga_pooling.cpp test_helper.h test_include.h executor_for_test.h
)
target_link_libraries
(
test-fpga-pooling paddle-mobile
)
ADD_EXECUTABLE
(
test-fpga-bypass fpga/test_fpga_bypass.cpp test_helper.h test_include.h executor_for_test.h
)
target_link_libraries
(
test-fpga-bypass paddle-mobile
)
ADD_EXECUTABLE
(
test-fpga-softmax fpga/test_fpga_softmax.cpp test_helper.h test_include.h executor_for_test.h
)
target_link_libraries
(
test-fpga-softmax paddle-mobile
)
ADD_EXECUTABLE
(
test-fpga-concat fpga/test_fpga_concat.cpp test_helper.h test_include.h executor_for_test.h
)
target_link_libraries
(
test-fpga-concat paddle-mobile
)
ADD_EXECUTABLE
(
test-tensor-quant fpga/test_tensor_quant.cpp test_helper.h test_include.h executor_for_test.h
)
target_link_libraries
(
test-tensor-quant paddle-mobile
)
...
...
@@ -74,6 +98,10 @@ else ()
ADD_EXECUTABLE
(
test-yolo net/test_yolo.cpp test_helper.h test_include.h executor_for_test.h
)
target_link_libraries
(
test-yolo paddle-mobile
)
# gen test
ADD_EXECUTABLE
(
test_yolo_combined net/test_yolo_combined.cpp test_helper.h test_include.h executor_for_test.h
)
target_link_libraries
(
test_yolo_combined paddle-mobile
)
# gen test
ADD_EXECUTABLE
(
test-googlenet net/test_googlenet.cpp test_helper.h test_include.h executor_for_test.h
)
target_link_libraries
(
test-googlenet paddle-mobile
)
...
...
@@ -235,13 +263,4 @@ else ()
#add_library(test-lib-size SHARED common/test_lib_size.h common/test_lib_size.cpp)
endif
()
# if(FPGA)
# ADD_EXECUTABLE(test-tensor-quant fpga/test_tensor_quant.cpp test_helper.h test_include.h executor_for_test.h)
# target_link_libraries(test-tensor-quant paddle-mobile)
# endif()
test/fpga/test_resnet50.cpp
0 → 100644
浏览文件 @
84ebc523
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "../test_include.h"
static
const
char
*
g_resnet_combine
=
"../models/resnet50"
;
int
main
()
{
DLOG
<<
paddle_mobile
::
fpga
::
open_device
();
paddle_mobile
::
PaddleMobile
<
paddle_mobile
::
FPGA
>
paddle_mobile
;
if
(
paddle_mobile
.
Load
(
std
::
string
(
g_resnet_combine
)
+
"/model"
,
std
::
string
(
g_resnet_combine
)
+
"/params"
,
true
))
{
std
::
vector
<
int64_t
>
dims
{
1
,
3
,
224
,
224
};
Tensor
input_tensor
;
SetupTensor
<
float
>
(
&
input_tensor
,
{
1
,
3
,
224
,
224
},
static_cast
<
float
>
(
0
),
static_cast
<
float
>
(
1
));
std
::
vector
<
float
>
input
(
input_tensor
.
data
<
float
>
(),
input_tensor
.
data
<
float
>
()
+
input_tensor
.
numel
());
paddle_mobile
.
FeedData
(
input_tensor
);
paddle_mobile
.
Predict_To
(
-
1
);
// paddle_mobile.Predict_From(73);
// paddle_mobile.Predict_From_To(72, 73);
DLOG
<<
"Computation done"
;
return
0
;
}
}
test/net/test_yolo_combined.cpp
0 → 100644
浏览文件 @
84ebc523
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <iostream>
#include "../test_helper.h"
#include "../test_include.h"
int
main
()
{
paddle_mobile
::
PaddleMobile
<
paddle_mobile
::
CPU
>
paddle_mobile
;
paddle_mobile
.
SetThreadNum
(
4
);
// ../../../test/models/googlenet
// ../../../test/models/mobilenet
auto
time1
=
time
();
if
(
paddle_mobile
.
Load
(
std
::
string
(
g_yolo_combined
)
+
"/model"
,
std
::
string
(
g_yolo_combined
)
+
"/params"
,
true
))
{
auto
time2
=
time
();
std
::
cout
<<
"load cost :"
<<
time_diff
(
time1
,
time1
)
<<
"ms"
<<
std
::
endl
;
std
::
vector
<
int64_t
>
dims
{
1
,
3
,
416
,
416
};
std
::
vector
<
float
>
input
;
GetInput
<
float
>
(
g_test_image_desktop_1_3_416_416_nchw_float
,
&
input
,
dims
);
std
::
cout
<<
"input.size(): "
<<
input
.
size
()
<<
std
::
endl
;
for
(
int
j
=
0
;
j
<
100
;
++
j
)
{
std
::
cout
<<
j
<<
" : "
<<
input
[
j
]
<<
std
::
endl
;
}
// // 预热十次
// for (int i = 0; i < 10; ++i) {
// paddle_mobile.Predict(input, dims);
// }
auto
time3
=
time
();
const
vector
<
float
>
vector_out
=
paddle_mobile
.
Predict
(
input
,
dims
);
std
::
cout
<<
"--------------------------------------------"
<<
std
::
endl
;
for
(
float
i
:
vector_out
)
{
std
::
cout
<<
i
<<
std
::
endl
;
}
std
::
cout
<<
"--------------------------------------------"
<<
std
::
endl
;
std
::
cout
<<
"load cost :"
<<
time_diff
(
time1
,
time1
)
<<
"ms"
<<
std
::
endl
;
auto
time4
=
time
();
std
::
cout
<<
"predict cost :"
<<
time_diff
(
time3
,
time4
)
/
10
<<
"ms"
<<
std
::
endl
;
}
return
0
;
}
test/test_helper.h
浏览文件 @
84ebc523
...
...
@@ -41,12 +41,15 @@ static const char *g_resnet_50 = "../models/resnet_50";
static
const
char
*
g_resnet
=
"../models/resnet"
;
static
const
char
*
g_googlenet_combine
=
"../models/googlenet_combine"
;
static
const
char
*
g_yolo
=
"../models/yolo"
;
static
const
char
*
g_yolo_combined
=
"../models/yolo_combined"
;
static
const
char
*
g_fluid_fssd_new
=
"../models/fluid_fssd_new"
;
static
const
char
*
g_test_image_1x3x224x224
=
"../images/test_image_1x3x224x224_float"
;
static
const
char
*
g_test_image_1x3x224x224_banana
=
"../images/input_3x224x224_banana"
;
static
const
char
*
g_test_image_desktop_1_3_416_416_nchw_float
=
"../images/in_put_1_3_416_416_2"
;
static
const
char
*
g_hand
=
"../images/hand_image"
;
static
const
char
*
g_imgfssd_ar
=
"../images/test_image_ssd_ar"
;
static
const
char
*
g_imgfssd_ar1
=
"../images/003_0001.txt"
;
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录