Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
Paddle-Lite
提交
2bca7447
P
Paddle-Lite
项目概览
PaddlePaddle
/
Paddle-Lite
通知
332
Star
4
Fork
1
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
271
列表
看板
标记
里程碑
合并请求
78
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle-Lite
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
271
Issue
271
列表
看板
标记
里程碑
合并请求
78
合并请求
78
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
2bca7447
编写于
11月 21, 2018
作者:
qnqinan
提交者:
GitHub
11月 21, 2018
浏览文件
操作
浏览文件
下载
差异文件
Merge pull request #1300 from zhangyang0701/develop
unify V1 & V2 style for FPGA track
上级
af65b34c
bff417fd
变更
23
隐藏空白更改
内联
并排
Showing
23 changed file
with
408 addition
and
616 deletion
+408
-616
src/common/types.cpp
src/common/types.cpp
+4
-4
src/fpga/V1/api.cpp
src/fpga/V1/api.cpp
+29
-267
src/fpga/V1/api.h
src/fpga/V1/api.h
+2
-170
src/fpga/V1/bias_scale.cpp
src/fpga/V1/bias_scale.cpp
+1
-1
src/fpga/V1/filter.cpp
src/fpga/V1/filter.cpp
+11
-9
src/fpga/V1/image.cpp
src/fpga/V1/image.cpp
+1
-1
src/fpga/V1/pe.cpp
src/fpga/V1/pe.cpp
+160
-0
src/fpga/V2/api.cpp
src/fpga/V2/api.cpp
+2
-73
src/fpga/V2/api.h
src/fpga/V2/api.h
+2
-10
src/fpga/V2/bias_scale.cpp
src/fpga/V2/bias_scale.cpp
+1
-1
src/fpga/V2/filter.cpp
src/fpga/V2/filter.cpp
+2
-2
src/fpga/V2/image.cpp
src/fpga/V2/image.cpp
+1
-1
src/fpga/V2/pe.cpp
src/fpga/V2/pe.cpp
+50
-50
src/fpga/common/bitmap.cpp
src/fpga/common/bitmap.cpp
+1
-1
src/fpga/common/bitmap.h
src/fpga/common/bitmap.h
+0
-0
src/fpga/common/config.h
src/fpga/common/config.h
+0
-0
src/fpga/common/driver.cpp
src/fpga/common/driver.cpp
+6
-6
src/fpga/common/driver.h
src/fpga/common/driver.h
+6
-8
src/fpga/common/fpga_common.cpp
src/fpga/common/fpga_common.cpp
+117
-0
src/fpga/common/fpga_common.h
src/fpga/common/fpga_common.h
+11
-0
src/fpga/common/pe.h
src/fpga/common/pe.h
+1
-1
test/CMakeLists.txt
test/CMakeLists.txt
+0
-6
tools/op.cmake
tools/op.cmake
+0
-5
未找到文件。
src/common/types.cpp
浏览文件 @
2bca7447
...
...
@@ -71,10 +71,10 @@ const char *G_OP_TYPE_SUM = "sum";
const
char
*
G_OP_TYPE_QUANTIZE
=
"quantize"
;
const
char
*
G_OP_TYPE_DEQUANTIZE
=
"dequantize"
;
extern
const
char
*
G_OP_TYPE_TANH
=
"tanh"
;
extern
const
char
*
G_OP_TYPE_FUSION_DECONV_RELU
=
"fusion_deconv_relu"
;
extern
const
char
*
G_OP_TYPE_FUSION_DECONV_ADD
=
"fusion_deconv_add"
;
extern
const
char
*
G_OP_TYPE_FUSION_DECONV_ADD_RELU
=
"fusion_deconv_add_relu"
;
const
char
*
G_OP_TYPE_TANH
=
"tanh"
;
const
char
*
G_OP_TYPE_FUSION_DECONV_RELU
=
"fusion_deconv_relu"
;
const
char
*
G_OP_TYPE_FUSION_DECONV_ADD
=
"fusion_deconv_add"
;
const
char
*
G_OP_TYPE_FUSION_DECONV_ADD_RELU
=
"fusion_deconv_add_relu"
;
std
::
unordered_map
<
std
::
string
,
std
::
pair
<
std
::
vector
<
std
::
string
>
,
std
::
vector
<
std
::
string
>>>
...
...
src/fpga/V1/api.cpp
浏览文件 @
2bca7447
...
...
@@ -13,251 +13,13 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "fpga/V1/api.h"
#include <fcntl.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <algorithm>
#include <map>
#include "fpga/V1/bias_scale.h"
#include "fpga/V1/filter.h"
#include "fpga/V1/image.h"
#define FPGA_TEST_MODE
#define PADDLE_MOBILE_OS_LINUX
namespace
paddle_mobile
{
namespace
fpga
{
// File descriptor of the FPGA device node; stays -1 until open_device()
// stores the result of open() in it.
static int fd{-1};
// Path of the device node passed to open().
static const char *device_path{"/dev/fpgadrv0"};
// Live allocations made by fpga_malloc(): returned pointer -> requested size.
static std::map<void *, size_t> memory_map{};
// Sends request `req` with argument block `arg` to the device behind the
// file-level descriptor `fd`.
// With PADDLE_MOBILE_OS_LINUX defined, the ioctl result is enforced to be 0 and
// then returned; otherwise the function does nothing and returns -1.
static inline int do_ioctl(int req, const void *arg) {
#ifdef PADDLE_MOBILE_OS_LINUX
  int result = ioctl(fd, req, (uint64_t)arg);  // NOLINT
  PADDLE_MOBILE_ENFORCE(result == 0, "ioctl didn't return correctly");
  return result;
#else
  return -1;
#endif
}
// Opens the FPGA device node read/write on first use and caches the descriptor
// in `fd`; later calls return the cached value.
// Returns the descriptor, i.e. whatever open() produced.
int open_device() {
  if (fd != -1) {
    return fd;  // already opened earlier
  }
  fd = open(device_path, O_RDWR);
  return fd;
}
// memory management;
// Allocates `size` bytes for FPGA use and records the allocation in memory_map
// so that fpga_free() can later look up its size.
// With PADDLE_MOBILE_OS_LINUX the memory is a shared mapping of the device
// descriptor `fd`; otherwise it comes from malloc().
// Returns the new pointer, or nullptr when the device mapping fails.
void *fpga_malloc(size_t size) {
  static uint64_t counter = 0;  // running total of bytes handed out

#ifdef PADDLE_MOBILE_OS_LINUX
  auto ptr =
      mmap64(nullptr, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
  // mmap reports failure with MAP_FAILED, not nullptr. Do not record or hand
  // out that sentinel: fpga_free() would otherwise munmap() it.
  if (ptr == MAP_FAILED) {
    DLOG << "fpga_malloc: mmap64 failed";
    return nullptr;
  }
#else
  auto ptr = malloc(size);
#endif
  counter += size;
  memory_map.insert(std::make_pair(ptr, size));
  // DLOG << "Address: " << ptr << ", " << size << " bytes allocated. Total "
  //      << counter << " bytes";
  return ptr;
}
// Releases memory previously returned by fpga_malloc().
// The pointer is looked up in memory_map to recover its size; an unknown
// pointer is only logged and otherwise ignored.
void fpga_free(void *ptr) {
  static uint64_t counter = 0;  // running total of bytes released

  auto iter = memory_map.find(ptr);
  if (iter == memory_map.end()) {
    DLOG << "Invalid pointer";
    return;
  }

  size_t size = iter->second;
  memory_map.erase(iter);
#ifdef PADDLE_MOBILE_OS_LINUX
  munmap(ptr, size);
#else
  free(ptr);
#endif
  counter += size;
  // DLOG << "Address: " << ptr << ", " << size << " bytes freed. Total "
  //      << counter << " bytes";
}
// Copies `num` bytes from `src` to `dest` with a plain memcpy.
void fpga_copy(void *dest, const void *src, size_t num) {
  memcpy(dest, src, num);
}
// Issues IOCTL_MEMCACHE_FLUSH for the range [address, address + size).
// Returns the value produced by do_ioctl.
int fpga_flush(void *address, size_t size) {
  struct MemoryCacheArgs args = {nullptr};
  args.size = size;
  args.address = address;
  return do_ioctl(IOCTL_MEMCACHE_FLUSH, &args);
}
// Issues IOCTL_MEMCACHE_INVAL for the range [address, address + size).
// Returns the value produced by do_ioctl.
int fpga_invalidate(void *address, size_t size) {
  struct MemoryCacheArgs args = {nullptr};
  args.size = size;
  args.address = address;
  return do_ioctl(IOCTL_MEMCACHE_INVAL, &args);
}
// Converts a 32-bit float to the 16-bit `half` bit pattern.
// The mantissa is cut to its top 10 bits, the sign bit is moved to bit 15 and
// the exponent is re-biased by 112 (127 - 15). The result is bumped by one when
// bit 12 of the source (the highest dropped mantissa bit) is set.
// NOTE(review): zero, denormals, infinities and NaN get no special treatment
// here - confirm callers never pass them.
half fp32_2_fp16(float fp32_num) {
  // Read the float's bits through memcpy into an exactly 32-bit integer. The
  // previous `unsigned long` pointer cast read 8 bytes from a 4-byte object on
  // LP64 targets and violated the aliasing rules.
  uint32_t tmp = 0;
  static_assert(sizeof(tmp) == sizeof(fp32_num), "float must be 32 bits wide");
  memcpy(&tmp, &fp32_num, sizeof(tmp));

  half t = ((tmp & 0x007fffff) >> 13) | ((tmp & 0x80000000) >> 16) |
           (((tmp & 0x7f800000) >> 13) - (112 << 10));
  if (tmp & 0x1000) {
    t++;  // roundoff
  }
  return t;
}
// Expands a 16-bit `half` bit pattern to a 32-bit float.
// The 10 mantissa bits are moved to the top of the float mantissa, the 5-bit
// exponent is re-biased by +112 and the sign bit is moved to bit 31.
// NOTE(review): a zero exponent field is re-biased like any other, so a half
// zero does not map to 0.0f - confirm whether callers rely on that.
float fp16_2_fp32(half fp16_num) {
  // Work on an unsigned copy of the 16 raw bits so that moving the sign to
  // bit 31 never shifts into the sign bit of a signed int.
  const uint32_t raw = static_cast<uint16_t>(fp16_num);
  const uint32_t frac = raw & 0x3ff;
  const uint32_t exp = ((raw & 0x7c00) >> 10) + 112;
  const uint32_t s = raw & 0x8000;
  const uint32_t tmp = (s << 16) | (exp << 23) | (frac << 13);

  // memcpy instead of a float* cast keeps the reinterpretation well defined.
  float fp32_num;
  static_assert(sizeof(fp32_num) == sizeof(tmp), "float must be 32 bits wide");
  memcpy(&fp32_num, &tmp, sizeof(fp32_num));
  return fp32_num;
}
// Submits one convolution to the driver via the IOCTL_CONFIG_CONV request.
// When FPGA_TEST_MODE is defined, every field of `args` is logged first.
// Returns the value produced by do_ioctl.
int ComputeBasicConv(const struct ConvArgs &args) {
#ifdef FPGA_TEST_MODE
  DLOG << "======Compute Basic Conv======";
  DLOG << "   relu_enabled:" << args.relu_enabled
       << "   sb_address:" << args.sb_address
       << "   filter_address:" << args.filter_address
       << "   filter_num:" << args.filter_num
       << "   group_num:" << args.group_num;
  DLOG << "   image_address:" << args.image.address
       << "   image_scale_address:" << args.image.scale_address
       << "   image_channels:" << args.image.channels
       << "   image_height:" << args.image.height
       << "   image_width:" << args.image.width
       << "   pad_height:" << args.image.pad_height
       << "   pad_width:" << args.image.pad_width;
  DLOG << "   kernel_height:" << args.kernel.height
       << "   kernel_width:" << args.kernel.width
       << "   stride_h:" << args.kernel.stride_h
       << "   stride_w:" << args.kernel.stride_w;
  DLOG << "   out_address:" << args.output.address
       << "   out_scale_address:" << args.output.scale_address;
#endif
  return do_ioctl(IOCTL_CONFIG_CONV, &args);
}
// Runs a split convolution: each of the `split_num` entries of args.conv_args
// is submitted with ComputeBasicConv, and when there is more than one split the
// partial outputs are joined with ComputeFPGAConcat.
// Returns 0 after all steps have been issued. The function used to fall off
// its end without returning a value, which is undefined for an int function.
int ComputeFpgaConv(const struct SplitConvArgs &args) {
#ifdef FPGA_TEST_MODE
  DLOG << "=============ComputeFPGAConv===========";
  DLOG << "   filter_num:" << args.filter_num
       << "   group_num:" << args.group_num
       << "   split_num:" << args.split_num;
#endif

  int split_num = args.split_num;
  for (int i = 0; i < split_num; i++) {
    ComputeBasicConv(args.conv_args[i]);
  }

  if (split_num > 1) {
    ComputeFPGAConcat(args.concat_arg);
  }
  return 0;
}
// Submits one pooling operation to the driver via IOCTL_CONFIG_POOLING.
// When FPGA_TEST_MODE is defined, every field of `args` is logged first.
// Returns the value produced by do_ioctl.
int ComputeFpgaPool(const struct PoolingArgs &args) {
#ifdef FPGA_TEST_MODE
  DLOG << "=============ComputeFpgaPool===========";
  DLOG << "   mode:" << args.mode
       << "               kernel_reciprocal:"
       << fp16_2_fp32(args.kernel_reciprocal);
  DLOG << "   image_address:" << args.image.address
       << "   image_scale_address:" << args.image.scale_address
       << "   image_channels:" << args.image.channels
       << "   image_height:" << args.image.height
       << "   image_width:" << args.image.width
       << "   pad_height:" << args.image.pad_height
       << "   pad_width:" << args.image.pad_width;
  DLOG << "   kernel_height:" << args.kernel.height
       << "   kernel_width:" << args.kernel.width
       << "   stride_h:" << args.kernel.stride_h
       << "   stride_w:" << args.kernel.stride_w;
  DLOG << "   out_address:" << args.output.address
       << "   out_scale_address:" << args.output.scale_address;
#endif
  return do_ioctl(IOCTL_CONFIG_POOLING, &args);
}
// Submits one element-wise add to the driver via IOCTL_CONFIG_EW.
// When FPGA_TEST_MODE is defined, every field of `args` is logged first; the
// two constants are shown as floats by passing their low 16 bits through
// fp16_2_fp32.
// Returns the value produced by do_ioctl.
int ComputeFpgaEWAdd(const struct EWAddArgs &args) {
#ifdef FPGA_TEST_MODE
  DLOG << "=============ComputeFpgaEWAdd===========";
  DLOG << "   relu_enabled:" << args.relu_enabled
       << "   const0:" << fp16_2_fp32(int16_t(args.const0))
       << "   const1:" << fp16_2_fp32(int16_t(args.const1));
  DLOG << "   image0_address:" << args.image0.address
       << "   image0_scale_address:" << args.image0.scale_address
       << "   image0_channels:" << args.image0.channels
       << "   image0_height:" << args.image0.height
       << "   image0_width:" << args.image0.width
       << "   pad0_height:" << args.image0.pad_height
       << "   pad0_width:" << args.image0.pad_width;
  // The last label was "pad_width"; it is image1's padding, so it is now
  // labeled "pad1_width" like its siblings.
  DLOG << "   image1_address:" << args.image1.address
       << "   image1_scale_address:" << args.image1.scale_address
       << "   image1_channels:" << args.image1.channels
       << "   image1_height:" << args.image1.height
       << "   image1_width:" << args.image1.width
       << "   pad1_height:" << args.image1.pad_height
       << "   pad1_width:" << args.image1.pad_width;
  DLOG << "   out_address:" << args.output.address
       << "   out_scale_address:" << args.output.scale_address;
#endif
  return do_ioctl(IOCTL_CONFIG_EW, &args);
}
// Submits one bypass operation to the driver via IOCTL_CONFIG_BYPASS.
// When FPGA_TEST_MODE is defined, the data/layout types and the image and
// output descriptors are logged first.
// Returns the value produced by do_ioctl.
int PerformBypass(const struct BypassArgs &args) {
#ifdef FPGA_TEST_MODE
  DLOG << "=============ComputeFpgaBypass===========";
  DLOG << "   input_type:" << args.input_data_type
       << "   output_type:" << args.output_data_type
       << "   input_layout_type:" << args.input_layout_type
       << "   output_layout_type:" << args.output_layout_type;
  DLOG << "   image_address:" << args.image.address
       << "   image_scale_address:" << args.image.scale_address
       << "   image_channels:" << args.image.channels
       << "   image_height:" << args.image.height
       << "   image_width:" << args.image.width
       << "   pad_height:" << args.image.pad_height
       << "   pad_width:" << args.image.pad_width;
  DLOG << "   out_address:" << args.output.address
       << "   out_scale_address:" << args.output.scale_address;
#endif
  return do_ioctl(IOCTL_CONFIG_BYPASS, &args);
}
// Joins the input images described by `args` into args.image_out by calling
// image::concat_images. No ioctl is issued here.
// When FPGA_TEST_MODE is defined, the arguments and each input are logged.
// Always returns 0.
int ComputeFPGAConcat(const struct ConcatArgs &args) {
#ifdef FPGA_TEST_MODE
  DLOG << "=============ComputeFpgaConcat===========";
  DLOG << "   Image_num: " << args.image_num
       << "   out_address:" << args.image_out
       << "   out_scale_address:" << args.scale_out;
  DLOG << "   image_height:" << args.height
       << "   image_width:" << args.width;
  for (int i = 0; i < args.image_num; i++) {
    DLOG << "   " << i << "th:                        ";
    DLOG << "   channel_num:" << args.channel_num[i]
         << "   image_address:" << args.images_in[i]
         << "   image_scale_address:" << args.scales_in[i];
  }
#endif

  image::concat_images(args.images_in, args.scales_in, args.image_out,
                       args.scale_out, args.image_num, args.channel_num,
                       args.height, args.width);
  return 0;
}
// Rounds `cw` up to a multiple of IMAGE_ALIGNMENT using align_to_x.
int get_align_image_cw(int cw) {
  return align_to_x(cw, IMAGE_ALIGNMENT);
}
void
format_image
(
framework
::
Tensor
*
image_tensor
)
{
...
...
@@ -397,7 +159,7 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input,
arg
->
filter_num
=
(
uint32_t
)
filter
->
dims
()[
0
];
arg
->
output
.
address
=
out_ptr
;
arg
->
output
.
scale_address
=
out
->
scale
;
arg
->
conv_arg
s
=
arg
->
conv_arg
=
(
ConvArgs
*
)
fpga_malloc
(
arg
->
split_num
*
sizeof
(
ConvArgs
));
// NOLINT
arg
->
concat_arg
.
image_num
=
arg
->
split_num
;
...
...
@@ -420,44 +182,44 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input,
filter
->
dims
()[
1
]
*
filter
->
dims
()[
2
]
*
filter
->
dims
()[
3
]);
for
(
int
i
=
0
;
i
<
n
;
i
++
)
{
arg
->
conv_arg
s
[
i
].
relu_enabled
=
relu_enabled
;
arg
->
conv_arg
s
[
i
].
group_num
=
(
uint32_t
)
group_num
;
arg
->
conv_arg
s
[
i
].
kernel
.
stride_h
=
(
uint32_t
)
stride_h
;
arg
->
conv_arg
s
[
i
].
kernel
.
stride_w
=
(
uint32_t
)
stride_w
;
arg
->
conv_arg
s
[
i
].
kernel
.
height
=
(
uint32_t
)
filter
->
dims
()[
2
];
arg
->
conv_arg
s
[
i
].
kernel
.
width
=
(
uint32_t
)
filter
->
dims
()[
3
];
arg
->
conv_arg
s
[
i
].
image
.
address
=
input_ptr
;
arg
->
conv_arg
s
[
i
].
image
.
channels
=
(
uint32_t
)
input
->
dims
()[
1
];
arg
->
conv_arg
s
[
i
].
image
.
height
=
(
uint32_t
)
input
->
dims
()[
2
];
arg
->
conv_arg
s
[
i
].
image
.
width
=
(
uint32_t
)
input
->
dims
()[
3
];
arg
->
conv_arg
s
[
i
].
image
.
scale_address
=
input
->
scale
;
arg
->
conv_arg
s
[
i
].
image
.
pad_height
=
(
uint32_t
)
padding_h
;
arg
->
conv_arg
s
[
i
].
image
.
pad_width
=
(
uint32_t
)
padding_w
;
arg
->
conv_arg
s
[
i
].
filter_scale_address
=
filter
->
scale
;
arg
->
conv_arg
s
[
i
].
filter_address
=
&
(
arg
->
conv_arg
[
i
].
relu_enabled
=
relu_enabled
;
arg
->
conv_arg
[
i
].
group_num
=
(
uint32_t
)
group_num
;
arg
->
conv_arg
[
i
].
kernel
.
stride_h
=
(
uint32_t
)
stride_h
;
arg
->
conv_arg
[
i
].
kernel
.
stride_w
=
(
uint32_t
)
stride_w
;
arg
->
conv_arg
[
i
].
kernel
.
height
=
(
uint32_t
)
filter
->
dims
()[
2
];
arg
->
conv_arg
[
i
].
kernel
.
width
=
(
uint32_t
)
filter
->
dims
()[
3
];
arg
->
conv_arg
[
i
].
image
.
address
=
input_ptr
;
arg
->
conv_arg
[
i
].
image
.
channels
=
(
uint32_t
)
input
->
dims
()[
1
];
arg
->
conv_arg
[
i
].
image
.
height
=
(
uint32_t
)
input
->
dims
()[
2
];
arg
->
conv_arg
[
i
].
image
.
width
=
(
uint32_t
)
input
->
dims
()[
3
];
arg
->
conv_arg
[
i
].
image
.
scale_address
=
input
->
scale
;
arg
->
conv_arg
[
i
].
image
.
pad_height
=
(
uint32_t
)
padding_h
;
arg
->
conv_arg
[
i
].
image
.
pad_width
=
(
uint32_t
)
padding_w
;
arg
->
conv_arg
[
i
].
filter_scale_address
=
filter
->
scale
;
arg
->
conv_arg
[
i
].
filter_address
=
&
(
(
int8_t
*
)
filter_ptr
)[
i
*
element_num
*
filter_num_per_div
];
// NOLINT
arg
->
conv_arg
s
[
i
].
sb_address
=
&
bs_ptr
[
i
*
filter_num_per_div
*
2
];
arg
->
conv_arg
s
[
i
].
filter_num
=
(
uint32_t
)(
arg
->
conv_arg
[
i
].
sb_address
=
&
bs_ptr
[
i
*
filter_num_per_div
*
2
];
arg
->
conv_arg
[
i
].
filter_num
=
(
uint32_t
)(
i
==
n
-
1
?
channel
-
(
n
-
1
)
*
filter_num_per_div
// NOLINT
:
filter_num_per_div
);
if
(
n
>
1
)
{
arg
->
conv_arg
s
[
i
].
output
.
scale_address
=
arg
->
conv_arg
[
i
].
output
.
scale_address
=
(
float
*
)
fpga_malloc
(
2
*
sizeof
(
float
));
// NOLINT
arg
->
conv_arg
s
[
i
].
output
.
address
=
fpga_malloc
(
input
->
dims
()[
2
]
*
align_to_x
(
input
->
dims
()[
3
]
*
arg
->
conv_args
[
i
].
filter_num
,
IMAGE_ALIGNMENT
)
*
sizeof
(
half
));
arg
->
conv_arg
[
i
].
output
.
address
=
fpga_malloc
(
input
->
dims
()[
2
]
*
align_to_x
(
input
->
dims
()[
3
]
*
arg
->
conv_arg
[
i
].
filter_num
,
IMAGE_ALIGNMENT
)
*
sizeof
(
half
));
}
else
{
arg
->
conv_arg
s
[
i
].
output
.
scale_address
=
out
->
scale
;
arg
->
conv_arg
s
[
i
].
output
.
address
=
out_ptr
;
arg
->
conv_arg
[
i
].
output
.
scale_address
=
out
->
scale
;
arg
->
conv_arg
[
i
].
output
.
address
=
out_ptr
;
}
arg
->
concat_arg
.
images_in
[
i
]
=
(
half
*
)
arg
->
conv_arg
s
[
i
].
output
.
address
;
// NOLINT
arg
->
concat_arg
.
scales_in
[
i
]
=
arg
->
conv_arg
s
[
i
].
output
.
scale_address
;
arg
->
concat_arg
.
channel_num
[
i
]
=
arg
->
conv_arg
s
[
i
].
filter_num
;
(
half
*
)
arg
->
conv_arg
[
i
].
output
.
address
;
// NOLINT
arg
->
concat_arg
.
scales_in
[
i
]
=
arg
->
conv_arg
[
i
].
output
.
scale_address
;
arg
->
concat_arg
.
channel_num
[
i
]
=
arg
->
conv_arg
[
i
].
filter_num
;
}
}
...
...
src/fpga/V1/api.h
浏览文件 @
2bca7447
...
...
@@ -14,178 +14,13 @@ limitations under the License. */
#pragma once
#include <stdint.h>
#include <cstddef>
#include <iostream>
#include <limits>
#include "fpga/common/fpga_common.h"
#include "fpga/common/pe.h"
#include "framework/tensor.h"
namespace
paddle_mobile
{
namespace
fpga
{
// Element type selector used by BypassArgs; the numeric values are fixed.
enum DataType {
  DATA_TYPE_FP16 = 0,
  DATA_TYPE_FP32 = 1,
};
// Memory layout selector used by BypassArgs; the numeric values are fixed.
enum LayoutType {
  LAYOUT_HWC = 0,
  LAYOUT_CHW = 1,
};
// Argument block of the IOCTL_VERSION request.
struct VersionArgs {
  void *buffer;
};
// Argument block of the IOCTL_MEM_COPY request: source, destination and size.
struct MemoryCopyArgs {
  void *src;
  void *dest;
  size_t size;
};
// Kernel (window) geometry: extent and stride along width and height.
struct KernelArgs {
  uint32_t width;
  uint32_t height;
  uint32_t stride_w;
  uint32_t stride_h;
};
// Description of an input feature map.
struct ImageInputArgs {
  void *address;         // input featuremap virtual address
  float *scale_address;  // input scale address;
  uint32_t channels;
  uint32_t width;  // featuremap width
  uint32_t height;
  uint32_t pad_width;  // padding width;
  uint32_t pad_height;
};
// Destination of an operation's result.
struct ImageOutputArgs {
  void *address;         // output result address;
  float *scale_address;  // output scale address;
};
// Argument block of the IOCTL_CONFIG_CONV request (one basic convolution).
struct ConvArgs {
  bool relu_enabled;
  void *sb_address;  // scale and bias are interlaced;
  void *filter_address;
  float *filter_scale_address;
  uint32_t filter_num;
  uint32_t group_num;

  struct KernelArgs kernel;
  struct ImageInputArgs image;  // input image;
  struct ImageOutputArgs output;
};
// Arguments of ComputeFPGAConcat: `image_num` inputs (image, scale and channel
// count per input) joined into one output image of the given height and width.
struct ConcatArgs {
  uint32_t image_num;
  half **images_in;
  float **scales_in;
  void *image_out;
  float *scale_out;
  uint32_t *channel_num;
  uint32_t height;
  uint32_t width;
};
// Arguments of ComputeFpgaConv: a convolution carried out as `split_num` basic
// convolutions (conv_args) plus the concat step that joins their outputs.
struct SplitConvArgs {
  uint32_t split_num;
  uint32_t group_num;
  uint32_t filter_num;
  struct ImageOutputArgs output;
  struct ConvArgs *conv_args;
  struct ConcatArgs concat_arg;
};
// Grouped convolution description: an array of split convolutions (conv_args)
// plus a concat descriptor.
struct GroupConvArgs {
  uint32_t group_num;
  uint32_t filter_num;
  struct ImageOutputArgs output;
  struct SplitConvArgs *conv_args;
  struct ConcatArgs concat_arg;
};
// Argument block of the IOCTL_CONFIG_POOLING request.
struct PoolingArgs {
  int16_t mode;  // mode: 0:max, 1:avg
  half kernel_reciprocal;
  struct KernelArgs kernel;
  struct ImageInputArgs image;  // input image;
  struct ImageOutputArgs output;
};
// Argument block of the IOCTL_CONFIG_EW request (element-wise add).
struct EWAddArgs {
  bool relu_enabled;

  uint32_t const0;  // output0 = const0 x input0 + const1 x input1;
  uint32_t const1;
  struct ImageInputArgs image0;
  struct ImageInputArgs image1;
  struct ImageOutputArgs output;
};
// Argument block of the IOCTL_CONFIG_BYPASS request: data type and layout on
// the input and output side, plus the image and output descriptors.
struct BypassArgs {
  enum DataType input_data_type;
  enum DataType output_data_type;
  enum LayoutType input_layout_type;
  enum LayoutType output_layout_type;
  struct ImageInputArgs image;
  struct ImageOutputArgs output;
};
// Argument block of the IOCTL_FPGA_REG_WRITE request: register address and the
// value to store.
struct FpgaRegWriteArgs {
  uint64_t address;
  uint64_t value;
};
// Argument block of the IOCTL_FPGA_REG_READ request: register address and the
// slot for its value.
struct FpgaRegReadArgs {
  uint64_t address;
  uint64_t value;
};
// Argument block of the IOCTL_MEMCACHE_FLUSH / IOCTL_MEMCACHE_INVAL requests:
// start address and length of the range.
struct MemoryCacheArgs {
  void *address;
  size_t size;
};
#define IOCTL_FPGA_MAGIC 'FPGA'
#define IOCTL_VERSION _IOW(IOCTL_FPGA_MAGIC, 01, struct VersionArgs)
#define IOCTL_SEPARATOR_0 10
#define IOCTL_MEM_COPY _IOW(IOCTL_FPGA_MAGIC, 11, struct MemoryCopyArgs)
#define IOCTL_MEMCACHE_INVAL _IOW(IOCTL_FPGA_MAGIC, 12, struct MemoryCacheArgs)
#define IOCTL_MEMCACHE_FLUSH _IOW(IOCTL_FPGA_MAGIC, 13, struct MemoryCacheArgs)
#define IOCTL_SEPARATOR_1 20
#define IOCTL_CONFIG_CONV _IOW(IOCTL_FPGA_MAGIC, 21, struct ConvArgs)
#define IOCTL_CONFIG_POOLING _IOW(IOCTL_FPGA_MAGIC, 22, struct PoolingArgs)
#define IOCTL_CONFIG_EW _IOW(IOCTL_FPGA_MAGIC, 23, struct EWAddArgs)
#define IOCTL_CONFIG_BYPASS _IOW(IOCTL_FPGA_MAGIC, 24, struct BypassArgs)
#define IOCTL_FPGA_REG_READ _IOW(IOCTL_FPGA_MAGIC, 28, struct FpgaRegReadArgs)
#define IOCTL_FPGA_REG_WRITE _IOW(IOCTL_FPGA_MAGIC, 29, struct FpgaRegWriteArgs)
//============================== API =============================
int
open_device
();
int
close_device
();
void
*
fpga_malloc
(
size_t
size
);
void
fpga_free
(
void
*
ptr
);
void
fpga_copy
(
void
*
dst
,
const
void
*
src
,
size_t
num
);
int
fpga_flush
(
void
*
address
,
size_t
size
);
int
fpga_invalidate
(
void
*
address
,
size_t
size
);
int
PerformBypass
(
const
struct
BypassArgs
&
args
);
int
ComputeFpgaConv
(
const
struct
SplitConvArgs
&
args
);
int
ComputeFpgaPool
(
const
struct
PoolingArgs
&
args
);
int
ComputeFpgaEWAdd
(
const
struct
EWAddArgs
&
args
);
int
ComputeFPGAConcat
(
const
struct
ConcatArgs
&
args
);
// Rounds `num` up to the next multiple of `x` using integer division.
static inline int align_to_x(int num, int x) {
  const int blocks = (num + x - 1) / x;  // number of x-sized blocks needed
  return blocks * x;
}
int
get_align_image_cw
(
int
cw
);
void
format_image
(
framework
::
Tensor
*
image_tensor
);
void
format_fp16_ofm
(
framework
::
Tensor
*
ofm_tensor
);
// only allocate memory
...
...
@@ -209,8 +44,5 @@ void fill_split_arg(struct SplitConvArgs* arg, framework::Tensor* input,
bool
relu_enabled
,
int
group_num
,
int
stride_h
,
int
stride_w
,
int
padding_h
,
int
padding_w
,
float
*
bs_ptr
);
half
fp32_2_fp16
(
float
fp32_num
);
float
fp16_2_fp32
(
half
fp16_num
);
}
// namespace fpga
}
// namespace paddle_mobile
src/fpga/V1/bias_scale.cpp
浏览文件 @
2bca7447
...
...
@@ -14,7 +14,7 @@ limitations under the License. */
#include "fpga/V1/bias_scale.h"
#include <memory.h>
#include "fpga/
V1/api
.h"
#include "fpga/
common/fpga_common
.h"
namespace
paddle_mobile
{
namespace
fpga
{
...
...
src/fpga/V1/filter.cpp
浏览文件 @
2bca7447
...
...
@@ -15,7 +15,7 @@ limitations under the License. */
#include "fpga/V1/filter.h"
#include <memory.h>
#include <algorithm>
#include "fpga/
V1/api
.h"
#include "fpga/
common/fpga_common
.h"
namespace
paddle_mobile
{
namespace
fpga
{
...
...
@@ -31,20 +31,22 @@ int calc_split_num(int num, int division_capacity) {
}
int
calc_division_number
(
int
num
,
int
group_num
,
int
division_capacity
)
{
PADDLE_MOBILE_ENFORCE
(
num
%
group_num
==
0
,
"Filter number should be divisible by group number"
);
// PADDLE_MOBILE_ENFORCE(num % group_num == 0,
// "Filter number should be divisible by group
// number");
int
split_num
=
calc_split_num
(
num
,
division_capacity
);
PADDLE_MOBILE_ENFORCE
(
group_num
==
1
||
split_num
==
1
,
"Split number or group number should be 1"
);
//
PADDLE_MOBILE_ENFORCE(group_num == 1 || split_num == 1,
//
"Split number or group number should be 1");
return
group_num
*
split_num
;
}
int
calc_num_per_div
(
int
num
,
int
group_num
,
int
division_capacity
)
{
PADDLE_MOBILE_ENFORCE
(
num
%
group_num
==
0
,
"Filter number should be divisible by group number"
);
// PADDLE_MOBILE_ENFORCE(num % group_num == 0,
// "Filter number should be divisible by group
// number");
int
split_num
=
calc_split_num
(
num
,
division_capacity
);
PADDLE_MOBILE_ENFORCE
(
group_num
==
1
||
split_num
==
1
,
"Split number or group number should be 1"
);
//
PADDLE_MOBILE_ENFORCE(group_num == 1 || split_num == 1,
//
"Split number or group number should be 1");
if
(
group_num
==
1
)
{
if
(
num
>
division_capacity
)
{
return
division_capacity
;
...
...
src/fpga/V1/image.cpp
浏览文件 @
2bca7447
...
...
@@ -15,7 +15,7 @@ limitations under the License. */
#include "fpga/V1/image.h"
#include <memory.h>
#include <algorithm>
#include "fpga/
V1/api
.h"
#include "fpga/
common/fpga_common
.h"
namespace
paddle_mobile
{
namespace
fpga
{
...
...
src/fpga/V1/pe.cpp
0 → 100644
浏览文件 @
2bca7447
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "fpga/common/pe.h"
#include "fpga/V1/filter.h"
#include "fpga/V1/image.h"
#include "fpga/common/config.h"
#include "fpga/common/driver.h"
namespace
paddle_mobile
{
namespace
fpga
{
// Runs the convolution described by `args` by forwarding its first entry,
// args.conv_arg[0], to ComputeBasicConv and returning that call's result.
// The function used to fall off its end without returning a value, which is
// undefined for an int function.
// NOTE(review): only conv_arg[0] is processed; the other splits and the concat
// step are not visited here - confirm this is intended for V1.
int ComputeFpgaConv(const struct SplitConvArgs &args) {
  return ComputeBasicConv(args.conv_arg[0]);
}
// Basic convolution entry point of the V1 processing-element layer.
// When FPGA_PRINT_MODE is defined, every field of `args` is logged. No hardware
// access happens in this version: the function returns 0 in every build.
int ComputeBasicConv(const struct ConvArgs &args) {
#ifdef FPGA_PRINT_MODE
  DLOG << "======Compute Basic Conv======";
  DLOG << "   relu_enabled:" << args.relu_enabled
       << "   sb_address:" << args.sb_address
       << "   filter_address:" << args.filter_address
       << "   filter_num:" << args.filter_num
       << "   group_num:" << args.group_num;
  DLOG << "   image_address:" << args.image.address
       << "   image_scale_address:" << args.image.scale_address
       << "   image_channels:" << args.image.channels
       << "   image_height:" << args.image.height
       << "   image_width:" << args.image.width
       << "   pad_height:" << args.image.pad_height
       << "   pad_width:" << args.image.pad_width;
  DLOG << "   kernel_height:" << args.kernel.height
       << "   kernel_width:" << args.kernel.width
       << "   stride_h:" << args.kernel.stride_h
       << "   stride_w:" << args.kernel.stride_w;
  DLOG << "   out_address:" << args.output.address
       << "   out_scale_address:" << args.output.scale_address;
#endif

#ifndef PADDLE_MOBILE_ZU5
  return 0;  // builds without PADDLE_MOBILE_ZU5 stop here
#endif
  return 0;
}
// Pooling entry point of the V1 processing-element layer.
// When FPGA_PRINT_MODE is defined, every field of `args` is logged. No hardware
// access happens in this version: the function returns 0 in every build.
int ComputeFpgaPool(const struct PoolingArgs &args) {
#ifdef FPGA_PRINT_MODE
  DLOG << "=============ComputeFpgaPool===========";
  DLOG << "   mode:" << args.mode
       << "               kernel_reciprocal:"
       << fp16_2_fp32(args.kernel_reciprocal);
  DLOG << "   image_address:" << args.image.address
       << "   image_scale_address:" << args.image.scale_address
       << "   image_channels:" << args.image.channels
       << "   image_height:" << args.image.height
       << "   image_width:" << args.image.width
       << "   pad_height:" << args.image.pad_height
       << "   pad_width:" << args.image.pad_width;
  DLOG << "   kernel_height:" << args.kernel.height
       << "   kernel_width:" << args.kernel.width
       << "   stride_h:" << args.kernel.stride_h
       << "   stride_w:" << args.kernel.stride_w;
  DLOG << "   out_address:" << args.output.address
       << "   out_scale_address:" << args.output.scale_address;
#endif

#ifndef PADDLE_MOBILE_ZU5
  return 0;  // builds without PADDLE_MOBILE_ZU5 stop here
#endif
  return 0;
}
// Element-wise add entry point of the V1 processing-element layer.
// When FPGA_PRINT_MODE is defined, every field of `args` is logged; the two
// constants are shown as floats by passing their low 16 bits through
// fp16_2_fp32. No hardware access happens in this version: the function
// returns 0 in every build.
int ComputeFpgaEWAdd(const struct EWAddArgs &args) {
#ifdef FPGA_PRINT_MODE
  DLOG << "=============ComputeFpgaEWAdd===========";
  DLOG << "   relu_enabled:" << args.relu_enabled
       << "   const0:" << fp16_2_fp32(int16_t(args.const0))
       << "   const1:" << fp16_2_fp32(int16_t(args.const1));
  DLOG << "   image0_address:" << args.image0.address
       << "   image0_scale_address:" << args.image0.scale_address
       << "   image0_channels:" << args.image0.channels
       << "   image0_height:" << args.image0.height
       << "   image0_width:" << args.image0.width
       << "   pad0_height:" << args.image0.pad_height
       << "   pad0_width:" << args.image0.pad_width;
  // The last label was "pad_width"; it is image1's padding, so it is now
  // labeled "pad1_width" like its siblings.
  DLOG << "   image1_address:" << args.image1.address
       << "   image1_scale_address:" << args.image1.scale_address
       << "   image1_channels:" << args.image1.channels
       << "   image1_height:" << args.image1.height
       << "   image1_width:" << args.image1.width
       << "   pad1_height:" << args.image1.pad_height
       << "   pad1_width:" << args.image1.pad_width;
  DLOG << "   out_address:" << args.output.address
       << "   out_scale_address:" << args.output.scale_address;
#endif

#ifndef PADDLE_MOBILE_ZU5
  return 0;  // builds without PADDLE_MOBILE_ZU5 stop here
#endif
  return 0;
}
// Bypass entry point of the V1 processing-element layer.
// When FPGA_PRINT_MODE is defined, the data/layout types and the image and
// output descriptors are logged. No hardware access happens in this version:
// the function returns 0 in every build.
int PerformBypass(const struct BypassArgs &args) {
#ifdef FPGA_PRINT_MODE
  DLOG << "=============ComputeFpgaBypass===========";
  DLOG << "   input_type:" << args.input_data_type
       << "   output_type:" << args.output_data_type
       << "   input_layout_type:" << args.input_layout_type
       << "   output_layout_type:" << args.output_layout_type;
  DLOG << "   image_address:" << args.image.address
       << "   image_scale_address:" << args.image.scale_address
       << "   image_channels:" << args.image.channels
       << "   image_height:" << args.image.height
       << "   image_width:" << args.image.width
       << "   pad_height:" << args.image.pad_height
       << "   pad_width:" << args.image.pad_width;
  DLOG << "   out_address:" << args.output.address
       << "   out_scale_address:" << args.output.scale_address;
#endif

#ifndef PADDLE_MOBILE_ZU5
  return 0;  // builds without PADDLE_MOBILE_ZU5 stop here
#endif
  return 0;
}
// Joins the input images described by `args` into args.image_out by calling
// image::concat_images.
// When FPGA_PRINT_MODE is defined, the arguments and each input are logged.
// Always returns 0.
int ComputeFPGAConcat(const struct ConcatArgs &args) {
#ifdef FPGA_PRINT_MODE
  DLOG << "=============ComputeFpgaConcat===========";
  DLOG << "   Image_num: " << args.image_num
       << "   out_address:" << args.image_out
       << "   out_scale_address:" << args.scale_out
       << "   out_channel:" << args.out_channel;
  DLOG << "   image_height:" << args.height
       << "   image_width:" << args.width;
  for (int i = 0; i < args.image_num; i++) {
    DLOG << "   " << i << "th:                        ";
    DLOG << "   channel_num:" << args.channel_num[i]
         << "   aligned_channel_num:" << args.aligned_channel_num[i]
         << "   image_address:" << args.images_in[i]
         << "   image_scale_address:" << args.scales_in[i];
  }
#endif

  image::concat_images(args.images_in, args.scales_in, args.image_out,
                       args.scale_out, args.image_num, args.channel_num,
                       args.height, args.width);
  return 0;
}
}
// namespace fpga
}
// namespace paddle_mobile
src/fpga/V2/api.cpp
浏览文件 @
2bca7447
...
...
@@ -13,84 +13,13 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "fpga/V2/api.h"
#include <algorithm>
#include "fpga/V2/bias_scale.h"
#include "fpga/V2/config.h"
#include "fpga/V2/driver/driver.h"
#include "fpga/V2/filter.h"
#include "fpga/V2/image.h"
namespace
paddle_mobile
{
namespace
fpga
{
static
std
::
map
<
void
*
,
size_t
>
memory_map
;
// Opens the FPGA device by delegating to driver::open_device_driver and
// returns that call's result.
int open_device() {
  return driver::open_device_driver();
}
// Closes the FPGA device by delegating to driver::close_device_driver and
// returns that call's result.
int close_device() {
  return driver::close_device_driver();
}
// Allocates `size` bytes for FPGA use and records the allocation in memory_map
// so that fpga_free() can later look up its size.
// With PADDLE_MOBILE_ZU5 the memory comes from driver::fpga_malloc_driver;
// otherwise it comes from malloc().
// Returns the pointer produced by the allocator.
void *fpga_malloc(size_t size) {
  static uint64_t counter = 0;  // running total of bytes handed out

#ifdef PADDLE_MOBILE_ZU5
  auto ptr = driver::fpga_malloc_driver(size);
#else
  auto ptr = malloc(size);
#endif
  memory_map.insert(std::make_pair(ptr, size));
  counter += size;
  // DLOG << "Address: " << ptr << ", " << size << " bytes allocated. Total "
  //      << counter << " bytes";
  return ptr;
}
// Releases memory previously returned by fpga_malloc().
// The pointer is looked up in memory_map to recover its size; an unknown
// pointer is only logged and otherwise ignored.
void fpga_free(void *ptr) {
  static uint64_t counter = 0;  // running total of bytes released

  auto iter = memory_map.find(ptr);
  if (iter == memory_map.end()) {
    DLOG << "Invalid pointer";
    return;
  }

  size_t size = iter->second;
  memory_map.erase(iter);
#ifdef PADDLE_MOBILE_ZU5
  driver::fpga_free_driver(ptr);
#else
  free(ptr);
#endif
  counter += size;
  // DLOG << "Address: " << ptr << ", " << size << " bytes freed. Total "
  //      << counter << " bytes";
}
// Copies `num` bytes from `src` to `dest`: through driver::fpga_copy_driver
// when PADDLE_MOBILE_ZU5 is defined, with a plain memcpy otherwise.
void fpga_copy(void *dest, const void *src, size_t num) {
#ifdef PADDLE_MOBILE_ZU5
  driver::fpga_copy_driver(dest, src, num);
#else
  memcpy(dest, src, num);
#endif
}
// Flushes the cache for [address, address + size) through
// driver::fpga_flush_driver when PADDLE_MOBILE_ZU5 is defined.
// Without it the call is a no-op that returns 0.
int fpga_flush(void *address, size_t size) {
#ifdef PADDLE_MOBILE_ZU5
  return driver::fpga_flush_driver(address, size);
#else
  return 0;
#endif
}
// Invalidates the cache for [address, address + size) through
// driver::fpga_invalidate_driver when PADDLE_MOBILE_ZU5 is defined.
// Without it the call is a no-op that returns 0.
int fpga_invalidate(void *address, size_t size) {
#ifdef PADDLE_MOBILE_ZU5
  return driver::fpga_invalidate_driver(address, size);
#else
  return 0;
#endif
}
void
format_image
(
framework
::
Tensor
*
image_tensor
)
{
auto
dims
=
image_tensor
->
dims
();
auto
channel
=
dims
[
1
],
height
=
dims
[
2
],
width
=
dims
[
3
];
...
...
@@ -284,8 +213,8 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input,
arg
->
conv_arg
[
i
].
output
.
address
=
out_ptr
;
arg
->
conv_arg
[
i
].
output
.
scale_address
=
out
->
scale
;
int
num_after_alignment
=
filter
::
calc_aligned_num
((
int
)
input
->
dims
()[
1
],
arg
->
filter_num
);
int
num_after_alignment
=
filter
::
calc_aligned_num
(
(
int
)
input
->
dims
()[
1
],
arg
->
filter_num
);
// NOLINT
arg
->
conv_arg
[
i
].
free_space
=
fpga_malloc
(
num_after_alignment
*
2
*
sizeof
(
half
));
}
...
...
src/fpga/V2/api.h
浏览文件 @
2bca7447
...
...
@@ -14,21 +14,13 @@ limitations under the License. */
#pragma once
#include "fpga/
V2/driver/pe
.h"
#include "fpga/
V2/fpga_common
.h"
#include "fpga/
common/fpga_common
.h"
#include "fpga/
common/pe
.h"
#include "framework/tensor.h"
namespace
paddle_mobile
{
namespace
fpga
{
int
open_device
();
int
close_device
();
void
*
fpga_malloc
(
size_t
size
);
void
fpga_free
(
void
*
ptr
);
void
fpga_copy
(
void
*
dest
,
const
void
*
src
,
size_t
num
);
int
fpga_flush
(
void
*
address
,
size_t
size
);
int
fpga_invalidate
(
void
*
address
,
size_t
size
);
float
filter_find_max
(
framework
::
Tensor
*
filter_tensor
);
int
get_aligned_channel_num
(
int
channel_num
);
int
get_aligned_filter_num
(
framework
::
Tensor
*
filter_tensor
);
...
...
src/fpga/V2/bias_scale.cpp
浏览文件 @
2bca7447
...
...
@@ -14,7 +14,7 @@ limitations under the License. */
#include "fpga/V2/bias_scale.h"
#include <memory.h>
#include "fpga/
V2/api
.h"
#include "fpga/
common/fpga_common
.h"
namespace
paddle_mobile
{
namespace
fpga
{
...
...
src/fpga/V2/filter.cpp
浏览文件 @
2bca7447
...
...
@@ -15,7 +15,7 @@ limitations under the License. */
#include "fpga/V2/filter.h"
#include <memory.h>
#include <algorithm>
#include "fpga/
V2/api
.h"
#include "fpga/
common/fpga_common
.h"
namespace
paddle_mobile
{
namespace
fpga
{
...
...
@@ -73,7 +73,7 @@ void convert_to_hwc(float **data_in, int num, int channel, int height,
void
align_filter
(
float
**
data_in
,
int
num
,
int
channel
,
int
height
,
int
width
)
{
int
aligned_channel
=
calc_
channel_parallelism
(
channel
);
int
aligned_channel
=
calc_
aligned_channel
(
channel
);
int
hw
=
height
*
width
;
int
pixel_num
=
calc_aligned_total_pixel_num
(
num
,
channel
,
height
,
width
);
float
*
new_data
=
(
float
*
)
fpga_malloc
(
pixel_num
*
sizeof
(
float
));
// NOLINT
...
...
src/fpga/V2/image.cpp
浏览文件 @
2bca7447
...
...
@@ -15,7 +15,7 @@ limitations under the License. */
#include "fpga/V2/image.h"
#include <memory.h>
#include <algorithm>
#include "fpga/
V2/api
.h"
#include "fpga/
common/fpga_common
.h"
namespace
paddle_mobile
{
namespace
fpga
{
...
...
src/fpga/V2/
driver/
pe.cpp
→
src/fpga/V2/pe.cpp
浏览文件 @
2bca7447
...
...
@@ -12,11 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "fpga/V2/driver/pe.h"
#include "fpga/V2/config.h"
#include "fpga/V2/driver/driver.h"
#include "fpga/common/pe.h"
#include "fpga/V2/filter.h"
#include "fpga/V2/image.h"
#include "fpga/common/config.h"
#include "fpga/common/driver.h"
namespace
paddle_mobile
{
namespace
fpga
{
...
...
@@ -166,53 +166,53 @@ int PerformBypass(const struct BypassArgs &args) {
return
0
;
#endif
uint64_t
ifm_src_paddr
=
driver
::
vaddr_to_paddr
(
args
.
image
.
address
);
uint64_t
ifm_dst_paddr
=
driver
::
vaddr_to_paddr
(
args
.
output
.
address
);
uint64_t
bp_enable
;
int64_t
length
;
uint64_t
pixels
;
// fp32->fp16
if
((
args
.
input_data_type
)
&&
(
!
args
.
output_data_type
))
{
pixels
=
(
args
.
image
.
channels
)
*
(
args
.
image
.
width
)
*
(
args
.
image
.
height
);
length
=
pixels
*
sizeof
(
float
);
bp_enable
=
0x8800000000000000
+
length
;
}
// fp16->fp32
else
if
((
!
args
.
input_data_type
)
&&
(
args
.
output_data_type
))
{
pixels
=
filter
::
calc_aligned_channel
((
args
.
image
.
channels
))
*
(
args
.
image
.
width
)
*
(
args
.
image
.
height
);
length
=
pixels
*
sizeof
(
short
);
length
=
align_to_x
((
int
)
length
,
64
);
// NOLINT
bp_enable
=
0x8a00000000000000
+
length
;
}
// fp16->fp16 findmax
else
if
((
!
args
.
input_data_type
)
&&
(
!
args
.
output_data_type
))
{
pixels
=
(
args
.
image
.
channels
)
*
(
args
.
image
.
width
)
*
(
args
.
image
.
height
);
length
=
pixels
*
sizeof
(
short
);
bp_enable
=
0x8900000000000000
+
length
;
}
else
{
return
-
1
;
}
// start bypass
driver
::
reg_writeq
(
ifm_src_paddr
,
MUL8
(
27
));
driver
::
reg_writeq
(
ifm_dst_paddr
,
MUL8
(
28
));
driver
::
reg_writeq
(
0
,
MUL8
(
0
));
driver
::
reg_writeq
(
bp_enable
,
MUL8
(
0
));
// poll
int
ret
=
-
1
;
ret
=
driver
::
fpga_regpoll
(
MUL8
(
48
),
BYPASS_DONE
,
0xffffffff
);
if
(
ret
!=
-
1
)
{
// clear "irq"
driver
::
reg_readq
(
MUL8
(
63
));
}
// get max value
if
((
!
args
.
input_data_type
)
&&
(
!
args
.
output_data_type
))
{
float
scale
=
Findfp16Max
();
args
.
output
.
scale_address
[
0
]
=
(
float
)(
1.0
/
scale
);
// NOLINT
args
.
output
.
scale_address
[
1
]
=
scale
;
}
//
uint64_t ifm_src_paddr = driver::vaddr_to_paddr(args.image.address);
//
uint64_t ifm_dst_paddr = driver::vaddr_to_paddr(args.output.address);
//
uint64_t bp_enable;
//
int64_t length;
//
uint64_t pixels;
//
//
//
fp32->fp16
//
if ((args.input_data_type) && (!args.output_data_type)) {
// pixels = (args.image.channels) * (args.image.width) *
// (args.image.height); length = pixels * sizeof(float); bp_enable =
//
0x8800000000000000 + length;
//
}
//
//
fp16->fp32
//
else if ((!args.input_data_type) && (args.output_data_type)) {
//
pixels = filter::calc_aligned_channel((args.image.channels)) *
//
(args.image.width) * (args.image.height);
//
length = pixels * sizeof(short);
//
length = align_to_x((int)length, 64); // NOLINT
//
bp_enable = 0x8a00000000000000 + length;
//
}
//
//
fp16->fp16 findmax
//
else if ((!args.input_data_type) && (!args.output_data_type)) {
// pixels = (args.image.channels) * (args.image.width) *
// (args.image.height); length = pixels * sizeof(short); bp_enable =
//
0x8900000000000000 + length;
//
} else {
//
return -1;
//
}
//
//
//
start bypass
//
driver::reg_writeq(ifm_src_paddr, MUL8(27));
//
driver::reg_writeq(ifm_dst_paddr, MUL8(28));
//
driver::reg_writeq(0, MUL8(0));
//
driver::reg_writeq(bp_enable, MUL8(0));
//
//
poll
//
int ret = -1;
//
ret = driver::fpga_regpoll(MUL8(48), BYPASS_DONE, 0xffffffff);
//
if (ret != -1) {
//
// clear "irq"
//
driver::reg_readq(MUL8(63));
//
}
//
//
get max value
//
if ((!args.input_data_type) && (!args.output_data_type)) {
//
float scale = Findfp16Max();
//
args.output.scale_address[0] = (float)(1.0 / scale); // NOLINT
//
args.output.scale_address[1] = scale;
//
}
return
ret
;
}
...
...
src/fpga/
V2/driver
/bitmap.cpp
→
src/fpga/
common
/bitmap.cpp
浏览文件 @
2bca7447
...
...
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "fpga/
V2/driver
/bitmap.h"
#include "fpga/
common
/bitmap.h"
namespace
fpga_bitmap
{
void
bitmap_set
(
uint64_t
*
map
,
unsigned
int
start
,
int
len
)
{
...
...
src/fpga/
V2/driver
/bitmap.h
→
src/fpga/
common
/bitmap.h
浏览文件 @
2bca7447
文件已移动
src/fpga/
V2
/config.h
→
src/fpga/
common
/config.h
浏览文件 @
2bca7447
文件已移动
src/fpga/
V2/driver
/driver.cpp
→
src/fpga/
common
/driver.cpp
浏览文件 @
2bca7447
...
...
@@ -28,8 +28,8 @@ limitations under the License. */
#include <iostream>
#include "common/enforce.h"
#include "fpga/
V2/driver
/bitmap.h"
#include "fpga/
V2/driver
/driver.h"
#include "fpga/
common
/bitmap.h"
#include "fpga/
common
/driver.h"
namespace
paddle_mobile
{
namespace
fpga
{
...
...
@@ -353,7 +353,7 @@ void fpga_free_driver(void *ptr) {
}
}
static
inline
int
do_ioctl
(
unsigned
long
req
,
const
void
*
arg
)
{
static
inline
int
do_ioctl
(
int64_t
req
,
const
void
*
arg
)
{
return
ioctl
(
g_fpgainfo
.
fd_mem
,
req
,
arg
);
}
...
...
@@ -363,7 +363,7 @@ int fpga_flush_driver(void *address, size_t size) {
p_addr
=
vaddr_to_paddr
(
address
);
args
.
offset
=
(
void
*
)(
p_addr
-
FPGA_MEM_PHY_ADDR
);
args
.
offset
=
(
void
*
)(
p_addr
-
FPGA_MEM_PHY_ADDR
);
// NOLINT
args
.
size
=
size
;
return
do_ioctl
(
IOCTL_MEMCACHE_FLUSH
,
&
args
);
...
...
@@ -375,7 +375,7 @@ int fpga_invalidate_driver(void *address, size_t size) {
p_addr
=
vaddr_to_paddr
(
address
);
args
.
offset
=
(
void
*
)(
p_addr
-
FPGA_MEM_PHY_ADDR
);
args
.
offset
=
(
void
*
)(
p_addr
-
FPGA_MEM_PHY_ADDR
);
// NOLINT
args
.
size
=
size
;
return
do_ioctl
(
IOCTL_MEMCACHE_INVAL
,
&
args
);
...
...
@@ -389,7 +389,7 @@ void fpga_copy_driver(void *dest, const void *src, size_t num) {
for
(
i
=
0
;
i
<
num
;
i
++
)
{
// DLOG << "i:" << i << " val:" << *((int8_t *)src + i);
// usleep(1);
*
((
int8_t
*
)
dest
+
i
)
=
*
((
int8_t
*
)
src
+
i
);
*
((
int8_t
*
)
dest
+
i
)
=
*
((
int8_t
*
)
src
+
i
);
// NOLINT
}
return
;
...
...
src/fpga/
V2/driver
/driver.h
→
src/fpga/
common
/driver.h
浏览文件 @
2bca7447
...
...
@@ -33,8 +33,6 @@ namespace driver {
#define FPGA_MEM_PHY_ADDR 0x20000000
#define FPGA_MEM_SIZE 0x20000000
#define CPU_FREQ 1000000000
#define FPGA_PAGE_SIZE (16UL * 1024UL)
// PE related macros
...
...
@@ -53,7 +51,7 @@ struct MemoryCacheArgs {
size_t
size
;
};
#define IOCTL_FPGA_MAGIC 'F
PGA
'
#define IOCTL_FPGA_MAGIC 'F'
#define IOCTL_MEMCACHE_INVAL _IOW(IOCTL_FPGA_MAGIC, 12, struct MemoryCacheArgs)
#define IOCTL_MEMCACHE_FLUSH _IOW(IOCTL_FPGA_MAGIC, 13, struct MemoryCacheArgs)
...
...
@@ -105,17 +103,17 @@ extern struct FPGA_INFO g_fpgainfo;
inline
uint64_t
reg_readq
(
uint32_t
offset
)
{
// DLOG << "offset : " << offset;
uint64_t
value
=
*
(
volatile
uint64_t
*
)((
uint8_t
*
)
g_fpgainfo
.
FpgaRegVirAddr
+
offset
);
// NOLINT
uint64_t
value
=
*
(
volatile
uint64_t
*
)((
uint8_t
*
)
g_fpgainfo
.
FpgaRegVirAddr
+
// NOLINT
offset
);
// NOLINT
return
value
;
}
inline
void
reg_writeq
(
uint64_t
value
,
uint32_t
offset
)
{
// DLOG << "offset : " << offset << ", value : " << value;
*
(
volatile
uint64_t
*
)((
uint8_t
*
)
g_fpgainfo
.
FpgaRegVirAddr
+
offset
)
=
// NOLINT
value
;
*
(
volatile
uint64_t
*
)((
uint8_t
*
)
g_fpgainfo
.
FpgaRegVirAddr
+
// NOLINT
offset
)
=
value
;
}
int
open_device_driver
();
...
...
src/fpga/
V2
/fpga_common.cpp
→
src/fpga/
common
/fpga_common.cpp
浏览文件 @
2bca7447
...
...
@@ -12,7 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <fpga/V2/fpga_common.h>
#include "fpga/common/fpga_common.h"
#include <algorithm>
#include <map>
#include "fpga/common/config.h"
#include "fpga/common/driver.h"
namespace
paddle_mobile
{
namespace
fpga
{
...
...
@@ -40,5 +45,73 @@ float fp16_2_fp32(int16_t fp16_num) {
return
fp32_num
;
}
// Bookkeeping table for fpga_malloc / fpga_free: maps each pointer returned
// by fpga_malloc to the byte size that was requested for it. fpga_free looks
// the pointer up here and rejects pointers that are not present.
static
std
::
map
<
void
*
,
size_t
>
memory_map
;
// Opens the FPGA device by delegating to the driver layer; the driver's
// return code is passed back to the caller unchanged.
int open_device() { return driver::open_device_driver(); }
// Closes the FPGA device by delegating to the driver layer; the driver's
// return code is passed back to the caller unchanged.
int close_device() { return driver::close_device_driver(); }
// Allocates `size` bytes and records the allocation in memory_map so that
// fpga_free can later validate and release it.
//
// With PADDLE_MOBILE_ZU5 defined the memory comes from
// driver::fpga_malloc_driver; otherwise it comes from malloc.
//
// Returns the allocated pointer, or nullptr when the underlying allocator
// fails. A failed allocation is NOT recorded in memory_map and does not
// count toward the running byte total.
void *fpga_malloc(size_t size) {
  // Running total of bytes handed out; only used by the debug log below.
  static uint64_t counter = 0;

#ifdef PADDLE_MOBILE_ZU5
  auto ptr = driver::fpga_malloc_driver(size);
#else
  auto ptr = malloc(size);
#endif

  if (ptr == nullptr) {
    // Nothing was allocated, so there is nothing to track. Recording a null
    // key would make a later fpga_free(nullptr) look like a valid release.
    return nullptr;
  }

  counter += size;
  memory_map.insert(std::make_pair(ptr, size));
  // DLOG << "Address: " << ptr << ", " << size << " bytes allocated. Total "
  //      << counter << " bytes";
  return ptr;
}
// Releases a pointer previously returned by fpga_malloc.
//
// The pointer is looked up in memory_map first. Unknown pointers are only
// logged ("Invalid pointer") and left untouched. Known pointers are removed
// from the map and then released through driver::fpga_free_driver
// (PADDLE_MOBILE_ZU5 builds) or free (all other builds).
void fpga_free(void *ptr) {
  // Running total of bytes released; only used by the debug log below.
  static uint64_t counter = 0;

  const auto entry = memory_map.find(ptr);
  if (entry == memory_map.end()) {
    DLOG << "Invalid pointer";
    return;
  }

  // Read the size before erasing, since erase invalidates the iterator.
  const size_t freed_bytes = entry->second;
  memory_map.erase(entry);
#ifdef PADDLE_MOBILE_ZU5
  driver::fpga_free_driver(ptr);
#else
  free(ptr);
#endif
  counter += freed_bytes;
  // DLOG << "Address: " << ptr << ", " << freed_bytes << " bytes freed. Total "
  //      << counter << " bytes";
}
// Copies `num` bytes from `src` to `dest`.
// Builds with PADDLE_MOBILE_ZU5 defined use driver::fpga_copy_driver; all
// other builds use memcpy.
void fpga_copy(void *dest, const void *src, size_t num) {
#ifndef PADDLE_MOBILE_ZU5
  memcpy(dest, src, num);
#else
  driver::fpga_copy_driver(dest, src, num);
#endif
}
// Cache-flush request for the `size` bytes starting at `address`.
// Builds with PADDLE_MOBILE_ZU5 defined hand the request to
// driver::fpga_flush_driver and return its result; all other builds do
// nothing and report 0.
int fpga_flush(void *address, size_t size) {
#ifndef PADDLE_MOBILE_ZU5
  return 0;
#else
  return driver::fpga_flush_driver(address, size);
#endif
}
// Cache-invalidate request for the `size` bytes starting at `address`.
// Builds with PADDLE_MOBILE_ZU5 defined hand the request to
// driver::fpga_invalidate_driver and return its result; all other builds
// do nothing and report 0.
int fpga_invalidate(void *address, size_t size) {
#ifndef PADDLE_MOBILE_ZU5
  return 0;
#else
  return driver::fpga_invalidate_driver(address, size);
#endif
}
}
// namespace fpga
}
// namespace paddle_mobile
src/fpga/
V2
/fpga_common.h
→
src/fpga/
common
/fpga_common.h
浏览文件 @
2bca7447
...
...
@@ -14,6 +14,7 @@ limitations under the License. */
#pragma once
#include <cstddef>
#include <cstdint>
namespace
paddle_mobile
{
...
...
@@ -117,9 +118,19 @@ struct BypassArgs {
// Argument bundle that currently wraps a single ConvArgs member.
// NOTE(review): presumably the parameters for a deconvolution layer, judging
// by the name only - confirm against the code that fills and consumes it.
struct
DeconvArgs
{
struct
ConvArgs
conv_arg
;
};
// Rounds `num` up to a multiple of `x` using integer arithmetic:
// count how many x-sized groups are needed, then scale back up.
// `x` must be non-zero (it is used as a divisor).
static inline int align_to_x(int num, int x) {
  const int groups = (num + x - 1) / x;
  return groups * x;
}
int16_t
fp32_2_fp16
(
float
fp32_num
);
float
fp16_2_fp32
(
int16_t
fp16_num
);
int
open_device
();
int
close_device
();
void
*
fpga_malloc
(
size_t
size
);
void
fpga_free
(
void
*
ptr
);
void
fpga_copy
(
void
*
dest
,
const
void
*
src
,
size_t
num
);
int
fpga_flush
(
void
*
address
,
size_t
size
);
int
fpga_invalidate
(
void
*
address
,
size_t
size
);
}
// namespace fpga
}
// namespace paddle_mobile
src/fpga/
V2/driver
/pe.h
→
src/fpga/
common
/pe.h
浏览文件 @
2bca7447
...
...
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "fpga/
V2
/fpga_common.h"
#include "fpga/
common
/fpga_common.h"
namespace
paddle_mobile
{
namespace
fpga
{
...
...
test/CMakeLists.txt
浏览文件 @
2bca7447
...
...
@@ -67,9 +67,6 @@ if (CON GREATER -1)
ADD_EXECUTABLE
(
test-resnet50 fpga/test_resnet50.cpp test_helper.h test_include.h executor_for_test.h
)
target_link_libraries
(
test-resnet50 paddle-mobile
)
ADD_EXECUTABLE
(
test-densebox net/test_densebox_combine.cpp test_helper.h test_include.h executor_for_test.h
)
target_link_libraries
(
test-densebox paddle-mobile
)
set
(
FOUND_MATCH ON
)
endif
()
...
...
@@ -81,9 +78,6 @@ if (CON GREATER -1)
ADD_EXECUTABLE
(
test-pe fpga/test_pe.cpp
)
target_link_libraries
(
test-pe paddle-mobile
)
ADD_EXECUTABLE
(
test-densebox net/test_densebox_combine.cpp test_helper.h test_include.h executor_for_test.h
)
target_link_libraries
(
test-densebox paddle-mobile
)
set
(
FOUND_MATCH ON
)
endif
()
...
...
tools/op.cmake
浏览文件 @
2bca7447
...
...
@@ -102,7 +102,6 @@ if (CON GREATER -1)
set
(
MUL_OP ON
)
set
(
RESHAPE_OP ON
)
set
(
SOFTMAX_OP ON
)
set
(
FOUND_MATCH ON
)
endif
()
...
...
@@ -120,14 +119,12 @@ if (CON GREATER -1)
set
(
SOFTMAX_OP ON
)
set
(
FUSION_CONVBNRELU_OP ON
)
set
(
FUSION_CONVBN_OP ON
)
set
(
FUSION_CONVADD_OP ON
)
set
(
FOUND_MATCH ON
)
endif
()
list
(
FIND NET
"FPGA_NET_V2"
CON
)
if
(
CON GREATER -1
)
message
(
"FPGA_NET_V2 enabled"
)
set
(
FEED_OP ON
)
set
(
FUSION_CONVADDRELU_OP ON
)
set
(
FUSION_ELEMENTWISEADDRELU_OP ON
)
set
(
FUSION_FC_OP ON
)
...
...
@@ -136,8 +133,6 @@ if (CON GREATER -1)
set
(
FUSION_CONVBNRELU_OP ON
)
set
(
FUSION_CONVBN_OP ON
)
set
(
CONV_TRANSPOSE_OP ON
)
set
(
FUSION_DECONVRELU_OP ON
)
#set(SLICE_OP ON)
set
(
TANH_OP ON
)
set
(
ELEMENTWISEADD_OP ON
)
set
(
TRANSPOSE2_OP ON
)
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录