Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
Paddle-Lite
提交
2f507f76
P
Paddle-Lite
项目概览
PaddlePaddle
/
Paddle-Lite
通知
338
Star
4
Fork
1
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
271
列表
看板
标记
里程碑
合并请求
78
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle-Lite
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
271
Issue
271
列表
看板
标记
里程碑
合并请求
78
合并请求
78
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
2f507f76
编写于
3月 06, 2019
作者:
J
jameswu2014
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
75percentParallel+kerneldriver+ROIALIGN+psroi-bug
上级
74580de7
变更
19
隐藏空白更改
内联
并排
Showing
19 changed file
with
723 addition
and
520 deletion
+723
-520
src/common/types.cpp
src/common/types.cpp
+2
-0
src/common/types.h
src/common/types.h
+1
-0
src/fpga/V1/api.cpp
src/fpga/V1/api.cpp
+2
-2
src/fpga/common/bitmap.cpp
src/fpga/common/bitmap.cpp
+0
-131
src/fpga/common/bitmap.h
src/fpga/common/bitmap.h
+0
-37
src/fpga/common/driver.cpp
src/fpga/common/driver.cpp
+15
-130
src/fpga/common/driver.h
src/fpga/common/driver.h
+9
-2
src/fpga/common/fpga_common.h
src/fpga/common/fpga_common.h
+1
-0
src/operators/detection_ops.cpp
src/operators/detection_ops.cpp
+22
-0
src/operators/detection_ops.h
src/operators/detection_ops.h
+5
-0
src/operators/kernel/detection_kernel.h
src/operators/kernel/detection_kernel.h
+38
-0
src/operators/kernel/fpga/V1/fetch_kernel.cpp
src/operators/kernel/fpga/V1/fetch_kernel.cpp
+13
-5
src/operators/kernel/fpga/V1/pool_kernel.cpp
src/operators/kernel/fpga/V1/pool_kernel.cpp
+3
-1
src/operators/kernel/fpga/V1/proposal_kernel.cpp
src/operators/kernel/fpga/V1/proposal_kernel.cpp
+17
-4
src/operators/kernel/fpga/V1/psroi_pool_kernel.cpp
src/operators/kernel/fpga/V1/psroi_pool_kernel.cpp
+256
-208
src/operators/kernel/fpga/V1/roialign_pool_kernel.cpp
src/operators/kernel/fpga/V1/roialign_pool_kernel.cpp
+330
-0
src/operators/kernel/fpga/V1/softmax_kernel.cpp
src/operators/kernel/fpga/V1/softmax_kernel.cpp
+1
-0
src/operators/kernel/fpga/V1/transpose2_kernel.cpp
src/operators/kernel/fpga/V1/transpose2_kernel.cpp
+4
-0
tools/op.cmake
tools/op.cmake
+4
-0
未找到文件。
src/common/types.cpp
浏览文件 @
2f507f76
...
@@ -109,6 +109,7 @@ const char *G_OP_TYPE_SLICE = "slice";
...
@@ -109,6 +109,7 @@ const char *G_OP_TYPE_SLICE = "slice";
const
char
*
G_OP_TYPE_ANCHOR_GENERATOR
=
"anchor_generator"
;
const
char
*
G_OP_TYPE_ANCHOR_GENERATOR
=
"anchor_generator"
;
const
char
*
G_OP_TYPE_GENERATE_PROPOSALS
=
"generate_proposals"
;
const
char
*
G_OP_TYPE_GENERATE_PROPOSALS
=
"generate_proposals"
;
const
char
*
G_OP_TYPE_PSROI_POOL
=
"psroi_pool"
;
const
char
*
G_OP_TYPE_PSROI_POOL
=
"psroi_pool"
;
const
char
*
G_OP_TYPE_ROIALIGN_POOL
=
"roialign_pool"
;
const
char
*
G_OP_TYPE_ROI_PERSPECTIVE
=
"roi_perspective_transform"
;
const
char
*
G_OP_TYPE_ROI_PERSPECTIVE
=
"roi_perspective_transform"
;
const
char
*
G_OP_TYPE_PAD2D
=
"pad2d"
;
const
char
*
G_OP_TYPE_PAD2D
=
"pad2d"
;
const
char
*
G_OP_TYPE_FUSION_DECONV_ADD_BN_RELU
=
"fusion_deconv_add_bn_relu"
;
const
char
*
G_OP_TYPE_FUSION_DECONV_ADD_BN_RELU
=
"fusion_deconv_add_bn_relu"
;
...
@@ -213,6 +214,7 @@ std::unordered_map<
...
@@ -213,6 +214,7 @@ std::unordered_map<
{{
"Scores"
,
"BboxDeltas"
,
"ImInfo"
,
"Anchors"
,
"Variances"
},
{{
"Scores"
,
"BboxDeltas"
,
"ImInfo"
,
"Anchors"
,
"Variances"
},
{
"RpnRois"
,
"RpnRoiProbs"
}}},
{
"RpnRois"
,
"RpnRoiProbs"
}}},
{
G_OP_TYPE_PSROI_POOL
,
{{
"X"
,
"ROIs"
},
{
"Out"
}}},
{
G_OP_TYPE_PSROI_POOL
,
{{
"X"
,
"ROIs"
},
{
"Out"
}}},
{
G_OP_TYPE_ROIALIGN_POOL
,
{{
"X"
,
"ROIs"
},
{
"Out"
}}},
{
G_OP_TYPE_ROI_PERSPECTIVE
,
{{
"X"
,
"ROIs"
},
{
"Out"
}}},
{
G_OP_TYPE_ROI_PERSPECTIVE
,
{{
"X"
,
"ROIs"
},
{
"Out"
}}},
{
G_OP_TYPE_FUSION_DECONV_ADD_BN_RELU
,
{{
"Input"
},
{
"Out"
}}},
{
G_OP_TYPE_FUSION_DECONV_ADD_BN_RELU
,
{{
"Input"
},
{
"Out"
}}},
{
G_OP_TYPE_FUSION_DECONV_ADD_BN
,
{{
"Input"
},
{
"Out"
}}},
{
G_OP_TYPE_FUSION_DECONV_ADD_BN
,
{{
"Input"
},
{
"Out"
}}},
...
...
src/common/types.h
浏览文件 @
2f507f76
...
@@ -198,6 +198,7 @@ extern const char *G_OP_TYPE_SLICE;
...
@@ -198,6 +198,7 @@ extern const char *G_OP_TYPE_SLICE;
extern
const
char
*
G_OP_TYPE_ANCHOR_GENERATOR
;
extern
const
char
*
G_OP_TYPE_ANCHOR_GENERATOR
;
extern
const
char
*
G_OP_TYPE_GENERATE_PROPOSALS
;
extern
const
char
*
G_OP_TYPE_GENERATE_PROPOSALS
;
extern
const
char
*
G_OP_TYPE_PSROI_POOL
;
extern
const
char
*
G_OP_TYPE_PSROI_POOL
;
extern
const
char
*
G_OP_TYPE_ROIALIGN_POOL
;
extern
const
char
*
G_OP_TYPE_ROI_PERSPECTIVE
;
extern
const
char
*
G_OP_TYPE_ROI_PERSPECTIVE
;
extern
const
char
*
G_OP_TYPE_PAD2D
;
extern
const
char
*
G_OP_TYPE_PAD2D
;
extern
const
char
*
G_OP_TYPE_FUSION_DECONV_ADD_BN_RELU
;
extern
const
char
*
G_OP_TYPE_FUSION_DECONV_ADD_BN_RELU
;
...
...
src/fpga/V1/api.cpp
浏览文件 @
2f507f76
...
@@ -368,9 +368,9 @@ void expand_conv_arg(ConvArgs *arg) {
...
@@ -368,9 +368,9 @@ void expand_conv_arg(ConvArgs *arg) {
auto
filter_pad_width_mul_channel
=
auto
filter_pad_width_mul_channel
=
args
.
image
.
pad_width
*
args
.
image
.
channels
;
args
.
image
.
pad_width
*
args
.
image
.
channels
;
auto
image_amount_per_row_multi_win_first
=
auto
image_amount_per_row_multi_win_first
=
image_amount_per_row
*
(
2
*
args
.
kernel
.
stride_h
-
args
.
image
.
pad_height
);
image_amount_per_row
*
(
ROW_PARALLEL_NUM
*
args
.
kernel
.
stride_h
-
args
.
image
.
pad_height
);
auto
image_amount_per_row_multi_win
=
auto
image_amount_per_row_multi_win
=
image_amount_per_row
*
(
2
*
args
.
kernel
.
stride_h
);
image_amount_per_row
*
(
ROW_PARALLEL_NUM
*
args
.
kernel
.
stride_h
);
auto
image_block_num
=
block_num
;
auto
image_block_num
=
block_num
;
auto
image_block_len
=
auto
image_block_len
=
...
...
src/fpga/common/bitmap.cpp
已删除
100644 → 0
浏览文件 @
74580de7
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "fpga/common/bitmap.h"
namespace
fpga_bitmap
{
/**
 * Set `len` consecutive bits of `map`, starting at bit index `start`.
 *
 * The bitmap is an array of 64-bit words; bit `i` lives in word `i / 64` at
 * position `i % 64`. All masks are built from explicit 64-bit values so the
 * result does not depend on the width of `unsigned long` (32 bits on ILP32
 * targets, where `~0UL` would only cover the low half of a word).
 *
 * @param map    Bitmap storage; must cover bits [start, start + len).
 * @param start  Index of the first bit to set.
 * @param len    Number of bits to set; values <= 0 leave the map untouched.
 */
void bitmap_set(uint64_t *map, unsigned int start, int len) {
  if (len <= 0) {
    return;
  }

  constexpr unsigned int kWordBits = 64;
  constexpr uint64_t kAllOnes = ~static_cast<uint64_t>(0);

  uint64_t *word = map + start / kWordBits;
  const unsigned int end = start + static_cast<unsigned int>(len);
  // The first word may be entered part-way through.
  int bits_in_word = static_cast<int>(kWordBits - start % kWordBits);
  uint64_t mask = kAllOnes << (start % kWordBits);

  // Consume every word that is covered up to its last bit.
  while (len >= bits_in_word) {
    *word |= mask;
    len -= bits_in_word;
    bits_in_word = static_cast<int>(kWordBits);
    mask = kAllOnes;
    ++word;
  }

  // Trailing partial word: keep only the bits below `end`.
  if (len > 0) {
    mask &= kAllOnes >> ((0u - end) & (kWordBits - 1u));
    *word |= mask;
  }
}
/**
 * Clear `len` consecutive bits of `map`, starting at bit index `start`.
 *
 * Mirror image of bitmap_set(): the same word walk, but the mask is removed
 * from each word instead of being OR-ed in. Masks are explicit 64-bit values
 * so the result does not depend on the width of `unsigned long`.
 *
 * @param map    Bitmap storage; must cover bits [start, start + len).
 * @param start  Index of the first bit to clear.
 * @param len    Number of bits to clear; values <= 0 leave the map untouched.
 */
void bitmap_clear(uint64_t *map, unsigned int start, int len) {
  if (len <= 0) {
    return;
  }

  constexpr unsigned int kWordBits = 64;
  constexpr uint64_t kAllOnes = ~static_cast<uint64_t>(0);

  uint64_t *word = map + start / kWordBits;
  const unsigned int end = start + static_cast<unsigned int>(len);
  // The first word may be entered part-way through.
  int bits_in_word = static_cast<int>(kWordBits - start % kWordBits);
  uint64_t mask = kAllOnes << (start % kWordBits);

  // Consume every word that is covered up to its last bit.
  while (len >= bits_in_word) {
    *word &= ~mask;
    len -= bits_in_word;
    bits_in_word = static_cast<int>(kWordBits);
    mask = kAllOnes;
    ++word;
  }

  // Trailing partial word: touch only the bits below `end`.
  if (len > 0) {
    mask &= kAllOnes >> ((0u - end) & (kWordBits - 1u));
    *word &= ~mask;
  }
}
/**
 * Return the zero-based index of the least significant set bit of `data`.
 *
 * Returns 0 when `data` is 0, so callers must rule out an all-zero word
 * themselves (a result of 0 is otherwise ambiguous with "bit 0 is set").
 *
 * The probe bit is built from a 64-bit one: shifting `1UL` by 32 or more is
 * undefined where `unsigned long` is only 32 bits wide.
 */
static uint64_t ffs(uint64_t data) {
  for (uint64_t i = 0; i < sizeof(data) * 8; ++i) {
    if (data & (uint64_t{1} << i)) {
      return i;
    }
  }
  return 0;
}
/**
 * Shared scan behind find_next_bit() and find_next_zero_bit().
 *
 * Every word is XOR-ed with `invert` before it is inspected, so an all-ones
 * `invert` turns the search for a set bit into a search for a clear bit.
 *
 * @param addr    Bitmap words.
 * @param nbits   Number of valid bits in the bitmap.
 * @param start   First bit index to consider.
 * @param invert  XOR mask applied to each word before testing.
 * @return Index of the first matching bit in [start, nbits), or `nbits` when
 *         there is none.
 */
static uint64_t _find_next_bit(const uint64_t *addr, uint64_t nbits,
                               uint64_t start, uint64_t invert) {
  constexpr uint64_t kWordBits = 64;

  if (nbits == 0 || start >= nbits) {
    return nbits;
  }

  uint64_t word = addr[start / kWordBits] ^ invert;

  /* Handle 1st word: drop the bits below `start`. The mask is an explicit
     64-bit value so it is full-width even where unsigned long is 32 bits. */
  word &= ~static_cast<uint64_t>(0) << (start % kWordBits);
  start -= start % kWordBits;

  // Step one whole word at a time until a candidate bit shows up.
  while (word == 0) {
    start += kWordBits;
    if (start >= nbits) {
      return nbits;
    }
    word = addr[start / kWordBits] ^ invert;
  }

  // A hit in the padding past `nbits` of the last word does not count.
  const uint64_t found = start + ffs(word);
  return found < nbits ? found : nbits;
}
/**
 * Find the first clear bit at or after `offset`.
 *
 * @param addr    Bitmap words.
 * @param size    Number of valid bits in the bitmap.
 * @param offset  First bit index to consider.
 * @return Index of the first zero bit in [offset, size), or `size` if every
 *         bit in that range is set.
 */
uint64_t find_next_zero_bit(const uint64_t *addr, uint64_t size,
                            uint64_t offset) {
  // The invert mask must cover all 64 bits of a word; `~0UL` only covers 32
  // of them on targets where unsigned long is 32 bits wide.
  return _find_next_bit(addr, size, offset, ~static_cast<uint64_t>(0));
}
/**
 * Find the first set bit at or after `offset`.
 *
 * @param addr    Bitmap words.
 * @param size    Number of valid bits in the bitmap.
 * @param offset  First bit index to consider.
 * @return Index of the first set bit in [offset, size), or `size` if every
 *         bit in that range is clear.
 */
uint64_t find_next_bit(const uint64_t *addr, uint64_t size, uint64_t offset) {
  // A zero XOR mask leaves the words untouched: search for set bits as-is.
  const uint64_t no_inversion = 0UL;
  return _find_next_bit(addr, size, offset, no_inversion);
}
/**
 * Locate a run of `nr` clear bits whose start index satisfies an alignment.
 *
 * Each candidate start is the next zero bit at or after `start`, rounded up
 * so that (index + align_offset) is a multiple of (align_mask + 1).
 *
 * @param map           Bitmap words.
 * @param size          Number of valid bits in the bitmap.
 * @param start         First bit index to consider.
 * @param nr            Length of the run being looked for.
 * @param align_mask    Alignment mask applied to (index + align_offset).
 * @param align_offset  Offset added before aligning and subtracted after.
 * @return Start index of the run on success. When no run fits, the return
 *         value is the candidate's end index, which is greater than `size`.
 */
uint64_t bitmap_find_next_zero_area_off(uint64_t *map, uint64_t size,
                                        uint64_t start, unsigned int nr,
                                        uint64_t align_mask,
                                        uint64_t align_offset) {
  for (;;) {
    uint64_t candidate = find_next_zero_bit(map, size, start);

    /* Align allocation */
    candidate =
        __ALIGN_MASK(candidate + align_offset, align_mask) - align_offset;

    const uint64_t limit = candidate + nr;
    if (limit > size) {
      return limit;
    }

    // Any set bit inside [candidate, limit) spoils this run.
    const uint64_t first_set = find_next_bit(map, limit, candidate);
    if (first_set >= limit) {
      return candidate;
    }

    // Resume the search just past the bit that got in the way.
    start = first_set + 1;
  }
}
/**
 * Locate a run of `nr` clear bits, aligned by `align_mask`.
 *
 * Forwards to bitmap_find_next_zero_area_off() with an alignment offset of
 * zero and returns its result unchanged.
 */
uint64_t bitmap_find_next_zero_area(uint64_t *map, uint64_t size,
                                    uint64_t start, unsigned int nr,
                                    uint64_t align_mask) {
  const uint64_t no_align_offset = 0;
  return bitmap_find_next_zero_area_off(map, size, start, nr, align_mask,
                                        no_align_offset);
}
}
// namespace fpga_bitmap
src/fpga/common/bitmap.h
已删除
100644 → 0
浏览文件 @
74580de7
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <stdint.h>
#include <stdio.h>
// Number of bits in one bitmap word. Fixed at 64 here; it is not derived
// from the platform's `long` width.
#define BITS_PER_LONG 64
// Index of the word that holds bit `nr`.
#define BIT_WORD(nr) ((nr) / BITS_PER_LONG)
// Mask selecting bit (start % 64) and every higher bit of a word.
// `~0ULL` keeps the mask 64 bits wide even where `unsigned long` is 32 bits.
#define BITMAP_FIRST_WORD_MASK(start) \
  (~0ULL << ((start) & (BITS_PER_LONG - 1)))
// Mask selecting the low (nbits % 64) bits of a word; all 64 bits when
// nbits is a multiple of 64.
#define BITMAP_LAST_WORD_MASK(nbits) \
  (~0ULL >> (-(nbits) & (BITS_PER_LONG - 1)))
// Round x up to the next multiple of (mask + 1).
#define __ALIGN_KERNEL_MASK(x, mask) (((x) + (mask)) & ~(mask))
#define __ALIGN_MASK(x, mask) __ALIGN_KERNEL_MASK((x), (mask))
// Round x down to a multiple of y; y must be a power of two.
#define round_down(x, y) ((x) & ~((y)-1))
namespace fpga_bitmap {

// Sets `len` bits of `map`, beginning at bit index `start`. Bits are packed
// 64 per uint64_t word.
void bitmap_set(uint64_t *map, unsigned int start, int len);

// Clears `len` bits of `map`, beginning at bit index `start`.
void bitmap_clear(uint64_t *map, unsigned int start, int len);

// Searches the first `size` bits of `map`, from bit `start` on, for `nr`
// consecutive clear bits whose first index is aligned by `align_mask`.
// Returns that first index; when no such run fits, the returned value is
// greater than `size`.
uint64_t bitmap_find_next_zero_area(uint64_t *map, uint64_t size,
                                    uint64_t start, unsigned int nr,
                                    uint64_t align_mask);

}  // namespace fpga_bitmap
src/fpga/common/driver.cpp
浏览文件 @
2f507f76
...
@@ -28,7 +28,6 @@ limitations under the License. */
...
@@ -28,7 +28,6 @@ limitations under the License. */
#include <iostream>
#include <iostream>
#include "common/enforce.h"
#include "common/enforce.h"
#include "fpga/common/bitmap.h"
#include "fpga/common/driver.h"
#include "fpga/common/driver.h"
namespace
paddle_mobile
{
namespace
paddle_mobile
{
...
@@ -148,33 +147,7 @@ int fpga_regpoll(uint64_t reg, uint64_t val, int time) {
...
@@ -148,33 +147,7 @@ int fpga_regpoll(uint64_t reg, uint64_t val, int time) {
}
}
}
}
/* Memory management */

/**
 * Reserve a contiguous run of FPGA pages large enough for `size` bytes.
 *
 * The request is rounded up to whole pages of FPGA_PAGE_SIZE. The run is
 * marked in `memory->bitmap`, and its length is stored in `memory->nr` at
 * the index of its first page.
 *
 * @param memory  Page allocator state; its mutex is held during the search.
 * @param size    Requested size in bytes.
 * @param addr    Receives `memory->mem_start` plus the byte offset of the
 *                first reserved page. Written only on success.
 * @return 0 on success, -EINVAL for a zero-byte request, -ENOMEM when the
 *         request cannot be satisfied.
 */
int memory_request(struct fpga_memory *memory, size_t size, uint64_t *addr) {
  const uint64_t pages_needed = DIV_ROUND_UP(size, FPGA_PAGE_SIZE);

  // A zero-page request would reserve nothing, and with a full bitmap the
  // search can return index == page_num, which is one past the end of `nr`.
  if (pages_needed == 0) {
    DLOG << "memory request failed!";
    return -EINVAL;
  }
  // Check before narrowing: truncating an oversized count to unsigned int
  // would make the request look smaller than it is.
  if (pages_needed > memory->page_num) {
    DLOG << "memory request failed!";
    return -ENOMEM;
  }
  const unsigned int nr = static_cast<unsigned int>(pages_needed);

  int ret = 0;
  pthread_mutex_lock(&memory->mutex);

  const uint64_t pos = fpga_bitmap::bitmap_find_next_zero_area(
      memory->bitmap, memory->page_num, 0, nr, 0);

  // On failure the search returns an end index past page_num.
  if (pos + nr <= memory->page_num) {
    fpga_bitmap::bitmap_set(memory->bitmap, static_cast<unsigned int>(pos),
                            nr);
    memory->nr[pos] = nr;  // run length, looked up again when freeing
    *addr = memory->mem_start + pos * FPGA_PAGE_SIZE;
  } else {
    DLOG << "memory request failed!";
    ret = -ENOMEM;
  }

  pthread_mutex_unlock(&memory->mutex);
  return ret;
}
void
memory_release
(
struct
fpga_memory
*
memory
)
{
void
memory_release
(
struct
fpga_memory
*
memory
)
{
void
*
ptr
=
nullptr
;
void
*
ptr
=
nullptr
;
...
@@ -187,96 +160,7 @@ void memory_release(struct fpga_memory *memory) {
...
@@ -187,96 +160,7 @@ void memory_release(struct fpga_memory *memory) {
}
}
}
}
/**
 * Allocate the bookkeeping arrays of an FPGA page allocator.
 *
 * Sizes the allocator for `memory_size` bytes split into pages of
 * FPGA_PAGE_SIZE, then allocates a bitmap with one bit per page and an
 * array `nr` with one entry per page. Both arrays start zeroed.
 *
 * @param memory       Allocator state to populate.
 * @param memory_size  Size in bytes of the region being managed.
 * @return 0 on success, -EFAULT if an allocation fails. On failure
 *         `memory->bitmap` and `memory->nr` are null.
 */
int create_fpga_memory_inner(struct fpga_memory *memory, size_t memory_size) {
  // Only one memory pool is ever created, so the struct is populated without
  // taking memory->mutex.
  memory->bitmap = nullptr;
  memory->nr = nullptr;

  memory->page_num = (unsigned int)(memory_size / FPGA_PAGE_SIZE);  // NOLINT
  memory->page_num_long = DIV_ROUND_UP(memory->page_num, BITS_PER_LONG);

  // calloc so that no bit of the bitmap, padding included, starts undefined.
  uint64_t *bitmap = static_cast<uint64_t *>(
      calloc(memory->page_num_long, sizeof(uint64_t)));
  if (!bitmap) {
    return -EFAULT;
  }

  unsigned int *nr = static_cast<unsigned int *>(
      calloc(memory->page_num, sizeof(unsigned int)));
  if (!nr) {
    free(bitmap);
    return -EFAULT;
  }

  // Publish the arrays only once both exist, so a failed call never leaves
  // a freed pointer behind in the struct.
  memory->bitmap = bitmap;
  memory->nr = nr;
  memory->mem_start = FPGA_MEM_PHY_ADDR;
  // NOTE(review): mem_end receives the region size, not mem_start + size,
  // and it is FPGA_MEM_SIZE rather than memory_size -- confirm intent.
  memory->mem_end = FPGA_MEM_SIZE;

  return 0;
}
/**
 * Allocate and set up an FPGA page allocator sized for FPGA_MEM_SIZE.
 *
 * @param memory_info  Receives the new allocator on success; set to null on
 *                     failure so callers never hold a freed pointer.
 * @return 0 on success, -EFAULT if any allocation fails.
 */
int create_fpga_memory(struct fpga_memory **memory_info) {
  *memory_info = (struct fpga_memory *)malloc(  // NOLINT
      sizeof(struct fpga_memory));
  if (*memory_info == NULL) {
    return -EFAULT;
  }

  pthread_mutex_init(&((*memory_info)->mutex), nullptr);

  const int rc = create_fpga_memory_inner(*memory_info, FPGA_MEM_SIZE);
  if (rc) {
    // Undo everything done above: release the mutex, free the struct, and
    // clear the caller's pointer.
    pthread_mutex_destroy(&((*memory_info)->mutex));
    free(*memory_info);
    *memory_info = nullptr;
  }
  return rc;
}
/**
 * Put the page bitmap into its initial state.
 *
 * Clears the bits of all `page_num` pages, then marks page 0 as used.
 *
 * @param memory  Allocator state whose bitmap is reset.
 * @return 0 on success, -EFAULT when `memory` is null.
 */
int init_fpga_memory(struct fpga_memory *memory) {
  if (memory == nullptr) {
    return -EFAULT;
  }

  fpga_bitmap::bitmap_clear(memory->bitmap, 0, memory->page_num);
  // NOTE reserve fpga page 0.
  fpga_bitmap::bitmap_set(memory->bitmap, 0, 1);

  return 0;
}
/**
 * Release an allocator produced by create_fpga_memory().
 *
 * Frees the `nr` array, the bitmap and the struct itself, and destroys the
 * mutex that create_fpga_memory() initialised. A null `memory` is ignored.
 * The caller's own pointer is not modified and must not be used afterwards.
 */
void destroy_fpga_memory(struct fpga_memory *memory) {
  if (memory == nullptr) {
    return;
  }

  free(memory->nr);
  free(memory->bitmap);
  // Pair the pthread_mutex_init() done at creation before the storage that
  // holds the mutex goes away.
  pthread_mutex_destroy(&memory->mutex);
  free(memory);
}
/**
 * Create and initialise the global FPGA page allocator held in
 * `g_fpgainfo.memory_info`.
 *
 * @return 0 on success, otherwise the error code of the step that failed.
 *         On an initialisation failure the allocator is destroyed and the
 *         global pointer is reset to null.
 */
int fpga_memory_add() {
  int rc = create_fpga_memory(&g_fpgainfo.memory_info);
  if (rc) {
    return rc;
  }

  rc = init_fpga_memory(g_fpgainfo.memory_info);
  if (rc) {
    destroy_fpga_memory(g_fpgainfo.memory_info);
    // The struct has just been freed; do not leave the global pointing at it.
    g_fpgainfo.memory_info = nullptr;
    return rc;
  }

  return 0;
}
uint64_t
vaddr_to_paddr_driver
(
void
*
address
)
{
uint64_t
vaddr_to_paddr_driver
(
void
*
address
)
{
uint64_t
paddr
=
0
;
uint64_t
paddr
=
0
;
...
@@ -314,17 +198,28 @@ void *fpga_reg_free(void *ptr) {
...
@@ -314,17 +198,28 @@ void *fpga_reg_free(void *ptr) {
}
}
}
}
static
inline
int
do_ioctl
(
int64_t
req
,
const
void
*
arg
)
{
return
ioctl
(
g_fpgainfo
.
fd_mem
,
req
,
arg
);
}
void
*
fpga_malloc_driver
(
size_t
size
)
{
void
*
fpga_malloc_driver
(
size_t
size
)
{
void
*
ret
=
nullptr
;
void
*
ret
=
nullptr
;
uint64_t
phy_addr
=
0
;
uint64_t
phy_addr
=
0
;
int
i
=
0
;
int
i
=
0
;
struct
MemoryVM2PHYArgs
args
;
struct
MemoryCacheArgs
args_c
;
memory_request
(
g_fpgainfo
.
memory_info
,
size
,
&
phy_addr
);
//
memory_request(g_fpgainfo.memory_info, size, &phy_addr);
ret
=
mmap64
(
nullptr
,
size
,
PROT_READ
|
PROT_WRITE
,
MAP_SHARED
,
ret
=
mmap64
(
nullptr
,
size
,
PROT_READ
|
PROT_WRITE
,
MAP_SHARED
,
g_fpgainfo
.
fd_mem
,
phy_addr
);
g_fpgainfo
.
fd_mem
,
FPGA_MEM_PHY_ADDR
);
PADDLE_MOBILE_ENFORCE
(
ret
!=
(
void
*
)
-
1
,
"Should not be -1"
);
PADDLE_MOBILE_ENFORCE
(
ret
!=
(
void
*
)
-
1
,
"Should not be -1"
);
args
.
pVM
=
(
void
*
)
ret
;
args
.
pPHY
=
(
void
*
)
0
;
do_ioctl
(
IOCTL_MEMORY_VM2PHY
,
&
args
);
phy_addr
=
(
uint64_t
)
args
.
pPHY
;
g_fpgainfo
.
fpga_vaddr2paddr_map
.
insert
(
std
::
make_pair
(
ret
,
phy_addr
));
g_fpgainfo
.
fpga_vaddr2paddr_map
.
insert
(
std
::
make_pair
(
ret
,
phy_addr
));
g_fpgainfo
.
fpga_addr2size_map
.
insert
(
std
::
make_pair
(
ret
,
size
));
g_fpgainfo
.
fpga_addr2size_map
.
insert
(
std
::
make_pair
(
ret
,
size
));
...
@@ -345,11 +240,6 @@ void fpga_free_driver(void *ptr) {
...
@@ -345,11 +240,6 @@ void fpga_free_driver(void *ptr) {
p_addr
=
vaddr_to_paddr_driver
(
ptr
);
p_addr
=
vaddr_to_paddr_driver
(
ptr
);
pos
=
(
p_addr
-
g_fpgainfo
.
memory_info
->
mem_start
)
/
FPGA_PAGE_SIZE
;
pos
=
(
p_addr
-
g_fpgainfo
.
memory_info
->
mem_start
)
/
FPGA_PAGE_SIZE
;
/*clear bitmap*/
pthread_mutex_lock
(
&
g_fpgainfo
.
memory_info
->
mutex
);
fpga_bitmap
::
bitmap_clear
(
g_fpgainfo
.
memory_info
->
bitmap
,
pos
,
g_fpgainfo
.
memory_info
->
nr
[
pos
]);
pthread_mutex_unlock
(
&
g_fpgainfo
.
memory_info
->
mutex
);
auto
iter
=
g_fpgainfo
.
fpga_vaddr2paddr_map
.
find
(
ptr
);
auto
iter
=
g_fpgainfo
.
fpga_vaddr2paddr_map
.
find
(
ptr
);
if
(
iter
!=
g_fpgainfo
.
fpga_vaddr2paddr_map
.
end
())
{
if
(
iter
!=
g_fpgainfo
.
fpga_vaddr2paddr_map
.
end
())
{
...
@@ -360,10 +250,6 @@ void fpga_free_driver(void *ptr) {
...
@@ -360,10 +250,6 @@ void fpga_free_driver(void *ptr) {
}
}
}
}
static
inline
int
do_ioctl
(
int64_t
req
,
const
void
*
arg
)
{
return
ioctl
(
g_fpgainfo
.
fd_mem
,
req
,
arg
);
}
int
fpga_flush_driver
(
void
*
address
,
size_t
size
)
{
int
fpga_flush_driver
(
void
*
address
,
size_t
size
)
{
struct
MemoryCacheArgs
args
;
struct
MemoryCacheArgs
args
;
uint64_t
p_addr
;
uint64_t
p_addr
;
...
@@ -413,7 +299,7 @@ int open_device_driver() {
...
@@ -413,7 +299,7 @@ int open_device_driver() {
g_fpgainfo
.
FpgaRegVirAddr
=
g_fpgainfo
.
FpgaRegVirAddr
=
(
uint64_t
*
)
fpga_reg_malloc
(
FPGA_REG_SIZE
);
// NOLINT
(
uint64_t
*
)
fpga_reg_malloc
(
FPGA_REG_SIZE
);
// NOLINT
fpga_memory_add
();
//
fpga_memory_add();
pl_init
();
pl_init
();
...
@@ -424,8 +310,7 @@ int close_device_driver() {
...
@@ -424,8 +310,7 @@ int close_device_driver() {
pl_destroy
();
pl_destroy
();
fpga_reg_free
(
g_fpgainfo
.
FpgaRegVirAddr
);
fpga_reg_free
(
g_fpgainfo
.
FpgaRegVirAddr
);
memory_release
(
g_fpgainfo
.
memory_info
);
memory_release
(
g_fpgainfo
.
memory_info
);
destroy_fpga_memory
(
g_fpgainfo
.
memory_info
);
return
0
;
return
0
;
}
}
...
...
src/fpga/common/driver.h
浏览文件 @
2f507f76
...
@@ -31,8 +31,8 @@ namespace driver {
...
@@ -31,8 +31,8 @@ namespace driver {
#define FPGA_REG_PHY_ADDR 0x80000000
#define FPGA_REG_PHY_ADDR 0x80000000
#define FPGA_REG_SIZE 0x1000
#define FPGA_REG_SIZE 0x1000
#define FPGA_MEM_PHY_ADDR 0x
4
0000000
#define FPGA_MEM_PHY_ADDR 0x
2
0000000
#define FPGA_MEM_SIZE 0x
8
0000000
#define FPGA_MEM_SIZE 0x
2
0000000
#define FPGA_PAGE_SIZE (16UL * 1024UL)
#define FPGA_PAGE_SIZE (16UL * 1024UL)
...
@@ -52,9 +52,16 @@ struct MemoryCacheArgs {
...
@@ -52,9 +52,16 @@ struct MemoryCacheArgs {
size_t
size
;
size_t
size
;
};
};
struct
MemoryVM2PHYArgs
{
void
*
pVM
;
void
*
pPHY
;
};
#define IOCTL_FPGA_MAGIC 'F'
#define IOCTL_FPGA_MAGIC 'F'
#define IOCTL_MEMCACHE_INVAL _IOW(IOCTL_FPGA_MAGIC, 12, struct MemoryCacheArgs)
#define IOCTL_MEMCACHE_INVAL _IOW(IOCTL_FPGA_MAGIC, 12, struct MemoryCacheArgs)
#define IOCTL_MEMCACHE_FLUSH _IOW(IOCTL_FPGA_MAGIC, 13, struct MemoryCacheArgs)
#define IOCTL_MEMCACHE_FLUSH _IOW(IOCTL_FPGA_MAGIC, 13, struct MemoryCacheArgs)
#define IOCTL_MEMORY_VM2PHY _IOWR(IOCTL_FPGA_MAGIC, 15, struct MemoryVM2PHYArgs)
struct
fpga_pe
{
struct
fpga_pe
{
char
type_name
[
MAX_TYPE_NAME_LENTH
+
1
];
char
type_name
[
MAX_TYPE_NAME_LENTH
+
1
];
...
...
src/fpga/common/fpga_common.h
浏览文件 @
2f507f76
...
@@ -25,6 +25,7 @@ limitations under the License. */
...
@@ -25,6 +25,7 @@ limitations under the License. */
#define FILTER_ELEMENT_ALIGNMENT (16) // Filter element number aligned to 16
#define FILTER_ELEMENT_ALIGNMENT (16) // Filter element number aligned to 16
#define BS_NUM_ALIGNMENT (8)
#define BS_NUM_ALIGNMENT (8)
#define BIAS_NUM_ALIGNMENT (16)
#define BIAS_NUM_ALIGNMENT (16)
#define ROW_PARALLEL_NUM (3)
#endif
#endif
namespace
paddle_mobile
{
namespace
paddle_mobile
{
...
...
src/operators/detection_ops.cpp
浏览文件 @
2f507f76
...
@@ -65,6 +65,24 @@ void PSRoiPoolOp<DeviceType, T>::InferShape() const {
...
@@ -65,6 +65,24 @@ void PSRoiPoolOp<DeviceType, T>::InferShape() const {
}
}
#endif
#endif
#ifdef ROIALIGN_POOL_OP
// Derives the output shape from the shape of input X: dimension 0 becomes
// the first dimension of the ROIs tensor, dimensions 2 and 3 become the
// pooled height and width, and dimension 1 is carried over from X unchanged.
template <typename DeviceType, typename T>
void RoiAlignPoolOp<DeviceType, T>::InferShape() const {
  const auto &roi_dims = this->param_.input_rois_->dims();
  const int out_h = this->param_.pooled_height_;
  const int out_w = this->param_.pooled_width_;

  auto out_dims = this->param_.input_x_->dims();
  out_dims[0] = roi_dims[0];
  // out_dims[1] is deliberately left as it is in X. The original carried a
  // commented-out override here:
  //   out_dims[1] = output_channels;
  //   // input_dims[1] / (pooled_height * pooled_width);
  out_dims[2] = out_h;
  out_dims[3] = out_w;

  this->param_.output_->Resize(out_dims);
}
#endif
#ifdef ROI_PERSPECTIVE_OP
#ifdef ROI_PERSPECTIVE_OP
template
<
typename
DeviceType
,
typename
T
>
template
<
typename
DeviceType
,
typename
T
>
void
RoiPerspectiveOp
<
DeviceType
,
T
>::
InferShape
()
const
{
void
RoiPerspectiveOp
<
DeviceType
,
T
>::
InferShape
()
const
{
...
@@ -110,4 +128,8 @@ REGISTER_OPERATOR_FPGA(generate_proposals, ops::ProposalOp);
...
@@ -110,4 +128,8 @@ REGISTER_OPERATOR_FPGA(generate_proposals, ops::ProposalOp);
#ifdef PSROI_POOL_OP
#ifdef PSROI_POOL_OP
REGISTER_OPERATOR_FPGA
(
psroi_pool
,
ops
::
PSRoiPoolOp
);
REGISTER_OPERATOR_FPGA
(
psroi_pool
,
ops
::
PSRoiPoolOp
);
#endif
#endif
#ifdef ROIALIGN_POOL_OP
REGISTER_OPERATOR_FPGA
(
roialign_pool
,
ops
::
RoiAlignPoolOp
);
#endif
#endif
#endif
src/operators/detection_ops.h
浏览文件 @
2f507f76
...
@@ -34,6 +34,11 @@ DECLARE_OPERATOR(Proposal, ProposalParam, ProposalKernel);
...
@@ -34,6 +34,11 @@ DECLARE_OPERATOR(Proposal, ProposalParam, ProposalKernel);
DECLARE_OPERATOR
(
PSRoiPool
,
PSRoiPoolParam
,
PSRoiPoolKernel
);
DECLARE_OPERATOR
(
PSRoiPool
,
PSRoiPoolParam
,
PSRoiPoolKernel
);
#endif
#endif
#ifdef ROIALIGN_POOL_OP
DECLARE_OPERATOR
(
RoiAlignPool
,
RoiAlignPoolParam
,
RoiAlignPoolKernel
);
#endif
#ifdef ROI_PERSPECTIVE_OP
#ifdef ROI_PERSPECTIVE_OP
DECLARE_OPERATOR
(
RoiPerspective
,
RoiPerspectiveParam
,
RoiPerspectiveKernel
);
DECLARE_OPERATOR
(
RoiPerspective
,
RoiPerspectiveParam
,
RoiPerspectiveKernel
);
#endif
#endif
...
...
src/operators/kernel/detection_kernel.h
浏览文件 @
2f507f76
...
@@ -98,6 +98,8 @@ class ProposalParam : public OpParam {
...
@@ -98,6 +98,8 @@ class ProposalParam : public OpParam {
framework
::
Tensor
*
anchors_
;
framework
::
Tensor
*
anchors_
;
framework
::
Tensor
*
variances_
;
framework
::
Tensor
*
variances_
;
std
::
shared_ptr
<
Tensor
>
score_index_
;
framework
::
LoDTensor
*
rpn_rois_
;
framework
::
LoDTensor
*
rpn_rois_
;
framework
::
LoDTensor
*
rpn_probs_
;
framework
::
LoDTensor
*
rpn_probs_
;
...
@@ -151,6 +153,42 @@ class PSRoiPoolParam : public OpParam {
...
@@ -151,6 +153,42 @@ class PSRoiPoolParam : public OpParam {
DECLARE_KERNEL
(
PSRoiPool
,
PSRoiPoolParam
);
DECLARE_KERNEL
(
PSRoiPool
,
PSRoiPoolParam
);
#endif
#endif
#ifdef ROIALIGN_POOL_OP
// Parameter bundle for the roialign_pool operator: looks up the "X" and
// "ROIs" inputs and the "Out" output in the scope and reads the pooling
// attributes once, at construction.
template <typename Dtype>
class RoiAlignPoolParam : public OpParam {
 public:
  RoiAlignPoolParam(const VariableNameMap &inputs,
                    const VariableNameMap &outputs, const AttributeMap &attrs,
                    const Scope *scope)
      : OpParam(inputs, outputs, attrs, scope) {
    // Tensors, resolved by variable name.
    input_x_ =
        OpParam::GetVarValue<framework::LoDTensor>("X", inputs, *scope);
    input_rois_ =
        OpParam::GetVarValue<framework::LoDTensor>("ROIs", inputs, *scope);
    output_ =
        OpParam::GetVarValue<framework::LoDTensor>("Out", outputs, *scope);

    // Scalar attributes.
    pooled_height_ = OpParam::GetAttr<int>("pooled_height", attrs);
    pooled_width_ = OpParam::GetAttr<int>("pooled_width", attrs);
    spatial_scale_ = OpParam::GetAttr<float>("spatial_scale", attrs);
    // NOTE(review): read as float but stored in an int member, so the value
    // is converted on assignment -- confirm the attribute's stored type.
    sampling_ratio_ = OpParam::GetAttr<float>("sampling_ratio", attrs);
  }

  framework::Tensor *input_x_;
  framework::LoDTensor *input_rois_;
  framework::Tensor *output_;
  int pooled_height_;
  int pooled_width_;
  float spatial_scale_;
  int sampling_ratio_;
#ifdef PADDLE_MOBILE_FPGA
  // FPGA-only state; the constructor above leaves all four untouched.
  std::shared_ptr<Tensor> float_input;
  std::shared_ptr<Tensor> float_output;
  fpga::BypassArgs input_arg;
  fpga::BypassArgs output_arg;
#endif
};
DECLARE_KERNEL
(
RoiAlignPool
,
RoiAlignPoolParam
);
#endif
#ifdef ROI_PERSPECTIVE_OP
#ifdef ROI_PERSPECTIVE_OP
template
<
typename
Dtype
>
template
<
typename
Dtype
>
class
RoiPerspectiveParam
:
public
OpParam
{
class
RoiPerspectiveParam
:
public
OpParam
{
...
...
src/operators/kernel/fpga/V1/fetch_kernel.cpp
浏览文件 @
2f507f76
...
@@ -62,7 +62,10 @@ void FetchKernel<FPGA, float>::Compute(const FetchParam<FPGA> ¶m) {
...
@@ -62,7 +62,10 @@ void FetchKernel<FPGA, float>::Compute(const FetchParam<FPGA> ¶m) {
output
->
ShareDataWith
(
*
input
);
output
->
ShareDataWith
(
*
input
);
return
;
return
;
}
}
fpga
::
PerformBypass
(
param
.
fpga_bypass_args
);
fpga
::
BypassArgs
args
=
param
.
fpga_bypass_args
;
auto
input_address
=
(
input
->
data
<
half
>
());
args
.
image
.
address
=
static_cast
<
void
*>
(
input_address
);
fpga
::
PerformBypass
(
args
);
auto
outC
=
param
.
Out
()
->
dims
()[
1
];
auto
outC
=
param
.
Out
()
->
dims
()[
1
];
auto
outH
=
param
.
Out
()
->
dims
()[
2
];
auto
outH
=
param
.
Out
()
->
dims
()[
2
];
auto
outW
=
param
.
Out
()
->
dims
()[
3
];
auto
outW
=
param
.
Out
()
->
dims
()[
3
];
...
@@ -70,10 +73,15 @@ void FetchKernel<FPGA, float>::Compute(const FetchParam<FPGA> ¶m) {
...
@@ -70,10 +73,15 @@ void FetchKernel<FPGA, float>::Compute(const FetchParam<FPGA> ¶m) {
reinterpret_cast
<
float
*>
(
param
.
fpga_bypass_args
.
output
.
address
);
reinterpret_cast
<
float
*>
(
param
.
fpga_bypass_args
.
output
.
address
);
fpga
::
fpga_invalidate
(
param
.
fpga_bypass_args
.
output
.
address
,
fpga
::
fpga_invalidate
(
param
.
fpga_bypass_args
.
output
.
address
,
param
.
Out
()
->
fpga_data_num
*
sizeof
(
float
));
param
.
Out
()
->
fpga_data_num
*
sizeof
(
float
));
float
*
data_tmp
=
reinterpret_cast
<
float
*>
(
malloc
(
outC
*
outH
*
outW
*
sizeof
(
float
)));
if
(
param
.
Out
()
->
fpga_data_num
!=
product
(
input
->
dims
())){
dealign
(
outdata_ptr
,
data_tmp
,
outC
,
outH
,
outW
);
float
*
data_tmp
=
memcpy
(
outdata_ptr
,
data_tmp
,
outC
*
outH
*
outW
*
sizeof
(
float
));
reinterpret_cast
<
float
*>
(
malloc
(
outC
*
outH
*
outW
*
sizeof
(
float
)));
dealign
(
outdata_ptr
,
data_tmp
,
outC
,
outH
,
outW
);
memcpy
(
outdata_ptr
,
data_tmp
,
outC
*
outH
*
outW
*
sizeof
(
float
));
free
(
data_tmp
);
}
}
}
template
class
FetchKernel
<
FPGA
,
float
>;
template
class
FetchKernel
<
FPGA
,
float
>;
...
...
src/operators/kernel/fpga/V1/pool_kernel.cpp
浏览文件 @
2f507f76
...
@@ -73,9 +73,11 @@ void PoolKernel<FPGA, float>::Compute(const PoolParam<FPGA> ¶m) {
...
@@ -73,9 +73,11 @@ void PoolKernel<FPGA, float>::Compute(const PoolParam<FPGA> ¶m) {
if
(
input
->
type
()
==
typeid
(
float
))
{
if
(
input
->
type
()
==
typeid
(
float
))
{
auto
*
output
=
param
.
Output
();
auto
*
output
=
param
.
Output
();
auto
in
=
input
->
data
<
float
>
();
auto
in
=
input
->
data
<
float
>
();
auto
N
=
input
->
dims
()[
0
];
output
->
Resize
({
N
,
output
->
dims
()[
1
],
output
->
dims
()[
2
],
output
->
dims
()[
3
]});
auto
len
=
output
->
numel
();
auto
len
=
output
->
numel
();
auto
out
=
output
->
mutable_data
<
float
>
();
auto
out
=
output
->
mutable_data
<
float
>
();
int
N
=
input
->
dims
()[
0
],
C
=
input
->
dims
()[
1
],
H
=
input
->
dims
()[
2
],
int
C
=
input
->
dims
()[
1
],
H
=
input
->
dims
()[
2
],
//N = input->dims()[0
],
W
=
input
->
dims
()[
3
];
W
=
input
->
dims
()[
3
];
int
HW
=
H
*
W
,
CHW
=
C
*
H
*
W
,
WC
=
W
*
C
;
int
HW
=
H
*
W
,
CHW
=
C
*
H
*
W
,
WC
=
W
*
C
;
...
...
src/operators/kernel/fpga/V1/proposal_kernel.cpp
浏览文件 @
2f507f76
...
@@ -65,6 +65,14 @@ bool ProposalKernel<FPGA, float>::Init(ProposalParam<FPGA> *param) {
...
@@ -65,6 +65,14 @@ bool ProposalKernel<FPGA, float>::Init(ProposalParam<FPGA> *param) {
args
.
output
.
scale_address
=
param
->
float_score
->
scale
;
args
.
output
.
scale_address
=
param
->
float_score
->
scale
;
param
->
score_arg
=
args
;
param
->
score_arg
=
args
;
param
->
score_index_
=
std
::
make_shared
<
Tensor
>
();
param
->
score_index_
->
mutable_data
<
int32_t
>
({
input
->
numel
()});
auto
score_index
=
param
->
score_index_
->
data
<
int32_t
>
();
for
(
int
i
=
0
;
i
<
input
->
numel
();
++
i
){
score_index
[
i
]
=
i
;
}
return
true
;
return
true
;
}
}
template
<
typename
T
>
template
<
typename
T
>
...
@@ -334,6 +342,7 @@ std::pair<Tensor, Tensor> ProposalForOneImage(
...
@@ -334,6 +342,7 @@ std::pair<Tensor, Tensor> ProposalForOneImage(
const
Tensor
&
im_info_slice
,
const
Tensor
&
anchors
,
const
Tensor
&
variances
,
const
Tensor
&
im_info_slice
,
const
Tensor
&
anchors
,
const
Tensor
&
variances
,
const
Tensor
&
bbox_deltas_slice
,
// [M, 4]
const
Tensor
&
bbox_deltas_slice
,
// [M, 4]
const
Tensor
&
scores_slice
,
// [N, 1]
const
Tensor
&
scores_slice
,
// [N, 1]
const
Tensor
&
score_index
,
int
pre_nms_top_n
,
int
post_nms_top_n
,
float
nms_thresh
,
float
min_size
,
int
pre_nms_top_n
,
int
post_nms_top_n
,
float
nms_thresh
,
float
min_size
,
float
eta
)
{
float
eta
)
{
auto
*
scores_data
=
scores_slice
.
data
<
T
>
();
auto
*
scores_data
=
scores_slice
.
data
<
T
>
();
...
@@ -342,9 +351,11 @@ std::pair<Tensor, Tensor> ProposalForOneImage(
...
@@ -342,9 +351,11 @@ std::pair<Tensor, Tensor> ProposalForOneImage(
Tensor
index_t
;
Tensor
index_t
;
index_t
.
Resize
({
scores_slice
.
numel
()});
index_t
.
Resize
({
scores_slice
.
numel
()});
int
*
index
=
index_t
.
mutable_data
<
int
>
();
int
*
index
=
index_t
.
mutable_data
<
int
>
();
for
(
int
i
=
0
;
i
<
scores_slice
.
numel
();
++
i
)
{
/*
for (int i = 0; i < scores_slice.numel(); ++i) {
index[i] = i;
index[i] = i;
}
}*/
std
::
memcpy
(
index
,
score_index
.
data
<
int32_t
>
(),
scores_slice
.
numel
()
*
sizeof
(
int
)
);
auto
compare
=
[
scores_data
](
const
int64_t
&
i
,
const
int64_t
&
j
)
{
auto
compare
=
[
scores_data
](
const
int64_t
&
i
,
const
int64_t
&
j
)
{
return
scores_data
[
i
]
>
scores_data
[
j
];
return
scores_data
[
i
]
>
scores_data
[
j
];
};
};
...
@@ -490,8 +501,10 @@ void ProposalKernel<FPGA, float>::Compute(const ProposalParam<FPGA> ¶m) {
...
@@ -490,8 +501,10 @@ void ProposalKernel<FPGA, float>::Compute(const ProposalParam<FPGA> ¶m) {
auto
*
rpn_rois
=
param
.
rpn_rois_
;
auto
*
rpn_rois
=
param
.
rpn_rois_
;
auto
*
rpn_roi_probs
=
param
.
rpn_probs_
;
auto
*
rpn_roi_probs
=
param
.
rpn_probs_
;
auto
score_index
=
*
(
param
.
score_index_
.
get
());
int
pre_nms_top_n
=
param
.
pre_nms_topn_
;
int
pre_nms_top_n
=
param
.
pre_nms_topn_
;
int
post_nms_top_n
=
param
.
post_nms_topn_
;
int
post_nms_top_n
=
100
;
//
param.post_nms_topn_;
float
nms_thresh
=
param
.
nms_thresh_
;
float
nms_thresh
=
param
.
nms_thresh_
;
float
min_size
=
param
.
min_size_
;
float
min_size
=
param
.
min_size_
;
float
eta
=
param
.
eta_
;
float
eta
=
param
.
eta_
;
...
@@ -528,7 +541,7 @@ void ProposalKernel<FPGA, float>::Compute(const ProposalParam<FPGA> ¶m) {
...
@@ -528,7 +541,7 @@ void ProposalKernel<FPGA, float>::Compute(const ProposalParam<FPGA> ¶m) {
scores_slice
.
Resize
({
h_score
*
w_score
*
c_score
,
1
});
scores_slice
.
Resize
({
h_score
*
w_score
*
c_score
,
1
});
std
::
pair
<
Tensor
,
Tensor
>
tensor_pair
=
ProposalForOneImage
<
float
>
(
std
::
pair
<
Tensor
,
Tensor
>
tensor_pair
=
ProposalForOneImage
<
float
>
(
im_info_slice
,
anchors
,
variances
,
bbox_deltas_slice
,
scores_slice
,
im_info_slice
,
anchors
,
variances
,
bbox_deltas_slice
,
scores_slice
,
score_index
,
pre_nms_top_n
,
post_nms_top_n
,
nms_thresh
,
min_size
,
eta
);
pre_nms_top_n
,
post_nms_top_n
,
nms_thresh
,
min_size
,
eta
);
Tensor
&
proposals
=
tensor_pair
.
first
;
Tensor
&
proposals
=
tensor_pair
.
first
;
Tensor
&
scores
=
tensor_pair
.
second
;
Tensor
&
scores
=
tensor_pair
.
second
;
...
...
src/operators/kernel/fpga/V1/psroi_pool_kernel.cpp
浏览文件 @
2f507f76
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
See the License for the specific language governing permissions and
limitations under the License. */
limitations under the License. */
#ifdef PSROI_POOL_OP
#ifdef PSROI_POOL_OP
#include <cmath>
#include <cmath>
#include <vector>
#include <vector>
#include "operators/kernel/detection_kernel.h"
#include "operators/kernel/detection_kernel.h"
#include "fpga/V1/api.h"
#include "fpga/V1/api.h"
#include "fpga/V1/image.h"
#include "fpga/V1/image.h"
namespace
paddle_mobile
{
namespace
paddle_mobile
{
namespace
operators
{
namespace
operators
{
template
<
>
template
<
>
bool
PSRoiPoolKernel
<
FPGA
,
float
>::
Init
(
PSRoiPoolParam
<
FPGA
>*
param
)
{
bool
PSRoiPoolKernel
<
FPGA
,
float
>::
Init
(
PSRoiPoolParam
<
FPGA
>*
param
)
{
auto
dims
=
param
->
input_x_
->
dims
();
auto
dims
=
param
->
input_x_
->
dims
();
PADDLE_MOBILE_ENFORCE
(
dims
[
1
]
*
dims
[
3
]
%
IMAGE_ALIGNMENT
==
0
,
PADDLE_MOBILE_ENFORCE
(
dims
[
1
]
*
dims
[
3
]
%
IMAGE_ALIGNMENT
==
0
,
"data not aligned"
);
"data not aligned"
);
param
->
float_input
=
std
::
make_shared
<
Tensor
>
();
param
->
float_input
=
std
::
make_shared
<
Tensor
>
();
param
->
float_input
->
mutable_data
<
float
>
(
param
->
input_x_
->
dims
());
param
->
float_input
->
mutable_data
<
float
>
(
param
->
input_x_
->
dims
());
// param->float_output = std::make_shared<Tensor>();
// param->float_output = std::make_shared<Tensor>();
auto
input
=
param
->
input_x_
;
auto
input
=
param
->
input_x_
;
fpga
::
BypassArgs
args
=
{
fpga
::
DATA_TYPE_FP16
};
fpga
::
BypassArgs
args
=
{
fpga
::
DATA_TYPE_FP16
};
args
.
input_layout_type
=
fpga
::
LAYOUT_HWC
;
args
.
input_layout_type
=
fpga
::
LAYOUT_HWC
;
args
.
output_layout_type
=
fpga
::
LAYOUT_HWC
;
args
.
output_layout_type
=
fpga
::
LAYOUT_HWC
;
args
.
input_data_type
=
fpga
::
DATA_TYPE_FP16
;
args
.
input_data_type
=
fpga
::
DATA_TYPE_FP16
;
args
.
output_data_type
=
fpga
::
DATA_TYPE_FP32
;
args
.
output_data_type
=
fpga
::
DATA_TYPE_FP32
;
args
.
image
.
address
=
input
->
data
<
half
>
();
args
.
image
.
address
=
input
->
data
<
half
>
();
args
.
image
.
height
=
(
uint32_t
)
input
->
dims
()[
2
];
args
.
image
.
height
=
(
uint32_t
)
input
->
dims
()[
2
];
args
.
image
.
width
=
(
uint32_t
)
input
->
dims
()[
3
];
args
.
image
.
width
=
(
uint32_t
)
input
->
dims
()[
3
];
args
.
image
.
channels
=
(
uint32_t
)
input
->
dims
()[
1
];
args
.
image
.
channels
=
(
uint32_t
)
input
->
dims
()[
1
];
args
.
output
.
address
=
param
->
float_input
->
mutable_data
<
float
>
();
args
.
output
.
address
=
param
->
float_input
->
mutable_data
<
float
>
();
args
.
output
.
scale_address
=
param
->
float_input
->
scale
;
args
.
output
.
scale_address
=
param
->
float_input
->
scale
;
param
->
input_arg
=
args
;
param
->
input_arg
=
args
;
auto
*
rois
=
param
->
input_rois_
;
auto
*
rois
=
param
->
input_rois_
;
int
rois_num
=
rois
->
dims
()[
0
];
int
rois_num
=
rois
->
dims
()[
0
];
framework
::
DDim
dims_out_new
=
framework
::
make_ddim
(
framework
::
DDim
dims_out_new
=
framework
::
make_ddim
(
{
rois_num
,
param
->
output_
->
dims
()[
1
],
param
->
output_
->
dims
()[
2
],
{
rois_num
,
param
->
output_
->
dims
()[
1
],
param
->
output_
->
dims
()[
2
],
param
->
output_
->
dims
()[
3
]});
param
->
output_
->
dims
()[
3
]});
param
->
output_
->
Resize
(
dims_out_new
);
param
->
output_
->
Resize
(
dims_out_new
);
// fpga::format_fp16_ofm(param->output_);
// fpga::format_fp16_ofm(param->output_);
param
->
output_
->
mutable_data
<
float
>
(
dims_out_new
);
param
->
output_
->
mutable_data
<
float
>
(
dims_out_new
);
// auto output = param->float_output.get();
// auto output = param->float_output.get();
// param->output_ = output;
// param->output_ = output;
/* args.input_data_type = fpga::DATA_TYPE_FP32;
/* args.input_data_type = fpga::DATA_TYPE_FP32;
args.output_data_type = fpga::DATA_TYPE_FP16;
args.output_data_type = fpga::DATA_TYPE_FP16;
args.image.address = output->data<float>();
args.image.address = output->data<float>();
args.image.height = (uint32_t)output->dims()[2];
args.image.height = (uint32_t)output->dims()[2];
args.image.width = (uint32_t)output->dims()[3];
args.image.width = (uint32_t)output->dims()[3];
args.image.channels = (uint32_t)output->dims()[1] ;
args.image.channels = (uint32_t)output->dims()[1] ;
args.output.address = param->output_->mutable_data<half>();
args.output.address = param->output_->mutable_data<half>();
args.output.scale_address = param->output_->scale;
args.output.scale_address = param->output_->scale;
param->output_arg = args;*/
param->output_arg = args;*/
return
true
;
return
true
;
}
}
template
<
typename
Dtype
>
template
<
typename
Dtype
>
void
PSROIPooling
(
const
Dtype
*
bottom_data
,
const
Dtype
spatial_scale
,
void
PSROIPooling
(
const
int
channels
,
const
int
height
,
const
int
width
,
const
Dtype
*
bottom_data
,
const
int
channels
,
const
int
pooled_height
,
const
int
pooled_width
,
const
int
height
,
const
int
width
,
const
Dtype
*
bottom_rois
,
const
int
output_dim
,
const
int
pooled_height
,
const
int
pooled_width
,
const
int
group_size
,
Dtype
*
top_data
,
const
Dtype
*
bottom_rois
,
const
int
output_dim
,
// int* mapping_channel,
const
int
group_size
,
Dtype
*
top_data
,
int
index
,
int
*
rois_batch_id
)
{
int
index
,
int
nid
,
// The output is in order (n, ctop, ph, pw)
const
Dtype
Bin_size_h
,
// static int cnt = 0;
const
Dtype
Bin_size_w
,
int
pw
=
index
%
pooled_width
;
const
Dtype
roi_start_h
,
int
ph
=
(
index
/
pooled_width
)
%
pooled_height
;
const
Dtype
roi_start_w
,
int
ctop
=
(
index
/
pooled_width
/
pooled_height
)
%
output_dim
;
const
int
ctop
,
const
int
ph
,
const
int
roi_batch_ind
)
int
n
=
index
/
pooled_width
/
pooled_height
/
output_dim
;
{
int
pw
=
index
;
// [start, end) interval for spatial sampling
int
hstart
=
floor
(
static_cast
<
Dtype
>
(
ph
)
*
Bin_size_h
+
roi_start_h
);
bottom_rois
+=
n
*
4
;
int
wstart
=
floor
(
static_cast
<
Dtype
>
(
pw
)
*
Bin_size_w
+
roi_start_w
);
int
roi_batch_ind
=
rois_batch_id
[
n
];
// bottom_rois[0];
int
hend
=
ceil
(
static_cast
<
Dtype
>
(
ph
+
1
)
*
Bin_size_h
+
roi_start_h
);
Dtype
roi_start_w
=
static_cast
<
Dtype
>
(
round
(
bottom_rois
[
0
]))
*
spatial_scale
;
int
wend
=
ceil
(
static_cast
<
Dtype
>
(
pw
+
1
)
*
Bin_size_w
+
roi_start_w
);
Dtype
roi_start_h
=
static_cast
<
Dtype
>
(
round
(
bottom_rois
[
1
]))
*
spatial_scale
;
Dtype
roi_end_w
=
// Add roi offsets and clip to input boundaries
static_cast
<
Dtype
>
(
round
(
bottom_rois
[
2
])
+
1.
)
*
spatial_scale
;
hstart
=
std
::
min
(
std
::
max
(
hstart
,
0
),
height
);
Dtype
roi_end_h
=
hend
=
std
::
min
(
std
::
max
(
hend
,
0
),
height
);
static_cast
<
Dtype
>
(
round
(
bottom_rois
[
3
])
+
1.
)
*
spatial_scale
;
wstart
=
std
::
min
(
std
::
max
(
wstart
,
0
),
width
);
wend
=
std
::
min
(
std
::
max
(
wend
,
0
),
width
);
// Force too small ROIs to be 1x1
bool
is_empty
=
(
hend
<=
hstart
)
||
(
wend
<=
wstart
);
Dtype
roi_width
=
std
::
max
(
roi_end_w
-
roi_start_w
,
0.1
f
);
// avoid 0
Dtype
roi_height
=
std
::
max
(
roi_end_h
-
roi_start_h
,
0.1
f
);
int
c
=
(
ctop
*
group_size
+
ph
)
*
group_size
+
pw
;
// Compute w and h at bottom
Dtype
bin_area
=
(
hend
-
hstart
)
*
(
wend
-
wstart
);
Dtype
bin_size_h
=
roi_height
/
static_cast
<
Dtype
>
(
pooled_height
);
bottom_data
+=
(
roi_batch_ind
*
channels
+
c
)
*
height
*
width
;
Dtype
bin_size_w
=
roi_width
/
static_cast
<
Dtype
>
(
pooled_width
);
Dtype
out_sum
=
0
;
for
(
int
h
=
hstart
;
h
<
hend
;
++
h
)
{
int
hstart
=
floor
(
static_cast
<
Dtype
>
(
ph
)
*
bin_size_h
+
roi_start_h
);
for
(
int
w
=
wstart
;
w
<
wend
;
++
w
)
{
int
wstart
=
floor
(
static_cast
<
Dtype
>
(
pw
)
*
bin_size_w
+
roi_start_w
);
int
bottom_index
=
h
*
width
+
w
;
int
hend
=
ceil
(
static_cast
<
Dtype
>
(
ph
+
1
)
*
bin_size_h
+
roi_start_h
);
out_sum
+=
bottom_data
[
bottom_index
];
int
wend
=
ceil
(
static_cast
<
Dtype
>
(
pw
+
1
)
*
bin_size_w
+
roi_start_w
);
}
// Add roi offsets and clip to input boundaries
}
hstart
=
std
::
min
(
std
::
max
(
hstart
,
0
),
height
);
hend
=
std
::
min
(
std
::
max
(
hend
,
0
),
height
);
top_data
[
nid
+
index
]
=
is_empty
?
0.
:
out_sum
/
bin_area
;
wstart
=
std
::
min
(
std
::
max
(
wstart
,
0
),
width
);
wend
=
std
::
min
(
std
::
max
(
wend
,
0
),
width
);
}
bool
is_empty
=
(
hend
<=
hstart
)
||
(
wend
<=
wstart
);
void
convert_to_chw
(
float
**
data_in
,
int
channel
,
int
height
,
int
width
,
int
gw
=
pw
;
int
num
)
{
int
gh
=
ph
;
float
*
data_in_tmp
=
*
data_in
;
int
c
=
(
ctop
*
group_size
+
gh
)
*
group_size
+
gw
;
float
*
data_tmp
=
(
float
*
)
fpga
::
fpga_malloc
(
channel
*
height
*
width
*
sizeof
(
float
));
// NOLINT
bottom_data
+=
(
roi_batch_ind
*
channels
+
c
)
*
height
*
width
;
int64_t
amount_per_side
=
width
*
height
;
Dtype
out_sum
=
0
;
for
(
int
n
=
0
;
n
<
num
;
n
++
)
{
for
(
int
h
=
hstart
;
h
<
hend
;
++
h
)
{
for
(
int
h
=
0
;
h
<
height
;
h
++
)
{
for
(
int
w
=
wstart
;
w
<
wend
;
++
w
)
{
for
(
int
w
=
0
;
w
<
width
;
w
++
)
{
int
bottom_index
=
h
*
width
+
w
;
for
(
int
c
=
0
;
c
<
channel
;
c
++
)
{
out_sum
+=
bottom_data
[
bottom_index
];
*
(
data_tmp
+
n
*
height
*
width
*
channel
+
c
*
amount_per_side
+
width
*
h
+
w
)
=
*
((
*
data_in
)
++
);
}
}
}
}
}
}
*
data_in
=
data_tmp
;
Dtype
bin_area
=
(
hend
-
hstart
)
*
(
wend
-
wstart
);
fpga
::
fpga_free
(
data_in_tmp
);
top_data
[
index
]
=
is_empty
?
0.
:
out_sum
/
bin_area
;
}
}
template
<
>
void
convert_to_hwc
(
float
**
data_in
,
int
channel
,
int
height
,
int
width
,
void
PSRoiPoolKernel
<
FPGA
,
float
>::
Compute
(
const
PSRoiPoolParam
<
FPGA
>&
param
)
{
int
num
)
{
auto
input_tensor
=
param
.
float_input
.
get
();
float
*
data_in_tmp
=
*
data_in
;
fpga
::
PerformBypass
(
param
.
input_arg
);
float
*
data_tmp
=
reinterpret_cast
<
float
*>
(
fpga
::
fpga_invalidate
(
input_tensor
->
data
<
float
>
(),
fpga
::
fpga_malloc
(
num
*
channel
*
height
*
width
*
sizeof
(
float
)));
input_tensor
->
numel
()
*
sizeof
(
float
));
int64_t
amount_per_row
=
width
*
channel
;
for
(
int
n
=
0
;
n
<
num
;
n
++
)
{
auto
*
in
=
input_tensor
;
for
(
int
c
=
0
;
c
<
channel
;
c
++
)
{
auto
*
rois
=
param
.
input_rois_
;
for
(
int
h
=
0
;
h
<
height
;
h
++
)
{
auto
*
out
=
param
.
output_
;
// param.float_output.get();
int64_t
offset_height
=
h
*
amount_per_row
;
for
(
int
w
=
0
;
w
<
width
;
w
++
)
{
auto
pooled_height
=
param
.
pooled_height_
;
*
(
data_tmp
+
n
*
channel
*
height
*
width
+
offset_height
+
auto
pooled_width
=
param
.
pooled_width_
;
w
*
channel
+
c
)
=
*
((
*
data_in
)
++
);
auto
spatial_scale
=
param
.
spatial_scale_
;
}
auto
output_channels
=
param
.
output_channels_
;
}
auto
in_dims
=
in
->
dims
();
int
batch_size
=
in_dims
[
0
];
int
input_channels
=
in_dims
[
1
];
int
height
=
in_dims
[
2
];
int
width
=
in_dims
[
3
];
int
rois_num
=
rois
->
dims
()[
0
];
auto
data_nhwc
=
in
->
mutable_data
<
float
>
();
fpga
::
image
::
convert_to_chw
(
&
data_nhwc
,
input_channels
,
height
,
width
);
framework
::
DDim
dims_out_new
=
framework
::
make_ddim
(
{
rois_num
,
(
param
.
output_
)
->
dims
()[
1
],
(((
param
.
output_
)
->
dims
()[
2
])),
(
param
.
output_
)
->
dims
()[
3
]});
(
param
.
output_
)
->
Resize
(
dims_out_new
);
const
float
*
input_data
=
data_nhwc
;
// in->data<float>();
framework
::
Tensor
rois_batch_id_list
;
rois_batch_id_list
.
Resize
({
rois_num
});
auto
rois_batch_id_data
=
rois_batch_id_list
.
mutable_data
<
int
>
();
PADDLE_MOBILE_ENFORCE
(
rois
->
NumLevels
()
>
0
,
"ROIS should not be empty"
);
auto
rois_lod
=
rois
->
lod
().
back
();
int
rois_batch_size
=
rois_lod
.
size
()
-
1
;
PADDLE_MOBILE_ENFORCE
(
rois_batch_size
==
batch_size
,
"the rois_batch_size and input(X) batch_size should be the same."
);
int
rois_num_with_lod
=
rois_lod
[
rois_batch_size
];
PADDLE_MOBILE_ENFORCE
(
rois_num_with_lod
==
rois_num
,
"the rois_num from input and lod must be the same"
);
PADDLE_MOBILE_ENFORCE
(
input_channels
==
output_channels
*
pooled_height
*
pooled_width
,
"the channels of input X should equal the product of "
"output_channels x pooled_height x pooled_width"
);
// calculate batch id index for each roi according to LoD
for
(
int
n
=
0
;
n
<
rois_batch_size
;
++
n
)
{
for
(
size_t
i
=
rois_lod
[
n
];
i
<
rois_lod
[
n
+
1
];
++
i
)
{
rois_batch_id_data
[
i
]
=
n
;
}
}
}
}
auto
output_data
=
out
->
mutable_data
<
float
>
();
*
data_in
=
data_tmp
;
auto
input_rois
=
rois
->
data
<
float
>
();
fpga
::
fpga_free
(
data_in_tmp
);
}
// calculate psroipooling, parallel processing can be implemented per ROI
int
index
=
pooled_height
*
pooled_width
*
output_channels
*
rois_num
;
template
<
>
for
(
int
idx
=
0
;
idx
<
index
;
idx
++
)
{
void
PSRoiPoolKernel
<
FPGA
,
float
>::
Compute
(
const
PSRoiPoolParam
<
FPGA
>&
param
)
{
PSROIPooling
<
float
>
(
input_data
,
spatial_scale
,
input_channels
,
height
,
auto
input_tensor
=
param
.
float_input
.
get
();
width
,
pooled_height
,
pooled_width
,
input_rois
,
fpga
::
PerformBypass
(
param
.
input_arg
);
output_channels
,
pooled_height
,
output_data
,
idx
,
fpga
::
fpga_invalidate
(
input_tensor
->
data
<
float
>
(),
rois_batch_id_data
);
input_tensor
->
numel
()
*
sizeof
(
float
));
}
//
auto
*
in
=
input_tensor
;
fpga
::
image
::
convert_to_hwc
(
&
output_data
,
output_channels
,
pooled_height
,
auto
*
rois
=
param
.
input_rois_
;
pooled_width
,
rois_num
);
auto
*
out
=
param
.
output_
;
// param.float_output.get();
out
->
reset_data_ptr
(
output_data
);
}
auto
pooled_height
=
param
.
pooled_height_
;
auto
pooled_width
=
param
.
pooled_width_
;
}
// namespace operators
auto
spatial_scale
=
param
.
spatial_scale_
;
}
// namespace paddle_mobile
auto
output_channels
=
param
.
output_channels_
;
#endif // PSROI_POOL_OP
auto
in_dims
=
in
->
dims
();
int
batch_size
=
in_dims
[
0
];
int
input_channels
=
in_dims
[
1
];
int
height
=
in_dims
[
2
];
int
width
=
in_dims
[
3
];
int
rois_num
=
rois
->
dims
()[
0
];
auto
data_nhwc
=
in
->
mutable_data
<
float
>
();
convert_to_chw
(
&
data_nhwc
,
input_channels
,
height
,
width
,
1
);
framework
::
DDim
dims_out_new
=
framework
::
make_ddim
(
{
rois_num
,
(
param
.
output_
)
->
dims
()[
1
],
(((
param
.
output_
)
->
dims
()[
2
])),
(
param
.
output_
)
->
dims
()[
3
]});
(
param
.
output_
)
->
Resize
(
dims_out_new
);
const
float
*
input_data
=
data_nhwc
;
// in->data<float>();
framework
::
Tensor
rois_batch_id_list
;
rois_batch_id_list
.
Resize
({
rois_num
});
auto
rois_batch_id_data
=
rois_batch_id_list
.
mutable_data
<
int
>
();
PADDLE_MOBILE_ENFORCE
(
rois
->
NumLevels
()
>
0
,
"ROIS should not be empty"
);
auto
rois_lod
=
rois
->
lod
().
back
();
int
rois_batch_size
=
rois_lod
.
size
()
-
1
;
PADDLE_MOBILE_ENFORCE
(
rois_batch_size
==
batch_size
,
"the rois_batch_size and input(X) batch_size should be the same."
);
int
rois_num_with_lod
=
rois_lod
[
rois_batch_size
];
PADDLE_MOBILE_ENFORCE
(
rois_num_with_lod
==
rois_num
,
"the rois_num from input and lod must be the same"
);
PADDLE_MOBILE_ENFORCE
(
input_channels
==
output_channels
*
pooled_height
*
pooled_width
,
"the channels of input X should equal the product of "
"output_channels x pooled_height x pooled_width"
);
// calculate batch id index for each roi according to LoD
//for (int n = 0; n < rois_batch_size; ++n) {
//for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) {
//rois_batch_id_data[i] = n;
// }
//}
auto
output_data
=
out
->
mutable_data
<
float
>
();
auto
input_rois
=
rois
->
data
<
float
>
();
// calculate psroipooling, parallel processing can be implemented per ROI
for
(
int
n
=
0
;
n
<
rois_num
;
++
n
)
{
// [start, end) interval for spatial sampling
auto
offset_input_rois
=
input_rois
+
n
*
4
;
auto
roi_start_w
=
static_cast
<
float
>
(
round
(
offset_input_rois
[
0
]))
*
spatial_scale
;
auto
roi_start_h
=
static_cast
<
float
>
(
round
(
offset_input_rois
[
1
]))
*
spatial_scale
;
auto
roi_end_w
=
static_cast
<
float
>
(
round
(
offset_input_rois
[
2
])
+
1.
)
*
spatial_scale
;
auto
roi_end_h
=
static_cast
<
float
>
(
round
(
offset_input_rois
[
3
])
+
1.
)
*
spatial_scale
;
// Force too small rois to be 1 x 1
auto
roi_height
=
std
::
max
(
roi_end_h
-
roi_start_h
,
0.1
f
);
// avoid 0
auto
roi_width
=
std
::
max
(
roi_end_w
-
roi_start_w
,
0.1
f
);
// Compute bin size w and h at input feature map
auto
bin_size_h
=
roi_height
/
static_cast
<
float
>
(
pooled_height
);
auto
bin_size_w
=
roi_width
/
static_cast
<
float
>
(
pooled_width
);
int
roi_batch_ind
=
0
;
//rois_batch_id_data[n];
//std::cout << "roi_batch_ind: " << roi_batch_ind << std::endl;
for
(
int
c
=
0
;
c
<
output_channels
;
++
c
){
for
(
int
ph
=
0
;
ph
<
pooled_height
;
ph
++
){
int
index
=
pooled_width
;
int
nid
=
n
*
output_channels
*
pooled_height
*
pooled_width
+
c
*
pooled_width
*
pooled_height
+
ph
*
pooled_width
;
for
(
int
idx
=
0
;
idx
<
index
;
idx
++
){
PSROIPooling
<
float
>
(
input_data
,
input_channels
,
height
,
width
,
pooled_height
,
pooled_width
,
input_rois
,
output_channels
,
pooled_height
,
output_data
,
idx
,
nid
,
bin_size_h
,
bin_size_w
,
roi_start_h
,
roi_start_w
,
c
,
ph
,
roi_batch_ind
);
}
}
}
}
convert_to_hwc
(
&
output_data
,
output_channels
,
pooled_height
,
pooled_width
,
rois_num
);
out
->
reset_data_ptr
(
output_data
);
}
}
// namespace operators
}
// namespace paddle_mobile
#endif // PSROI_POOL_OP
src/operators/kernel/fpga/V1/roialign_pool_kernel.cpp
0 → 100644
浏览文件 @
2f507f76
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef ROIALIGN_POOL_OP
#include <cmath>
#include <vector>
#include "operators/kernel/detection_kernel.h"
#include "fpga/V1/api.h"
#include "fpga/V1/image.h"
namespace
paddle_mobile
{
namespace
operators
{
/**
 * One-time setup for the FPGA ROI-Align kernel.
 *
 * Allocates the FP32 staging tensor (`param->float_input`), fills in the
 * bypass descriptor (`param->input_arg`) that converts the FP16 input into
 * that staging tensor, and resizes/allocates the output so that its first
 * dimension equals the number of ROIs.
 *
 * @param param kernel parameters; `float_input`, `input_arg` and `output_`
 *              are modified.
 * @return always true.
 */
template <>
bool RoiAlignPoolKernel<FPGA, float>::Init(RoiAlignPoolParam<FPGA>* param) {
  // Reject inputs whose channels * width (dims[1] * dims[3]) is not a
  // multiple of IMAGE_ALIGNMENT.
  auto x_dims = param->input_x_->dims();
  PADDLE_MOBILE_ENFORCE(x_dims[1] * x_dims[3] % IMAGE_ALIGNMENT == 0,
                        "data not aligned");

  // FP32 staging tensor with the same dims as the input.
  param->float_input = std::make_shared<Tensor>();
  param->float_input->mutable_data<float>(param->input_x_->dims());

  // Bypass descriptor: FP16 source image -> FP32 staging tensor, HWC layout
  // on both sides.
  auto x = param->input_x_;
  fpga::BypassArgs bypass = {fpga::DATA_TYPE_FP16};
  bypass.input_data_type = fpga::DATA_TYPE_FP16;
  bypass.output_data_type = fpga::DATA_TYPE_FP32;
  bypass.input_layout_type = fpga::LAYOUT_HWC;
  bypass.output_layout_type = fpga::LAYOUT_HWC;
  bypass.image.address = x->data<half>();
  bypass.image.channels = static_cast<uint32_t>(x->dims()[1]);
  bypass.image.height = static_cast<uint32_t>(x->dims()[2]);
  bypass.image.width = static_cast<uint32_t>(x->dims()[3]);
  bypass.output.address = param->float_input->mutable_data<float>();
  bypass.output.scale_address = param->float_input->scale;
  param->input_arg = bypass;

  // The output keeps its dims 1..3 but gets one entry per ROI in dim 0.
  int roi_count = param->input_rois_->dims()[0];
  framework::DDim pooled_dims = framework::make_ddim(
      {roi_count, param->output_->dims()[1], param->output_->dims()[2],
       param->output_->dims()[3]});
  param->output_->Resize(pooled_dims);
  param->output_->mutable_data<float>(pooled_dims);

  return true;
}
/**
 * Cached bilinear-interpolation data for one sampling point.
 *
 * pos1..pos4 are flat offsets (row * width + column) of the four neighbouring
 * pixels inside a single channel plane, in the order
 * (low row, low col), (low row, high col), (high row, low col),
 * (high row, high col); w1..w4 are the matching interpolation weights.
 */
template <typename T>
struct PreCalc {
  int pos1, pos2, pos3, pos4;
  T w1, w2, w3, w4;
};
template
<
typename
T
>
void
pre_calc_for_bilinear_interpolate
(
const
int
height
,
const
int
width
,
const
int
pooled_height
,
const
int
pooled_width
,
const
int
iy_upper
,
const
int
ix_upper
,
T
roi_start_h
,
T
roi_start_w
,
T
bin_size_h
,
T
bin_size_w
,
int
roi_bin_grid_h
,
int
roi_bin_grid_w
,
std
::
vector
<
PreCalc
<
T
>>&
pre_calc
)
{
int
pre_calc_index
=
0
;
for
(
int
ph
=
0
;
ph
<
pooled_height
;
ph
++
)
{
for
(
int
pw
=
0
;
pw
<
pooled_width
;
pw
++
)
{
for
(
int
iy
=
0
;
iy
<
iy_upper
;
iy
++
)
{
const
T
yy
=
roi_start_h
+
ph
*
bin_size_h
+
static_cast
<
T
>
(
iy
+
.5
f
)
*
bin_size_h
/
static_cast
<
T
>
(
roi_bin_grid_h
);
// e.g., 0.5, 1.5
for
(
int
ix
=
0
;
ix
<
ix_upper
;
ix
++
)
{
const
T
xx
=
roi_start_w
+
pw
*
bin_size_w
+
static_cast
<
T
>
(
ix
+
.5
f
)
*
bin_size_w
/
static_cast
<
T
>
(
roi_bin_grid_w
);
T
x
=
xx
;
T
y
=
yy
;
// deal with: inverse elements are out of feature map boundary
if
(
y
<
-
1.0
||
y
>
height
||
x
<
-
1.0
||
x
>
width
)
{
// empty
PreCalc
<
T
>
pc
;
pc
.
pos1
=
0
;
pc
.
pos2
=
0
;
pc
.
pos3
=
0
;
pc
.
pos4
=
0
;
pc
.
w1
=
0
;
pc
.
w2
=
0
;
pc
.
w3
=
0
;
pc
.
w4
=
0
;
pre_calc
[
pre_calc_index
]
=
pc
;
pre_calc_index
+=
1
;
continue
;
}
if
(
y
<=
0
)
{
y
=
0
;
}
if
(
x
<=
0
)
{
x
=
0
;
}
int
y_low
=
(
int
)
y
;
int
x_low
=
(
int
)
x
;
int
y_high
;
int
x_high
;
if
(
y_low
>=
height
-
1
)
{
y_high
=
y_low
=
height
-
1
;
y
=
(
T
)
y_low
;
}
else
{
y_high
=
y_low
+
1
;
}
if
(
x_low
>=
width
-
1
)
{
x_high
=
x_low
=
width
-
1
;
x
=
(
T
)
x_low
;
}
else
{
x_high
=
x_low
+
1
;
}
T
ly
=
y
-
y_low
;
T
lx
=
x
-
x_low
;
T
hy
=
1.
-
ly
,
hx
=
1.
-
lx
;
T
w1
=
hy
*
hx
,
w2
=
hy
*
lx
,
w3
=
ly
*
hx
,
w4
=
ly
*
lx
;
// save weights and indeces
PreCalc
<
T
>
pc
;
pc
.
pos1
=
y_low
*
width
+
x_low
;
pc
.
pos2
=
y_low
*
width
+
x_high
;
pc
.
pos3
=
y_high
*
width
+
x_low
;
pc
.
pos4
=
y_high
*
width
+
x_high
;
pc
.
w1
=
w1
;
pc
.
w2
=
w2
;
pc
.
w3
=
w3
;
pc
.
w4
=
w4
;
pre_calc
[
pre_calc_index
]
=
pc
;
pre_calc_index
+=
1
;
}
}
}
}
}
template
<
typename
T
>
void
ROIAlignForward
(
const
int
nthreads
,
const
T
*
bottom_data
,
const
T
&
spatial_scale
,
const
int
channels
,
const
int
height
,
const
int
width
,
const
int
pooled_height
,
const
int
pooled_width
,
const
int
sampling_ratio
,
const
T
*
bottom_rois
,
T
*
top_data
)
{
int
n_rois
=
nthreads
/
channels
/
pooled_width
/
pooled_height
;
for
(
int
n
=
0
;
n
<
n_rois
;
n
++
)
{
int
index_n
=
n
*
channels
*
pooled_width
*
pooled_height
;
// roi could have 4 or 5 columns
const
T
*
offset_bottom_rois
=
bottom_rois
+
n
*
4
;
int
roi_batch_ind
=
0
;
// if (roi_cols == 5) {
// roi_batch_ind = offset_bottom_rois[0];
// offset_bottom_rois++;
// }
// Do not using rounding; this implementation detail is critical
T
roi_start_w
=
offset_bottom_rois
[
0
]
*
spatial_scale
;
T
roi_start_h
=
offset_bottom_rois
[
1
]
*
spatial_scale
;
T
roi_end_w
=
offset_bottom_rois
[
2
]
*
spatial_scale
;
T
roi_end_h
=
offset_bottom_rois
[
3
]
*
spatial_scale
;
// T roi_start_w = round(offset_bottom_rois[0] * spatial_scale);
// T roi_start_h = round(offset_bottom_rois[1] * spatial_scale);
// T roi_end_w = round(offset_bottom_rois[2] * spatial_scale);
// T roi_end_h = round(offset_bottom_rois[3] * spatial_scale);
// Force malformed ROIs to be 1x1
T
roi_width
=
std
::
max
(
roi_end_w
-
roi_start_w
,
(
T
)
1.
);
T
roi_height
=
std
::
max
(
roi_end_h
-
roi_start_h
,
(
T
)
1.
);
T
bin_size_h
=
static_cast
<
T
>
(
roi_height
)
/
static_cast
<
T
>
(
pooled_height
);
T
bin_size_w
=
static_cast
<
T
>
(
roi_width
)
/
static_cast
<
T
>
(
pooled_width
);
// We use roi_bin_grid to sample the grid and mimic integral
int
roi_bin_grid_h
=
(
sampling_ratio
>
0
)
?
sampling_ratio
:
ceil
(
roi_height
/
pooled_height
);
// e.g., = 2
int
roi_bin_grid_w
=
(
sampling_ratio
>
0
)
?
sampling_ratio
:
ceil
(
roi_width
/
pooled_width
);
// We do average (integral) pooling inside a bin
const
T
count
=
roi_bin_grid_h
*
roi_bin_grid_w
;
// e.g. = 4
// we want to precalculate indeces and weights shared by all chanels,
// this is the key point of optimiation
std
::
vector
<
PreCalc
<
T
>>
pre_calc
(
roi_bin_grid_h
*
roi_bin_grid_w
*
pooled_width
*
pooled_height
);
pre_calc_for_bilinear_interpolate
(
height
,
width
,
pooled_height
,
pooled_width
,
roi_bin_grid_h
,
roi_bin_grid_w
,
roi_start_h
,
roi_start_w
,
bin_size_h
,
bin_size_w
,
roi_bin_grid_h
,
roi_bin_grid_w
,
pre_calc
);
for
(
int
c
=
0
;
c
<
channels
;
c
++
)
{
int
index_n_c
=
index_n
+
c
*
pooled_width
*
pooled_height
;
const
T
*
offset_bottom_data
=
bottom_data
+
(
roi_batch_ind
*
channels
+
c
)
*
height
*
width
;
int
pre_calc_index
=
0
;
for
(
int
ph
=
0
;
ph
<
pooled_height
;
ph
++
)
{
for
(
int
pw
=
0
;
pw
<
pooled_width
;
pw
++
)
{
int
index
=
index_n_c
+
ph
*
pooled_width
+
pw
;
T
output_val
=
0.
;
for
(
int
iy
=
0
;
iy
<
roi_bin_grid_h
;
iy
++
)
{
for
(
int
ix
=
0
;
ix
<
roi_bin_grid_w
;
ix
++
)
{
PreCalc
<
T
>
pc
=
pre_calc
[
pre_calc_index
];
output_val
+=
pc
.
w1
*
offset_bottom_data
[
pc
.
pos1
]
+
pc
.
w2
*
offset_bottom_data
[
pc
.
pos2
]
+
pc
.
w3
*
offset_bottom_data
[
pc
.
pos3
]
+
pc
.
w4
*
offset_bottom_data
[
pc
.
pos4
];
pre_calc_index
+=
1
;
}
}
output_val
/=
count
;
top_data
[
index
]
=
output_val
;
}
// for pw
}
// for ph
}
// for c
}
// for n
}
/**
 * Runs ROI-Align pooling for one invocation of the FPGA kernel.
 *
 * Steps:
 *   1. Execute the bypass configured in Init (FP16 input -> FP32 staging
 *      tensor) and invalidate the staging buffer's cache range.
 *   2. Convert the staging data to channel-major order.
 *   3. Resize the output to one entry per ROI and run ROIAlignForward once.
 *   4. Convert the pooled result back to HWC and install it as the output
 *      tensor's data pointer.
 *
 * ROIAlignForward already iterates over every ROI, channel and bin, so it is
 * called exactly once; calling it once per output element (as was done
 * before) recomputed the identical result `index` times.
 *
 * @param param kernel parameters prepared by Init.
 */
template <>
void RoiAlignPoolKernel<FPGA, float>::Compute(
    const RoiAlignPoolParam<FPGA>& param) {
  auto input_tensor = param.float_input.get();
  fpga::PerformBypass(param.input_arg);
  fpga::fpga_invalidate(input_tensor->data<float>(),
                        input_tensor->numel() * sizeof(float));

  auto* in = input_tensor;
  auto* rois = param.input_rois_;
  auto* out = param.output_;

  auto pooled_height = param.pooled_height_;
  auto pooled_width = param.pooled_width_;
  auto spatial_scale = param.spatial_scale_;
  auto sampling_ratio = param.sampling_ratio_;

  auto in_dims = in->dims();
  int input_channels = in_dims[1];
  int height = in_dims[2];
  int width = in_dims[3];
  int rois_num = rois->dims()[0];

  // The pointer is passed by address; presumably convert_to_chw repoints it
  // at the converted buffer -- confirm against fpga/V1/image.h.
  auto data_nhwc = in->mutable_data<float>();
  fpga::image::convert_to_chw(&data_nhwc, input_channels, height, width);

  // Output keeps its dims 1..3; dim 0 becomes the number of ROIs.
  framework::DDim dims_out_new = framework::make_ddim(
      {rois_num, (param.output_)->dims()[1], (((param.output_)->dims()[2])),
       (param.output_)->dims()[3]});
  (param.output_)->Resize(dims_out_new);

  // Total number of output elements.
  const int index = input_channels * pooled_height * pooled_width * rois_num;
  auto rois_data = rois->data<float>();
  auto top_data = param.output_->mutable_data<float>();

  // One call computes the whole output. Skip it when there is nothing to
  // compute: with a zero channel/bin count ROIAlignForward would divide by
  // zero while deriving the ROI count.
  if (index > 0) {
    ROIAlignForward<float>(index, data_nhwc, spatial_scale, input_channels,
                           height, width, pooled_height, pooled_width,
                           sampling_ratio, rois_data, top_data);
  }

  fpga::image::convert_to_hwc(&top_data, input_channels, pooled_height,
                              pooled_width, rois_num);
  out->reset_data_ptr(top_data);
}
}
// namespace operators
}
// namespace paddle_mobile
#endif // ROIALIGN_POOL_OP
src/operators/kernel/fpga/V1/softmax_kernel.cpp
浏览文件 @
2f507f76
...
@@ -105,6 +105,7 @@ void SoftmaxKernel<FPGA, float>::Compute(const SoftmaxParam<FPGA> ¶m) {
...
@@ -105,6 +105,7 @@ void SoftmaxKernel<FPGA, float>::Compute(const SoftmaxParam<FPGA> ¶m) {
}
else
{
}
else
{
if
(
param
.
FpgaArgs
().
output
.
activation
.
activation_type
!=
fpga
::
SOFTMAX
)
{
if
(
param
.
FpgaArgs
().
output
.
activation
.
activation_type
!=
fpga
::
SOFTMAX
)
{
Tensor
*
out
=
param
.
Out
();
Tensor
*
out
=
param
.
Out
();
out
->
Resize
({
in_x
->
dims
()[
0
],
out
->
dims
()[
1
],
out
->
dims
()[
2
],
out
->
dims
()[
3
]});
math
::
SoftmaxFuntor
<
CPU
,
float
>
()(
in_x
,
out
);
math
::
SoftmaxFuntor
<
CPU
,
float
>
()(
in_x
,
out
);
}
}
}
}
...
...
src/operators/kernel/fpga/V1/transpose2_kernel.cpp
浏览文件 @
2f507f76
...
@@ -42,6 +42,10 @@ template <>
...
@@ -42,6 +42,10 @@ template <>
void
Transpose2Kernel
<
FPGA
,
float
>::
Compute
(
void
Transpose2Kernel
<
FPGA
,
float
>::
Compute
(
const
Transpose2Param
<
FPGA
>
&
param
)
{
const
Transpose2Param
<
FPGA
>
&
param
)
{
// Transpose2Compute<float>(param);
// Transpose2Compute<float>(param);
auto
input
=
param
.
InputX
();
auto
output
=
param
.
Out
();
output
->
Resize
({
input
->
dims
()[
0
],
output
->
dims
()[
1
],
output
->
dims
()[
2
],
output
->
dims
()[
3
]});
}
}
}
// namespace operators
}
// namespace operators
...
...
tools/op.cmake
浏览文件 @
2f507f76
...
@@ -128,6 +128,7 @@ if (CON GREATER -1)
...
@@ -128,6 +128,7 @@ if (CON GREATER -1)
set
(
FUSION_CONVADDBN_OP ON
)
set
(
FUSION_CONVADDBN_OP ON
)
set
(
RESHAPE2_OP ON
)
set
(
RESHAPE2_OP ON
)
set
(
PSROI_POOL_OP ON
)
set
(
PSROI_POOL_OP ON
)
set
(
ROIALIGN_POOL_OP ON
)
set
(
PROPOSAL_OP ON
)
set
(
PROPOSAL_OP ON
)
set
(
ANCHOR_GENERATOR_OP ON
)
set
(
ANCHOR_GENERATOR_OP ON
)
set
(
SLICE_OP ON
)
set
(
SLICE_OP ON
)
...
@@ -603,6 +604,9 @@ endif()
...
@@ -603,6 +604,9 @@ endif()
if
(
PSROI_POOL_OP
)
if
(
PSROI_POOL_OP
)
add_definitions
(
-DPSROI_POOL_OP
)
add_definitions
(
-DPSROI_POOL_OP
)
endif
()
endif
()
if
(
ROIALIGN_POOL_OP
)
add_definitions
(
-DROIALIGN_POOL_OP
)
endif
()
if
(
ROI_PERSPECTIVE_OP
)
if
(
ROI_PERSPECTIVE_OP
)
add_definitions
(
-DROI_PERSPECTIVE_OP
)
add_definitions
(
-DROI_PERSPECTIVE_OP
)
endif
()
endif
()
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录