Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
Paddle-Lite
提交
80dd5953
P
Paddle-Lite
项目概览
PaddlePaddle
/
Paddle-Lite
通知
331
Star
4
Fork
1
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
271
列表
看板
标记
里程碑
合并请求
78
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle-Lite
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
271
Issue
271
列表
看板
标记
里程碑
合并请求
78
合并请求
78
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
80dd5953
编写于
3月 07, 2019
作者:
R
Ray Liu
提交者:
GitHub
3月 07, 2019
浏览文件
操作
浏览文件
下载
差异文件
Merge branch 'develop' into develop
上级
31bbcef0
55960cd7
变更
21
显示空白变更内容
内联
并排
Showing
21 changed file
with
564 addition
and
401 deletion
+564
-401
metal/paddle-mobile-unit-test/paddle-mobile-unit-test/AppDelegate.swift
...obile-unit-test/paddle-mobile-unit-test/AppDelegate.swift
+12
-12
metal/paddle-mobile-unit-test/paddle-mobile-unit-test/ViewController.swift
...le-unit-test/paddle-mobile-unit-test/ViewController.swift
+4
-4
src/common/types.cpp
src/common/types.cpp
+2
-0
src/common/types.h
src/common/types.h
+1
-0
src/fpga/V1/api.cpp
src/fpga/V1/api.cpp
+4
-2
src/fpga/common/bitmap.cpp
src/fpga/common/bitmap.cpp
+0
-131
src/fpga/common/bitmap.h
src/fpga/common/bitmap.h
+0
-37
src/fpga/common/driver.cpp
src/fpga/common/driver.cpp
+17
-136
src/fpga/common/driver.h
src/fpga/common/driver.h
+8
-2
src/fpga/common/fpga_common.h
src/fpga/common/fpga_common.h
+1
-0
src/operators/detection_ops.cpp
src/operators/detection_ops.cpp
+21
-0
src/operators/detection_ops.h
src/operators/detection_ops.h
+4
-0
src/operators/kernel/detection_kernel.h
src/operators/kernel/detection_kernel.h
+40
-0
src/operators/kernel/fpga/V1/fetch_kernel.cpp
src/operators/kernel/fpga/V1/fetch_kernel.cpp
+16
-14
src/operators/kernel/fpga/V1/pool_kernel.cpp
src/operators/kernel/fpga/V1/pool_kernel.cpp
+4
-1
src/operators/kernel/fpga/V1/proposal_kernel.cpp
src/operators/kernel/fpga/V1/proposal_kernel.cpp
+18
-6
src/operators/kernel/fpga/V1/psroi_pool_kernel.cpp
src/operators/kernel/fpga/V1/psroi_pool_kernel.cpp
+105
-56
src/operators/kernel/fpga/V1/roialign_pool_kernel.cpp
src/operators/kernel/fpga/V1/roialign_pool_kernel.cpp
+296
-0
src/operators/kernel/fpga/V1/softmax_kernel.cpp
src/operators/kernel/fpga/V1/softmax_kernel.cpp
+2
-0
src/operators/kernel/fpga/V1/transpose2_kernel.cpp
src/operators/kernel/fpga/V1/transpose2_kernel.cpp
+5
-0
tools/op.cmake
tools/op.cmake
+4
-0
未找到文件。
metal/paddle-mobile-unit-test/paddle-mobile-unit-test/AppDelegate.swift
浏览文件 @
80dd5953
metal/paddle-mobile-unit-test/paddle-mobile-unit-test/ViewController.swift
浏览文件 @
80dd5953
src/common/types.cpp
浏览文件 @
80dd5953
...
...
@@ -109,6 +109,7 @@ const char *G_OP_TYPE_SLICE = "slice";
const
char
*
G_OP_TYPE_ANCHOR_GENERATOR
=
"anchor_generator"
;
const
char
*
G_OP_TYPE_GENERATE_PROPOSALS
=
"generate_proposals"
;
const
char
*
G_OP_TYPE_PSROI_POOL
=
"psroi_pool"
;
const
char
*
G_OP_TYPE_ROIALIGN_POOL
=
"roialign_pool"
;
const
char
*
G_OP_TYPE_ROI_PERSPECTIVE
=
"roi_perspective_transform"
;
const
char
*
G_OP_TYPE_PAD2D
=
"pad2d"
;
const
char
*
G_OP_TYPE_FUSION_DECONV_ADD_BN_RELU
=
"fusion_deconv_add_bn_relu"
;
...
...
@@ -213,6 +214,7 @@ std::unordered_map<
{{
"Scores"
,
"BboxDeltas"
,
"ImInfo"
,
"Anchors"
,
"Variances"
},
{
"RpnRois"
,
"RpnRoiProbs"
}}},
{
G_OP_TYPE_PSROI_POOL
,
{{
"X"
,
"ROIs"
},
{
"Out"
}}},
{
G_OP_TYPE_ROIALIGN_POOL
,
{{
"X"
,
"ROIs"
},
{
"Out"
}}},
{
G_OP_TYPE_ROI_PERSPECTIVE
,
{{
"X"
,
"ROIs"
},
{
"Out"
}}},
{
G_OP_TYPE_FUSION_DECONV_ADD_BN_RELU
,
{{
"Input"
},
{
"Out"
}}},
{
G_OP_TYPE_FUSION_DECONV_ADD_BN
,
{{
"Input"
},
{
"Out"
}}},
...
...
src/common/types.h
浏览文件 @
80dd5953
...
...
@@ -198,6 +198,7 @@ extern const char *G_OP_TYPE_SLICE;
extern
const
char
*
G_OP_TYPE_ANCHOR_GENERATOR
;
extern
const
char
*
G_OP_TYPE_GENERATE_PROPOSALS
;
extern
const
char
*
G_OP_TYPE_PSROI_POOL
;
extern
const
char
*
G_OP_TYPE_ROIALIGN_POOL
;
extern
const
char
*
G_OP_TYPE_ROI_PERSPECTIVE
;
extern
const
char
*
G_OP_TYPE_PAD2D
;
extern
const
char
*
G_OP_TYPE_FUSION_DECONV_ADD_BN_RELU
;
...
...
src/fpga/V1/api.cpp
浏览文件 @
80dd5953
...
...
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "fpga/V1/api.h"
#include <memory>
#include "fpga/V1/bias_scale.h"
#include "fpga/V1/deconv_filter.h"
#include "fpga/V1/filter.h"
...
...
@@ -368,9 +369,10 @@ void expand_conv_arg(ConvArgs *arg) {
auto
filter_pad_width_mul_channel
=
args
.
image
.
pad_width
*
args
.
image
.
channels
;
auto
image_amount_per_row_multi_win_first
=
image_amount_per_row
*
(
2
*
args
.
kernel
.
stride_h
-
args
.
image
.
pad_height
);
image_amount_per_row
*
(
ROW_PARALLEL_NUM
*
args
.
kernel
.
stride_h
-
args
.
image
.
pad_height
);
auto
image_amount_per_row_multi_win
=
image_amount_per_row
*
(
2
*
args
.
kernel
.
stride_h
);
image_amount_per_row
*
(
ROW_PARALLEL_NUM
*
args
.
kernel
.
stride_h
);
auto
image_block_num
=
block_num
;
auto
image_block_len
=
...
...
src/fpga/common/bitmap.cpp
已删除
100644 → 0
浏览文件 @
31bbcef0
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "fpga/common/bitmap.h"
namespace
fpga_bitmap
{
void
bitmap_set
(
uint64_t
*
map
,
unsigned
int
start
,
int
len
)
{
uint64_t
*
p
=
map
+
BIT_WORD
(
start
);
const
unsigned
int
size
=
start
+
len
;
int
bits_to_set
=
BITS_PER_LONG
-
(
start
%
BITS_PER_LONG
);
uint64_t
mask_to_set
=
BITMAP_FIRST_WORD_MASK
(
start
);
while
(
len
-
bits_to_set
>=
0
)
{
*
p
|=
mask_to_set
;
len
-=
bits_to_set
;
bits_to_set
=
BITS_PER_LONG
;
mask_to_set
=
~
0UL
;
p
++
;
}
if
(
len
)
{
mask_to_set
&=
BITMAP_LAST_WORD_MASK
(
size
);
*
p
|=
mask_to_set
;
}
}
void
bitmap_clear
(
uint64_t
*
map
,
unsigned
int
start
,
int
len
)
{
uint64_t
*
p
=
map
+
BIT_WORD
(
start
);
const
unsigned
int
size
=
start
+
len
;
int
bits_to_clear
=
BITS_PER_LONG
-
(
start
%
BITS_PER_LONG
);
uint64_t
mask_to_clear
=
BITMAP_FIRST_WORD_MASK
(
start
);
while
(
len
-
bits_to_clear
>=
0
)
{
*
p
&=
~
mask_to_clear
;
len
-=
bits_to_clear
;
bits_to_clear
=
BITS_PER_LONG
;
mask_to_clear
=
~
0UL
;
p
++
;
}
if
(
len
)
{
mask_to_clear
&=
BITMAP_LAST_WORD_MASK
(
size
);
*
p
&=
~
mask_to_clear
;
}
}
static
uint64_t
ffs
(
uint64_t
data
)
{
uint64_t
bit
=
0
;
int
i
=
0
;
for
(
i
=
0
;
i
<
sizeof
(
data
)
*
8
;
i
++
)
{
if
(
data
&
(
1UL
<<
i
))
{
bit
=
i
;
break
;
}
}
return
bit
;
}
static
uint64_t
_find_next_bit
(
const
uint64_t
*
addr
,
uint64_t
nbits
,
uint64_t
start
,
uint64_t
invert
)
{
uint64_t
tmp
=
0
;
if
(
!
nbits
||
start
>=
nbits
)
return
nbits
;
tmp
=
addr
[
start
/
BITS_PER_LONG
]
^
invert
;
/* Handle 1st word. */
tmp
&=
BITMAP_FIRST_WORD_MASK
(
start
);
start
=
round_down
(
start
,
BITS_PER_LONG
);
while
(
!
tmp
)
{
start
+=
BITS_PER_LONG
;
if
(
start
>=
nbits
)
return
nbits
;
tmp
=
addr
[
start
/
BITS_PER_LONG
]
^
invert
;
}
return
(
start
+
ffs
(
tmp
))
<
nbits
?
(
start
+
ffs
(
tmp
))
:
nbits
;
}
uint64_t
find_next_zero_bit
(
const
uint64_t
*
addr
,
uint64_t
size
,
uint64_t
offset
)
{
return
_find_next_bit
(
addr
,
size
,
offset
,
~
0UL
);
}
uint64_t
find_next_bit
(
const
uint64_t
*
addr
,
uint64_t
size
,
uint64_t
offset
)
{
return
_find_next_bit
(
addr
,
size
,
offset
,
0UL
);
}
uint64_t
bitmap_find_next_zero_area_off
(
uint64_t
*
map
,
uint64_t
size
,
uint64_t
start
,
unsigned
int
nr
,
uint64_t
align_mask
,
uint64_t
align_offset
)
{
uint64_t
index
=
0
;
uint64_t
end
=
0
;
uint64_t
i
=
0
;
again:
index
=
find_next_zero_bit
(
map
,
size
,
start
);
/* Align allocation */
index
=
__ALIGN_MASK
(
index
+
align_offset
,
align_mask
)
-
align_offset
;
end
=
index
+
nr
;
if
(
end
>
size
)
return
end
;
i
=
find_next_bit
(
map
,
end
,
index
);
if
(
i
<
end
)
{
start
=
i
+
1
;
goto
again
;
}
return
index
;
}
uint64_t
bitmap_find_next_zero_area
(
uint64_t
*
map
,
uint64_t
size
,
uint64_t
start
,
unsigned
int
nr
,
uint64_t
align_mask
)
{
return
bitmap_find_next_zero_area_off
(
map
,
size
,
start
,
nr
,
align_mask
,
0
);
}
}
// namespace fpga_bitmap
src/fpga/common/bitmap.h
已删除
100644 → 0
浏览文件 @
31bbcef0
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <stdint.h>
#include <stdio.h>
#define BITS_PER_LONG 64
#define BIT_WORD(nr) ((nr) / BITS_PER_LONG)
#define BITMAP_FIRST_WORD_MASK(start) (~0UL << ((start) & (BITS_PER_LONG - 1)))
#define BITMAP_LAST_WORD_MASK(nbits) (~0UL >> (-(nbits) & (BITS_PER_LONG - 1)))
#define __ALIGN_KERNEL_MASK(x, mask) (((x) + (mask)) & ~(mask))
#define __ALIGN_MASK(x, mask) __ALIGN_KERNEL_MASK((x), (mask))
#define round_down(x, y) ((x) & ~((y)-1))
namespace
fpga_bitmap
{
void
bitmap_set
(
uint64_t
*
map
,
unsigned
int
start
,
int
len
);
void
bitmap_clear
(
uint64_t
*
map
,
unsigned
int
start
,
int
len
);
uint64_t
bitmap_find_next_zero_area
(
uint64_t
*
map
,
uint64_t
size
,
uint64_t
start
,
unsigned
int
nr
,
uint64_t
align_mask
);
}
// namespace fpga_bitmap
src/fpga/common/driver.cpp
浏览文件 @
80dd5953
...
...
@@ -26,9 +26,9 @@ limitations under the License. */
#include <fstream>
#include <iomanip>
#include <iostream>
#include <utility>
#include "common/enforce.h"
#include "fpga/common/bitmap.h"
#include "fpga/common/driver.h"
namespace
paddle_mobile
{
...
...
@@ -148,34 +148,6 @@ int fpga_regpoll(uint64_t reg, uint64_t val, int time) {
}
}
/*内存管理*/
int
memory_request
(
struct
fpga_memory
*
memory
,
size_t
size
,
uint64_t
*
addr
)
{
uint64_t
_nr
=
DIV_ROUND_UP
(
size
,
FPGA_PAGE_SIZE
);
unsigned
int
nr
=
(
unsigned
int
)
_nr
;
int
ret
=
0
;
uint64_t
a_size
=
FPGA_PAGE_SIZE
*
nr
;
pthread_mutex_lock
(
&
memory
->
mutex
);
unsigned
int
pos
=
(
unsigned
int
)
fpga_bitmap
::
bitmap_find_next_zero_area
(
memory
->
bitmap
,
memory
->
page_num
,
0
,
nr
,
0
);
if
(
pos
<=
memory
->
page_num
)
{
uint64_t
address_ofset
=
memory
->
mem_start
+
((
uint64_t
)
pos
)
*
FPGA_PAGE_SIZE
;
fpga_bitmap
::
bitmap_set
(
memory
->
bitmap
,
pos
,
nr
);
memory
->
nr
[
pos
]
=
nr
;
*
addr
=
address_ofset
;
}
else
{
DLOG
<<
"memory request failed!"
;
ret
=
-
ENOMEM
;
}
pthread_mutex_unlock
(
&
memory
->
mutex
);
return
ret
;
}
void
memory_release
(
struct
fpga_memory
*
memory
)
{
void
*
ptr
=
nullptr
;
...
...
@@ -187,97 +159,6 @@ void memory_release(struct fpga_memory *memory) {
}
}
int
create_fpga_memory_inner
(
struct
fpga_memory
*
memory
,
size_t
memory_size
)
{
int
rc
=
0
;
uint64_t
*
bitmap
=
nullptr
;
unsigned
int
*
nr
=
nullptr
;
// 不允许多份memory创建,所以创建memory结构体不存在互斥
// pthread_mutex_lock(&memory->mutex);
memory
->
page_num
=
(
unsigned
int
)(
memory_size
/
FPGA_PAGE_SIZE
);
memory
->
page_num_long
=
DIV_ROUND_UP
(
memory
->
page_num
,
BITS_PER_LONG
);
bitmap
=
(
uint64_t
*
)
malloc
(
sizeof
(
int64_t
)
*
memory
->
page_num_long
);
// NOLINT
if
(
!
bitmap
)
{
rc
=
-
EFAULT
;
return
rc
;
}
memory
->
bitmap
=
bitmap
;
nr
=
(
unsigned
int
*
)
calloc
(
memory
->
page_num
,
sizeof
(
unsigned
int
));
if
(
!
nr
)
{
rc
=
-
EFAULT
;
free
(
bitmap
);
return
rc
;
}
memory
->
nr
=
nr
;
memory
->
mem_start
=
FPGA_MEM_PHY_ADDR
;
memory
->
mem_end
=
FPGA_MEM_SIZE
;
// pthread_mutex_unlock(memory->mutex);
return
rc
;
}
int
create_fpga_memory
(
struct
fpga_memory
**
memory_info
)
{
int
rc
=
0
;
*
memory_info
=
(
struct
fpga_memory
*
)
malloc
(
sizeof
(
struct
fpga_memory
));
if
(
*
memory_info
==
NULL
)
{
rc
=
-
EFAULT
;
return
rc
;
}
pthread_mutex_init
(
&
((
*
memory_info
)
->
mutex
),
nullptr
);
rc
=
create_fpga_memory_inner
(
*
memory_info
,
FPGA_MEM_SIZE
);
if
(
rc
)
{
free
(
*
memory_info
);
}
return
rc
;
}
int
init_fpga_memory
(
struct
fpga_memory
*
memory
)
{
int
rc
=
0
;
if
(
!
memory
)
{
rc
=
-
EFAULT
;
return
rc
;
}
fpga_bitmap
::
bitmap_clear
(
memory
->
bitmap
,
0
,
memory
->
page_num
);
fpga_bitmap
::
bitmap_set
(
memory
->
bitmap
,
0
,
1
);
// NOTE reserve fpga page 0.
return
0
;
}
void
destroy_fpga_memory
(
struct
fpga_memory
*
memory
)
{
if
(
memory
)
{
free
(
memory
->
nr
);
free
(
memory
->
bitmap
);
free
(
memory
);
}
}
int
fpga_memory_add
()
{
int
rc
=
0
;
rc
=
create_fpga_memory
(
&
g_fpgainfo
.
memory_info
);
if
(
rc
)
{
return
rc
;
}
rc
=
init_fpga_memory
(
g_fpgainfo
.
memory_info
);
if
(
rc
)
{
destroy_fpga_memory
(
g_fpgainfo
.
memory_info
);
return
rc
;
}
return
0
;
}
uint64_t
vaddr_to_paddr_driver
(
void
*
address
)
{
uint64_t
paddr
=
0
;
auto
iter
=
g_fpgainfo
.
fpga_vaddr2paddr_map
.
find
(
address
);
...
...
@@ -314,17 +195,28 @@ void *fpga_reg_free(void *ptr) {
}
}
static
inline
int
do_ioctl
(
int64_t
req
,
const
void
*
arg
)
{
return
ioctl
(
g_fpgainfo
.
fd_mem
,
req
,
arg
);
}
void
*
fpga_malloc_driver
(
size_t
size
)
{
void
*
ret
=
nullptr
;
uint64_t
phy_addr
=
0
;
int
i
=
0
;
struct
MemoryVM2PHYArgs
args
;
struct
MemoryCacheArgs
args_c
;
memory_request
(
g_fpgainfo
.
memory_info
,
size
,
&
phy_addr
);
//
memory_request(g_fpgainfo.memory_info, size, &phy_addr);
ret
=
mmap64
(
nullptr
,
size
,
PROT_READ
|
PROT_WRITE
,
MAP_SHARED
,
g_fpgainfo
.
fd_mem
,
phy_addr
);
g_fpgainfo
.
fd_mem
,
FPGA_MEM_PHY_ADDR
);
PADDLE_MOBILE_ENFORCE
(
ret
!=
(
void
*
)
-
1
,
"Should not be -1"
);
args
.
pVM
=
reinterpret_cast
<
void
*>
(
ret
);
args
.
pPHY
=
reinterpret_cast
<
void
*>
(
0
);
do_ioctl
(
IOCTL_MEMORY_VM2PHY
,
&
args
);
phy_addr
=
(
uint64_t
)
args
.
pPHY
;
g_fpgainfo
.
fpga_vaddr2paddr_map
.
insert
(
std
::
make_pair
(
ret
,
phy_addr
));
g_fpgainfo
.
fpga_addr2size_map
.
insert
(
std
::
make_pair
(
ret
,
size
));
...
...
@@ -342,14 +234,8 @@ void fpga_free_driver(void *ptr) {
g_fpgainfo
.
fpga_addr2size_map
.
erase
(
iter
);
munmap
(
ptr
,
size
);
p_addr
=
vaddr_to_paddr_driver
(
ptr
);
pos
=
(
p_addr
-
g_fpgainfo
.
memory_info
->
mem_start
)
/
FPGA_PAGE_SIZE
;
/*clear bitmap*/
pthread_mutex_lock
(
&
g_fpgainfo
.
memory_info
->
mutex
);
fpga_bitmap
::
bitmap_clear
(
g_fpgainfo
.
memory_info
->
bitmap
,
pos
,
g_fpgainfo
.
memory_info
->
nr
[
pos
]);
pthread_mutex_unlock
(
&
g_fpgainfo
.
memory_info
->
mutex
);
// p_addr = vaddr_to_paddr_driver(ptr);
// pos = (p_addr - g_fpgainfo.memory_info->mem_start) / FPGA_PAGE_SIZE;
auto
iter
=
g_fpgainfo
.
fpga_vaddr2paddr_map
.
find
(
ptr
);
if
(
iter
!=
g_fpgainfo
.
fpga_vaddr2paddr_map
.
end
())
{
...
...
@@ -360,10 +246,6 @@ void fpga_free_driver(void *ptr) {
}
}
static
inline
int
do_ioctl
(
int64_t
req
,
const
void
*
arg
)
{
return
ioctl
(
g_fpgainfo
.
fd_mem
,
req
,
arg
);
}
int
fpga_flush_driver
(
void
*
address
,
size_t
size
)
{
struct
MemoryCacheArgs
args
;
uint64_t
p_addr
;
...
...
@@ -413,7 +295,7 @@ int open_device_driver() {
g_fpgainfo
.
FpgaRegVirAddr
=
(
uint64_t
*
)
fpga_reg_malloc
(
FPGA_REG_SIZE
);
// NOLINT
fpga_memory_add
();
//
fpga_memory_add();
pl_init
();
...
...
@@ -424,7 +306,6 @@ int close_device_driver() {
pl_destroy
();
fpga_reg_free
(
g_fpgainfo
.
FpgaRegVirAddr
);
memory_release
(
g_fpgainfo
.
memory_info
);
destroy_fpga_memory
(
g_fpgainfo
.
memory_info
);
return
0
;
}
...
...
src/fpga/common/driver.h
浏览文件 @
80dd5953
...
...
@@ -31,8 +31,8 @@ namespace driver {
#define FPGA_REG_PHY_ADDR 0x80000000
#define FPGA_REG_SIZE 0x1000
#define FPGA_MEM_PHY_ADDR 0x
4
0000000
#define FPGA_MEM_SIZE 0x
8
0000000
#define FPGA_MEM_PHY_ADDR 0x
2
0000000
#define FPGA_MEM_SIZE 0x
2
0000000
#define FPGA_PAGE_SIZE (16UL * 1024UL)
...
...
@@ -52,9 +52,15 @@ struct MemoryCacheArgs {
size_t
size
;
};
struct
MemoryVM2PHYArgs
{
void
*
pVM
;
void
*
pPHY
;
};
#define IOCTL_FPGA_MAGIC 'F'
#define IOCTL_MEMCACHE_INVAL _IOW(IOCTL_FPGA_MAGIC, 12, struct MemoryCacheArgs)
#define IOCTL_MEMCACHE_FLUSH _IOW(IOCTL_FPGA_MAGIC, 13, struct MemoryCacheArgs)
#define IOCTL_MEMORY_VM2PHY _IOWR(IOCTL_FPGA_MAGIC, 15, struct MemoryVM2PHYArgs)
struct
fpga_pe
{
char
type_name
[
MAX_TYPE_NAME_LENTH
+
1
];
...
...
src/fpga/common/fpga_common.h
浏览文件 @
80dd5953
...
...
@@ -25,6 +25,7 @@ limitations under the License. */
#define FILTER_ELEMENT_ALIGNMENT (16) // Filter element number aligned to 16
#define BS_NUM_ALIGNMENT (8)
#define BIAS_NUM_ALIGNMENT (16)
#define ROW_PARALLEL_NUM (3)
#endif
namespace
paddle_mobile
{
...
...
src/operators/detection_ops.cpp
浏览文件 @
80dd5953
...
...
@@ -65,6 +65,23 @@ void PSRoiPoolOp<DeviceType, T>::InferShape() const {
}
#endif
#ifdef ROIALIGN_POOL_OP
template
<
typename
DeviceType
,
typename
T
>
void
RoiAlignPoolOp
<
DeviceType
,
T
>::
InferShape
()
const
{
const
auto
&
rois_dims
=
this
->
param_
.
input_rois_
->
dims
();
const
int
pooled_height
=
this
->
param_
.
pooled_height_
;
const
int
pooled_width
=
this
->
param_
.
pooled_width_
;
auto
out_dims
=
this
->
param_
.
input_x_
->
dims
();
out_dims
[
0
]
=
rois_dims
[
0
];
// out_dims[1] =
// output_channels; // input_dims[1] / (pooled_height * pooled_width);
out_dims
[
2
]
=
pooled_height
;
out_dims
[
3
]
=
pooled_width
;
this
->
param_
.
output_
->
Resize
(
out_dims
);
}
#endif
#ifdef ROI_PERSPECTIVE_OP
template
<
typename
DeviceType
,
typename
T
>
void
RoiPerspectiveOp
<
DeviceType
,
T
>::
InferShape
()
const
{
...
...
@@ -110,4 +127,8 @@ REGISTER_OPERATOR_FPGA(generate_proposals, ops::ProposalOp);
#ifdef PSROI_POOL_OP
REGISTER_OPERATOR_FPGA
(
psroi_pool
,
ops
::
PSRoiPoolOp
);
#endif
#ifdef ROIALIGN_POOL_OP
REGISTER_OPERATOR_FPGA
(
roialign_pool
,
ops
::
RoiAlignPoolOp
);
#endif
#endif
src/operators/detection_ops.h
浏览文件 @
80dd5953
...
...
@@ -34,6 +34,10 @@ DECLARE_OPERATOR(Proposal, ProposalParam, ProposalKernel);
DECLARE_OPERATOR
(
PSRoiPool
,
PSRoiPoolParam
,
PSRoiPoolKernel
);
#endif
#ifdef ROIALIGN_POOL_OP
DECLARE_OPERATOR
(
RoiAlignPool
,
RoiAlignPoolParam
,
RoiAlignPoolKernel
);
#endif
#ifdef ROI_PERSPECTIVE_OP
DECLARE_OPERATOR
(
RoiPerspective
,
RoiPerspectiveParam
,
RoiPerspectiveKernel
);
#endif
...
...
src/operators/kernel/detection_kernel.h
浏览文件 @
80dd5953
...
...
@@ -14,6 +14,7 @@ limitations under the License. */
#pragma once
#include <memory>
#include <vector>
#include "framework/operator.h"
#include "operators/op_param.h"
...
...
@@ -98,6 +99,8 @@ class ProposalParam : public OpParam {
framework
::
Tensor
*
anchors_
;
framework
::
Tensor
*
variances_
;
std
::
shared_ptr
<
Tensor
>
score_index_
;
framework
::
LoDTensor
*
rpn_rois_
;
framework
::
LoDTensor
*
rpn_probs_
;
...
...
@@ -151,6 +154,43 @@ class PSRoiPoolParam : public OpParam {
DECLARE_KERNEL
(
PSRoiPool
,
PSRoiPoolParam
);
#endif
#ifdef ROIALIGN_POOL_OP
template
<
typename
Dtype
>
class
RoiAlignPoolParam
:
public
OpParam
{
public:
RoiAlignPoolParam
(
const
VariableNameMap
&
inputs
,
const
VariableNameMap
&
outputs
,
const
AttributeMap
&
attrs
,
Scope
*
scope
)
:
OpParam
(
inputs
,
outputs
,
attrs
,
scope
)
{
input_x_
=
OpParam
::
GetVarValue
<
framework
::
LoDTensor
>
(
"X"
,
inputs
,
*
scope
);
input_rois_
=
OpParam
::
GetVarValue
<
framework
::
LoDTensor
>
(
"ROIs"
,
inputs
,
*
scope
);
output_
=
OpParam
::
GetVarValue
<
framework
::
LoDTensor
>
(
"Out"
,
outputs
,
*
scope
);
pooled_height_
=
OpParam
::
GetAttr
<
int
>
(
"pooled_height"
,
attrs
);
pooled_width_
=
OpParam
::
GetAttr
<
int
>
(
"pooled_width"
,
attrs
);
spatial_scale_
=
OpParam
::
GetAttr
<
float
>
(
"spatial_scale"
,
attrs
);
sampling_ratio_
=
OpParam
::
GetAttr
<
float
>
(
"sampling_ratio"
,
attrs
);
}
public:
framework
::
Tensor
*
input_x_
;
framework
::
LoDTensor
*
input_rois_
;
framework
::
Tensor
*
output_
;
int
pooled_height_
;
int
pooled_width_
;
float
spatial_scale_
;
int
sampling_ratio_
;
#ifdef PADDLE_MOBILE_FPGA
std
::
shared_ptr
<
Tensor
>
float_input
,
float_output
;
fpga
::
BypassArgs
input_arg
,
output_arg
;
#endif
};
DECLARE_KERNEL
(
RoiAlignPool
,
RoiAlignPoolParam
);
#endif
#ifdef ROI_PERSPECTIVE_OP
template
<
typename
Dtype
>
class
RoiPerspectiveParam
:
public
OpParam
{
...
...
src/operators/kernel/fpga/V1/fetch_kernel.cpp
浏览文件 @
80dd5953
...
...
@@ -11,9 +11,7 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "operators/kernel/fetch_kernel.h"
namespace
paddle_mobile
{
namespace
operators
{
...
...
@@ -35,7 +33,7 @@ bool FetchKernel<FPGA, float>::Init(FetchParam<FPGA> *param) {
args
.
input_layout_type
=
fpga
::
LAYOUT_CHW
;
args
.
output_layout_type
=
fpga
::
LAYOUT_HWC
;
args
.
image
.
address
=
input
->
data
<
half
>
();
args
.
image
.
channels
=
(
uint32_t
)
product
(
input
->
dims
()
);
args
.
image
.
channels
=
(
uint32_t
)
(
input
->
fpga_data_num
);
args
.
image
.
height
=
1
;
args
.
image
.
width
=
1
;
args
.
image
.
pad_height
=
0
;
...
...
@@ -58,27 +56,31 @@ void dealign(float *src, float *dst, int input_c, int input_h, int input_w) {
}
template
<
>
void
FetchKernel
<
FPGA
,
float
>::
Compute
(
const
FetchParam
<
FPGA
>
&
param
)
{
auto
input
=
param
.
InputX
(
);
auto
input
=
const_cast
<
Tensor
*>
(
param
.
InputX
()
);
if
(
input
->
type
()
==
typeid
(
float
))
{
auto
output
=
param
.
Out
();
output
->
ShareDataWith
(
*
input
);
return
;
}
fpga
::
PerformBypass
(
param
.
fpga_bypass_args
);
fpga
::
BypassArgs
args
=
param
.
fpga_bypass_args
;
auto
input_address
=
(
input
->
data
<
half
>
());
args
.
image
.
address
=
static_cast
<
void
*>
(
input_address
);
fpga
::
PerformBypass
(
args
);
auto
outC
=
param
.
Out
()
->
dims
()[
1
];
auto
outH
=
param
.
Out
()
->
dims
()[
2
];
auto
outW
=
param
.
Out
()
->
dims
()[
3
];
fpga
::
fpga_invalidate
(
param
.
fpga_bypass_args
.
output
.
address
,
outH
*
(
paddle_mobile
::
fpga
::
align_to_x
(
outC
*
outW
,
16
))
*
sizeof
(
float
));
float
*
outdata_ptr
=
reinterpret_cast
<
float
*>
(
param
.
fpga_bypass_args
.
output
.
address
);
fpga
::
fpga_invalidate
(
param
.
fpga_bypass_args
.
output
.
address
,
param
.
Out
()
->
fpga_data_num
*
sizeof
(
float
));
if
(
param
.
Out
()
->
fpga_data_num
!=
product
(
input
->
dims
()))
{
float
*
data_tmp
=
reinterpret_cast
<
float
*>
(
malloc
(
outC
*
outH
*
outW
*
sizeof
(
float
)));
dealign
(
outdata_ptr
,
data_tmp
,
outC
,
outH
,
outW
);
memcpy
(
outdata_ptr
,
data_tmp
,
outC
*
outH
*
outW
*
sizeof
(
float
));
free
(
data_tmp
);
}
}
template
class
FetchKernel
<
FPGA
,
float
>;
...
...
src/operators/kernel/fpga/V1/pool_kernel.cpp
浏览文件 @
80dd5953
...
...
@@ -73,9 +73,12 @@ void PoolKernel<FPGA, float>::Compute(const PoolParam<FPGA> ¶m) {
if
(
input
->
type
()
==
typeid
(
float
))
{
auto
*
output
=
param
.
Output
();
auto
in
=
input
->
data
<
float
>
();
auto
N
=
input
->
dims
()[
0
];
output
->
Resize
(
{
N
,
output
->
dims
()[
1
],
output
->
dims
()[
2
],
output
->
dims
()[
3
]});
auto
len
=
output
->
numel
();
auto
out
=
output
->
mutable_data
<
float
>
();
int
N
=
input
->
dims
()[
0
],
C
=
input
->
dims
()[
1
],
H
=
input
->
dims
()[
2
],
int
C
=
input
->
dims
()[
1
],
H
=
input
->
dims
()[
2
],
// N = input->dims()[0
],
W
=
input
->
dims
()[
3
];
int
HW
=
H
*
W
,
CHW
=
C
*
H
*
W
,
WC
=
W
*
C
;
...
...
src/operators/kernel/fpga/V1/proposal_kernel.cpp
浏览文件 @
80dd5953
...
...
@@ -65,6 +65,13 @@ bool ProposalKernel<FPGA, float>::Init(ProposalParam<FPGA> *param) {
args
.
output
.
scale_address
=
param
->
float_score
->
scale
;
param
->
score_arg
=
args
;
param
->
score_index_
=
std
::
make_shared
<
Tensor
>
();
param
->
score_index_
->
mutable_data
<
int32_t
>
({
input
->
numel
()});
auto
score_index
=
param
->
score_index_
->
data
<
int32_t
>
();
for
(
int
i
=
0
;
i
<
input
->
numel
();
++
i
)
{
score_index
[
i
]
=
i
;
}
return
true
;
}
template
<
typename
T
>
...
...
@@ -334,17 +341,20 @@ std::pair<Tensor, Tensor> ProposalForOneImage(
const
Tensor
&
im_info_slice
,
const
Tensor
&
anchors
,
const
Tensor
&
variances
,
const
Tensor
&
bbox_deltas_slice
,
// [M, 4]
const
Tensor
&
scores_slice
,
// [N, 1]
int
pre_nms_top_n
,
int
post_nms_top_n
,
float
nms_thresh
,
float
min_size
,
float
eta
)
{
const
Tensor
&
score_index
,
int
pre_nms_top_n
,
int
post_nms_top_n
,
float
nms_thresh
,
float
min_size
,
float
eta
)
{
auto
*
scores_data
=
scores_slice
.
data
<
T
>
();
// Sort index
Tensor
index_t
;
index_t
.
Resize
({
scores_slice
.
numel
()});
int
*
index
=
index_t
.
mutable_data
<
int
>
();
for
(
int
i
=
0
;
i
<
scores_slice
.
numel
();
++
i
)
{
/*
for (int i = 0; i < scores_slice.numel(); ++i) {
index[i] = i;
}
}*/
std
::
memcpy
(
index
,
score_index
.
data
<
int32_t
>
(),
scores_slice
.
numel
()
*
sizeof
(
int
));
auto
compare
=
[
scores_data
](
const
int64_t
&
i
,
const
int64_t
&
j
)
{
return
scores_data
[
i
]
>
scores_data
[
j
];
};
...
...
@@ -490,8 +500,10 @@ void ProposalKernel<FPGA, float>::Compute(const ProposalParam<FPGA> ¶m) {
auto
*
rpn_rois
=
param
.
rpn_rois_
;
auto
*
rpn_roi_probs
=
param
.
rpn_probs_
;
auto
score_index
=
*
(
param
.
score_index_
.
get
());
int
pre_nms_top_n
=
param
.
pre_nms_topn_
;
int
post_nms_top_n
=
param
.
post_nms_topn_
;
int
post_nms_top_n
=
100
;
//
param.post_nms_topn_;
float
nms_thresh
=
param
.
nms_thresh_
;
float
min_size
=
param
.
min_size_
;
float
eta
=
param
.
eta_
;
...
...
@@ -529,7 +541,7 @@ void ProposalKernel<FPGA, float>::Compute(const ProposalParam<FPGA> ¶m) {
std
::
pair
<
Tensor
,
Tensor
>
tensor_pair
=
ProposalForOneImage
<
float
>
(
im_info_slice
,
anchors
,
variances
,
bbox_deltas_slice
,
scores_slice
,
pre_nms_top_n
,
post_nms_top_n
,
nms_thresh
,
min_size
,
eta
);
score_index
,
pre_nms_top_n
,
post_nms_top_n
,
nms_thresh
,
min_size
,
eta
);
Tensor
&
proposals
=
tensor_pair
.
first
;
Tensor
&
scores
=
tensor_pair
.
second
;
...
...
src/operators/kernel/fpga/V1/psroi_pool_kernel.cpp
浏览文件 @
80dd5953
...
...
@@ -15,6 +15,7 @@ limitations under the License. */
#ifdef PSROI_POOL_OP
#include <cmath>
#include <memory>
#include <vector>
#include "operators/kernel/detection_kernel.h"
...
...
@@ -72,42 +73,20 @@ bool PSRoiPoolKernel<FPGA, float>::Init(PSRoiPoolParam<FPGA>* param) {
}
template
<
typename
Dtype
>
void
PSROIPooling
(
const
Dtype
*
bottom_data
,
const
Dtype
spatial_scale
,
const
int
channels
,
const
int
height
,
const
int
width
,
const
int
pooled_
height
,
const
int
pooled_width
,
const
Dtype
*
bottom_rois
,
const
int
output_dim
,
const
int
group_size
,
Dtype
*
top_data
,
// int* mapping_channel
,
int
index
,
int
*
rois_batch_id
)
{
// The output is in order (n, ctop, ph, pw)
// static int cnt = 0
;
int
pw
=
index
%
pooled_width
;
int
ph
=
(
index
/
pooled_width
)
%
pooled_height
;
int
ctop
=
(
index
/
pooled_width
/
pooled_height
)
%
output_dim
;
int
n
=
index
/
pooled_width
/
pooled_height
/
output_dim
;
void
PSROIPooling
(
const
Dtype
*
bottom_data
,
const
int
channels
,
const
int
height
,
const
int
width
,
const
int
pooled_height
,
const
int
pooled_
width
,
const
Dtype
*
bottom_rois
,
const
int
output_dim
,
const
int
group_size
,
Dtype
*
top_data
,
int
index
,
int
nid
,
const
Dtype
Bin_size_h
,
const
Dtype
Bin_size_w
,
const
Dtype
roi_start_h
,
const
Dtype
roi_start_w
,
const
int
ctop
,
const
int
ph
,
const
int
roi_batch_ind
)
{
int
pw
=
index
;
int
hstart
=
floor
(
static_cast
<
Dtype
>
(
ph
)
*
Bin_size_h
+
roi_start_h
)
;
int
wstart
=
floor
(
static_cast
<
Dtype
>
(
pw
)
*
Bin_size_w
+
roi_start_w
)
;
int
hend
=
ceil
(
static_cast
<
Dtype
>
(
ph
+
1
)
*
Bin_size_h
+
roi_start_h
)
;
int
wend
=
ceil
(
static_cast
<
Dtype
>
(
pw
+
1
)
*
Bin_size_w
+
roi_start_w
)
;
// [start, end) interval for spatial sampling
bottom_rois
+=
n
*
4
;
int
roi_batch_ind
=
rois_batch_id
[
n
];
// bottom_rois[0];
Dtype
roi_start_w
=
static_cast
<
Dtype
>
(
round
(
bottom_rois
[
0
]))
*
spatial_scale
;
Dtype
roi_start_h
=
static_cast
<
Dtype
>
(
round
(
bottom_rois
[
1
]))
*
spatial_scale
;
Dtype
roi_end_w
=
static_cast
<
Dtype
>
(
round
(
bottom_rois
[
2
])
+
1.
)
*
spatial_scale
;
Dtype
roi_end_h
=
static_cast
<
Dtype
>
(
round
(
bottom_rois
[
3
])
+
1.
)
*
spatial_scale
;
// Force too small ROIs to be 1x1
Dtype
roi_width
=
std
::
max
(
roi_end_w
-
roi_start_w
,
0.1
f
);
// avoid 0
Dtype
roi_height
=
std
::
max
(
roi_end_h
-
roi_start_h
,
0.1
f
);
// Compute w and h at bottom
Dtype
bin_size_h
=
roi_height
/
static_cast
<
Dtype
>
(
pooled_height
);
Dtype
bin_size_w
=
roi_width
/
static_cast
<
Dtype
>
(
pooled_width
);
int
hstart
=
floor
(
static_cast
<
Dtype
>
(
ph
)
*
bin_size_h
+
roi_start_h
);
int
wstart
=
floor
(
static_cast
<
Dtype
>
(
pw
)
*
bin_size_w
+
roi_start_w
);
int
hend
=
ceil
(
static_cast
<
Dtype
>
(
ph
+
1
)
*
bin_size_h
+
roi_start_h
);
int
wend
=
ceil
(
static_cast
<
Dtype
>
(
pw
+
1
)
*
bin_size_w
+
roi_start_w
);
// Add roi offsets and clip to input boundaries
hstart
=
std
::
min
(
std
::
max
(
hstart
,
0
),
height
);
hend
=
std
::
min
(
std
::
max
(
hend
,
0
),
height
);
...
...
@@ -115,10 +94,9 @@ void PSROIPooling(const Dtype* bottom_data, const Dtype spatial_scale,
wend
=
std
::
min
(
std
::
max
(
wend
,
0
),
width
);
bool
is_empty
=
(
hend
<=
hstart
)
||
(
wend
<=
wstart
);
int
gw
=
pw
;
int
gh
=
ph
;
int
c
=
(
ctop
*
group_size
+
gh
)
*
group_size
+
gw
;
int
c
=
(
ctop
*
group_size
+
ph
)
*
group_size
+
pw
;
Dtype
bin_area
=
(
hend
-
hstart
)
*
(
wend
-
wstart
);
bottom_data
+=
(
roi_batch_ind
*
channels
+
c
)
*
height
*
width
;
Dtype
out_sum
=
0
;
for
(
int
h
=
hstart
;
h
<
hend
;
++
h
)
{
...
...
@@ -128,9 +106,50 @@ void PSROIPooling(const Dtype* bottom_data, const Dtype spatial_scale,
}
}
Dtype
bin_area
=
(
hend
-
hstart
)
*
(
wend
-
wstart
);
top_data
[
index
]
=
is_empty
?
0.
:
out_sum
/
bin_area
;
top_data
[
nid
+
index
]
=
is_empty
?
0.
:
out_sum
/
bin_area
;
}
void
convert_to_chw
(
float
**
data_in
,
int
channel
,
int
height
,
int
width
,
int
num
)
{
float
*
data_in_tmp
=
*
data_in
;
float
*
data_tmp
=
reinterpret_cast
<
float
*>
(
fpga
::
fpga_malloc
(
channel
*
height
*
width
*
sizeof
(
float
)));
// NOLINT
int64_t
amount_per_side
=
width
*
height
;
for
(
int
n
=
0
;
n
<
num
;
n
++
)
{
for
(
int
h
=
0
;
h
<
height
;
h
++
)
{
for
(
int
w
=
0
;
w
<
width
;
w
++
)
{
for
(
int
c
=
0
;
c
<
channel
;
c
++
)
{
*
(
data_tmp
+
n
*
height
*
width
*
channel
+
c
*
amount_per_side
+
width
*
h
+
w
)
=
*
((
*
data_in
)
++
);
}
}
}
}
*
data_in
=
data_tmp
;
fpga
::
fpga_free
(
data_in_tmp
);
}
void
convert_to_hwc
(
float
**
data_in
,
int
channel
,
int
height
,
int
width
,
int
num
)
{
float
*
data_in_tmp
=
*
data_in
;
float
*
data_tmp
=
reinterpret_cast
<
float
*>
(
fpga
::
fpga_malloc
(
num
*
channel
*
height
*
width
*
sizeof
(
float
)));
int64_t
amount_per_row
=
width
*
channel
;
for
(
int
n
=
0
;
n
<
num
;
n
++
)
{
for
(
int
c
=
0
;
c
<
channel
;
c
++
)
{
for
(
int
h
=
0
;
h
<
height
;
h
++
)
{
int64_t
offset_height
=
h
*
amount_per_row
;
for
(
int
w
=
0
;
w
<
width
;
w
++
)
{
*
(
data_tmp
+
n
*
channel
*
height
*
width
+
offset_height
+
w
*
channel
+
c
)
=
*
((
*
data_in
)
++
);
}
}
}
}
*
data_in
=
data_tmp
;
fpga
::
fpga_free
(
data_in_tmp
);
}
template
<
>
void
PSRoiPoolKernel
<
FPGA
,
float
>::
Compute
(
const
PSRoiPoolParam
<
FPGA
>&
param
)
{
auto
input_tensor
=
param
.
float_input
.
get
();
...
...
@@ -155,13 +174,14 @@ void PSRoiPoolKernel<FPGA, float>::Compute(const PSRoiPoolParam<FPGA>& param) {
int
rois_num
=
rois
->
dims
()[
0
];
auto
data_nhwc
=
in
->
mutable_data
<
float
>
();
fpga
::
image
::
convert_to_chw
(
&
data_nhwc
,
input_channels
,
height
,
width
);
fpga
::
image
::
convert_to_chw
(
&
data_nhwc
,
input_channels
,
height
,
width
,
1
);
framework
::
DDim
dims_out_new
=
framework
::
make_ddim
(
{
rois_num
,
(
param
.
output_
)
->
dims
()[
1
],
(((
param
.
output_
)
->
dims
()[
2
])),
(
param
.
output_
)
->
dims
()[
3
]});
(
param
.
output_
)
->
Resize
(
dims_out_new
);
const
float
*
input_data
=
data_nhwc
;
// in->data<float>();
float
*
input_data
=
data_nhwc
;
// in->data<float>();
// shared_ptr<float> input_data(data_nhwc);
framework
::
Tensor
rois_batch_id_list
;
rois_batch_id_list
.
Resize
({
rois_num
});
auto
rois_batch_id_data
=
rois_batch_id_list
.
mutable_data
<
int
>
();
...
...
@@ -183,24 +203,53 @@ void PSRoiPoolKernel<FPGA, float>::Compute(const PSRoiPoolParam<FPGA>& param) {
"output_channels x pooled_height x pooled_width"
);
// calculate batch id index for each roi according to LoD
for
(
int
n
=
0
;
n
<
rois_batch_size
;
++
n
)
{
for
(
size_t
i
=
rois_lod
[
n
];
i
<
rois_lod
[
n
+
1
];
++
i
)
{
rois_batch_id_data
[
i
]
=
n
;
}
}
//
for (int n = 0; n < rois_batch_size; ++n) {
//
for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) {
//
rois_batch_id_data[i] = n;
//
}
//
}
auto
output_data
=
out
->
mutable_data
<
float
>
();
auto
input_rois
=
rois
->
data
<
float
>
();
// calculate psroipooling, parallel processing can be implemented per ROI
int
index
=
pooled_height
*
pooled_width
*
output_channels
*
rois_num
;
for
(
int
n
=
0
;
n
<
rois_num
;
++
n
)
{
// [start, end) interval for spatial sampling
auto
offset_input_rois
=
input_rois
+
n
*
4
;
auto
roi_start_w
=
static_cast
<
float
>
(
round
(
offset_input_rois
[
0
]))
*
spatial_scale
;
auto
roi_start_h
=
static_cast
<
float
>
(
round
(
offset_input_rois
[
1
]))
*
spatial_scale
;
auto
roi_end_w
=
static_cast
<
float
>
(
round
(
offset_input_rois
[
2
])
+
1.
)
*
spatial_scale
;
auto
roi_end_h
=
static_cast
<
float
>
(
round
(
offset_input_rois
[
3
])
+
1.
)
*
spatial_scale
;
// Force too small rois to be 1 x 1
auto
roi_height
=
std
::
max
(
roi_end_h
-
roi_start_h
,
0.1
f
);
// avoid 0
auto
roi_width
=
std
::
max
(
roi_end_w
-
roi_start_w
,
0.1
f
);
// Compute bin size w and h at input feature map
auto
bin_size_h
=
roi_height
/
static_cast
<
float
>
(
pooled_height
);
auto
bin_size_w
=
roi_width
/
static_cast
<
float
>
(
pooled_width
);
int
roi_batch_ind
=
0
;
// rois_batch_id_data[n];
// std::cout << "roi_batch_ind: " << roi_batch_ind << std::endl;
for
(
int
c
=
0
;
c
<
output_channels
;
++
c
)
{
for
(
int
ph
=
0
;
ph
<
pooled_height
;
ph
++
)
{
int
index
=
pooled_width
;
int
nid
=
n
*
output_channels
*
pooled_height
*
pooled_width
+
c
*
pooled_width
*
pooled_height
+
ph
*
pooled_width
;
for
(
int
idx
=
0
;
idx
<
index
;
idx
++
)
{
PSROIPooling
<
float
>
(
input_data
,
spatial_scale
,
input_channels
,
height
,
width
,
pooled_height
,
pooled_width
,
input_rois
,
PSROIPooling
<
float
>
(
input_data
,
input_channels
,
height
,
width
,
pooled_height
,
pooled_width
,
input_rois
,
output_channels
,
pooled_height
,
output_data
,
idx
,
rois_batch_id_data
);
nid
,
bin_size_h
,
bin_size_w
,
roi_start_h
,
roi_start_w
,
c
,
ph
,
roi_batch_ind
);
}
}
}
}
//
fpga
::
fpga_free
(
input_data
);
fpga
::
image
::
convert_to_hwc
(
&
output_data
,
output_channels
,
pooled_height
,
pooled_width
,
rois_num
);
out
->
reset_data_ptr
(
output_data
);
...
...
src/operators/kernel/fpga/V1/roialign_pool_kernel.cpp
0 → 100644
浏览文件 @
80dd5953
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef ROIALIGN_POOL_OP
#include <cmath>
#include <vector>
#include "operators/kernel/detection_kernel.h"
#include "fpga/V1/api.h"
#include "fpga/V1/image.h"
namespace
paddle_mobile
{
namespace
operators
{
// Prepares the RoiAlignPool kernel: allocates an FP32 staging tensor,
// fills the bypass descriptor that converts the FPGA's FP16 output to
// FP32 (layout stays HWC), and resizes the op output to {rois_num, C, H, W}.
template <>
bool RoiAlignPoolKernel<FPGA, float>::Init(RoiAlignPoolParam<FPGA>* param) {
  auto dims = param->input_x_->dims();
  // The checked product is channels (dims[1]) * width (dims[3]); it must be
  // a multiple of IMAGE_ALIGNMENT for the FPGA data path.
  PADDLE_MOBILE_ENFORCE(dims[1] * dims[3] % IMAGE_ALIGNMENT == 0,
                        "data not aligned");
  // FP32 staging tensor that receives the bypassed (FP16 -> FP32) input.
  param->float_input = std::make_shared<Tensor>();
  param->float_input->mutable_data<float>(param->input_x_->dims());
  auto input = param->input_x_;
  // Bypass descriptor: converts only the data type; both sides are HWC.
  fpga::BypassArgs args = {fpga::DATA_TYPE_FP16};
  args.input_layout_type = fpga::LAYOUT_HWC;
  args.output_layout_type = fpga::LAYOUT_HWC;
  args.input_data_type = fpga::DATA_TYPE_FP16;
  args.output_data_type = fpga::DATA_TYPE_FP32;
  args.image.address = input->data<half>();
  args.image.height = (uint32_t)input->dims()[2];
  args.image.width = (uint32_t)input->dims()[3];
  args.image.channels = (uint32_t)input->dims()[1];
  args.output.address = param->float_input->mutable_data<float>();
  args.output.scale_address = param->float_input->scale;
  param->input_arg = args;
  auto* rois = param->input_rois_;
  int rois_num = rois->dims()[0];
  // One output entry per ROI; the remaining dims keep the op's own extents.
  framework::DDim dims_out_new = framework::make_ddim(
      {rois_num, param->output_->dims()[1], param->output_->dims()[2],
       param->output_->dims()[3]});
  param->output_->Resize(dims_out_new);
  param->output_->mutable_data<float>(dims_out_new);
  return true;
}
// Holds, for one bilinear sample point, the flat offsets of its four
// neighbouring input pixels and their interpolation weights.
template <typename T>
struct PreCalc {
  int pos1;  // top-left neighbour:     y_low  * width + x_low
  int pos2;  // top-right neighbour:    y_low  * width + x_high
  int pos3;  // bottom-left neighbour:  y_high * width + x_low
  int pos4;  // bottom-right neighbour: y_high * width + x_high
  T w1;      // weight for pos1 (hy * hx)
  T w2;      // weight for pos2 (hy * lx)
  T w3;      // weight for pos3 (ly * lx's complement, i.e. ly * hx)
  T w4;      // weight for pos4 (ly * lx)
};

// Precomputes bilinear-interpolation offsets and weights for every sample
// point of every pooled bin of one ROI, so the per-channel accumulation
// loop can reuse them. Writes pooled_height * pooled_width * iy_upper *
// ix_upper entries into pre_calc (caller must size it accordingly).
// Sample points falling outside [-1, height] x [-1, width] get an all-zero
// entry (zero weights, offset 0) so they contribute nothing.
template <typename T>
void pre_calc_for_bilinear_interpolate(
    const int height, const int width, const int pooled_height,
    const int pooled_width, const int iy_upper, const int ix_upper,
    T roi_start_h, T roi_start_w, T bin_size_h, T bin_size_w,
    int roi_bin_grid_h, int roi_bin_grid_w,
    std::vector<PreCalc<T>>& pre_calc) {  // NOLINT
  int pre_calc_index = 0;
  for (int ph = 0; ph < pooled_height; ph++) {
    for (int pw = 0; pw < pooled_width; pw++) {
      for (int iy = 0; iy < iy_upper; iy++) {
        // Sample row: centre of the iy-th sub-cell inside the bin.
        const T yy = roi_start_h + ph * bin_size_h +
                     static_cast<T>(iy + .5f) * bin_size_h /
                         static_cast<T>(roi_bin_grid_h);  // e.g., 0.5, 1.5
        for (int ix = 0; ix < ix_upper; ix++) {
          const T xx = roi_start_w + pw * bin_size_w +
                       static_cast<T>(ix + .5f) * bin_size_w /
                           static_cast<T>(roi_bin_grid_w);
          T x = xx;
          T y = yy;
          // deal with: inverse elements are out of feature map boundary
          if (y < -1.0 || y > height || x < -1.0 || x > width) {
            // empty: zero weights make this sample a no-op in the sum
            PreCalc<T> pc;
            pc.pos1 = 0;
            pc.pos2 = 0;
            pc.pos3 = 0;
            pc.pos4 = 0;
            pc.w1 = 0;
            pc.w2 = 0;
            pc.w3 = 0;
            pc.w4 = 0;
            pre_calc[pre_calc_index] = pc;
            pre_calc_index += 1;
            continue;
          }
          // Clamp slightly-negative coordinates onto the feature map.
          if (y <= 0) {
            y = 0;
          }
          if (x <= 0) {
            x = 0;
          }
          int y_low = static_cast<int>(y);
          int x_low = static_cast<int>(x);
          int y_high;
          int x_high;
          // On the last row/column collapse both neighbours onto the edge.
          if (y_low >= height - 1) {
            y_high = y_low = height - 1;
            y = static_cast<T>(y_low);
          } else {
            y_high = y_low + 1;
          }
          if (x_low >= width - 1) {
            x_high = x_low = width - 1;
            x = static_cast<T>(x_low);
          } else {
            x_high = x_low + 1;
          }
          // Fractional distances to the low neighbours and their complements.
          T ly = y - y_low;
          T lx = x - x_low;
          T hy = 1. - ly, hx = 1. - lx;
          T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
          // save weights and indeces
          PreCalc<T> pc;
          pc.pos1 = y_low * width + x_low;
          pc.pos2 = y_low * width + x_high;
          pc.pos3 = y_high * width + x_low;
          pc.pos4 = y_high * width + x_high;
          pc.w1 = w1;
          pc.w2 = w2;
          pc.w3 = w3;
          pc.w4 = w4;
          pre_calc[pre_calc_index] = pc;
          pre_calc_index += 1;
        }
      }
    }
  }
}
// CPU ROIAlign forward pass over a CHW feature map: every output bin is the
// average of bilinearly interpolated samples taken on a
// roi_bin_grid_h x roi_bin_grid_w grid inside the bin.
// nthreads is the total number of output elements, i.e.
// n_rois * channels * pooled_height * pooled_width.
// bottom_rois holds 4 floats per ROI (x1, y1, x2, y2 in input coordinates).
template <typename T>
void ROIAlignForward(const int nthreads, const T* bottom_data,
                     const T& spatial_scale, const int channels,
                     const int height, const int width,
                     const int pooled_height, const int pooled_width,
                     const int sampling_ratio, const T* bottom_rois,
                     T* top_data) {
  int n_rois = nthreads / channels / pooled_width / pooled_height;
  for (int n = 0; n < n_rois; n++) {
    int index_n = n * channels * pooled_width * pooled_height;
    // roi could have 4 or 5 columns
    const T* offset_bottom_rois = bottom_rois + n * 4;
    // Batch index is fixed to 0 here (4-column ROIs, single image).
    int roi_batch_ind = 0;
    // if (roi_cols == 5) {
    //   roi_batch_ind = offset_bottom_rois[0];
    //   offset_bottom_rois++;
    // }
    // Do not using rounding; this implementation detail is critical
    T roi_start_w = offset_bottom_rois[0] * spatial_scale;
    T roi_start_h = offset_bottom_rois[1] * spatial_scale;
    T roi_end_w = offset_bottom_rois[2] * spatial_scale;
    T roi_end_h = offset_bottom_rois[3] * spatial_scale;
    // T roi_start_w = round(offset_bottom_rois[0] * spatial_scale);
    // T roi_start_h = round(offset_bottom_rois[1] * spatial_scale);
    // T roi_end_w = round(offset_bottom_rois[2] * spatial_scale);
    // T roi_end_h = round(offset_bottom_rois[3] * spatial_scale);
    // Force malformed ROIs to be 1x1
    T roi_width = std::max(roi_end_w - roi_start_w, (T)1.);
    T roi_height = std::max(roi_end_h - roi_start_h, (T)1.);
    T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
    T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
    // We use roi_bin_grid to sample the grid and mimic integral
    int roi_bin_grid_h = (sampling_ratio > 0)
                             ? sampling_ratio
                             : ceil(roi_height / pooled_height);  // e.g., = 2
    int roi_bin_grid_w =
        (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);
    // We do average (integral) pooling inside a bin
    const T count = roi_bin_grid_h * roi_bin_grid_w;  // e.g. = 4
    // we want to precalculate indeces and weights shared by all chanels,
    // this is the key point of optimiation
    std::vector<PreCalc<T>> pre_calc(roi_bin_grid_h * roi_bin_grid_w *
                                     pooled_width * pooled_height);
    pre_calc_for_bilinear_interpolate(
        height, width, pooled_height, pooled_width, roi_bin_grid_h,
        roi_bin_grid_w, roi_start_h, roi_start_w, bin_size_h, bin_size_w,
        roi_bin_grid_h, roi_bin_grid_w, pre_calc);
    for (int c = 0; c < channels; c++) {
      int index_n_c = index_n + c * pooled_width * pooled_height;
      // Start of channel c of the ROI's source image (CHW layout).
      const T* offset_bottom_data =
          bottom_data + (roi_batch_ind * channels + c) * height * width;
      // Precomputed samples are consumed in the same (ph, pw, iy, ix)
      // order they were produced in.
      int pre_calc_index = 0;
      for (int ph = 0; ph < pooled_height; ph++) {
        for (int pw = 0; pw < pooled_width; pw++) {
          int index = index_n_c + ph * pooled_width + pw;
          T output_val = 0.;
          for (int iy = 0; iy < roi_bin_grid_h; iy++) {
            for (int ix = 0; ix < roi_bin_grid_w; ix++) {
              PreCalc<T> pc = pre_calc[pre_calc_index];
              output_val += pc.w1 * offset_bottom_data[pc.pos1] +
                            pc.w2 * offset_bottom_data[pc.pos2] +
                            pc.w3 * offset_bottom_data[pc.pos3] +
                            pc.w4 * offset_bottom_data[pc.pos4];
              pre_calc_index += 1;
            }
          }
          // Average over all samples of the bin.
          output_val /= count;
          top_data[index] = output_val;
        }  // for pw
      }    // for ph
    }      // for c
  }        // for n
}
// Runs ROIAlign pooling on the CPU over the FPGA-produced feature map:
// bypasses the FP16 FPGA output into the FP32 staging tensor, transposes
// HWC -> CHW, pools, and transposes the result back to HWC.
template <>
void RoiAlignPoolKernel<FPGA, float>::Compute(
    const RoiAlignPoolParam<FPGA>& param) {
  auto input_tensor = param.float_input.get();
  // Convert the FPGA FP16 output to FP32 and make it visible to the CPU.
  fpga::PerformBypass(param.input_arg);
  fpga::fpga_invalidate(input_tensor->data<float>(),
                        input_tensor->numel() * sizeof(float));

  auto* in = input_tensor;
  auto* rois = param.input_rois_;
  auto* out = param.output_;  // param.float_output.get();
  auto pooled_height = param.pooled_height_;
  auto pooled_width = param.pooled_width_;
  auto spatial_scale = param.spatial_scale_;
  auto sample_ratio = param.sampling_ratio_;

  auto in_dims = in->dims();
  int input_channels = in_dims[1];
  int height = in_dims[2];
  int width = in_dims[3];
  int rois_num = rois->dims()[0];

  // FPGA data is laid out HWC; ROIAlignForward expects CHW.
  auto data_nhwc = in->mutable_data<float>();
  fpga::image::convert_to_chw(&data_nhwc, input_channels, height, width);

  // One output entry per ROI; remaining dims keep the op's own extents.
  framework::DDim dims_out_new = framework::make_ddim(
      {rois_num, (param.output_)->dims()[1], (param.output_)->dims()[2],
       (param.output_)->dims()[3]});
  (param.output_)->Resize(dims_out_new);

  // Total number of output elements.
  const int index = input_channels * pooled_height * pooled_width * rois_num;
  auto rois_data = rois->data<float>();
  auto top_data = param.output_->mutable_data<float>();
  // ROIAlignForward iterates over all ROIs/channels/bins internally, so a
  // single call fills the whole output. (The previous code invoked it in a
  // `for (i < index)` loop, recomputing the identical result `index` times.)
  ROIAlignForward<float>(index, data_nhwc, spatial_scale, input_channels,
                         height, width, pooled_height, pooled_width,
                         sample_ratio, rois_data, top_data);

  // Convert the pooled result back to HWC for downstream FPGA consumers.
  fpga::image::convert_to_hwc(&top_data, input_channels, pooled_height,
                              pooled_width, rois_num);
  out->reset_data_ptr(top_data);
}
}
// namespace operators
}
// namespace paddle_mobile
#endif // ROIALIGN_POOL_OP
src/operators/kernel/fpga/V1/softmax_kernel.cpp
浏览文件 @
80dd5953
...
...
@@ -105,6 +105,8 @@ void SoftmaxKernel<FPGA, float>::Compute(const SoftmaxParam<FPGA> ¶m) {
}
else
{
if
(
param
.
FpgaArgs
().
output
.
activation
.
activation_type
!=
fpga
::
SOFTMAX
)
{
Tensor
*
out
=
param
.
Out
();
out
->
Resize
(
{
in_x
->
dims
()[
0
],
out
->
dims
()[
1
],
out
->
dims
()[
2
],
out
->
dims
()[
3
]});
math
::
SoftmaxFuntor
<
CPU
,
float
>
()(
in_x
,
out
);
}
}
...
...
src/operators/kernel/fpga/V1/transpose2_kernel.cpp
浏览文件 @
80dd5953
...
...
@@ -42,6 +42,11 @@ template <>
// FPGA transpose2: performs no data movement — the generic CPU transpose
// (Transpose2Compute) is commented out; only the output shape is rewritten
// so the batch dim follows the input while the remaining dims keep the
// output's own extents.
template <>
void Transpose2Kernel<FPGA, float>::Compute(
    const Transpose2Param<FPGA> &param) {
  // Transpose2Compute<float>(param);
  auto input = param.InputX();
  auto output = param.Out();
  // NOTE(review): assumes the FPGA data layout already matches the
  // transposed order, so a shape fix-up suffices — confirm against callers.
  output->Resize({input->dims()[0], output->dims()[1], output->dims()[2],
                  output->dims()[3]});
}
}
// namespace operators
...
...
tools/op.cmake
浏览文件 @
80dd5953
...
...
@@ -128,6 +128,7 @@ if (CON GREATER -1)
set
(
FUSION_CONVADDBN_OP ON
)
set
(
RESHAPE2_OP ON
)
set
(
PSROI_POOL_OP ON
)
set
(
ROIALIGN_POOL_OP ON
)
set
(
PROPOSAL_OP ON
)
set
(
ANCHOR_GENERATOR_OP ON
)
set
(
SLICE_OP ON
)
...
...
@@ -603,6 +604,9 @@ endif()
if
(
PSROI_POOL_OP
)
add_definitions
(
-DPSROI_POOL_OP
)
endif
()
if
(
ROIALIGN_POOL_OP
)
add_definitions
(
-DROIALIGN_POOL_OP
)
endif
()
if
(
ROI_PERSPECTIVE_OP
)
add_definitions
(
-DROI_PERSPECTIVE_OP
)
endif
()
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录