PaddlePaddle / Paddle-Lite — commit c38dbd3c
Commit c38dbd3c (unverified)
Authored on Aug 01, 2018 by Ruilong Liu; committed by GitHub on Aug 01, 2018.

    Merge branch 'develop' into develop

Parents: 34196979, 6bfdeedc

Showing 35 changed files with 817 additions and 307 deletions (+817 −307).
Changed files:

- README.md (+1 −1)
- src/common/types.cpp (+32 −32)
- src/common/types.h (+41 −35)
- src/fpga/api/fpga_api.cpp (+11 −9)
- src/fpga/api/fpga_api.h (+82 −19)
- src/framework/program/program-optimize/fusion_op_register.h (+3 −1)
- src/framework/tensor.h (+5 −3)
- src/io/executor.cpp (+2 −2)
- src/memory/t_malloc.cpp (+3 −3)
- src/operators/concat_op.cpp (+4 −1)
- src/operators/concat_op.h (+1 −0)
- src/operators/feed_op.h (+1 −1)
- src/operators/kernel/arm/dropout_kernel.cpp (+0 −2)
- src/operators/kernel/dropout_kernel.h (+1 −1)
- src/operators/kernel/fpga/concat_kernel.cpp (+55 −0)
- src/operators/kernel/im2sequence_kernel.h (+1 −3)
- src/operators/kernel/mali/fushion_fc_kernel.cpp (+0 −2)
- src/operators/kernel/prelu_kernel.h (+1 −1)
- src/operators/kernel/scale_kernel.h (+1 −1)
- src/operators/kernel/slice_kernel.h (+1 −1)
- src/operators/math/gemm.cpp (+294 −24)
- src/operators/math/gemm.h (+8 −5)
- src/operators/op_param.h (+59 −80)
- src/operators/resize_op.h (+1 −1)
- test/CMakeLists.txt (+4 −0)
- test/common/test_gemm.cpp (+84 −30)
- test/executor_for_test.h (+14 −8)
- test/framework/test_load.cpp (+5 −5)
- test/net/test_mobilenet+ssd.cpp (+6 −8)
- test/net/test_mobilenet.cpp (+14 −12)
- test/operators/test_fusion_conv_add_bn_relu_op.cpp (+62 −0)
- test/test_helper.h (+15 −11)
- tools/android-cmake/android.toolchain.cmake (+3 −1)
- tools/build.sh (+2 −2)
- tools/op.cmake (+0 −2)
README.md (+1 −1)

Hunk @@ -27,7 +27,7 @@, in the **ARM CPU** section (the file introduces Paddle-Mobile as the PaddlePaddle project dedicated to embedded platforms):

**ARM CPU**

The ARM CPU is paddle-mobile's primary target, and the CPU's generality has always been its strength. Embedded deep learning needs a large amount of CPU assembly; we are coding intensively to exploit every bit of acceleration the hardware offers.

ARM CPU optimization is still in progress, and only conventional CPU optimizations are applied so far. On an ARM A73, paddle-mobile arm-v7 currently runs a single-core pass of MobileNet 1.0 in 110+ ms. That is clearly not the final goal; large parts are being rewritten in assembly, leaving substantial headroom. Only armv7 is supported at the moment; armv8 support will follow.
src/common/types.cpp (+32 −32)

```diff
@@ -17,39 +17,39 @@ limitations under the License. */
 namespace paddle_mobile {

-const std::string G_OP_TYPE_CONV = "conv2d";
-const std::string G_OP_TYPE_BATCHNORM = "batch_norm";
-const std::string G_OP_TYPE_BOX_CODER = "box_coder";
-const std::string G_OP_TYPE_CONCAT = "concat";
-const std::string G_OP_TYPE_ELEMENTWISE_ADD = "elementwise_add";
-const std::string G_OP_TYPE_FUSION_CONV_ADD_RELU = "fusion_conv_add_relu";
-const std::string G_OP_TYPE_FUSION_CONV_ADD_BN_RELU = "fusion_conv_add_bn_relu";
-const std::string G_OP_TYPE_FUSION_DWCONV_BN_RELU = "fusion_dwconv_bn_relu";
-const std::string G_OP_TYPE_FUSION_CONV_BN_RELU = "fusion_conv_bn_relu";
-const std::string G_OP_TYPE_FC = "fusion_fc";
-const std::string G_OP_TYPE_FUSION_CONV_ADD = "fusion_conv_add";
-const std::string G_OP_TYPE_LRN = "lrn";
-const std::string G_OP_TYPE_MUL = "mul";
-const std::string G_OP_TYPE_MULTICLASS_NMS = "multiclass_nms";
-const std::string G_OP_TYPE_POOL2D = "pool2d";
-const std::string G_OP_TYPE_PRIOR_BOX = "prior_box";
-const std::string G_OP_TYPE_RELU = "relu";
-const std::string G_OP_TYPE_RESHAPE = "reshape";
-const std::string G_OP_TYPE_SIGMOID = "sigmoid";
-const std::string G_OP_TYPE_SOFTMAX = "softmax";
-const std::string G_OP_TYPE_TRANSPOSE = "transpose";
-const std::string G_OP_TYPE_SPLIT = "split";
-const std::string G_OP_TYPE_FEED = "feed";
-const std::string G_OP_TYPE_FETCH = "fetch";
-const std::string G_OP_TYPE_DEPTHWISE_CONV = "depthwise_conv2d";
-const std::string G_OP_TYPE_IM2SEQUENCE = "im2sequence";
-const std::string G_OP_TYPE_DROPOUT = "dropout";
-const std::string G_OP_TYPE_FUSION_CONV_ADD_BN = "fusion_conv_add_bn";
-const std::string G_OP_TYPE_FUSION_POOL_BN = "fusion_pool_bn";
-const std::string G_OP_TYPE_FUSION_ELEMENTWISE_ADD_RELU =
-    "fusion_elementwise_add_relu";
-const std::string G_OP_TYPE_FUSION_FC_RELU = "fusion_fc_relu";
-const std::string G_OP_TYPE_REGION = "region";
+const char *G_OP_TYPE_CONV = "conv2d";
+const char *G_OP_TYPE_BATCHNORM = "batch_norm";
+const char *G_OP_TYPE_BOX_CODER = "box_coder";
+const char *G_OP_TYPE_CONCAT = "concat";
+const char *G_OP_TYPE_ELEMENTWISE_ADD = "elementwise_add";
+const char *G_OP_TYPE_FUSION_CONV_ADD_RELU = "fusion_conv_add_relu";
+const char *G_OP_TYPE_FUSION_CONV_ADD_BN_RELU = "fusion_conv_add_bn_relu";
+const char *G_OP_TYPE_FUSION_DWCONV_BN_RELU = "fusion_dwconv_bn_relu";
+const char *G_OP_TYPE_FUSION_CONV_BN_RELU = "fusion_conv_bn_relu";
+const char *G_OP_TYPE_FC = "fusion_fc";
+const char *G_OP_TYPE_FUSION_CONV_ADD = "fusion_conv_add";
+const char *G_OP_TYPE_LRN = "lrn";
+const char *G_OP_TYPE_MUL = "mul";
+const char *G_OP_TYPE_MULTICLASS_NMS = "multiclass_nms";
+const char *G_OP_TYPE_POOL2D = "pool2d";
+const char *G_OP_TYPE_PRIOR_BOX = "prior_box";
+const char *G_OP_TYPE_RELU = "relu";
+const char *G_OP_TYPE_RESHAPE = "reshape";
+const char *G_OP_TYPE_SIGMOID = "sigmoid";
+const char *G_OP_TYPE_SOFTMAX = "softmax";
+const char *G_OP_TYPE_TRANSPOSE = "transpose";
+const char *G_OP_TYPE_SPLIT = "split";
+const char *G_OP_TYPE_FEED = "feed";
+const char *G_OP_TYPE_FETCH = "fetch";
+const char *G_OP_TYPE_DEPTHWISE_CONV = "depthwise_conv2d";
+const char *G_OP_TYPE_IM2SEQUENCE = "im2sequence";
+const char *G_OP_TYPE_DROPOUT = "dropout";
+const char *G_OP_TYPE_FUSION_CONV_ADD_BN = "fusion_conv_add_bn";
+const char *G_OP_TYPE_FUSION_POOL_BN = "fusion_pool_bn";
+const char *G_OP_TYPE_FUSION_ELEMENTWISE_ADD_RELU =
+    "fusion_elementwise_add_relu";
+const char *G_OP_TYPE_FUSION_FC_RELU = "fusion_fc_relu";
+const char *G_OP_TYPE_REGION = "region";

 std::unordered_map<
     std::string, std::pair<std::vector<std::string>, std::vector<std::string>>>
```
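A plausible reading of the std::string → const char* switch (the commit itself does not state a motivation): a global std::string is constructed by a dynamic initializer at program start, which costs startup time and is fragile across translation units, while a const char* bound to a string literal is plain read-only data. A minimal sketch, with illustrative names that are not from the commit:

```cpp
#include <string>

// Illustrative only; not code from the commit.
const std::string kOpNameString = "conv2d";  // std::string ctor runs at startup
const char *kOpNameLiteral = "conv2d";       // no initializer; lives in .rodata
```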
src/common/types.h (+41 −35)

```diff
@@ -20,7 +20,9 @@ limitations under the License. */
 #include <vector>

 namespace paddle_mobile {
-enum class Precision : int { FP32 = 0 };
+enum class Precision : int { FP32 = 0, FP16 = 1 };
+
+typedef int16_t half;

 template <Precision p>
 struct PrecisionTrait {
@@ -31,6 +33,10 @@ template <>
 struct PrecisionTrait<Precision::FP32> {
   typedef float ptype;
 };
+template <>
+struct PrecisionTrait<Precision::FP16> {
+  typedef half ptype;
+};

 //! device type
 enum DeviceTypeEnum { kINVALID = -1, kCPU = 0, kFPGA = 1, kGPU_MALI = 2 };
@@ -73,40 +79,40 @@ enum PMStatus {
   PMWrongDevice = 0x08 /*!< un-correct device. */
 };

-extern const std::string G_OP_TYPE_CONV;
-extern const std::string G_OP_TYPE_BATCHNORM;
-extern const std::string G_OP_TYPE_BOX_CODER;
-extern const std::string G_OP_TYPE_CONCAT;
-extern const std::string G_OP_TYPE_ELEMENTWISE_ADD;
-extern const std::string G_OP_TYPE_FUSION_CONV_ADD_RELU;
-extern const std::string G_OP_TYPE_FC;
-extern const std::string G_OP_TYPE_FUSION_CONV_ADD;
-extern const std::string G_OP_TYPE_FUSION_CONV_ADD_BN_RELU;
-extern const std::string G_OP_TYPE_FUSION_DWCONV_BN_RELU;
-extern const std::string G_OP_TYPE_FUSION_CONV_BN_RELU;
-extern const std::string G_OP_TYPE_LRN;
-extern const std::string G_OP_TYPE_MUL;
-extern const std::string G_OP_TYPE_MULTICLASS_NMS;
-extern const std::string G_OP_TYPE_POOL2D;
-extern const std::string G_OP_TYPE_PRIOR_BOX;
-extern const std::string G_OP_TYPE_RELU;
-extern const std::string G_OP_TYPE_RESHAPE;
-extern const std::string G_OP_TYPE_SIGMOID;
-extern const std::string G_OP_TYPE_SOFTMAX;
-extern const std::string G_OP_TYPE_TRANSPOSE;
-extern const std::string G_OP_TYPE_SPLIT;
-extern const std::string G_OP_TYPE_FEED;
-extern const std::string G_OP_TYPE_FETCH;
-extern const std::string G_OP_TYPE_DEPTHWISE_CONV;
-extern const std::string G_OP_TYPE_IM2SEQUENCE;
-extern const std::string G_OP_TYPE_DROPOUT;
-extern const std::string G_OP_TYPE_FUSION_CONV_ADD_BN;
-extern const std::string G_OP_TYPE_FUSION_POOL_BN;
-extern const std::string G_OP_TYPE_FUSION_ELEMENTWISE_ADD_RELU;
-extern const std::string G_OP_TYPE_FUSION_FC_RELU;
-extern const std::string G_OP_TYPE_REGION;
+extern const char *G_OP_TYPE_CONV;
+extern const char *G_OP_TYPE_BATCHNORM;
+extern const char *G_OP_TYPE_BOX_CODER;
+extern const char *G_OP_TYPE_CONCAT;
+extern const char *G_OP_TYPE_ELEMENTWISE_ADD;
+extern const char *G_OP_TYPE_FUSION_CONV_ADD_RELU;
+extern const char *G_OP_TYPE_FC;
+extern const char *G_OP_TYPE_FUSION_CONV_ADD;
+extern const char *G_OP_TYPE_FUSION_CONV_ADD_BN_RELU;
+extern const char *G_OP_TYPE_FUSION_DWCONV_BN_RELU;
+extern const char *G_OP_TYPE_FUSION_CONV_BN_RELU;
+extern const char *G_OP_TYPE_LRN;
+extern const char *G_OP_TYPE_MUL;
+extern const char *G_OP_TYPE_MULTICLASS_NMS;
+extern const char *G_OP_TYPE_POOL2D;
+extern const char *G_OP_TYPE_PRIOR_BOX;
+extern const char *G_OP_TYPE_RELU;
+extern const char *G_OP_TYPE_RESHAPE;
+extern const char *G_OP_TYPE_SIGMOID;
+extern const char *G_OP_TYPE_SOFTMAX;
+extern const char *G_OP_TYPE_TRANSPOSE;
+extern const char *G_OP_TYPE_SPLIT;
+extern const char *G_OP_TYPE_FEED;
+extern const char *G_OP_TYPE_FETCH;
+extern const char *G_OP_TYPE_DEPTHWISE_CONV;
+extern const char *G_OP_TYPE_IM2SEQUENCE;
+extern const char *G_OP_TYPE_DROPOUT;
+extern const char *G_OP_TYPE_FUSION_CONV_ADD_BN;
+extern const char *G_OP_TYPE_FUSION_POOL_BN;
+extern const char *G_OP_TYPE_FUSION_ELEMENTWISE_ADD_RELU;
+extern const char *G_OP_TYPE_FUSION_FC_RELU;
+extern const char *G_OP_TYPE_REGION;

 extern std::unordered_map<
     std::string, std::pair<std::vector<std::string>, std::vector<std::string>>>
```
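The PrecisionTrait pattern maps an enum value to a storage type at compile time, so code templated on Precision can declare buffers of the right element type; the new FP16 specialization stores half-precision values in int16_t. A self-contained sketch of the pattern as it appears in the diff; the fill_zero consumer is an assumed usage, not code from the commit:

```cpp
#include <cstddef>
#include <cstdint>
#include <type_traits>

typedef int16_t half;  // as in the diff: FP16 values use 16-bit storage

enum class Precision : int { FP32 = 0, FP16 = 1 };

template <Precision p>
struct PrecisionTrait {};
template <>
struct PrecisionTrait<Precision::FP32> {
  typedef float ptype;
};
template <>
struct PrecisionTrait<Precision::FP16> {
  typedef half ptype;
};

// Assumed consumer: picks its element type from the trait.
template <Precision p>
void fill_zero(typename PrecisionTrait<p>::ptype *dst, size_t n) {
  for (size_t i = 0; i < n; ++i) dst[i] = 0;
}

static_assert(
    std::is_same<PrecisionTrait<Precision::FP16>::ptype, int16_t>::value,
    "FP16 tensors are stored as int16_t");
```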
src/fpga/api/fpga_api.cpp (+11 −9)

```diff
@@ -29,15 +29,15 @@ limitations under the License. */
 #include "fpga/api/fpga_api.h"

-namespace paddle {
-namespace mobile {
+namespace paddle_mobile {
 namespace fpga {
-namespace api {

 static int fd = -1;
 static const char *device_path = "/dev/fpgadrv0";

-static inline int do_ioctl(int req, void *arg) { return ioctl(req, arg); }
+static inline int do_ioctl(int req, void *arg) {
+  return ioctl(req, (long unsigned int)arg);
+}

 int open_device() {
   if (fd == -1) {
@@ -48,8 +48,8 @@ int open_device() {
 // memory management;
 void *fpga_malloc(size_t size) {
-  return reinterpret_cast<(void *)> mmap64(NULL, size, PROT_READ | PROT_WRITE,
-                                           MAP_SHARED, fd, 0);
+  return reinterpret_cast<void *>(
+      mmap64(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0));
 }

 void fpga_free(void *ptr) { munmap(ptr, 0); }
@@ -58,7 +58,9 @@ void fpga_copy(void *dest, const void *src, size_t num) {
   memcpy(dest, src, num);
 }
-}  // namespace api
+
+int ComputeFpgaConv(struct FpgaConvArgs args) {}
+int ComputeFpgaPool(struct FpgaPoolArgs args) {}
+int ComputeFpgaEWAdd(struct FpgaEWAddArgs args) {}

 }  // namespace fpga
-}  // namespace mobile
-}  // namespace paddle
+}  // namespace paddle_mobile
```
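fpga_malloc maps device memory with mmap64 rather than allocating heap memory, so the buffer is shared with the accelerator through the /dev/fpgadrv0 descriptor. Below is a standalone sketch of that pattern with error handling the original does not have; note that the original fpga_free passes 0 as the length to munmap, whereas a length-tracking variant looks like this (names illustrative):

```cpp
#include <sys/mman.h>
#include <cstddef>

// Illustrative names; assumes fd is the opened device descriptor.
void *device_alloc(int fd, size_t size) {
  void *p = mmap64(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
  return (p == MAP_FAILED) ? nullptr : p;
}

void device_free(void *ptr, size_t size) {
  if (ptr) munmap(ptr, size);  // unmap the full mapping length
}
```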
src/fpga/api/fpga_api.h (+82 −19)

```diff
@@ -14,44 +14,107 @@ limitations under the License. */
 #pragma once

+#include <stdint.h>
 #include <cstddef>
 #include <iostream>
 #include <limits>

 // memory management;

-namespace paddle {
-namespace mobile {
+namespace paddle_mobile {
 namespace fpga {
-namespace api {

 int open_device();
 int close_device();

 void *fpga_malloc(size_t size);
 void fpga_free(void *ptr);
 void fpga_copy(void *dst, const void *src, size_t num);

-struct CnnVersionArgs {
+struct FpgaVersionArgs {
   void *buf;
 };

-struct QuantArgs {
+struct MemoryToPhysicalArgs {
+  const void *src;
+  uint64_t physical;
+};
+
+struct MemoryCopyArgs {
+  void *src;
+  void *dst;
+  size_t size;
+};
+
+struct FpgaQuantArgs {
   float scale;
 };

-struct BatchNormalizationArgs {
-  bool enable;
+struct FpgaBNArgs {
+  bool enabled = false;
+  void *bias_addr;
+  void *scale_addr;
+};
+
+struct FpgaKernelArgs {
+  uint32_t width;
+  uint32_t height;
+  uint32_t stride_h;
+  uint32_t stride_w;
+};
+
+struct FpgaImageArgs {
+  uint32_t width;
+  uint32_t height;
+  uint32_t channels;
+  uint32_t pad_h;
+  uint32_t pad_w;
+};
+
+struct FpgaConvArgs {
+  bool relu_enabled;
+  struct FpgaBNArgs BNargs;
+  void *image_addr;
+  void *filter_addr;
+  void *bias_addr;
+  void *output_addr;
+  float quant_scale;
+  struct FpgaImageArgs image;
+  uint32_t filter_num;
+  uint32_t group_num;
+  struct FpgaKernelArgs kernel;
+};
+
+struct FpgaPoolArgs {
+  void *image_addr;
+  void *output_addr;
+  struct FpgaImageArgs image;
+  struct FpgaKernelArgs kernel;
+};
+
+struct FpgaEWAddArgs {
+  bool relu_enabled;
+  void *image0_addr;
+  void *image1_addr;
+  void *result_addr;
+  uint32_t const0;
+  uint32_t const1;
+  uint32_t data_len;  // aligned element count
 };

+struct ScaleArgs {};
+
+int ComputeFpgaConv(struct FpgaConvArgs args);
+int ComputeFpgaPool(struct FpgaPoolArgs args);
+int ComputeFpgaEWAdd(struct FpgaEWAddArgs args);
+
-#define IOCTL_CNN_MAGIC 'CNN'
-#define IOCTL_VERSION _IOW(IOCTL_CNN_MAGIC, 1, struct CnnVersionArgs)
-#define IOCTL_GET_QUANT _IOW(IOCTL_CNN_MAGIC, 2, struct QuantArgs)
-#define IOCTL_SET_QUANT _IOW(IOCTL_CNN_MAGIC, 3, struct QuantArgs)
+#define IOCTL_FPGA_MAGIC 'CNN'
+#define IOCTL_VERSION _IOW(IOCTL_FPGA_MAGIC, 1, struct FpgaVersionArgs)
+#define IOCTL_GET_QUANT _IOW(IOCTL_FPGA_MAGIC, 2, struct FpgaQuantArgs)
+#define IOCTL_SET_QUANT _IOW(IOCTL_FPGA_MAGIC, 3, struct FpgaQuantArgs)
+#define IOCTL_MEM_COPY _IOW(IOCTL_FPGA_MAGIC, 11, struct MemoryCopyArgs)
+#define IOCTL_CONFIG_CONV _IOW(IOCTL_FPGA_MAGIC, 21, struct FpgaConvArgs)
+#define IOCTL_CONFIG_POOLING _IOW(IOCTL_FPGA_MAGIC, 22, struct FpgaPoolArgs)
+#define IOCTL_CONFIG_EW _IOW(IOCTL_FPGA_MAGIC, 23, struct FpgaEWAddArgs)

-}  // namespace api
 }  // namespace fpga
-}  // namespace mobile
-}  // namespace paddle
+}  // namespace paddle_mobile
```
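The _IOW macros pack a direction, the magic value, a command number, and the argument struct's size into a single ioctl request code; user space passes the struct by pointer. A hedged sketch of a call site, assuming the header above is included and an fd obtained from open_device; the wrapper name and error handling are illustrative, not from the commit:

```cpp
#include <sys/ioctl.h>
#include "fpga/api/fpga_api.h"

// Illustrative call site (not from the commit): submit one convolution
// descriptor to the driver.
int config_conv(int fd, paddle_mobile::fpga::FpgaConvArgs *args) {
  // IOCTL_CONFIG_CONV == _IOW(IOCTL_FPGA_MAGIC, 21, struct FpgaConvArgs)
  return ioctl(fd, IOCTL_CONFIG_CONV, args);
}
```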
src/framework/program/program-optimize/fusion_op_register.h (+3 −1)

```diff
@@ -14,11 +14,13 @@ limitations under the License. */
 #pragma once

+#include <algorithm>
 #include <map>
 #include <string>
+#include <vector>

 #include "framework/operator.h"
-#include "node.h"
+#include "framework/program/program-optimize/node.h"

 namespace paddle_mobile {
 namespace framework {
```
src/framework/tensor.h (+5 −3)

```diff
@@ -16,14 +16,15 @@ limitations under the License. */
 #include <cstdint>
 #include <cstring>
+#include <fstream>
 #include <memory>
+#include <string>
 #include <type_traits>
 #include <typeindex>
 #include <vector>

-#include <fstream>
 #include "common/enforce.h"
+#include "common/types.h"
 #include "framework/data_layout.h"
 #include "framework/ddim.h"
 #include "memory/t_malloc.h"
@@ -63,7 +64,8 @@ struct SizeOfTypeFunctor<HEAD, TAIL...> {
 };

 static inline size_t SizeOfType(std::type_index type) {
-  SizeOfTypeFunctor<int, float, double, int16_t, int64_t, bool, size_t> functor;
+  SizeOfTypeFunctor<int, half, float, double, int16_t, int64_t, bool, size_t>
+      functor;
   size_t size = functor(type);
   PADDLE_MOBILE_ENFORCE(size != 0UL, "Cannot get size of type %s", type.name());
```
src/io/executor.cpp (+2 −2)

```diff
@@ -187,7 +187,7 @@ void Executor<Dtype, P>::LoadMemory(const framework::VarDesc var_desc,
   memcpy(&max_value, *data + sizeof(float), sizeof(float));
   *data += 2 * sizeof(float);
   const float factor = (max_value - min_value) / 255.0;
-  uint8_t *uint8_data = (uint8_t *)(*data);
+  uint8_t *uint8_data = reinterpret_cast<uint8_t *>(*data);
   for (int k = 0; k < memory_size; ++k) {
     static_cast<float *>(memory)[k] = uint8_data[k] * factor + min_value;
   }
@@ -419,7 +419,7 @@ std::vector<typename Executor<Dtype, P>::Ptype> Executor<Dtype, P>::Predict(
 }

 template class Executor<CPU, Precision::FP32>;
-template class Executor<FPGA, Precision::FP32>;
 template class Executor<GPU_MALI, Precision::FP32>;
+template class Executor<FPGA, Precision::FP32>;

 }  // namespace paddle_mobile
```
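For context, the hunk above sits in the quantized-weight loading path: two floats (min and max) precede the uint8 payload, and each byte is mapped back to float linearly. A minimal standalone sketch of that dequantization, with illustrative names:

```cpp
#include <cstdint>
#include <vector>

std::vector<float> dequantize_u8(const uint8_t *q, size_t n, float min_value,
                                 float max_value) {
  const float factor = (max_value - min_value) / 255.0f;
  std::vector<float> out(n);
  for (size_t i = 0; i < n; ++i) {
    out[i] = q[i] * factor + min_value;  // byte 0 -> min, byte 255 -> max
  }
  return out;
}
```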
src/memory/t_malloc.cpp (+3 −3)

```diff
@@ -27,17 +27,17 @@ namespace memory {
 const int MALLOC_ALIGN = 64;

 #ifdef PADDLE_MOBILE_FPGA
-namespace api = paddle::mobile::fpga::api;
+namespace fpga = paddle_mobile::fpga;

 void Copy(void *dst, const void *src, size_t num) {
   std::memcpy(dst, src, num);
 }

-void *Alloc(size_t size) { return api::malloc(size); }
+void *Alloc(size_t size) { return fpga::fpga_malloc(size); }

 void Free(void *ptr) {
   if (ptr) {
-    api::fpga_free(ptr);
+    fpga::fpga_free(ptr);
   }
 }
```
src/operators/concat_op.cpp (+4 −1)

```diff
@@ -14,7 +14,9 @@ limitations under the License. */
 #ifdef CONCAT_OP

-#include "concat_op.h"
+#include <vector>
+
+#include "operators/concat_op.h"

 namespace paddle_mobile {
 namespace operators {
@@ -68,6 +70,7 @@ REGISTER_OPERATOR_CPU(concat, ops::ConcatOp);
 REGISTER_OPERATOR_MALI_GPU(concat, ops::ConcatOp);
 #endif
 #ifdef PADDLE_MOBILE_FPGA
+REGISTER_OPERATOR_FPGA(concat, ops::ConcatOp);
 #endif

 #endif
```
src/operators/concat_op.h (+1 −0)

```diff
@@ -53,6 +53,7 @@ USE_OP_CPU(concat);
 USE_OP_MALI_GPU(concat);
 #endif
 #ifdef PADDLE_MOBILE_FPGA
+USE_OP_FPGA(concat);
 #endif

 #endif
```
src/operators/feed_op.h (+1 −1)

```diff
@@ -29,7 +29,7 @@ class FeedOp : public framework::OperatorBase<DeviceType> {
          std::shared_ptr<framework::Scope> scope)
       : framework::OperatorBase<DeviceType>(type, inputs, outputs, attrs,
                                             scope),
-        param_(inputs, outputs, attrs, *scope) {}
+        param_(inputs, outputs, attrs, scope.get()) {}
   void RunImpl() const { param_.Out()->ShareDataWith(*param_.InputX()); }

   void Init() {}
```
src/operators/kernel/arm/dropout_kernel.cpp (+0 −2)

```diff
@@ -14,8 +14,6 @@ limitations under the License. */
 #ifdef DROPOUT_OP

-#pragma once
-
 #include "operators/kernel/dropout_kernel.h"
 #include <operators/math/transform.h>
```
src/operators/kernel/dropout_kernel.h (+1 −1)

```diff
@@ -17,7 +17,7 @@ limitations under the License. */
 #include "framework/operator.h"
 #include "operators/op_param.h"

-#pragma once;
+#pragma once

 namespace paddle_mobile {
 namespace operators {
```
src/operators/kernel/fpga/concat_kernel.cpp (new file, +55)

```cpp
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#ifdef CONCAT_OP

#include "operators/kernel/concat_kernel.h"

namespace paddle_mobile {
namespace operators {

template <>
bool ConcatKernel<FPGA, half>::Init(ConcatParam *param) {
  return true;
}

template <>
void ConcatKernel<FPGA, half>::Compute(const ConcatParam &param) const {
  auto inputs = param.Inputs();
  auto *out = param.Out();
  int64_t axis = param.Axis();
  out->mutable_data<half>();

  DDim out_dim = out->dims();
  int pixels = out_dim[1] * out_dim[2];
  auto out_channel = out_dim[3];

  auto out_offset = 0;
  for (int i = 0; i < inputs.size(); ++i) {
    auto input = inputs[i];
    auto channels = input[3];
    out_offset += channels;
    auto src = input->data<half>();
    for (int j = 0; j < pixels; ++j) {
      auto dst = out->data<half>() + out_offset;
      memory::Copy(dst, src, sizeof(half));
    }
  }
}

}  // namespace operators
}  // namespace paddle_mobile

#endif
```
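For reference, concatenating NHWC tensors along the channel axis interleaves each input's channel slice at every spatial position. The sketch below spells out those semantics on the CPU; the layout assumptions (out_dim[1]*out_dim[2] as the pixel count, out_dim[3] as channels) follow the kernel above, but the code is illustrative rather than from the commit:

```cpp
#include <cstring>
#include <vector>

template <typename T>
void concat_channels_nhwc(const std::vector<const T *> &srcs,
                          const std::vector<int> &channels, int pixels,
                          T *dst) {
  int out_channel = 0;
  for (int ch : channels) out_channel += ch;
  for (int j = 0; j < pixels; ++j) {  // every spatial position
    int offset = 0;
    for (size_t i = 0; i < srcs.size(); ++i) {
      std::memcpy(dst + j * out_channel + offset, srcs[i] + j * channels[i],
                  channels[i] * sizeof(T));
      offset += channels[i];
    }
  }
}
```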
src/operators/kernel/im2sequence_kernel.h (+1 −3)

```diff
@@ -20,13 +20,11 @@ limitations under the License. */
 #include "operators/math/vol2col.h"
 #include "operators/op_param.h"

-#pragma once;
+#pragma once

 namespace paddle_mobile {
 namespace operators {

-using namespace framework;
-
 template <typename DeviceType, typename T>
 class Im2SequenceKernel
     : public framework::OpKernelBase<DeviceType, Im2SequenceParam> {
```
src/operators/kernel/mali/fushion_fc_kernel.cpp (+0 −2)

```diff
@@ -14,8 +14,6 @@ limitations under the License. */
 #ifdef FUSION_FC_OP

-#pragma once
-
 #include "operators/kernel/fusion_fc_kernel.h"

 namespace paddle_mobile {
```
src/operators/kernel/prelu_kernel.h (+1 −1)

```diff
@@ -15,7 +15,7 @@ limitations under the License. */
 #include "framework/operator.h"
 #include "operators/op_param.h"

-#pragma once;
+#pragma once

 namespace paddle_mobile {
 namespace operators {
```
src/operators/kernel/scale_kernel.h (+1 −1)

```diff
@@ -15,7 +15,7 @@ limitations under the License. */
 #include "framework/operator.h"
 #include "operators/op_param.h"

-#pragma once;
+#pragma once

 namespace paddle_mobile {
 namespace operators {
```
src/operators/kernel/slice_kernel.h (+1 −1)

```diff
@@ -15,7 +15,7 @@ limitations under the License. */
 #include "framework/operator.h"
 #include "operators/op_param.h"

-#pragma once;
+#pragma once

 namespace paddle_mobile {
 namespace operators {
```
src/operators/math/gemm.cpp (+294 −24)

```diff
@@ -92,8 +92,8 @@ void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb,
 */

 // 将A矩阵分块复制到连续内存(RowMajor)
-void PackMatrixA_(int m, int k, int m_tail, const float *A, int lda,
-                  float *buffer) {
+void PackMatrixA_4r(int m, int k, int m_tail, const float *A, int lda,
+                    float *buffer) {
   const float *a0, *a1, *a2, *a3;
   for (int i = 0; i < m - m_tail; i += MR) {
     a0 = A + i * lda;
@@ -131,9 +131,62 @@ void PackMatrixA_(int m, int k, int m_tail, const float *A, int lda,
   }
 }

+void PackMatrixA_6r(int m, int k, int m_tail, const float *A, int lda,
+                    float *buffer) {
+  const float *a0, *a1, *a2, *a3, *a4, *a5;
+  for (int i = 0; i < m - m_tail; i += MR) {
+    a0 = A + i * lda;
+    a1 = A + (i + 1) * lda;
+    a2 = A + (i + 2) * lda;
+    a3 = A + (i + 3) * lda;
+    a4 = A + (i + 4) * lda;
+    a5 = A + (i + 5) * lda;
+    for (int j = 0; j < k; ++j) {
+      *buffer++ = *a0++;
+      *buffer++ = *a1++;
+      *buffer++ = *a2++;
+      *buffer++ = *a3++;
+      *buffer++ = *a4++;
+      *buffer++ = *a5++;
+    }
+  }
+  int i = m - m_tail;
+  a0 = &A(i, 0);
+  a1 = a0 + lda;
+  a2 = a0 + 2 * lda;
+  a3 = a0 + 3 * lda;
+  a4 = a0 + 4 * lda;
+  a5 = a0 + 5 * lda;
+  if (m_tail != 0) {
+    if (m_tail <= 5) {
+      a5 = zero;
+    }
+    if (m_tail <= 4) {
+      a4 = zero;
+    }
+    if (m_tail <= 3) {
+      a3 = zero;
+    }
+    if (m_tail <= 2) {
+      a2 = zero;
+    }
+    if (m_tail <= 1) {
+      a1 = zero;
+    }
+    for (int j = 0; j < k; ++j) {
+      *buffer++ = *a0++;
+      *buffer++ = *a1++;
+      *buffer++ = *a2++;
+      *buffer++ = *a3++;
+      *buffer++ = *a4++;
+      *buffer++ = *a5++;
+    }
+  }
+}
+
 // 将B矩阵分块复制到连续内存(RowMajor)
-void PackMatrixB_(int k, int n, int n_tail, const float *B, int ldb,
-                  float *buffer) {
+void PackMatrixB_8c(int k, int n, int n_tail, const float *B, int ldb,
+                    float *buffer) {
   const float *b0;
   for (int j = 0; j < n - n_tail; j += NR) {
     for (int i = 0; i < k; ++i) {
@@ -188,7 +241,8 @@ void InnerKernel(int mc, int nc, float alpha, const float *a, const float *b,
   for (int j = 0; j < nc; j += NR) {
     for (int i = 0; i < mc; i += MR) {
       // AddDot4x4(KC, a + i * KC, b + j * KC, c + i * NC + j, NC);
-      AddDot4x8(KC, a + i * KC, b + j * KC, c + i * NC + j, NC);
+      // AddDot4x8(KC, a + i * KC, b + j * KC, c + i * NC + j, NC);
+      AddDot6x8(KC, a + i * KC, b + j * KC, c + i * NC + j, NC);
     }
   }
@@ -218,7 +272,8 @@ void InnerKernelWithBn(int mc, int nc, float alpha, const float *a,
   for (int j = 0; j < nc; j += NR) {
     for (int i = 0; i < mc; i += MR) {
       // AddDot4x4(KC, a + i * KC, b + j * KC, c + i * NC + j, NC);
-      AddDot4x8(KC, a + i * KC, b + j * KC, c + i * NC + j, NC);
+      // AddDot4x8(KC, a + i * KC, b + j * KC, c + i * NC + j, NC);
+      AddDot6x8(KC, a + i * KC, b + j * KC, c + i * NC + j, NC);
     }
   }
@@ -1868,22 +1923,22 @@ void Sgemm(int m, int n, int k, float alpha, const float *A, int lda,
            const float *B, int ldb, float beta, float *C, int ldc, bool relu) {
   // L1 data cache is 32 kib (Per Contex-A57, Contex-A72, Contex-A73)
   // L2 cache is 0.5~4 Mib (Contex-A72 cluster)
-  int L1 = 30 * 1024;
-  int L2 = 1 * 1024 * 1024;
+  int L1 = 32 * 1024;
+  int L2 = 0.5 * 1024 * 1024;

   KC = k;
-  MC = L2 / (2 * KC * sizeof(float));
-  NC = MC;
+  MC = L1 / (KC * sizeof(float));
+  NC = L2 / (KC * sizeof(float));

-  // make sure MC is multiple of 4, and NC is multiple of 8
+  // make sure MC is multiple of MR, and NC is multiple of NR
   int mblock_num = (m + MC - 1) / MC;
   MC = (m + mblock_num - 1) / mblock_num;
-  MC = (MC + 4 - 1) / 4 * 4;
+  MC = (MC + MR - 1) / MR * MR;
   // DLOG << "mblock_num = " << mblock_num << ", MC = " << MC << "\n";

   int nblock_num = (n + NC - 1) / NC;
   NC = (n + nblock_num - 1) / nblock_num;
-  NC = (NC + 8 - 1) / 8 * 8;
+  NC = (NC + NR - 1) / NR * NR;
   // DLOG << "nblock_num = " << nblock_num << ", NC = " << NC << "\n";

   packedA = static_cast<float *>(
@@ -1901,10 +1956,10 @@ void Sgemm(int m, int n, int k, float alpha, const float *A, int lda,
   int mc, nc;
   for (int j = 0; j < n; j += NC) {
     nc = s_min(n - j, NC);
-    PackMatrixB_(KC, nc, nc % NR, &B(0, j), ldb, packedB);
+    PackMatrixB_8c(KC, nc, nc % NR, &B(0, j), ldb, packedB);
     for (int i = 0; i < m; i += MC) {
       mc = s_min(m - i, MC);
-      PackMatrixA_(mc, KC, mc % MR, &A(i, 0), lda, packedA);
+      PackMatrixA_6r(mc, KC, mc % MR, &A(i, 0), lda, packedA);
       InnerKernel(mc, nc, alpha, packedA, packedB, beta, packedC, &C(i, j), ldc,
                   relu);
     }
@@ -1921,22 +1976,22 @@ void SgemmWithBn(int m, int n, int k, float alpha, const float *A, int lda,
                  bool relu, float *new_scale, float *new_bias) {
   // L1 data cache is 32 kib (Per Contex-A57, Contex-A72, Contex-A73)
   // L2 cache is 0.5~4 Mib (Contex-A72 cluster)
-  int L1 = 30 * 1024;
-  int L2 = 1 * 1024 * 1024;
+  int L1 = 32 * 1024;
+  int L2 = 0.5 * 1024 * 1024;

   KC = k;
-  MC = L2 / (2 * KC * sizeof(float));
-  NC = MC;
+  MC = L1 / (KC * sizeof(float));
+  NC = L2 / (KC * sizeof(float));

-  // make sure MC is multiple of 4, and NC is multiple of 8
+  // make sure MC is multiple of MR, and NC is multiple of NR
   int mblock_num = (m + MC - 1) / MC;
   MC = (m + mblock_num - 1) / mblock_num;
-  MC = (MC + 4 - 1) / 4 * 4;
+  MC = (MC + MR - 1) / MR * MR;
   // DLOG << "mblock_num = " << mblock_num << ", MC = " << MC << "\n";

   int nblock_num = (n + NC - 1) / NC;
   NC = (n + nblock_num - 1) / nblock_num;
-  NC = (NC + 8 - 1) / 8 * 8;
+  NC = (NC + NR - 1) / NR * NR;
   // DLOG << "nblock_num = " << nblock_num << ", NC = " << NC << "\n";

   packedA = static_cast<float *>(
@@ -1954,10 +2009,10 @@ void SgemmWithBn(int m, int n, int k, float alpha, const float *A, int lda,
   int mc, nc;
   for (int j = 0; j < n; j += NC) {
     nc = s_min(n - j, NC);
-    PackMatrixB_(KC, nc, nc % NR, &B(0, j), ldb, packedB);
+    PackMatrixB_8c(KC, nc, nc % NR, &B(0, j), ldb, packedB);
     for (int i = 0; i < m; i += MC) {
       mc = s_min(m - i, MC);
-      PackMatrixA_(mc, KC, mc % MR, &A(i, 0), lda, packedA);
+      PackMatrixA_6r(mc, KC, mc % MR, &A(i, 0), lda, packedA);
       InnerKernelWithBn(mc, nc, alpha, packedA, packedB, beta, packedC,
                         &C(i, j), ldc, relu, new_scale + i, new_bias + i);
     }
@@ -1969,6 +2024,221 @@ void SgemmWithBn(int m, int n, int k, float alpha, const float *A, int lda,
   paddle_mobile::memory::Free(zero);
 }

+void AddDot6x8(int k, const float *a, const float *b, float *c, int ldc) {
+#if __ARM_NEON
+#if __aarch64__
+  // init C
+  float32x4_t cv0 = vdupq_n_f32(0.0);
+  float32x4_t cv1 = vdupq_n_f32(0.0);
+  float32x4_t cv2 = vdupq_n_f32(0.0);
+  float32x4_t cv3 = vdupq_n_f32(0.0);
+  float32x4_t cv4 = vdupq_n_f32(0.0);
+  float32x4_t cv5 = vdupq_n_f32(0.0);
+  float32x4_t cv6 = vdupq_n_f32(0.0);
+  float32x4_t cv7 = vdupq_n_f32(0.0);
+  float32x4_t cv8 = vdupq_n_f32(0.0);
+  float32x4_t cv9 = vdupq_n_f32(0.0);
+  float32x4_t cv10 = vdupq_n_f32(0.0);
+  float32x4_t cv11 = vdupq_n_f32(0.0);
+
+  float32x4_t av;
+  float32x4_t bv0;
+  float32x4_t bv1;
+
+  float32x2_t av01;
+  float32x2_t av23;
+  float32x2_t av45;
+
+  for (int p = 0; p < k; p += 1) {
+    av = vld1q_f32(a);
+    av01 = vget_low_f32(av);
+    av23 = vget_high_f32(av);
+    av45 = vld1_f32(a + 4);
+    bv0 = vld1q_f32(b);
+    bv1 = vld1q_f32(b + 4);
+
+    cv0 = vmlaq_lane_f32(cv0, bv0, av01, 0);
+    cv1 = vmlaq_lane_f32(cv1, bv1, av01, 0);
+    cv2 = vmlaq_lane_f32(cv2, bv0, av01, 1);
+    cv3 = vmlaq_lane_f32(cv3, bv1, av01, 1);
+    cv4 = vmlaq_lane_f32(cv4, bv0, av23, 0);
+    cv5 = vmlaq_lane_f32(cv5, bv1, av23, 0);
+    cv6 = vmlaq_lane_f32(cv6, bv0, av23, 1);
+    cv7 = vmlaq_lane_f32(cv7, bv1, av23, 1);
+    cv8 = vmlaq_lane_f32(cv8, bv0, av45, 0);
+    cv9 = vmlaq_lane_f32(cv9, bv1, av45, 0);
+    cv10 = vmlaq_lane_f32(cv10, bv0, av45, 1);
+    cv11 = vmlaq_lane_f32(cv11, bv1, av45, 1);
+
+    a += MR;
+    b += NR;
+  }
+
+  vst1q_f32(c, cv0);
+  vst1q_f32(c + 4, cv1);
+  vst1q_f32(c + ldc, cv2);
+  vst1q_f32(c + ldc + 4, cv3);
+  vst1q_f32(c + 2 * ldc, cv4);
+  vst1q_f32(c + 2 * ldc + 4, cv5);
+  vst1q_f32(c + 3 * ldc, cv6);
+  vst1q_f32(c + 3 * ldc + 4, cv7);
+  vst1q_f32(c + 4 * ldc, cv8);
+  vst1q_f32(c + 4 * ldc + 4, cv9);
+  vst1q_f32(c + 5 * ldc, cv10);
+  vst1q_f32(c + 5 * ldc + 4, cv11);
+#else
+  const float *a_ptr, *b_ptr;
+  a_ptr = a;
+  b_ptr = b;
+  int kc1 = k / 4;
+  int kc2 = k % 4;
+  int step = 4 * ldc;
+  asm volatile(
+      "pld        [%[a_ptr]]            \n\t"
+      "pld        [%[b_ptr]]            \n\t"
+      "pld        [%[a_ptr], #64]       \n\t"
+      "pld        [%[b_ptr], #64]       \n\t"
+
+      "vmov.f32   q4, #0.0              \n\t"
+      "vmov.f32   q5, #0.0              \n\t"
+      "vmov.f32   q6, #0.0              \n\t"
+      "vmov.f32   q7, #0.0              \n\t"
+      "vmov.f32   q8, #0.0              \n\t"
+      "vmov.f32   q9, #0.0              \n\t"
+      "vmov.f32   q10, #0.0             \n\t"
+      "vmov.f32   q11, #0.0             \n\t"
+      "vmov.f32   q12, #0.0             \n\t"
+      "vmov.f32   q13, #0.0             \n\t"
+      "vmov.f32   q14, #0.0             \n\t"
+      "vmov.f32   q15, #0.0             \n\t"
+
+      "subs       %[kc1], %[kc1], #1    \n\t"
+      "blt        end_kc1_%=            \n\t"
+      "loop_kc1_%=:                     \n\t"
+
+      // "pld [%[a_ptr], #128] \n\t"
+      // "pld [%[b_ptr], #128] \n\t"
+      // "pld [%[a_ptr], #192] \n\t"
+      // "pld [%[b_ptr], #192] \n\t"
+
+      "vld1.32    {d0-d2}, [%[a_ptr]]!  \n\t"
+      "vld1.32    {q2, q3}, [%[b_ptr]]! \n\t"
+      "vmla.f32   q4, q2, d0[0]         \n\t"
+      "vmla.f32   q5, q3, d0[0]         \n\t"
+      "vmla.f32   q6, q2, d0[1]         \n\t"
+      "vmla.f32   q7, q3, d0[1]         \n\t"
+      "vmla.f32   q8, q2, d1[0]         \n\t"
+      "vmla.f32   q9, q3, d1[0]         \n\t"
+      "vmla.f32   q10, q2, d1[1]        \n\t"
+      "vmla.f32   q11, q3, d1[1]        \n\t"
+      "vmla.f32   q12, q2, d2[0]        \n\t"
+      "vmla.f32   q13, q3, d2[0]        \n\t"
+      "vmla.f32   q14, q2, d2[1]        \n\t"
+      "vmla.f32   q15, q3, d2[1]        \n\t"
+
+      "vld1.32    {d0-d2}, [%[a_ptr]]!  \n\t"
+      "vld1.32    {q2, q3}, [%[b_ptr]]! \n\t"
+      "vmla.f32   q4, q2, d0[0]         \n\t"
+      "vmla.f32   q5, q3, d0[0]         \n\t"
+      "vmla.f32   q6, q2, d0[1]         \n\t"
+      "vmla.f32   q7, q3, d0[1]         \n\t"
+      "vmla.f32   q8, q2, d1[0]         \n\t"
+      "vmla.f32   q9, q3, d1[0]         \n\t"
+      "vmla.f32   q10, q2, d1[1]        \n\t"
+      "vmla.f32   q11, q3, d1[1]        \n\t"
+      "vmla.f32   q12, q2, d2[0]        \n\t"
+      "vmla.f32   q13, q3, d2[0]        \n\t"
+      "vmla.f32   q14, q2, d2[1]        \n\t"
+      "vmla.f32   q15, q3, d2[1]        \n\t"
+
+      "vld1.32    {d0-d2}, [%[a_ptr]]!  \n\t"
+      "vld1.32    {q2, q3}, [%[b_ptr]]! \n\t"
+      "vmla.f32   q4, q2, d0[0]         \n\t"
+      "vmla.f32   q5, q3, d0[0]         \n\t"
+      "vmla.f32   q6, q2, d0[1]         \n\t"
+      "vmla.f32   q7, q3, d0[1]         \n\t"
+      "vmla.f32   q8, q2, d1[0]         \n\t"
+      "vmla.f32   q9, q3, d1[0]         \n\t"
+      "vmla.f32   q10, q2, d1[1]        \n\t"
+      "vmla.f32   q11, q3, d1[1]        \n\t"
+      "vmla.f32   q12, q2, d2[0]        \n\t"
+      "vmla.f32   q13, q3, d2[0]        \n\t"
+      "vmla.f32   q14, q2, d2[1]        \n\t"
+      "vmla.f32   q15, q3, d2[1]        \n\t"
+
+      "vld1.32    {d0-d2}, [%[a_ptr]]!  \n\t"
+      "vld1.32    {q2, q3}, [%[b_ptr]]! \n\t"
+      "vmla.f32   q4, q2, d0[0]         \n\t"
+      "vmla.f32   q5, q3, d0[0]         \n\t"
+      "vmla.f32   q6, q2, d0[1]         \n\t"
+      "vmla.f32   q7, q3, d0[1]         \n\t"
+      "vmla.f32   q8, q2, d1[0]         \n\t"
+      "vmla.f32   q9, q3, d1[0]         \n\t"
+      "vmla.f32   q10, q2, d1[1]        \n\t"
+      "vmla.f32   q11, q3, d1[1]        \n\t"
+      "vmla.f32   q12, q2, d2[0]        \n\t"
+      "vmla.f32   q13, q3, d2[0]        \n\t"
+      "vmla.f32   q14, q2, d2[1]        \n\t"
+      "vmla.f32   q15, q3, d2[1]        \n\t"
+
+      "subs       %[kc1], %[kc1], #1    \n\t"
+      "bge        loop_kc1_%=           \n\t"
+      "end_kc1_%=:                      \n\t"
+
+      "subs       %[kc2], %[kc2], #1    \n\t"
+      "blt        end_kc2_%=            \n\t"
+      "loop_kc2_%=:                     \n\t"
+
+      "vld1.32    {d0-d2}, [%[a_ptr]]!  \n\t"
+      "vld1.32    {q2, q3}, [%[b_ptr]]! \n\t"
+      "vmla.f32   q4, q2, d0[0]         \n\t"
+      "vmla.f32   q5, q3, d0[0]         \n\t"
+      "vmla.f32   q6, q2, d0[1]         \n\t"
+      "vmla.f32   q7, q3, d0[1]         \n\t"
+      "vmla.f32   q8, q2, d1[0]         \n\t"
+      "vmla.f32   q9, q3, d1[0]         \n\t"
+      "vmla.f32   q10, q2, d1[1]        \n\t"
+      "vmla.f32   q11, q3, d1[1]        \n\t"
+      "vmla.f32   q12, q2, d2[0]        \n\t"
+      "vmla.f32   q13, q3, d2[0]        \n\t"
+      "vmla.f32   q14, q2, d2[1]        \n\t"
+      "vmla.f32   q15, q3, d2[1]        \n\t"
+
+      "subs       %[kc2], %[kc2], #1    \n\t"
+      "bge        loop_kc2_%=           \n\t"
+      "end_kc2_%=:                      \n\t"
+
+      "mov        r5, %[c]              \n\t"
+      "mov        r6, %[step]           \n\t"
+      "vst1.32    {q4, q5}, [r5], r6    \n\t"
+      "vst1.32    {q6, q7}, [r5], r6    \n\t"
+      "vst1.32    {q8, q9}, [r5], r6    \n\t"
+      "vst1.32    {q10, q11}, [r5], r6  \n\t"
+      "vst1.32    {q12, q13}, [r5], r6  \n\t"
+      "vst1.32    {q14, q15}, [r5]      \n\t"
+      :
+      : [a_ptr] "r"(a_ptr), [b_ptr] "r"(b_ptr), [c] "r"(c), [kc1] "r"(kc1),
+        [kc2] "r"(kc2), [step] "r"(step)
+      : "memory", "r5", "r6", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+        "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
+#endif  // __aarch64__
+#else
+#endif  // __ARM_NEON
+}
+
 }  // namespace math
 }  // namespace operators
 }  // namespace paddle_mobile
```
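The retuned blocking sizes the packed A panel for L1 and the packed B panel for L2. A worked example under the new formulas, assuming k = 256 and 4-byte floats: KC = 256, MC = 32768 / (256 * 4) = 32 rows, NC = 524288 / (256 * 4) = 512 columns; both are then balanced across blocks and rounded up to multiples of the micro-kernel tile (MR = 6, NR = 8):

```cpp
// Round-up used for both MC and NC (same arithmetic as the diff above).
int round_up(int x, int step) { return (x + step - 1) / step * step; }
// round_up(32, 6) == 36   -> MC becomes a whole number of 6-row panels
// round_up(512, 8) == 512 -> NC is already a whole number of 8-column panels
```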
src/operators/math/gemm.h (+8 −5)

```diff
@@ -19,7 +19,7 @@ limitations under the License. */
 #define B(i, j) B[(i)*ldb + (j)]
 #define C(i, j) C[(i)*ldc + (j)]

-#define MR 4
+#define MR 6
 #define NR 8

 #define s_min(i, j) ((i) < (j) ? (i) : (j))
@@ -39,12 +39,14 @@ void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb,
 */

 // 将 A 矩阵分块复制到连续内存(RowMajor)
-void PackMatrixA_(int m, int k, int m_tail, const float *A, int lda,
-                  float *buffer);
+void PackMatrixA_4r(int m, int k, int m_tail, const float *A, int lda,
+                    float *buffer);
+void PackMatrixA_6r(int m, int k, int m_tail, const float *A, int lda,
+                    float *buffer);

 // 将 B 矩阵分块复制到连续内存(RowMajor)
-void PackMatrixB_(int k, int n, int n_tail, const float *B, int ldb,
-                  float *buffer);
+void PackMatrixB_8c(int k, int n, int n_tail, const float *B, int ldb,
+                    float *buffer);

 // 分块矩阵乘法
 void InnerKernel(int mc, int nc, float alpha, const float *a, const float *b,
@@ -67,6 +69,7 @@ void VectorKernelWithBn(int m, int n, int k, float alpha, const float *A,
 // 计算一个更小的 C 矩阵分块
 void AddDot4x4(int k, const float *a, const float *b, float *c, int ldc);
 void AddDot4x8(int k, const float *a, const float *b, float *c, int ldc);
+void AddDot6x8(int k, const float *a, const float *b, float *c, int ldc);

 // 分块矩阵乘法结果回写
 // C = A * B
```
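MR and NR are the micro-kernel tile: AddDot6x8 computes a 6x8 block of C from packed panels of A (6 row entries per k step, laid down by PackMatrixA_6r) and B (8 column entries per k step), holding the whole tile in twelve 128-bit NEON registers. A portable reference of what it computes, as a sketch rather than the shipped kernel:

```cpp
void AddDot6x8_ref(int k, const float *a, const float *b, float *c, int ldc) {
  float acc[6][8] = {{0.0f}};
  for (int p = 0; p < k; ++p) {    // packed A: 6 row entries per k step
    for (int i = 0; i < 6; ++i) {  // packed B: 8 column entries per k step
      for (int j = 0; j < 8; ++j) {
        acc[i][j] += a[p * 6 + i] * b[p * 8 + j];
      }
    }
  }
  for (int i = 0; i < 6; ++i) {  // write the finished 6x8 tile
    for (int j = 0; j < 8; ++j) {
      c[i * ldc + j] = acc[i][j];
    }
  }
}
```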
src/operators/op_param.h (+59 −80)

```diff
@@ -22,6 +22,9 @@ limitations under the License. */
 #include "framework/scope.h"
 #include "framework/tensor.h"
 #include "framework/variable.h"
+#ifdef PADDLE_MOBILE_FPGA
+#include "fpga/api/fpga_api.h"
+#endif

 namespace paddle_mobile {
 namespace operators {
@@ -256,6 +259,15 @@ class ElementwiseAddParam : OpParam {
   Tensor *input_y_;
   Tensor *out_;
   int axis_;
+#ifdef PADDLE_MOBILE_FPGA
+ private:
+  fpga::FpgaEWAddArgs fpga_EW_add_args;
+
+ public:
+  const fpga::FpgaEWAddArgs &FpgaArgs() const { return fpga_EW_add_args; }
+  void SetFpgaArgs(const fpga::FpgaEWAddArgs &args) { fpga_EW_add_args = args; }
+#endif
 };

 #ifdef FUSION_ELEMENTWISEADDRELU_OP
@@ -450,80 +462,15 @@ class PoolParam : public OpParam {
   vector<int> paddings_;
   bool ceil_mode_;
   bool global_pooling_ = false;
+#ifdef PADDLE_MOBILE_FPGA
+ private:
+  fpga::FpgaPoolArgs fpga_pool_args;
+
+ public:
+  const fpga::FpgaPoolArgs &FpgaArgs() const { return fpga_pool_args; }
+  void SetFpgaArgs(const fpga::FpgaPoolArgs &args) { fpga_pool_args = args; }
+#endif
 };
-#ifdef FUSION_POOLBN_OP
-class FusionPoolBNParam : OpParam {
- public:
-  FusionPoolBNParam(const VariableNameMap &inputs,
-                    const VariableNameMap &outputs, const AttributeMap &attrs,
-                    const Scope &scope) {
-    input_ = InputXFrom<LoDTensor>(inputs, scope);
-    pooling_type_ = GetAttr<string>("pooling_type", attrs);
-    ksize_ = GetAttr<vector<int>>("ksize", attrs);
-    strides_ = GetAttr<vector<int>>("strides", attrs);
-    paddings_ = GetAttr<vector<int>>("paddings", attrs);
-    ceil_mode_ = GetAttr<bool>("ceil_mode", attrs);
-    global_pooling_ = GetAttr<bool>("global_pooling", attrs);
-    output_y_ = OutputYFrom<LoDTensor>(outputs, scope);
-    input_bias_ = InputBiasFrom<LoDTensor>(inputs, scope);
-    input_mean_ = InputMeanFrom<LoDTensor>(inputs, scope);
-    input_scale_ = InputScaleFrom<LoDTensor>(inputs, scope);
-    input_variance_ = InputVarianceFrom<LoDTensor>(inputs, scope);
-    epsilon_ = GetAttr<float>("epsilon", attrs);
-    momentum_ = GetAttr<float>("momentum", attrs);
-    // is_test_ = GetAttr<bool>("is_test", attrs);
-  }
-  const Tensor *Input() const { return input_; }
-  const string &PoolingType() const { return pooling_type_; }
-  const vector<int> &Ksize() const { return ksize_; }
-  const vector<int> &Strides() const { return strides_; }
-  const vector<int> &Paddings() const { return paddings_; }
-  bool isCeilMode() const { return ceil_mode_; }
-  bool isGlobalPooling() const { return global_pooling_; }
-  Tensor *OutputY() const { return output_y_; }
-  const Tensor *InputBias() const { return input_bias_; }
-  const Tensor *InputMean() const { return input_mean_; }
-  const Tensor *InputScale() const { return input_scale_; }
-  const Tensor *InputVariance() const { return input_variance_; }
-  const float &Epsilon() const { return epsilon_; }
-  const float &Momentum() const { return momentum_; }
-  const bool &IsTest() const { return is_test_; }
-  const string &DataFormat() const { return data_format_; }
-
- private:
-  Tensor *input_;
-  string pooling_type_;
-  vector<int> ksize_;
-  vector<int> strides_;
-  vector<int> paddings_;
-  bool ceil_mode_;
-  bool global_pooling_ = false;
-  Tensor *output_y_;
-  Tensor *input_bias_;
-  Tensor *input_mean_;
-  Tensor *input_scale_;
-  Tensor *input_variance_;
-  float epsilon_;
-  float momentum_;
-  bool is_test_;
-  string data_format_;
-};
-#endif
@@ -704,10 +651,10 @@ class MultiClassNMSParam : public OpParam {
 class FeedParam : public OpParam {
  public:
   FeedParam(const VariableNameMap &inputs, const VariableNameMap &outputs,
-            const AttributeMap &attrs, Scope &scope) {
-    input_x_ = InputXFrom<LoDTensor>(inputs, scope);
-    out_ = OutFrom<LoDTensor>(outputs, scope);
-    auto var = scope.Var("batch_size");
+            const AttributeMap &attrs, Scope *scope) {
+    input_x_ = InputXFrom<LoDTensor>(inputs, *scope);
+    out_ = OutFrom<LoDTensor>(outputs, *scope);
+    auto var = scope->Var("batch_size");
     batch_size = var->GetValue<int>();
   }
   const Tensor *InputX() const { return input_x_; }
@@ -983,6 +930,15 @@ class FusionFcParam : public OpParam {
   int x_num_col_dims_;
   int y_num_col_dims_;
   int axis_;
+#ifdef PADDLE_MOBILE_FPGA
+ private:
+  fpga::FpgaConvArgs fpga_conv_args;
+
+ public:
+  const fpga::FpgaConvArgs &FpgaArgs() const { return fpga_conv_args; }
+  void SetFpgaArgs(const fpga::FpgaConvArgs &args) { fpga_conv_args = args; }
+#endif
 };

 #ifdef FUSION_FCRELU_OP
@@ -1032,6 +988,15 @@ class FusionConvAddParam : public OpParam {
   vector<int> paddings_;
   vector<int> dilations_;
   int groups;
+#ifdef PADDLE_MOBILE_FPGA
+ private:
+  fpga::FpgaConvArgs fpga_conv_args;
+
+ public:
+  const fpga::FpgaConvArgs &FpgaArgs() const { return fpga_conv_args; }
+  void SetFpgaArgs(const fpga::FpgaConvArgs &args) { fpga_conv_args = args; }
+#endif
 };

 Print &operator<<(Print &printer, const FusionConvAddParam &conv_param);
@@ -1128,6 +1093,15 @@ class FusionConvAddBNReluParam : public OpParam {
   bool is_test_;
   Tensor *new_bias_;
   Tensor *new_scale_;
+#ifdef PADDLE_MOBILE_FPGA
+ private:
+  fpga::FpgaConvArgs fpga_conv_args;
+
+ public:
+  const fpga::FpgaConvArgs &FpgaArgs() const { return fpga_conv_args; }
+  void SetFpgaArgs(const fpga::FpgaConvArgs &args) { fpga_conv_args = args; }
+#endif
 };
 #endif
@@ -1213,6 +1187,15 @@ class FusionConvAddBNParam : public OpParam {
   bool is_test_;
   Tensor *new_bias_;
   Tensor *new_scale_;
+#ifdef PADDLE_MOBILE_FPGA
+ private:
+  fpga::FpgaConvArgs fpga_conv_args;
+
+ public:
+  const fpga::FpgaConvArgs &FpgaArgs() const { return fpga_conv_args; }
+  void SetFpgaArgs(const fpga::FpgaConvArgs &args) { fpga_conv_args = args; }
+#endif
 };
 #endif
@@ -1426,9 +1409,5 @@ class DropoutParam : public OpParam {
 };
 #endif

-#ifdef REGION_OP
-class RegionParam : public OpParam {};
-#endif
-
 }  // namespace operators
 }  // namespace paddle_mobile
```
浏览文件 @
c38dbd3c
...
@@ -33,7 +33,7 @@ class ResizeOp
...
@@ -33,7 +33,7 @@ class ResizeOp
DeviceType
,
ResizeParam
,
operators
::
ResizeKernel
<
DeviceType
,
T
>>
{
DeviceType
,
ResizeParam
,
operators
::
ResizeKernel
<
DeviceType
,
T
>>
{
public:
public:
ResizeOp
(
const
std
::
string
&
type
,
const
VariableNameMap
&
inputs
,
ResizeOp
(
const
std
::
string
&
type
,
const
VariableNameMap
&
inputs
,
const
VariableNameMap
&
outputs
,
const
framework
::
AttributeMap
attrs
,
const
VariableNameMap
&
outputs
,
const
framework
::
AttributeMap
&
attrs
,
std
::
shared_ptr
<
framework
::
Scope
>
scope
)
std
::
shared_ptr
<
framework
::
Scope
>
scope
)
:
framework
::
OperatorWithKernel
<
DeviceType
,
ResizeParam
,
:
framework
::
OperatorWithKernel
<
DeviceType
,
ResizeParam
,
operators
::
ResizeKernel
<
DeviceType
,
T
>>
(
operators
::
ResizeKernel
<
DeviceType
,
T
>>
(
...
...
test/CMakeLists.txt (+4 −0)

```diff
@@ -145,6 +145,10 @@ else ()
     ADD_EXECUTABLE(test-conv-add-relu-op operators/test_conv_add_relu_op.cpp test_helper.h test_include.h executor_for_test.h)
     target_link_libraries(test-conv-add-relu-op paddle-mobile)

+    # gen test
+    ADD_EXECUTABLE(test-conv-add-bn-relu-op operators/test_fusion_conv_add_bn_relu_op.cpp test_helper.h test_include.h executor_for_test.h)
+    target_link_libraries(test-conv-add-bn-relu-op paddle-mobile)
+
     #add_library(test-lib-size SHARED common/test_lib_size.h common/test_lib_size.cpp)
 endif ()
```
test/common/test_gemm.cpp
@@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include <cstdlib>
+#include <ctime>
 #include <iostream>
 #include "../test_helper.h"
 #include "common/log.h"
@@ -20,13 +22,21 @@ limitations under the License. */
 #define a(i, j) a[(i)*lda + (j)]
 #define b(i, j) b[(i)*ldb + (j)]
+#define c(i, j) c[(i)*ldc + (j)]
 #define c1(i, j) c1[(i)*ldc + (j)]
 
-#define m 62
-#define n 63
-#define k 74
+void print_matirx(int m, int n, int ldc, float *c) {
+  for (int i = 0; i < m; ++i) {
+    std::cout << c(i, 0);
+    for (int j = 1; j < n; ++j) {
+      std::cout << " | " << c(i, j);
+    }
+    std::cout << std::endl;
+  }
+  std::cout << std::endl;
+}
 
-int main() {
+int do_sgemm(int m, int n, int k, bool relu, int t1, int t2, int pr) {
   int lda = k;
   int ldb = n;
   int ldc = n;
@@ -39,44 +49,88 @@ int main() {
       static_cast<float *>(paddle_mobile::memory::Alloc(sizeof(float) * m * n));
   float *c1 =
       static_cast<float *>(paddle_mobile::memory::Alloc(sizeof(float) * m * n));
+  float *scale =
+      static_cast<float *>(paddle_mobile::memory::Alloc(sizeof(float) * m));
+  float *bias =
+      static_cast<float *>(paddle_mobile::memory::Alloc(sizeof(float) * m));
+
+  srand(unsigned(time(0)));
   for (int i = 0; i < m * k; ++i) {
-    a[i] = 2;
+    a[i] = t1 + rand() % t2;
   }
   for (int i = 0; i < k * n; ++i) {
-    b[i] = 2;
+    b[i] = t1 + rand() % t2;
   }
-  for (int i = 0; i < m * n; ++i) {
-    c[i] = 2;
-    c1[i] = 2;
+  for (int i = 0; i < m; ++i) {
+    scale[i] = t1 + rand() % t2;
+  }
+  for (int i = 0; i < m; ++i) {
+    bias[i] = t1 + rand() % t2;
   }
 
-  auto time1 = time();
-  // paddle_mobile::operators::math::Sgemm(m, n, k, 0.9, a, lda, b, ldb, 0.3,
-  // c,
-  //                                       ldc);
-  auto time2 = time();
-  DLOG << "gemm cost :" << time_diff(time1, time2) << "ms\n";
-  for (int i = 0; i < m * n; ++i) {
-    std::cout << c[i] << " | ";
-    if (i % n == (n - 1)) {
-      std::cout << std::endl;
-    }
-  }
-  for (int j = 0; j < n; ++j) {
-    for (int i = 0; i < m; ++i) {
-      c1(i, j) *= 0.3;
-      for (int p = 0; p < k; ++p) {
-        c1(i, j) += 0.9 * a(i, p) * b(p, j);
+  for (int i = 0; i < m; ++i) {
+    for (int j = 0; j < n; ++j) {
+      float r = 0;
+      for (int p = 0; p < k; p++) {
+        r += a(i, p) * b(p, j);
       }
+      r *= scale[i];
+      r += bias[i];
+      if (relu && (r < 0)) {
+        r = 0;
+      }
+      c1(i, j) = r;
     }
   }
-  std::cout << "正确结果对比:" << std::endl;
-  for (int i = 0; i < m * n; ++i) {
-    std::cout << c1[i] << " | ";
-    if (i % n == (n - 1)) {
-      std::cout << std::endl;
+
+  paddle_mobile::operators::math::SgemmWithBn(m, n, k, 0.9, a, lda, b, ldb,
+                                              0.3, c, ldc, relu, scale, bias);
+  int eq = 0;
+  int neq = 0;
+  for (int i = 0; i < m * n; ++i) {
+    if (static_cast<int>(c[i]) == static_cast<int>(c1[i])) {
+      ++eq;
+    } else {
+      ++neq;
     }
   }
+
+  if (pr > 0) {
+    std::cout << "A:" << std::endl;
+    print_matirx(m, k, lda, a);
+    std::cout << "B:" << std::endl;
+    print_matirx(k, n, ldb, b);
+    std::cout << "C:" << std::endl;
+    print_matirx(m, n, ldc, c);
+    std::cout << "C1:" << std::endl;
+    print_matirx(m, n, ldc, c1);
+  }
+
+  std::cout << "mnk=" << m << " " << n << " " << k << " relu=" << relu
+            << " eq=" << eq << " neq=" << neq << std::endl;
+
+  paddle_mobile::memory::Free(a);
+  paddle_mobile::memory::Free(b);
+  paddle_mobile::memory::Free(c);
+  paddle_mobile::memory::Free(c1);
+  paddle_mobile::memory::Free(scale);
+  paddle_mobile::memory::Free(bias);
+
+  return 0;
+}
+
+int main() {
+  do_sgemm(9, 9, 9, true, 10, 10, 10);
+  do_sgemm(10, 6, 12, false, 10, 10, 0);
+  do_sgemm(512, 256, 384, false, 10, 10, 0);
+  do_sgemm(1366, 768, 256, false, 10, 10, 0);
+  do_sgemm(1255, 755, 333, false, 10, 10, 0);
+  do_sgemm(555, 777, 999, false, 10, 10, 0);
+
+  do_sgemm(10, 6, 12, true, -4, 10, 0);
+  do_sgemm(512, 256, 384, true, -4, 10, 0);
+  do_sgemm(1366, 768, 256, true, -4, 10, 0);
+  do_sgemm(1255, 755, 333, true, -4, 10, 0);
+  do_sgemm(555, 777, 999, true, -4, 10, 0);
 
   return 0;
 }
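
Written out, the reference that do_sgemm checks SgemmWithBn against is (s_i and b_i are the per-row scale and bias filled above; the clamp applies only when relu is set):

    C1_{ij} = \phi\Big( s_i \sum_{p=0}^{k-1} A_{ip}\, B_{pj} + b_i \Big),
    \qquad
    \phi(x) = \begin{cases} \max(0, x) & \text{if relu} \\ x & \text{otherwise} \end{cases}

Agreement is only required up to the integer part, since the eq/neq counters compare static_cast<int>(c[i]) against static_cast<int>(c1[i]).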
test/executor_for_test.h
@@ -43,7 +43,7 @@ template <typename DeviceType, typename OpType>
 class Executor4Test : public Executor<DeviceType> {
  public:
   Executor4Test(Program<DeviceType> p, string op_type,
-                bool use_optimize = false)
+                bool use_optimize = false, int predict_op_count = 1)
       : Executor<DeviceType>() {
     this->use_optimize_ = use_optimize;
     this->program_ = p;
@@ -57,12 +57,14 @@ class Executor4Test : public Executor<DeviceType> {
       LOG(paddle_mobile::LogLevel::kLOG_ERROR)
           << "to_predict_program_ == nullptr";
     }
     const std::vector<std::shared_ptr<BlockDesc>> blocks =
         this->to_predict_program_->Blocks();
     for (std::shared_ptr<BlockDesc> block_desc : blocks) {
       std::vector<std::shared_ptr<OpDesc>> ops = block_desc->Ops();
-      for (std::shared_ptr<OpDesc> op : ops) {
-        if (op->Type() == op_type) {
+      for (int i = 0; i < ops.size(); ++i) {
+        auto op = ops[i];
+        if (op->Type() == op_type && i < predict_op_count) {
           DLOG << "匹配到: " << op->Type();
 
           /// test first meeting op in program
@@ -72,11 +74,17 @@ class Executor4Test : public Executor<DeviceType> {
               op->Type(), op->GetInputs(), op->GetOutputs(),
               op->GetAttrMap(), this->program_.scope);
           this->ops_of_block_[*block_desc.get()].push_back(op_ptr);
-          break;
         }
       }
     }
     this->InitMemory();
+
+    std::shared_ptr<paddle_mobile::framework::BlockDesc> to_predict_block =
+        this->to_predict_program_->Block(0);
+    auto &ops = this->ops_of_block_[*to_predict_block.get()];
+    for (const auto &op : ops) {
+      op->Init();
+    }
   }
 
   template <typename T = LoDTensor>
@@ -130,9 +138,6 @@ class Executor4Test : public Executor<DeviceType> {
     auto *output_tensor = con_output->GetMutable<LoDTensor>();
     output_tensor->mutable_data<float>(dDim);
-    std::shared_ptr<Tensor> out_tensor = std::make_shared<LoDTensor>();
-    out_tensor.reset(output_tensor);
-
     std::shared_ptr<paddle_mobile::framework::BlockDesc> to_predict_block =
         this->to_predict_program_->Block(0);
     for (int j = 0; j < this->ops_of_block_[*to_predict_block.get()].size();
@@ -141,6 +146,7 @@ class Executor4Test : public Executor<DeviceType> {
       op->Run();
     }
-    return out_tensor;
+
+    return std::make_shared<paddle_mobile::framework::Tensor>(
+        paddle_mobile::framework::Tensor(*output_tensor));
   }
 };
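
The new predict_op_count argument lets a test instantiate, Init() and run the first several matching ops of the block instead of stopping at the first match. A minimal sketch of a call site (the count of 2 is illustrative, not taken from this commit; the op type string matches the new fusion test below):

    Executor4Test<paddle_mobile::CPU,
                  paddle_mobile::operators::FusionConvAddBNReluOp<
                      paddle_mobile::CPU, float>>
        executor(program, "fusion_conv_add_bn_relu", true, 2);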
test/framework/test_load.cpp
@@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include <string>
+
 #include "../test_helper.h"
 #include "io/loader.h"
@@ -20,12 +22,10 @@ int main() {
   // ../../../test/models/googlenet
   // ../../../test/models/mobilenet
   // auto program = loader.Load(g_googlenet, true);
-  auto program = loader.Load(g_mobilenet_ssd, true);
-  // auto program = loader.Load(g_googlenet_combine + "/model",
-  //                            g_googlenet_combine +
-  //                            "/params", true);
+  // auto program = loader.Load(g_mobilenet_ssd, true);
+  auto program = loader.Load(std::string(g_ocr) + "/model",
+                             std::string(g_ocr) + "/params", false);
   // program.originProgram->Description("program desc: ");
   return 0;
 }
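
Because this commit turns the g_* path constants into const char * (see test/test_helper.h below), call sites that build paths must now wrap them in std::string before concatenating. A sketch of the two Load call shapes the tests use (the trailing bool is the optimize switch; presumably the single-argument form takes a model directory with one file per parameter, while the two-path form takes a combined-parameter model):

    paddle_mobile::Loader<paddle_mobile::CPU> loader;
    // model directory, one file per parameter
    auto program1 = loader.Load(g_googlenet, true);
    // combined model: separate model file and params file
    auto program2 = loader.Load(std::string(g_ocr) + "/model",
                                std::string(g_ocr) + "/params", false);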
test/net/test_mobilenet+ssd.cpp
@@ -20,22 +20,20 @@ int main() {
   paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile;
   paddle_mobile.SetThreadNum(4);
   auto time1 = time();
-  auto isok = paddle_mobile.Load(g_mobilenet_ssd_gesture + "/model",
-                                 g_mobilenet_ssd_gesture + "/params", true);
+  auto isok = paddle_mobile.Load(
+      std::string(g_mobilenet_ssd_gesture) + "/model",
+      std::string(g_mobilenet_ssd_gesture) + "/params", true);
   // auto isok = paddle_mobile.Load(g_mobilenet_ssd, false);
   if (isok) {
     auto time2 = time();
     std::cout << "load cost :" << time_diff(time1, time2) << "ms" << std::endl;
 
+    std::vector<float> input;
     std::vector<int64_t> dims{1, 3, 300, 300};
-    Tensor input_tensor;
-    SetupTensor<float>(&input_tensor, {1, 3, 300, 300}, static_cast<float>(0),
-                       static_cast<float>(1));
-
-    std::vector<float> input(input_tensor.data<float>(),
-                             input_tensor.data<float>() + input_tensor.numel());
+    GetInput<float>(g_hand, &input, dims);
+
     auto time3 = time();
-    paddle_mobile.Predict(input, dims);
+    auto output = paddle_mobile.Predict(input, dims);
     auto time4 = time();
     std::cout << "predict cost :" << time_diff(time3, time4) << "ms"
               << std::endl;
 ...
test/net/test_mobilenet.cpp
@@ -24,19 +24,21 @@ int main() {
     auto time2 = time();
     std::cout << "load cost :" << time_diff(time1, time2) << "ms" << std::endl;
 
+    std::vector<float> input;
     std::vector<int64_t> dims{1, 3, 224, 224};
-    Tensor input_tensor;
-    SetupTensor<float>(&input_tensor, {1, 3, 224, 224}, static_cast<float>(0),
-                       static_cast<float>(1));
-
-    std::vector<float> input(input_tensor.data<float>(),
-                             input_tensor.data<float>() + input_tensor.numel());
-    auto time3 = time();
-    auto vec_result = paddle_mobile.Predict(input, dims);
-    auto time4 = time();
-    std::cout << "predict cost :" << time_diff(time3, time4) << "ms"
-              << std::endl;
+    GetInput<float>(g_test_image_1x3x224x224, &input, dims);
+
+    for (int i = 0; i < 10; ++i) {
+      auto time3 = time();
+      auto vec_result = paddle_mobile.Predict(input, dims);
+      auto time4 = time();
+      std::vector<float>::iterator biggest =
+          std::max_element(std::begin(vec_result), std::end(vec_result));
+      std::cout << " Max element is " << *biggest << " at position "
+                << std::distance(std::begin(vec_result), biggest) << std::endl;
+      std::cout << "predict cost :" << time_diff(time3, time4) << "ms"
+                << std::endl;
+    }
   }
   return 0;
 ...
test/operators/test_fusion_conv_add_bn_relu_op.cpp
new file (mode 100644)

/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "../test_include.h"
#include "operators/fusion_conv_add_bn_relu_op.h"

int main() {
  paddle_mobile::Loader<paddle_mobile::CPU> loader;
  // ../models/image_classification_resnet.inference.model
  auto program = loader.Load(g_mobilenet, true);
  PADDLE_MOBILE_ENFORCE(program.originProgram != nullptr,
                        "program file read fail");

  Executor4Test<paddle_mobile::CPU,
                paddle_mobile::operators::FusionConvAddBNReluOp<
                    paddle_mobile::CPU, float>>
      executor(program, "fusion_conv_add_bn_relu", true);

  std::cout << "executor 4 test: " << std::endl;
  paddle_mobile::framework::Tensor input;
  GetInput<float>(g_test_image_1x3x224x224_banana, &input, {1, 3, 224, 224});

  // use SetupTensor if there is no local input image:
  // SetupTensor<float>(&input, {1, 3, 224, 224}, static_cast<float>(0),
  //                    static_cast<float>(1));

  DLOG << " fuck: " << input;
  auto out_ddim = paddle_mobile::framework::make_ddim({1, 32, 112, 112});
  std::cout << "before predict: " << std::endl;
  auto output =
      executor.Predict(input, "data", "conv2_1_dw_bn.tmp_2", out_ddim);
  std::cout << "after predict " << std::endl;
  auto output_ptr = output->data<float>();

  int stride = output->numel() / 100;
  for (int i = 0; i < 100; i++) {
    DLOG << " index:" << i * stride << " value: " << output_ptr[i * stride];
  }

  // for (int i = 0; i < 100; i++) {
  //   DLOG << " index:" << i << " value: " << output_ptr[i];
  // }
  // for (int j = 0; j < output->numel(); ++j) {
  //   std::cout << " (index: " << j << " value: " << output_ptr[j] << ") ";
  // }
  std::cout << std::endl;
  return 0;
}
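
For background (standard batch-norm folding, not spelled out in the diff): the operator under test fuses conv + elementwise add + batch norm + ReLU. At inference time the batch-norm statistics collapse into one per-channel scale s_c and bias t_c, which is the same scale/bias shape SgemmWithBn consumes in the GEMM test above:

    y_c = \max\Big(0,\; \gamma_c \frac{x_c - \mu_c}{\sqrt{\sigma_c^2 + \varepsilon}} + \beta_c\Big)
        = \max(0,\; s_c\, x_c + t_c),
    \qquad
    s_c = \frac{\gamma_c}{\sqrt{\sigma_c^2 + \varepsilon}},
    \quad t_c = \beta_c - s_c\, \mu_c

where x_c is the conv-plus-bias output on channel c.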
test/test_helper.h
@@ -24,18 +24,22 @@ limitations under the License. */
 #include "framework/ddim.h"
 #include "framework/tensor.h"
 
-static const std::string g_mobilenet_ssd = "../models/mobilenet+ssd";
-static const std::string g_mobilenet_ssd_gesture =
-    "../models/mobilenet+ssd_gesture";
-static const std::string g_squeezenet = "../models/squeezenet";
-static const std::string g_googlenet = "../models/googlenet";
-static const std::string g_mobilenet = "../models/mobilenet";
-static const std::string g_resnet_50 = "../models/resnet_50";
-static const std::string g_resnet = "../models/resnet";
-static const std::string g_googlenet_combine = "../models/googlenet_combine";
-static const std::string g_yolo = "../models/yolo";
-static const std::string g_test_image_1x3x224x224 =
-    "../images/test_image_1x3x224x224_float";
+static const char *g_ocr = "../models/ocr";
+static const char *g_mobilenet_ssd = "../models/mobilenet+ssd";
+static const char *g_mobilenet_ssd_gesture = "../models/mobilenet+ssd_gesture";
+static const char *g_squeezenet = "../models/squeezenet";
+static const char *g_googlenet = "../models/googlenet";
+static const char *g_mobilenet = "../models/mobilenet";
+static const char *g_resnet_50 = "../models/resnet_50";
+static const char *g_resnet = "../models/resnet";
+static const char *g_googlenet_combine = "../models/googlenet_combine";
+static const char *g_yolo = "../models/yolo";
+static const char *g_test_image_1x3x224x224 =
+    "../images/test_image_1x3x224x224_float";
+static const char *g_test_image_1x3x224x224_banana =
+    "../images/input_3x224x224_banana";
+static const char *g_hand = "../images/hand_image";
 
 using paddle_mobile::framework::DDim;
 using paddle_mobile::framework::Tensor;
 ...
tools/android-cmake/android.toolchain.cmake
@@ -65,6 +65,8 @@ endif()
 file(TO_CMAKE_PATH "${ANDROID_NDK}" ANDROID_NDK)
 
 # Android NDK revision
+message("${ANDROID_NDK}")
+
 file(READ "${ANDROID_NDK}/source.properties" ANDROID_NDK_SOURCE_PROPERTIES)
 set(ANDROID_NDK_SOURCE_PROPERTIES_REGEX
   "^Pkg\\.Desc = Android NDK\nPkg\\.Revision = ([0-9]+)\\.")
@@ -159,7 +161,7 @@ endif()
 # Default values for configurable variables.
 if(NOT ANDROID_TOOLCHAIN)
-  set(ANDROID_TOOLCHAIN clang)
+  set(ANDROID_TOOLCHAIN gcc)
 endif()
 if(NOT ANDROID_ABI)
   set(ANDROID_ABI armeabi-v7a)
 ...
tools/build.sh
(The old test `[ "${#NETS}" > 1 ]` never compared numbers: inside `[ ]` an unquoted `>` is taken by the shell as an output redirection, so the condition degenerated to a non-empty-string test that always succeeded and left a stray file named `1` behind; `-gt` is the numeric greater-than operator.)
@@ -63,7 +63,7 @@ build_for_android() {
     TOOLCHAIN_FILE="./tools/android-cmake/android.toolchain.cmake"
     ANDROID_ARM_MODE="arm"
-    if [ "${#NETS}" > 1 ]; then
+    if [ "${#NETS}" -gt 1 ]; then
         cmake .. \
             -B"../build/release/${PLATFORM}" \
             -DANDROID_ABI="${ABI}" \
@@ -99,7 +99,7 @@ build_for_ios() {
     BUILD_DIR=../build/release/"${PLATFORM}"/
     TOOLCHAIN_FILE="./tools/ios-cmake/ios.toolchain.cmake"
     mkdir -p "${BUILD_DIR}"
-    if [ "${#NETS}" > 1 ]; then
+    if [ "${#NETS}" -gt 1 ]; then
        cmake .. \
            -B"${BUILD_DIR}" \
            -DCMAKE_BUILD_TYPE="${MODE}" \
 ...
tools/op.cmake
@@ -75,11 +75,9 @@ if ("FPGAnets" IN_LIST NET)
     set(FUSION_CONVADDRELU_OP ON)
     set(FUSION_CONVADDBNRELU_OP ON)
     set(FUSION_CONVADDBN_OP ON)
-    set(FUSION_POOLBN_OP ON)
     set(FUSION_ELEMENTWISEADDRELU_OP ON)
     set(FUSION_FC_OP ON)
     set(FUSION_FCRELU_OP ON)
-    set(REGION_OP ON)
     set(POOL_OP ON)
     set(CONCAT_OP ON)
     set(SOFTMAX_OP ON)
 ...