Commit 8d1fbf99

Merge branch 'develop' into develop

Authored on Aug 01, 2018 by Ruilong Liu; committed via GitHub on Aug 01, 2018.
Parents: baf110bf, ef62c1cc

Showing 35 changed files with 817 additions and 307 deletions (+817 −307)
README.md                                                      +1   −1
src/common/types.cpp                                           +32  −32
src/common/types.h                                             +41  −35
src/fpga/api/fpga_api.cpp                                      +11  −9
src/fpga/api/fpga_api.h                                        +82  −19
src/framework/program/program-optimize/fusion_op_register.h    +3   −1
src/framework/tensor.h                                         +5   −3
src/io/executor.cpp                                            +2   −2
src/memory/t_malloc.cpp                                        +3   −3
src/operators/concat_op.cpp                                    +4   −1
src/operators/concat_op.h                                      +1   −0
src/operators/feed_op.h                                        +1   −1
src/operators/kernel/arm/dropout_kernel.cpp                    +0   −2
src/operators/kernel/dropout_kernel.h                          +1   −1
src/operators/kernel/fpga/concat_kernel.cpp                    +55  −0
src/operators/kernel/im2sequence_kernel.h                      +1   −3
src/operators/kernel/mali/fushion_fc_kernel.cpp                +0   −2
src/operators/kernel/prelu_kernel.h                            +1   −1
src/operators/kernel/scale_kernel.h                            +1   −1
src/operators/kernel/slice_kernel.h                            +1   −1
src/operators/math/gemm.cpp                                    +294 −24
src/operators/math/gemm.h                                      +8   −5
src/operators/op_param.h                                       +59  −80
src/operators/resize_op.h                                      +1   −1
test/CMakeLists.txt                                            +4   −0
test/common/test_gemm.cpp                                      +84  −30
test/executor_for_test.h                                       +14  −8
test/framework/test_load.cpp                                   +5   −5
test/net/test_mobilenet+ssd.cpp                                +6   −8
test/net/test_mobilenet.cpp                                    +14  −12
test/operators/test_fusion_conv_add_bn_relu_op.cpp             +62  −0
test/test_helper.h                                             +15  −11
tools/android-cmake/android.toolchain.cmake                    +3   −1
tools/build.sh                                                 +2   −2
tools/op.cmake                                                 +0   −2
README.md
@@ -27,7 +27,7 @@ Paddle-Mobile is a project under the PaddlePaddle organization, dedicated to embedded platforms.
 - **ARM CPU**

-The ARM CPU is paddle-mobile's primary target, and the CPU's generality has always been its strength. Embedded deep learning needs large amounts of hand-written CPU assembly; we are coding intensively to exploit every bit of acceleration the hardware offers.
+ARM CPU optimization is still in progress; for now only conventional CPU optimizations are applied. On an ARM A73, paddle-mobile arm-v7 currently runs MobileNet 1.0 once on a single core in 110+ ms. That is clearly not our final goal: we are rewriting hot paths in assembly, so there remains large performance headroom. Only armv7 is supported at the moment; armv8 support will follow.
src/common/types.cpp
@@ -17,39 +17,39 @@ limitations under the License. */
 namespace paddle_mobile {

-const std::string G_OP_TYPE_CONV = "conv2d";
-const std::string G_OP_TYPE_BATCHNORM = "batch_norm";
-const std::string G_OP_TYPE_BOX_CODER = "box_coder";
-const std::string G_OP_TYPE_CONCAT = "concat";
-const std::string G_OP_TYPE_ELEMENTWISE_ADD = "elementwise_add";
-const std::string G_OP_TYPE_FUSION_CONV_ADD_RELU = "fusion_conv_add_relu";
-const std::string G_OP_TYPE_FUSION_CONV_ADD_BN_RELU = "fusion_conv_add_bn_relu";
-const std::string G_OP_TYPE_FUSION_DWCONV_BN_RELU = "fusion_dwconv_bn_relu";
-const std::string G_OP_TYPE_FUSION_CONV_BN_RELU = "fusion_conv_bn_relu";
-const std::string G_OP_TYPE_FC = "fusion_fc";
-const std::string G_OP_TYPE_FUSION_CONV_ADD = "fusion_conv_add";
-const std::string G_OP_TYPE_LRN = "lrn";
-const std::string G_OP_TYPE_MUL = "mul";
-const std::string G_OP_TYPE_MULTICLASS_NMS = "multiclass_nms";
-const std::string G_OP_TYPE_POOL2D = "pool2d";
-const std::string G_OP_TYPE_PRIOR_BOX = "prior_box";
-const std::string G_OP_TYPE_RELU = "relu";
-const std::string G_OP_TYPE_RESHAPE = "reshape";
-const std::string G_OP_TYPE_SIGMOID = "sigmoid";
-const std::string G_OP_TYPE_SOFTMAX = "softmax";
-const std::string G_OP_TYPE_TRANSPOSE = "transpose";
-const std::string G_OP_TYPE_SPLIT = "split";
-const std::string G_OP_TYPE_FEED = "feed";
-const std::string G_OP_TYPE_FETCH = "fetch";
-const std::string G_OP_TYPE_DEPTHWISE_CONV = "depthwise_conv2d";
-const std::string G_OP_TYPE_IM2SEQUENCE = "im2sequence";
-const std::string G_OP_TYPE_DROPOUT = "dropout";
-const std::string G_OP_TYPE_FUSION_CONV_ADD_BN = "fusion_conv_add_bn";
-const std::string G_OP_TYPE_FUSION_POOL_BN = "fusion_pool_bn";
-const std::string G_OP_TYPE_FUSION_ELEMENTWISE_ADD_RELU = "fusion_elementwise_add_relu";
-const std::string G_OP_TYPE_FUSION_FC_RELU = "fusion_fc_relu";
-const std::string G_OP_TYPE_REGION = "region";
+const char *G_OP_TYPE_CONV = "conv2d";
+const char *G_OP_TYPE_BATCHNORM = "batch_norm";
+const char *G_OP_TYPE_BOX_CODER = "box_coder";
+const char *G_OP_TYPE_CONCAT = "concat";
+const char *G_OP_TYPE_ELEMENTWISE_ADD = "elementwise_add";
+const char *G_OP_TYPE_FUSION_CONV_ADD_RELU = "fusion_conv_add_relu";
+const char *G_OP_TYPE_FUSION_CONV_ADD_BN_RELU = "fusion_conv_add_bn_relu";
+const char *G_OP_TYPE_FUSION_DWCONV_BN_RELU = "fusion_dwconv_bn_relu";
+const char *G_OP_TYPE_FUSION_CONV_BN_RELU = "fusion_conv_bn_relu";
+const char *G_OP_TYPE_FC = "fusion_fc";
+const char *G_OP_TYPE_FUSION_CONV_ADD = "fusion_conv_add";
+const char *G_OP_TYPE_LRN = "lrn";
+const char *G_OP_TYPE_MUL = "mul";
+const char *G_OP_TYPE_MULTICLASS_NMS = "multiclass_nms";
+const char *G_OP_TYPE_POOL2D = "pool2d";
+const char *G_OP_TYPE_PRIOR_BOX = "prior_box";
+const char *G_OP_TYPE_RELU = "relu";
+const char *G_OP_TYPE_RESHAPE = "reshape";
+const char *G_OP_TYPE_SIGMOID = "sigmoid";
+const char *G_OP_TYPE_SOFTMAX = "softmax";
+const char *G_OP_TYPE_TRANSPOSE = "transpose";
+const char *G_OP_TYPE_SPLIT = "split";
+const char *G_OP_TYPE_FEED = "feed";
+const char *G_OP_TYPE_FETCH = "fetch";
+const char *G_OP_TYPE_DEPTHWISE_CONV = "depthwise_conv2d";
+const char *G_OP_TYPE_IM2SEQUENCE = "im2sequence";
+const char *G_OP_TYPE_DROPOUT = "dropout";
+const char *G_OP_TYPE_FUSION_CONV_ADD_BN = "fusion_conv_add_bn";
+const char *G_OP_TYPE_FUSION_POOL_BN = "fusion_pool_bn";
+const char *G_OP_TYPE_FUSION_ELEMENTWISE_ADD_RELU = "fusion_elementwise_add_relu";
+const char *G_OP_TYPE_FUSION_FC_RELU = "fusion_fc_relu";
+const char *G_OP_TYPE_REGION = "region";

 std::unordered_map<
     std::string, std::pair<std::vector<std::string>, std::vector<std::string>>>
src/common/types.h
@@ -20,7 +20,9 @@ limitations under the License. */
 #include <vector>

 namespace paddle_mobile {
-enum class Precision : int { FP32 = 0 };
+enum class Precision : int { FP32 = 0, FP16 = 1 };
+
+typedef int16_t half;

 template <Precision p>
 struct PrecisionTrait {
@@ -31,6 +33,10 @@ template <>
 struct PrecisionTrait<Precision::FP32> {
   typedef float ptype;
 };
+template <>
+struct PrecisionTrait<Precision::FP16> {
+  typedef half ptype;
+};

 //! device type
 enum DeviceTypeEnum { kINVALID = -1, kCPU = 0, kFPGA = 1, kGPU_MALI = 2 };
@@ -73,40 +79,40 @@ enum PMStatus {
   PMWrongDevice = 0x08 /*!< un-correct device. */
 };

-extern const std::string G_OP_TYPE_CONV;
-extern const std::string G_OP_TYPE_BATCHNORM;
-extern const std::string G_OP_TYPE_BOX_CODER;
-extern const std::string G_OP_TYPE_CONCAT;
-extern const std::string G_OP_TYPE_ELEMENTWISE_ADD;
-extern const std::string G_OP_TYPE_FUSION_CONV_ADD_RELU;
-extern const std::string G_OP_TYPE_FC;
-extern const std::string G_OP_TYPE_FUSION_CONV_ADD;
-extern const std::string G_OP_TYPE_FUSION_CONV_ADD_BN_RELU;
-extern const std::string G_OP_TYPE_FUSION_DWCONV_BN_RELU;
-extern const std::string G_OP_TYPE_FUSION_CONV_BN_RELU;
-extern const std::string G_OP_TYPE_LRN;
-extern const std::string G_OP_TYPE_MUL;
-extern const std::string G_OP_TYPE_MULTICLASS_NMS;
-extern const std::string G_OP_TYPE_POOL2D;
-extern const std::string G_OP_TYPE_PRIOR_BOX;
-extern const std::string G_OP_TYPE_RELU;
-extern const std::string G_OP_TYPE_RESHAPE;
-extern const std::string G_OP_TYPE_SIGMOID;
-extern const std::string G_OP_TYPE_SOFTMAX;
-extern const std::string G_OP_TYPE_TRANSPOSE;
-extern const std::string G_OP_TYPE_SPLIT;
-extern const std::string G_OP_TYPE_FEED;
-extern const std::string G_OP_TYPE_FETCH;
-extern const std::string G_OP_TYPE_DEPTHWISE_CONV;
-extern const std::string G_OP_TYPE_IM2SEQUENCE;
-extern const std::string G_OP_TYPE_DROPOUT;
-extern const std::string G_OP_TYPE_FUSION_CONV_ADD_BN;
-extern const std::string G_OP_TYPE_FUSION_POOL_BN;
-extern const std::string G_OP_TYPE_FUSION_ELEMENTWISE_ADD_RELU;
-extern const std::string G_OP_TYPE_FUSION_FC_RELU;
-extern const std::string G_OP_TYPE_REGION;
+extern const char *G_OP_TYPE_CONV;
+extern const char *G_OP_TYPE_BATCHNORM;
+extern const char *G_OP_TYPE_BOX_CODER;
+extern const char *G_OP_TYPE_CONCAT;
+extern const char *G_OP_TYPE_ELEMENTWISE_ADD;
+extern const char *G_OP_TYPE_FUSION_CONV_ADD_RELU;
+extern const char *G_OP_TYPE_FC;
+extern const char *G_OP_TYPE_FUSION_CONV_ADD;
+extern const char *G_OP_TYPE_FUSION_CONV_ADD_BN_RELU;
+extern const char *G_OP_TYPE_FUSION_DWCONV_BN_RELU;
+extern const char *G_OP_TYPE_FUSION_CONV_BN_RELU;
+extern const char *G_OP_TYPE_LRN;
+extern const char *G_OP_TYPE_MUL;
+extern const char *G_OP_TYPE_MULTICLASS_NMS;
+extern const char *G_OP_TYPE_POOL2D;
+extern const char *G_OP_TYPE_PRIOR_BOX;
+extern const char *G_OP_TYPE_RELU;
+extern const char *G_OP_TYPE_RESHAPE;
+extern const char *G_OP_TYPE_SIGMOID;
+extern const char *G_OP_TYPE_SOFTMAX;
+extern const char *G_OP_TYPE_TRANSPOSE;
+extern const char *G_OP_TYPE_SPLIT;
+extern const char *G_OP_TYPE_FEED;
+extern const char *G_OP_TYPE_FETCH;
+extern const char *G_OP_TYPE_DEPTHWISE_CONV;
+extern const char *G_OP_TYPE_IM2SEQUENCE;
+extern const char *G_OP_TYPE_DROPOUT;
+extern const char *G_OP_TYPE_FUSION_CONV_ADD_BN;
+extern const char *G_OP_TYPE_FUSION_POOL_BN;
+extern const char *G_OP_TYPE_FUSION_ELEMENTWISE_ADD_RELU;
+extern const char *G_OP_TYPE_FUSION_FC_RELU;
+extern const char *G_OP_TYPE_REGION;

 extern std::unordered_map<
     std::string, std::pair<std::vector<std::string>, std::vector<std::string>>>
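A note on the std::string → const char * switch in the two files above (my inference; the commit itself gives no rationale): global std::string constants run constructors before main() and are exposed to the static-initialization-order problem when other globals consume them during startup, whereas a const char * bound to a string literal is resolved by the linker at no runtime cost. A minimal, self-contained sketch of the difference — all names here are hypothetical, not from the commit:

// init_order_sketch.cpp — illustration only; names are hypothetical.
#include <string>
#include <unordered_map>

// A global std::string runs a constructor before main(); another global that
// reads it during its own dynamic initialization depends on an unspecified
// cross-translation-unit initialization order.
const std::string kOpConvStr = "conv2d";

// A pointer to a string literal is filled in by the linker: no constructor,
// no destructor, and no ordering hazard.
const char *kOpConv = "conv2d";

// Lookup tables can still key on std::string; the char * converts on use.
std::unordered_map<std::string, int> MakeTable() { return {{kOpConv, 0}}; }

int main() { return MakeTable().count(kOpConv) == 1 ? 0 : 1; }

The op_input_output_key map declared at the end of both files keeps its std::string keys, so call sites are unaffected by the switch.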
src/fpga/api/fpga_api.cpp
@@ -29,15 +29,15 @@ limitations under the License. */
 #include "fpga/api/fpga_api.h"

-namespace paddle {
-namespace mobile {
+namespace paddle_mobile {
 namespace fpga {
 namespace api {

 static int fd = -1;
 static const char *device_path = "/dev/fpgadrv0";

-static inline int do_ioctl(int req, void *arg) { return ioctl(req, arg); }
+static inline int do_ioctl(int req, void *arg) {
+  return ioctl(req, (long unsigned int)arg);
+}

 int open_device() {
   if (fd == -1) {
@@ -48,8 +48,8 @@ int open_device() {

 // memory management;
 void *fpga_malloc(size_t size) {
-  return reinterpret_cast<(void *)> mmap64(NULL, size, PROT_READ | PROT_WRITE,
-                                           MAP_SHARED, fd, 0);
+  return reinterpret_cast<void *>(
+      mmap64(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0));
 }

 void fpga_free(void *ptr) { munmap(ptr, 0); }
@@ -58,7 +58,9 @@ void fpga_copy(void *dest, const void *src, size_t num) {
   memcpy(dest, src, num);
 }
 }  // namespace api
+int ComputeFpgaConv(struct FpgaConvArgs args) {}
+int ComputeFpgaPool(struct FpgaPoolArgs args) {}
+int ComputeFpgaEWAdd(struct FpgaEWAddArgs args) {}
 }  // namespace fpga
-}  // namespace mobile
-}  // namespace paddle
+}  // namespace paddle_mobile
src/fpga/api/fpga_api.h
@@ -14,44 +14,107 @@ limitations under the License. */
 #pragma once

 #include <stdint.h>
 #include <cstddef>
 #include <iostream>
 #include <limits>

 // memory management;

-namespace paddle {
-namespace mobile {
+namespace paddle_mobile {
 namespace fpga {
 namespace api {

 int open_device();
 int close_device();

 void *fpga_malloc(size_t size);
 void fpga_free(void *ptr);
 void fpga_copy(void *dst, const void *src, size_t num);

-struct CnnVersionArgs {
+struct FpgaVersionArgs {
   void *buf;
 };

-struct QuantArgs {
+struct MemoryToPhysicalArgs {
+  const void *src;
+  uint64_t physical;
+};
+
+struct MemoryCopyArgs {
+  void *src;
+  void *dst;
+  size_t size;
+};
+
+struct FpgaQuantArgs {
   float scale;
 };

-struct BatchNormalizationArgs {
-  bool enable;
+struct FpgaBNArgs {
+  bool enabled = false;
   void *bias_addr;
   void *scale_addr;
 };
+
+struct FpgaKernelArgs {
+  uint32_t width;
+  uint32_t height;
+  uint32_t stride_h;
+  uint32_t stride_w;
+};
+
+struct FpgaImageArgs {
+  uint32_t width;
+  uint32_t height;
+  uint32_t channels;
+  uint32_t pad_h;
+  uint32_t pad_w;
+};

 struct FpgaConvArgs {
   bool relu_enabled;
+  struct FpgaBNArgs BNargs;
   void *image_addr;
   void *filter_addr;
   void *bias_addr;
   void *output_addr;
+  float quant_scale;
+  struct FpgaImageArgs image;
   uint32_t filter_num;
   uint32_t group_num;
+  struct FpgaKernelArgs kernel;
 };

 struct FpgaPoolArgs {
   void *image_addr;
   void *output_addr;
+  struct FpgaImageArgs image;
+  struct FpgaKernelArgs kernel;
 };

 struct FpgaEWAddArgs {
   bool relu_enabled;
   void *image0_addr;
   void *image1_addr;
   void *result_addr;
+  uint32_t const0;
+  uint32_t const1;
+  uint32_t data_len;  // aligned element count
 };
+
+struct ScaleArgs {};
+
+int ComputeFpgaConv(struct FpgaConvArgs args);
+int ComputeFpgaPool(struct FpgaPoolArgs args);
+int ComputeFpgaEWAdd(struct FpgaEWAddArgs args);

-#define IOCTL_CNN_MAGIC 'CNN'
-#define IOCTL_VERSION _IOW(IOCTL_CNN_MAGIC, 1, struct CnnVersionArgs)
-#define IOCTL_GET_QUANT _IOW(IOCTL_CNN_MAGIC, 2, struct QuantArgs)
-#define IOCTL_SET_QUANT _IOW(IOCTL_CNN_MAGIC, 3, struct QuantArgs)
+#define IOCTL_FPGA_MAGIC 'CNN'
+#define IOCTL_VERSION _IOW(IOCTL_FPGA_MAGIC, 1, struct FpgaVersionArgs)
+#define IOCTL_GET_QUANT _IOW(IOCTL_FPGA_MAGIC, 2, struct FpgaQuantArgs)
+#define IOCTL_SET_QUANT _IOW(IOCTL_FPGA_MAGIC, 3, struct FpgaQuantArgs)
+#define IOCTL_MEM_COPY _IOW(IOCTL_FPGA_MAGIC, 11, struct MemoryCopyArgs)
+#define IOCTL_CONFIG_CONV _IOW(IOCTL_FPGA_MAGIC, 21, struct FpgaConvArgs)
+#define IOCTL_CONFIG_POOLING _IOW(IOCTL_FPGA_MAGIC, 22, struct FpgaPoolArgs)
+#define IOCTL_CONFIG_EW _IOW(IOCTL_FPGA_MAGIC, 23, struct FpgaEWAddArgs)

 }  // namespace api
 }  // namespace fpga
-}  // namespace mobile
-}  // namespace paddle
+}  // namespace paddle_mobile
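For context on the _IOW macros renamed above: user space encodes a magic value, a command number, and the payload type into a request code, then passes a pointer to the argument struct through ioctl(2). A hedged sketch of that call pattern (Linux-only; the single-character magic and the error handling are my simplifications — the header's multi-character 'CNN' magic is kept out of the sketch):

// ioctl_sketch.cpp — illustration only, not part of the commit.
#include <fcntl.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <cstdio>

struct FpgaQuantArgs {
  float scale;  // same payload shape as the header above
};
#define SKETCH_MAGIC 'F'
#define SKETCH_SET_QUANT _IOW(SKETCH_MAGIC, 3, struct FpgaQuantArgs)

int main() {
  // Device path taken from fpga_api.cpp.
  int fd = open("/dev/fpgadrv0", O_RDWR);
  if (fd < 0) {
    perror("open");
    return 1;
  }
  FpgaQuantArgs args = {0.5f};
  // The driver receives the request code plus a pointer to the struct.
  if (ioctl(fd, SKETCH_SET_QUANT, &args) < 0) {
    perror("ioctl");
  }
  close(fd);
  return 0;
}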
src/framework/program/program-optimize/fusion_op_register.h
@@ -14,11 +14,13 @@ limitations under the License. */
 #pragma once

+#include <algorithm>
 #include <map>
+#include <string>
 #include <vector>

 #include "framework/operator.h"
-#include "node.h"
+#include "framework/program/program-optimize/node.h"

 namespace paddle_mobile {
 namespace framework {
src/framework/tensor.h
@@ -16,14 +16,15 @@ limitations under the License. */

 #include <cstdint>
 #include <cstring>
+#include <fstream>
 #include <memory>
 #include <string>
 #include <type_traits>
 #include <typeindex>
 #include <vector>

-#include <fstream>
 #include "common/enforce.h"
+#include "common/types.h"
 #include "framework/data_layout.h"
 #include "framework/ddim.h"
 #include "memory/t_malloc.h"
@@ -63,7 +64,8 @@ struct SizeOfTypeFunctor<HEAD, TAIL...> {
 };

 static inline size_t SizeOfType(std::type_index type) {
-  SizeOfTypeFunctor<int, float, double, int16_t, int64_t, bool, size_t> functor;
+  SizeOfTypeFunctor<int, half, float, double, int16_t, int64_t, bool, size_t>
+      functor;
   size_t size = functor(type);
   PADDLE_MOBILE_ENFORCE(size != 0UL, "Cannot get size of type %s", type.name());
src/io/executor.cpp
@@ -187,7 +187,7 @@ void Executor<Dtype, P>::LoadMemory(const framework::VarDesc var_desc,
   memcpy(&max_value, *data + sizeof(float), sizeof(float));
   *data += 2 * sizeof(float);
   const float factor = (max_value - min_value) / 255.0;
-  uint8_t *uint8_data = (uint8_t *)(*data);
+  uint8_t *uint8_data = reinterpret_cast<uint8_t *>(*data);
   for (int k = 0; k < memory_size; ++k) {
     static_cast<float *>(memory)[k] = uint8_data[k] * factor + min_value;
   }
@@ -419,7 +419,7 @@ std::vector<typename Executor<Dtype, P>::Ptype> Executor<Dtype, P>::Predict(
 }

 template class Executor<CPU, Precision::FP32>;
-template class Executor<FPGA, Precision::FP32>;
 template class Executor<GPU_MALI, Precision::FP32>;
+template class Executor<FPGA, Precision::FP32>;

 }  // namespace paddle_mobile
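The LoadMemory hunk above is the 8-bit weight-dequantization path: after reading per-tensor min and max values, each stored byte q is mapped back to float as q · (max − min) / 255 + min. A standalone sketch of that affine mapping (function and variable names are mine, not the loader's):

// dequant_sketch.cpp — illustration only, not part of the commit.
#include <cmath>
#include <cstdint>
#include <vector>

// Affine uint8 -> float dequantization from the hunk above:
//   value = q * (max - min) / 255 + min
std::vector<float> Dequantize(const std::vector<uint8_t> &q, float min_value,
                              float max_value) {
  const float factor = (max_value - min_value) / 255.0f;
  std::vector<float> out(q.size());
  for (size_t k = 0; k < q.size(); ++k) {
    out[k] = q[k] * factor + min_value;
  }
  return out;
}

int main() {
  // 0 decodes to min and 255 decodes to max (up to float rounding).
  auto v = Dequantize({0, 128, 255}, -1.0f, 1.0f);
  return (std::fabs(v.front() + 1.0f) < 1e-5f &&
          std::fabs(v.back() - 1.0f) < 1e-5f)
             ? 0
             : 1;
}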
src/memory/t_malloc.cpp
@@ -27,17 +27,17 @@ namespace memory {
 const int MALLOC_ALIGN = 64;

 #ifdef PADDLE_MOBILE_FPGA
-namespace api = paddle::mobile::fpga::api;
+namespace fpga = paddle_mobile::fpga;

 void Copy(void *dst, const void *src, size_t num) {
   std::memcpy(dst, src, num);
 }

-void *Alloc(size_t size) { return api::malloc(size); }
+void *Alloc(size_t size) { return fpga::fpga_malloc(size); }

 void Free(void *ptr) {
   if (ptr) {
-    api::fpga_free(ptr);
+    fpga::fpga_free(ptr);
   }
 }
src/operators/concat_op.cpp
@@ -14,7 +14,9 @@ limitations under the License. */

 #ifdef CONCAT_OP

-#include "concat_op.h"
+#include <vector>
+
+#include "operators/concat_op.h"

 namespace paddle_mobile {
 namespace operators {
@@ -68,6 +70,7 @@ REGISTER_OPERATOR_CPU(concat, ops::ConcatOp);
 REGISTER_OPERATOR_MALI_GPU(concat, ops::ConcatOp);
 #endif
 #ifdef PADDLE_MOBILE_FPGA
+REGISTER_OPERATOR_FPGA(concat, ops::ConcatOp);
 #endif

 #endif
src/operators/concat_op.h
@@ -53,6 +53,7 @@ USE_OP_CPU(concat);
 USE_OP_MALI_GPU(concat);
 #endif
 #ifdef PADDLE_MOBILE_FPGA
+USE_OP_FPGA(concat);
 #endif

 #endif
src/operators/feed_op.h
@@ -29,7 +29,7 @@ class FeedOp : public framework::OperatorBase<DeviceType> {
          std::shared_ptr<framework::Scope> scope)
       : framework::OperatorBase<DeviceType>(type, inputs, outputs, attrs,
                                             scope),
-        param_(inputs, outputs, attrs, *scope) {}
+        param_(inputs, outputs, attrs, scope.get()) {}

   void RunImpl() const { param_.Out()->ShareDataWith(*param_.InputX()); }

   void Init() {}
src/operators/kernel/arm/dropout_kernel.cpp
@@ -14,8 +14,6 @@ limitations under the License. */

 #ifdef DROPOUT_OP

-#pragma once
-
 #include "operators/kernel/dropout_kernel.h"
 #include <operators/math/transform.h>
src/operators/kernel/dropout_kernel.h
@@ -17,7 +17,7 @@ limitations under the License. */
 #include "framework/operator.h"
 #include "operators/op_param.h"

-#pragma once;
+#pragma once

 namespace paddle_mobile {
 namespace operators {
src/operators/kernel/fpga/concat_kernel.cpp (new file, 0 → 100644)

/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#ifdef CONCAT_OP

#include "operators/kernel/concat_kernel.h"

namespace paddle_mobile {
namespace operators {

template <>
bool ConcatKernel<FPGA, half>::Init(ConcatParam *param) {
  return true;
}

template <>
void ConcatKernel<FPGA, half>::Compute(const ConcatParam &param) const {
  auto inputs = param.Inputs();
  auto *out = param.Out();
  int64_t axis = param.Axis();
  out->mutable_data<half>();

  DDim out_dim = out->dims();
  int pixels = out_dim[1] * out_dim[2];
  auto out_channel = out_dim[3];

  auto out_offset = 0;
  for (int i = 0; i < inputs.size(); ++i) {
    auto input = inputs[i];
    auto channels = input[3];
    out_offset += channels;
    auto src = input->data<half>();
    for (int j = 0; j < pixels; ++j) {
      auto dst = out->data<half>() + out_offset;
      memory::Copy(dst, src, sizeof(half));
    }
  }
}

}  // namespace operators
}  // namespace paddle_mobile

#endif
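The new FPGA kernel above concatenates along the channel axis of an NHWC layout: the output channel count is the sum of the input channel counts, and each input's channels land at a fixed per-pixel offset in the output. A plain-float CPU reference of that indexing scheme, as I read the loop structure (names are hypothetical and this is a sketch, not the repository's code; unlike the kernel above, this reference advances the channel offset only after copying each input):

// concat_sketch.cpp — illustration only, not part of the commit.
#include <cstring>
#include <vector>

// Reference channel-axis concat for NHWC tensors: `pixels` plays the role of
// H * W in the kernel above, and `channels[i]` is input i's channel count.
std::vector<float> ConcatChannels(
    const std::vector<std::vector<float>> &inputs,
    const std::vector<int> &channels, int pixels) {
  int out_channel = 0;
  for (int c : channels) out_channel += c;
  std::vector<float> out(static_cast<size_t>(pixels) * out_channel);
  int offset = 0;
  for (size_t i = 0; i < inputs.size(); ++i) {
    for (int j = 0; j < pixels; ++j) {
      // Copy input i's channel slice of pixel j to its offset in pixel j
      // of the output.
      std::memcpy(&out[j * out_channel + offset],
                  &inputs[i][j * channels[i]], channels[i] * sizeof(float));
    }
    offset += channels[i];
  }
  return out;
}

int main() {
  // Two pixels; input 0 has 2 channels, input 1 has 1 channel.
  auto out = ConcatChannels({{1, 2, 1, 2}, {3, 3}}, {2, 1}, 2);
  // Each output pixel becomes {1, 2, 3}.
  return (out[2] == 3 && out[5] == 3) ? 0 : 1;
}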
src/operators/kernel/im2sequence_kernel.h
@@ -20,13 +20,11 @@ limitations under the License. */
 #include "operators/math/vol2col.h"
 #include "operators/op_param.h"

-#pragma once
-;
-
+#pragma once

 namespace paddle_mobile {
 namespace operators {

 using namespace framework;
 template <typename DeviceType, typename T>
 class Im2SequenceKernel
     : public framework::OpKernelBase<DeviceType, Im2SequenceParam> {
src/operators/kernel/mali/fushion_fc_kernel.cpp
@@ -14,8 +14,6 @@ limitations under the License. */

 #ifdef FUSION_FC_OP

-#pragma once
-
 #include "operators/kernel/fusion_fc_kernel.h"

 namespace paddle_mobile {
src/operators/kernel/prelu_kernel.h
@@ -15,7 +15,7 @@ limitations under the License. */
 #include "framework/operator.h"
 #include "operators/op_param.h"

-#pragma once;
+#pragma once

 namespace paddle_mobile {
 namespace operators {
src/operators/kernel/scale_kernel.h
@@ -15,7 +15,7 @@ limitations under the License. */
 #include "framework/operator.h"
 #include "operators/op_param.h"

-#pragma once;
+#pragma once

 namespace paddle_mobile {
 namespace operators {
src/operators/kernel/slice_kernel.h
@@ -15,7 +15,7 @@ limitations under the License. */
 #include "framework/operator.h"
 #include "operators/op_param.h"

-#pragma once;
+#pragma once

 namespace paddle_mobile {
 namespace operators {
src/operators/math/gemm.cpp
@@ -92,8 +92,8 @@ void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb,
 */

 // Pack blocks of matrix A into contiguous memory (RowMajor)
-void PackMatrixA_(int m, int k, int m_tail, const float *A, int lda,
-                  float *buffer) {
+void PackMatrixA_4r(int m, int k, int m_tail, const float *A, int lda,
+                    float *buffer) {
   const float *a0, *a1, *a2, *a3;
   for (int i = 0; i < m - m_tail; i += MR) {
     a0 = A + i * lda;
@@ -131,9 +131,62 @@ void PackMatrixA_4r(int m, int k, int m_tail, const float *A, int lda,
   }
 }

+void PackMatrixA_6r(int m, int k, int m_tail, const float *A, int lda,
+                    float *buffer) {
+  const float *a0, *a1, *a2, *a3, *a4, *a5;
+  for (int i = 0; i < m - m_tail; i += MR) {
+    a0 = A + i * lda;
+    a1 = A + (i + 1) * lda;
+    a2 = A + (i + 2) * lda;
+    a3 = A + (i + 3) * lda;
+    a4 = A + (i + 4) * lda;
+    a5 = A + (i + 5) * lda;
+    for (int j = 0; j < k; ++j) {
+      *buffer++ = *a0++;
+      *buffer++ = *a1++;
+      *buffer++ = *a2++;
+      *buffer++ = *a3++;
+      *buffer++ = *a4++;
+      *buffer++ = *a5++;
+    }
+  }
+  int i = m - m_tail;
+  a0 = &A(i, 0);
+  a1 = a0 + lda;
+  a2 = a0 + 2 * lda;
+  a3 = a0 + 3 * lda;
+  a4 = a0 + 4 * lda;
+  a5 = a0 + 5 * lda;
+  if (m_tail != 0) {
+    if (m_tail <= 5) {
+      a5 = zero;
+    }
+    if (m_tail <= 4) {
+      a4 = zero;
+    }
+    if (m_tail <= 3) {
+      a3 = zero;
+    }
+    if (m_tail <= 2) {
+      a2 = zero;
+    }
+    if (m_tail <= 1) {
+      a1 = zero;
+    }
+    for (int j = 0; j < k; ++j) {
+      *buffer++ = *a0++;
+      *buffer++ = *a1++;
+      *buffer++ = *a2++;
+      *buffer++ = *a3++;
+      *buffer++ = *a4++;
+      *buffer++ = *a5++;
+    }
+  }
+}
+
 // Pack blocks of matrix B into contiguous memory (RowMajor)
-void PackMatrixB_(int k, int n, int n_tail, const float *B, int ldb,
-                  float *buffer) {
+void PackMatrixB_8c(int k, int n, int n_tail, const float *B, int ldb,
+                    float *buffer) {
   const float *b0;
   for (int j = 0; j < n - n_tail; j += NR) {
     for (int i = 0; i < k; ++i) {
@@ -188,7 +241,8 @@ void InnerKernel(int mc, int nc, float alpha, const float *a, const float *b,
   for (int j = 0; j < nc; j += NR) {
     for (int i = 0; i < mc; i += MR) {
       // AddDot4x4(KC, a + i * KC, b + j * KC, c + i * NC + j, NC);
-      AddDot4x8(KC, a + i * KC, b + j * KC, c + i * NC + j, NC);
+      // AddDot4x8(KC, a + i * KC, b + j * KC, c + i * NC + j, NC);
+      AddDot6x8(KC, a + i * KC, b + j * KC, c + i * NC + j, NC);
     }
   }
@@ -218,7 +272,8 @@ void InnerKernelWithBn(int mc, int nc, float alpha, const float *a,
   for (int j = 0; j < nc; j += NR) {
     for (int i = 0; i < mc; i += MR) {
       // AddDot4x4(KC, a + i * KC, b + j * KC, c + i * NC + j, NC);
-      AddDot4x8(KC, a + i * KC, b + j * KC, c + i * NC + j, NC);
+      // AddDot4x8(KC, a + i * KC, b + j * KC, c + i * NC + j, NC);
+      AddDot6x8(KC, a + i * KC, b + j * KC, c + i * NC + j, NC);
     }
   }
@@ -1868,22 +1923,22 @@ void Sgemm(int m, int n, int k, float alpha, const float *A, int lda,
            const float *B, int ldb, float beta, float *C, int ldc,
            bool relu) {
   // L1 data cache is 32 kib (Per Contex-A57, Contex-A72, Contex-A73)
   // L2 cache is 0.5~4 Mib (Contex-A72 cluster)
-  int L1 = 30 * 1024;
-  int L2 = 1 * 1024 * 1024;
+  int L1 = 32 * 1024;
+  int L2 = 0.5 * 1024 * 1024;

   KC = k;
-  MC = L2 / (2 * KC * sizeof(float));
-  NC = MC;
+  MC = L1 / (KC * sizeof(float));
+  NC = L2 / (KC * sizeof(float));

-  // make sure MC is multiple of 4, and NC is multiple of 8
+  // make sure MC is multiple of MR, and NC is multiple of NR
   int mblock_num = (m + MC - 1) / MC;
   MC = (m + mblock_num - 1) / mblock_num;
-  MC = (MC + 4 - 1) / 4 * 4;
+  MC = (MC + MR - 1) / MR * MR;
   // DLOG << "mblock_num = " << mblock_num << ", MC = " << MC << "\n";

   int nblock_num = (n + NC - 1) / NC;
   NC = (n + nblock_num - 1) / nblock_num;
-  NC = (NC + 8 - 1) / 8 * 8;
+  NC = (NC + NR - 1) / NR * NR;
   // DLOG << "nblock_num = " << nblock_num << ", NC = " << NC << "\n";

   packedA = static_cast<float *>(
@@ -1901,10 +1956,10 @@ void Sgemm(int m, int n, int k, float alpha, const float *A, int lda,
   int mc, nc;
   for (int j = 0; j < n; j += NC) {
     nc = s_min(n - j, NC);
-    PackMatrixB_(KC, nc, nc % NR, &B(0, j), ldb, packedB);
+    PackMatrixB_8c(KC, nc, nc % NR, &B(0, j), ldb, packedB);
     for (int i = 0; i < m; i += MC) {
       mc = s_min(m - i, MC);
-      PackMatrixA_(mc, KC, mc % MR, &A(i, 0), lda, packedA);
+      PackMatrixA_6r(mc, KC, mc % MR, &A(i, 0), lda, packedA);
       InnerKernel(mc, nc, alpha, packedA, packedB, beta, packedC, &C(i, j),
                   ldc, relu);
     }
@@ -1921,22 +1976,22 @@ void SgemmWithBn(int m, int n, int k, float alpha, const float *A, int lda,
                  bool relu, float *new_scale, float *new_bias) {
   // L1 data cache is 32 kib (Per Contex-A57, Contex-A72, Contex-A73)
   // L2 cache is 0.5~4 Mib (Contex-A72 cluster)
-  int L1 = 30 * 1024;
-  int L2 = 1 * 1024 * 1024;
+  int L1 = 32 * 1024;
+  int L2 = 0.5 * 1024 * 1024;

   KC = k;
-  MC = L2 / (2 * KC * sizeof(float));
-  NC = MC;
+  MC = L1 / (KC * sizeof(float));
+  NC = L2 / (KC * sizeof(float));

-  // make sure MC is multiple of 4, and NC is multiple of 8
+  // make sure MC is multiple of MR, and NC is multiple of NR
   int mblock_num = (m + MC - 1) / MC;
   MC = (m + mblock_num - 1) / mblock_num;
-  MC = (MC + 4 - 1) / 4 * 4;
+  MC = (MC + MR - 1) / MR * MR;
   // DLOG << "mblock_num = " << mblock_num << ", MC = " << MC << "\n";

   int nblock_num = (n + NC - 1) / NC;
   NC = (n + nblock_num - 1) / nblock_num;
-  NC = (NC + 8 - 1) / 8 * 8;
+  NC = (NC + NR - 1) / NR * NR;
   // DLOG << "nblock_num = " << nblock_num << ", NC = " << NC << "\n";

   packedA = static_cast<float *>(
@@ -1954,10 +2009,10 @@ void SgemmWithBn(int m, int n, int k, float alpha, const float *A, int lda,
   int mc, nc;
   for (int j = 0; j < n; j += NC) {
     nc = s_min(n - j, NC);
-    PackMatrixB_(KC, nc, nc % NR, &B(0, j), ldb, packedB);
+    PackMatrixB_8c(KC, nc, nc % NR, &B(0, j), ldb, packedB);
     for (int i = 0; i < m; i += MC) {
       mc = s_min(m - i, MC);
-      PackMatrixA_(mc, KC, mc % MR, &A(i, 0), lda, packedA);
+      PackMatrixA_6r(mc, KC, mc % MR, &A(i, 0), lda, packedA);
       InnerKernelWithBn(mc, nc, alpha, packedA, packedB, beta, packedC,
                         &C(i, j), ldc, relu, new_scale + i, new_bias + i);
     }
@@ -1969,6 +2024,221 @@ void SgemmWithBn(int m, int n, int k, float alpha, const float *A, int lda,
   paddle_mobile::memory::Free(zero);
 }

+void AddDot6x8(int k, const float *a, const float *b, float *c, int ldc) {
+#if __ARM_NEON
+#if __aarch64__
+  // init C
+  float32x4_t cv0 = vdupq_n_f32(0.0);
+  float32x4_t cv1 = vdupq_n_f32(0.0);
+  float32x4_t cv2 = vdupq_n_f32(0.0);
+  float32x4_t cv3 = vdupq_n_f32(0.0);
+  float32x4_t cv4 = vdupq_n_f32(0.0);
+  float32x4_t cv5 = vdupq_n_f32(0.0);
+  float32x4_t cv6 = vdupq_n_f32(0.0);
+  float32x4_t cv7 = vdupq_n_f32(0.0);
+  float32x4_t cv8 = vdupq_n_f32(0.0);
+  float32x4_t cv9 = vdupq_n_f32(0.0);
+  float32x4_t cv10 = vdupq_n_f32(0.0);
+  float32x4_t cv11 = vdupq_n_f32(0.0);
+
+  float32x4_t av;
+  float32x4_t bv0;
+  float32x4_t bv1;
+
+  float32x2_t av01;
+  float32x2_t av23;
+  float32x2_t av45;
+
+  for (int p = 0; p < k; p += 1) {
+    av = vld1q_f32(a);
+    av01 = vget_low_f32(av);
+    av23 = vget_high_f32(av);
+    av45 = vld1_f32(a + 4);
+    bv0 = vld1q_f32(b);
+    bv1 = vld1q_f32(b + 4);
+
+    cv0 = vmlaq_lane_f32(cv0, bv0, av01, 0);
+    cv1 = vmlaq_lane_f32(cv1, bv1, av01, 0);
+    cv2 = vmlaq_lane_f32(cv2, bv0, av01, 1);
+    cv3 = vmlaq_lane_f32(cv3, bv1, av01, 1);
+    cv4 = vmlaq_lane_f32(cv4, bv0, av23, 0);
+    cv5 = vmlaq_lane_f32(cv5, bv1, av23, 0);
+    cv6 = vmlaq_lane_f32(cv6, bv0, av23, 1);
+    cv7 = vmlaq_lane_f32(cv7, bv1, av23, 1);
+    cv8 = vmlaq_lane_f32(cv8, bv0, av45, 0);
+    cv9 = vmlaq_lane_f32(cv9, bv1, av45, 0);
+    cv10 = vmlaq_lane_f32(cv10, bv0, av45, 1);
+    cv11 = vmlaq_lane_f32(cv11, bv1, av45, 1);
+
+    a += MR;
+    b += NR;
+  }
+
+  vst1q_f32(c, cv0);
+  vst1q_f32(c + 4, cv1);
+  vst1q_f32(c + ldc, cv2);
+  vst1q_f32(c + ldc + 4, cv3);
+  vst1q_f32(c + 2 * ldc, cv4);
+  vst1q_f32(c + 2 * ldc + 4, cv5);
+  vst1q_f32(c + 3 * ldc, cv6);
+  vst1q_f32(c + 3 * ldc + 4, cv7);
+  vst1q_f32(c + 4 * ldc, cv8);
+  vst1q_f32(c + 4 * ldc + 4, cv9);
+  vst1q_f32(c + 5 * ldc, cv10);
+  vst1q_f32(c + 5 * ldc + 4, cv11);
+#else
+  const float *a_ptr, *b_ptr;
+  a_ptr = a;
+  b_ptr = b;
+  int kc1 = k / 4;
+  int kc2 = k % 4;
+  int step = 4 * ldc;
+  asm volatile(
+      "pld        [%[a_ptr]]            \n\t"
+      "pld        [%[b_ptr]]            \n\t"
+      "pld        [%[a_ptr], #64]       \n\t"
+      "pld        [%[b_ptr], #64]       \n\t"
+
+      "vmov.f32   q4,     #0.0          \n\t"
+      "vmov.f32   q5,     #0.0          \n\t"
+      "vmov.f32   q6,     #0.0          \n\t"
+      "vmov.f32   q7,     #0.0          \n\t"
+      "vmov.f32   q8,     #0.0          \n\t"
+      "vmov.f32   q9,     #0.0          \n\t"
+      "vmov.f32   q10,    #0.0          \n\t"
+      "vmov.f32   q11,    #0.0          \n\t"
+      "vmov.f32   q12,    #0.0          \n\t"
+      "vmov.f32   q13,    #0.0          \n\t"
+      "vmov.f32   q14,    #0.0          \n\t"
+      "vmov.f32   q15,    #0.0          \n\t"
+
+      "subs       %[kc1], %[kc1], #1    \n\t"
+      "blt        end_kc1_%=            \n\t"
+      "loop_kc1_%=:                     \n\t"
+
+      // "pld        [%[a_ptr], #128]       \n\t"
+      // "pld        [%[b_ptr], #128]       \n\t"
+      // "pld        [%[a_ptr], #192]       \n\t"
+      // "pld        [%[b_ptr], #192]       \n\t"
+
+      "vld1.32    {d0-d2}, [%[a_ptr]]!  \n\t"
+      "vld1.32    {q2, q3}, [%[b_ptr]]! \n\t"
+      "vmla.f32   q4, q2, d0[0]         \n\t"
+      "vmla.f32   q5, q3, d0[0]         \n\t"
+      "vmla.f32   q6, q2, d0[1]         \n\t"
+      "vmla.f32   q7, q3, d0[1]         \n\t"
+      "vmla.f32   q8, q2, d1[0]         \n\t"
+      "vmla.f32   q9, q3, d1[0]         \n\t"
+      "vmla.f32   q10, q2, d1[1]        \n\t"
+      "vmla.f32   q11, q3, d1[1]        \n\t"
+      "vmla.f32   q12, q2, d2[0]        \n\t"
+      "vmla.f32   q13, q3, d2[0]        \n\t"
+      "vmla.f32   q14, q2, d2[1]        \n\t"
+      "vmla.f32   q15, q3, d2[1]        \n\t"
+
+      "vld1.32    {d0-d2}, [%[a_ptr]]!  \n\t"
+      "vld1.32    {q2, q3}, [%[b_ptr]]! \n\t"
+      "vmla.f32   q4, q2, d0[0]         \n\t"
+      "vmla.f32   q5, q3, d0[0]         \n\t"
+      "vmla.f32   q6, q2, d0[1]         \n\t"
+      "vmla.f32   q7, q3, d0[1]         \n\t"
+      "vmla.f32   q8, q2, d1[0]         \n\t"
+      "vmla.f32   q9, q3, d1[0]         \n\t"
+      "vmla.f32   q10, q2, d1[1]        \n\t"
+      "vmla.f32   q11, q3, d1[1]        \n\t"
+      "vmla.f32   q12, q2, d2[0]        \n\t"
+      "vmla.f32   q13, q3, d2[0]        \n\t"
+      "vmla.f32   q14, q2, d2[1]        \n\t"
+      "vmla.f32   q15, q3, d2[1]        \n\t"
+
+      "vld1.32    {d0-d2}, [%[a_ptr]]!  \n\t"
+      "vld1.32    {q2, q3}, [%[b_ptr]]! \n\t"
+      "vmla.f32   q4, q2, d0[0]         \n\t"
+      "vmla.f32   q5, q3, d0[0]         \n\t"
+      "vmla.f32   q6, q2, d0[1]         \n\t"
+      "vmla.f32   q7, q3, d0[1]         \n\t"
+      "vmla.f32   q8, q2, d1[0]         \n\t"
+      "vmla.f32   q9, q3, d1[0]         \n\t"
+      "vmla.f32   q10, q2, d1[1]        \n\t"
+      "vmla.f32   q11, q3, d1[1]        \n\t"
+      "vmla.f32   q12, q2, d2[0]        \n\t"
+      "vmla.f32   q13, q3, d2[0]        \n\t"
+      "vmla.f32   q14, q2, d2[1]        \n\t"
+      "vmla.f32   q15, q3, d2[1]        \n\t"
+
+      "vld1.32    {d0-d2}, [%[a_ptr]]!  \n\t"
+      "vld1.32    {q2, q3}, [%[b_ptr]]! \n\t"
+      "vmla.f32   q4, q2, d0[0]         \n\t"
+      "vmla.f32   q5, q3, d0[0]         \n\t"
+      "vmla.f32   q6, q2, d0[1]         \n\t"
+      "vmla.f32   q7, q3, d0[1]         \n\t"
+      "vmla.f32   q8, q2, d1[0]         \n\t"
+      "vmla.f32   q9, q3, d1[0]         \n\t"
+      "vmla.f32   q10, q2, d1[1]        \n\t"
+      "vmla.f32   q11, q3, d1[1]        \n\t"
+      "vmla.f32   q12, q2, d2[0]        \n\t"
+      "vmla.f32   q13, q3, d2[0]        \n\t"
+      "vmla.f32   q14, q2, d2[1]        \n\t"
+      "vmla.f32   q15, q3, d2[1]        \n\t"
+
+      "subs       %[kc1], %[kc1], #1    \n\t"
+      "bge        loop_kc1_%=           \n\t"
+      "end_kc1_%=:                      \n\t"
+
+      "subs       %[kc2], %[kc2], #1    \n\t"
+      "blt        end_kc2_%=            \n\t"
+      "loop_kc2_%=:                     \n\t"
+      "vld1.32    {d0-d2}, [%[a_ptr]]!  \n\t"
+      "vld1.32    {q2, q3}, [%[b_ptr]]! \n\t"
+      "vmla.f32   q4, q2, d0[0]         \n\t"
+      "vmla.f32   q5, q3, d0[0]         \n\t"
+      "vmla.f32   q6, q2, d0[1]         \n\t"
+      "vmla.f32   q7, q3, d0[1]         \n\t"
+      "vmla.f32   q8, q2, d1[0]         \n\t"
+      "vmla.f32   q9, q3, d1[0]         \n\t"
+      "vmla.f32   q10, q2, d1[1]        \n\t"
+      "vmla.f32   q11, q3, d1[1]        \n\t"
+      "vmla.f32   q12, q2, d2[0]        \n\t"
+      "vmla.f32   q13, q3, d2[0]        \n\t"
+      "vmla.f32   q14, q2, d2[1]        \n\t"
+      "vmla.f32   q15, q3, d2[1]        \n\t"
+      "subs       %[kc2], %[kc2], #1    \n\t"
+      "bge        loop_kc2_%=           \n\t"
+      "end_kc2_%=:                      \n\t"
+
+      "mov        r5, %[c]              \n\t"
+      "mov        r6, %[step]           \n\t"
+      "vst1.32    {q4, q5},   [r5], r6  \n\t"
+      "vst1.32    {q6, q7},   [r5], r6  \n\t"
+      "vst1.32    {q8, q9},   [r5], r6  \n\t"
+      "vst1.32    {q10, q11}, [r5], r6  \n\t"
+      "vst1.32    {q12, q13}, [r5], r6  \n\t"
+      "vst1.32    {q14, q15}, [r5]      \n\t"
+      :
+      : [a_ptr] "r"(a_ptr), [b_ptr] "r"(b_ptr), [c] "r"(c), [kc1] "r"(kc1),
+        [kc2] "r"(kc2), [step] "r"(step)
+      : "memory", "r5", "r6", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+        "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
+#endif  // __aarch64__
+#else
+#endif  // __ARM_NEON
+}
+
 }  // namespace math
 }  // namespace operators
 }  // namespace paddle_mobile
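The constant changes in Sgemm and SgemmWithBn above retune the cache blocking: with KC = k, MC now sizes the packed A panel against a 32 KiB L1 (MC = L1 / (KC · sizeof(float))) and NC sizes the packed B panel against a 0.5 MiB L2 budget, after which both are rounded up to multiples of the new MR = 6 and NR = 8 micro-tile. A standalone sketch that just reproduces this arithmetic for one example shape (the cache numbers follow the diff's comments; the problem sizes are arbitrary and the sketch does not call into paddle-mobile):

// blocking_sketch.cpp — illustration only, not part of the commit.
#include <cstdio>

int main() {
  const int MR = 6, NR = 8;    // micro-tile from gemm.h
  const int L1 = 32 * 1024;    // 32 KiB L1 data cache
  const int L2 = 512 * 1024;   // 0.5 MiB L2 budget
  const int m = 1024, n = 1024, KC = 256;  // arbitrary example sizes

  int MC = static_cast<int>(L1 / (KC * sizeof(float)));  // A-panel rows in L1
  int NC = static_cast<int>(L2 / (KC * sizeof(float)));  // B-panel cols in L2

  // Split each dimension into equal blocks, then round the block up to a
  // multiple of the micro-tile so the packed panels have no ragged edge.
  int mblock_num = (m + MC - 1) / MC;
  MC = (m + mblock_num - 1) / mblock_num;
  MC = (MC + MR - 1) / MR * MR;

  int nblock_num = (n + NC - 1) / NC;
  NC = (n + nblock_num - 1) / nblock_num;
  NC = (NC + NR - 1) / NR * NR;

  printf("MC = %d, NC = %d\n", MC, NC);  // for these sizes: MC = 36, NC = 512
  return 0;
}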
src/operators/math/gemm.h
@@ -19,7 +19,7 @@ limitations under the License. */
 #define B(i, j) B[(i)*ldb + (j)]
 #define C(i, j) C[(i)*ldc + (j)]

-#define MR 4
+#define MR 6
 #define NR 8

 #define s_min(i, j) ((i) < (j) ? (i) : (j))
@@ -39,12 +39,14 @@ void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb,
 */

 // Pack blocks of matrix A into contiguous memory (RowMajor)
-void PackMatrixA_(int m, int k, int m_tail, const float *A, int lda,
-                  float *buffer);
+void PackMatrixA_4r(int m, int k, int m_tail, const float *A, int lda,
+                    float *buffer);
+void PackMatrixA_6r(int m, int k, int m_tail, const float *A, int lda,
+                    float *buffer);

 // Pack blocks of matrix B into contiguous memory (RowMajor)
-void PackMatrixB_(int k, int n, int n_tail, const float *B, int ldb,
-                  float *buffer);
+void PackMatrixB_8c(int k, int n, int n_tail, const float *B, int ldb,
+                    float *buffer);

 // Blocked matrix multiplication
 void InnerKernel(int mc, int nc, float alpha, const float *a, const float *b,
@@ -67,6 +69,7 @@ void VectorKernelWithBn(int m, int n, int k, float alpha, const float *A,
 // Compute a smaller block of the C matrix
 void AddDot4x4(int k, const float *a, const float *b, float *c, int ldc);
 void AddDot4x8(int k, const float *a, const float *b, float *c, int ldc);
+void AddDot6x8(int k, const float *a, const float *b, float *c, int ldc);

 // Write back the blocked matmul result
 // C = A * B
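Taken together, gemm.h now routes a Sgemm call through PackMatrixA_6r / PackMatrixB_8c into the AddDot6x8 micro-kernel. A hedged usage sketch against the signature shown in gemm.cpp — this assumes it is compiled inside the repository with gemm.cpp linked in, and that the alpha = 1, beta = 0 path is supported; it is not a standalone program:

// usage_sketch.cpp — illustration only, not part of the commit.
#include <vector>
#include "operators/math/gemm.h"

int main() {
  const int m = 6, n = 8, k = 4;  // one exact MR x NR tile, no tail
  std::vector<float> A(m * k, 1.0f), B(k * n, 1.0f), C(m * n, 0.0f);
  // C = 1.0 * A * B + 0.0 * C with no fused ReLU; since A and B are all
  // ones, every entry of C should come out as k.
  paddle_mobile::operators::math::Sgemm(m, n, k, 1.0f, A.data(), /*lda=*/k,
                                        B.data(), /*ldb=*/n, 0.0f, C.data(),
                                        /*ldc=*/n, /*relu=*/false);
  return C[0] == static_cast<float>(k) ? 0 : 1;
}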
src/operators/op_param.h
@@ -22,6 +22,9 @@ limitations under the License. */
 #include "framework/scope.h"
 #include "framework/tensor.h"
 #include "framework/variable.h"
+#ifdef PADDLE_MOBILE_FPGA
+#include "fpga/api/fpga_api.h"
+#endif

 namespace paddle_mobile {
 namespace operators {
@@ -256,6 +259,15 @@ class ElementwiseAddParam : OpParam {
   Tensor *input_y_;
   Tensor *out_;
   int axis_;
+#ifdef PADDLE_MOBILE_FPGA
+ private:
+  fpga::FpgaEWAddArgs fpga_EW_add_args;
+
+ public:
+  const fpga::FpgaEWAddArgs &FpgaArgs() const { return fpga_EW_add_args; }
+  void SetFpgaArgs(const fpga::FpgaEWAddArgs &args) {
+    fpga_EW_add_args = args;
+  }
+#endif
 };

 #ifdef FUSION_ELEMENTWISEADDRELU_OP
@@ -450,80 +462,15 @@ class PoolParam : public OpParam {
   vector<int> paddings_;
   bool ceil_mode_;
   bool global_pooling_ = false;
-};
-#endif
-
-#ifdef FUSION_POOLBN_OP
-class FusionPoolBNParam : OpParam {
- public:
-  FusionPoolBNParam(const VariableNameMap &inputs,
-                    const VariableNameMap &outputs, const AttributeMap &attrs,
-                    const Scope &scope) {
-    input_ = InputXFrom<LoDTensor>(inputs, scope);
-    pooling_type_ = GetAttr<string>("pooling_type", attrs);
-    ksize_ = GetAttr<vector<int>>("ksize", attrs);
-    strides_ = GetAttr<vector<int>>("strides", attrs);
-    paddings_ = GetAttr<vector<int>>("paddings", attrs);
-    ceil_mode_ = GetAttr<bool>("ceil_mode", attrs);
-    global_pooling_ = GetAttr<bool>("global_pooling", attrs);
-    output_y_ = OutputYFrom<LoDTensor>(outputs, scope);
-    input_bias_ = InputBiasFrom<LoDTensor>(inputs, scope);
-    input_mean_ = InputMeanFrom<LoDTensor>(inputs, scope);
-    input_scale_ = InputScaleFrom<LoDTensor>(inputs, scope);
-    input_variance_ = InputVarianceFrom<LoDTensor>(inputs, scope);
-    epsilon_ = GetAttr<float>("epsilon", attrs);
-    momentum_ = GetAttr<float>("momentum", attrs);
-    //    is_test_ = GetAttr<bool>("is_test", attrs);
-  }
-
-  const Tensor *Input() const { return input_; }
-  const string &PoolingType() const { return pooling_type_; }
-  const vector<int> &Ksize() const { return ksize_; }
-  const vector<int> &Strides() const { return strides_; }
-  const vector<int> &Paddings() const { return paddings_; }
-  bool isCeilMode() const { return ceil_mode_; }
-  bool isGlobalPooling() const { return global_pooling_; }
-  Tensor *OutputY() const { return output_y_; }
-  const Tensor *InputBias() const { return input_bias_; }
-  const Tensor *InputMean() const { return input_mean_; }
-  const Tensor *InputScale() const { return input_scale_; }
-  const Tensor *InputVariance() const { return input_variance_; }
-  const float &Epsilon() const { return epsilon_; }
-  const float &Momentum() const { return momentum_; }
-  const bool &IsTest() const { return is_test_; }
-  const string &DataFormat() const { return data_format_; }
-
- private:
-  Tensor *input_;
-  string pooling_type_;
-  vector<int> ksize_;
-  vector<int> strides_;
-  vector<int> paddings_;
-  bool ceil_mode_;
-  bool global_pooling_ = false;
-  Tensor *output_y_;
-  Tensor *input_bias_;
-  Tensor *input_mean_;
-  Tensor *input_scale_;
-  Tensor *input_variance_;
-  float epsilon_;
-  float momentum_;
-  bool is_test_;
-  string data_format_;
+#ifdef PADDLE_MOBILE_FPGA
+ private:
+  fpga::FpgaPoolArgs fpga_pool_args;
+
+ public:
+  const fpga::FpgaPoolArgs &FpgaArgs() const { return fpga_pool_args; }
+  void SetFpgaArgs(const fpga::FpgaPoolArgs &args) { fpga_pool_args = args; }
+#endif
 };
 #endif
@@ -704,10 +651,10 @@ class MultiClassNMSParam : public OpParam {
 class FeedParam : public OpParam {
  public:
   FeedParam(const VariableNameMap &inputs, const VariableNameMap &outputs,
-            const AttributeMap &attrs, Scope &scope) {
-    input_x_ = InputXFrom<LoDTensor>(inputs, scope);
-    out_ = OutFrom<LoDTensor>(outputs, scope);
-    auto var = scope.Var("batch_size");
+            const AttributeMap &attrs, Scope *scope) {
+    input_x_ = InputXFrom<LoDTensor>(inputs, *scope);
+    out_ = OutFrom<LoDTensor>(outputs, *scope);
+    auto var = scope->Var("batch_size");
     batch_size = var->GetValue<int>();
   }
   const Tensor *InputX() const { return input_x_; }
@@ -983,6 +930,15 @@ class FusionFcParam : public OpParam {
   int x_num_col_dims_;
   int y_num_col_dims_;
   int axis_;
+#ifdef PADDLE_MOBILE_FPGA
+ private:
+  fpga::FpgaConvArgs fpga_conv_args;
+
+ public:
+  const fpga::FpgaConvArgs &FpgaArgs() const { return fpga_conv_args; }
+  void SetFpgaArgs(const fpga::FpgaConvArgs &args) { fpga_conv_args = args; }
+#endif
 };

 #ifdef FUSION_FCRELU_OP
@@ -1032,6 +988,15 @@ class FusionConvAddParam : public OpParam {
   vector<int> paddings_;
   vector<int> dilations_;
   int groups;
+#ifdef PADDLE_MOBILE_FPGA
+ private:
+  fpga::FpgaConvArgs fpga_conv_args;
+
+ public:
+  const fpga::FpgaConvArgs &FpgaArgs() const { return fpga_conv_args; }
+  void SetFpgaArgs(const fpga::FpgaConvArgs &args) { fpga_conv_args = args; }
+#endif
 };

 Print &operator<<(Print &printer, const FusionConvAddParam &conv_param);
@@ -1128,6 +1093,15 @@ class FusionConvAddBNReluParam : public OpParam {
   bool is_test_;
   Tensor *new_bias_;
   Tensor *new_scale_;
+#ifdef PADDLE_MOBILE_FPGA
+ private:
+  fpga::FpgaConvArgs fpga_conv_args;
+
+ public:
+  const fpga::FpgaConvArgs &FpgaArgs() const { return fpga_conv_args; }
+  void SetFpgaArgs(const fpga::FpgaConvArgs &args) { fpga_conv_args = args; }
+#endif
 };
 #endif
@@ -1213,6 +1187,15 @@ class FusionConvAddBNParam : public OpParam {
   bool is_test_;
   Tensor *new_bias_;
   Tensor *new_scale_;
+#ifdef PADDLE_MOBILE_FPGA
+ private:
+  fpga::FpgaConvArgs fpga_conv_args;
+
+ public:
+  const fpga::FpgaConvArgs &FpgaArgs() const { return fpga_conv_args; }
+  void SetFpgaArgs(const fpga::FpgaConvArgs &args) { fpga_conv_args = args; }
+#endif
 };
 #endif
@@ -1426,9 +1409,5 @@ class DropoutParam : public OpParam {
 };
 #endif

-#ifdef REGION_OP
-class RegionParam : public OpParam {};
-#endif
-
 }  // namespace operators
 }  // namespace paddle_mobile
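Each #ifdef PADDLE_MOBILE_FPGA block added to the param classes above follows one pattern: a private fpga::Fpga*Args member, a SetFpgaArgs() that a kernel's Init() can fill once, and a FpgaArgs() accessor that Compute() reads back. A minimal sketch of that pattern in isolation (DemoParam and its use are hypothetical; only the member/accessor shape mirrors the diff):

// fpga_args_sketch.cpp — illustration only, not part of the commit.
struct FpgaConvArgs {
  bool relu_enabled;  // stand-in for the full struct in fpga_api.h
};

class DemoParam {
 private:
  FpgaConvArgs fpga_conv_args;

 public:
  const FpgaConvArgs &FpgaArgs() const { return fpga_conv_args; }
  void SetFpgaArgs(const FpgaConvArgs &args) { fpga_conv_args = args; }
};

int main() {
  DemoParam param;
  FpgaConvArgs args = {true};  // filled once, e.g. in a kernel's Init()
  param.SetFpgaArgs(args);
  // Later, Compute() hands the precomputed arguments to the driver.
  return param.FpgaArgs().relu_enabled ? 0 : 1;
}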
src/operators/resize_op.h
@@ -33,7 +33,7 @@ class ResizeOp
           DeviceType, ResizeParam, operators::ResizeKernel<DeviceType, T>> {
  public:
   ResizeOp(const std::string &type, const VariableNameMap &inputs,
-           const VariableNameMap &outputs, const framework::AttributeMap attrs,
+           const VariableNameMap &outputs, const framework::AttributeMap &attrs,
            std::shared_ptr<framework::Scope> scope)
       : framework::OperatorWithKernel<DeviceType, ResizeParam,
                                       operators::ResizeKernel<DeviceType, T>>(
test/CMakeLists.txt
@@ -145,6 +145,10 @@ else ()
     ADD_EXECUTABLE(test-conv-add-relu-op operators/test_conv_add_relu_op.cpp test_helper.h test_include.h executor_for_test.h)
     target_link_libraries(test-conv-add-relu-op paddle-mobile)

+    # gen test
+    ADD_EXECUTABLE(test-conv-add-bn-relu-op operators/test_fusion_conv_add_bn_relu_op.cpp test_helper.h test_include.h executor_for_test.h)
+    target_link_libraries(test-conv-add-bn-relu-op paddle-mobile)
+
     #add_library(test-lib-size SHARED common/test_lib_size.h common/test_lib_size.cpp)
 endif ()
test/common/test_gemm.cpp
@@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

+#include <cstdlib>
+#include <ctime>
 #include <iostream>
 #include "../test_helper.h"
 #include "common/log.h"
@@ -20,13 +22,21 @@ limitations under the License. */
 #define a(i, j) a[(i)*lda + (j)]
 #define b(i, j) b[(i)*ldb + (j)]
 #define c(i, j) c[(i)*ldc + (j)]
 #define c1(i, j) c1[(i)*ldc + (j)]

-#define m 62
-#define n 63
-#define k 74
+void print_matirx(int m, int n, int ldc, float *c) {
+  for (int i = 0; i < m; ++i) {
+    std::cout << c(i, 0);
+    for (int j = 1; j < n; ++j) {
+      std::cout << " | " << c(i, j);
+    }
+    std::cout << std::endl;
+  }
+  std::cout << std::endl;
+}

-int main() {
+int do_sgemm(int m, int n, int k, bool relu, int t1, int t2, int pr) {
   int lda = k;
   int ldb = n;
   int ldc = n;
@@ -39,44 +49,88 @@ int do_sgemm(int m, int n, int k, bool relu, int t1, int t2, int pr) {
       static_cast<float *>(paddle_mobile::memory::Alloc(sizeof(float) * m * n));
   float *c1 =
       static_cast<float *>(paddle_mobile::memory::Alloc(sizeof(float) * m * n));
+  float *scale =
+      static_cast<float *>(paddle_mobile::memory::Alloc(sizeof(float) * m));
+  float *bias =
+      static_cast<float *>(paddle_mobile::memory::Alloc(sizeof(float) * m));

+  srand(unsigned(time(0)));
   for (int i = 0; i < m * k; ++i) {
-    a[i] = 2;
+    a[i] = t1 + rand() % t2;
   }
   for (int i = 0; i < k * n; ++i) {
-    b[i] = 2;
+    b[i] = t1 + rand() % t2;
   }
-  for (int i = 0; i < m * n; ++i) {
-    c[i] = 2;
-    c1[i] = 2;
+  for (int i = 0; i < m; ++i) {
+    scale[i] = t1 + rand() % t2;
   }
+  for (int i = 0; i < m; ++i) {
+    bias[i] = t1 + rand() % t2;
+  }

-  auto time1 = time();
-  // paddle_mobile::operators::math::Sgemm(m, n, k, 0.9, a, lda, b, ldb, 0.3,
-  // c,
-  //                                       ldc);
-  auto time2 = time();
-  DLOG << "gemm cost :" << time_diff(time1, time2) << "ms\n";
-  for (int i = 0; i < m * n; ++i) {
-    std::cout << c[i] << " | ";
-    if (i % n == (n - 1)) {
-      std::cout << std::endl;
-    }
-  }
-  for (int j = 0; j < n; ++j) {
-    for (int i = 0; i < m; ++i) {
-      c1(i, j) *= 0.3;
-      for (int p = 0; p < k; ++p) {
-        c1(i, j) += 0.9 * a(i, p) * b(p, j);
-      }
-    }
-  }
-  std::cout << "正确结果对比:" << std::endl;
+  for (int i = 0; i < m; ++i) {
+    for (int j = 0; j < n; ++j) {
+      float r = 0;
+      for (int p = 0; p < k; p++) {
+        r += a(i, p) * b(p, j);
+      }
+      r *= scale[i];
+      r += bias[i];
+      if (relu && (r < 0)) {
+        r = 0;
+      }
+      c1(i, j) = r;
+    }
+  }
+
+  paddle_mobile::operators::math::SgemmWithBn(m, n, k, 0.9, a, lda, b, ldb,
+                                              0.3, c, ldc, relu, scale, bias);
+  int eq = 0;
+  int neq = 0;
   for (int i = 0; i < m * n; ++i) {
-    std::cout << c1[i] << " | ";
-    if (i % n == (n - 1)) {
-      std::cout << std::endl;
+    if (static_cast<int>(c[i]) == static_cast<int>(c1[i])) {
+      ++eq;
+    } else {
+      ++neq;
     }
   }
+
+  if (pr > 0) {
+    std::cout << "A:" << std::endl;
+    print_matirx(m, k, lda, a);
+    std::cout << "B:" << std::endl;
+    print_matirx(k, n, ldb, b);
+    std::cout << "C:" << std::endl;
+    print_matirx(m, n, ldc, c);
+    std::cout << "C1:" << std::endl;
+    print_matirx(m, n, ldc, c1);
+  }
+
+  std::cout << "mnk=" << m << " " << n << " " << k << " relu=" << relu
+            << "   eq=" << eq << " neq=" << neq << std::endl;
+
+  paddle_mobile::memory::Free(a);
+  paddle_mobile::memory::Free(b);
+  paddle_mobile::memory::Free(c);
+  paddle_mobile::memory::Free(c1);
+  paddle_mobile::memory::Free(scale);
+  paddle_mobile::memory::Free(bias);
+
+  return 0;
+}
+
+int main() {
+  do_sgemm(9, 9, 9, true, 10, 10, 10);
+  do_sgemm(10, 6, 12, false, 10, 10, 0);
+  do_sgemm(512, 256, 384, false, 10, 10, 0);
+  do_sgemm(1366, 768, 256, false, 10, 10, 0);
+  do_sgemm(1255, 755, 333, false, 10, 10, 0);
+  do_sgemm(555, 777, 999, false, 10, 10, 0);
+
+  do_sgemm(10, 6, 12, true, -4, 10, 0);
+  do_sgemm(512, 256, 384, true, -4, 10, 0);
+  do_sgemm(1366, 768, 256, true, -4, 10, 0);
+  do_sgemm(1255, 755, 333, true, -4, 10, 0);
+  do_sgemm(555, 777, 999, true, -4, 10, 0);
+
+  return 0;
+}
test/executor_for_test.h
@@ -43,7 +43,7 @@ template <typename DeviceType, typename OpType>
 class Executor4Test : public Executor<DeviceType> {
  public:
   Executor4Test(Program<DeviceType> p, string op_type,
-                bool use_optimize = false)
+                bool use_optimize = false, int predict_op_count = 1)
       : Executor<DeviceType>() {
     this->use_optimize_ = use_optimize;
     this->program_ = p;
@@ -57,12 +57,14 @@ class Executor4Test : public Executor<DeviceType> {
       LOG(paddle_mobile::LogLevel::kLOG_ERROR)
           << "to_predict_program_ == nullptr";
     }
     const std::vector<std::shared_ptr<BlockDesc>> blocks =
         this->to_predict_program_->Blocks();
     for (std::shared_ptr<BlockDesc> block_desc : blocks) {
       std::vector<std::shared_ptr<OpDesc>> ops = block_desc->Ops();
-      for (std::shared_ptr<OpDesc> op : ops) {
-        if (op->Type() == op_type) {
+      for (int i = 0; i < ops.size(); ++i) {
+        auto op = ops[i];
+        if (op->Type() == op_type && i < predict_op_count) {
           DLOG << "匹配到: " << op->Type();

           /// test first meeting op in program
@@ -72,11 +74,17 @@ class Executor4Test : public Executor<DeviceType> {
               op->Type(), op->GetInputs(), op->GetOutputs(), op->GetAttrMap(),
               this->program_.scope);
           this->ops_of_block_[*block_desc.get()].push_back(op_ptr);
-          break;
         }
       }
     }

     this->InitMemory();
+
+    std::shared_ptr<paddle_mobile::framework::BlockDesc> to_predict_block =
+        this->to_predict_program_->Block(0);
+    auto &ops = this->ops_of_block_[*to_predict_block.get()];
+    for (const auto &op : ops) {
+      op->Init();
+    }
   }

   template <typename T = LoDTensor>
@@ -130,9 +138,6 @@ class Executor4Test : public Executor<DeviceType> {
     auto *output_tensor = con_output->GetMutable<LoDTensor>();
     output_tensor->mutable_data<float>(dDim);

-    std::shared_ptr<Tensor> out_tensor = std::make_shared<LoDTensor>();
-    out_tensor.reset(output_tensor);
-
     std::shared_ptr<paddle_mobile::framework::BlockDesc> to_predict_block =
         this->to_predict_program_->Block(0);
     for (int j = 0; j < this->ops_of_block_[*to_predict_block.get()].size();
@@ -141,6 +146,7 @@ class Executor4Test : public Executor<DeviceType> {
       op->Run();
     }

-    return out_tensor;
+    return std::make_shared<paddle_mobile::framework::Tensor>(
+        paddle_mobile::framework::Tensor(*output_tensor));
   }
 };
test/framework/test_load.cpp
@@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

+#include <string>
+
 #include "../test_helper.h"
 #include "io/loader.h"
@@ -20,12 +22,10 @@ int main() {
   //  ../../../test/models/googlenet
   //  ../../../test/models/mobilenet
   //  auto program = loader.Load(g_googlenet, true);
-  auto program = loader.Load(g_mobilenet_ssd, true);
+  //  auto program = loader.Load(g_mobilenet_ssd, true);
   //  auto program = loader.Load(g_googlenet_combine + "/model",
   //  g_googlenet_combine +
   //  "/params", true);
+  auto program = loader.Load(std::string(g_ocr) + "/model",
+                             std::string(g_ocr) + "/params", false);
   //  program.originProgram->Description("program desc: ");
   return 0;
 }
test/net/test_mobilenet+ssd.cpp
@@ -20,22 +20,20 @@ int main() {
   paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile;
   paddle_mobile.SetThreadNum(4);
   auto time1 = time();
-  auto isok = paddle_mobile.Load(g_mobilenet_ssd_gesture + "/model",
-                                 g_mobilenet_ssd_gesture + "/params", true);
+  auto isok =
+      paddle_mobile.Load(std::string(g_mobilenet_ssd_gesture) + "/model",
+                         std::string(g_mobilenet_ssd_gesture) + "/params",
+                         true);
   //  auto isok = paddle_mobile.Load(g_mobilenet_ssd, false);
   if (isok) {
     auto time2 = time();
     std::cout << "load cost :" << time_diff(time1, time2) << "ms" << std::endl;

     std::vector<int64_t> dims{1, 3, 300, 300};
-    Tensor input_tensor;
-    SetupTensor<float>(&input_tensor, {1, 3, 300, 300}, static_cast<float>(0),
-                       static_cast<float>(1));
-    std::vector<float> input(input_tensor.data<float>(),
-                             input_tensor.data<float>() + input_tensor.numel());
+    std::vector<float> input;
+    GetInput<float>(g_hand, &input, dims);

     auto time3 = time();
-    paddle_mobile.Predict(input, dims);
+    auto output = paddle_mobile.Predict(input, dims);
     auto time4 = time();
     std::cout << "predict cost :" << time_diff(time3, time4) << "ms" << std::endl;
test/net/test_mobilenet.cpp
@@ -24,19 +24,21 @@ int main() {
     auto time2 = time();
     std::cout << "load cost :" << time_diff(time1, time1) << "ms" << std::endl;

-    Tensor input_tensor;
-    SetupTensor<float>(&input_tensor, {1, 3, 224, 224}, static_cast<float>(0),
-                       static_cast<float>(1));
-    std::vector<float> input(input_tensor.data<float>(),
-                             input_tensor.data<float>() + input_tensor.numel());
-    auto time3 = time();
-    auto vec_result = paddle_mobile.Predict(input, dims);
-    auto time4 = time();
-    std::cout << "predict cost :" << time_diff(time3, time4) << "ms" << std::endl;
+    std::vector<float> input;
+    std::vector<int64_t> dims{1, 3, 224, 224};
+    GetInput<float>(g_test_image_1x3x224x224, &input, dims);
+
+    for (int i = 0; i < 10; ++i) {
+      auto time3 = time();
+      auto vec_result = paddle_mobile.Predict(input, dims);
+      auto time4 = time();
+      std::vector<float>::iterator biggest =
+          std::max_element(std::begin(vec_result), std::end(vec_result));
+      std::cout << " Max element is " << *biggest << " at position "
+                << std::distance(std::begin(vec_result), biggest) << std::endl;
+      std::cout << "predict cost :" << time_diff(time3, time4) << "ms"
+                << std::endl;
+    }
   }
   return 0;
test/operators/test_fusion_conv_add_bn_relu_op.cpp (new file, 0 → 100644)

/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "../test_include.h"
#include "operators/fusion_conv_add_bn_relu_op.h"

int main() {
  paddle_mobile::Loader<paddle_mobile::CPU> loader;
  //  ../models/image_classification_resnet.inference.model
  auto program = loader.Load(g_mobilenet, true);

  PADDLE_MOBILE_ENFORCE(program.originProgram != nullptr,
                        "program file read fail");

  Executor4Test<paddle_mobile::CPU,
                paddle_mobile::operators::FusionConvAddBNReluOp<
                    paddle_mobile::CPU, float>>
      executor(program, "fusion_conv_add_bn_relu", true);

  std::cout << "executor 4 test: " << std::endl;

  paddle_mobile::framework::Tensor input;
  GetInput<float>(g_test_image_1x3x224x224_banana, &input, {1, 3, 224, 224});

  //  // use SetupTensor if not has local input image .
  //  SetupTensor<float>(&input, {1, 3, 224, 224}, static_cast<float>(0),
  //                     static_cast<float>(1));

  DLOG << " fuck: " << input;

  auto out_ddim = paddle_mobile::framework::make_ddim({1, 32, 112, 112});
  std::cout << "before predict: " << std::endl;
  auto output =
      executor.Predict(input, "data", "conv2_1_dw_bn.tmp_2", out_ddim);
  std::cout << "after predict " << std::endl;
  auto output_ptr = output->data<float>();

  int stride = output->numel() / 100;
  for (int i = 0; i < 100; i++) {
    DLOG << " index:" << i * stride << " value: " << output_ptr[i * stride];
  }

  //  for (int i = 0; i < 100; i++) {
  //    DLOG << " index:" << i << " value: "<< output_ptr[i];
  //  }

  //  for (int j = 0; j < output->numel(); ++j) {
  //    std::cout << " (index: " << j << " value: " << output_ptr[j] << ") ";
  //  }
  std::cout << std::endl;
  return 0;
}
test/test_helper.h
@@ -24,18 +24,22 @@ limitations under the License. */
 #include "framework/ddim.h"
 #include "framework/tensor.h"

-static const std::string g_mobilenet_ssd = "../models/mobilenet+ssd";
-static const std::string g_mobilenet_ssd_gesture =
-    "../models/mobilenet+ssd_gesture";
-static const std::string g_squeezenet = "../models/squeezenet";
-static const std::string g_googlenet = "../models/googlenet";
-static const std::string g_mobilenet = "../models/mobilenet";
-static const std::string g_resnet_50 = "../models/resnet_50";
-static const std::string g_resnet = "../models/resnet";
-static const std::string g_googlenet_combine = "../models/googlenet_combine";
-static const std::string g_yolo = "../models/yolo";
-static const std::string g_test_image_1x3x224x224 =
-    "../images/test_image_1x3x224x224_float";
+static const char *g_ocr = "../models/ocr";
+static const char *g_mobilenet_ssd = "../models/mobilenet+ssd";
+static const char *g_mobilenet_ssd_gesture = "../models/mobilenet+ssd_gesture";
+static const char *g_squeezenet = "../models/squeezenet";
+static const char *g_googlenet = "../models/googlenet";
+static const char *g_mobilenet = "../models/mobilenet";
+static const char *g_resnet_50 = "../models/resnet_50";
+static const char *g_resnet = "../models/resnet";
+static const char *g_googlenet_combine = "../models/googlenet_combine";
+static const char *g_yolo = "../models/yolo";
+static const char *g_test_image_1x3x224x224 =
+    "../images/test_image_1x3x224x224_float";
+static const char *g_test_image_1x3x224x224_banana =
+    "../images/input_3x224x224_banana";
+static const char *g_hand = "../images/hand_image";

 using paddle_mobile::framework::DDim;
 using paddle_mobile::framework::Tensor;
tools/android-cmake/android.toolchain.cmake
@@ -65,6 +65,8 @@ endif()
 file(TO_CMAKE_PATH "${ANDROID_NDK}" ANDROID_NDK)

 # Android NDK revision
+message("${ANDROID_NDK}")
+
 file(READ "${ANDROID_NDK}/source.properties" ANDROID_NDK_SOURCE_PROPERTIES)
 set(ANDROID_NDK_SOURCE_PROPERTIES_REGEX
     "^Pkg\\.Desc = Android NDK\nPkg\\.Revision = ([0-9]+)\\.")
@@ -159,7 +161,7 @@ endif()

 # Default values for configurable variables.
 if(NOT ANDROID_TOOLCHAIN)
-  set(ANDROID_TOOLCHAIN clang)
+  set(ANDROID_TOOLCHAIN gcc)
 endif()
 if(NOT ANDROID_ABI)
   set(ANDROID_ABI armeabi-v7a)
tools/build.sh
@@ -63,7 +63,7 @@ build_for_android() {
     TOOLCHAIN_FILE="./tools/android-cmake/android.toolchain.cmake"
     ANDROID_ARM_MODE="arm"
-    if [ "${#NETS}" > 1 ]; then
+    if [ "${#NETS}" -gt 1 ]; then
         cmake .. \
             -B"../build/release/${PLATFORM}" \
             -DANDROID_ABI="${ABI}" \
@@ -99,7 +99,7 @@ build_for_ios() {
     BUILD_DIR=../build/release/"${PLATFORM}"/
     TOOLCHAIN_FILE="./tools/ios-cmake/ios.toolchain.cmake"
    mkdir -p "${BUILD_DIR}"
-    if [ "${#NETS}" > 1 ]; then
+    if [ "${#NETS}" -gt 1 ]; then
         cmake .. \
             -B"${BUILD_DIR}" \
             -DCMAKE_BUILD_TYPE="${MODE}" \
tools/op.cmake
@@ -75,11 +75,9 @@ if ("FPGAnets" IN_LIST NET)
     set(FUSION_CONVADDRELU_OP ON)
     set(FUSION_CONVADDBNRELU_OP ON)
     set(FUSION_CONVADDBN_OP ON)
-    set(FUSION_POOLBN_OP ON)
     set(FUSION_ELEMENTWISEADDRELU_OP ON)
     set(FUSION_FC_OP ON)
     set(FUSION_FCRELU_OP ON)
-    set(REGION_OP ON)
     set(POOL_OP ON)
     set(CONCAT_OP ON)
     set(SOFTMAX_OP ON)