Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
Paddle-Lite
提交
8aac7d9d
P
Paddle-Lite
项目概览
PaddlePaddle
/
Paddle-Lite
通知
338
Star
4
Fork
1
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
271
列表
看板
标记
里程碑
合并请求
78
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle-Lite
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
271
Issue
271
列表
看板
标记
里程碑
合并请求
78
合并请求
78
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
8aac7d9d
编写于
10月 15, 2018
作者:
X
xiebaiyuan
提交者:
GitHub
10月 15, 2018
浏览文件
操作
浏览文件
下载
差异文件
Merge branch 'develop' into dev-latest
上级
23858ac2
a2ab5734
变更
29
展开全部
隐藏空白更改
内联
并排
Showing
29 changed file
with
729 addition
and
368 deletion
+729
-368
src/common/variant.h
src/common/variant.h
+31
-4
src/fpga/api.cpp
src/fpga/api.cpp
+16
-2
src/fpga/api.h
src/fpga/api.h
+3
-2
src/fpga/filter.cpp
src/fpga/filter.cpp
+39
-0
src/fpga/filter.h
src/fpga/filter.h
+6
-1
src/framework/attribute.h
src/framework/attribute.h
+9
-2
src/framework/ddim.h
src/framework/ddim.h
+1
-0
src/framework/variable.h
src/framework/variable.h
+7
-0
src/io/api_paddle_mobile.cc
src/io/api_paddle_mobile.cc
+5
-0
src/io/api_paddle_mobile.h
src/io/api_paddle_mobile.h
+2
-2
src/io/paddle_inference_api.h
src/io/paddle_inference_api.h
+3
-1
src/operators/kernel/fpga/fc_relu_kernel.cpp
src/operators/kernel/fpga/fc_relu_kernel.cpp
+1
-1
src/operators/kernel/fpga/fusion_fc_kernel.cpp
src/operators/kernel/fpga/fusion_fc_kernel.cpp
+1
-1
src/operators/kernel/fpga/mul_kernel.cpp
src/operators/kernel/fpga/mul_kernel.cpp
+70
-0
src/operators/math/gemm.cpp
src/operators/math/gemm.cpp
+187
-176
src/operators/math/gemm.h
src/operators/math/gemm.h
+149
-129
src/operators/math/gru_compute.cpp
src/operators/math/gru_compute.cpp
+9
-6
src/operators/math/math_function.cpp
src/operators/math/math_function.cpp
+29
-20
src/operators/mul_op.cpp
src/operators/mul_op.cpp
+3
-1
src/operators/op_param.h
src/operators/op_param.h
+21
-8
test/CMakeLists.txt
test/CMakeLists.txt
+7
-2
test/common/test_gemm_accuracy.cpp
test/common/test_gemm_accuracy.cpp
+3
-2
test/fpga/test_resnet50.cpp
test/fpga/test_resnet50.cpp
+3
-2
test/framework/test_inference_api.cpp
test/framework/test_inference_api.cpp
+6
-1
test/net/test_multi_inference_predict.cpp
test/net/test_multi_inference_predict.cpp
+104
-0
test/net/test_nlp.cpp
test/net/test_nlp.cpp
+9
-1
test/net/test_resnet.cpp
test/net/test_resnet.cpp
+3
-3
test/operators/test_box_coder_op.cpp
test/operators/test_box_coder_op.cpp
+1
-1
tools/op.cmake
tools/op.cmake
+1
-0
未找到文件。
src/common/variant.h
浏览文件 @
8aac7d9d
...
...
@@ -57,7 +57,12 @@ class RawData {
public:
char
data
[
size
];
RawData
()
{}
RawData
(
const
RawData
&
raw_data
)
{
strcpy
(
data
,
raw_data
.
data
);
}
RawData
(
const
RawData
&
raw_data
)
{
memcpy
(
data
,
raw_data
.
data
,
size
);
}
RawData
&
operator
=
(
const
RawData
&
raw_data
)
{
memcpy
(
data
,
raw_data
.
data
,
size
);
return
*
this
;
}
};
template
<
typename
...
Ts
>
...
...
@@ -74,14 +79,36 @@ struct Variant {
template
<
typename
T
,
typename
...
Args
>
void
Set
(
Args
&&
...
args
)
{
helper
::
Destroy
(
type_id
,
&
data
);
new
(
&
data
)
T
(
std
::
forward
<
Args
>
(
args
)...);
helper
::
Destroy
(
type_id
,
&
data
.
data
);
new
(
&
data
.
data
)
T
(
std
::
forward
<
Args
>
(
args
)...);
type_id
=
typeid
(
T
).
hash_code
();
}
void
SetString
(
std
::
string
&
string
)
{
// helper::Destroy(type_id, &data);
type_id
=
typeid
(
std
::
string
).
hash_code
();
strcpy
(
data
.
data
,
string
.
c_str
());
}
/**
 * Returns the stored text as a new std::string built from the NUL-terminated
 * characters in the raw buffer.
 *
 * If the variant is not tagged with the hash code of std::string, the
 * exception macro is invoked and the process then calls exit(0).
 */
std::string GetString() const {
  // Guard clause: anything other than a string tag is a bad cast.
  if (type_id != typeid(std::string).hash_code()) {
    PADDLE_MOBILE_THROW_EXCEPTION(
        " bad cast in variant data type not a string ");
    exit(0);
  }
  return std::string(data.data);
}
template
<
typename
T
>
T
&
Get
()
const
{
if
(
type_id
==
typeid
(
T
).
hash_code
())
{
if
(
type_id
==
typeid
(
std
::
string
).
hash_code
())
{
PADDLE_MOBILE_THROW_EXCEPTION
(
"Please use getString to get an string (to avoid of an issue with "
"gcc "
"stl lib with string copy)"
);
exit
(
0
);
}
else
if
(
type_id
==
typeid
(
T
).
hash_code
())
{
return
*
const_cast
<
T
*>
(
reinterpret_cast
<
const
T
*>
(
&
data
));
}
else
{
PADDLE_MOBILE_THROW_EXCEPTION
(
" bad cast in variant"
);
...
...
src/fpga/api.cpp
浏览文件 @
8aac7d9d
...
...
@@ -104,7 +104,7 @@ int fpga_invalidate(void *address, size_t size) {
}
half
fp32_2_fp16
(
float
fp32_num
)
{
unsigned
long
tmp
=
*
(
unsigned
long
*
)(
&
fp32_num
);
unsigned
long
tmp
=
*
(
unsigned
long
*
)(
&
fp32_num
);
// NOLINT
half
t
=
((
tmp
&
0x007fffff
)
>>
13
)
|
((
tmp
&
0x80000000
)
>>
16
)
|
(((
tmp
&
0x7f800000
)
>>
13
)
-
(
112
<<
10
));
if
(
tmp
&
0x1000
)
{
...
...
@@ -120,7 +120,7 @@ float fp16_2_fp32(half fp16_num) {
int
tmp
=
0
;
float
fp32_num
;
tmp
=
s
<<
16
|
exp
<<
23
|
frac
<<
13
;
fp32_num
=
*
(
float
*
)
&
tmp
;
fp32_num
=
*
(
float
*
)
&
tmp
;
// NOLINT
return
fp32_num
;
}
...
...
@@ -347,6 +347,20 @@ void format_filter(framework::Tensor *filter_tensor, float max_value,
filter_tensor
->
reset_data_ptr
(
new_data
);
}
/**
 * Prepares the weight tensor of a fully-connected layer for the FPGA.
 *
 * Records two scale factors on the tensor (max_value / 127 and its
 * reciprocal), copies the float weights into a buffer obtained from
 * fpga_malloc, lets filter::format_fc_filter rewrite that buffer (called with
 * a group number of 1), and finally points the tensor at the new buffer.
 *
 * @param filter_tensor weight tensor; its first four dimensions are read as
 *                      num, channel, height and width.
 * @param max_value     value used to derive the two scale factors and passed
 *                      on to filter::format_fc_filter.
 */
void format_fc_filter(framework::Tensor *filter_tensor, float max_value) {
  filter_tensor->scale[0] = float(max_value / 127.0);  // NOLINT
  filter_tensor->scale[1] = float(127.0 / max_value);  // NOLINT

  auto shape = filter_tensor->dims();
  auto num = shape[0];
  auto channel = shape[1];
  auto height = shape[2];
  auto width = shape[3];

  // Size in bytes of the float weights.
  size_t memory_size = num * channel * height * width * sizeof(float);

  auto source = filter_tensor->data<float>();
  auto new_data = (float *)fpga_malloc(memory_size);  // NOLINT
  fpga_copy(new_data, source, memory_size);

  // The callee receives the address of the pointer, so new_data may refer to
  // a different buffer after this call.
  filter::format_fc_filter(&new_data, num, channel, height, width, 1,
                           max_value);
  filter_tensor->reset_data_ptr(new_data);
}
void
format_bias_scale_array
(
float
**
bias_scale_array
,
int
element_num_per_division
,
int
num
)
{
bias_scale
::
format_bias_scale_array
(
bias_scale_array
,
...
...
src/fpga/api.h
浏览文件 @
8aac7d9d
...
...
@@ -109,8 +109,8 @@ struct PoolingArgs {
struct
EWAddArgs
{
bool
relu_enabled
;
half
const0
;
// output0 = const0 x input0 + const1 x input1;
half
const1
;
uint32_t
const0
;
// output0 = const0 x input0 + const1 x input1;
uint32_t
const1
;
struct
ImageInputArgs
image0
;
struct
ImageInputArgs
image1
;
struct
ImageOutputArgs
output
;
...
...
@@ -214,6 +214,7 @@ int get_aligned_filter_element_num(int chw);
int
get_aligned_filter_num
(
int
num
);
void
format_filter
(
framework
::
Tensor
*
filter_tensor
,
float
max_value
,
int
group_num
);
void
format_fc_filter
(
framework
::
Tensor
*
filter_tensor
,
float
max_value
);
void
format_bias_scale_array
(
float
**
bias_scale_array
,
int
element_num_per_division
,
int
num
);
void
format_concat_output
(
framework
::
Tensor
*
out
,
int
height
,
int
width
,
...
...
src/fpga/filter.cpp
浏览文件 @
8aac7d9d
...
...
@@ -225,6 +225,45 @@ void format_filter(float **data_in, int num, int channel, int height, int width,
num_after_alignment
*
sizeof
(
char
));
}
/**
 * Transposes a byte matrix in place of its owner pointer.
 *
 * The input buffer is read as chw rows of num elements (element (c, n) at
 * index num * c + n); the output buffer holds num rows of chw elements
 * (element (n, c) at index n * chw + c). A new buffer is taken from
 * fpga_malloc, *data_in is redirected to it, and the old buffer is released
 * with fpga_free.
 *
 * @param data_in address of the pointer to the buffer; updated on return.
 * @param num     number of rows in the output (columns in the input).
 * @param chw     number of columns in the output (rows in the input).
 */
void convert_fc_filter(char **data_in, int num, int chw) {
  char *source = *data_in;
  char *transposed = (char *)fpga_malloc(chw * num * sizeof(char));  // NOLINT

  for (int n = 0; n < num; n++) {
    char *dst_row = transposed + n * chw;
    for (int c = 0; c < chw; c++) {
      dst_row[c] = source[num * c + n];
    }
  }

  *data_in = transposed;
  fpga_free(source);
}
void
format_fc_filter
(
float
**
data_in
,
int
num
,
int
channel
,
int
height
,
int
width
,
int
group_num
,
float
max
)
{
int
data_size
=
channel
*
height
*
width
*
num
;
int
chw
=
channel
*
height
*
width
;
int
division_capacity
=
calc_division_capacity
(
chw
);
int
num_per_div_before_alignment
=
calc_num_per_div
(
num
,
group_num
,
division_capacity
);
int
num_per_div_after_alignment
=
align_to_x
(
num_per_div_before_alignment
,
FILTER_NUM_ALIGNMENT
);
int
div_num
=
(
num
+
num_per_div_before_alignment
-
1
)
/
num_per_div_before_alignment
;
int
num_after_alignment
=
num_per_div_after_alignment
*
div_num
;
quantize
(
data_in
,
data_size
,
max
);
char
**
quantize_data
=
(
char
**
)
data_in
;
// NOLINT
convert_fc_filter
(
quantize_data
,
num
,
chw
);
align_element
(
quantize_data
,
num
,
chw
);
align_num
(
quantize_data
,
num_per_div_before_alignment
,
num
,
chw
);
reorder
(
quantize_data
,
num_after_alignment
,
chw
);
interleave
(
quantize_data
,
num_after_alignment
,
chw
);
fpga_flush
(
*
quantize_data
,
align_to_x
(
chw
,
FILTER_ELEMENT_ALIGNMENT
)
*
num_after_alignment
*
sizeof
(
char
));
}
}
// namespace filter
}
// namespace fpga
}
// namespace paddle_mobile
src/fpga/filter.h
浏览文件 @
8aac7d9d
...
...
@@ -25,7 +25,7 @@ int calc_division_capacity(int chw);
int
calc_split_num
(
int
num
,
int
division_capacity
);
int
calc_division_number
(
int
num
,
int
group_num
,
int
division_capacity
);
int
calc_num_per_div
(
int
num
,
int
group_num
,
int
division_capacity
);
void
convert_to_hwc
(
float
**
data_in
,
int
num
,
int
channel
,
int
height
,
void
convert_to_hwc
(
char
**
data_in
,
int
num
,
int
channel
,
int
height
,
int
width
);
float
find_max
(
float
*
data_in
,
int
data_size
);
void
quantize
(
float
**
data_in
,
int
data_size
,
float
max
);
...
...
@@ -36,6 +36,11 @@ void reorder(float** data_in, int num_after_alignment, int chw);
void
interleave
(
float
**
data_in
,
int
num_after_alignment
,
int
chw
);
void
format_filter
(
float
**
data_in
,
int
num
,
int
channel
,
int
height
,
int
width
,
int
group_num
,
float
max
);
void
convert_fc_filter
(
char
**
data_in
,
int
num
,
int
chw
);
void
format_fc_filter
(
float
**
data_in
,
int
num
,
int
channel
,
int
height
,
int
width
,
int
group_num
,
float
max
);
}
// namespace filter
}
// namespace fpga
}
// namespace paddle_mobile
src/framework/attribute.h
浏览文件 @
8aac7d9d
...
...
@@ -51,7 +51,7 @@ class Attribute {
break
;
}
case
PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__STRING
:
{
attr
.
Set
<
std
::
string
>
(
std
::
string
(
attr_desc
->
s
));
attr
.
Set
String
(
std
::
string
(
attr_desc
->
s
));
break
;
}
case
PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__BOOLEANS
:
{
...
...
@@ -108,6 +108,13 @@ class Attribute {
return
variant_
.
Get
<
T
>
();
}
/**
 * Stores a string value in this attribute by forwarding it to the
 * underlying variant's SetString.
 *
 * @param string text to store (taken by value).
 * @return reference to this attribute, so calls can be chained.
 */
Attribute &SetString(std::string string) {
  variant_.SetString(string);
  return *this;
}
/**
 * Returns the string held by this attribute, as produced by the underlying
 * variant's GetString.
 */
std::string GetString() const {
  return variant_.GetString();
}
template
<
typename
Vistor
>
static
typename
Vistor
::
type_t
ApplyVistor
(
Vistor
vistor
,
Attribute
attr
)
{
if
(
attr
.
variant_
.
TypeId
()
==
typeid
(
int
).
hash_code
())
{
...
...
@@ -115,7 +122,7 @@ class Attribute {
}
else
if
(
attr
.
variant_
.
TypeId
()
==
typeid
(
float
).
hash_code
())
{
return
vistor
(
attr
.
variant_
.
Get
<
float
>
());
}
else
if
(
attr
.
variant_
.
TypeId
()
==
typeid
(
string
).
hash_code
())
{
return
vistor
(
attr
.
variant_
.
Get
<
string
>
());
return
vistor
(
attr
.
variant_
.
Get
String
());
}
else
if
(
attr
.
variant_
.
TypeId
()
==
typeid
(
vector
<
int
>
).
hash_code
())
{
return
vistor
(
attr
.
variant_
.
Get
<
vector
<
int
>>
());
}
else
if
(
attr
.
variant_
.
TypeId
()
==
typeid
(
vector
<
float
>
).
hash_code
())
{
...
...
src/framework/ddim.h
浏览文件 @
8aac7d9d
...
...
@@ -16,6 +16,7 @@ limitations under the License. */
#include <cstdlib>
#include <initializer_list>
#include <string>
#include <typeinfo>
#include <vector>
...
...
src/framework/variable.h
浏览文件 @
8aac7d9d
...
...
@@ -33,6 +33,13 @@ class Variable {
/**
 * Returns a copy of the stored value as type T, obtained from variant.Get<T>().
 *
 * Requesting std::string is rejected: the exception macro is invoked and the
 * process then calls exit(0).
 */
template <typename T>
const T GetValue() const {
  const bool is_string_request = (typeid(std::string) == typeid(T));
  if (is_string_request) {
    PADDLE_MOBILE_THROW_EXCEPTION(
        "Please use getString to get an string (to avoid of an issue with "
        "gcc "
        "stl lib with string copy)");
    exit(0);
  }
  return variant.Get<T>();
}
...
...
src/io/api_paddle_mobile.cc
浏览文件 @
8aac7d9d
...
...
@@ -101,6 +101,11 @@ bool PaddleMobilePredictor<Dtype, P>::Run(
return
true
;
}
/**
 * Destructor: calls Clear() on the owned PaddleMobile instance.
 *
 * paddle_mobile_ is a std::unique_ptr, so it may be empty; Clear() is only
 * called when an instance is actually held, which avoids dereferencing a
 * null pointer during destruction.
 */
template <typename Dtype, Precision P>
PaddleMobilePredictor<Dtype, P>::~PaddleMobilePredictor() {
  if (paddle_mobile_) {
    paddle_mobile_->Clear();
  }
}
// A factory to help create difference predictor.
template
<
>
std
::
unique_ptr
<
PaddlePredictor
>
...
...
src/io/api_paddle_mobile.h
浏览文件 @
8aac7d9d
...
...
@@ -32,7 +32,7 @@ namespace paddle_mobile {
template
<
typename
Dtype
=
CPU
,
Precision
P
=
Precision
::
FP32
>
class
PaddleMobilePredictor
:
public
PaddlePredictor
{
public:
PaddleMobilePredictor
()
{}
PaddleMobilePredictor
()
=
delete
;
explicit
PaddleMobilePredictor
(
const
PaddleMobileConfig
&
config
);
...
...
@@ -40,7 +40,7 @@ class PaddleMobilePredictor : public PaddlePredictor {
std
::
vector
<
PaddleTensor
>*
output_data
,
int
batch_size
=
-
1
)
override
;
~
PaddleMobilePredictor
()
override
{}
;
~
PaddleMobilePredictor
()
override
;
private:
std
::
unique_ptr
<
PaddleMobile
<
Dtype
,
P
>>
paddle_mobile_
;
...
...
src/io/paddle_inference_api.h
浏览文件 @
8aac7d9d
...
...
@@ -87,7 +87,6 @@ enum class PaddleEngineKind {
class
PaddlePredictor
{
public:
struct
Config
;
PaddlePredictor
()
=
default
;
PaddlePredictor
(
const
PaddlePredictor
&
)
=
delete
;
PaddlePredictor
&
operator
=
(
const
PaddlePredictor
&
)
=
delete
;
...
...
@@ -107,6 +106,9 @@ class PaddlePredictor {
struct
Config
{
std
::
string
model_dir
;
// path to the model directory.
};
protected:
PaddlePredictor
()
=
default
;
};
struct
PaddleMobileConfig
:
public
PaddlePredictor
::
Config
{
...
...
src/operators/kernel/fpga/fc_relu_kernel.cpp
浏览文件 @
8aac7d9d
...
...
@@ -46,7 +46,7 @@ bool FusionFcReluKernel<FPGA, float>::Init(FusionFcReluParam<FPGA> *param) {
filter
->
Resize
(
framework
::
make_ddim
({
num
,
filter_channel
,
height
,
width
}));
float
max_value
=
fpga
::
filter_find_max
(
filter
);
fpga
::
format_f
ilter
(
filter
,
max_value
,
1
);
fpga
::
format_f
c_filter
(
filter
,
max_value
);
int
element_num_per_div
=
fpga
::
get_filter_num_per_div
(
filter
,
1
);
fpga
::
format_bias_scale_array
(
&
bs_ptr
,
element_num_per_div
,
channel
);
...
...
src/operators/kernel/fpga/fusion_fc_kernel.cpp
浏览文件 @
8aac7d9d
...
...
@@ -47,7 +47,7 @@ bool FusionFcKernel<FPGA, float>::Init(FusionFcParam<FPGA> *param) {
filter
->
Resize
(
framework
::
make_ddim
({
num
,
filter_channel
,
height
,
width
}));
float
max_value
=
fpga
::
filter_find_max
(
filter
);
fpga
::
format_f
ilter
(
filter
,
max_value
,
1
);
fpga
::
format_f
c_filter
(
filter
,
max_value
);
int
element_num_per_div
=
fpga
::
get_filter_num_per_div
(
filter
,
1
);
fpga
::
format_bias_scale_array
(
&
bs_ptr
,
element_num_per_div
,
channel
);
...
...
src/operators/kernel/fpga/mul_kernel.cpp
0 → 100644
浏览文件 @
8aac7d9d
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef MUL_OP
#include "operators/kernel/mul_kernel.h"
namespace
paddle_mobile
{
namespace
operators
{
/**
 * Prepares the FPGA arguments for the mul operator.
 *
 * Checks the input/weight shapes, allocates a 2 * channel float buffer whose
 * first half is zero and second half is one (presumably bias and scale -
 * confirm against format_bias_scale_array), reshapes the weight tensor to
 * {num, filter_channel, height, width}, formats weights, bias/scale array and
 * output, fills a WrapperConvArgs and stores it on param.
 *
 * @param param operator parameters; InputX, InputY and Out are read and the
 *              resulting FPGA arguments are written back via SetFpgaArgs.
 * @return always true.
 */
template <>
bool MulKernel<FPGA, float>::Init(MulParam<FPGA> *param) {
  bool relu_enabled = false;
  auto input_x = const_cast<LoDTensor *>(param->InputX());
  auto filter = const_cast<LoDTensor *>(param->InputY());
  auto out = param->Out();
  PADDLE_MOBILE_ENFORCE(input_x->dims()[1] == filter->dims()[0],
                        "Image channel should be equal to weight number");

  int channel = (uint32_t)out->dims()[1];
  auto bias_scale =
      (float *)fpga::fpga_malloc(2 * channel * sizeof(float));  // NOLINT
  // Second half of the buffer is filled with ones, first half with zeros.
  for (int idx = 0; idx < channel; idx++) {
    bias_scale[idx + channel] = 1;
  }
  for (int idx = 0; idx < channel; idx++) {
    bias_scale[idx] = 0;
  }

  int num = (uint32_t)filter->dims()[1];
  int chw = (uint32_t)filter->dims()[0];
  PADDLE_MOBILE_ENFORCE(
      chw == input_x->numel(),
      "Filter element num should be equal to IFM element num");

  int height = (uint32_t)input_x->dims()[2];
  int width = (uint32_t)input_x->dims()[3];
  int filter_channel = chw / height / width;
  filter->Resize(framework::make_ddim({num, filter_channel, height, width}));

  float max_value = fpga::filter_find_max(filter);
  fpga::format_fc_filter(filter, max_value);

  int element_num_per_div = fpga::get_filter_num_per_div(filter, 1);
  fpga::format_bias_scale_array(&bias_scale, element_num_per_div, channel);
  fpga::format_fp16_ofm(out);

  fpga::WrapperConvArgs conv_args = {0};
  fpga::fill_conv_arg(&conv_args, input_x, out, filter, relu_enabled, 1, 1, 1,
                      0, 0, bias_scale);
  param->SetFpgaArgs(conv_args);
  return true;
}
/**
 * Executes the mul operator by handing the FPGA arguments stored on param
 * to fpga::ComputeFpgaConv.
 *
 * @param param operator parameters holding the FPGA arguments.
 */
template <>
void MulKernel<FPGA, float>::Compute(const MulParam<FPGA> &param) const {
  const auto &conv_args = param.FpgaArgs();
  fpga::ComputeFpgaConv(conv_args);
}
}
// namespace operators
}
// namespace paddle_mobile
#endif
src/operators/math/gemm.cpp
浏览文件 @
8aac7d9d
此差异已折叠。
点击以展开。
src/operators/math/gemm.h
浏览文件 @
8aac7d9d
...
...
@@ -35,146 +35,166 @@ namespace paddle_mobile {
namespace
operators
{
namespace
math
{
/*
class
Gemm
{
public:
/*
// 将 A 矩阵分块复制到连续内存(ColMajor)
void PackMatrixA(int m, int k, int m_tail, const float *A, int lda,
float *buffer);
float *buffer);
// 将 B 矩阵分块复制到连续内存(ColMajor)
void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb,
float *buffer);
float *buffer);
*/
// 将 A 矩阵分块复制到连续内存(RowMajor)
void
PackMatrixA_4r
(
int
m
,
int
k
,
int
m_tail
,
const
float
*
A
,
int
lda
,
float
*
buffer
);
void
PackMatrixA_6r
(
int
m
,
int
k
,
int
m_tail
,
const
float
*
A
,
int
lda
,
float
*
buffer
);
void
PackMatrixA_8r
(
int
m
,
int
k
,
int
m_tail
,
const
float
*
A
,
int
lda
,
float
*
buffer
);
void
PackMatrixA_omp_6r
(
int
m
,
int
k
,
int
m_tail
,
const
float
*
A
,
int
lda
,
float
*
buffer
);
void
PackMatrixA_omp_8r
(
int
m
,
int
k
,
int
m_tail
,
const
float
*
A
,
int
lda
,
float
*
buffer
);
// 将 B 矩阵分块复制到连续内存(RowMajor)
void
PackMatrixB_8c
(
int
k
,
int
n
,
int
n_tail
,
const
float
*
B
,
int
ldb
,
float
*
buffer
);
void
PackMatrixB_12c
(
int
k
,
int
n
,
int
n_tail
,
const
float
*
B
,
int
ldb
,
float
*
buffer
);
void
PackMatrixB_16c
(
int
k
,
int
n
,
int
n_tail
,
const
float
*
B
,
int
ldb
,
float
*
buffer
);
void
PackMatrixB_omp_8c
(
int
k
,
int
n
,
int
n_tail
,
const
float
*
B
,
int
ldb
,
float
*
buffer
);
void
PackMatrixB_omp_12c
(
int
k
,
int
n
,
int
n_tail
,
const
float
*
B
,
int
ldb
,
float
*
buffer
);
void
PackMatrixB_omp_16c
(
int
k
,
int
n
,
int
n_tail
,
const
float
*
B
,
int
ldb
,
float
*
buffer
);
// 分块矩阵乘法
void
InnerKernel
(
int
mc
,
int
nc
,
float
alpha
,
const
float
*
a
,
const
float
*
b
,
float
beta
,
float
*
c
,
float
*
C
,
int
ldc
,
bool
relu
);
void
InnerKernelWithBias
(
int
mc
,
int
nc
,
float
alpha
,
const
float
*
a
,
typedef
void
(
Gemm
::*
FnPack
)(
int
,
int
,
int
,
const
float
*
,
int
,
float
*
);
typedef
void
(
Gemm
::*
FnAddDot
)(
int
,
const
float
*
,
const
float
*
,
float
*
,
int
);
FnPack
procPackA
;
FnPack
procPackB
;
FnAddDot
procAddDot
;
// 将 A 矩阵分块复制到连续内存(RowMajor)
void
PackMatrixA_4r
(
int
m
,
int
k
,
int
m_tail
,
const
float
*
A
,
int
lda
,
float
*
buffer
);
void
PackMatrixA_6r
(
int
m
,
int
k
,
int
m_tail
,
const
float
*
A
,
int
lda
,
float
*
buffer
);
void
PackMatrixA_8r
(
int
m
,
int
k
,
int
m_tail
,
const
float
*
A
,
int
lda
,
float
*
buffer
);
void
PackMatrixA_omp_6r
(
int
m
,
int
k
,
int
m_tail
,
const
float
*
A
,
int
lda
,
float
*
buffer
);
void
PackMatrixA_omp_8r
(
int
m
,
int
k
,
int
m_tail
,
const
float
*
A
,
int
lda
,
float
*
buffer
);
// 将 B 矩阵分块复制到连续内存(RowMajor)
void
PackMatrixB_8c
(
int
k
,
int
n
,
int
n_tail
,
const
float
*
B
,
int
ldb
,
float
*
buffer
);
void
PackMatrixB_12c
(
int
k
,
int
n
,
int
n_tail
,
const
float
*
B
,
int
ldb
,
float
*
buffer
);
void
PackMatrixB_16c
(
int
k
,
int
n
,
int
n_tail
,
const
float
*
B
,
int
ldb
,
float
*
buffer
);
void
PackMatrixB_omp_8c
(
int
k
,
int
n
,
int
n_tail
,
const
float
*
B
,
int
ldb
,
float
*
buffer
);
void
PackMatrixB_omp_12c
(
int
k
,
int
n
,
int
n_tail
,
const
float
*
B
,
int
ldb
,
float
*
buffer
);
void
PackMatrixB_omp_16c
(
int
k
,
int
n
,
int
n_tail
,
const
float
*
B
,
int
ldb
,
float
*
buffer
);
// 分块矩阵乘法
void
InnerKernel
(
int
mc
,
int
nc
,
float
alpha
,
const
float
*
a
,
const
float
*
b
,
float
beta
,
float
*
c
,
float
*
C
,
int
ldc
,
bool
relu
);
void
InnerKernelWithBias
(
int
mc
,
int
nc
,
float
alpha
,
const
float
*
a
,
const
float
*
b
,
float
beta
,
float
*
c
,
float
*
C
,
int
ldc
,
bool
relu
,
float
*
bias
);
void
InnerKernelWithBn
(
int
mc
,
int
nc
,
float
alpha
,
const
float
*
a
,
const
float
*
b
,
float
beta
,
float
*
c
,
float
*
C
,
int
ldc
,
bool
relu
,
float
*
bias
);
void
InnerKernelWithBn
(
int
mc
,
int
nc
,
float
alpha
,
const
float
*
a
,
const
float
*
b
,
float
beta
,
float
*
c
,
float
*
C
,
int
ldc
,
bool
relu
,
float
*
new_scale
,
float
*
new_bias
);
void
InnerKernelWithBnAdd
(
int
mc
,
int
nc
,
float
alpha
,
const
float
*
a
,
const
float
*
b
,
float
beta
,
float
*
c
,
float
*
C
,
int
ldc
,
bool
relu
,
float
*
new_scale
,
float
*
new_bias
,
int
ldc
,
bool
relu
,
float
*
new_scale
,
float
*
new_bias
);
void
InnerKernelWithBnAdd
(
int
mc
,
int
nc
,
float
alpha
,
const
float
*
a
,
const
float
*
b
,
float
beta
,
float
*
c
,
float
*
C
,
int
ldc
,
bool
relu
,
float
*
new_scale
,
float
*
new_bias
,
float
*
bias
);
void
InnerKernelWithPRelu
(
int
mc
,
int
nc
,
const
float
*
a
,
const
float
*
b
,
float
*
c
,
float
*
C
,
int
ldc
,
float
*
p
,
std
::
string
mode
,
float
*
bias
,
float
*
bias1
);
/*
// 向量矩阵乘法 (M = 1)
void VectorKernel(int m, int n, int k, float alpha, const float *A, int lda,
const float *B, int ldb, float beta, float *C, int ldc,
bool relu);
void VectorKernelWithBn(int m, int n, int k, float alpha, const float *A,
int lda, const float *B, int ldb, float beta, float
*C, int ldc, bool relu, float *new_scale, float *new_bias);
*/
// 计算一个更小的 C 矩阵分块
void
AddDot4x4
(
int
k
,
const
float
*
a
,
const
float
*
b
,
float
*
c
,
int
ldc
);
void
AddDot4x8
(
int
k
,
const
float
*
a
,
const
float
*
b
,
float
*
c
,
int
ldc
);
void
AddDot6x8
(
int
k
,
const
float
*
a
,
const
float
*
b
,
float
*
c
,
int
ldc
);
void
AddDot8x12
(
int
k
,
const
float
*
a
,
const
float
*
b
,
float
*
c
,
int
ldc
);
void
AddDot6x16
(
int
k
,
const
float
*
a
,
const
float
*
b
,
float
*
c
,
int
ldc
);
// 分块矩阵乘法结果回写
// C = A * B
void
WriteBasic
(
int
mc
,
int
nc
,
float
*
c
,
float
*
C
,
int
ldc
);
// C = alpha * A * B + beta * C
void
WriteWithAlphaBeta
(
int
mc
,
int
nc
,
float
*
c
,
float
*
C
,
int
ldc
);
// C = A * B + C
void
WriteWithAdd
(
int
mc
,
int
nc
,
float
*
c
,
float
*
C
,
int
ldc
);
// C = A * B + bias
void
WriteWithAddV1
(
int
mc
,
int
nc
,
float
*
c
,
float
*
C
,
int
ldc
,
float
*
bias
);
// C = A * B + C, relu(C)
void
WriteWithAddRelu
(
int
mc
,
int
nc
,
float
*
c
,
float
*
C
,
int
ldc
);
// C = A * B + C,prelu(C)
void
WriteWithAddPRelu
(
int
mc
,
int
nc
,
float
*
c
,
float
*
C
,
int
ldc
,
float
*
p
,
std
::
string
mode
,
float
*
bias
,
float
*
bias1
);
// C = A * B + bias ,relu(C)
void
WriteWithAddReluV1
(
int
mc
,
int
nc
,
float
*
c
,
float
*
C
,
int
ldc
,
float
*
bias
);
void
InnerKernelWithPRelu
(
int
mc
,
int
nc
,
const
float
*
a
,
const
float
*
b
,
float
*
c
,
float
*
C
,
int
ldc
,
float
*
p
,
std
::
string
mode
,
float
*
bias
,
float
*
bias1
);
/*
// 向量矩阵乘法 (M = 1)
void VectorKernel(int m, int n, int k, float alpha, const float *A, int lda,
const float *B, int ldb, float beta, float *C, int ldc,
bool relu);
void VectorKernelWithBn(int m, int n, int k, float alpha, const float *A,
int lda, const float *B, int ldb, float beta, float *C,
int ldc, bool relu, float *new_scale, float *new_bias);
*/
// C = A * B, batchnorm(C)
void
WriteWithBn
(
int
mc
,
int
nc
,
float
*
c
,
float
*
C
,
int
ldc
,
float
*
new_scale
,
float
*
new_bias
);
// C = A * B, batchnorm(C), relu(C)
void
WriteWithBnRelu
(
int
mc
,
int
nc
,
float
*
c
,
float
*
C
,
int
ldc
,
float
*
new_scale
,
float
*
new_bias
);
void
WriteWithBnAddRelu
(
int
mc
,
int
nc
,
float
*
c
,
float
*
C
,
int
ldc
,
float
*
new_scale
,
float
*
new_bias
,
float
*
bias1
);
/*
// 向量矩阵乘法结果回写
// C = A * B
void VecWriteBasic(int n, float *c, float *C, int ldc);
// C = alpha * A * B + beta * C
void VecWriteWithAlphaBeta(int n, float *c, float *C, int ldc);
// C = A * B + C
void VecWriteWithAdd(int n, float *c, float *C, int ldc);
// C = A * B + C, relu(C)
void VecWriteWithAddRelu(int n, float *c, float *C, int ldc);
// C = A * B, batchnorm(C)
void VecWriteWithBn(int n, float *c, float *C, int ldc, float *new_scale,
float *new_bias);
// C = A * B, batchnorm(C), relu(C)
void VecWriteWithBnRelu(int n, float *c, float *C, int ldc, float *new_scale,
float *new_bias);
*/
// 32位 float 矩阵乘法
void
Sgemm
(
int
m
,
int
n
,
int
k
,
float
alpha
,
const
float
*
A
,
int
lda
,
const
float
*
B
,
int
ldb
,
float
beta
,
float
*
C
,
int
ldc
,
bool
relu
,
float
*
bias
);
// 32位 float 矩阵乘法, 并对结果进行 batchnrom
void
SgemmWithBn
(
int
m
,
int
n
,
int
k
,
float
alpha
,
const
float
*
A
,
int
lda
,
const
float
*
B
,
int
ldb
,
float
beta
,
float
*
C
,
int
ldc
,
bool
relu
,
float
*
new_scale
,
float
*
new_bias
,
float
*
bias
);
void
SgemmWithPRelu
(
int
m
,
int
n
,
int
k
,
const
float
*
A
,
int
lda
,
const
float
*
B
,
int
ldb
,
float
*
C
,
int
ldc
,
float
*
p
,
std
::
string
mode
,
float
*
bias
,
float
*
bias1
);
// 32位 float 矩阵乘法(openmp 多线程版本)
void
Sgemm_omp
(
int
m
,
int
n
,
int
k
,
float
alpha
,
const
float
*
A
,
int
lda
,
const
float
*
B
,
int
ldb
,
float
beta
,
float
*
C
,
int
ldc
,
bool
relu
,
float
*
bias
);
// 计算一个更小的 C 矩阵分块
void
AddDot4x4
(
int
k
,
const
float
*
a
,
const
float
*
b
,
float
*
c
,
int
ldc
);
void
AddDot4x8
(
int
k
,
const
float
*
a
,
const
float
*
b
,
float
*
c
,
int
ldc
);
void
AddDot6x8
(
int
k
,
const
float
*
a
,
const
float
*
b
,
float
*
c
,
int
ldc
);
void
AddDot8x12
(
int
k
,
const
float
*
a
,
const
float
*
b
,
float
*
c
,
int
ldc
);
void
AddDot6x16
(
int
k
,
const
float
*
a
,
const
float
*
b
,
float
*
c
,
int
ldc
);
// 分块矩阵乘法结果回写
// C = A * B
void
WriteBasic
(
int
mc
,
int
nc
,
float
*
c
,
float
*
C
,
int
ldc
);
// C = alpha * A * B + beta * C
void
WriteWithAlphaBeta
(
int
mc
,
int
nc
,
float
*
c
,
float
*
C
,
int
ldc
);
// C = A * B + C
void
WriteWithAdd
(
int
mc
,
int
nc
,
float
*
c
,
float
*
C
,
int
ldc
);
// C = A * B + bias
void
WriteWithAddV1
(
int
mc
,
int
nc
,
float
*
c
,
float
*
C
,
int
ldc
,
float
*
bias
);
// C = A * B + C, relu(C)
void
WriteWithAddRelu
(
int
mc
,
int
nc
,
float
*
c
,
float
*
C
,
int
ldc
);
// C = A * B + C,prelu(C)
void
WriteWithAddPRelu
(
int
mc
,
int
nc
,
float
*
c
,
float
*
C
,
int
ldc
,
float
*
p
,
std
::
string
mode
,
float
*
bias
,
float
*
bias1
);
// C = A * B + bias ,relu(C)
void
WriteWithAddReluV1
(
int
mc
,
int
nc
,
float
*
c
,
float
*
C
,
int
ldc
,
float
*
bias
);
// C = A * B, batchnorm(C)
void
WriteWithBn
(
int
mc
,
int
nc
,
float
*
c
,
float
*
C
,
int
ldc
,
float
*
new_scale
,
float
*
new_bias
);
// C = A * B, batchnorm(C), relu(C)
void
WriteWithBnRelu
(
int
mc
,
int
nc
,
float
*
c
,
float
*
C
,
int
ldc
,
float
*
new_scale
,
float
*
new_bias
);
void
WriteWithBnAddRelu
(
int
mc
,
int
nc
,
float
*
c
,
float
*
C
,
int
ldc
,
float
*
new_scale
,
float
*
new_bias
,
float
*
bias1
);
/*
// 向量矩阵乘法结果回写
// C = A * B
void VecWriteBasic(int n, float *c, float *C, int ldc);
// C = alpha * A * B + beta * C
void VecWriteWithAlphaBeta(int n, float *c, float *C, int ldc);
// C = A * B + C
void VecWriteWithAdd(int n, float *c, float *C, int ldc);
// C = A * B + C, relu(C)
void VecWriteWithAddRelu(int n, float *c, float *C, int ldc);
// C = A * B, batchnorm(C)
void VecWriteWithBn(int n, float *c, float *C, int ldc, float *new_scale,
float *new_bias);
// C = A * B, batchnorm(C), relu(C)
void VecWriteWithBnRelu(int n, float *c, float *C, int ldc, float *new_scale,
float *new_bias);
*/
// 32位 float 矩阵乘法, 并对结果进行 batchnrom(openmp 多线程版本)
void
SgemmWithBn_omp
(
int
m
,
int
n
,
int
k
,
float
alpha
,
const
float
*
A
,
int
lda
,
const
float
*
B
,
int
ldb
,
float
beta
,
float
*
C
,
int
ldc
,
bool
relu
,
float
*
new_scale
,
float
*
new_bias
,
float
*
bias
);
void
SgemmWithPRelu_omp
(
int
m
,
int
n
,
int
k
,
const
float
*
A
,
int
lda
,
const
float
*
B
,
int
ldb
,
float
*
C
,
int
ldc
,
float
*
p
,
std
::
string
mode
,
float
*
bias
,
float
*
bias1
);
// 32位 float 矩阵乘法
void
Sgemm
(
int
m
,
int
n
,
int
k
,
float
alpha
,
const
float
*
A
,
int
lda
,
const
float
*
B
,
int
ldb
,
float
beta
,
float
*
C
,
int
ldc
,
bool
relu
,
float
*
bias
)
;
private:
int
MC
=
0
;
int
KC
=
0
;
int
NC
=
0
;
// 32位 float 矩阵乘法, 并对结果进行 batchnrom
void
SgemmWithBn
(
int
m
,
int
n
,
int
k
,
float
alpha
,
const
float
*
A
,
int
lda
,
const
float
*
B
,
int
ldb
,
float
beta
,
float
*
C
,
int
ldc
,
bool
relu
,
float
*
new_scale
,
float
*
new_bias
,
float
*
bias
);
void
SgemmWithPRelu
(
int
m
,
int
n
,
int
k
,
const
float
*
A
,
int
lda
,
const
float
*
B
,
int
ldb
,
float
*
C
,
int
ldc
,
float
*
p
,
std
::
string
mode
,
float
*
bias
,
float
*
bias1
);
// 32位 float 矩阵乘法(openmp 多线程版本)
void
Sgemm_omp
(
int
m
,
int
n
,
int
k
,
float
alpha
,
const
float
*
A
,
int
lda
,
const
float
*
B
,
int
ldb
,
float
beta
,
float
*
C
,
int
ldc
,
bool
relu
,
float
*
bias
);
// 32位 float 矩阵乘法, 并对结果进行 batchnrom(openmp 多线程版本)
void
SgemmWithBn_omp
(
int
m
,
int
n
,
int
k
,
float
alpha
,
const
float
*
A
,
int
lda
,
const
float
*
B
,
int
ldb
,
float
beta
,
float
*
C
,
int
ldc
,
bool
relu
,
float
*
new_scale
,
float
*
new_bias
,
float
*
bias
);
void
SgemmWithPRelu_omp
(
int
m
,
int
n
,
int
k
,
const
float
*
A
,
int
lda
,
const
float
*
B
,
int
ldb
,
float
*
C
,
int
ldc
,
float
*
p
,
std
::
string
mode
,
float
*
bias
,
float
*
bias1
);
float
*
packedA
;
float
*
packedB
;
float
*
packedC
;
float
*
zero
;
};
}
// namespace math
}
// namespace operators
...
...
src/operators/math/gru_compute.cpp
浏览文件 @
8aac7d9d
...
...
@@ -28,19 +28,22 @@ struct GRUUnitFunctor<CPU, T> {
static
void
compute
(
GRUMetaValue
<
T
>
value
,
int
frame_size
,
int
batch_size
,
const
ActivationType
active_node
,
const
ActivationType
active_gate
)
{
Gemm
gemm
;
if
(
value
.
prev_out_value
)
{
Sgemm
(
batch_size
,
frame_size
*
2
,
frame_size
,
1
,
value
.
prev_out_value
,
frame_size
,
value
.
gate_weight
,
frame_size
*
2
,
1
,
value
.
gate_value
,
frame_size
*
3
,
false
,
nullptr
);
gemm
.
Sgemm
(
batch_size
,
frame_size
*
2
,
frame_size
,
1
,
value
.
prev_out_value
,
frame_size
,
value
.
gate_weight
,
frame_size
*
2
,
1
,
value
.
gate_value
,
frame_size
*
3
,
false
,
nullptr
);
}
forward_reset_output
(
forward
::
gru_resetOutput
<
T
>
(),
value
,
frame_size
,
batch_size
,
active_gate
);
if
(
value
.
prev_out_value
)
{
Sgemm
(
batch_size
,
frame_size
,
frame_size
,
1
,
value
.
reset_output_value
,
frame_size
,
value
.
state_weight
,
frame_size
,
1
,
value
.
gate_value
+
frame_size
*
2
,
frame_size
*
3
,
false
,
nullptr
);
gemm
.
Sgemm
(
batch_size
,
frame_size
,
frame_size
,
1
,
value
.
reset_output_value
,
frame_size
,
value
.
state_weight
,
frame_size
,
1
,
value
.
gate_value
+
frame_size
*
2
,
frame_size
*
3
,
false
,
nullptr
);
}
forward_final_output
(
forward
::
gru_finalOutput
<
T
>
(),
value
,
frame_size
,
...
...
src/operators/math/math_function.cpp
浏览文件 @
8aac7d9d
...
...
@@ -36,6 +36,7 @@ void matmul<float>(const framework::Tensor &matrix_a, bool trans_a,
int
M
=
dim_out
[
0
];
int
N
=
dim_out
[
1
];
int
K
=
(
!
trans_a
)
?
dim_a
[
1
]
:
dim_a
[
0
];
Gemm
gemm
;
if
(
trans_a
)
{
int
numel
=
matrix_a
.
numel
();
...
...
@@ -50,20 +51,24 @@ void matmul<float>(const framework::Tensor &matrix_a, bool trans_a,
a
[
index
++
]
=
tmp
[
i
*
n
+
j
];
}
}
#ifdef _OPENMP
Sgemm_omp
(
M
,
N
,
K
,
alpha
,
a
,
K
,
matrix_b
.
data
<
float
>
(),
N
,
beta
,
matrix_out
->
data
<
float
>
(),
N
,
relu
,
bias
);
gemm
.
Sgemm_omp
(
M
,
N
,
K
,
alpha
,
a
,
K
,
matrix_b
.
data
<
float
>
(),
N
,
beta
,
matrix_out
->
data
<
float
>
(),
N
,
relu
,
bias
);
#else
Sgemm
(
M
,
N
,
K
,
alpha
,
a
,
K
,
matrix_b
.
data
<
float
>
(),
N
,
beta
,
matrix_out
->
data
<
float
>
(),
N
,
relu
,
bias
);
gemm
.
Sgemm
(
M
,
N
,
K
,
alpha
,
a
,
K
,
matrix_b
.
data
<
float
>
(),
N
,
beta
,
matrix_out
->
data
<
float
>
(),
N
,
relu
,
bias
);
#endif
}
else
{
#ifdef _OPENMP
Sgemm_omp
(
M
,
N
,
K
,
alpha
,
matrix_a
.
data
<
float
>
(),
K
,
matrix_b
.
data
<
float
>
(),
N
,
beta
,
matrix_out
->
data
<
float
>
(),
N
,
relu
,
bias
);
gemm
.
Sgemm_omp
(
M
,
N
,
K
,
alpha
,
matrix_a
.
data
<
float
>
(),
K
,
matrix_b
.
data
<
float
>
(),
N
,
beta
,
matrix_out
->
data
<
float
>
(),
N
,
relu
,
bias
);
#else
Sgemm
(
M
,
N
,
K
,
alpha
,
matrix_a
.
data
<
float
>
(),
K
,
matrix_b
.
data
<
float
>
(),
N
,
beta
,
matrix_out
->
data
<
float
>
(),
N
,
relu
,
bias
);
gemm
.
Sgemm
(
M
,
N
,
K
,
alpha
,
matrix_a
.
data
<
float
>
(),
K
,
matrix_b
.
data
<
float
>
(),
N
,
beta
,
matrix_out
->
data
<
float
>
(),
N
,
relu
,
bias
);
#endif
}
}
...
...
@@ -74,6 +79,7 @@ void matmulWithBn<float>(const framework::Tensor &matrix_a, bool trans_a,
float
alpha
,
framework
::
Tensor
*
matrix_out
,
float
beta
,
bool
relu
,
framework
::
Tensor
*
new_scale
,
framework
::
Tensor
*
new_bias
,
int
group
,
float
*
bias
)
{
Gemm
gemm
;
auto
dim_a
=
matrix_a
.
dims
();
auto
dim_b
=
matrix_b
.
dims
();
auto
dim_out
=
matrix_out
->
dims
();
...
...
@@ -86,21 +92,22 @@ void matmulWithBn<float>(const framework::Tensor &matrix_a, bool trans_a,
int
K
=
(
!
trans_a
)
?
dim_a
[
1
]
:
dim_a
[
0
];
#ifdef _OPENMP
SgemmWithBn_omp
(
M
,
N
,
K
,
alpha
,
matrix_a
.
data
<
float
>
(),
K
,
matrix_b
.
data
<
float
>
(),
N
,
beta
,
matrix_out
->
data
<
float
>
(),
N
,
relu
,
new_scale
->
data
<
float
>
()
+
group
,
new_bias
->
data
<
float
>
()
+
group
,
bias
);
gemm
.
SgemmWithBn_omp
(
M
,
N
,
K
,
alpha
,
matrix_a
.
data
<
float
>
(),
K
,
matrix_b
.
data
<
float
>
(),
N
,
beta
,
matrix_out
->
data
<
float
>
(),
N
,
relu
,
new_scale
->
data
<
float
>
()
+
group
,
new_bias
->
data
<
float
>
()
+
group
,
bias
);
#else
SgemmWithBn
(
M
,
N
,
K
,
alpha
,
matrix_a
.
data
<
float
>
(),
K
,
matrix_b
.
data
<
float
>
()
,
N
,
beta
,
matrix_out
->
data
<
float
>
(),
N
,
relu
,
new_scale
->
data
<
float
>
()
+
group
,
new_bias
->
data
<
float
>
()
+
group
,
bias
);
gemm
.
SgemmWithBn
(
M
,
N
,
K
,
alpha
,
matrix_a
.
data
<
float
>
(),
K
,
matrix_b
.
data
<
float
>
(),
N
,
beta
,
matrix_out
->
data
<
float
>
()
,
N
,
relu
,
new_scale
->
data
<
float
>
()
+
group
,
new_bias
->
data
<
float
>
()
+
group
,
bias
);
#endif
}
void
matmulWithPRelu
(
const
framework
::
Tensor
&
matrix_a
,
bool
trans_a
,
const
framework
::
Tensor
&
matrix_b
,
bool
trans_b
,
framework
::
Tensor
*
matrix_out
,
float
*
p
,
std
::
string
mode
,
float
*
bias
,
float
*
bias1
)
{
Gemm
gemm
;
auto
dim_a
=
matrix_a
.
dims
();
auto
dim_b
=
matrix_b
.
dims
();
auto
dim_out
=
matrix_out
->
dims
();
...
...
@@ -113,11 +120,13 @@ void matmulWithPRelu(const framework::Tensor &matrix_a, bool trans_a,
int
K
=
(
!
trans_a
)
?
dim_a
[
1
]
:
dim_a
[
0
];
#ifdef _OPENMP
SgemmWithPRelu_omp
(
M
,
N
,
K
,
matrix_a
.
data
<
float
>
(),
K
,
matrix_b
.
data
<
float
>
(),
N
,
matrix_out
->
data
<
float
>
(),
N
,
p
,
mode
,
bias
,
bias1
);
gemm
.
SgemmWithPRelu_omp
(
M
,
N
,
K
,
matrix_a
.
data
<
float
>
(),
K
,
matrix_b
.
data
<
float
>
(),
N
,
matrix_out
->
data
<
float
>
(),
N
,
p
,
mode
,
bias
,
bias1
);
#else
SgemmWithPRelu
(
M
,
N
,
K
,
matrix_a
.
data
<
float
>
(),
K
,
matrix_b
.
data
<
float
>
(),
N
,
matrix_out
->
data
<
float
>
(),
N
,
p
,
mode
,
bias
,
bias1
);
gemm
.
SgemmWithPRelu
(
M
,
N
,
K
,
matrix_a
.
data
<
float
>
(),
K
,
matrix_b
.
data
<
float
>
(),
N
,
matrix_out
->
data
<
float
>
(),
N
,
p
,
mode
,
bias
,
bias1
);
#endif
}
...
...
src/operators/mul_op.cpp
浏览文件 @
8aac7d9d
...
...
@@ -61,5 +61,7 @@ REGISTER_OPERATOR_CPU(mul, ops::MulOp);
#ifdef PADDLE_MOBILE_MALI_GPU
REGISTER_OPERATOR_MALI_GPU
(
mul
,
ops
::
MulOp
);
#endif
#ifdef PADDLE_MOBILE_FPGA
REGISTER_OPERATOR_FPGA
(
mul
,
ops
::
MulOp
);
#endif
#endif
src/operators/op_param.h
浏览文件 @
8aac7d9d
...
...
@@ -263,6 +263,10 @@ class OpParam {
static
const
T
GetAttr
(
const
string
&
key
,
const
AttributeMap
&
map
)
{
return
((
Attribute
)
map
.
at
(
key
)).
Get
<
T
>
();
}
// Returns the string stored under `key` in the attribute map `map`.
// The entry is fetched with map.at(key), copied into a temporary Attribute
// by the C-style cast, and read through Attribute::GetString().
// NOTE(review): presumably .at() fails (throws/aborts) when `key` is missing;
// confirm against the AttributeMap definition, or guard with HasAttr().
static
const
std
::
string
GetStringAttr
(
const
string
&
key
,
const
AttributeMap
&
map
)
{
return
((
Attribute
)
map
.
at
(
key
)).
GetString
();
}
static
const
bool
HasAttr
(
const
string
&
key
,
const
AttributeMap
&
map
)
{
return
map
.
count
(
key
)
>
0
;
...
...
@@ -438,6 +442,15 @@ class MulParam : OpParam {
GType
*
out_
;
int
x_num_col_dims_
;
int
y_num_col_dims_
;
#ifdef PADDLE_MOBILE_FPGA
private:
fpga
::
WrapperConvArgs
fpga_conv_args
;
public:
const
fpga
::
WrapperConvArgs
&
FpgaArgs
()
const
{
return
fpga_conv_args
;
}
void
SetFpgaArgs
(
const
fpga
::
WrapperConvArgs
&
args
)
{
fpga_conv_args
=
args
;
}
#endif
};
#endif
...
...
@@ -493,7 +506,7 @@ class LrnParam : public OpParam {
alpha_
=
GetAttr
<
float
>
(
"alpha"
,
attrs
);
beta_
=
GetAttr
<
float
>
(
"beta"
,
attrs
);
k_
=
GetAttr
<
float
>
(
"k"
,
attrs
);
data_format_
=
Get
Attr
<
string
>
(
"data_format"
,
attrs
);
data_format_
=
Get
StringAttr
(
"data_format"
,
attrs
);
}
const
RType
*
InputX
()
const
{
return
input_x_
;
}
...
...
@@ -590,7 +603,7 @@ class PoolParam : public OpParam {
input_
=
InputXFrom
<
GType
>
(
inputs
,
scope
);
output_
=
OutFrom
<
GType
>
(
outputs
,
scope
);
pooling_type_
=
Get
Attr
<
string
>
(
"pooling_type"
,
attrs
);
pooling_type_
=
Get
StringAttr
(
"pooling_type"
,
attrs
);
ksize_
=
GetAttr
<
vector
<
int
>>
(
"ksize"
,
attrs
);
strides_
=
GetAttr
<
vector
<
int
>>
(
"strides"
,
attrs
);
paddings_
=
GetAttr
<
vector
<
int
>>
(
"paddings"
,
attrs
);
...
...
@@ -724,7 +737,7 @@ class BoxCoderParam : public OpParam {
input_priorboxvar_
=
InputPriorBoxVarFrom
<
GType
>
(
inputs
,
scope
);
input_targetbox_
=
InputTargetBoxFrom
<
GType
>
(
inputs
,
scope
);
output_box_
=
OutputBoxFrom
<
GType
>
(
outputs
,
scope
);
code_type_
=
Get
Attr
<
std
::
string
>
(
"code_type"
,
attrs
);
code_type_
=
Get
StringAttr
(
"code_type"
,
attrs
);
}
const
RType
*
InputPriorBox
()
const
{
return
input_priorbox_
;
}
...
...
@@ -1199,7 +1212,7 @@ class PReluParam : public OpParam {
alpha_
=
InputAlphaFrom
<
GType
>
(
inputs
,
scope
);
framework
::
DDim
dims
=
alpha_
->
dims
();
out_
=
OutFrom
<
GType
>
(
outputs
,
scope
);
mode_
=
Get
Attr
<
std
::
string
>
(
"mode"
,
attrs
);
mode_
=
Get
StringAttr
(
"mode"
,
attrs
);
DLOG
<<
"PReluParam mode after"
<<
mode_
;
}
const
RType
*
InputX
()
const
{
return
input_x_
;
}
...
...
@@ -1330,7 +1343,7 @@ class FusionConvAddPReluParam : public ConvParam<Dtype> {
const
AttributeMap
&
attrs
,
const
Scope
&
scope
)
:
ConvParam
<
Dtype
>
(
inputs
,
outputs
,
attrs
,
scope
)
{
alpha_
=
OpParam
::
InputAlphaFrom
<
GType
>
(
inputs
,
scope
);
mode_
=
OpParam
::
Get
Attr
<
std
::
string
>
(
"mode"
,
attrs
);
mode_
=
OpParam
::
Get
StringAttr
(
"mode"
,
attrs
);
framework
::
DDim
dims
=
alpha_
->
dims
();
bias_
=
OpParam
::
InputYFrom
<
GType
>
(
inputs
,
scope
);
axis_
=
OpParam
::
GetAttr
<
int
>
(
"axis"
,
attrs
);
...
...
@@ -1373,7 +1386,7 @@ class FusionConvAddAddPReluParam : public ConvParam<Dtype> {
:
ConvParam
<
Dtype
>
(
inputs
,
outputs
,
attrs
,
scope
)
{
bias1_
=
OpParam
::
InputYFrom1
<
GType
>
(
inputs
,
scope
);
alpha_
=
OpParam
::
InputAlphaFrom
<
GType
>
(
inputs
,
scope
);
mode_
=
OpParam
::
Get
Attr
<
std
::
string
>
(
"mode"
,
attrs
);
mode_
=
OpParam
::
Get
StringAttr
(
"mode"
,
attrs
);
framework
::
DDim
dims
=
alpha_
->
dims
();
bias_
=
OpParam
::
InputYFrom
<
GType
>
(
inputs
,
scope
);
output_
=
OpParam
::
OutFrom
<
GType
>
(
outputs
,
scope
);
...
...
@@ -1980,8 +1993,8 @@ class GruParam : public OpParam {
OutputBatchResetHiddenPrevFrom
<
GType
>
(
outputs
,
scope
);
output_batch_hidden_
=
OutputBatchHiddenFrom
<
GType
>
(
outputs
,
scope
);
output_hidden_
=
OutputHiddenFrom
<
GType
>
(
outputs
,
scope
);
activation_
=
Get
Attr
<
std
::
string
>
(
"activation"
,
attrs
);
gate_activation_
=
Get
Attr
<
std
::
string
>
(
"gate_activation"
,
attrs
);
activation_
=
Get
StringAttr
(
"activation"
,
attrs
);
gate_activation_
=
Get
StringAttr
(
"gate_activation"
,
attrs
);
is_reverse_
=
GetAttr
<
bool
>
(
"is_reverse"
,
attrs
);
}
const
GType
*
InputInput
()
const
{
return
input_input_
;
}
...
...
test/CMakeLists.txt
浏览文件 @
8aac7d9d
...
...
@@ -35,8 +35,8 @@ if (CON GREATER -1)
ADD_EXECUTABLE
(
test-yolo net/test_yolo.cpp test_helper.h test_include.h executor_for_test.h
)
target_link_libraries
(
test-yolo paddle-mobile
)
# gen test
ADD_EXECUTABLE
(
test
_yolo_
combined net/test_yolo_combined.cpp test_helper.h test_include.h executor_for_test.h
)
target_link_libraries
(
test
_yolo_
combined paddle-mobile
)
ADD_EXECUTABLE
(
test
-yolo-
combined net/test_yolo_combined.cpp test_helper.h test_include.h executor_for_test.h
)
target_link_libraries
(
test
-yolo-
combined paddle-mobile
)
set
(
FOUND_MATCH ON
)
endif
()
...
...
@@ -323,5 +323,10 @@ if (NOT FOUND_MATCH)
target_link_libraries
(
test-fssd paddle-mobile
)
# gen test
ADD_EXECUTABLE
(
test-multi-process net/test_multi_inference_predict.cpp test_helper.h test_include.h
)
target_link_libraries
(
test-multi-process paddle-mobile
)
#add_library(test-lib-size SHARED common/test_lib_size.h common/test_lib_size.cpp)
endif
()
test/common/test_gemm_accuracy.cpp
浏览文件 @
8aac7d9d
...
...
@@ -83,8 +83,9 @@ int do_sgemm(int m, int n, int k, bool relu, int t1, int t2, int pr) {
}
}
paddle_mobile
::
operators
::
math
::
SgemmWithBn
(
m
,
n
,
k
,
0.9
,
a
,
lda
,
b
,
ldb
,
0.3
,
c
,
ldc
,
relu
,
scale
,
bias
,
nullptr
);
paddle_mobile
::
operators
::
math
::
Gemm
gemm
;
gemm
.
SgemmWithBn
(
m
,
n
,
k
,
0.9
,
a
,
lda
,
b
,
ldb
,
0.3
,
c
,
ldc
,
relu
,
scale
,
bias
,
nullptr
);
int
eq
=
0
;
int
neq
=
0
;
for
(
int
i
=
0
;
i
<
m
*
n
;
++
i
)
{
...
...
test/fpga/test_resnet50.cpp
浏览文件 @
8aac7d9d
...
...
@@ -18,8 +18,9 @@ static const char *g_resnet_combine = "../models/resnet50";
int
main
()
{
DLOG
<<
paddle_mobile
::
fpga
::
open_device
();
paddle_mobile
::
PaddleMobile
<
paddle_mobile
::
FPGA
>
paddle_mobile
;
if
(
paddle_mobile
.
Load
(
std
::
string
(
g_resnet_combine
)
+
"/model"
,
std
::
string
(
g_resnet_combine
)
+
"/params"
,
true
))
{
// if (paddle_mobile.Load(std::string(g_resnet_combine) + "/model",
// std::string(g_resnet_combine) + "/params", true)) {
if
(
paddle_mobile
.
Load
(
std
::
string
(
g_resnet_combine
),
true
))
{
std
::
vector
<
int64_t
>
dims
{
1
,
3
,
224
,
224
};
Tensor
input_tensor
;
SetupTensor
<
float
>
(
&
input_tensor
,
{
1
,
3
,
224
,
224
},
static_cast
<
float
>
(
0
),
...
...
test/framework/test_inference_api.cpp
浏览文件 @
8aac7d9d
...
...
@@ -46,7 +46,12 @@ int main() {
tensor_out
.
dtype
=
PaddleDType
::
FLOAT32
;
std
::
vector
<
PaddleTensor
>
outputs
(
1
,
tensor_out
);
assert
(
predictor
->
Run
(
paddle_tensor_feeds
,
&
outputs
));
std
::
cout
<<
" before predict "
<<
std
::
endl
;
predictor
->
Run
(
paddle_tensor_feeds
,
&
outputs
);
std
::
cout
<<
" after predict "
<<
std
::
endl
;
// assert();
float
*
data_o
=
static_cast
<
float
*>
(
outputs
[
0
].
data
.
data
());
for
(
size_t
j
=
0
;
j
<
outputs
[
0
].
data
.
length
()
/
sizeof
(
float
);
++
j
)
{
...
...
test/net/test_multi_inference_predict.cpp
0 → 100644
浏览文件 @
8aac7d9d
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <iostream>
#include <thread> // NOLINT
#include "../test_helper.h"
#include "../test_include.h"
void
fun_yolo
();
int
fun_mobilenet
();
// Entry point of the multi-inference test: starts fun_yolo and fun_mobilenet
// on two separate std::thread objects and joins both before returning 0.
int main() {
  // Not used below. NOTE(review): kept because any side effects of the
  // PaddleMobile constructor are not visible from this file.
  paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile2;

  // Sequential variant, left disabled:
  //  fun_yolo();
  //  fun_mobilenet();
  std::thread yolo_worker(fun_yolo);
  std::thread mobilenet_worker(fun_mobilenet);

  yolo_worker.join();
  mobilenet_worker.join();
  return 0;
}
// Loads the model at g_yolo into a 4-thread CPU PaddleMobile instance, then
// runs Predict ten times on a 1x3x227x227 input filled by SetupTensor and
// prints the load time and the average time per prediction.
// Nothing is printed if Load() returns false.
void fun_yolo() {
  paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile;
  paddle_mobile.SetThreadNum(4);
  //    ../../../test/models/googlenet
  //    ../../../test/models/mobilenet
  auto time1 = time();
  if (paddle_mobile.Load(g_yolo, true)) {
    auto time2 = time();
    // Fixed: the interval used to be time_diff(time1, time1), which always
    // reported a load cost of zero.
    std::cout << "load cost :" << time_diff(time1, time2) << "ms" << std::endl;

    vector<int64_t> dims{1, 3, 227, 227};
    Tensor input_tensor;
    SetupTensor<float>(&input_tensor, {1, 3, 227, 227}, static_cast<float>(0),
                       static_cast<float>(1));

    // Copy the tensor contents into the flat vector that Predict takes.
    vector<float> input(input_tensor.data<float>(),
                        input_tensor.data<float>() + input_tensor.numel());

    auto time3 = time();
    for (int i = 0; i < 10; ++i) {
      paddle_mobile.Predict(input, dims);
    }
    auto time4 = time();
    // Average over the ten timed runs.
    std::cout << "thread 1: predict cost :" << time_diff(time3, time4) / 10
              << "ms" << std::endl;
  }
}
// Loads the model at g_mobilenet into a 4-thread CPU PaddleMobile instance,
// runs one prediction on the banana test image and prints its largest output
// element, then does ten warm-up runs followed by ten timed runs and prints
// the average time per prediction. Always returns 0.
int fun_mobilenet() {
  paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile;
  paddle_mobile.SetThreadNum(4);
  auto time1 = time();
  //  auto isok = paddle_mobile.Load(std::string(g_mobilenet_detect) + "/model",
  //                     std::string(g_mobilenet_detect) + "/params", true);
  auto isok = paddle_mobile.Load(g_mobilenet, true);
  if (isok) {
    auto time2 = time();
    // Fixed: the interval used to be time_diff(time1, time1), which always
    // reported a load cost of zero.
    std::cout << "load cost :" << time_diff(time1, time2) << "ms" << std::endl;

    vector<float> input;
    vector<int64_t> dims{1, 3, 224, 224};
    GetInput<float>(g_test_image_1x3x224x224_banana, &input, dims);

    auto vec_result = paddle_mobile.Predict(input, dims);
    auto biggest = max_element(begin(vec_result), end(vec_result));
    std::cout << " Max element is " << *biggest << " at position "
              << distance(begin(vec_result), biggest) << std::endl;

    // Warm up with ten runs; their results are discarded.
    for (int i = 0; i < 10; ++i) {
      paddle_mobile.Predict(input, dims);
    }

    auto time3 = time();
    for (int i = 0; i < 10; ++i) {
      paddle_mobile.Predict(input, dims);
    }
    auto time4 = time();

    // Logged after time4 is taken so the debug output does not count towards
    // the measured prediction time. This is the result of the first
    // prediction above, as in the original code.
    DLOG << vec_result;

    // Average over the ten timed runs.
    std::cout << "thread 2: predict cost :" << time_diff(time3, time4) / 10
              << "ms" << std::endl;
  }

  std::cout << "如果结果Nan请查看: test/images/g_test_image_1x3x224x224_banana "
               "是否存在?"
            << std::endl;
  return 0;
}
test/net/test_nlp.cpp
浏览文件 @
8aac7d9d
...
...
@@ -60,7 +60,15 @@ int main() {
std
::
cout
<<
"load cost :"
<<
time_diff
(
time1
,
time1
)
<<
"ms"
<<
std
::
endl
;
// 1064 1603 644 699 2878 1219 867 1352 8 1 13 312 479
std
::
vector
<
int64_t
>
ids
{
1791
,
656
,
1549
,
281
,
96
};
std
::
vector
<
int64_t
>
ids
{
2084
,
635
,
1035
,
197
,
990
,
150
,
1132
,
2403
,
546
,
770
,
4060
,
3352
,
1798
,
1589
,
1352
,
98
,
136
,
3461
,
3186
,
1159
,
515
,
764
,
278
,
1178
,
5044
,
4060
,
943
,
932
,
463
,
1198
,
3352
,
374
,
1198
,
3352
,
374
,
2047
,
1069
,
1589
,
3672
,
1178
,
1178
,
2165
,
1178
,
2084
,
635
,
3087
,
2236
,
546
,
2047
,
1549
,
546
,
2047
,
302
,
2202
,
398
,
804
,
397
,
657
,
804
,
866
,
932
,
2084
,
515
,
2165
,
397
,
302
,
2202
,
526
,
992
,
906
,
1215
,
1589
,
4493
,
2403
,
723
,
932
,
2084
,
635
,
1352
,
932
,
444
,
2047
,
1159
,
1893
,
1579
,
59
,
330
,
98
,
1296
,
1159
,
3430
,
738
,
3186
,
1071
,
2174
,
3933
};
paddle_mobile
::
framework
::
LoDTensor
words
;
auto
size
=
static_cast
<
int
>
(
ids
.
size
());
...
...
test/net/test_resnet.cpp
浏览文件 @
8aac7d9d
...
...
@@ -52,8 +52,8 @@ int main() {
#else
auto
time3
=
time
();
paddle_mobile
.
FeedData
(
input_tensor
);
paddle_mobile
.
Predict_To
(
10
);
paddle_mobile
.
Predict_From
(
10
);
paddle_mobile
.
Predict_To
(
-
1
);
/*
paddle_mobile.Predict_From(10);
auto tensor_ptr = paddle_mobile.FetchResult(9);
std::cout << "Tensor element number for op[9]: " << tensor_ptr->numel()
<< std::endl;
...
...
@@ -63,7 +63,7 @@ int main() {
auto time4 = time();
std::cout << "predict cost :" << time_diff(time3, time4) << "ms"
<<
std
::
endl
;
<< std::endl;
*/
#endif
}
return
0
;
...
...
test/operators/test_box_coder_op.cpp
浏览文件 @
8aac7d9d
...
...
@@ -46,7 +46,7 @@ class TestBoxCoderOp {
DLOG
<<
" Input TargetBox is : "
<<
op
->
Input
(
"TargetBox"
)[
0
];
DLOG
<<
" OutputBox is : "
<<
op
->
Output
(
"OutputBox"
)[
0
];
DLOG
<<
" code_type : "
<<
op
->
GetAttrMap
().
at
(
"code_type"
).
Get
<
std
::
string
>
();
<<
op
->
GetAttrMap
().
at
(
"code_type"
).
Get
String
();
std
::
shared_ptr
<
operators
::
BoxCoderOp
<
Dtype
,
float
>>
boxcoder
=
std
::
make_shared
<
operators
::
BoxCoderOp
<
Dtype
,
float
>>
(
op
->
Type
(),
op
->
GetInputs
(),
op
->
GetOutputs
(),
...
...
tools/op.cmake
浏览文件 @
8aac7d9d
...
...
@@ -121,6 +121,7 @@ if (CON GREATER -1)
set
(
FUSION_CONVBNRELU_OP ON
)
set
(
FUSION_CONVBN_OP ON
)
set
(
FUSION_CONVADD_OP ON
)
set
(
MUL_OP ON
)
set
(
FOUND_MATCH ON
)
endif
()
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录