Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
Paddle-Lite
提交
a9133cf4
P
Paddle-Lite
项目概览
PaddlePaddle
/
Paddle-Lite
通知
332
Star
4
Fork
1
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
271
列表
看板
标记
里程碑
合并请求
78
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle-Lite
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
271
Issue
271
列表
看板
标记
里程碑
合并请求
78
合并请求
78
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
a9133cf4
编写于
3月 14, 2019
作者:
H
hjchen2
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Resize feed and fetch variables before infer shape, and reimplement scale kernel according to fluid
上级
ef17bf2f
变更
4
隐藏空白更改
内联
并排
Showing
4 changed file
with
60 addition
and
147 deletion
+60
-147
src/framework/executor.cpp
src/framework/executor.cpp
+4
-3
src/operators/kernel/arm/scale_kernel.cpp
src/operators/kernel/arm/scale_kernel.cpp
+41
-117
src/operators/kernel/arm/sequence_pool_kernel.cpp
src/operators/kernel/arm/sequence_pool_kernel.cpp
+7
-7
src/operators/op_param.h
src/operators/op_param.h
+8
-20
未找到文件。
src/framework/executor.cpp
浏览文件 @
a9133cf4
...
...
@@ -56,8 +56,11 @@ Executor<Device, T>::Executor(const Program<Device> &program,
use_optimize_
?
program_
.
optimizeProgram
:
program_
.
originProgram
;
PADDLE_MOBILE_ENFORCE
(
program_desc_
!=
nullptr
,
"program_desc_ should not be nullptr"
);
const
auto
&
blocks
=
program_desc_
->
Blocks
();
// resize feed and fetch list
// should init feed and fetch variables before infer shape
InitFeedFetchList
();
const
auto
&
blocks
=
program_desc_
->
Blocks
();
std
::
shared_ptr
<
BlockDesc
>
block_desc
=
blocks
[
0
];
std
::
vector
<
std
::
shared_ptr
<
OpDesc
>>
ops
=
block_desc
->
Ops
();
for
(
int
j
=
0
;
j
<
ops
.
size
();
++
j
)
{
...
...
@@ -79,8 +82,6 @@ Executor<Device, T>::Executor(const Program<Device> &program,
}
else
{
InitMemory
();
}
// resize feed and fetch list
InitFeedFetchList
();
#ifdef PADDLE_MOBILE_FPGA
program_
.
scope
->
EraseVars
({
"feed"
,
"fetch"
});
...
...
src/operators/kernel/arm/scale_kernel.cpp
浏览文件 @
a9133cf4
...
...
@@ -15,131 +15,55 @@ limitations under the License. */
#ifdef SCALE_OP
#include "operators/kernel/scale_kernel.h"
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
#include <arm_neon.h>
#endif
namespace
paddle_mobile
{
namespace
operators
{
/*
* @b 特化到具体平台的实现, param 从 op 层传入
* */
template
<
>
void
ScaleKernel
<
CPU
,
float
>::
Compute
(
const
ScaleParam
<
CPU
>
&
param
)
{
const
auto
*
input_x
=
param
.
InputX
();
auto
*
input_x_ptr
=
input_x
->
data
<
float
>
();
auto
*
out
=
param
.
Out
();
auto
*
out_ptr
=
out
->
mutable_data
<
float
>
();
const
vector
<
float
>
scales
=
param
.
Scales
();
bool
has_bias
=
param
.
HasBias
();
const
int
dim_size
=
input_x
->
dims
().
size
();
switch
(
dim_size
)
{
case
1
:
{
const
int
input_width
=
input_x
->
dims
()[
0
];
if
(
has_bias
)
{
const
vector
<
float
>
biases
=
param
.
Biases
();
#pragma omp parallel for
for
(
int
w
=
0
;
w
<
input_width
;
w
++
)
{
out_ptr
[
w
]
=
input_x_ptr
[
w
]
*
scales
[
w
]
+
biases
[
w
];
}
}
else
{
#pragma omp parallel for
for
(
int
w
=
0
;
w
<
input_width
;
w
++
)
{
out_ptr
[
w
]
=
input_x_ptr
[
w
]
*
scales
[
w
];
}
}
}
break
;
case
2
:
{
const
int
input_height
=
input_x
->
dims
()[
0
];
const
int
input_width
=
input_x
->
dims
()[
1
];
if
(
has_bias
)
{
const
vector
<
float
>
biases
=
param
.
Biases
();
#pragma omp parallel for
for
(
int
h
=
0
;
h
<
input_height
;
++
h
)
{
const
float
*
iptr
=
input_x_ptr
+
h
*
input_width
;
float
*
optr
=
out_ptr
+
h
*
input_width
;
for
(
int
w
=
0
;
w
<
input_width
;
++
w
)
{
optr
[
w
]
=
iptr
[
w
]
*
scales
[
w
]
+
biases
[
w
];
}
}
}
else
{
#pragma omp parallel for
for
(
int
h
=
0
;
h
<
input_height
;
++
h
)
{
const
float
*
iptr
=
input_x_ptr
+
h
*
input_width
;
float
*
optr
=
out_ptr
+
h
*
input_width
;
for
(
int
w
=
0
;
w
<
input_width
;
++
w
)
{
optr
[
w
]
=
iptr
[
w
]
*
scales
[
w
];
}
}
}
}
break
;
case
3
:
{
const
int
chan_size
=
input_x
->
dims
()[
0
];
const
int
input_height
=
input_x
->
dims
()[
1
];
const
int
input_width
=
input_x
->
dims
()[
2
];
int
size
=
input_width
*
input_height
;
if
(
has_bias
)
{
const
vector
<
float
>
biases
=
param
.
Biases
();
#pragma omp parallel for
for
(
int
c
=
0
;
c
<
chan_size
;
++
c
)
{
const
float
*
iptr
=
input_x_ptr
+
c
*
size
;
float
*
optr
=
out_ptr
+
c
*
size
;
for
(
int
i
=
0
;
i
<
size
;
++
i
)
{
optr
[
i
]
=
iptr
[
i
]
*
scales
[
c
]
+
biases
[
c
];
}
}
}
else
{
#pragma omp parallel for
for
(
int
c
=
0
;
c
<
chan_size
;
++
c
)
{
const
float
*
iptr
=
input_x_ptr
+
c
*
size
;
float
*
optr
=
out_ptr
+
c
*
size
;
for
(
int
i
=
0
;
i
<
size
;
++
i
)
{
optr
[
i
]
=
iptr
[
i
]
*
scales
[
c
];
}
}
}
}
break
;
case
4
:
{
const
int
batch_size
=
input_x
->
dims
()[
0
];
const
int
chan_size
=
input_x
->
dims
()[
0
];
const
int
input_height
=
input_x
->
dims
()[
1
];
const
int
input_width
=
input_x
->
dims
()[
2
];
int
size
=
input_width
*
input_height
;
if
(
has_bias
)
{
const
vector
<
float
>
biases
=
param
.
Biases
();
#pragma omp parallel for
for
(
int
b
=
0
;
b
<
batch_size
;
++
b
)
{
for
(
int
c
=
0
;
c
<
chan_size
;
++
c
)
{
const
float
*
iptr
=
input_x_ptr
+
b
*
c
*
size
;
float
*
optr
=
out_ptr
+
b
*
c
*
size
;
for
(
int
i
=
0
;
i
<
size
;
++
i
)
{
optr
[
i
]
=
iptr
[
i
]
*
scales
[
c
]
+
biases
[
c
];
}
}
}
}
else
{
#pragma omp parallel for
for
(
int
b
=
0
;
b
<
batch_size
;
++
b
)
{
for
(
int
c
=
0
;
c
<
chan_size
;
++
c
)
{
const
float
*
iptr
=
input_x_ptr
+
b
*
c
*
size
;
float
*
optr
=
out_ptr
+
b
*
c
*
size
;
for
(
int
i
=
0
;
i
<
size
;
++
i
)
{
optr
[
i
]
=
iptr
[
i
]
*
scales
[
c
];
}
}
}
}
}
break
;
default:
break
;
const
auto
input
=
param
.
InputX
();
auto
output
=
param
.
Out
();
const
float
scale
=
param
.
Scale
();
const
float
bias
=
param
.
Bias
();
const
float
*
input_data
=
input
->
data
<
float
>
();
float
*
output_data
=
output
->
mutable_data
<
float
>
();
int
i
=
0
;
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
float32x4_t
vscale
=
vdupq_n_f32
(
scale
);
float32x4_t
vbias
=
vdupq_n_f32
(
bias
);
for
(;
i
<
output
->
numel
()
-
15
;
i
+=
16
)
{
float32x4_t
_in0
=
vld1q_f32
(
input_data
);
float32x4_t
_in1
=
vld1q_f32
(
input_data
+
4
);
float32x4_t
_in2
=
vld1q_f32
(
input_data
+
8
);
float32x4_t
_in3
=
vld1q_f32
(
input_data
+
12
);
_in0
=
vmlaq_f32
(
vbias
,
vscale
,
_in0
);
_in1
=
vmlaq_f32
(
vbias
,
vscale
,
_in1
);
_in2
=
vmlaq_f32
(
vbias
,
vscale
,
_in2
);
_in3
=
vmlaq_f32
(
vbias
,
vscale
,
_in3
);
vst1q_f32
(
output_data
,
_in0
);
vst1q_f32
(
output_data
+
4
,
_in1
);
vst1q_f32
(
output_data
+
8
,
_in2
);
vst1q_f32
(
output_data
+
12
,
_in3
);
input_data
+=
16
;
output_data
+=
16
;
}
for
(;
i
<
output
->
numel
()
-
3
;
i
+=
4
)
{
float32x4_t
_in0
=
vld1q_f32
(
input_data
);
_in0
=
vmlaq_f32
(
vbias
,
vscale
,
_in0
);
vst1q_f32
(
output_data
,
_in0
);
input_data
+=
4
;
output_data
+=
4
;
}
#endif
for
(;
i
<
output
->
numel
();
++
i
,
++
output_data
,
++
input_data
)
{
*
output_data
=
scale
*
(
*
input_data
)
+
bias
;
}
}
}
// namespace operators
}
// namespace paddle_mobile
...
...
src/operators/kernel/arm/sequence_pool_kernel.cpp
浏览文件 @
a9133cf4
...
...
@@ -21,7 +21,7 @@ limitations under the License. */
#include "common/types.h"
#include "operators/kernel/sequence_kernels.h"
#include "operators/math/pooling.h"
#if
def __ARM_NEON__
#if
defined(__ARM_NEON__) || defined(__ARM_NEON)
#include <arm_neon.h>
#endif // __ARM_NEON__
...
...
@@ -44,7 +44,7 @@ void SequencePoolImpl(const framework::LoDTensor &input,
if
(
width
==
1
)
{
float
max
=
-
std
::
numeric_limits
<
float
>::
max
();
int
remain_h
=
height
;
#if
def __ARM_NEON__
#if
defined(__ARM_NEON__) || defined(__ARM_NEON)
int
loop
=
remain_h
>>
2
;
remain_h
=
remain_h
&
0x3
;
float32x4_t
__max4
=
math
::
vPoolInitq_f32
<
MAX
>
();
...
...
@@ -67,11 +67,11 @@ void SequencePoolImpl(const framework::LoDTensor &input,
in_ptr
+=
width
;
int
remain_h
=
height
-
1
;
int
remain_w_start
=
0
;
#if
def __ARM_NEON__
#if
defined(__ARM_NEON__) || defined(__ARM_NEON)
remain_w_start
=
width
&
0xfffc
;
#endif // __ARM_NEON__
for
(
int
h
=
0
;
h
<
remain_h
;
++
h
)
{
#if
def __ARM_NEON__
#if
defined(__ARM_NEON__) || defined(__ARM_NEON)
for
(
int
w
=
0
;
w
<
width
;
w
+=
4
)
{
float32x4_t
__in
=
vld1q_f32
(
in_ptr
+
w
);
float32x4_t
__out
=
vld1q_f32
(
out_ptr
+
w
);
...
...
@@ -104,7 +104,7 @@ void SequencePoolImpl<SUM, float>(const framework::LoDTensor &input,
if
(
width
==
1
)
{
float
sum
=
0.
f
;
int
remain_h
=
height
;
#if
def __ARM_NEON__
#if
defined(__ARM_NEON__) || defined(__ARM_NEON)
int
loop
=
remain_h
>>
2
;
remain_h
=
remain_h
&
0x3
;
float32x4_t
__sum4
=
vdupq_n_f32
(
0.
f
);
...
...
@@ -126,12 +126,12 @@ void SequencePoolImpl<SUM, float>(const framework::LoDTensor &input,
in_ptr
+=
width
;
int
remain_h
=
height
-
1
;
int
remain_w_start
=
0
;
#if
def __ARM_NEON__
#if
defined(__ARM_NEON__) || defined(__ARM_NEON)
int
loop_w
=
width
>>
2
;
remain_w_start
=
width
&
0xfffc
;
#endif // __ARM_NEON__
for
(
int
h
=
0
;
h
<
remain_h
;
++
h
)
{
#if
def __ARM_NEON__
#if
defined(__ARM_NEON__) || defined(__ARM_NEON)
for
(
int
w
=
0
;
w
<
width
-
3
;
w
+=
4
)
{
float32x4_t
__in
=
vld1q_f32
(
in_ptr
+
w
);
float32x4_t
__out
=
vld1q_f32
(
out_ptr
+
w
);
...
...
src/operators/op_param.h
浏览文件 @
a9133cf4
...
...
@@ -1533,36 +1533,24 @@ class ScaleParam : public OpParam {
const
AttributeMap
&
attrs
,
Scope
*
scope
)
:
OpParam
(
inputs
,
outputs
,
attrs
,
scope
)
{
input_x_
=
InputXFrom
<
GType
>
(
inputs
,
*
scope
);
input_bias_
=
InputBiasFrom
<
GType
>
(
inputs
,
*
scope
);
out_
=
OutFrom
<
GType
>
(
outputs
,
*
scope
);
inplace_
=
GetAttr
<
bool
>
(
"inplace"
,
attrs
);
has_bias_
=
GetAttr
<
bool
>
(
"has_bias"
,
attrs
);
scales_
=
GetAttr
<
vector
<
float
>>
(
"scales"
,
attrs
);
biases_
=
GetAttr
<
vector
<
float
>>
(
"biases"
,
attrs
);
scale_
=
GetAttr
<
float
>
(
"scale"
,
attrs
);
bias_
=
GetAttr
<
float
>
(
"bias"
,
attrs
);
}
const
GType
*
InputX
()
const
{
return
input_x_
;
}
const
GType
*
InputBias
()
const
{
return
input_bias_
;
}
GType
*
Out
()
const
{
return
out_
;
}
const
bool
&
Inplace
()
const
{
return
inplace_
;
}
const
bool
&
HasBias
()
const
{
return
has_bias_
;
}
const
float
Scale
()
const
{
return
scale_
;
}
const
vector
<
float
>
&
Scales
()
const
{
return
scales_
;
}
const
vector
<
float
>
&
Biases
()
const
{
return
biases_
;
}
const
float
Bias
()
const
{
return
bias_
;
}
private:
GType
*
input_x_
;
GType
*
input_bias_
;
GType
*
out_
;
bool
inplace_
;
bool
has_bias_
;
vector
<
float
>
scales_
;
vector
<
float
>
biases_
;
float
scale_
;
float
bias_
;
};
#endif
...
...
@@ -2933,8 +2921,8 @@ class QuantizeParam : public OpParam {
// if offine scale or not
bool
offline_
=
false
;
// round method type
RoundType
round_type_
=
ROUND_NEAREST_AWAY_ZERO
;
//
RoundType round_type_ = ROUND_NEAREST_TOWARDS_ZERO;
//
RoundType round_type_ = ROUND_NEAREST_AWAY_ZERO;
RoundType
round_type_
=
ROUND_NEAREST_TOWARDS_ZERO
;
};
#endif
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录