PaddlePaddle / Paddle-Lite
Commit b985e005, authored Nov 25, 2018 by hjchen2

Fix quant bug

Parent: 8f086bc0
Showing 2 changed files with 143 additions and 52 deletions:

  src/operators/kernel/arm/conv_kernel.cpp      +0    -1
  src/operators/kernel/arm/quantize_kernel.cpp  +143  -51
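The quantize_kernel.cpp changes below rework the NEON/assembly round-to-zero path. For orientation, the plain-C++ fallback kept under "#if 0" in the patch quantizes by simple truncation; here is a minimal scalar sketch of that semantics (padding and vectorization omitted, function name hypothetical, not part of the patch):

#include <cstddef>
#include <cstdint>

// Reference for the round-to-zero quantization the NEON kernels implement:
// y[i] = trunc(x[i] * scale), as in the fallback loop kept under "#if 0".
static void quantize_round_to_zero_ref(const float *x, float scale,
                                       std::size_t size, int8_t *y) {
  for (std::size_t i = 0; i < size; ++i) {
    // static_cast<int8_t> truncates toward zero for in-range values,
    // e.g. 3.7f -> 3 and -3.7f -> -3 after scaling.
    y[i] = static_cast<int8_t>(x[i] * scale);
  }
}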
src/operators/kernel/arm/conv_kernel.cpp

@@ -99,7 +99,6 @@ void ConvKernel<CPU, float>::Compute(const ConvParam<CPU> &param) {
       PADDLE_MOBILE_THROW_EXCEPTION("Invalid convolution execute mode %d",
                                     param.ExecMode());
   }
-  std::cout << "exec here..." << std::endl;
 }
 
 template class ConvKernel<CPU, float>;
src/operators/kernel/arm/quantize_kernel.cpp

@@ -126,6 +126,54 @@ static float find_abs_max(const Tensor *input) {
   return max_abs;
 }
 
+#if 0
+static void quantize_round_to_zero(const Tensor *input, const float scale,
+                                   const std::vector<int> &paddings,
+                                   const int8_t padding_val, Tensor *output) {
+  const float *x = input->data<const float>();
+  int8_t *y = output->mutable_data<int8_t>();
+  size_t size = input->numel();
+#if defined(__ARM_NEON__) || defined(__ARM_NEON)
+  size_t loop = size >> 4;
+  size_t remain = size & 0xF;
+  #pragma omp parallel for
+  for (size_t i = 0; i < loop; ++i) {
+    const float *local_x = x + (i << 4);
+    int8_t *local_y = y + (i << 4);
+    float32x4_t r0 = vld1q_f32(local_x);
+    float32x4_t r1 = vld1q_f32(local_x + 4);
+    float32x4_t r2 = vld1q_f32(local_x + 8);
+    float32x4_t r3 = vld1q_f32(local_x + 12);
+    r0 = vmulq_n_f32(r0, scale);
+    r1 = vmulq_n_f32(r1, scale);
+    r2 = vmulq_n_f32(r2, scale);
+    r3 = vmulq_n_f32(r3, scale);
+    int32x4_t q0 = vrnd_towards_zero(r0);
+    int32x4_t q1 = vrnd_towards_zero(r1);
+    int32x4_t q2 = vrnd_towards_zero(r2);
+    int32x4_t q3 = vrnd_towards_zero(r3);
+    int16x4_t d0 = vmovn_s32(q0);
+    int16x4_t d1 = vmovn_s32(q1);
+    int16x4_t d2 = vmovn_s32(q2);
+    int16x4_t d3 = vmovn_s32(q3);
+    int16x8_t q5 = vcombine_s16(d0, d1);
+    int16x8_t q6 = vcombine_s16(d2, d3);
+    int8x8_t d5 = vmovn_s16(q5);
+    int8x8_t d6 = vmovn_s16(q6);
+    vst1_s8(local_y, d5);
+    vst1_s8(local_y + 8, d6);
+  }
+  size = remain;
+  x += (loop << 4);
+  y += (loop << 4);
+#endif
+  for (size_t i = 0; i < size; ++i) {
+    y[i] = static_cast<int8_t>(x[i] * scale);
+  }
+}
+#endif
 
 #ifdef __aarch64__
 static void quantize_round_to_even(const Tensor *input, const float scale,
                                    Tensor *output) {
@@ -272,7 +320,7 @@ static void quantize_round_to_nearest(const Tensor *input, const float scale,
     y[i] = round(x[i] * scale);
   }
 }
-#else
+#else  // __aarch64__
 static void quantize_round_to_even(const Tensor *input, const float scale,
                                    const std::vector<int> &paddings,
@@ -282,7 +330,7 @@ static void quantize_round_to_nearest(const Tensor *input, const float scale,
                                    const std::vector<int> &paddings,
                                    const int8_t padding_val, Tensor *output) {}
 
+#if 1
 static void quantize_round_to_zero(const Tensor *input, const float scale,
                                    const std::vector<int> &paddings,
                                    const int8_t padding_val, Tensor *output) {
@@ -300,11 +348,11 @@ static void quantize_round_to_zero(const Tensor *input, const float scale,
   for (int batch = 0; batch < input->dims()[0]; ++batch) {
     for (int c = 0; c < channels - 3; c += 4) {
-      const float *x0 = x + c * input_spatial_size;
-      const float *x1 = x0 + input_spatial_size;
-      const float *x2 = x1 + input_spatial_size;
-      const float *x3 = x2 + input_spatial_size;
-      size_t offset = c * output_spatial_size;
+      const float *input0 = x + (batch * channels + c) * input_spatial_size;
+      const float *input1 = input0 + input_spatial_size;
+      const float *input2 = input1 + input_spatial_size;
+      const float *input3 = input2 + input_spatial_size;
+      size_t offset = (batch * channels + c) * output_spatial_size;
       for (int h = 0; h < 2; ++h) {
         int8_t *y0 = y + offset +
                      h * ((input_h + paddings[0]) * output_w - paddings[1]);
@@ -312,7 +360,7 @@ static void quantize_round_to_zero(const Tensor *input, const float scale,
         int8_t *y2 = y1 + output_spatial_size;
         int8_t *y3 = y2 + output_spatial_size;
         int loop = start >> 4;
-        int remain = start & 0xFFF0;
+        int remain = start & 0xF;
         asm volatile(
             "vdup.s8 q0, %[val]           \n"
             "cmp %[loop], #0              \n"
@@ -372,10 +420,15 @@ static void quantize_round_to_zero(const Tensor *input, const float scale,
         int8_t *y2 = y1 + output_spatial_size;
         int8_t *y3 = y2 + output_spatial_size;
         for (int h = 0; h < input_h; ++h) {
+          const float *x0 = input0 + h * input_w;
+          const float *x1 = input1 + h * input_w;
+          const float *x2 = input2 + h * input_w;
+          const float *x3 = input3 + h * input_w;
           int loop = input_w >> 4;
-          int remain = input_w & 0xFFF0;
+          int remain = input_w & 0xF;
           int pad_loop = paddings[1] >> 1;
-          int pad_remain = paddings[1] & 0xFFFE;
+          int pad_remain = paddings[1] & 0x1;
+          int remain_steps = remain;
           asm volatile(
               "vdup.f32 q0, %[scale]        \n"
               "cmp %[loop], #0              \n"
@@ -446,10 +499,10 @@ static void quantize_round_to_zero(const Tensor *input, const float scale,
               "vmovn.s16 d21, q2            \n"
               "vmovn.s16 d23, q3            \n"
               "vmovn.s16 d25, q4            \n"
-              "vst1.32 {q9}, [%[y0]]        \n"
-              "vst1.32 {q10}, [%[y0]]       \n"
-              "vst1.32 {q11}, [%[y0]]       \n"
-              "vst1.32 {q12}, [%[y0]]       \n"
+              "vst1.32 {q9}, [%[y0]]!       \n"
+              "vst1.32 {q10}, [%[y1]]!      \n"
+              "vst1.32 {q11}, [%[y2]]!      \n"
+              "vst1.32 {q12}, [%[y3]]!      \n"
               "subs %[loop], #1             \n"
               "bne loop_quantize_%=         \n"
@@ -458,10 +511,10 @@ static void quantize_round_to_zero(const Tensor *input, const float scale,
               "cmp %[remain], #0            \n"
               "ble end_%=                   \n"
-              "vld1.32 {q1, q2}, [%[x0]]    \n"
-              "vld1.32 {q3, q4}, [%[x1]]    \n"
-              "vld1.32 {q5, q6}, [%[x2]]    \n"
-              "vld1.32 {q7, q8}, [%[x3]]    \n"
+              "vld1.32 {q1, q2}, [%[x0]]!   \n"
+              "vld1.32 {q3, q4}, [%[x1]]!   \n"
+              "vld1.32 {q5, q6}, [%[x2]]!   \n"
+              "vld1.32 {q7, q8}, [%[x3]]!   \n"
               "vmul.f32 q1, q1, q0          \n"
               "vmul.f32 q2, q2, q0          \n"
               "vmul.f32 q3, q3, q0          \n"
@@ -490,10 +543,10 @@ static void quantize_round_to_zero(const Tensor *input, const float scale,
               "vmovn.s16 d20, q2            \n"
               "vmovn.s16 d22, q3            \n"
               "vmovn.s16 d24, q4            \n"
-              "vld1.32 {q1, q2}, [%[x0]]!   \n"
-              "vld1.32 {q3, q4}, [%[x1]]!   \n"
-              "vld1.32 {q5, q6}, [%[x2]]!   \n"
-              "vld1.32 {q7, q8}, [%[x3]]!   \n"
+              "vld1.32 {q1, q2}, [%[x0]]    \n"
+              "vld1.32 {q3, q4}, [%[x1]]    \n"
+              "vld1.32 {q5, q6}, [%[x2]]    \n"
+              "vld1.32 {q7, q8}, [%[x3]]    \n"
               "vmul.f32 q1, q1, q0          \n"
               "vmul.f32 q2, q2, q0          \n"
               "vmul.f32 q3, q3, q0          \n"
@@ -574,8 +627,8 @@ static void quantize_round_to_zero(const Tensor *input, const float scale,
[
y0
]
"+r"
(
y0
),
[
y1
]
"+r"
(
y1
),
[
y2
]
"+r"
(
y2
),
[
y3
]
"+r"
(
y3
),
[
loop
]
"+r"
(
loop
),
[
remain
]
"+r"
(
remain
)
:
[
scale
]
"r"
(
scale
)
:
"
memory"
,
"q0"
,
"q1"
,
"q2"
,
"q3"
,
"q4"
,
"q5"
,
"q6"
,
"q7"
,
"q8
"
,
"q9"
,
"q10"
,
"q11"
,
"q12"
);
:
"
cc"
,
"memory"
,
"q0"
,
"q1"
,
"q2"
,
"q3"
,
"q4"
,
"q5"
,
"q6"
,
"q7
"
,
"q
8"
,
"q
9"
,
"q10"
,
"q11"
,
"q12"
);
asm
volatile
(
"vdup.s8 d0, %[val]
\n
"
"cmp %[pad_loop], #0
\n
"
...
...
@@ -608,23 +661,63 @@ static void quantize_round_to_zero(const Tensor *input, const float scale,
             : [y0] "+r"(y0), [y1] "+r"(y1), [y2] "+r"(y2), [y3] "+r"(y3),
               [pad_loop] "+r"(pad_loop), [pad_remain] "+r"(pad_remain)
             : [val] "r"(padding_val)
-            : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8",
-              "q9", "q10", "q11", "q12");
-        x0 += remain;
-        x1 += remain;
-        x2 += remain;
-        x3 += remain;
+            : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+              "q8", "q9", "q10", "q11", "q12");
       }
     }
     for (int c = (channels & 0xFFFC); c < channels; ++c) {
-      const float *x0 = x + c * input_spatial_size;
-      int8_t *y0 = y + c * output_spatial_size;
-      for (int h = 0; h < paddings[0]; ++h) {
+      const float *input0 = x + (batch * channels + c) * input_spatial_size;
+      size_t offset = (batch * channels + c) * output_spatial_size;
+      for (int h = 0; h < 2; ++h) {
+        int8_t *y0 = y + offset +
+                     h * ((input_h + paddings[0]) * output_w - paddings[1]);
+        int loop = start >> 4;
+        int remain = start & 0xF;
+        asm volatile(
+            "vdup.s8 q0, %[val]           \n"
+            "cmp %[loop], #0              \n"
+            "ble start_remain_%=          \n"
+            "store_16w_%=:                \n"
+            "vst1.32 {q0}, [%[y0]]!       \n"
+            "subs %[loop], #1             \n"
+            "bne store_16w_%=             \n"
+            "start_remain_%=:             \n"
+            "cmp %[remain], #8            \n"
+            "blt store_4w_%=              \n"
+            "vst1.32 {d0}, [%[y0]]!       \n"
+            "sub %[remain], #8            \n"
+            "store_4w_%=:                 \n"
+            "cmp %[remain], #4            \n"
+            "blt store_2w_%=              \n"
+            "vst1.32 {d0[0]}, [%[y0]]!    \n"
+            "sub %[remain], #4            \n"
+            "store_2w_%=:                 \n"
+            "cmp %[remain], #4            \n"
+            "blt store_1w_%=              \n"
+            "vst1.16 {d0[0]}, [%[y0]]!    \n"
+            "sub %[remain], #2            \n"
+            "store_1w_%=:                 \n"
+            "cmp %[remain], #1            \n"
+            "blt end_%=                   \n"
+            "vst1.8 {d0[0]}, [%[y0]]!     \n"
+            "end_%=:                      \n"
+            : [y0] "+r"(y0), [loop] "+r"(loop), [remain] "+r"(remain)
+            : [val] "r"(padding_val)
+            : "cc", "memory", "q0");
+      }
+      // quantize valid area
+      int8_t *y0 = y + offset + start;
       for (int h = 0; h < input_h; ++h) {
+        const float *x0 = input0 + h * input_w;
         int loop = input_w >> 4;
-        int remain = input_w & 0xFFF0;
+        int remain = input_w & 0xF;
        int pad_loop = paddings[1] >> 1;
-        int pad_remain = paddings[1] & 0xFFFE;
+        int pad_remain = paddings[1] & 0x1;
         asm volatile(
             "vdup.f32 q0, %[scale]        \n"
             "cmp %[loop], #0              \n"
@@ -632,22 +725,22 @@ static void quantize_round_to_zero(const Tensor *input, const float scale,
             "loop_quantize_%=:            \n"
             "vld1.32 {q1, q2}, [%[x0]]!   \n"
-            "vmul.f32  q1, q1, q0         \n"
-            "vmul.f32  q2, q2, q0         \n"
+            "vmul.f32 q1, q1, q0          \n"
+            "vmul.f32 q2, q2, q0          \n"
             "vcvt.s32.f32 q1, q1          \n"
             "vcvt.s32.f32 q2, q2          \n"
             "vmovn.s32 d2, q1             \n"
             "vmovn.s32 d3, q2             \n"
             "vmovn.s16 d18, q1            \n"
             "vld1.32 {q1, q2}, [%[x0]]!   \n"
-            "vmul.f32  q1, q1, q0         \n"
-            "vmul.f32  q2, q2, q0         \n"
+            "vmul.f32 q1, q1, q0          \n"
+            "vmul.f32 q2, q2, q0          \n"
             "vcvt.s32.f32 q1, q1          \n"
             "vcvt.s32.f32 q2, q2          \n"
             "vmovn.s32 d2, q1             \n"
             "vmovn.s32 d3, q2             \n"
             "vmovn.s16 d19, q1            \n"
-            "vst1.32 {q9}, [%[y0]]        \n"
+            "vst1.32 {q9}, [%[y0]]!       \n"
             "subs %[loop], #1             \n"
             "bne loop_quantize_%=         \n"
@@ -656,19 +749,18 @@ static void quantize_round_to_zero(const Tensor *input, const float scale,
             "cmp %[remain], #0            \n"
             "ble start_pad_%=             \n"
-            "vld1.32 {q1, q2}, [%[x0]]    \n"
-            "vmul.f32  q1, q1, q0         \n"
-            "vmul.f32  q2, q2, q0         \n"
+            "vldm %[x0], {d2-d9}          \n"
+            "vmul.f32 q1, q1, q0          \n"
+            "vmul.f32 q2, q2, q0          \n"
             "vcvt.s32.f32 q1, q1          \n"
             "vcvt.s32.f32 q2, q2          \n"
             "vmovn.s32 d2, q1             \n"
             "vmovn.s32 d3, q2             \n"
             "vmovn.s16 d18, q1            \n"
-            "vld1.32 {q1, q2}, [%[x0]]!   \n"
-            "vmul.f32 q1, q1, q0          \n"
-            "vmul.f32 q2, q2, q0          \n"
-            "vcvt.s32.f32 q1, q1          \n"
-            "vcvt.s32.f32 q2, q2          \n"
+            "vmul.f32 q3, q3, q0          \n"
+            "vmul.f32 q4, q4, q0          \n"
+            "vcvt.s32.f32 q1, q3          \n"
+            "vcvt.s32.f32 q2, q4          \n"
             "vmovn.s32 d2, q1             \n"
             "vmovn.s32 d3, q2             \n"
             "vmovn.s16 d19, q1            \n"
@@ -722,12 +814,12 @@ static void quantize_round_to_zero(const Tensor *input, const float scale,
               [remain] "+r"(remain), [pad_loop] "+r"(pad_loop),
               [pad_remain] "+r"(pad_remain)
             : [scale] "r"(scale), [val] "r"(padding_val)
-            : "memory", "q0", "q1", "q2", "q9");
-        x0 += remain;
+            : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q9");
       }
     }
   }
 }
-#endif
+#endif  // __aarch64__
 #endif  // ARM_NEON
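For checking the fixed path above, a scalar sketch (not from the patch) of what one batch/channel slice of quantize_round_to_zero is expected to produce: a border of padding_val around the truncated int8 values in NCHW layout. The output_h/output_w formulas and the helper name are assumptions inferred from the diff, not code taken from Paddle-Lite:

#include <cstdint>
#include <vector>

// One channel: quantize input_h x input_w floats into an output slice padded
// by paddings[0] rows on top/bottom and paddings[1] columns left/right.
static void quantize_channel_with_padding_ref(
    const float *x, float scale, int input_h, int input_w,
    const std::vector<int> &paddings,  // {pad_h, pad_w}, as in the patch
    int8_t padding_val, int8_t *y) {
  const int output_h = input_h + 2 * paddings[0];  // assumed padded layout
  const int output_w = input_w + 2 * paddings[1];
  // Fill the whole padded slice first, then overwrite the valid area.
  for (int i = 0; i < output_h * output_w; ++i) {
    y[i] = padding_val;
  }
  for (int h = 0; h < input_h; ++h) {
    const float *x_row = x + h * input_w;
    int8_t *y_row = y + (h + paddings[0]) * output_w + paddings[1];
    for (int w = 0; w < input_w; ++w) {
      y_row[w] = static_cast<int8_t>(x_row[w] * scale);  // round toward zero
    }
  }
}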