Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
机器未来
Paddle
提交
09a07a23
P
Paddle
项目概览
机器未来
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
09a07a23
编写于
6月 19, 2019
作者:
S
Shixiaowei02
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
add calib kernel
上级
6f705068
变更
11
隐藏空白更改
内联
并排
Showing
11 changed file
with
440 addition
and
580 deletion
+440
-580
paddle/fluid/lite/arm/math/CMakeLists.txt
paddle/fluid/lite/arm/math/CMakeLists.txt
+1
-1
paddle/fluid/lite/arm/math/type_trans.cpp
paddle/fluid/lite/arm/math/type_trans.cpp
+0
-579
paddle/fluid/lite/kernels/arm/CMakeLists.txt
paddle/fluid/lite/kernels/arm/CMakeLists.txt
+2
-0
paddle/fluid/lite/kernels/arm/calib_compute.cc
paddle/fluid/lite/kernels/arm/calib_compute.cc
+57
-0
paddle/fluid/lite/kernels/arm/calib_compute.h
paddle/fluid/lite/kernels/arm/calib_compute.h
+40
-0
paddle/fluid/lite/kernels/arm/calib_compute_test.cc
paddle/fluid/lite/kernels/arm/calib_compute_test.cc
+149
-0
paddle/fluid/lite/operators/CMakeLists.txt
paddle/fluid/lite/operators/CMakeLists.txt
+3
-0
paddle/fluid/lite/operators/calib_op.cc
paddle/fluid/lite/operators/calib_op.cc
+56
-0
paddle/fluid/lite/operators/calib_op.h
paddle/fluid/lite/operators/calib_op.h
+60
-0
paddle/fluid/lite/operators/calib_op_test.cc
paddle/fluid/lite/operators/calib_op_test.cc
+64
-0
paddle/fluid/lite/operators/op_params.h
paddle/fluid/lite/operators/op_params.h
+8
-0
未找到文件。
paddle/fluid/lite/arm/math/CMakeLists.txt
浏览文件 @
09a07a23
...
...
@@ -16,7 +16,7 @@ cc_library(math_arm SRCS
elementwise.cc
concat.cc
sgemv.cc
type_trans.c
pp
type_trans.c
c
conv_impl.cc
conv_direct_3x3s1.cc
conv_direct_3x3s2.cc
...
...
paddle/fluid/lite/arm/math/type_trans.cpp
已删除
100644 → 0
浏览文件 @
6f705068
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/lite/arm/math/type_trans.h"
#include <arm_neon.h>
#include <string.h>
#include "paddle/fluid/lite/arm/math/saturate.h"
namespace
paddle
{
namespace
lite
{
namespace
arm
{
namespace
math
{
template
<
typename
dtype
>
void
int32_to_dtype
(
const
int
*
din
,
dtype
*
dout
,
const
float
*
scale
,
int
axis_size
,
int64_t
outer_size
,
int64_t
inner_size
);
void
fp32_to_int8
(
const
float
*
din
,
signed
char
*
dout
,
const
float
*
scale
,
int
axis_size
,
int64_t
outer_size
,
int64_t
inner_size
)
{
int
cnt
=
inner_size
/
16
;
int
remain
=
inner_size
&
15
;
int64_t
loop_size
=
outer_size
*
axis_size
;
#pragma omp parallel for
for
(
int
j
=
0
;
j
<
loop_size
;
++
j
)
{
float
inv_scale
=
1.
f
/
scale
[
j
%
axis_size
];
float32x4_t
vzero
=
vdupq_n_f32
(
0.
f
);
float32x4_t
vscale
=
vdupq_n_f32
(
inv_scale
);
float32x4_t
vpoff
=
vdupq_n_f32
(
0.5
f
);
float32x4_t
vnoff
=
vdupq_n_f32
(
-
0.5
f
);
const
float
*
din_c
=
din
+
j
*
inner_size
;
signed
char
*
dout_c
=
dout
+
j
*
inner_size
;
if
(
cnt
>
0
)
{
int
cnt_loop
=
cnt
;
const
float
*
din_ptr
=
din_c
;
signed
char
*
dout_ptr
=
dout_c
;
#ifdef __aarch64__
asm
volatile
(
"ldp q0, q1, [%[in]], #32
\n
"
"ldp q2, q3, [%[in]], #32
\n
"
"0:
\n
"
/* main loop */
"fmul v4.4s, v0.4s, %[scale].4s
\n
"
"fmul v5.4s, v1.4s, %[scale].4s
\n
"
"fmul v6.4s, v2.4s, %[scale].4s
\n
"
"fmul v7.4s, v3.4s, %[scale].4s
\n
"
"ldp q0, q1, [%[in]], #32
\n
"
"subs %[cnt], %[cnt], #1
\n
"
"FCVTAS v8.4s, v4.4s
\n
"
"FCVTAS v9.4s, v5.4s
\n
"
"FCVTAS v10.4s, v6.4s
\n
"
"FCVTAS v11.4s, v7.4s
\n
"
"ldp q2, q3, [%[in]], #32
\n
"
"sqxtn v4.4h, v8.4s
\n
"
"sqxtn2 v4.8h, v9.4s
\n
"
"sqxtn v5.4h, v10.4s
\n
"
"sqxtn2 v5.8h, v11.4s
\n
"
"sqxtn v8.8b, v4.8h
\n
"
"sqxtn2 v8.16b, v5.8h
\n
"
"str q8, [%[out]], #16
\n
"
"bne 0b
\n
"
:
[
in
]
"+r"
(
din_ptr
),
[
out
]
"+r"
(
dout_ptr
),
[
cnt
]
"+r"
(
cnt_loop
)
:
[
scale
]
"w"
(
vscale
)
:
"v0"
,
"v1"
,
"v2"
,
"v3"
,
"v4"
,
"v5"
,
"v6"
,
"v7"
,
"v8"
,
"v9"
,
"v10"
,
"v11"
);
#else
asm
volatile
(
"vld1.32 {d0-d3}, [%[din]]! @ load in0~in7
\n
"
"vld1.32 {d4-d7}, [%[din]]! @ load in8~in16
\n
"
"0: @ main loop
\n
"
"vand.i32 q4, %q[vpoff], %q[vpoff] @ set offset, 0.5
\n
"
"vand.i32 q5, q4, q4 @ set offset, 0.5
\n
"
"vand.i32 q6, q4, q4 @ set offset, 0.5
\n
"
"vand.i32 q7, q4, q4 @ set offset, 0.5
\n
"
"vcgt.f32 q8, q0, %q[vzero] @ get mask > 0, in0
\n
"
"vcgt.f32 q9, q1, %q[vzero] @ get mask > 0, in1
\n
"
"vcgt.f32 q10, q2, %q[vzero] @ get mask > 0, in2
\n
"
"vcgt.f32 q11, q3, %q[vzero] @ get mask > 0, in3
\n
"
"vbif.f32 q4, %q[vnoff], q8 @ get right offset
\n
"
"vbif.f32 q5, %q[vnoff], q9 @ get right offset
\n
"
"vbif.f32 q6, %q[vnoff], q10 @ get right offset
\n
"
"vbif.f32 q7, %q[vnoff], q11 @ get right offset
\n
"
"vmla.f32 q4, q0, %q[vscale] @ mul scale
\n
"
"vmla.f32 q5, q1, %q[vscale] @ mul scale
\n
"
"vmla.f32 q6, q2, %q[vscale] @ mul scale
\n
"
"vmla.f32 q7, q3, %q[vscale] @ mul scale
\n
"
"vcvt.s32.f32 q0, q4 @ cvt to int32
\n
"
"vcvt.s32.f32 q1, q5 @ cvt to int32
\n
"
"vcvt.s32.f32 q2, q6 @ cvt to int32
\n
"
"vcvt.s32.f32 q3, q7 @ cvt to int32
\n
"
"vqmovn.s32 d8, q0 @ cnt to int16
\n
"
"vqmovn.s32 d9, q1 @ cnt to int16
\n
"
"vqmovn.s32 d10, q2 @ cnt to int16
\n
"
"vqmovn.s32 d11, q3 @ cnt to int16
\n
"
"vld1.32 {d0-d3}, [%[din]]! @ load in0~in7
\n
"
"vqmovn.s16 d12, q4 @ cnt to int8
\n
"
"vqmovn.s16 d13, q5 @ cnt to int8
\n
"
"vld1.32 {d4-d7}, [%[din]]! @ load in8~in16
\n
"
"vst1.32 {d12-d13}, [%[dout]]! @ write to output
\n
"
"subs %[cnt], #1 @ loop count -1
\n
"
"bne 0b @ to main loop
\n
"
:
[
dout
]
"+r"
(
dout_ptr
),
[
din
]
"+r"
(
din_ptr
),
[
cnt
]
"+r"
(
cnt_loop
)
:
[
vscale
]
"w"
(
vscale
),
[
vpoff
]
"w"
(
vpoff
),
[
vnoff
]
"w"
(
vnoff
),
[
vzero
]
"w"
(
vzero
)
:
"q0"
,
"q1"
,
"q2"
,
"q3"
,
"q4"
,
"q5"
,
"q6"
,
"q7"
,
"q8"
,
"q9"
,
"q10"
,
"q11"
);
#endif
}
const
float
*
din_r
=
din_c
+
16
*
cnt
;
signed
char
*
dout_r
=
dout_c
+
16
*
cnt
;
for
(
int
i
=
0
;
i
<
remain
;
++
i
)
{
dout_r
[
i
]
=
saturate_cast
<
int8_t
>
(
roundf
(
inv_scale
*
din_r
[
i
]));
}
}
}
void
fp32_to_int16
(
const
float
*
din
,
int16_t
*
dout
,
const
float
*
scale
,
int
axis_size
,
int64_t
outer_size
,
int64_t
inner_size
)
{
int
cnt
=
inner_size
/
8
;
int
remain
=
inner_size
&
7
;
int64_t
loop_size
=
outer_size
*
axis_size
;
#pragma omp parallel for
for
(
int
j
=
0
;
j
<
loop_size
;
++
j
)
{
float
inv_scale
=
1.
f
/
scale
[
j
%
axis_size
];
float32x4_t
vzero
=
vdupq_n_f32
(
0.
f
);
float32x4_t
vscale
=
vdupq_n_f32
(
inv_scale
);
float32x4_t
vpoff
=
vdupq_n_f32
(
0.5
f
);
float32x4_t
vnoff
=
vdupq_n_f32
(
-
0.5
f
);
const
float
*
din_c
=
din
+
j
*
inner_size
;
int16_t
*
dout_c
=
dout
+
j
*
inner_size
;
if
(
cnt
>
0
)
{
int
cnt_loop
=
cnt
;
const
float
*
din_ptr
=
din_c
;
int16_t
*
dout_ptr
=
dout_c
;
#ifdef __aarch64__
asm
volatile
(
"ldp q0, q1, [%[in]], #32
\n
"
"0:
\n
"
/* main loop */
"fmul v4.4s, v0.4s, %[scale].4s
\n
"
"fmul v5.4s, v1.4s, %[scale].4s
\n
"
"ldp q0, q1, [%[in]], #32
\n
"
"subs %[cnt], %[cnt], #1
\n
"
"FCVTAS v8.4s, v4.4s
\n
"
"FCVTAS v9.4s, v5.4s
\n
"
"sqxtn v4.4h, v8.4s
\n
"
"sqxtn2 v4.8h, v9.4s
\n
"
"str q4, [%[out]], #16
\n
"
"bne 0b
\n
"
:
[
in
]
"+r"
(
din_ptr
),
[
out
]
"+r"
(
dout_ptr
),
[
cnt
]
"+r"
(
cnt_loop
)
:
[
scale
]
"w"
(
vscale
)
:
"v0"
,
"v1"
,
"v4"
,
"v5"
,
"v8"
,
"v9"
);
#else
asm
volatile
(
"vld1.32 {d0-d3}, [%[din]]! @ load in0~in7
\n
"
"0: @ main loop
\n
"
"vand.i32 q4, %q[vpoff], %q[vpoff] @ set offset, 0.5
\n
"
"vand.i32 q5, q4, q4 @ set offset, 0.5
\n
"
"vand.i32 q6, q4, q4 @ set offset, 0.5
\n
"
"vand.i32 q7, q4, q4 @ set offset, 0.5
\n
"
"vcgt.f32 q8, q0, %q[vzero] @ get mask > 0, in0
\n
"
"vcgt.f32 q9, q1, %q[vzero] @ get mask > 0, in1
\n
"
"vbif.f32 q4, %q[vnoff], q8 @ get right offset
\n
"
"vbif.f32 q5, %q[vnoff], q9 @ get right offset
\n
"
"vmla.f32 q4, q0, %q[vscale] @ mul scale
\n
"
"vmla.f32 q5, q1, %q[vscale] @ mul scale
\n
"
"vcvt.s32.f32 q0, q4 @ cvt to int32
\n
"
"vcvt.s32.f32 q1, q5 @ cvt to int32
\n
"
"vqmovn.s32 d8, q0 @ cnt to int16
\n
"
"vqmovn.s32 d9, q1 @ cnt to int16
\n
"
"vld1.32 {d0-d3}, [%[din]]! @ load in0~in7
\n
"
"vst1.32 {d8-d9}, [%[dout]]! @ write to output
\n
"
"subs %[cnt], #1 @ loop count -1
\n
"
"bne 0b @ to main loop
\n
"
:
[
dout
]
"+r"
(
dout_ptr
),
[
din
]
"+r"
(
din_ptr
),
[
cnt
]
"+r"
(
cnt_loop
)
:
[
vscale
]
"w"
(
vscale
),
[
vpoff
]
"w"
(
vpoff
),
[
vnoff
]
"w"
(
vnoff
),
[
vzero
]
"w"
(
vzero
)
:
"q0"
,
"q1"
,
"q4"
,
"q5"
,
"q6"
,
"q7"
,
"q8"
,
"q9"
);
#endif
}
const
float
*
din_r
=
din_c
+
8
*
cnt
;
int16_t
*
dout_r
=
dout_c
+
8
*
cnt
;
for
(
int
i
=
0
;
i
<
remain
;
++
i
)
{
dout_r
[
i
]
=
saturate_cast
<
int16_t
>
(
roundf
(
inv_scale
*
din_r
[
i
]));
}
}
}
void
int8_to_fp32
(
const
signed
char
*
in
,
float
*
out
,
const
float
*
scale
,
int
axis_size
,
int64_t
outer_size
,
int64_t
inner_size
)
{
int
cnt
=
inner_size
/
16
;
int
remain
=
inner_size
&
15
;
int64_t
loop_size
=
axis_size
*
outer_size
;
#pragma omp parallel for
for
(
int64_t
n
=
0
;
n
<
loop_size
;
++
n
)
{
float
in_scale
=
scale
[
n
%
axis_size
];
const
signed
char
*
din_c
=
in
+
n
*
inner_size
;
float
*
dout_c
=
out
+
n
*
inner_size
;
float32x4_t
vscale
=
vdupq_n_f32
(
in_scale
);
if
(
cnt
>
0
)
{
int
loop
=
cnt
;
const
signed
char
*
din_ptr
=
din_c
;
float
*
dout_ptr
=
dout_c
;
#ifdef __aarch64__
asm
volatile
(
"ldp d0, d1, [%[in]], #16
\n
"
/* load 16 int8*/
"0:
\n
"
/* main loop */
"sshll v2.8h, v0.8b, #0
\n
"
/* trans to int16*/
"sshll v3.8h, v1.8b, #0
\n
"
/* trans to int16*/
"sshll v4.4s, v2.4h, #0
\n
"
/* trans to int32*/
"sshll2 v5.4s, v2.8h, #0
\n
"
/* trans to int32*/
"sshll v6.4s, v3.4h, #0
\n
"
/* trans to int32*/
"sshll2 v7.4s, v3.8h, #0
\n
"
/* trans to int32*/
"ldp d0, d1, [%[in]], #16
\n
"
/* load 16 int8*/
"scvtf v8.4s, v4.4s
\n
"
/* trans to fp32*/
"scvtf v9.4s, v5.4s
\n
"
/* trans to fp32*/
"scvtf v10.4s, v6.4s
\n
"
/* trans to fp32*/
"scvtf v11.4s, v7.4s
\n
"
/* trans to fp32*/
"subs %[loop], %[loop], #1
\n
"
"fmul v4.4s, v8.4s, %[scale].4s
\n
"
/* mul with scale*/
"fmul v5.4s, v9.4s, %[scale].4s
\n
"
/* mul with scale*/
"fmul v6.4s, v10.4s, %[scale].4s
\n
"
/* mul with scale*/
"fmul v7.4s, v11.4s, %[scale].4s
\n
"
/* mul with scale*/
"stp q4, q5, [%[out]], #32
\n
"
/* write to memory*/
"stp q6, q7, [%[out]], #32
\n
"
/* write to memory*/
"bne 0b
\n
"
:
[
loop
]
"+r"
(
loop
),
[
in
]
"+r"
(
din_ptr
),
[
out
]
"+r"
(
dout_ptr
)
:
[
scale
]
"w"
(
vscale
)
:
"v0"
,
"v1"
,
"v2"
,
"v3"
,
"v4"
,
"v5"
,
"v6"
,
"v7"
,
"v8"
,
"v9"
,
"v10"
,
"v11"
);
#else
asm
volatile
(
"vld1.32 {d0-d1}, [%[in]]! @ load 16 int8
\n
"
"0: @ main loop
\n
"
"vmovl.s8 q2, d0 @ trans to int16
\n
"
"vmovl.s8 q3, d1 @ trans to int16
\n
"
"vmovl.s16 q4, d4 @ trans to int32
\n
"
"vmovl.s16 q5, d5 @ trans to int32
\n
"
"vmovl.s16 q6, d6 @ trans to int32
\n
"
"vmovl.s16 q7, d7 @ trans to int32
\n
"
"vcvt.f32.s32 q0, q4 @ trans to fp32
\n
"
"vcvt.f32.s32 q1, q5 @ trans to fp32
\n
"
"vcvt.f32.s32 q2, q6 @ trans to fp32
\n
"
"vcvt.f32.s32 q3, q7 @ trans to fp32
\n
"
"vmul.f32 q4, q0, %q[scale] @ mul with scale
\n
"
"vmul.f32 q5, q1, %q[scale] @ mul with scale
\n
"
"vmul.f32 q6, q2, %q[scale] @ mul with scale
\n
"
"vmul.f32 q7, q3, %q[scale] @ mul with scale
\n
"
"vld1.32 {d0-d1}, [%[in]]! @ load 16 int8
\n
"
"subs %[loop], #1
\n
"
"vst1.f32 {d8-d11}, [%[out]]! @ write to memory
\n
"
"vst1.f32 {d12-d15}, [%[out]]! @ write to memory
\n
"
"bne 0b
\n
"
:
[
loop
]
"+r"
(
loop
),
[
in
]
"+r"
(
din_ptr
),
[
out
]
"+r"
(
dout_ptr
)
:
[
scale
]
"w"
(
vscale
)
:
"q0"
,
"q1"
,
"q2"
,
"q3"
,
"q4"
,
"q5"
,
"q6"
,
"q7"
);
#endif // __aarch64__
}
const
signed
char
*
din_r
=
din_c
+
16
*
cnt
;
float
*
dout_r
=
dout_c
+
16
*
cnt
;
for
(
int
i
=
0
;
i
<
remain
;
++
i
)
{
dout_r
[
i
]
=
in_scale
*
din_r
[
i
];
}
}
}
void
int16_to_fp32
(
const
int16_t
*
in
,
float
*
out
,
const
float
*
scale
,
int
axis_size
,
int64_t
outer_size
,
int64_t
inner_size
)
{
int
cnt
=
inner_size
/
16
;
int
remain
=
inner_size
&
15
;
int64_t
loop_size
=
axis_size
*
outer_size
;
#pragma omp parallel for
for
(
int64_t
n
=
0
;
n
<
loop_size
;
++
n
)
{
float
in_scale
=
scale
[
n
%
axis_size
];
const
int16_t
*
din_c
=
in
+
n
*
inner_size
;
float
*
dout_c
=
out
+
n
*
inner_size
;
float32x4_t
vscale
=
vdupq_n_f32
(
in_scale
);
if
(
cnt
>
0
)
{
int
loop
=
cnt
;
const
int16_t
*
din_ptr
=
din_c
;
float
*
dout_ptr
=
dout_c
;
#ifdef __aarch64__
asm
volatile
(
"ldp q0, q1, [%[in]], #32
\n
"
/* load 16 int16*/
"0:
\n
"
/* main loop */
"sshll v4.4s, v0.4h, #0
\n
"
/* trans to int32*/
"sshll2 v5.4s, v0.8h, #0
\n
"
/* trans to int32*/
"sshll v6.4s, v1.4h, #0
\n
"
/* trans to int32*/
"sshll2 v7.4s, v1.8h, #0
\n
"
/* trans to int32*/
"ldp q0, q1, [%[in]], #32
\n
"
/* load 16 int16*/
"scvtf v8.4s, v4.4s
\n
"
/* trans to fp32*/
"scvtf v9.4s, v5.4s
\n
"
/* trans to fp32*/
"scvtf v10.4s, v6.4s
\n
"
/* trans to fp32*/
"scvtf v11.4s, v7.4s
\n
"
/* trans to fp32*/
"subs %[loop], %[loop], #1
\n
"
"fmul v4.4s, v8.4s, %[scale].4s
\n
"
/* mul with scale*/
"fmul v5.4s, v9.4s, %[scale].4s
\n
"
/* mul with scale*/
"fmul v6.4s, v10.4s, %[scale].4s
\n
"
/* mul with scale*/
"fmul v7.4s, v11.4s, %[scale].4s
\n
"
/* mul with scale*/
"stp q4, q5, [%[out]], #32
\n
"
/* write to memory*/
"stp q6, q7, [%[out]], #32
\n
"
/* write to memory*/
"bne 0b
\n
"
:
[
loop
]
"+r"
(
loop
),
[
in
]
"+r"
(
din_ptr
),
[
out
]
"+r"
(
dout_ptr
)
:
[
scale
]
"w"
(
vscale
)
:
"v0"
,
"v1"
,
"v4"
,
"v5"
,
"v6"
,
"v7"
,
"v8"
,
"v9"
,
"v10"
,
"v11"
);
#else
asm
volatile
(
"vld1.32 {d0-d3}, [%[in]]! @ load 16 int16
\n
"
"0: @ main loop
\n
"
"vmovl.s16 q4, d0 @ trans to int32
\n
"
"vmovl.s16 q5, d1 @ trans to int32
\n
"
"vmovl.s16 q6, d2 @ trans to int32
\n
"
"vmovl.s16 q7, d3 @ trans to int32
\n
"
"vcvt.f32.s32 q0, q4 @ trans to fp32
\n
"
"vcvt.f32.s32 q1, q5 @ trans to fp32
\n
"
"vcvt.f32.s32 q2, q6 @ trans to fp32
\n
"
"vcvt.f32.s32 q3, q7 @ trans to fp32
\n
"
"vmul.f32 q4, q0, %q[scale] @ mul with scale
\n
"
"vmul.f32 q5, q1, %q[scale] @ mul with scale
\n
"
"vmul.f32 q6, q2, %q[scale] @ mul with scale
\n
"
"vmul.f32 q7, q3, %q[scale] @ mul with scale
\n
"
"vld1.32 {d0-d3}, [%[in]]! @ load 16 int8
\n
"
"subs %[loop], #1
\n
"
"vst1.f32 {d8-d11}, [%[out]]! @ write to memory
\n
"
"vst1.f32 {d12-d15}, [%[out]]! @ write to memory
\n
"
"bne 0b
\n
"
:
[
loop
]
"+r"
(
loop
),
[
in
]
"+r"
(
din_ptr
),
[
out
]
"+r"
(
dout_ptr
)
:
[
scale
]
"w"
(
vscale
)
:
"q0"
,
"q1"
,
"q2"
,
"q3"
,
"q4"
,
"q5"
,
"q6"
,
"q7"
);
#endif // __aarch64__
}
const
int16_t
*
din_r
=
din_c
+
16
*
cnt
;
float
*
dout_r
=
dout_c
+
16
*
cnt
;
for
(
int
i
=
0
;
i
<
remain
;
++
i
)
{
dout_r
[
i
]
=
in_scale
*
din_r
[
i
];
}
}
}
void
int32_to_fp32
(
const
int
*
din
,
float
*
dout
,
const
float
*
scale
,
int
axis_size
,
int64_t
outer_size
,
int64_t
inner_size
)
{
int
cnt
=
inner_size
/
16
;
int
remain
=
inner_size
&
15
;
int64_t
loop_size
=
axis_size
*
outer_size
;
#pragma omp parallel for
for
(
int64_t
n
=
0
;
n
<
loop_size
;
++
n
)
{
float
in_scale
=
scale
[
n
%
axis_size
];
const
int
*
din_c
=
din
+
n
*
inner_size
;
float
*
dout_c
=
dout
+
n
*
inner_size
;
float32x4_t
vscale
=
vdupq_n_f32
(
in_scale
);
if
(
cnt
>
0
)
{
int
loop
=
cnt
;
const
int
*
din_ptr
=
din_c
;
float
*
dout_ptr
=
dout_c
;
#ifdef __aarch64__
asm
volatile
(
"ldp q0, q1, [%[in]], #32
\n
"
"ldp q2, q3, [%[in]], #32
\n
"
"0:
\n
"
"scvtf v4.4s, v0.4s
\n
"
"scvtf v5.4s, v1.4s
\n
"
"scvtf v6.4s, v2.4s
\n
"
"scvtf v7.4s, v3.4s
\n
"
"ldp q0, q1, [%[in]], #32
\n
"
"fmul v8.4s, v4.4s, %[scale].4s
\n
"
"fmul v9.4s, v5.4s, %[scale].4s
\n
"
"fmul v10.4s, v6.4s, %[scale].4s
\n
"
"fmul v11.4s, v7.4s, %[scale].4s
\n
"
"ldp q2, q3, [%[in]], #32
\n
"
"stp q8, q9, [%[out]], #32
\n
"
"stp q10, q11, [%[out]], #32
\n
"
"subs %[loop], %[loop], #1
\n
"
"bne 0b
\n
"
:
[
loop
]
"+r"
(
loop
),
[
in
]
"+r"
(
din_ptr
),
[
out
]
"+r"
(
dout_ptr
)
:
[
scale
]
"w"
(
vscale
)
:
"v0"
,
"v1"
,
"v2"
,
"v3"
,
"v4"
,
"v5"
,
"v6"
,
"v7"
,
"v8"
,
"v9"
,
"v10"
,
"v11"
);
#else
asm
volatile
(
"vld1.s32 {d0-d3}, [%[in]]!
\n
"
"vld1.s32 {d4-d7}, [%[in]]!
\n
"
"0:
\n
"
"vcvt.f32.s32 q4, q0
\n
"
"vcvt.f32.s32 q5, q1
\n
"
"vcvt.f32.s32 q6, q2
\n
"
"vcvt.f32.s32 q7, q3
\n
"
"vld1.s32 {d0-d3}, [%[in]]!
\n
"
"vmul.f32 q8, q4, %q[scale]
\n
"
"vmul.f32 q9, q5, %q[scale]
\n
"
"vmul.f32 q10, q6, %q[scale]
\n
"
"vmul.f32 q11, q7, %q[scale]
\n
"
"vld1.s32 {d4-d7}, [%[in]]!
\n
"
"subs %[loop], #1
\n
"
"vst1.f32 {d16-d19}, [%[out]]!
\n
"
"vst1.f32 {d20-d23}, [%[out]]!
\n
"
"bne 0b
\n
"
:
[
loop
]
"+r"
(
loop
),
[
in
]
"+r"
(
din_ptr
),
[
out
]
"+r"
(
dout_ptr
)
:
[
scale
]
"w"
(
vscale
)
:
"q0"
,
"q1"
,
"q2"
,
"q3"
,
"q4"
,
"q5"
,
"q6"
,
"q7"
,
"q8"
,
"q9"
,
"q10"
,
"q11"
);
#endif // __aarch64__
}
const
int
*
din_r
=
din_c
+
16
*
cnt
;
float
*
dout_r
=
dout_c
+
16
*
cnt
;
for
(
int
i
=
0
;
i
<
remain
;
++
i
)
{
dout_r
[
i
]
=
in_scale
*
din_r
[
i
];
}
}
}
void
int32_to_int8
(
const
int
*
din
,
signed
char
*
dout
,
const
float
*
scale
,
int
axis_size
,
int64_t
outer_size
,
int64_t
inner_size
)
{
int
cnt
=
inner_size
/
16
;
int
remain
=
inner_size
&
15
;
int64_t
loop_size
=
outer_size
*
axis_size
;
#pragma omp parallel for
for
(
int64_t
n
=
0
;
n
<
loop_size
;
++
n
)
{
float
in_scale
=
scale
[
n
%
axis_size
];
const
int
*
din_c
=
din
+
n
*
inner_size
;
signed
char
*
dout_c
=
dout
+
n
*
inner_size
;
float32x4_t
vscale
=
vdupq_n_f32
(
in_scale
);
float32x4_t
vzero
=
vdupq_n_f32
(
0.
f
);
float32x4_t
vpoff
=
vdupq_n_f32
(
0.5
f
);
float32x4_t
vnoff
=
vdupq_n_f32
(
-
0.5
f
);
if
(
cnt
>
0
)
{
int
loop
=
cnt
;
const
int
*
din_ptr
=
din_c
;
signed
char
*
dout_ptr
=
dout_c
;
#ifdef __aarch64__
asm
volatile
(
"0:
\n
"
"ld1 {v0.4s, v1.4s}, [%[in]], #32
\n
"
"ld1 {v2.4s, v3.4s}, [%[in]], #32
\n
"
"scvtf v4.4s, v0.4s
\n
"
"scvtf v5.4s, v1.4s
\n
"
"scvtf v6.4s, v2.4s
\n
"
"scvtf v7.4s, v3.4s
\n
"
"fmul v0.4s, v4.4s, %[scale].4s
\n
"
"fmul v1.4s, v5.4s, %[scale].4s
\n
"
"fmul v2.4s, v6.4s, %[scale].4s
\n
"
"fmul v3.4s, v7.4s, %[scale].4s
\n
"
"fcvtas v4.4s, v0.4s
\n
"
"fcvtas v5.4s, v1.4s
\n
"
"fcvtas v6.4s, v2.4s
\n
"
"fcvtas v7.4s, v3.4s
\n
"
"sqxtn v0.4h, v4.4s
\n
"
"sqxtn2 v0.8h, v5.4s
\n
"
"sqxtn v1.4h, v6.4s
\n
"
"sqxtn2 v1.8h, v7.4s
\n
"
"sqxtn v2.8b, v0.8h
\n
"
"sqxtn2 v2.16b, v1.8h
\n
"
"st1 {v2.16b}, [%[out]], #16
\n
"
"subs %[loop], %[loop], #1
\n
"
"bne 0b
\n
"
:
[
loop
]
"+r"
(
loop
),
[
in
]
"+r"
(
din_ptr
),
[
out
]
"+r"
(
dout_ptr
)
:
[
scale
]
"w"
(
vscale
)
:
"v0"
,
"v1"
,
"v2"
,
"v3"
,
"v4"
,
"v5"
,
"v6"
,
"v7"
);
#else
asm
volatile
(
"vld1.32 {d0-d3}, [%[din]]! @ load in0~in7
\n
"
"vld1.32 {d4-d7}, [%[din]]! @ load in8~in16
\n
"
"0: @ main loop
\n
"
"vcvt.f32.s32 q4, q0 @ cvt to float
\n
"
"vcvt.f32.s32 q5, q1 @ cvt to float
\n
"
"vcvt.f32.s32 q6, q2 @ cvt to float
\n
"
"vcvt.f32.s32 q7, q3 @ cvt to float
\n
"
"vand.i32 q0, %q[vpoff], %q[vpoff] @ set offset, 0.5
\n
"
"vand.i32 q1, q0, q0 @ set offset, 0.5
\n
"
"vand.i32 q2, q0, q0 @ set offset, 0.5
\n
"
"vand.i32 q3, q0, q0 @ set offset, 0.5
\n
"
"vcgt.f32 q8, q4, %q[vzero] @ get mask > 0, in0
\n
"
"vcgt.f32 q9, q5, %q[vzero] @ get mask > 0, in1
\n
"
"vcgt.f32 q10, q6, %q[vzero] @ get mask > 0, in2
\n
"
"vcgt.f32 q11, q7, %q[vzero] @ get mask > 0, in3
\n
"
"vbif.f32 q0, %q[vnoff], q8 @ get right offset
\n
"
"vbif.f32 q1, %q[vnoff], q9 @ get right offset
\n
"
"vbif.f32 q2, %q[vnoff], q10 @ get right offset
\n
"
"vbif.f32 q3, %q[vnoff], q11 @ get right offset
\n
"
"vmla.f32 q0, q4, %q[vscale] @ mul scale
\n
"
"vmla.f32 q1, q5, %q[vscale] @ mul scale
\n
"
"vmla.f32 q2, q6, %q[vscale] @ mul scale
\n
"
"vmla.f32 q3, q7, %q[vscale] @ mul scale
\n
"
"vcvt.s32.f32 q4, q0 @ cvt to int32
\n
"
"vcvt.s32.f32 q5, q1 @ cvt to int32
\n
"
"vcvt.s32.f32 q6, q2 @ cvt to int32
\n
"
"vcvt.s32.f32 q7, q3 @ cvt to int32
\n
"
"vqmovn.s32 d16, q4 @ cnt to int16
\n
"
"vqmovn.s32 d17, q5 @ cnt to int16
\n
"
"vqmovn.s32 d18, q6 @ cnt to int16
\n
"
"vqmovn.s32 d19, q7 @ cnt to int16
\n
"
"vld1.32 {d0-d3}, [%[din]]! @ load in0~in7
\n
"
"vqmovn.s16 d8, q8 @ cnt to int8
\n
"
"vqmovn.s16 d9, q9 @ cnt to int8
\n
"
"vld1.32 {d4-d7}, [%[din]]! @ load in8~in16
\n
"
"vst1.32 {d8-d9}, [%[dout]]! @ write to output
\n
"
"subs %[loop], #1 @ loop count -1
\n
"
"bne 0b @ to main loop
\n
"
:
[
loop
]
"+r"
(
loop
),
[
din
]
"+r"
(
din_ptr
),
[
dout
]
"+r"
(
dout_ptr
)
:
[
vscale
]
"w"
(
vscale
),
[
vzero
]
"w"
(
vzero
),
[
vnoff
]
"w"
(
vnoff
),
[
vpoff
]
"w"
(
vpoff
)
:
"q0"
,
"q1"
,
"q2"
,
"q3"
,
"q4"
,
"q5"
,
"q6"
,
"q7"
,
"q8"
,
"q9"
,
"q10"
,
"q11"
);
#endif // __aarch64__
}
const
int
*
din_r
=
din_c
+
16
*
cnt
;
int8_t
*
dout_r
=
dout_c
+
16
*
cnt
;
for
(
int
i
=
0
;
i
<
remain
;
++
i
)
{
dout_r
[
i
]
=
saturate_cast
<
int8_t
>
(
roundf
(
in_scale
*
din_r
[
i
]));
}
}
}
void
int32_to_int32
(
const
int
*
din
,
int
*
dout
,
const
float
*
scale
,
int
axis_size
,
int64_t
outer_size
,
int64_t
inner_size
)
{
int
size_all
=
outer_size
*
axis_size
*
inner_size
;
memmove
(
dout
,
din
,
size_all
*
sizeof
(
int
));
}
template
<
>
void
int32_to_dtype
(
const
int
*
din
,
float
*
dout
,
const
float
*
scale
,
int
axis_size
,
int64_t
outer_size
,
int64_t
inner_size
)
{
return
int32_to_fp32
(
din
,
dout
,
scale
,
axis_size
,
outer_size
,
inner_size
);
}
template
<
>
void
int32_to_dtype
(
const
int
*
din
,
signed
char
*
dout
,
const
float
*
scale
,
int
axis_size
,
int64_t
outer_size
,
int64_t
inner_size
)
{
return
int32_to_int8
(
din
,
dout
,
scale
,
axis_size
,
outer_size
,
inner_size
);
}
template
<
>
void
int32_to_dtype
(
const
int
*
din
,
int
*
dout
,
const
float
*
scale
,
int
axis_size
,
int64_t
outer_size
,
int64_t
inner_size
)
{
return
int32_to_int32
(
din
,
dout
,
scale
,
axis_size
,
outer_size
,
inner_size
);
}
}
// namespace math
}
// namespace arm
}
// namespace lite
}
// namespace paddle
paddle/fluid/lite/kernels/arm/CMakeLists.txt
浏览文件 @
09a07a23
...
...
@@ -16,6 +16,7 @@ cc_library(pool_compute_arm SRCS pool_compute.cc DEPS ${lite_kernel_deps} math_a
cc_library
(
split_compute_arm SRCS split_compute.cc DEPS
${
lite_kernel_deps
}
math_arm
)
cc_library
(
concat_compute_arm SRCS concat_compute.cc DEPS
${
lite_kernel_deps
}
math_arm
)
cc_library
(
dropout_compute_arm SRCS dropout_compute.cc DEPS
${
lite_kernel_deps
}
math_arm
)
cc_library
(
calib_compute_arm SRCS calib_compute.cc DEPS
${
lite_kernel_deps
}
math_arm
)
cc_library
(
transpose_compute_arm SRCS transpose_compute.cc DEPS
${
lite_kernel_deps
}
math_arm
)
lite_cc_test
(
test_fc_compute_arm SRCS fc_compute_test.cc DEPS fc_compute_arm math_arm
)
...
...
@@ -30,6 +31,7 @@ lite_cc_test(test_mul_compute_arm SRCS mul_compute_test.cc DEPS mul_compute_arm)
lite_cc_test
(
test_split_compute_arm SRCS split_compute_test.cc DEPS split_compute_arm
)
lite_cc_test
(
test_concat_compute_arm SRCS concat_compute_test.cc DEPS concat_compute_arm
)
lite_cc_test
(
test_dropout_compute_arm SRCS dropout_compute_test.cc DEPS dropout_compute_arm
)
lite_cc_test
(
test_calib_compute_arm SRCS calib_compute_test.cc DEPS calib_compute_arm
)
lite_cc_test
(
test_transpose_compute_arm SRCS transpose_compute_test.cc DEPS transpose_compute_arm
)
set
(
arm_kernels
...
...
paddle/fluid/lite/kernels/arm/calib_compute.cc
0 → 100644
浏览文件 @
09a07a23
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/lite/kernels/arm/calib_compute.h"
#include <vector>
#include "paddle/fluid/lite/arm/math/type_trans.h"
#include "paddle/fluid/lite/core/op_registry.h"
#include "paddle/fluid/lite/core/type_system.h"
namespace
paddle
{
namespace
lite
{
namespace
kernels
{
namespace
arm
{
void
CalibCompute
::
Run
()
{
auto
&
param
=
this
->
Param
<
operators
::
CalibParam
>
();
std
::
vector
<
float
>
scale
=
{
param
.
in_scale
};
if
(
param
.
in_dtype
==
PRECISION
(
kFloat
)
&&
param
.
out_dtype
==
PRECISION
(
kInt8
))
{
const
auto
*
din
=
param
.
input
->
data
<
float
>
();
auto
*
dout
=
param
.
output
->
mutable_data
<
signed
char
>
();
lite
::
arm
::
math
::
fp32_to_int8
(
din
,
dout
,
scale
.
data
(),
1
,
1
,
param
.
input
->
numel
());
return
;
}
if
(
param
.
in_dtype
==
PRECISION
(
kInt8
)
&&
param
.
out_dtype
==
PRECISION
(
kFloat
))
{
const
auto
*
din
=
param
.
input
->
data
<
signed
char
>
();
auto
*
dout
=
param
.
output
->
mutable_data
<
float
>
();
lite
::
arm
::
math
::
int8_to_fp32
(
din
,
dout
,
scale
.
data
(),
1
,
1
,
param
.
input
->
numel
());
return
;
}
LOG
(
FATAL
)
<<
"Unsupport Dtype."
;
}
}
// namespace arm
}
// namespace kernels
}
// namespace lite
}
// namespace paddle
REGISTER_LITE_KERNEL
(
calib
,
kARM
,
kAny
,
kAny
,
paddle
::
lite
::
kernels
::
arm
::
CalibCompute
,
def
)
.
BindInput
(
"Input"
,
{
LiteType
::
GetTensorTy
(
TARGET
(
kARM
))})
.
BindOutput
(
"Out"
,
{
LiteType
::
GetTensorTy
(
TARGET
(
kARM
))})
.
Finalize
();
paddle/fluid/lite/kernels/arm/calib_compute.h
0 → 100644
浏览文件 @
09a07a23
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/lite/core/kernel.h"
#include "paddle/fluid/lite/operators/calib_op.h"
namespace
paddle
{
namespace
lite
{
namespace
kernels
{
namespace
arm
{
class
CalibCompute
:
public
KernelLite
<
TARGET
(
kARM
),
PRECISION
(
kAny
)
>
{
public:
using
param_t
=
operators
::
CalibParam
;
// void PrepareForRun() override;
void
Run
()
override
;
~
CalibCompute
()
override
{};
private:
};
}
// namespace arm
}
// namespace kernels
}
// namespace lite
}
// namespace paddle
paddle/fluid/lite/kernels/arm/calib_compute_test.cc
0 → 100644
浏览文件 @
09a07a23
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/lite/kernels/arm/calib_compute.h"
#include <gtest/gtest.h>
#include <algorithm>
#include <iostream>
#include <memory>
#include <random>
#include <utility>
#include <vector>
#include "paddle/fluid/lite/arm/math/funcs.h"
#include "paddle/fluid/lite/core/op_registry.h"
namespace
paddle
{
namespace
lite
{
namespace
kernels
{
namespace
arm
{
static
int
get_rand
(
int
start
,
int
end
)
{
int
i
=
rand
();
// NOLINT
i
=
(
i
%
(
end
-
start
))
+
start
;
return
i
;
}
static
void
int8_to_fp32_basic
(
const
int8_t
*
din
,
float
*
dout
,
const
float
*
scale
,
int
axis_size
,
int64_t
outer_size
,
int64_t
inner_size
)
{
int
loop_size
=
axis_size
*
outer_size
;
for
(
int
i
=
0
;
i
<
loop_size
;
++
i
)
{
float
scale_in
=
scale
[
i
%
axis_size
];
for
(
int
j
=
0
;
j
<
inner_size
;
++
j
)
{
dout
[
j
]
=
din
[
j
]
*
scale_in
;
}
dout
+=
inner_size
;
din
+=
inner_size
;
}
}
static
void
fp32_to_int8_basic
(
const
float
*
din
,
int8_t
*
dout
,
const
float
*
scale
,
int
axis_size
,
int64_t
outer_size
,
int64_t
inner_size
)
{
int
loop_size
=
axis_size
*
outer_size
;
for
(
int
i
=
0
;
i
<
loop_size
;
++
i
)
{
float
inv_scale
=
1.
f
/
scale
[
i
%
axis_size
];
for
(
int
j
=
0
;
j
<
inner_size
;
++
j
)
{
dout
[
j
]
=
static_cast
<
int8_t
>
(
roundf
(
din
[
j
]
*
inv_scale
));
}
dout
+=
inner_size
;
din
+=
inner_size
;
}
}
void
calib_ref
(
const
operators
::
CalibParam
&
param
)
{
std
::
vector
<
float
>
scale
=
{
param
.
in_scale
};
if
(
param
.
in_dtype
==
PRECISION
(
kFloat
)
&&
param
.
out_dtype
==
PRECISION
(
kInt8
))
{
const
auto
*
din
=
param
.
input
->
data
<
float
>
();
auto
*
dout
=
param
.
output
->
mutable_data
<
signed
char
>
();
fp32_to_int8_basic
(
din
,
dout
,
scale
.
data
(),
1
,
1
,
param
.
input
->
numel
());
return
;
}
if
(
param
.
in_dtype
==
PRECISION
(
kInt8
)
&&
param
.
out_dtype
==
PRECISION
(
kFloat
))
{
const
auto
*
din
=
param
.
input
->
data
<
signed
char
>
();
auto
*
dout
=
param
.
output
->
mutable_data
<
float
>
();
int8_to_fp32_basic
(
din
,
dout
,
scale
.
data
(),
1
,
1
,
param
.
input
->
numel
());
return
;
}
LOG
(
FATAL
)
<<
"Unsupport Dtype."
;
}
TEST
(
calib_arm
,
retrive_op
)
{
auto
calib
=
KernelRegistry
::
Global
()
.
Create
<
TARGET
(
kARM
),
PRECISION
(
kAny
),
DATALAYOUT
(
kAny
)
>
(
"calib"
);
ASSERT_FALSE
(
calib
.
empty
());
ASSERT_TRUE
(
calib
.
front
());
}
TEST
(
calib_arm
,
init
)
{
CalibCompute
calib
;
ASSERT_EQ
(
calib
.
precision
(),
PRECISION
(
kAny
));
ASSERT_EQ
(
calib
.
target
(),
TARGET
(
kARM
));
}
TEST
(
calib_arm
,
int8_to_fp32
)
{
DeviceInfo
::
Init
();
for
(
auto
n
:
{
1
,
2
})
{
for
(
auto
c
:
{
6
,
32
/*, 128*/
})
{
for
(
auto
h
:
{
9
,
18
/*, 56 , 112, 224, 512*/
})
{
for
(
auto
w
:
{
9
,
18
/*, 56, 112, 224, 512*/
})
{
Tensor
x
;
Tensor
output
;
Tensor
output_ref
;
// set the dims of input, output, ref output tensors
x
.
Resize
({
n
,
c
,
h
,
w
});
output
.
Resize
({
n
,
c
,
h
,
w
});
output_ref
.
Resize
({
n
,
c
,
h
,
w
});
// initialize the data of input tensors
auto
*
x_data
=
x
.
mutable_data
<
char
>
();
auto
*
output_data
=
output
.
mutable_data
<
float
>
();
for
(
int
i
=
0
;
i
<
x
.
dims
().
production
();
i
++
)
{
float
sign
=
i
%
3
==
0
?
-
1.0
f
:
1.0
f
;
x_data
[
i
]
=
sign
*
static_cast
<
float
>
(
i
%
128
)
*
0.013
f
;
}
// prepare kernel params and run
CalibCompute
calib
;
std
::
unique_ptr
<
KernelContext
>
ctx
(
new
KernelContext
);
ctx
->
As
<
ARMContext
>
();
calib
.
SetContext
(
std
::
move
(
ctx
));
operators
::
CalibParam
param
;
param
.
in_scale
=
get_rand
(
0
,
100
)
*
0.1
f
;
param
.
in_dtype
=
PRECISION
(
kInt8
);
param
.
out_dtype
=
PRECISION
(
kFloat
);
param
.
input
=
&
x
;
param
.
output
=
&
output
;
calib
.
SetParam
(
param
);
calib
.
Launch
();
// invoking ref implementation and compare results
param
.
output
=
&
output_ref
;
calib_ref
(
param
);
auto
*
output_ref_data
=
output_ref
.
mutable_data
<
float
>
();
for
(
int
i
=
0
;
i
<
output
.
dims
().
production
();
i
++
)
{
EXPECT_NEAR
(
output_data
[
i
],
output_ref_data
[
i
],
1e-5
);
}
}
}
}
}
}
}
// namespace arm
}
// namespace kernels
}
// namespace lite
}
// namespace paddle
USE_LITE_KERNEL
(
calib
,
kARM
,
kAny
,
kAny
,
def
);
paddle/fluid/lite/operators/CMakeLists.txt
浏览文件 @
09a07a23
...
...
@@ -21,6 +21,7 @@ cc_library(fill_constant_op_lite SRCS fill_constant_op.cc DEPS ${op_DEPS})
cc_library
(
op_params_lite SRCS op_params.cc DEPS
${
tensor_lite
}
any_lite framework_proto_lite
)
cc_library
(
dropout_op_lite SRCS dropout_op.cc DEPS
${
op_DEPS
}
)
cc_library
(
concat_op_lite SRCS concat_op.cc DEPS
${
op_DEPS
}
)
cc_library
(
calib_op_lite SRCS calib_op.cc DEPS
${
op_DEPS
}
)
cc_library
(
split_op_lite SRCS split_op.cc DEPS
${
op_DEPS
}
)
cc_library
(
transpose_op_lite SRCS transpose_op.cc DEPS
${
op_DEPS
}
)
...
...
@@ -44,6 +45,7 @@ set(ops_lite
activation_ops_lite
dropout_op_lite
concat_op_lite
calib_op_lite
split_op_lite
transpose_op_lite
PARENT_SCOPE
)
...
...
@@ -60,6 +62,7 @@ lite_cc_test(test_softmax_op_lite SRCS softmax_op_test.cc DEPS softmax_op_lite m
lite_cc_test
(
test_reshape_op_lite SRCS reshape_op_test.cc DEPS reshape_op_lite memory_lite
)
lite_cc_test
(
test_batch_norm_op_lite SRCS batch_norm_op_test.cc DEPS batch_norm_op_lite memory_lite
)
lite_cc_test
(
test_concat_op_lite SRCS concat_op_test.cc DEPS concat_op_lite memory_lite
)
lite_cc_test
(
test_calib_op_lite SRCS calib_op_test.cc DEPS calib_op_lite memory_lite ARM_DEPS calib_compute_arm
)
lite_cc_test
(
test_fusion_elementwise_activation_ops_lite
SRCS fusion_elementwise_activation_ops_test.cc
DEPS fusion_elementwise_activation_ops_lite memory_lite
)
...
...
paddle/fluid/lite/operators/calib_op.cc
0 → 100644
浏览文件 @
09a07a23
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/lite/operators/calib_op.h"
#include "paddle/fluid/lite/core/op_registry.h"
namespace
paddle
{
namespace
lite
{
namespace
operators
{
bool
CalibOpLite
::
CheckShape
()
const
{
CHECK_OR_FALSE
(
param_
.
input
);
CHECK_OR_FALSE
(
param_
.
output
);
return
true
;
}
bool
CalibOpLite
::
InferShape
()
const
{
param_
.
output
->
Resize
(
param_
.
input
->
dims
());
return
true
;
}
bool
CalibOpLite
::
AttachImpl
(
const
cpp
::
OpDesc
&
opdesc
,
lite
::
Scope
*
scope
)
{
auto
x_var
=
scope
->
FindVar
(
opdesc
.
Input
(
"Input"
).
front
());
auto
output_var
=
scope
->
FindVar
(
opdesc
.
Output
(
"Out"
).
front
());
CHECK
(
x_var
);
CHECK
(
output_var
);
param_
.
input
=
const_cast
<
lite
::
Tensor
*>
(
&
(
x_var
->
Get
<
lite
::
Tensor
>
()));
param_
.
output
=
output_var
->
GetMutable
<
lite
::
Tensor
>
();
std
::
vector
<
std
::
string
>
input_arg_names
=
opdesc
.
InputArgumentNames
();
param_
.
in_dtype
=
static_cast
<
lite
::
PrecisionType
>
(
opdesc
.
GetAttr
<
int
>
(
"in_dtype"
));
param_
.
out_dtype
=
static_cast
<
lite
::
PrecisionType
>
(
opdesc
.
GetAttr
<
int
>
(
"out_dtype"
));
if
(
opdesc
.
HasAttr
(
"in_scale"
))
{
param_
.
in_scale
=
opdesc
.
GetAttr
<
float
>
(
"in_scale"
);
}
CHECK
(
param_
.
input
)
<<
"Input(X) of CalibOp should not be null."
;
CHECK
(
param_
.
output
)
<<
"Output(Out) of CalibOp should not be null."
;
return
true
;
}
}
// namespace operators
}
// namespace lite
}
// namespace paddle
REGISTER_LITE_OP
(
calib
,
paddle
::
lite
::
operators
::
CalibOpLite
);
paddle/fluid/lite/operators/calib_op.h
0 → 100644
浏览文件 @
09a07a23
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include <vector>
#include "paddle/fluid/lite/core/compatible_tensor.h"
#include "paddle/fluid/lite/core/kernel.h"
#include "paddle/fluid/lite/core/op_lite.h"
#include "paddle/fluid/lite/core/scope.h"
#include "paddle/fluid/lite/operators/op_params.h"
#include "paddle/fluid/lite/utils/all.h"
namespace
paddle
{
namespace
lite
{
namespace
operators
{
class
CalibOpLite
:
public
OpLite
{
public:
CalibOpLite
()
{}
explicit
CalibOpLite
(
const
std
::
string
&
type
)
:
OpLite
(
type
)
{}
bool
CheckShape
()
const
override
;
bool
InferShape
()
const
override
;
/*
bool Run() override {
CHECK(kernel_);
kernel_->Run();
return true;
}
*/
bool
AttachImpl
(
const
cpp
::
OpDesc
&
opdesc
,
lite
::
Scope
*
scope
);
void
AttachKernel
(
KernelBase
*
kernel
)
override
{
kernel
->
SetParam
(
param_
);
}
std
::
string
DebugString
()
const
override
{
return
"calib"
;
}
private:
mutable
CalibParam
param_
;
};
}
// namespace operators
}
// namespace lite
}
// namespace paddle
paddle/fluid/lite/operators/calib_op_test.cc
0 → 100644
浏览文件 @
09a07a23
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/lite/operators/calib_op.h"
#include <gtest/gtest.h>
#include "paddle/fluid/lite/core/op_registry.h"
namespace
paddle
{
namespace
lite
{
namespace
operators
{
#ifdef LITE_WITH_ARM
TEST
(
calib_op_lite
,
TestARM
)
{
// prepare variables
Scope
scope
;
auto
*
x
=
scope
.
Var
(
"Input"
)
->
GetMutable
<
Tensor
>
();
auto
*
output
=
scope
.
Var
(
"output"
)
->
GetMutable
<
Tensor
>
();
x
->
Resize
(
DDim
(
std
::
vector
<
int64_t
>
({
1
,
10
,
20
})));
output
->
Resize
(
DDim
(
std
::
vector
<
int64_t
>
{
1
,
10
,
20
}));
// set data
for
(
int
i
=
0
;
i
<
10
*
20
;
i
++
)
{
x
->
mutable_data
<
float
>
()[
i
]
=
i
;
}
for
(
int
i
=
0
;
i
<
10
*
20
;
i
++
)
{
output
->
mutable_data
<
float
>
()[
i
]
=
0.
;
}
// prepare op desc
cpp
::
OpDesc
desc
;
desc
.
SetType
(
"calib"
);
desc
.
SetInput
(
"Input"
,
{
"Input"
});
desc
.
SetOutput
(
"Out"
,
{
"output"
});
desc
.
SetAttr
(
"in_dtype"
,
static_cast
<
int
>
(
PRECISION
(
kInt8
)));
desc
.
SetAttr
(
"out_dtype"
,
static_cast
<
int
>
(
PRECISION
(
kFloat
)));
desc
.
SetAttr
(
"in_scale"
,
10.0
f
);
CalibOpLite
calib
(
"calib"
);
calib
.
SetValidPlaces
({
Place
{
TARGET
(
kARM
),
PRECISION
(
kAny
)}});
calib
.
Attach
(
desc
,
&
scope
);
auto
kernels
=
calib
.
CreateKernels
({
Place
{
TARGET
(
kARM
),
PRECISION
(
kAny
)}});
ASSERT_FALSE
(
kernels
.
empty
());
}
#endif
}
// namespace operators
}
// namespace lite
}
// namespace paddle
#ifdef LITE_WITH_ARM
USE_LITE_KERNEL
(
calib
,
kARM
,
kAny
,
kAny
,
def
);
#endif
paddle/fluid/lite/operators/op_params.h
浏览文件 @
09a07a23
...
...
@@ -48,6 +48,14 @@ struct IoCopyParam {
lite
::
Tensor
*
y
{};
};
struct
CalibParam
{
const
lite
::
Tensor
*
input
{};
lite
::
Tensor
*
output
{};
float
in_scale
;
PrecisionType
in_dtype
;
PrecisionType
out_dtype
;
};
/// -------------------------- NN operators ------------------------------------
struct
FcParam
{
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录