Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
机器未来
Paddle
提交
9ae05da3
P
Paddle
项目概览
机器未来
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
9ae05da3
编写于
6月 14, 2019
作者:
T
tensor-tang
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
fix format
上级
1b1f9b07
变更
25
显示空白变更内容
内联
并排
Showing
25 changed file
with
516 addition
and
501 deletion
+516
-501
paddle/fluid/lite/CMakeLists.txt
paddle/fluid/lite/CMakeLists.txt
+1
-0
paddle/fluid/lite/api/CMakeLists.txt
paddle/fluid/lite/api/CMakeLists.txt
+1
-0
paddle/fluid/lite/arm/CMakeLists.txt
paddle/fluid/lite/arm/CMakeLists.txt
+1
-0
paddle/fluid/lite/arm/math/CMakeLists.txt
paddle/fluid/lite/arm/math/CMakeLists.txt
+1
-0
paddle/fluid/lite/arm/math/type_trans.cpp
paddle/fluid/lite/arm/math/type_trans.cpp
+492
-501
paddle/fluid/lite/core/CMakeLists.txt
paddle/fluid/lite/core/CMakeLists.txt
+1
-0
paddle/fluid/lite/core/mir/CMakeLists.txt
paddle/fluid/lite/core/mir/CMakeLists.txt
+1
-0
paddle/fluid/lite/core/profile/CMakeLists.txt
paddle/fluid/lite/core/profile/CMakeLists.txt
+1
-0
paddle/fluid/lite/cuda/CMakeLists.txt
paddle/fluid/lite/cuda/CMakeLists.txt
+1
-0
paddle/fluid/lite/gen_code/CMakeLists.txt
paddle/fluid/lite/gen_code/CMakeLists.txt
+1
-0
paddle/fluid/lite/host/CMakeLists.txt
paddle/fluid/lite/host/CMakeLists.txt
+1
-0
paddle/fluid/lite/kernels/CMakeLists.txt
paddle/fluid/lite/kernels/CMakeLists.txt
+1
-0
paddle/fluid/lite/kernels/arm/CMakeLists.txt
paddle/fluid/lite/kernels/arm/CMakeLists.txt
+1
-0
paddle/fluid/lite/kernels/cuda/CMakeLists.txt
paddle/fluid/lite/kernels/cuda/CMakeLists.txt
+1
-0
paddle/fluid/lite/kernels/host/CMakeLists.txt
paddle/fluid/lite/kernels/host/CMakeLists.txt
+1
-0
paddle/fluid/lite/kernels/x86/CMakeLists.txt
paddle/fluid/lite/kernels/x86/CMakeLists.txt
+1
-0
paddle/fluid/lite/model_parser/CMakeLists.txt
paddle/fluid/lite/model_parser/CMakeLists.txt
+1
-0
paddle/fluid/lite/model_parser/cpp/CMakeLists.txt
paddle/fluid/lite/model_parser/cpp/CMakeLists.txt
+1
-0
paddle/fluid/lite/model_parser/pb/CMakeLists.txt
paddle/fluid/lite/model_parser/pb/CMakeLists.txt
+1
-0
paddle/fluid/lite/operators/CMakeLists.txt
paddle/fluid/lite/operators/CMakeLists.txt
+1
-0
paddle/fluid/lite/tools/Dockerfile.mobile
paddle/fluid/lite/tools/Dockerfile.mobile
+1
-0
paddle/fluid/lite/tools/build.sh
paddle/fluid/lite/tools/build.sh
+1
-0
paddle/fluid/lite/tools/mobile_readme.md
paddle/fluid/lite/tools/mobile_readme.md
+1
-0
paddle/fluid/lite/utils/CMakeLists.txt
paddle/fluid/lite/utils/CMakeLists.txt
+1
-0
paddle/fluid/lite/x86/CMakeLists.txt
paddle/fluid/lite/x86/CMakeLists.txt
+1
-0
未找到文件。
paddle/fluid/lite/CMakeLists.txt
浏览文件 @
9ae05da3
...
...
@@ -172,3 +172,4 @@ add_subdirectory(model_parser)
add_subdirectory
(
utils
)
add_subdirectory
(
api
)
add_subdirectory
(
gen_code
)
paddle/fluid/lite/api/CMakeLists.txt
浏览文件 @
9ae05da3
...
...
@@ -54,3 +54,4 @@ lite_cc_binary(cxx_api_lite_bin SRCS cxx_api_bin.cc
mir_passes
${
ops_lite
}
${
host_kernels
}
ARM_DEPS
${
arm_kernels
}
)
paddle/fluid/lite/arm/CMakeLists.txt
浏览文件 @
9ae05da3
add_subdirectory
(
math
)
paddle/fluid/lite/arm/math/CMakeLists.txt
浏览文件 @
9ae05da3
...
...
@@ -34,3 +34,4 @@ cc_library(math_arm SRCS
split.cc
DEPS
${
lite_kernel_deps
}
eigen3
)
paddle/fluid/lite/arm/math/type_trans.cpp
浏览文件 @
9ae05da3
...
...
@@ -12,9 +12,10 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/lite/arm/math/
saturate
.h"
#include "paddle/fluid/lite/arm/math/
type_trans
.h"
#include <arm_neon.h>
#include <string.h>
#include "paddle/fluid/lite/arm/math/saturate.h"
namespace
paddle
{
namespace
lite
{
...
...
@@ -23,14 +24,13 @@ namespace math {
template
<
typename
dtype
>
void
int32_to_dtype
(
const
int
*
din
,
dtype
*
dout
,
const
float
*
scale
,
int
axis_size
,
long
long
outer_size
,
long
long
inner_size
);
int
axis_size
,
int64_t
outer_size
,
int64_t
inner_size
);
void
fp32_to_int8
(
const
float
*
din
,
signed
char
*
dout
,
const
float
*
scale
,
int
axis_size
,
long
long
outer_size
,
long
long
inner_size
)
{
int
axis_size
,
int64_t
outer_size
,
int64_t
inner_size
)
{
int
cnt
=
inner_size
/
16
;
int
remain
=
inner_size
&
15
;
long
long
loop_size
=
outer_size
*
axis_size
;
int64_t
loop_size
=
outer_size
*
axis_size
;
#pragma omp parallel for
for
(
int
j
=
0
;
j
<
loop_size
;
++
j
)
{
...
...
@@ -69,10 +69,10 @@ void fp32_to_int8(const float* din, signed char* dout, const float* scale,
"sqxtn2 v8.16b, v5.8h
\n
"
"str q8, [%[out]], #16
\n
"
"bne 0b
\n
"
:
[
in
]
"+r"
(
din_ptr
),
[
out
]
"+r"
(
dout_ptr
),
[
cnt
]
"+r"
(
cnt_loop
)
:
[
scale
]
"w"
(
vscale
)
:
"v0"
,
"v1"
,
"v2"
,
"v3"
,
"v4"
,
"v5"
,
"v6"
,
"v7"
,
"v8"
,
"v9"
,
"v10"
,
"v11"
);
:
[
in
]
"+r"
(
din_ptr
),
[
out
]
"+r"
(
dout_ptr
),
[
cnt
]
"+r"
(
cnt_loop
)
:
[
scale
]
"w"
(
vscale
)
:
"v0"
,
"v1"
,
"v2"
,
"v3"
,
"v4"
,
"v5"
,
"v6"
,
"v7"
,
"v8"
,
"v9"
,
"v10"
,
"v11"
);
#else
asm
volatile
(
"vld1.32 {d0-d3}, [%[din]]! @ load in0~in7
\n
"
...
...
@@ -110,10 +110,11 @@ void fp32_to_int8(const float* din, signed char* dout, const float* scale,
"subs %[cnt], #1 @ loop count -1
\n
"
"bne 0b @ to main loop
\n
"
:
[
dout
]
"+r"
(
dout_ptr
),
[
din
]
"+r"
(
din_ptr
),
[
cnt
]
"+r"
(
cnt_loop
)
:
[
vscale
]
"w"
(
vscale
),
[
vpoff
]
"w"
(
vpoff
),
[
vnoff
]
"w"
(
vnoff
),
[
vzero
]
"w"
(
vzero
)
:
"q0"
,
"q1"
,
"q2"
,
"q3"
,
"q4"
,
"q5"
,
"q6"
,
"q7"
,
"q8"
,
"q9"
,
"q10"
,
"q11"
);
:
[
dout
]
"+r"
(
dout_ptr
),
[
din
]
"+r"
(
din_ptr
),
[
cnt
]
"+r"
(
cnt_loop
)
:
[
vscale
]
"w"
(
vscale
),
[
vpoff
]
"w"
(
vpoff
),
[
vnoff
]
"w"
(
vnoff
),
[
vzero
]
"w"
(
vzero
)
:
"q0"
,
"q1"
,
"q2"
,
"q3"
,
"q4"
,
"q5"
,
"q6"
,
"q7"
,
"q8"
,
"q9"
,
"q10"
,
"q11"
);
#endif
}
const
float
*
din_r
=
din_c
+
16
*
cnt
;
...
...
@@ -125,11 +126,10 @@ void fp32_to_int8(const float* din, signed char* dout, const float* scale,
}
void
fp32_to_int16
(
const
float
*
din
,
int16_t
*
dout
,
const
float
*
scale
,
int
axis_size
,
long
long
outer_size
,
long
long
inner_size
)
{
int
axis_size
,
int64_t
outer_size
,
int64_t
inner_size
)
{
int
cnt
=
inner_size
/
8
;
int
remain
=
inner_size
&
7
;
long
long
loop_size
=
outer_size
*
axis_size
;
int64_t
loop_size
=
outer_size
*
axis_size
;
#pragma omp parallel for
for
(
int
j
=
0
;
j
<
loop_size
;
++
j
)
{
...
...
@@ -158,10 +158,9 @@ void fp32_to_int16(const float* din, int16_t* dout, const float* scale,
"sqxtn2 v4.8h, v9.4s
\n
"
"str q4, [%[out]], #16
\n
"
"bne 0b
\n
"
:
[
in
]
"+r"
(
din_ptr
),
[
out
]
"+r"
(
dout_ptr
),
[
cnt
]
"+r"
(
cnt_loop
)
:
[
scale
]
"w"
(
vscale
)
:
"v0"
,
"v1"
,
"v4"
,
"v5"
,
"v8"
,
"v9"
);
:
[
in
]
"+r"
(
din_ptr
),
[
out
]
"+r"
(
dout_ptr
),
[
cnt
]
"+r"
(
cnt_loop
)
:
[
scale
]
"w"
(
vscale
)
:
"v0"
,
"v1"
,
"v4"
,
"v5"
,
"v8"
,
"v9"
);
#else
asm
volatile
(
"vld1.32 {d0-d3}, [%[din]]! @ load in0~in7
\n
"
...
...
@@ -185,10 +184,10 @@ void fp32_to_int16(const float* din, int16_t* dout, const float* scale,
"subs %[cnt], #1 @ loop count -1
\n
"
"bne 0b @ to main loop
\n
"
:
[
dout
]
"+r"
(
dout_ptr
),
[
din
]
"+r"
(
din_ptr
),
[
cnt
]
"+r"
(
cnt_loop
)
:
[
vscale
]
"w"
(
vscale
),
[
vpoff
]
"w"
(
vpoff
),
[
vnoff
]
"w"
(
vnoff
),
[
vzero
]
"w"
(
vzero
)
:
"q0"
,
"q1"
,
"q4"
,
"q5"
,
"q6"
,
"q7"
,
"q8"
,
"q9"
);
:
[
dout
]
"+r"
(
dout_ptr
),
[
din
]
"+r"
(
din_ptr
),
[
cnt
]
"+r"
(
cnt_loop
)
:
[
vscale
]
"w"
(
vscale
),
[
vpoff
]
"w"
(
vpoff
),
[
vnoff
]
"w"
(
vnoff
),
[
vzero
]
"w"
(
vzero
)
:
"q0"
,
"q1"
,
"q4"
,
"q5"
,
"q6"
,
"q7"
,
"q8"
,
"q9"
);
#endif
}
const
float
*
din_r
=
din_c
+
8
*
cnt
;
...
...
@@ -200,13 +199,12 @@ void fp32_to_int16(const float* din, int16_t* dout, const float* scale,
}
void
int8_to_fp32
(
const
signed
char
*
in
,
float
*
out
,
const
float
*
scale
,
int
axis_size
,
long
long
outer_size
,
long
long
inner_size
)
{
int
axis_size
,
int64_t
outer_size
,
int64_t
inner_size
)
{
int
cnt
=
inner_size
/
16
;
int
remain
=
inner_size
&
15
;
long
long
loop_size
=
axis_size
*
outer_size
;
int64_t
loop_size
=
axis_size
*
outer_size
;
#pragma omp parallel for
for
(
long
long
n
=
0
;
n
<
loop_size
;
++
n
)
{
for
(
int64_t
n
=
0
;
n
<
loop_size
;
++
n
)
{
float
in_scale
=
scale
[
n
%
axis_size
];
const
signed
char
*
din_c
=
in
+
n
*
inner_size
;
float
*
dout_c
=
out
+
n
*
inner_size
;
...
...
@@ -245,10 +243,10 @@ void int8_to_fp32(const signed char* in, float* out, const float* scale,
"stp q6, q7, [%[out]], #32
\n
"
/* write to memory*/
"bne 0b
\n
"
:
[
loop
]
"+r"
(
loop
),
[
in
]
"+r"
(
din_ptr
),
[
out
]
"+r"
(
dout_ptr
)
:
[
scale
]
"w"
(
vscale
)
:
"v0"
,
"v1"
,
"v2"
,
"v3"
,
"v4"
,
"v5"
,
"v6"
,
"v7"
,
"v8"
,
"v9"
,
"v10"
,
"v11"
);
:
[
loop
]
"+r"
(
loop
),
[
in
]
"+r"
(
din_ptr
),
[
out
]
"+r"
(
dout_ptr
)
:
[
scale
]
"w"
(
vscale
)
:
"v0"
,
"v1"
,
"v2"
,
"v3"
,
"v4"
,
"v5"
,
"v6"
,
"v7"
,
"v8"
,
"v9"
,
"v10"
,
"v11"
);
#else
asm
volatile
(
"vld1.32 {d0-d1}, [%[in]]! @ load 16 int8
\n
"
...
...
@@ -276,11 +274,10 @@ void int8_to_fp32(const signed char* in, float* out, const float* scale,
"vst1.f32 {d12-d15}, [%[out]]! @ write to memory
\n
"
"bne 0b
\n
"
:
[
loop
]
"+r"
(
loop
),
[
in
]
"+r"
(
din_ptr
),
[
out
]
"+r"
(
dout_ptr
)
:
[
scale
]
"w"
(
vscale
)
:
"q0"
,
"q1"
,
"q2"
,
"q3"
,
"q4"
,
"q5"
,
"q6"
,
"q7"
);
#endif //__aarch64__
:
[
loop
]
"+r"
(
loop
),
[
in
]
"+r"
(
din_ptr
),
[
out
]
"+r"
(
dout_ptr
)
:
[
scale
]
"w"
(
vscale
)
:
"q0"
,
"q1"
,
"q2"
,
"q3"
,
"q4"
,
"q5"
,
"q6"
,
"q7"
);
#endif // __aarch64__
}
const
signed
char
*
din_r
=
din_c
+
16
*
cnt
;
float
*
dout_r
=
dout_c
+
16
*
cnt
;
...
...
@@ -290,21 +287,20 @@ void int8_to_fp32(const signed char* in, float* out, const float* scale,
}
}
void
int16_to_fp32
(
const
short
*
in
,
float
*
out
,
const
float
*
scale
,
int
axis_size
,
long
long
outer_size
,
long
long
inner_size
)
{
void
int16_to_fp32
(
const
int16_t
*
in
,
float
*
out
,
const
float
*
scale
,
int
axis_size
,
int64_t
outer_size
,
int64_t
inner_size
)
{
int
cnt
=
inner_size
/
16
;
int
remain
=
inner_size
&
15
;
long
long
loop_size
=
axis_size
*
outer_size
;
int64_t
loop_size
=
axis_size
*
outer_size
;
#pragma omp parallel for
for
(
long
long
n
=
0
;
n
<
loop_size
;
++
n
)
{
for
(
int64_t
n
=
0
;
n
<
loop_size
;
++
n
)
{
float
in_scale
=
scale
[
n
%
axis_size
];
const
shor
t
*
din_c
=
in
+
n
*
inner_size
;
const
int16_
t
*
din_c
=
in
+
n
*
inner_size
;
float
*
dout_c
=
out
+
n
*
inner_size
;
float32x4_t
vscale
=
vdupq_n_f32
(
in_scale
);
if
(
cnt
>
0
)
{
int
loop
=
cnt
;
const
shor
t
*
din_ptr
=
din_c
;
const
int16_
t
*
din_ptr
=
din_c
;
float
*
dout_ptr
=
dout_c
;
#ifdef __aarch64__
asm
volatile
(
...
...
@@ -333,10 +329,9 @@ void int16_to_fp32(const short* in, float* out, const float* scale,
"stp q6, q7, [%[out]], #32
\n
"
/* write to memory*/
"bne 0b
\n
"
:
[
loop
]
"+r"
(
loop
),
[
in
]
"+r"
(
din_ptr
),
[
out
]
"+r"
(
dout_ptr
)
:
[
scale
]
"w"
(
vscale
)
:
"v0"
,
"v1"
,
"v4"
,
"v5"
,
"v6"
,
"v7"
,
"v8"
,
"v9"
,
"v10"
,
"v11"
);
:
[
loop
]
"+r"
(
loop
),
[
in
]
"+r"
(
din_ptr
),
[
out
]
"+r"
(
dout_ptr
)
:
[
scale
]
"w"
(
vscale
)
:
"v0"
,
"v1"
,
"v4"
,
"v5"
,
"v6"
,
"v7"
,
"v8"
,
"v9"
,
"v10"
,
"v11"
);
#else
asm
volatile
(
"vld1.32 {d0-d3}, [%[in]]! @ load 16 int16
\n
"
...
...
@@ -362,13 +357,12 @@ void int16_to_fp32(const short* in, float* out, const float* scale,
"vst1.f32 {d12-d15}, [%[out]]! @ write to memory
\n
"
"bne 0b
\n
"
:
[
loop
]
"+r"
(
loop
),
[
in
]
"+r"
(
din_ptr
),
[
out
]
"+r"
(
dout_ptr
)
:
[
scale
]
"w"
(
vscale
)
:
"q0"
,
"q1"
,
"q2"
,
"q3"
,
"q4"
,
"q5"
,
"q6"
,
"q7"
);
#endif //__aarch64__
:
[
loop
]
"+r"
(
loop
),
[
in
]
"+r"
(
din_ptr
),
[
out
]
"+r"
(
dout_ptr
)
:
[
scale
]
"w"
(
vscale
)
:
"q0"
,
"q1"
,
"q2"
,
"q3"
,
"q4"
,
"q5"
,
"q6"
,
"q7"
);
#endif // __aarch64__
}
const
shor
t
*
din_r
=
din_c
+
16
*
cnt
;
const
int16_
t
*
din_r
=
din_c
+
16
*
cnt
;
float
*
dout_r
=
dout_c
+
16
*
cnt
;
for
(
int
i
=
0
;
i
<
remain
;
++
i
)
{
dout_r
[
i
]
=
in_scale
*
din_r
[
i
];
...
...
@@ -377,12 +371,12 @@ void int16_to_fp32(const short* in, float* out, const float* scale,
}
void
int32_to_fp32
(
const
int
*
din
,
float
*
dout
,
const
float
*
scale
,
int
axis_size
,
long
long
outer_size
,
long
long
inner_size
)
{
int
axis_size
,
int64_t
outer_size
,
int64_t
inner_size
)
{
int
cnt
=
inner_size
/
16
;
int
remain
=
inner_size
&
15
;
long
long
loop_size
=
axis_size
*
outer_size
;
int64_t
loop_size
=
axis_size
*
outer_size
;
#pragma omp parallel for
for
(
long
long
n
=
0
;
n
<
loop_size
;
++
n
)
{
for
(
int64_t
n
=
0
;
n
<
loop_size
;
++
n
)
{
float
in_scale
=
scale
[
n
%
axis_size
];
const
int
*
din_c
=
din
+
n
*
inner_size
;
float
*
dout_c
=
dout
+
n
*
inner_size
;
...
...
@@ -410,10 +404,10 @@ void int32_to_fp32(const int* din, float* dout, const float* scale,
"stp q10, q11, [%[out]], #32
\n
"
"subs %[loop], %[loop], #1
\n
"
"bne 0b
\n
"
:
[
loop
]
"+r"
(
loop
),
[
in
]
"+r"
(
din_ptr
),
[
out
]
"+r"
(
dout_ptr
)
:
[
scale
]
"w"
(
vscale
)
:
"v0"
,
"v1"
,
"v2"
,
"v3"
,
"v4"
,
"v5"
,
"v6"
,
"v7"
,
"v8"
,
"v9"
,
"v10"
,
"v11"
);
:
[
loop
]
"+r"
(
loop
),
[
in
]
"+r"
(
din_ptr
),
[
out
]
"+r"
(
dout_ptr
)
:
[
scale
]
"w"
(
vscale
)
:
"v0"
,
"v1"
,
"v2"
,
"v3"
,
"v4"
,
"v5"
,
"v6"
,
"v7"
,
"v8"
,
"v9"
,
"v10"
,
"v11"
);
#else
asm
volatile
(
"vld1.s32 {d0-d3}, [%[in]]!
\n
"
...
...
@@ -433,11 +427,11 @@ void int32_to_fp32(const int* din, float* dout, const float* scale,
"vst1.f32 {d16-d19}, [%[out]]!
\n
"
"vst1.f32 {d20-d23}, [%[out]]!
\n
"
"bne 0b
\n
"
:
[
loop
]
"+r"
(
loop
),
[
in
]
"+r"
(
din_ptr
),
[
out
]
"+r"
(
dout_ptr
)
:
[
scale
]
"w"
(
vscale
)
:
"q0"
,
"q1"
,
"q2"
,
"q3"
,
"q4"
,
"q5"
,
"q6"
,
"q7"
,
"q8"
,
"q9"
,
"q10"
,
"q11"
);
#endif
//
__aarch64__
:
[
loop
]
"+r"
(
loop
),
[
in
]
"+r"
(
din_ptr
),
[
out
]
"+r"
(
dout_ptr
)
:
[
scale
]
"w"
(
vscale
)
:
"q0"
,
"q1"
,
"q2"
,
"q3"
,
"q4"
,
"q5"
,
"q6"
,
"q7"
,
"q8"
,
"q9"
,
"q10"
,
"q11"
);
#endif
//
__aarch64__
}
const
int
*
din_r
=
din_c
+
16
*
cnt
;
float
*
dout_r
=
dout_c
+
16
*
cnt
;
...
...
@@ -447,13 +441,13 @@ void int32_to_fp32(const int* din, float* dout, const float* scale,
}
}
void
int32_to_int8
(
const
int
*
din
,
signed
char
*
dout
,
const
float
*
scale
,
\
int
axis_size
,
long
long
outer_size
,
long
long
inner_size
)
{
void
int32_to_int8
(
const
int
*
din
,
signed
char
*
dout
,
const
float
*
scale
,
int
axis_size
,
int64_t
outer_size
,
int64_t
inner_size
)
{
int
cnt
=
inner_size
/
16
;
int
remain
=
inner_size
&
15
;
long
long
loop_size
=
outer_size
*
axis_size
;
int64_t
loop_size
=
outer_size
*
axis_size
;
#pragma omp parallel for
for
(
long
long
n
=
0
;
n
<
loop_size
;
++
n
)
{
for
(
int64_t
n
=
0
;
n
<
loop_size
;
++
n
)
{
float
in_scale
=
scale
[
n
%
axis_size
];
const
int
*
din_c
=
din
+
n
*
inner_size
;
signed
char
*
dout_c
=
dout
+
n
*
inner_size
;
...
...
@@ -497,10 +491,9 @@ void int32_to_int8(const int* din, signed char* dout, const float* scale, \
"st1 {v2.16b}, [%[out]], #16
\n
"
"subs %[loop], %[loop], #1
\n
"
"bne 0b
\n
"
:
[
loop
]
"+r"
(
loop
),
[
in
]
"+r"
(
din_ptr
),
[
out
]
"+r"
(
dout_ptr
)
:
[
scale
]
"w"
(
vscale
)
:
"v0"
,
"v1"
,
"v2"
,
"v3"
,
"v4"
,
"v5"
,
"v6"
,
"v7"
);
:
[
loop
]
"+r"
(
loop
),
[
in
]
"+r"
(
din_ptr
),
[
out
]
"+r"
(
dout_ptr
)
:
[
scale
]
"w"
(
vscale
)
:
"v0"
,
"v1"
,
"v2"
,
"v3"
,
"v4"
,
"v5"
,
"v6"
,
"v7"
);
#else
asm
volatile
(
"vld1.32 {d0-d3}, [%[din]]! @ load in0~in7
\n
"
...
...
@@ -541,11 +534,12 @@ void int32_to_int8(const int* din, signed char* dout, const float* scale, \
"vst1.32 {d8-d9}, [%[dout]]! @ write to output
\n
"
"subs %[loop], #1 @ loop count -1
\n
"
"bne 0b @ to main loop
\n
"
:
[
loop
]
"+r"
(
loop
),
[
din
]
"+r"
(
din_ptr
),
[
dout
]
"+r"
(
dout_ptr
)
:
[
vscale
]
"w"
(
vscale
),
[
vzero
]
"w"
(
vzero
),
[
vnoff
]
"w"
(
vnoff
),
[
vpoff
]
"w"
(
vpoff
)
:
"q0"
,
"q1"
,
"q2"
,
"q3"
,
"q4"
,
"q5"
,
"q6"
,
"q7"
,
"q8"
,
"q9"
,
"q10"
,
"q11"
);
#endif //__aarch64__
:
[
loop
]
"+r"
(
loop
),
[
din
]
"+r"
(
din_ptr
),
[
dout
]
"+r"
(
dout_ptr
)
:
[
vscale
]
"w"
(
vscale
),
[
vzero
]
"w"
(
vzero
),
[
vnoff
]
"w"
(
vnoff
),
[
vpoff
]
"w"
(
vpoff
)
:
"q0"
,
"q1"
,
"q2"
,
"q3"
,
"q4"
,
"q5"
,
"q6"
,
"q7"
,
"q8"
,
"q9"
,
"q10"
,
"q11"
);
#endif // __aarch64__
}
const
int
*
din_r
=
din_c
+
16
*
cnt
;
int8_t
*
dout_r
=
dout_c
+
16
*
cnt
;
...
...
@@ -555,30 +549,27 @@ void int32_to_int8(const int* din, signed char* dout, const float* scale, \
}
}
void
int32_to_int32
(
const
int
*
din
,
int
*
dout
,
const
float
*
scale
,
\
int
axis_size
,
long
long
outer_size
,
long
long
inner_size
)
{
void
int32_to_int32
(
const
int
*
din
,
int
*
dout
,
const
float
*
scale
,
int
axis_size
,
int64_t
outer_size
,
int64_t
inner_size
)
{
int
size_all
=
outer_size
*
axis_size
*
inner_size
;
memmove
(
dout
,
din
,
size_all
*
sizeof
(
int
));
memmove
(
dout
,
din
,
size_all
*
sizeof
(
int
));
}
template
<
>
void
int32_to_dtype
(
const
int
*
din
,
float
*
dout
,
const
float
*
scale
,
int
axis_size
,
long
long
outer_size
,
long
long
inner_size
)
{
int
axis_size
,
int64_t
outer_size
,
int64_t
inner_size
)
{
return
int32_to_fp32
(
din
,
dout
,
scale
,
axis_size
,
outer_size
,
inner_size
);
}
template
<
>
void
int32_to_dtype
(
const
int
*
din
,
signed
char
*
dout
,
const
float
*
scale
,
int
axis_size
,
long
long
outer_size
,
long
long
inner_size
)
{
int
axis_size
,
int64_t
outer_size
,
int64_t
inner_size
)
{
return
int32_to_int8
(
din
,
dout
,
scale
,
axis_size
,
outer_size
,
inner_size
);
}
template
<
>
void
int32_to_dtype
(
const
int
*
din
,
int
*
dout
,
const
float
*
scale
,
int
axis_size
,
long
long
outer_size
,
long
long
inner_size
)
{
int
axis_size
,
int64_t
outer_size
,
int64_t
inner_size
)
{
return
int32_to_int32
(
din
,
dout
,
scale
,
axis_size
,
outer_size
,
inner_size
);
}
...
...
paddle/fluid/lite/core/CMakeLists.txt
浏览文件 @
9ae05da3
...
...
@@ -57,3 +57,4 @@ lite_cc_test(test_type_system SRCS type_system_test.cc DEPS type_system utils_li
lite_cc_test
(
test_types_lite SRCS types_test.cc DEPS types_lite
)
lite_cc_test
(
test_memory_lite SRCS memory_test.cc DEPS memory_lite
)
lite_cc_test
(
test_context_lite SRCS context_test.cc DEPS context_lite X86_DEPS operator
)
paddle/fluid/lite/core/mir/CMakeLists.txt
浏览文件 @
9ae05da3
...
...
@@ -59,3 +59,4 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK)
pattern_matcher_high_api proto_desc mir_pass_manager fc_op_lite mul_op_lite elementwise_ops_lite
mir_passes compatible_pb_lite program_lite
${
ops_lite
}
)
endif
()
paddle/fluid/lite/core/profile/CMakeLists.txt
浏览文件 @
9ae05da3
...
...
@@ -4,3 +4,4 @@ endif()
lite_cc_library
(
basic_profiler_lite SRCS basic_profiler.cc
)
lite_cc_test
(
test_basic_profiler SRCS basic_profiler_test.cc DEPS basic_profiler_lite
)
paddle/fluid/lite/cuda/CMakeLists.txt
浏览文件 @
9ae05da3
...
...
@@ -4,3 +4,4 @@ endif()
nv_library
(
target_wrapper_cuda SRCS target_wrapper.cc
)
nv_library
(
cuda_blas_lite SRCS blas.cc
)
paddle/fluid/lite/gen_code/CMakeLists.txt
浏览文件 @
9ae05da3
...
...
@@ -25,3 +25,4 @@ if (NOT LITE_WITH_LIGHT_WEIGHT_FRAMEWORK)
add_dependencies
(
__generated_code__ test_gen_code_lite
)
endif
()
paddle/fluid/lite/host/CMakeLists.txt
浏览文件 @
9ae05da3
cc_library
(
target_wrapper_host SRCS target_wrapper.cc
)
paddle/fluid/lite/kernels/CMakeLists.txt
浏览文件 @
9ae05da3
...
...
@@ -5,3 +5,4 @@ add_subdirectory(arm)
add_subdirectory
(
cuda
)
add_subdirectory
(
x86
)
paddle/fluid/lite/kernels/arm/CMakeLists.txt
浏览文件 @
9ae05da3
...
...
@@ -40,3 +40,4 @@ set(arm_kernels
set
(
arm_kernels
"
${
arm_kernels
}
"
CACHE INTERNAL
"arm kernels"
)
paddle/fluid/lite/kernels/cuda/CMakeLists.txt
浏览文件 @
9ae05da3
...
...
@@ -9,3 +9,4 @@ cc_library(io_copy_compute_cuda SRCS io_copy_compute.cc DEPS ${tensor_lite})
nv_library
(
kernels_cuda DEPS mul_compute_cuda io_copy_compute_cuda cuda_blas_lite
)
paddle/fluid/lite/kernels/host/CMakeLists.txt
浏览文件 @
9ae05da3
...
...
@@ -13,3 +13,4 @@ set(host_kernels
)
set
(
host_kernels
"
${
host_kernels
}
"
CACHE GLOBAL
"host kernels"
)
paddle/fluid/lite/kernels/x86/CMakeLists.txt
浏览文件 @
9ae05da3
...
...
@@ -35,3 +35,4 @@ set(x86_kernels
)
set
(
x86_kernels
"
${
x86_kernels
}
"
CACHE INTERNAL
"x86 kernels"
)
paddle/fluid/lite/model_parser/CMakeLists.txt
浏览文件 @
9ae05da3
...
...
@@ -27,3 +27,4 @@ lite_cc_test(test_op_desc_lite SRCS op_desc_test.cc DEPS cpp_op_desc_lite op_des
add_subdirectory
(
pb
)
add_subdirectory
(
cpp
)
paddle/fluid/lite/model_parser/cpp/CMakeLists.txt
浏览文件 @
9ae05da3
cc_library
(
cpp_op_desc_lite SRCS op_desc.cc DEPS any_lite
)
paddle/fluid/lite/model_parser/pb/CMakeLists.txt
浏览文件 @
9ae05da3
cc_library
(
var_desc_lite SRCS var_desc.cc DEPS framework_proto_lite
)
cc_library
(
op_desc_lite SRCS op_desc.cc DEPS framework_proto_lite
)
paddle/fluid/lite/operators/CMakeLists.txt
浏览文件 @
9ae05da3
...
...
@@ -56,3 +56,4 @@ lite_cc_test(test_softmax_op_lite SRCS softmax_op_test.cc DEPS softmax_op_lite m
lite_cc_test
(
test_reshape_op_lite SRCS reshape_op_test.cc DEPS reshape_op_lite memory_lite
)
lite_cc_test
(
test_batch_norm_op_lite SRCS batch_norm_op_test.cc DEPS batch_norm_op_lite memory_lite
)
lite_cc_test
(
test_concat_op_lite SRCS concat_op_test.cc DEPS concat_op_lite memory_lite
)
paddle/fluid/lite/tools/Dockerfile.mobile
浏览文件 @
9ae05da3
...
...
@@ -88,3 +88,4 @@ RUN pip install -i https://pypi.tuna.tsinghua.edu.cn/simple wheel
RUN pip install -i https://pypi.tuna.tsinghua.edu.cn/simple pre-commit
RUN apt-get autoremove -y && apt-get clean
RUN rm -rf /sdk-tools-linux-4333796.zip /tmp/android-ndk-r17c-linux-x86_64.zip /cmake-3.10.3-Linux-x86_64.tar.gz
\ No newline at end of file
paddle/fluid/lite/tools/build.sh
浏览文件 @
9ae05da3
...
...
@@ -283,3 +283,4 @@ function main {
}
main
$@
paddle/fluid/lite/tools/mobile_readme.md
浏览文件 @
9ae05da3
...
...
@@ -124,3 +124,4 @@ $ adb devices
List of devices attached
5cb00b6 device
```
paddle/fluid/lite/utils/CMakeLists.txt
浏览文件 @
9ae05da3
...
...
@@ -9,3 +9,4 @@ set(utils_DEPS glog)
lite_cc_test
(
test_varient SRCS varient_test.cc DEPS utils_lite
)
cc_library
(
any_lite SRCS any.cc
)
cc_library
(
utils_lite SRCS cp_logging.cc string.cc DEPS
${
utils_DEPS
}
any_lite
)
paddle/fluid/lite/x86/CMakeLists.txt
浏览文件 @
9ae05da3
...
...
@@ -4,3 +4,4 @@ endif()
cc_library
(
target_wrapper_x86 SRCS target_wrapper.cc
)
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录