Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
BaiXuePrincess
Paddle
提交
3360e9cd
P
Paddle
项目概览
BaiXuePrincess
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
3360e9cd
编写于
9月 07, 2017
作者:
L
Liu Yiqun
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Change the definition of vmlaq_laneq_f32 from template function to macro.
上级
a98c9e6b
变更
6
隐藏空白更改
内联
并排
Showing
6 changed file
with
59 addition
and
62 deletion
+59
-62
.travis.yml
.travis.yml
+1
-1
Dockerfile.android
Dockerfile.android
+2
-2
paddle/function/GruFunctor.h
paddle/function/GruFunctor.h
+0
-1
paddle/function/neon/NeonDepthwiseConv.cpp
paddle/function/neon/NeonDepthwiseConv.cpp
+50
-50
paddle/function/neon/neon_util.h
paddle/function/neon/neon_util.h
+2
-6
paddle/scripts/docker/build_android.sh
paddle/scripts/docker/build_android.sh
+4
-2
未找到文件。
.travis.yml
浏览文件 @
3360e9cd
...
...
@@ -4,7 +4,7 @@ cache:
-
$HOME/.ccache
-
$HOME/.cache/pip
-
$TRAVIS_BUILD_DIR/build/third_party
-
$TRAVIS_BUILD_DIR/build
/third_party_android
-
$TRAVIS_BUILD_DIR/build
_android/third_party
sudo
:
required
dist
:
trusty
os
:
...
...
Dockerfile.android
浏览文件 @
3360e9cd
...
...
@@ -11,8 +11,8 @@ ENV ANDROID_ABI=${ANDROID_ABI:-"armeabi-v7a"}
ENV HOME=/root \
ANDROID_NDK_HOME=/opt/android-ndk-linux \
ANDROID_ARM_STANDALONE_TOOLCHAIN=/opt/arm-toolchain
-gcc
\
ANDROID_ARM64_STANDALONE_TOOLCHAIN=/opt/arm64-toolchain
-gcc
ANDROID_ARM_STANDALONE_TOOLCHAIN=/opt/arm-toolchain \
ANDROID_ARM64_STANDALONE_TOOLCHAIN=/opt/arm64-toolchain
RUN apt-get update && \
apt-get install -y \
...
...
paddle/function/GruFunctor.h
浏览文件 @
3360e9cd
...
...
@@ -15,7 +15,6 @@ limitations under the License. */
#pragma once
#include "GemmFunctor.h"
#include "GruFunctor.h"
#include "hl_cpu_gru.cuh"
namespace
paddle
{
...
...
paddle/function/neon/NeonDepthwiseConv.cpp
浏览文件 @
3360e9cd
...
...
@@ -116,15 +116,15 @@ struct DepthwiseConvKernel<3, 1> {
float32x4_t
tmp1
=
vdupq_n_f32
(
0.
f
);
float32x4_t
tmp2
=
vdupq_n_f32
(
0.
f
);
tmp1
=
vmlaq_laneq_f32
<
0
>
(
tmp1
,
input
[
0
][
0
],
k
[
0
]
);
tmp2
=
vmlaq_laneq_f32
<
1
>
(
tmp2
,
input
[
0
][
1
],
k
[
0
]
);
tmp1
=
vmlaq_laneq_f32
<
2
>
(
tmp1
,
input
[
0
][
2
],
k
[
0
]
);
tmp2
=
vmlaq_laneq_f32
<
0
>
(
tmp2
,
input
[
1
][
0
],
k
[
1
]
);
tmp1
=
vmlaq_laneq_f32
<
1
>
(
tmp1
,
input
[
1
][
1
],
k
[
1
]
);
tmp2
=
vmlaq_laneq_f32
<
2
>
(
tmp2
,
input
[
1
][
2
],
k
[
1
]
);
tmp1
=
vmlaq_laneq_f32
<
0
>
(
tmp1
,
input
[
2
][
0
],
k
[
2
]
);
tmp2
=
vmlaq_laneq_f32
<
1
>
(
tmp2
,
input
[
2
][
1
],
k
[
2
]
);
tmp1
=
vmlaq_laneq_f32
<
2
>
(
tmp1
,
input
[
2
][
2
],
k
[
2
]
);
tmp1
=
vmlaq_laneq_f32
(
tmp1
,
input
[
0
][
0
],
k
[
0
],
0
);
tmp2
=
vmlaq_laneq_f32
(
tmp2
,
input
[
0
][
1
],
k
[
0
],
1
);
tmp1
=
vmlaq_laneq_f32
(
tmp1
,
input
[
0
][
2
],
k
[
0
],
2
);
tmp2
=
vmlaq_laneq_f32
(
tmp2
,
input
[
1
][
0
],
k
[
1
],
0
);
tmp1
=
vmlaq_laneq_f32
(
tmp1
,
input
[
1
][
1
],
k
[
1
],
1
);
tmp2
=
vmlaq_laneq_f32
(
tmp2
,
input
[
1
][
2
],
k
[
1
],
2
);
tmp1
=
vmlaq_laneq_f32
(
tmp1
,
input
[
2
][
0
],
k
[
2
],
0
);
tmp2
=
vmlaq_laneq_f32
(
tmp2
,
input
[
2
][
1
],
k
[
2
],
1
);
tmp1
=
vmlaq_laneq_f32
(
tmp1
,
input
[
2
][
2
],
k
[
2
],
2
);
tmp1
=
vaddq_f32
(
tmp1
,
tmp2
);
vst1q_f32
(
outputData
,
tmp1
);
...
...
@@ -223,15 +223,15 @@ struct DepthwiseConvKernel<3, 2> {
float32x4_t
tmp1
=
vdupq_n_f32
(
0.
f
);
float32x4_t
tmp2
=
vdupq_n_f32
(
0.
f
);
tmp1
=
vmlaq_laneq_f32
<
0
>
(
tmp1
,
input
[
0
][
0
],
k
[
0
]
);
tmp2
=
vmlaq_laneq_f32
<
1
>
(
tmp2
,
input
[
0
][
1
],
k
[
0
]
);
tmp1
=
vmlaq_laneq_f32
<
2
>
(
tmp1
,
input
[
0
][
2
],
k
[
0
]
);
tmp2
=
vmlaq_laneq_f32
<
0
>
(
tmp2
,
input
[
1
][
0
],
k
[
1
]
);
tmp1
=
vmlaq_laneq_f32
<
1
>
(
tmp1
,
input
[
1
][
1
],
k
[
1
]
);
tmp2
=
vmlaq_laneq_f32
<
2
>
(
tmp2
,
input
[
1
][
2
],
k
[
1
]
);
tmp1
=
vmlaq_laneq_f32
<
0
>
(
tmp1
,
input
[
2
][
0
],
k
[
2
]
);
tmp2
=
vmlaq_laneq_f32
<
1
>
(
tmp2
,
input
[
2
][
1
],
k
[
2
]
);
tmp1
=
vmlaq_laneq_f32
<
2
>
(
tmp1
,
input
[
2
][
2
],
k
[
2
]
);
tmp1
=
vmlaq_laneq_f32
(
tmp1
,
input
[
0
][
0
],
k
[
0
],
0
);
tmp2
=
vmlaq_laneq_f32
(
tmp2
,
input
[
0
][
1
],
k
[
0
],
1
);
tmp1
=
vmlaq_laneq_f32
(
tmp1
,
input
[
0
][
2
],
k
[
0
],
2
);
tmp2
=
vmlaq_laneq_f32
(
tmp2
,
input
[
1
][
0
],
k
[
1
],
0
);
tmp1
=
vmlaq_laneq_f32
(
tmp1
,
input
[
1
][
1
],
k
[
1
],
1
);
tmp2
=
vmlaq_laneq_f32
(
tmp2
,
input
[
1
][
2
],
k
[
1
],
2
);
tmp1
=
vmlaq_laneq_f32
(
tmp1
,
input
[
2
][
0
],
k
[
2
],
0
);
tmp2
=
vmlaq_laneq_f32
(
tmp2
,
input
[
2
][
1
],
k
[
2
],
1
);
tmp1
=
vmlaq_laneq_f32
(
tmp1
,
input
[
2
][
2
],
k
[
2
],
2
);
tmp1
=
vaddq_f32
(
tmp1
,
tmp2
);
vst1q_f32
(
outputData
,
tmp1
);
...
...
@@ -316,22 +316,22 @@ struct DepthwiseConvKernel<4, 1> {
float32x4_t
tmp1
=
vdupq_n_f32
(
0.
f
);
float32x4_t
tmp2
=
vdupq_n_f32
(
0.
f
);
tmp1
=
vmlaq_laneq_f32
<
0
>
(
tmp1
,
input
[
0
][
0
],
k
[
0
]
);
tmp2
=
vmlaq_laneq_f32
<
1
>
(
tmp2
,
input
[
0
][
1
],
k
[
0
]
);
tmp1
=
vmlaq_laneq_f32
<
2
>
(
tmp1
,
input
[
0
][
2
],
k
[
0
]
);
tmp2
=
vmlaq_laneq_f32
<
3
>
(
tmp2
,
input
[
0
][
3
],
k
[
0
]
);
tmp1
=
vmlaq_laneq_f32
<
0
>
(
tmp1
,
input
[
1
][
0
],
k
[
1
]
);
tmp2
=
vmlaq_laneq_f32
<
1
>
(
tmp2
,
input
[
1
][
1
],
k
[
1
]
);
tmp1
=
vmlaq_laneq_f32
<
2
>
(
tmp1
,
input
[
1
][
2
],
k
[
1
]
);
tmp2
=
vmlaq_laneq_f32
<
3
>
(
tmp2
,
input
[
1
][
3
],
k
[
1
]
);
tmp1
=
vmlaq_laneq_f32
<
0
>
(
tmp1
,
input
[
2
][
0
],
k
[
2
]
);
tmp2
=
vmlaq_laneq_f32
<
1
>
(
tmp2
,
input
[
2
][
1
],
k
[
2
]
);
tmp1
=
vmlaq_laneq_f32
<
2
>
(
tmp1
,
input
[
2
][
2
],
k
[
2
]
);
tmp2
=
vmlaq_laneq_f32
<
3
>
(
tmp2
,
input
[
2
][
3
],
k
[
2
]
);
tmp1
=
vmlaq_laneq_f32
<
0
>
(
tmp1
,
input
[
3
][
0
],
k
[
3
]
);
tmp2
=
vmlaq_laneq_f32
<
1
>
(
tmp2
,
input
[
3
][
1
],
k
[
3
]
);
tmp1
=
vmlaq_laneq_f32
<
2
>
(
tmp1
,
input
[
3
][
2
],
k
[
3
]
);
tmp2
=
vmlaq_laneq_f32
<
3
>
(
tmp2
,
input
[
3
][
3
],
k
[
3
]
);
tmp1
=
vmlaq_laneq_f32
(
tmp1
,
input
[
0
][
0
],
k
[
0
],
0
);
tmp2
=
vmlaq_laneq_f32
(
tmp2
,
input
[
0
][
1
],
k
[
0
],
1
);
tmp1
=
vmlaq_laneq_f32
(
tmp1
,
input
[
0
][
2
],
k
[
0
],
2
);
tmp2
=
vmlaq_laneq_f32
(
tmp2
,
input
[
0
][
3
],
k
[
0
],
3
);
tmp1
=
vmlaq_laneq_f32
(
tmp1
,
input
[
1
][
0
],
k
[
1
],
0
);
tmp2
=
vmlaq_laneq_f32
(
tmp2
,
input
[
1
][
1
],
k
[
1
],
1
);
tmp1
=
vmlaq_laneq_f32
(
tmp1
,
input
[
1
][
2
],
k
[
1
],
2
);
tmp2
=
vmlaq_laneq_f32
(
tmp2
,
input
[
1
][
3
],
k
[
1
],
3
);
tmp1
=
vmlaq_laneq_f32
(
tmp1
,
input
[
2
][
0
],
k
[
2
],
0
);
tmp2
=
vmlaq_laneq_f32
(
tmp2
,
input
[
2
][
1
],
k
[
2
],
1
);
tmp1
=
vmlaq_laneq_f32
(
tmp1
,
input
[
2
][
2
],
k
[
2
],
2
);
tmp2
=
vmlaq_laneq_f32
(
tmp2
,
input
[
2
][
3
],
k
[
2
],
3
);
tmp1
=
vmlaq_laneq_f32
(
tmp1
,
input
[
3
][
0
],
k
[
3
],
0
);
tmp2
=
vmlaq_laneq_f32
(
tmp2
,
input
[
3
][
1
],
k
[
3
],
1
);
tmp1
=
vmlaq_laneq_f32
(
tmp1
,
input
[
3
][
2
],
k
[
3
],
2
);
tmp2
=
vmlaq_laneq_f32
(
tmp2
,
input
[
3
][
3
],
k
[
3
],
3
);
tmp1
=
vaddq_f32
(
tmp1
,
tmp2
);
vst1q_f32
(
outputData
,
tmp1
);
...
...
@@ -431,22 +431,22 @@ struct DepthwiseConvKernel<4, 2> {
float32x4_t
tmp1
=
vdupq_n_f32
(
0.
f
);
float32x4_t
tmp2
=
vdupq_n_f32
(
0.
f
);
tmp1
=
vmlaq_laneq_f32
<
0
>
(
tmp1
,
input
[
0
][
0
],
k
[
0
]
);
tmp2
=
vmlaq_laneq_f32
<
1
>
(
tmp2
,
input
[
0
][
1
],
k
[
0
]
);
tmp1
=
vmlaq_laneq_f32
<
2
>
(
tmp1
,
input
[
0
][
2
],
k
[
0
]
);
tmp2
=
vmlaq_laneq_f32
<
3
>
(
tmp2
,
input
[
0
][
3
],
k
[
0
]
);
tmp1
=
vmlaq_laneq_f32
<
0
>
(
tmp1
,
input
[
1
][
0
],
k
[
1
]
);
tmp2
=
vmlaq_laneq_f32
<
1
>
(
tmp2
,
input
[
1
][
1
],
k
[
1
]
);
tmp1
=
vmlaq_laneq_f32
<
2
>
(
tmp1
,
input
[
1
][
2
],
k
[
1
]
);
tmp2
=
vmlaq_laneq_f32
<
3
>
(
tmp2
,
input
[
1
][
3
],
k
[
1
]
);
tmp1
=
vmlaq_laneq_f32
<
0
>
(
tmp1
,
input
[
2
][
0
],
k
[
2
]
);
tmp2
=
vmlaq_laneq_f32
<
1
>
(
tmp2
,
input
[
2
][
1
],
k
[
2
]
);
tmp1
=
vmlaq_laneq_f32
<
2
>
(
tmp1
,
input
[
2
][
2
],
k
[
2
]
);
tmp2
=
vmlaq_laneq_f32
<
3
>
(
tmp2
,
input
[
2
][
3
],
k
[
2
]
);
tmp1
=
vmlaq_laneq_f32
<
0
>
(
tmp1
,
input
[
3
][
0
],
k
[
3
]
);
tmp2
=
vmlaq_laneq_f32
<
1
>
(
tmp2
,
input
[
3
][
1
],
k
[
3
]
);
tmp1
=
vmlaq_laneq_f32
<
2
>
(
tmp1
,
input
[
3
][
2
],
k
[
3
]
);
tmp2
=
vmlaq_laneq_f32
<
3
>
(
tmp2
,
input
[
3
][
3
],
k
[
3
]
);
tmp1
=
vmlaq_laneq_f32
(
tmp1
,
input
[
0
][
0
],
k
[
0
],
0
);
tmp2
=
vmlaq_laneq_f32
(
tmp2
,
input
[
0
][
1
],
k
[
0
],
1
);
tmp1
=
vmlaq_laneq_f32
(
tmp1
,
input
[
0
][
2
],
k
[
0
],
2
);
tmp2
=
vmlaq_laneq_f32
(
tmp2
,
input
[
0
][
3
],
k
[
0
],
3
);
tmp1
=
vmlaq_laneq_f32
(
tmp1
,
input
[
1
][
0
],
k
[
1
],
0
);
tmp2
=
vmlaq_laneq_f32
(
tmp2
,
input
[
1
][
1
],
k
[
1
],
1
);
tmp1
=
vmlaq_laneq_f32
(
tmp1
,
input
[
1
][
2
],
k
[
1
],
2
);
tmp2
=
vmlaq_laneq_f32
(
tmp2
,
input
[
1
][
3
],
k
[
1
],
3
);
tmp1
=
vmlaq_laneq_f32
(
tmp1
,
input
[
2
][
0
],
k
[
2
],
0
);
tmp2
=
vmlaq_laneq_f32
(
tmp2
,
input
[
2
][
1
],
k
[
2
],
1
);
tmp1
=
vmlaq_laneq_f32
(
tmp1
,
input
[
2
][
2
],
k
[
2
],
2
);
tmp2
=
vmlaq_laneq_f32
(
tmp2
,
input
[
2
][
3
],
k
[
2
],
3
);
tmp1
=
vmlaq_laneq_f32
(
tmp1
,
input
[
3
][
0
],
k
[
3
],
0
);
tmp2
=
vmlaq_laneq_f32
(
tmp2
,
input
[
3
][
1
],
k
[
3
],
1
);
tmp1
=
vmlaq_laneq_f32
(
tmp1
,
input
[
3
][
2
],
k
[
3
],
2
);
tmp2
=
vmlaq_laneq_f32
(
tmp2
,
input
[
3
][
3
],
k
[
3
],
3
);
tmp1
=
vaddq_f32
(
tmp1
,
tmp2
);
vst1q_f32
(
outputData
,
tmp1
);
...
...
paddle/function/neon/neon_util.h
浏览文件 @
3360e9cd
...
...
@@ -33,12 +33,8 @@ inline float32_t vaddvq_f32(float32x4_t a) {
return
vget_lane_f32
(
vpadd_f32
(
v
,
v
),
0
);
}
template
<
int
lane
>
inline
float32x4_t
vmlaq_laneq_f32
(
float32x4_t
a
,
float32x4_t
b
,
float32x4_t
v
)
{
return
vmlaq_n_f32
(
a
,
b
,
vgetq_lane_f32
(
v
,
lane
));
}
#define vmlaq_laneq_f32(a, b, v, lane) \
vmlaq_n_f32(a, b, vgetq_lane_f32(v, lane))
#endif
}
// namespace neon
...
...
paddle/scripts/docker/build_android.sh
浏览文件 @
3360e9cd
...
...
@@ -36,6 +36,7 @@ elif [ $ANDROID_ABI == "arm64-v8a" ]; then
-DUSE_EIGEN_FOR_BLAS
=
OFF
\
-DWITH_C_API
=
ON
\
-DWITH_SWIG_PY
=
OFF
\
-DWITH_STYLE_CHECK
=
OFF
\
..
elif
[
$ANDROID_ABI
==
"armeabi"
]
;
then
cmake
-DCMAKE_SYSTEM_NAME
=
Android
\
...
...
@@ -48,10 +49,11 @@ elif [ $ANDROID_ABI == "armeabi" ]; then
-DCMAKE_BUILD_TYPE
=
Release
\
-DWITH_C_API
=
ON
\
-DWITH_SWIG_PY
=
OFF
\
-DWITH_STYLE_CHECK
=
OFF
\
..
else
echo
"Invalid ANDROID_ABI:
$ANDROID_ABI
"
fi
make
VERBOSE
=
1
-j2
make
install
-j
2
make
-j
`
nproc
`
make
install
-j
`
nproc
`
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录