PaddlePaddle / Paddle
Commit 58bf3c48
Authored June 21, 2019 by nhzlx

Merge branch 'incubate/lite' of http://10.87.145.36/inference/paddlelite into xzl/incubate/lite

Parents: 7e2ecbd6, 758db8df
Showing 15 changed files, with 1,314 additions and 869 deletions (+1314 −869).
Changed files:

  .gitlab-ci.yml                                              +76   −28
  paddle/fluid/lite/api/cxx_api_bin.cc                         +4    −3
  paddle/fluid/lite/core/context.cc                            +1  −316
  paddle/fluid/lite/core/context.h                            +19   −25
  paddle/fluid/lite/core/cpu_info.cc                         +747  −417
  paddle/fluid/lite/core/cpu_info.h                           +55   −58
  paddle/fluid/lite/kernels/arm/conv_compute.cc                +0    −2
  paddle/fluid/lite/kernels/arm/fc_compute.cc                  +0    −1
  paddle/fluid/lite/kernels/arm/mul_compute.cc                 +0    −1
  paddle/fluid/lite/kernels/arm/pool_compute.cc                +0    −1
  paddle/fluid/lite/kernels/x86/CMakeLists.txt                 +3    −0
  paddle/fluid/lite/kernels/x86/batch_norm_compute.cc         +30    −0
  paddle/fluid/lite/kernels/x86/batch_norm_compute.h         +158    −0
  paddle/fluid/lite/kernels/x86/batch_norm_compute_test.cc   +139    −0
  paddle/fluid/lite/tools/build.sh                            +82   −17

In the reconstructed hunks below, lines from the pre-change revision are shown as comments marked "old:" or "removed:".
.gitlab-ci.yml

The proxy setup, ccache install, and the merge of the latest origin/incubate/lite now happen once in the global `before_script` instead of being repeated in each job, and the mobile job is split into `build:mobile_android`, `build:mobile_armlinux`, and a new `build:mobile_model_mobilenetv2` job.

```yaml
@@ -2,6 +2,20 @@ before_script:
  - env
  - export CI_USER_DIR=$(pwd)
  # prepare ccache
  - apt install ccache
  # for proxy
  - export http_proxy=$CI_PROXY
  - export https_proxy=$CI_PROXY
  # merge the latest code
  - git config --global user.email "you@example.com"
  - git config --global user.name "Your Name"
  - git fetch origin incubate/lite
  - git merge --no-ff origin/incubate/lite

image: $SERVER_LITE_DOCKER_IMAGE

stages:

@@ -14,19 +28,13 @@ check:prebuilt:
  tags:
    - lite
  stage: ci
  script:
    # prepare for pre-commit
    - rm -rf ~/.pip
    - export http_proxy=$CI_PROXY        # removed: now in before_script
    - export https_proxy=$CI_PROXY       # removed: now in before_script
    - pip install pre-commit
    - pre-commit install
    # removed: merge the latest code (now in before_script)
    # - git config --global user.email "you@example.com"
    # - git config --global user.name "Your Name"
    # - git fetch origin incubate/lite
    # - git merge --no-ff origin/incubate/lite
    - ./paddle/fluid/lite/tools/build.sh check_style
  cache:
    key: check_style
    paths:

@@ -42,17 +50,11 @@ build:server:
    paths:
      - build/third_party
      - ~/.ccache                        # removed in favor of the per-job ccache dir
      - $CI_PROJECT_DIR/_build_server_ccache
  script:
    - apt install ccache                 # removed: now in before_script
    - export http_proxy=$CI_PROXY        # removed: now in before_script
    - export https_proxy=$CI_PROXY       # removed: now in before_script
    # removed: merge the latest code (now in before_script)
    # - git config --global user.email "you@example.com"
    # - git config --global user.name "Your Name"
    # - git fetch origin incubate/lite
    # - git merge --no-ff origin/incubate/lite
    # customize ccache path for specifying runner cache
    - export CCACHE_DIR=$CI_PROJECT_DIR/_build_server_ccache
    # run build and test
    - mkdir -p build
    - cd build
    - ../paddle/fluid/lite/tools/build.sh cmake_x86

@@ -66,7 +68,27 @@ build:server:
  dependencies:
    - check:prebuilt

# old job name: build:mobile
build:mobile_android:
  tags:
    - lite
  stage: build_mobile
  image: $MOBILE_LITE_DOCKER_IMAGE
  cache:
    key: mobile_thirdparty
    paths:
      - $MOBILE_LITE_CACHE0
      - $MOBILE_LITE_CACHE1
      - ~/.ccache
      - $CI_PROJECT_DIR/build_mobile_ccache
  script:
    - export CCACHE_DIR=$CI_PROJECT_DIR/build_mobile_ccache
    - ./paddle/fluid/lite/tools/build.sh build_test_arm_subtask_android
  dependencies:
    - build:server

build:mobile_armlinux:
  tags:
    - lite
  stage: build_mobile

@@ -77,17 +99,43 @@ build:mobile:
    paths:
      - $MOBILE_LITE_CACHE0
      - $MOBILE_LITE_CACHE1
      - ~/.ccache
      - $CI_PROJECT_DIR/build_mobile_ccache2
  script:
    - apt install ccache                 # removed: now in before_script
    - export http_proxy=$CI_PROXY        # removed: now in before_script
    - export https_proxy=$CI_PROXY       # removed: now in before_script
    - export CCACHE_DIR=$CI_PROJECT_DIR/build_mobile_ccache2
    - ./paddle/fluid/lite/tools/build.sh build_test_arm_subtask_armlinux
  dependencies:
    - build:server
  # removed trailing script/cache of the old build:mobile job:
  #   # merge the latest code
  #   - git config --global user.email "you@example.com"
  #   - git config --global user.name "Your Name"
  #   - git fetch origin incubate/lite
  #   - git merge --no-ff origin/incubate/lite
  # cache:
  #   key: mobile_thirdparty
  #   paths:
  #     - $MOBILE_LITE_CACHE0
  #     - $MOBILE_LITE_CACHE1
  #     - ~/.ccache

build:mobile_model_mobilenetv2:
  tags:
    - lite
  stage: build_mobile
  image: $MOBILE_LITE_DOCKER_IMAGE
  cache:
    key: mobile_thirdparty
    paths:
      - $MOBILE_LITE_CACHE0
      - $MOBILE_LITE_CACHE1
      - ~/.ccache
      - $CI_PROJECT_DIR/build_mobile_model1
  script:
    - export CCACHE_DIR=$CI_PROJECT_DIR/build_mobile_model1
    - ./paddle/fluid/lite/tools/build.sh build_test_arm_model1
    # old: ./paddle/fluid/lite/tools/build.sh build_test_arm
  dependencies:
    - build:server
```
paddle/fluid/lite/api/cxx_api_bin.cc

Run() gains a thread_num parameter, which is passed to DeviceInfo::Global().SetRunMode() on ARM, and main() now expects four arguments instead of three.

```cpp
@@ -29,9 +29,10 @@ double time_diff(Time t1, Time t2) {
  return counter.count() / 1000.0;
}

// old: void Run(const char* model_dir, int repeat) {
void Run(const char* model_dir, int repeat, int thread_num) {
#ifdef LITE_WITH_ARM
  DeviceInfo::Init();
  DeviceInfo::Global().SetRunMode(LITE_POWER_HIGH, thread_num);
#endif
  lite::ExecutorLite predictor;
  std::vector<Place> valid_places({Place{TARGET(kHost), PRECISION(kFloat)},

@@ -67,8 +68,8 @@ void Run(const char* model_dir, int repeat) {
}  // namespace paddle

int main(int argc, char** argv) {
  // old:
  // CHECK_EQ(argc, 3) << "usage: ./cmd <model_dir> <repeat>";
  // paddle::lite::Run(argv[1], std::stoi(argv[2]));
  CHECK_EQ(argc, 4) << "usage: ./cmd <model_dir> <repeat> <thread_num>";
  paddle::lite::Run(argv[1], std::stoi(argv[2]), std::stoi(argv[3]));
  return 0;
}
```
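For reference, a minimal standalone sketch (not part of the commit) of the new three-argument CLI contract. The plain `if` check stands in for Paddle's CHECK_EQ, and the printed summary is illustrative only:

```cpp
#include <cstdlib>
#include <iostream>
#include <string>

int main(int argc, char** argv) {
  // The commit changes the expected argument count from 3 to 4:
  // <model_dir> <repeat> <thread_num>.
  if (argc != 4) {
    std::cerr << "usage: ./cmd <model_dir> <repeat> <thread_num>\n";
    return EXIT_FAILURE;
  }
  const std::string model_dir = argv[1];
  const int repeat = std::stoi(argv[2]);      // throws std::invalid_argument on bad input
  const int thread_num = std::stoi(argv[3]);
  std::cout << "model_dir=" << model_dir << " repeat=" << repeat
            << " thread_num=" << thread_num << "\n";
  return EXIT_SUCCESS;
}
```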
paddle/fluid/lite/core/context.cc

All 316 removed lines belong to the Context<TargetType::kARM> implementation, which this commit moves into DeviceInfo (see cpu_info.h below). After the removal, the file keeps only the context.h include and empty namespace blocks. The removed body:

```cpp
@@ -13,322 +13,7 @@
// limitations under the License.

#include "paddle/fluid/lite/core/context.h"
#include "paddle/fluid/lite/core/cpu_info.h"

#ifdef LITE_WITH_LINUX
#include <sys/syscall.h>
#include <unistd.h>
#endif
#if __APPLE__
#include "TargetConditionals.h"
#if TARGET_OS_IPHONE
#include <mach/machine.h>
#include <sys/sysctl.h>
#include <sys/types.h>
#endif  // TARGET_OS_IPHONE
#endif  // __APPLE__
#ifdef ARM_WITH_OMP
#include <omp.h>
#endif

namespace paddle {
namespace lite {

#ifdef LITE_WITH_ARM

void Context<TargetType::kARM>::SetCache(int l1size, int l2size, int l3size) {
  DeviceInfo& dev = DeviceInfo::Global();
  int cpu_count = arm_get_cpucount();
  dev.L1_cache_.resize(cpu_count);
  dev.L2_cache_.resize(cpu_count);
  dev.L3_cache_.resize(cpu_count);
  for (int i = 0; i < cpu_count; ++i) {
    dev.L1_cache_[i] = l1size;
    dev.L2_cache_[i] = l2size;
    dev.L3_cache_[i] = l3size;
  }
  workspace_.Resize({2 * (l1size + l2size)});
}

Context<TargetType::kARM>::Context() {
  active_ids_ = {0};
  mode_ = LITE_POWER_HIGH;
  DeviceInfo& dev = DeviceInfo::Global();
  workspace_.Resize(
      {static_cast<int64_t>(dev.L2_cache_[active_ids_[0]] / sizeof(float))});
#ifdef TARGET_IOS
  arch_ = APPLE;  // use 6x8
#else
  if (dev.big_core_ids_.size() > 0) {
    arch_ = dev.archs_[dev.big_core_ids_[0]];
  }
#endif
}

PowerMode Context<TargetType::kARM>::mode() const { return mode_; }

int Context<TargetType::kARM>::threads() const { return active_ids_.size(); }

Context<TargetType::kARM>::Context(const ARMContext& ctx) {
  mode_ = ctx.mode_;
  active_ids_ = ctx.active_ids_;
  workspace_ = ctx.workspace_;
  arch_ = ctx.arch_;
  count_ = ctx.count_;
}

ARMContext& Context<TargetType::kARM>::operator=(const ARMContext& ctx) {
  mode_ = ctx.mode_;
  active_ids_ = ctx.active_ids_;
  workspace_ = ctx.workspace_;
  arch_ = ctx.arch_;
  count_ = ctx.count_;
  return *this;
}

void Context<TargetType::kARM>::BindDev() {
#ifdef ARM_WITH_OMP
  int num_threads = active_ids_.size();
  omp_set_num_threads(num_threads);
#ifdef LITE_WITH_LINUX
  std::vector<int> ssarets;
  for (int j = 0; j < num_threads; ++j) {
    ssarets.push_back(0);
  }
#pragma omp parallel for
  for (int i = 0; i < num_threads; i++) {
    ssarets[i] = set_sched_affinity(active_ids_);
  }
  for (int i = 0; i < num_threads; i++) {
    if (ssarets[i] != 0) {
      LOG(ERROR) << "set cpu affinity failed, cpuID: " << active_ids_[i];
      return;
    }
  }
#endif  // LITE_WITH_LINUX
#else   // ARM_WITH_OMP
#ifdef LITE_WITH_LINUX
  std::vector<int> cpuid1;
  cpuid1.push_back(active_ids_[0]);
  int ssaret = set_sched_affinity(cpuid1);
  if (ssaret != 0) {
    printf("set cpu affinity failed, cpuID: %d\n", active_ids_[0]);
    return;
  }
#endif  // LITE_WITH_LINUX
#endif  // ARM_WITH_OMP
}

void Context<TargetType::kARM>::SetRunMode(PowerMode mode, int threads) {
  DeviceInfo& dev = DeviceInfo::Global();
  int big_core_size = dev.big_core_ids_.size();
  int small_core_size = dev.little_core_ids_.size();
  if (threads > big_core_size + small_core_size) {
    threads = big_core_size + small_core_size;
  }
#ifdef ARM_WITH_OMP
  count_++;
  int shift_num = (count_ / 10) % big_core_size;
  switch (mode) {
    case LITE_POWER_FULL:
      mode_ = mode;
      active_ids_.clear();
      for (int i = 0; i < threads; ++i) {
        if (i < big_core_size) {
          active_ids_.push_back(dev.big_core_ids_[i]);
        } else {
          active_ids_.push_back(dev.little_core_ids_[i - big_core_size]);
        }
      }
      if (active_ids_.size() == 0) {
        active_ids_.push_back(0);
      }
      break;
    case LITE_POWER_HIGH:
      active_ids_.clear();
      if (big_core_size > 0) {
        mode_ = LITE_POWER_HIGH;
        if (threads > big_core_size) {
          LOG(ERROR) << "threads: " << threads
                     << ", exceed the big cores size: " << big_core_size;
          active_ids_ = dev.big_core_ids_;
        } else {
          for (int i = 0; i < threads; ++i) {
            active_ids_.push_back(dev.big_core_ids_[i]);
          }
        }
      } else {
        mode_ = LITE_POWER_LOW;
        LOG(ERROR) << "HIGH POWER MODE is not support, switch to little cores";
        if (threads > small_core_size) {
          active_ids_ = dev.little_core_ids_;
        } else {
          for (int i = 0; i < threads; ++i) {
            active_ids_.push_back(dev.little_core_ids_[i]);
          }
        }
      }
      if (active_ids_.size() == 0) {
        active_ids_.push_back(0);
      }
      break;
    case LITE_POWER_LOW:
      active_ids_.clear();
      if (small_core_size > 0) {
        mode_ = LITE_POWER_LOW;
        if (threads > small_core_size) {
          LOG(WARNING) << "threads: " << threads
                       << ", exceed the little cores size: " << small_core_size;
          active_ids_ = dev.little_core_ids_;
        } else {
          for (int i = 0; i < threads; ++i) {
            active_ids_.push_back(dev.little_core_ids_[i]);
          }
        }
      } else {
        mode_ = LITE_POWER_HIGH;
        LOG(WARNING) << "LOW POWER MODE is not support, switch to big cores";
        if (threads > big_core_size) {
          active_ids_ = dev.big_core_ids_;
        } else {
          for (int i = 0; i < threads; ++i) {
            active_ids_.push_back(dev.big_core_ids_[i]);
          }
        }
      }
      if (active_ids_.size() == 0) {
        active_ids_.push_back(0);
      }
      break;
    case LITE_POWER_NO_BIND:
      mode_ = LITE_POWER_NO_BIND;
      active_ids_.clear();
      if (threads > dev.core_ids_.size()) {
        active_ids_.resize(dev.core_ids_.size());
      } else {
        active_ids_.resize(threads);
      }
      break;
    case LITE_POWER_RAND_HIGH:
      active_ids_.clear();
      if (big_core_size > 0) {
        mode_ = LITE_POWER_RAND_HIGH;
        if (threads > big_core_size) {
          LOG(WARNING) << "threads: " << threads
                       << ", exceed the big cores size: " << big_core_size;
          active_ids_ = dev.big_core_ids_;
        } else {
          for (int i = 0; i < threads; ++i) {
            active_ids_.push_back(
                dev.big_core_ids_[(i + shift_num) % big_core_size]);
          }
        }
      } else {
        mode_ = LITE_POWER_LOW;
        LOG(WARNING) << "HIGH POWER MODE is not support, switch to little cores";
        if (threads > small_core_size) {
          active_ids_ = dev.little_core_ids_;
        } else {
          for (int i = 0; i < threads; ++i) {
            active_ids_.push_back(dev.little_core_ids_[i]);
          }
        }
      }
      if (active_ids_.size() == 0) {
        active_ids_.push_back(0);
      }
      break;
    case LITE_POWER_RAND_LOW:
      active_ids_.clear();
      if (small_core_size > 0) {
        mode_ = LITE_POWER_RAND_LOW;
        if (threads > small_core_size) {
          LOG(WARNING) << "threads: " << threads
                       << ", exceed the little cores size: " << small_core_size;
          active_ids_ = dev.little_core_ids_;
        } else {
          for (int i = 0; i < threads; ++i) {
            active_ids_.push_back(
                dev.little_core_ids_[(i + shift_num) % small_core_size]);
          }
        }
      } else {
        mode_ = LITE_POWER_HIGH;
        LOG(WARNING) << "LOW POWER MODE is not support, switch to big cores";
        if (threads > big_core_size) {
          active_ids_ = dev.big_core_ids_;
        } else {
          for (int i = 0; i < threads; ++i) {
            active_ids_.push_back(dev.big_core_ids_[i]);
          }
        }
      }
      if (active_ids_.size() == 0) {
        active_ids_.push_back(0);
      }
      break;
  }
  //! fix multi-threads LITE_POWER_HIGH mode
  if (mode_ == LITE_POWER_NO_BIND || threads > 1) {
    int threads = active_ids_.size();
    omp_set_num_threads(threads);
  } else {
    if (check_online(active_ids_)) {
      BindDev();
    } else {
      LOG(ERROR) << "core id " << active_ids_[0]
                 << " is offline, switch to NO BIND MODE";
      int threads = active_ids_.size();
      omp_set_num_threads(threads);
    }
  }
#else
  if (big_core_size > 0) {
    active_ids_ = {dev.big_core_ids_[0]};
  } else {
    active_ids_ = {0};
  }
#endif
  //! alloc memory for sgemm in this context
  int temp_mem_size =
      DeviceInfo::Global().L2_cache_[active_ids_[0]] / sizeof(float);
  workspace_.Resize({temp_mem_size});
  arch_ = DeviceInfo::Global().archs_[active_ids_[0]];
}

ARMArch Context<TargetType::kARM>::arch() const { return arch_; }

void Context<TargetType::kARM>::SetArch(ARMArch arch) { arch_ = arch; }

int Context<TargetType::kARM>::l1_cache_size() const {
  DeviceInfo& dev = DeviceInfo::Global();
  return dev.L1_cache_[active_ids_[0]];
}

int Context<TargetType::kARM>::l2_cache_size() const {
  DeviceInfo& dev = DeviceInfo::Global();
  return dev.L2_cache_[active_ids_[0]];
}

int Context<TargetType::kARM>::l3_cache_size() const {
  DeviceInfo& dev = DeviceInfo::Global();
  return dev.L3_cache_[active_ids_[0]];
}

bool Context<TargetType::kARM>::ExtendWorkspace(DDimLite dims) {
  auto count = dims.product();
  auto old = workspace_.dims();
  if (count == old.product()) {
    return false;
  }
  workspace_.Resize(
      {static_cast<int64_t>(count + l2_cache_size() / sizeof(float))});
  return true;
}

#endif  // LITE_WITH_ARM

}  // namespace lite (now: namespace lite {})
}  // namespace paddle
```
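The removed BindDev() relies on a set_sched_affinity() helper that is not shown in this diff. A minimal sketch of what such a helper typically looks like, assuming the standard Linux sched_setaffinity(2) interface; the actual Paddle-Lite helper may differ (for example, by issuing the raw syscall on Android):

```cpp
#include <sched.h>   // sched_setaffinity, cpu_set_t (GNU/Linux only)
#include <vector>

// Pin the calling thread to the given CPU ids; returns 0 on success.
int set_sched_affinity(const std::vector<int>& cpu_ids) {
  cpu_set_t mask;
  CPU_ZERO(&mask);
  for (int id : cpu_ids) {
    CPU_SET(id, &mask);
  }
  // pid 0 means "the calling thread".
  return sched_setaffinity(0, sizeof(mask), &mask);
}
```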
paddle/fluid/lite/core/context.h

Context<TargetType::kARM> keeps its interface but loses all of its state: every method now forwards to the DeviceInfo singleton, and the private members (arch_, mode_, active_ids_, workspace_, count_) are deleted.

```cpp
@@ -61,47 +61,41 @@ class Context<TargetType::kHost> {
template <>
class Context<TargetType::kARM> {
 public:
  // removed:
  // Context();
  // Context(PowerMode mode, int threads);
  Context() {}
  explicit Context(const ARMContext& ctx);
  // removed: ARMContext& operator=(const ARMContext& ctx);
  ARMContext& operator=(const ARMContext& ctx) {}

  // NOTE: InitOnce should only be used by ContextScheduler
  void InitOnce() { DeviceInfo::Init(); }
  void CopyShared(const ARMContext* ctx) {}

  // removed declarations (replaced by the forwarding definitions below):
  // void SetRunMode(PowerMode mode, int threads);
  // void SetCache(int l1size, int l2size, int l3size);
  // void SetArch(ARMArch arch);
  // void BindDev();
  void SetRunMode(PowerMode mode, int threads) {
    return DeviceInfo::Global().SetRunMode(mode, threads);
  }
  void SetCache(int l1size, int l2size, int l3size) {
    return DeviceInfo::Global().SetCache(l1size, l2size, l3size);
  }
  void SetArch(ARMArch arch) { return DeviceInfo::Global().SetArch(arch); }

  // removed:
  // PowerMode mode() const;
  // int threads() const;
  // ARMArch arch() const;
  PowerMode mode() const { return DeviceInfo::Global().mode(); }
  int threads() const { return DeviceInfo::Global().threads(); }
  ARMArch arch() const { return DeviceInfo::Global().arch(); }
  int l1_cache_size() const { return DeviceInfo::Global().l1_cache_size(); }
  int l2_cache_size() const { return DeviceInfo::Global().l2_cache_size(); }
  int l3_cache_size() const { return DeviceInfo::Global().l3_cache_size(); }

  template <typename T>
  T* workspace_data() {
    // removed: return workspace_.mutable_data<T>();
    return DeviceInfo::Global().workspace_data<T>();
  }

  // removed: int l1_cache_size() const;
  //          int l2_cache_size() const;
  //          int l3_cache_size() const;
  // removed: bool ExtendWorkspace(DDimLite dims);
  bool ExtendWorkspace(DDimLite dims) {
    return DeviceInfo::Global().ExtendWorkspace(dims);
  }

  std::string name() const { return "ARMContext"; }

  // removed private state:
  // private:
  //  ARMArch arch_;
  //  // LITE_POWER_HIGH stands for using big cores,
  //  // LITE_POWER_LOW stands for using small core,
  //  // LITE_POWER_FULL stands for using all cores
  //  PowerMode mode_;
  //  std::vector<int> active_ids_;
  //  TensorLite workspace_;
  //  int64_t count_{0};
};
#endif
```
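The rewritten Context<kARM> is a stateless facade: every accessor forwards to the DeviceInfo singleton, so all copies of the context share one set of CPU settings. A minimal sketch of that delegation pattern (the names Device and Facade are illustrative, not Paddle's):

```cpp
#include <iostream>

// Process-wide hardware state, constructed once on first use.
class Device {
 public:
  static Device& Global() {
    static Device instance;  // initialization is thread-safe in C++11
    return instance;
  }
  void set_threads(int n) { threads_ = n; }
  int threads() const { return threads_; }

 private:
  Device() = default;
  int threads_{1};
};

// Cheap, copyable handle; all state lives in the singleton.
class Facade {
 public:
  void set_threads(int n) { Device::Global().set_threads(n); }
  int threads() const { return Device::Global().threads(); }
};

int main() {
  Facade a, b;
  a.set_threads(4);
  std::cout << b.threads() << "\n";  // prints 4: a and b share state
}
```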
paddle/fluid/lite/core/cpu_info.cc

(This diff, +747 −417, is collapsed on the commit page; it carries the DeviceInfo implementation counterpart of the header changes below.)
paddle/fluid/lite/core/cpu_info.h

DeviceInfo changes from a plain struct of public fields (initialized by a static InitInternal()) into a proper singleton that owns the run-mode, affinity, and workspace state that used to live in Context<kARM>.

```cpp
@@ -14,24 +14,12 @@
#pragma once

#include <cstdarg>
#include <string>
#include <vector>
#include "paddle/fluid/lite/core/lite_tensor.h"
#include "paddle/fluid/lite/utils/cp_logging.h"

// removed (platform includes no longer needed in this header):
// #ifdef LITE_WITH_LINUX
// #include <sys/syscall.h>
// #include <unistd.h>
// #endif
// #if __APPLE__
// #include "TargetConditionals.h"
// #if TARGET_OS_IPHONE
// #include <mach/machine.h>
// #include <sys/sysctl.h>
// #include <sys/types.h>
// #endif  // TARGET_OS_IPHONE
// #endif  // __APPLE__

namespace paddle {
namespace lite {

@@ -60,64 +48,73 @@ typedef enum {
class DeviceInfo {
 public:
  // removed public data members (now private, see below):
  // int idx_;
  // int max_freq_;
  // int min_freq_;
  // int generate_arch_;
  // int compute_core_num_;
  // int max_memory_;
  // int sharemem_size_;
  // std::string device_name_;
  // std::string compute_ability_;
  // std::vector<int> L1_cache_, L2_cache_, L3_cache_;
  // std::vector<int> core_ids_, big_core_ids_, little_core_ids_, cluster_ids_;
  // std::vector<ARMArch> archs_;

  static DeviceInfo& Global() {
    static auto* x = new DeviceInfo;
    return *x;
  }

  // removed:
  // static void Init() {
  //   auto& info = Global();
  //   InitInternal(&info);
  // }
  static int Init() {
    static int ret = Global().Setup();
    return ret;
  }

  int Setup();

  void SetRunMode(PowerMode mode, int thread_num);
  void SetCache(int l1size, int l2size, int l3size);
  void SetArch(ARMArch arch) { arch_ = arch; }

  PowerMode mode() const { return mode_; }
  int threads() const { return active_ids_.size(); }
  ARMArch arch() const { return arch_; }
  int l1_cache_size() const { return L1_cache_[active_ids_[0]]; }
  int l2_cache_size() const { return L2_cache_[active_ids_[0]]; }
  int l3_cache_size() const { return L3_cache_[active_ids_[0]]; }

  template <typename T>
  T* workspace_data() {
    return workspace_.mutable_data<T>();
  }
  bool ExtendWorkspace(DDimLite dims);

  // removed free-function declarations (their logic moves into DeviceInfo):
  // size_t arm_get_meminfo();
  // int arm_get_cpucount();
  // void arm_get_cpu_arch(std::vector<ARMArch>* archs);
  // bool get_cpu_info_from_name(DeviceInfo* cpu_info, std::string hardware_name);
  // std::string arm_get_cpu_name();
  // #ifdef LITE_WITH_LINUX
  // void set_default_cache(DeviceInfo* dev);
  // #endif
  // static void InitInternal(DeviceInfo* dev);

 private:
  int core_num_;
  std::vector<int> max_freqs_;
  std::vector<int> min_freqs_;
  int mem_size_;
  std::string dev_name_;

  std::vector<int> L1_cache_;
  std::vector<int> L2_cache_;
  std::vector<int> L3_cache_;
  std::vector<int> core_ids_;
  std::vector<int> big_core_ids_;
  std::vector<int> little_core_ids_;
  std::vector<int> cluster_ids_;
  std::vector<ARMArch> archs_;

  ARMArch arch_;
  // LITE_POWER_HIGH stands for using big cores,
  // LITE_POWER_LOW stands for using small core,
  // LITE_POWER_FULL stands for using all cores
  PowerMode mode_;
  std::vector<int> active_ids_;
  TensorLite workspace_;
  int64_t count_{0};

  int get_max_freq_khz(int cpuid);
  int arm_sort_cpuid_by_max_frequency(int cpu_count, std::vector<int>* cpuids,
                                      const std::vector<int>& cpu_freq,
                                      std::vector<int>* cluster_ids);
  int check_online(const std::vector<int>& core_ids);
  int set_sched_affinity(const std::vector<int>& cpuids);

  void SetCacheInfo(int cache_id, int argc, ...);
  void SetArchInfo(int argc, ...);
  bool SetCPUInfoByName();
  void SetCPUInfoByProb();
  void RequestPowerFullMode(const int thread_num);
  void RequestPowerHighMode(const int thread_num);
  void RequestPowerLowMode(const int thread_num);
  void RequestPowerNoBindMode(const int thread_num);
  void RequestPowerRandHighMode(const int shift_num, const int thread_num);
  void RequestPowerRandLowMode(const int shift_num, const int thread_num);

  DeviceInfo() = default;
};

#endif  // LITE_WITH_ARM
```
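The new DeviceInfo::Init() uses a function-local static so that Setup() runs at most once per process, even under concurrent callers (C++11 guarantees thread-safe initialization of such statics), and its status code is cached. A standalone sketch of the idiom, with a hypothetical expensive_setup() standing in for Setup():

```cpp
#include <iostream>

// Stand-in for DeviceInfo::Setup(): expensive probing that should run once.
int expensive_setup() {
  std::cout << "probing hardware...\n";
  return 0;  // status code, cached below
}

int Init() {
  // Initialized exactly once, on the first call; later calls reuse `ret`.
  static int ret = expensive_setup();
  return ret;
}

int main() {
  Init();  // prints "probing hardware..."
  Init();  // prints nothing; returns the cached status
}
```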
paddle/fluid/lite/kernels/arm/conv_compute.cc

```cpp
@@ -28,8 +28,6 @@ void ConvCompute::PrepareForRun() {
  auto o_dims = param.output->dims();

  auto& ctx = this->ctx_->template As<ARMContext>();
  // removed:
  // // TODO(xxx): make api and expose it
  // ctx.SetRunMode(LITE_POWER_HIGH, 4);

  int win = x_dims[3];  // nchw
  int hin = x_dims[2];
```
paddle/fluid/lite/kernels/arm/fc_compute.cc

```cpp
@@ -28,7 +28,6 @@ void FcCompute::PrepareForRun() {
  auto w_dims = param.w->dims();

  auto& ctx = this->ctx_->template As<ARMContext>();
  // removed: ctx.SetRunMode(LITE_POWER_HIGH, 4);

  CHECK_GE(x_dims.size(), 2UL);
  CHECK_EQ(w_dims.size(), 2UL);
```
paddle/fluid/lite/kernels/arm/mul_compute.cc

```cpp
@@ -24,7 +24,6 @@ namespace arm {
void MulCompute::PrepareForRun() {
  auto& ctx = this->ctx_->template As<ARMContext>();
  // removed: ctx.SetRunMode(LITE_POWER_HIGH, 4);
}

void MulCompute::Run() {
```
paddle/fluid/lite/kernels/arm/pool_compute.cc

```cpp
@@ -26,7 +26,6 @@ namespace arm {
void PoolCompute::PrepareForRun() {
  auto& ctx = this->ctx_->template As<ARMContext>();
  // removed: ctx.SetRunMode(LITE_POWER_HIGH, 4);
}

void PoolCompute::Run() {
```
paddle/fluid/lite/kernels/x86/CMakeLists.txt

```cmake
@@ -17,6 +17,7 @@
cc_library(concat_compute_x86 SRCS concat_compute.cc DEPS ${lite_kernel_deps})
cc_library(conv_compute_x86 SRCS conv_compute.cc DEPS ${lite_kernel_deps} blas im2col vol2col)
cc_library(pool_compute_x86 SRCS pool_compute.cc DEPS ${lite_kernel_deps} pooling)
cc_library(batch_norm_compute_x86 SRCS batch_norm_compute.cc DEPS ${lite_kernel_deps})  # new
lite_cc_test(test_fc_compute_x86 SRCS fc_compute_test.cc DEPS fc_compute_x86)
lite_cc_test(test_conv2d_compute_x86 SRCS conv_compute_test.cc DEPS conv_compute_x86)

@@ -28,6 +29,7 @@
lite_cc_test(test_mul_compute_x86 SRCS mul_compute_test.cc DEPS mul_compute_x86 operator)
lite_cc_test(test_scale_compute_x86 SRCS scale_compute_test.cc DEPS scale_compute_x86)
lite_cc_test(test_dropout_compute_x86 SRCS dropout_compute_test.cc DEPS dropout_compute_x86)
lite_cc_test(test_batch_norm_compute_x86 SRCS batch_norm_compute_test.cc DEPS batch_norm_compute_x86)  # new

@@ -44,6 +46,7 @@ set(x86_kernels
    concat_compute_x86
    conv_compute_x86
    pool_compute_x86
    batch_norm_compute_x86  # new
    )
set(x86_kernels "${x86_kernels}" CACHE INTERNAL "x86 kernels")
```
paddle/fluid/lite/kernels/x86/batch_norm_compute.cc (new file, mode 100644)

```cpp
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/lite/kernels/x86/batch_norm_compute.h"

REGISTER_LITE_KERNEL(batch_norm, kX86, kFloat, kNCHW,
                     paddle::lite::kernels::x86::BatchNormCompute<float>, def)
    .BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))})
    .BindInput("Scale", {LiteType::GetTensorTy(TARGET(kX86))})
    .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kX86))})
    .BindInput("Mean", {LiteType::GetTensorTy(TARGET(kX86))})
    .BindInput("Variance", {LiteType::GetTensorTy(TARGET(kX86))})
    .BindOutput("Y", {LiteType::GetTensorTy(TARGET(kX86))})
    .BindOutput("MeanOut", {LiteType::GetTensorTy(TARGET(kX86))})
    .BindOutput("VarianceOut", {LiteType::GetTensorTy(TARGET(kX86))})
    .BindOutput("MeanOut", {LiteType::GetTensorTy(TARGET(kX86))})  // note: MeanOut is bound twice in this revision
    .BindOutput("SavedMean", {LiteType::GetTensorTy(TARGET(kX86))})
    .BindOutput("SavedVariance", {LiteType::GetTensorTy(TARGET(kX86))})
    .Finalize();
```
paddle/fluid/lite/kernels/x86/batch_norm_compute.h (new file, mode 100644)

```cpp
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once
#include <random>
#include <string>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/lite/core/kernel.h"
#include "paddle/fluid/lite/core/op_registry.h"

namespace paddle {
namespace lite {
namespace kernels {
namespace x86 {

template <typename T>
using EigenArrayMap =
    Eigen::Map<Eigen::Array<T, Eigen::Dynamic, Eigen::Dynamic>>;
template <typename T>
using ConstEigenArrayMap =
    Eigen::Map<const Eigen::Array<T, Eigen::Dynamic, Eigen::Dynamic>>;
template <typename T>
using EigenVectorArrayMap = Eigen::Map<Eigen::Array<T, Eigen::Dynamic, 1>>;
template <typename T>
using ConstEigenVectorArrayMap =
    Eigen::Map<const Eigen::Array<T, Eigen::Dynamic, 1>>;

template <typename T>
class BatchNormCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
 public:
  using param_t = operators::BatchNormParam;

  void Run() override {
    auto &param = *param_.get_mutable<operators::BatchNormParam>();
    bool global_stats = param.is_test || param.use_global_stats;

    const auto *x = param.x;
    const auto &x_dims = x->dims();
    CHECK(x_dims.size() >= 2 && x_dims.size() <= 5);
    const int N = x_dims[0];
    const int C = param.data_layout == DATALAYOUT(kNCHW)
                      ? x_dims[1]
                      : x_dims[x_dims.size() - 1];
    const int sample_size = x->dims().production() / N / C;

    // alloc memory
    param.y->template mutable_data<T>();
    param.mean_out->template mutable_data<T>();
    param.variance_out->template mutable_data<T>();
    param.saved_mean->template mutable_data<T>();
    param.saved_variance->template mutable_data<T>();

    if (!global_stats) {
      // saved_xx is use just in this batch of data
      EigenVectorArrayMap<T> saved_mean_e(param.saved_mean->mutable_data<T>(),
                                          C);
      EigenVectorArrayMap<T> saved_variance_e(
          param.saved_variance->mutable_data<T>(), C);
      saved_mean_e.setZero();
      saved_variance_e.setZero();

      EigenVectorArrayMap<T> running_mean_arr(
          param.mean_out->mutable_data<T>(), C);
      EigenVectorArrayMap<T> running_var_arr(
          param.variance_out->mutable_data<T>(), C);

      if ((N * sample_size) == 1) {
        LOG(WARNING) << "Only 1 element in normalization dimension, "
                     << "we skip the batch norm calculation, let y = x.";
        framework::TensorCopy(x->raw_tensor(), platform::CPUPlace(),
                              &param.y->raw_tensor());
        return;
      }

      switch (param.data_layout) {
        case DATALAYOUT(kNCHW): {
          ConstEigenArrayMap<T> x_arr(x->data<T>(), sample_size, N * C);
          for (int nc = 0; nc < N * C; ++nc) {
            saved_mean_e(nc % C) += x_arr.col(nc).sum();
          }
          saved_mean_e /= N * sample_size;
          for (int nc = 0; nc < N * C; ++nc) {
            saved_variance_e(nc % C) +=
                (x_arr.col(nc) - saved_mean_e(nc % C)).matrix().squaredNorm();
          }
          saved_variance_e /= N * sample_size;
          break;
        }
        default:
          LOG(FATAL) << "Unknown storage order: "
                     << DataLayoutToStr(param.data_layout);
          break;
      }

      running_mean_arr = running_mean_arr * param.momentum +
                         saved_mean_e * (1. - param.momentum);
      running_var_arr = running_var_arr * param.momentum +
                        saved_variance_e * (1. - param.momentum);
    }

    // use SavedMean and SavedVariance to do normalize
    Eigen::Array<T, Eigen::Dynamic, 1> inv_std(C);
    if (global_stats) {
      ConstEigenVectorArrayMap<T> var_arr(param.variance->data<T>(), C);
      inv_std = (var_arr + param.epsilon).sqrt().inverse();
    } else {
      EigenVectorArrayMap<T> saved_inv_std(
          param.saved_variance->mutable_data<T>(), C);
      // inverse SavedVariance first, gradient will use it too.
      saved_inv_std = (saved_inv_std + param.epsilon).inverse().sqrt();
      inv_std = saved_inv_std;
    }
    ConstEigenVectorArrayMap<T> mean_arr(
        global_stats ? param.mean->data<T>() : param.saved_mean->data<T>(), C);

    // ((x - est_mean) * (inv_var) * scale + bias
    // formula transform ====>
    // (x * inv_var * scale) + (bias - est_mean * inv_var * scale)
    ConstEigenVectorArrayMap<T> scale_arr(param.scale->data<T>(), C);
    ConstEigenVectorArrayMap<T> bias_arr(param.bias->data<T>(), C);
    Eigen::Array<T, Eigen::Dynamic, 1> new_scale = inv_std * scale_arr;
    Eigen::Array<T, Eigen::Dynamic, 1> new_bias =
        bias_arr - mean_arr * inv_std * scale_arr;

    switch (param.data_layout) {
      case DATALAYOUT(kNCHW): {
        EigenArrayMap<T> y_arr(param.y->mutable_data<T>(), sample_size, N * C);
        ConstEigenArrayMap<T> x_arr(x->data<T>(), sample_size, N * C);
        for (int nc = 0; nc < N * C; ++nc) {
          y_arr.col(nc) = x_arr.col(nc) * new_scale(nc % C) + new_bias(nc % C);
        }
        break;
      }
      default:
        LOG(FATAL) << "Unknown storage order: "
                   << DataLayoutToStr(param.data_layout);
        break;
    }
  }

  virtual ~BatchNormCompute() = default;
};

}  // namespace x86
}  // namespace kernels
}  // namespace lite
}  // namespace paddle
```
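The kernel's "formula transform" comment is the usual batch-norm rewrite: instead of evaluating y = (x − mean) · inv_std · scale + bias per element, it precomputes per-channel new_scale = inv_std · scale and new_bias = bias − mean · new_scale, so the inner loop is a single multiply-add per element. A small self-contained check of that identity (plain C++ rather than the kernel's Eigen maps; all values are made up):

```cpp
#include <cassert>
#include <cmath>
#include <cstdio>

int main() {
  // Made-up per-channel statistics and affine parameters.
  const float x = 2.5f, mean = 1.0f, var = 4.0f;
  const float scale = 0.5f, bias = 0.1f, eps = 1e-4f;

  // Direct form: (x - mean) * inv_std * scale + bias
  const float inv_std = 1.0f / std::sqrt(var + eps);
  const float y_direct = (x - mean) * inv_std * scale + bias;

  // Transformed form: x * new_scale + new_bias
  const float new_scale = inv_std * scale;
  const float new_bias = bias - mean * new_scale;
  const float y_fused = x * new_scale + new_bias;

  assert(std::fabs(y_direct - y_fused) < 1e-6f);  // algebraically identical
  std::printf("%f == %f\n", y_direct, y_fused);
}
```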
paddle/fluid/lite/kernels/x86/batch_norm_compute_test.cc (new file, mode 100644)

```cpp
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/lite/kernels/x86/batch_norm_compute.h"
#include <gtest/gtest.h>
#include <iostream>
#include <vector>
#include "paddle/fluid/lite/core/op_registry.h"

namespace paddle {
namespace lite {
namespace kernels {
namespace x86 {

TEST(batch_norm_x86, retrive_op) {
  auto batch_norm =
      KernelRegistry::Global().Create<TARGET(kX86), PRECISION(kFloat)>(
          "batch_norm");
  ASSERT_FALSE(batch_norm.empty());
  ASSERT_TRUE(batch_norm.front());
}

TEST(batch_norm_x86, init) {
  BatchNormCompute<float> batch_norm;
  ASSERT_EQ(batch_norm.precision(), PRECISION(kFloat));
  ASSERT_EQ(batch_norm.target(), TARGET(kX86));
}

TEST(batch_norm_x86, run_test) {
  lite::Tensor x, scale, bias, mean, variance, y, mean_out, variance_out,
      saved_mean, saved_variance;
  constexpr int batch_size = 2;
  std::vector<int64_t> x_shape{batch_size, 3, 64, 64};
  x.Resize(lite::DDim(x_shape));
  std::vector<int64_t> scale_shape{3};
  scale.Resize(lite::DDim(scale_shape));
  std::vector<int64_t> bias_shape{3};
  bias.Resize(lite::DDim(bias_shape));
  std::vector<int64_t> mean_shape{3};
  mean.Resize(lite::DDim(mean_shape));
  std::vector<int64_t> variance_shape{3};
  variance.Resize(lite::DDim(variance_shape));

  std::vector<int64_t> y_shape{batch_size, 3, 64, 64};
  y.Resize(lite::DDim(y_shape));
  std::vector<int64_t> mean_out_shape{3};
  mean_out.Resize(lite::DDim(mean_out_shape));
  std::vector<int64_t> variance_out_shape{3};
  variance_out.Resize(lite::DDim(variance_out_shape));
  std::vector<int64_t> saved_mean_shape{3};
  saved_mean.Resize(lite::DDim(saved_mean_shape));
  std::vector<int64_t> saved_variance_shape{3};
  saved_variance.Resize(lite::DDim(saved_variance_shape));

  auto x_data = x.mutable_data<float>();
  auto scale_data = scale.mutable_data<float>();
  auto bias_data = bias.mutable_data<float>();
  auto mean_data = mean.mutable_data<float>();
  auto variance_data = variance.mutable_data<float>();
  y.mutable_data<float>();
  mean_out.mutable_data<float>();
  variance_out.mutable_data<float>();
  saved_mean.mutable_data<float>();
  saved_variance.mutable_data<float>();

  for (int64_t i = 0; i < x.dims().production(); i++) {
    x_data[i] = static_cast<float>(i);
  }
  for (int i = 0; i < scale.dims().production(); i++) {
    scale_data[i] = static_cast<float>(i) * 0.01f + 0.03f;
  }
  for (int i = 0; i < bias.dims().production(); i++) {
    bias_data[i] = static_cast<float>(i) * 0.065f + 0.1f;
  }
  for (int i = 0; i < mean.dims().production(); i++) {
    mean_data[i] = static_cast<float>(i) * 0.0565f;
  }
  for (int i = 0; i < variance.dims().production(); i++) {
    variance_data[i] = static_cast<float>(i) * 2.08f + 1.5f;
  }

  // BatchNormCompute batch_norm;
  BatchNormCompute<float> batch_norm;
  operators::BatchNormParam param;

  param.x = &x;
  param.is_test = false;
  param.scale = &scale;
  param.bias = &bias;
  param.mean = &mean;
  param.variance = &variance;
  param.use_global_stats = false;
  param.epsilon = 1e-4f;
  param.momentum = 0.9f;
  param.y = &y;
  param.mean_out = &mean_out;
  param.variance_out = &variance_out;
  param.saved_mean = &saved_mean;
  param.saved_variance = &saved_variance;

  batch_norm.SetParam(param);
  batch_norm.Run();

  LOG(INFO) << "output: " << y;
  LOG(INFO) << "mean_out: " << mean_out;
  LOG(INFO) << "variance_out: " << mean_out;  // note: logs mean_out again in this revision
  LOG(INFO) << "saved_mean: " << saved_mean;
  LOG(INFO) << "saved_variance: " << saved_variance;
  /*for (int i = 0; i < y.dims().production(); i++) {
    if(i < 5 || i > y.dims().production() - 5)
      LOG(INFO) << y_data[i];
  }*/
}

}  // namespace x86
}  // namespace kernels
}  // namespace lite
}  // namespace paddle

USE_LITE_KERNEL(batch_norm, kX86, kFloat, kNCHW, def);
```
paddle/fluid/lite/tools/build.sh

The monolithic build_test_arm is split into reusable pieces: a prepare_emulator helper plus three sub-tasks (android, armlinux, and a mobilenet_v2 model test) that the CI jobs above invoke individually, with matching dispatch cases added to main(). The clang variants of jobs 2 and 4 are commented out.

```bash
@@ -135,8 +135,8 @@ function test_arm_model {
  adb -s emulator-${port} push ${model_dir} ${adb_work_dir}
  adb -s emulator-${port} push ${testpath} ${adb_work_dir}
  adb -s emulator-${port} shell chmod +x "${adb_work_dir}/${test_name}"
  # old:
  # local adb_model_path="./${adb_work_dir}/`basename ${model_dir}`"
  # adb -s emulator-${port} shell "./${adb_work_dir}/${test_name} --eval_model_dir=$adb_model_path"
  local adb_model_path="${adb_work_dir}/`basename ${model_dir}`"
  adb -s emulator-${port} shell "${adb_work_dir}/${test_name} --eval_model_dir=$adb_model_path"
}

@@ -225,16 +225,11 @@ function test_arm {
  for _test in $(cat $TESTS_FILE); do
    test_arm_android $_test $port
  done
  # removed:
  # # TODO(sangoly): refine this
  # test_arm_model "test_cxx_api_lite" $port "./third_party/install/mobilenet_v2_relu"
}

# old:
# # Build the code and run lite arm tests. This is executed in the CI system.
# function build_test_arm {
#   ########################################################################
#   # job 1-4 must be in one runner
#   port_armv8=5554
#   port_armv7=5556
function prepare_emulator {
  local port_armv8=$1
  local port_armv7=$2

  adb kill-server
  adb devices | grep emulator | cut -f1 | while read line; do adb -s $line emu kill; done

@@ -245,6 +240,18 @@
  echo n | avdmanager create avd -f -n paddle-armv7 -k "system-images;android-24;google_apis;armeabi-v7a"
  echo -ne '\n' | ${ANDROID_HOME}/emulator/emulator -avd paddle-armv7 -noaudio -no-window -gpu off -verbose -port ${port_armv7} &
  sleep 1m
}

# We split the arm unittest into several sub-tasks to parallel and reduce the overall CI time.
# sub-task1
function build_test_arm_subtask_android {
  ########################################################################
  # job 1-4 must be in one runner
  port_armv8=5554
  port_armv7=5556

  prepare_emulator $port_armv8 $port_armv7

  # job 1
  build_arm "android" "armv8" "gcc"

@@ -252,9 +259,9 @@
  cd -

  # job 2 (now disabled)
  #build_arm "android" "armv8" "clang"
  #test_arm "android" "armv8" "clang" ${port_armv8}
  #cd -

  # job 3
  build_arm "android" "armv7" "gcc"

@@ -262,13 +269,22 @@
  cd -

  # job 4 (now disabled)
  #build_arm "android" "armv7" "clang"
  #test_arm "android" "armv7" "clang" ${port_armv7}
  #cd -

  adb devices | grep emulator | cut -f1 | while read line; do adb -s $line emu kill; done
  echo "Done"
}

# sub-task2
function build_test_arm_subtask_armlinux {
  ########################################################################
  # job 1-4 must be in one runner
  port_armv8=5554
  port_armv7=5556

  prepare_emulator $port_armv8 $port_armv7

  # job 5
  build_arm "armlinux" "armv8"

@@ -285,9 +301,47 @@
  test_arm "armlinux" "armv7hf"
  cd -

  adb devices | grep emulator | cut -f1 | while read line; do adb -s $line emu kill; done
  echo "Done"
}

# sub-task3
function build_test_arm_subtask3_mobilenet_v2 {
  local port_armv8=5554
  local port_armv7=5556
  # We just test following single one environment to limit the CI time.
  local os=android
  local abi=armv8
  local lang=gcc

  cur_dir=$(pwd)
  build_dir=$cur_dir/build.lite.${os}.${abi}.${lang}
  mkdir -p $build_dir
  cd $build_dir
  cmake_arm $os $abi $lang
  make test_cxx_api_lite -j$NUM_CORES_FOR_COMPILE

  prepare_emulator $port_armv8 $port_armv7

  # just test the model on armv8
  test_arm_model "test_cxx_api_lite" $port_armv8 "./third_party/install/mobilenet_v2_relu"

  adb devices | grep emulator | cut -f1 | while read line; do adb -s $line emu kill; done
  echo "Done"
}

# Build the code and run lite arm tests. This is executed in the CI system.
function build_test_arm {
  ########################################################################
  # job 1-4 must be in one runner
  port_armv8=5554
  port_armv7=5556

  build_test_arm_subtask_android
  build_test_arm_subtask_armlinux
}

############################# MAIN #################################
function print_usage {
  echo -e "\nUSAGE:"

@@ -379,6 +433,18 @@ function main {
        build_test_arm
        shift
        ;;
      build_test_arm_subtask_android)
        build_test_arm_subtask_android
        shift
        ;;
      build_test_arm_subtask_armlinux)
        build_test_arm_subtask_armlinux
        shift
        ;;
      build_test_arm_model1)
        build_test_arm_subtask3_mobilenet_v2
        shift
        ;;
      check_style)
        check_style
        shift

@@ -397,4 +463,3 @@
}

main $@
```