Commit 58bf3c48 (PaddlePaddle / Paddle)
Authored on June 21, 2019 by nhzlx

    Merge branch 'incubate/lite' of http://10.87.145.36/inference/paddlelite into xzl/incubate/lite

Parents: 7e2ecbd6, 758db8df
Showing 15 changed files with 1,314 additions and 869 deletions.
.gitlab-ci.yml                                              +76   -28
paddle/fluid/lite/api/cxx_api_bin.cc                         +4    -3
paddle/fluid/lite/core/context.cc                            +1  -316
paddle/fluid/lite/core/context.h                            +19   -25
paddle/fluid/lite/core/cpu_info.cc                         +747  -417
paddle/fluid/lite/core/cpu_info.h                           +55   -58
paddle/fluid/lite/kernels/arm/conv_compute.cc                +0    -2
paddle/fluid/lite/kernels/arm/fc_compute.cc                  +0    -1
paddle/fluid/lite/kernels/arm/mul_compute.cc                 +0    -1
paddle/fluid/lite/kernels/arm/pool_compute.cc                +0    -1
paddle/fluid/lite/kernels/x86/CMakeLists.txt                 +3    -0
paddle/fluid/lite/kernels/x86/batch_norm_compute.cc         +30    -0
paddle/fluid/lite/kernels/x86/batch_norm_compute.h         +158    -0
paddle/fluid/lite/kernels/x86/batch_norm_compute_test.cc   +139    -0
paddle/fluid/lite/tools/build.sh                            +82   -17
.gitlab-ci.yml @ 58bf3c48

@@ -2,6 +2,20 @@ before_script:
   - env
   - export CI_USER_DIR=$(pwd)
+  # prepare ccache
+  - apt install ccache
+  # for proxy
+  - export http_proxy=$CI_PROXY
+  - export https_proxy=$CI_PROXY
+  # merge the latest code
+  - git config --global user.email "you@example.com"
+  - git config --global user.name "Your Name"
+  - git fetch origin incubate/lite
+  - git merge --no-ff origin/incubate/lite

 image: $SERVER_LITE_DOCKER_IMAGE

 stages:
@@ -14,19 +28,13 @@ check:prebuilt:
     - lite
   stage: ci
   script:
+    # prepare for pre-commit
     - rm -rf ~/.pip
-    - export http_proxy=$CI_PROXY
-    - export https_proxy=$CI_PROXY
     - pip install pre-commit
     - pre-commit install
-    # merge the latest code
-    - git config --global user.email "you@example.com"
-    - git config --global user.name "Your Name"
-    - git fetch origin incubate/lite
-    - git merge --no-ff origin/incubate/lite
     - ./paddle/fluid/lite/tools/build.sh check_style
   cache:
     key: check_style
     paths:
@@ -42,17 +50,11 @@ build:server:
     paths:
       - build/third_party
      - ~/.ccache
+      - $CI_PROJECT_DIR/_build_server_ccache
   script:
-    - apt install ccache
-    - export http_proxy=$CI_PROXY
-    - export https_proxy=$CI_PROXY
-    # merge the latest code
-    - git config --global user.email "you@example.com"
-    - git config --global user.name "Your Name"
-    - git fetch origin incubate/lite
-    - git merge --no-ff origin/incubate/lite
+    # customize ccache path for specifying runner cache
+    - export CCACHE_DIR=$CI_PROJECT_DIR/_build_server_ccache
+    # run build and test
     - mkdir -p build
     - cd build
     - ../paddle/fluid/lite/tools/build.sh cmake_x86
@@ -66,7 +68,7 @@ build:server:
   dependencies:
     - check:prebuilt

-build:mobile:
+build:mobile_android:
   tags:
     - lite
   stage: build_mobile
@@ -77,17 +79,63 @@ build:mobile:
       - $MOBILE_LITE_CACHE0
      - $MOBILE_LITE_CACHE1
      - ~/.ccache
+      - $CI_PROJECT_DIR/build_mobile_ccache
   script:
-    - apt install ccache
-    - export http_proxy=$CI_PROXY
-    - export https_proxy=$CI_PROXY
-    # merge the latest code
-    - git config --global user.email "you@example.com"
-    - git config --global user.name "Your Name"
-    - git fetch origin incubate/lite
-    - git merge --no-ff origin/incubate/lite
-    - ./paddle/fluid/lite/tools/build.sh build_test_arm
+    - export CCACHE_DIR=$CI_PROJECT_DIR/build_mobile_ccache
+    - ./paddle/fluid/lite/tools/build.sh build_test_arm_subtask_android
   dependencies:
     - build:server
+
+build:mobile_armlinux:
+  tags:
+    - lite
+  stage: build_mobile
+  image: $MOBILE_LITE_DOCKER_IMAGE
+  cache:
+    key: mobile_thirdparty
+    paths:
+      - $MOBILE_LITE_CACHE0
+      - $MOBILE_LITE_CACHE1
+      - ~/.ccache
+      - $CI_PROJECT_DIR/build_mobile_ccache2
+  script:
+    - export CCACHE_DIR=$CI_PROJECT_DIR/build_mobile_ccache2
+    - ./paddle/fluid/lite/tools/build.sh build_test_arm_subtask_armlinux
+  dependencies:
+    - build:server
+
+build:mobile_model_mobilenetv2:
+  tags:
+    - lite
+  stage: build_mobile
+  image: $MOBILE_LITE_DOCKER_IMAGE
+  cache:
+    key: mobile_thirdparty
+    paths:
+      - $MOBILE_LITE_CACHE0
+      - $MOBILE_LITE_CACHE1
+      - ~/.ccache
+      - $CI_PROJECT_DIR/build_mobile_model1
+  script:
+    - export CCACHE_DIR=$CI_PROJECT_DIR/build_mobile_model1
+    - ./paddle/fluid/lite/tools/build.sh build_test_arm_model1
+  dependencies:
+    - build:server
paddle/fluid/lite/api/cxx_api_bin.cc @ 58bf3c48

@@ -29,9 +29,10 @@ double time_diff(Time t1, Time t2) {
   return counter.count() / 1000.0;
 }

-void Run(const char* model_dir, int repeat) {
+void Run(const char* model_dir, int repeat, int thread_num) {
 #ifdef LITE_WITH_ARM
   DeviceInfo::Init();
+  DeviceInfo::Global().SetRunMode(LITE_POWER_HIGH, thread_num);
 #endif
   lite::ExecutorLite predictor;
   std::vector<Place> valid_places({Place{TARGET(kHost), PRECISION(kFloat)},
@@ -67,8 +68,8 @@ void Run(const char* model_dir, int repeat) {
 }  // namespace paddle

 int main(int argc, char** argv) {
-  CHECK_EQ(argc, 3) << "usage: ./cmd <model_dir> <repeat>";
-  paddle::lite::Run(argv[1], std::stoi(argv[2]));
+  CHECK_EQ(argc, 4) << "usage: ./cmd <model_dir> <repeat> <thread_num>";
+  paddle::lite::Run(argv[1], std::stoi(argv[2]), std::stoi(argv[3]));
   return 0;
 }
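The new thread_num argument is what drives the reworked DeviceInfo API: the binary now queries the CPU topology once and then binds its worker threads to the big-core cluster before building the predictor. The sketch below mirrors that setup block for reuse elsewhere; it assumes DeviceInfo and the LITE_POWER_* enum are declared in paddle/fluid/lite/core/cpu_info.h (the header this commit reworks) and that the target is built with LITE_WITH_ARM, so treat it as an illustration rather than a verbatim excerpt.

#include "paddle/fluid/lite/core/cpu_info.h"

namespace paddle {
namespace lite {

// Mirror of the setup added to Run() in cxx_api_bin.cc:
// detect cores/caches once, then bind `thread_num` workers to the big cores.
void SetupArmRuntime(int thread_num) {
#ifdef LITE_WITH_ARM
  DeviceInfo::Init();
  DeviceInfo::Global().SetRunMode(LITE_POWER_HIGH, thread_num);
#endif
}

}  // namespace lite
}  // namespace paddle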
paddle/fluid/lite/core/context.cc @ 58bf3c48

@@ -13,322 +13,7 @@
 // limitations under the License.

 #include "paddle/fluid/lite/core/context.h"
-#include "paddle/fluid/lite/core/cpu_info.h"
-
-#ifdef LITE_WITH_LINUX
-#include <sys/syscall.h>
-#include <unistd.h>
-#endif
-#if __APPLE__
-#include "TargetConditionals.h"
-#if TARGET_OS_IPHONE
-#include <mach/machine.h>
-#include <sys/sysctl.h>
-#include <sys/types.h>
-#endif  // TARGET_OS_IPHONE
-#endif  // __APPLE__
-#ifdef ARM_WITH_OMP
-#include <omp.h>
-#endif

 namespace paddle {
-namespace lite {
-
-#ifdef LITE_WITH_ARM
-
-void Context<TargetType::kARM>::SetCache(int l1size, int l2size, int l3size) {
-  DeviceInfo& dev = DeviceInfo::Global();
-  int cpu_count = arm_get_cpucount();
-  dev.L1_cache_.resize(cpu_count);
-  dev.L2_cache_.resize(cpu_count);
-  dev.L3_cache_.resize(cpu_count);
-  for (int i = 0; i < cpu_count; ++i) {
-    dev.L1_cache_[i] = l1size;
-    dev.L2_cache_[i] = l2size;
-    dev.L3_cache_[i] = l3size;
-  }
-  workspace_.Resize({2 * (l1size + l2size)});
-}
-
-Context<TargetType::kARM>::Context() {
-  active_ids_ = {0};
-  mode_ = LITE_POWER_HIGH;
-  DeviceInfo& dev = DeviceInfo::Global();
-  workspace_.Resize(
-      {static_cast<int64_t>(dev.L2_cache_[active_ids_[0]] / sizeof(float))});
-#ifdef TARGET_IOS
-  arch_ = APPLE;  // use 6x8
-#else
-  if (dev.big_core_ids_.size() > 0) {
-    arch_ = dev.archs_[dev.big_core_ids_[0]];
-  }
-#endif
-}
-
-PowerMode Context<TargetType::kARM>::mode() const { return mode_; }
-
-int Context<TargetType::kARM>::threads() const { return active_ids_.size(); }
-
-Context<TargetType::kARM>::Context(const ARMContext& ctx) {
-  mode_ = ctx.mode_;
-  active_ids_ = ctx.active_ids_;
-  workspace_ = ctx.workspace_;
-  arch_ = ctx.arch_;
-  count_ = ctx.count_;
-}
-
-ARMContext& Context<TargetType::kARM>::operator=(const ARMContext& ctx) {
-  mode_ = ctx.mode_;
-  active_ids_ = ctx.active_ids_;
-  workspace_ = ctx.workspace_;
-  arch_ = ctx.arch_;
-  count_ = ctx.count_;
-  return *this;
-}
-
-void Context<TargetType::kARM>::BindDev() {
-#ifdef ARM_WITH_OMP
-  int num_threads = active_ids_.size();
-  omp_set_num_threads(num_threads);
-#ifdef LITE_WITH_LINUX
-  std::vector<int> ssarets;
-  for (int j = 0; j < num_threads; ++j) {
-    ssarets.push_back(0);
-  }
-#pragma omp parallel for
-  for (int i = 0; i < num_threads; i++) {
-    ssarets[i] = set_sched_affinity(active_ids_);
-  }
-  for (int i = 0; i < num_threads; i++) {
-    if (ssarets[i] != 0) {
-      LOG(ERROR) << "set cpu affinity failed, cpuID: " << active_ids_[i];
-      return;
-    }
-  }
-#endif  // LITE_WITH_LINUX
-#else   // ARM_WITH_OMP
-#ifdef LITE_WITH_LINUX
-  std::vector<int> cpuid1;
-  cpuid1.push_back(active_ids_[0]);
-  int ssaret = set_sched_affinity(cpuid1);
-  if (ssaret != 0) {
-    printf("set cpu affinity failed, cpuID: %d\n", active_ids_[0]);
-    return;
-  }
-#endif  // LITE_WITH_LINUX
-#endif  // ARM_WITH_OMP
-}
-
-void Context<TargetType::kARM>::SetRunMode(PowerMode mode, int threads) {
-  DeviceInfo& dev = DeviceInfo::Global();
-  int big_core_size = dev.big_core_ids_.size();
-  int small_core_size = dev.little_core_ids_.size();
-  if (threads > big_core_size + small_core_size) {
-    threads = big_core_size + small_core_size;
-  }
-#ifdef ARM_WITH_OMP
-  count_++;
-  int shift_num = (count_ / 10) % big_core_size;
-  switch (mode) {
-    case LITE_POWER_FULL:
-      mode_ = mode;
-      active_ids_.clear();
-      for (int i = 0; i < threads; ++i) {
-        if (i < big_core_size) {
-          active_ids_.push_back(dev.big_core_ids_[i]);
-        } else {
-          active_ids_.push_back(dev.little_core_ids_[i - big_core_size]);
-        }
-      }
-      if (active_ids_.size() == 0) {
-        active_ids_.push_back(0);
-      }
-      break;
-    case LITE_POWER_HIGH:
-      active_ids_.clear();
-      if (big_core_size > 0) {
-        mode_ = LITE_POWER_HIGH;
-        if (threads > big_core_size) {
-          LOG(ERROR) << "threads: " << threads
-                     << ", exceed the big cores size: " << big_core_size;
-          active_ids_ = dev.big_core_ids_;
-        } else {
-          for (int i = 0; i < threads; ++i) {
-            active_ids_.push_back(dev.big_core_ids_[i]);
-          }
-        }
-      } else {
-        mode_ = LITE_POWER_LOW;
-        LOG(ERROR) << "HIGH POWER MODE is not support, switch to little cores";
-        if (threads > small_core_size) {
-          active_ids_ = dev.little_core_ids_;
-        } else {
-          for (int i = 0; i < threads; ++i) {
-            active_ids_.push_back(dev.little_core_ids_[i]);
-          }
-        }
-      }
-      if (active_ids_.size() == 0) {
-        active_ids_.push_back(0);
-      }
-      break;
-    case LITE_POWER_NO_BIND:
-      mode_ = LITE_POWER_NO_BIND;
-      active_ids_.clear();
-      if (threads > dev.core_ids_.size()) {
-        active_ids_.resize(dev.core_ids_.size());
-      } else {
-        active_ids_.resize(threads);
-      }
-      break;
-    /* ... the LITE_POWER_LOW, LITE_POWER_RAND_HIGH and LITE_POWER_RAND_LOW
-       cases follow the same structure as LITE_POWER_HIGH: LOW prefers the
-       little cores (WARNING log, fall back to big cores), and the RAND_*
-       variants pick cores starting from (i + shift_num) % cluster size ... */
-  }
-  //! fix multi-threads LITE_POWER_HIGH mode
-  if (mode_ == LITE_POWER_NO_BIND || threads > 1) {
-    int threads = active_ids_.size();
-    omp_set_num_threads(threads);
-  } else {
-    if (check_online(active_ids_)) {
-      BindDev();
-    } else {
-      LOG(ERROR) << "core id " << active_ids_[0]
-                 << " is offline, switch to NO BIND MODE";
-      int threads = active_ids_.size();
-      omp_set_num_threads(threads);
-    }
-  }
-#else
-  if (big_core_size > 0) {
-    active_ids_ = {dev.big_core_ids_[0]};
-  } else {
-    active_ids_ = {0};
-  }
-#endif
-  //! alloc memory for sgemm in this context
-  int temp_mem_size =
-      DeviceInfo::Global().L2_cache_[active_ids_[0]] / sizeof(float);
-  workspace_.Resize({temp_mem_size});
-  arch_ = DeviceInfo::Global().archs_[active_ids_[0]];
-}
-
-ARMArch Context<TargetType::kARM>::arch() const { return arch_; }
-
-void Context<TargetType::kARM>::SetArch(ARMArch arch) { arch_ = arch; }
-
-int Context<TargetType::kARM>::l1_cache_size() const {
-  DeviceInfo& dev = DeviceInfo::Global();
-  return dev.L1_cache_[active_ids_[0]];
-}
-
-int Context<TargetType::kARM>::l2_cache_size() const {
-  DeviceInfo& dev = DeviceInfo::Global();
-  return dev.L2_cache_[active_ids_[0]];
-}
-
-int Context<TargetType::kARM>::l3_cache_size() const {
-  DeviceInfo& dev = DeviceInfo::Global();
-  return dev.L3_cache_[active_ids_[0]];
-}
-
-bool Context<TargetType::kARM>::ExtendWorkspace(DDimLite dims) {
-  auto count = dims.product();
-  auto old = workspace_.dims();
-  if (count == old.product()) {
-    return false;
-  }
-  workspace_.Resize(
-      {static_cast<int64_t>(count + l2_cache_size() / sizeof(float))});
-  return true;
-}
-
-#endif  // LITE_WITH_ARM
-
-}  // namespace lite
+namespace lite {}  // namespace lite
 }  // namespace paddle
paddle/fluid/lite/core/context.h @ 58bf3c48

@@ -61,47 +61,41 @@ class Context<TargetType::kHost> {
 template <>
 class Context<TargetType::kARM> {
  public:
-  Context();
-  Context(PowerMode mode, int threads);
+  Context() {}
   explicit Context(const ARMContext& ctx);
-  ARMContext& operator=(const ARMContext& ctx);
+  ARMContext& operator=(const ARMContext& ctx) {}

   // NOTE: InitOnce should only be used by ContextScheduler
   void InitOnce() { DeviceInfo::Init(); }
   void CopyShared(const ARMContext* ctx) {}

-  void SetRunMode(PowerMode mode, int threads);
-  void SetCache(int l1size, int l2size, int l3size);
-  void SetArch(ARMArch arch);
-  void BindDev();
+  void SetRunMode(PowerMode mode, int threads) {
+    return DeviceInfo::Global().SetRunMode(mode, threads);
+  }
+  void SetCache(int l1size, int l2size, int l3size) {
+    return DeviceInfo::Global().SetCache(l1size, l2size, l3size);
+  }
+  void SetArch(ARMArch arch) { return DeviceInfo::Global().SetArch(arch); }

-  PowerMode mode() const;
-  int threads() const;
-  ARMArch arch() const;
+  PowerMode mode() const { return DeviceInfo::Global().mode(); }
+  int threads() const { return DeviceInfo::Global().threads(); }
+  ARMArch arch() const { return DeviceInfo::Global().arch(); }
+  int l1_cache_size() const { return DeviceInfo::Global().l1_cache_size(); }
+  int l2_cache_size() const { return DeviceInfo::Global().l2_cache_size(); }
+  int l3_cache_size() const { return DeviceInfo::Global().l3_cache_size(); }

   template <typename T>
   T* workspace_data() {
-    return workspace_.mutable_data<T>();
+    return DeviceInfo::Global().workspace_data<T>();
   }

-  int l1_cache_size() const;
-  int l2_cache_size() const;
-  int l3_cache_size() const;
-  bool ExtendWorkspace(DDimLite dims);
+  bool ExtendWorkspace(DDimLite dims) {
+    return DeviceInfo::Global().ExtendWorkspace(dims);
+  }

   std::string name() const { return "ARMContext"; }
-
- private:
-  // LITE_POWER_HIGH stands for using big cores,
-  // LITE_POWER_LOW stands for using small core,
-  // LITE_POWER_FULL stands for using all cores
-  ARMArch arch_;
-  PowerMode mode_;
-  std::vector<int> active_ids_;
-  TensorLite workspace_;
-  int64_t count_{0};
 };
 #endif
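After this hunk, Context<TargetType::kARM> keeps no per-context state: every setter and query forwards to the process-wide DeviceInfo singleton, which is why the copy assignment above could shrink to an empty body. The stand-alone sketch below illustrates the pattern; the MyDeviceInfo and MyArmContext names are invented for illustration and are not part of the Paddle code.

#include <string>

// Stand-in singleton, analogous to paddle::lite::DeviceInfo.
class MyDeviceInfo {
 public:
  static MyDeviceInfo& Global() {
    static MyDeviceInfo d;  // one shared instance per process
    return d;
  }
  void SetRunMode(int mode, int threads) {
    mode_ = mode;
    threads_ = threads;
  }
  int threads() const { return threads_; }

 private:
  int mode_{0};
  int threads_{1};
};

// Thin facade, analogous to the new Context<TargetType::kARM>.
class MyArmContext {
 public:
  void SetRunMode(int mode, int threads) {
    MyDeviceInfo::Global().SetRunMode(mode, threads);
  }
  int threads() const { return MyDeviceInfo::Global().threads(); }
  std::string name() const { return "ARMContext"; }
};

Because all contexts share the singleton, copying or scheduling contexts no longer duplicates core lists or workspaces; the actual thread-binding policy lives entirely in DeviceInfo (see cpu_info.cc below).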
paddle/fluid/lite/core/cpu_info.cc @ 58bf3c48

@@ -12,312 +12,81 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

 #include "paddle/fluid/lite/core/cpu_info.h"
+#include <algorithm>
+#include <limits>
+#include <cstdarg>
-#ifdef LITE_WITH_LINUX
-#include <sys/syscall.h>
-#include <unistd.h>
-#endif
-#if __APPLE__
-#include "TargetConditionals.h"
-#if TARGET_OS_IPHONE
-#include <mach/machine.h>
-#include <sys/sysctl.h>
-#include <sys/types.h>
-#endif  // TARGET_OS_IPHONE
-#endif  // __APPLE__
-#ifdef ARM_WITH_OMP
-#include <omp.h>
-#endif

 namespace paddle {
 namespace lite {

 #ifdef LITE_WITH_ARM

-void DeviceInfo::InitInternal(DeviceInfo* dev) {
-  set_default_cache(dev);
-  dev->compute_core_num_ = arm_get_cpucount();
-  dev->max_memory_ = arm_get_meminfo();
-#ifdef LITE_WITH_LINUX
-  // get max freq
-  std::vector<int> max_freq(dev->compute_core_num_);
-  for (int i = 0; i < dev->compute_core_num_; ++i) {
-    max_freq[i] = get_max_freq_khz(i) / 1000;
-  }
-  std::string cpu_name = arm_get_cpu_name();
-  if (get_cpu_info_from_name(dev, cpu_name) != true) {
-    arm_sort_cpuid_by_max_frequency(dev->compute_core_num_, &dev->core_ids_,
-                                    max_freq, &dev->cluster_ids_);
-    dev->big_core_ids_.clear();
-    dev->little_core_ids_.clear();
-    for (int i = 0; i < dev->cluster_ids_.size(); ++i) {
-      if (dev->cluster_ids_[i] == 0) {
-        dev->big_core_ids_.push_back(dev->core_ids_[i]);
-      } else {
-        dev->little_core_ids_.push_back(dev->core_ids_[i]);
-      }
-    }
-    arm_get_cpu_arch(&dev->archs_);
-  }
-  /* ... LOG(INFO)/VLOG(1) loops printing multiprocessor number, per-core ID,
-     frequence, cluster ID, CPU ARCH, L1/L2 cache sizes and total memory,
-     then dev->max_freq_ = max over max_freq ... */
-#elif defined(TARGET_IOS)
-  arm_get_cpu_arch(&dev->archs_);
-#endif
-}
-
-// cache_id : 0 -> L1, 1 -> L2, 2 -> L3
-void set_cache_info(DeviceInfo* cpu_info, int cache_id, int argc, ...) {
-  /* ... filled cpu_info->L1/L2/L3_cache_ per core from a va_list; the same
-     logic now lives in DeviceInfo::SetCacheInfo() below ... */
-}
-
-void set_arch_info(DeviceInfo* cpu_info, int argc, ...) {
-  /* ... filled cpu_info->archs_ per core from a va_list; now
-     DeviceInfo::SetArchInfo() below ... */
-}
-
-bool get_cpu_info_from_name(DeviceInfo* cpu_info, std::string hardware_name) {
-  /* ... per-SoC table (Snapdragon SDM845/SDM710/MSM8998/MSM8996/SDM660/SDM636/
-     MSM8976/MSM8953/MSM8939, MediaTek MT6797/MT6799/MT6795/MT6762/MT6755T/
-     MT6755S/MT6753/MT6752/MT6750/MT6758/MT6757/MT6763/MT6755M/MT6755/MT6771/
-     MT6765/MT6739/MT6738/MT6737) filling compute_core_num_, core_ids_,
-     big/little core ids, cluster_ids_, arch and cache sizes; reworked below
-     as DeviceInfo::SetCPUInfoByName(), which also adds an SM8150 entry ... */
-  return false;
-}
+#ifdef TARGET_IOS
+const int DEFAULT_L1_CACHE_SIZE = 64 * 1024;
+const int DEFAULT_L2_CACHE_SIZE = 2048 * 1024;
+const int DEFAULT_L3_CACHE_SIZE = 0;
+#else
+const int DEFAULT_L1_CACHE_SIZE = 32 * 1024;
+const int DEFAULT_L2_CACHE_SIZE = 512 * 1024;
+const int DEFAULT_L3_CACHE_SIZE = 0;
+#endif
+
+int get_cpu_num() {
+#ifdef LITE_WITH_LINUX
+  // get cpu count from /sys/devices/system/cpu/cpunum/uevent
+  int max_cpu_num = 20;
+  int cpu_num = 0;
+  for (int i = 0; i < max_cpu_num; ++i) {
+    char path[256];
+    snprintf(path, sizeof(path), "/sys/devices/system/cpu/cpu%d/uevent", i);
+    FILE* fp = fopen(path, "rb");
+    if (!fp) {
+      break;
+    }
+    cpu_num++;
+    fclose(fp);
+  }
+  if (cpu_num < 1) {
+    cpu_num = 1;
+  }
+  return cpu_num;
+#elif defined(TARGET_IOS)
+  int cpu_num = 0;
+  size_t len = sizeof(cpu_num);
+  sysctlbyname("hw.ncpu", &cpu_num, &len, NULL, 0);
+  if (cpu_num < 1) {
+    cpu_num = 1;
+  }
+  return cpu_num;
+#else
+  return 1;
+#endif
+}

-size_t arm_get_meminfo() {
+size_t get_mem_size() {
 #ifdef LITE_WITH_LINUX
   // get cpu count from /proc/cpuinfo
   FILE* fp = fopen("/proc/meminfo", "rb");
   if (!fp) {
     return 1;
   }
   size_t memsize = 0;
   char line[1024];
   while (!feof(fp)) {
@@ -327,52 +96,18 @@ size_t arm_get_meminfo() {
   }
     sscanf(s, "MemTotal: %d kB", &memsize);
   }
   fclose(fp);
   return memsize;
 #elif defined(TARGET_IOS)
   // to be implemented
   printf("not implemented\n");
-  return 0;
 #endif
+  return 0;
 }

-int arm_get_cpucount() {
-  /* ... counted /sys/devices/system/cpu/cpu%d/uevent entries on Linux and
-     used sysctlbyname("hw.ncpu") on iOS; replaced by get_cpu_num() above ... */
-}
-
-void arm_get_cpu_arch(std::vector<ARMArch>* archs) {
+void get_cpu_arch(std::vector<ARMArch>* archs, const int cpu_num) {
   archs->clear();
 #ifdef LITE_WITH_LINUX
   //! get CPU ARCH
   FILE* fp = fopen("/proc/cpuinfo", "rb");
   if (!fp) {
@@ -406,6 +141,29 @@ void arm_get_cpu_arch(std::vector<ARMArch>* archs) {
       case 0xd0a:
         archs->push_back(kA75);
         break;
+      case 0xd40:
+        archs->push_back(kA76);
+        break;
+      case 0x804:
+        // 855
+        archs->push_back(kA76);
+        break;
+      case 0x805:
+        // 855
+        archs->push_back(kA55);
+        break;
+      case 0x802:
+        // 845
+        archs->push_back(kA75);
+        break;
+      case 0x803:
+        // 845
+        archs->push_back(kA55);
+        break;
+      case 0x801:
+        // 835
+        archs->push_back(kA73);
+        break;
       case 0x800:
         // 835
         archs->push_back(kA73);
@@ -415,49 +173,31 @@ void arm_get_cpu_arch(std::vector<ARMArch>* archs) {
         archs->push_back(kA72);
         break;
       default:
-        LOG(ERROR) << "unknow type";
+        LOG(ERROR) << "Unknow cpu arch: " << arch_id;
         archs->push_back(kARMArch_UNKOWN);
     }
   }
   fclose(fp);
-  int cpu_count = arm_get_cpucount();
-  if (archs->size() < cpu_count) {
-    for (int i = archs->size(); i < cpu_count; ++i) {
+  if (archs->size() < cpu_num) {
+    for (int i = archs->size(); i < cpu_num; ++i) {
       archs->push_back(archs->at(i - 1));
     }
   }
-#endif
-#ifdef TARGET_IOS
-  int cpu_count = arm_get_cpucount();
-  for (int i = 0; i < cpu_count; ++i) {
+#elif defined(TARGET_IOS)
+  for (int i = 0; i < cpu_num; ++i) {
     archs->push_back(APPLE);
   }
+#else
+  for (int i = 0; i < cpu_num; ++i) {
+    archs->push_back(kARMArch_UNKOWN);
+  }
 #endif
 }

 #ifdef LITE_WITH_LINUX

-void set_default_cache(DeviceInfo* dev) {
-  /* ... resized L1/L2/L3_cache_ and filled 64KB/2MB/0 on iOS or 32KB/512KB/0
-     elsewhere; superseded by the DEFAULT_*_CACHE_SIZE constants above ... */
-}
-
-std::string arm_get_cpu_name() {
+std::string get_cpu_name() {
   FILE* fp = fopen("/proc/cpuinfo", "rb");
   if (!fp) {
     return "";
@@ -477,122 +217,163 @@ std::string arm_get_cpu_name() {
   return "";
 }

-int get_max_freq_khz(int cpuid) {
-  /* ... returned only the max frequency of one core; replaced by
-     get_cpu_max_min_freq() below ... */
-}
+void get_cpu_max_min_freq(int cpu_id, int* max_freq, int* min_freq) {
+  *max_freq = 0;
+  *min_freq = 0;
+  // first try, for all possible cpu
+  char path[256];
+  snprintf(path, sizeof(path),
+           "/sys/devices/system/cpu/cpufreq/stats/cpu%d/time_in_state", cpu_id);
+  FILE* fp = fopen(path, "rb");
+  if (!fp) {
+    // second try, for online cpu
+    snprintf(path, sizeof(path),
+             "/sys/devices/system/cpu/cpu%d/cpufreq/stats/time_in_state",
+             cpu_id);
+    fp = fopen(path, "rb");
+    if (!fp) {
+      // third try, for online cpu
+      // get max_freq
+      snprintf(path, sizeof(path),
+               "/sys/devices/system/cpu/cpu%d/cpufreq/cpuinfo_max_freq",
+               cpu_id);
+      fp = fopen(path, "rb");
+      if (!fp) {
+        return;
+      }
+      fscanf(fp, "%d", max_freq);
+      fclose(fp);
+      // get min_freq
+      snprintf(path, sizeof(path),
+               "/sys/devices/system/cpu/cpu%d/cpufreq/cpuinfo_min_freq",
+               cpu_id);
+      fp = fopen(path, "rb");
+      if (!fp) {
+        return;
+      }
+      fscanf(fp, "%d", min_freq);
+      fclose(fp);
+      return;
+    }
+  }
+  *min_freq = std::numeric_limits<int>::max();
+  while (!feof(fp)) {
+    int freq = 0;
+    int nscan = fscanf(fp, "%d %*d", &freq);
+    if (nscan != 1) {
+      break;
+    }
+    if (freq > *max_freq) {
+      *max_freq = freq;
+    }
+    if (freq < *min_freq) {
+      *min_freq = freq;
+    }
+  }
+  fclose(fp);
+}

-int arm_sort_cpuid_by_max_frequency(int cpu_count, std::vector<int>* cpuids,
-                                    const std::vector<int>& cpu_freq,
-                                    std::vector<int>* cluster_ids) {
+void sort_cpuid_by_max_freq(const std::vector<int>& max_freqs,
+                            std::vector<int>* cpu_ids,
+                            std::vector<int>* cluster_ids) {
+  int cpu_num = max_freqs.size();
+  if (cpu_num == 0) {
+    return;
+  }
   /* ... body unchanged apart from the renames: resize cpu_ids/cluster_ids,
      bubble-sort core ids so the highest max frequencies come first, then
      assign cluster id 0 (big) or 1 (little) by comparing each core against
      the mid max frequency; the old function returned int 0, the new one
      returns void ... */
 }

+void get_cpu_cache_size(int cpu_id, int* l1_cache_size, int* l2_cache_size,
+                        int* l3_cache_size) {
+  int max_cache_idx_num = 10;
+  *l1_cache_size = DEFAULT_L1_CACHE_SIZE;
+  *l2_cache_size = DEFAULT_L2_CACHE_SIZE;
+  *l3_cache_size = DEFAULT_L3_CACHE_SIZE;
+  for (int i = 0; i < max_cache_idx_num; i++) {
+    char path[256];
+    snprintf(path, sizeof(path),
+             "/sys/devices/system/cpu/cpu%d/cache/index%d/level", cpu_id, i);
+    FILE* fp = fopen(path, "rb");
+    if (fp) {
+      int level = -1;
+      fscanf(fp, "%d", &level);
+      fclose(fp);
+      snprintf(path, sizeof(path),
+               "/sys/devices/system/cpu/cpu%d/cache/index%d/size", cpu_id, i);
+      fp = fopen(path, "rb");
+      if (fp) {
+        int size = -1;
+        fscanf(fp, "%d", &size);
+        fclose(fp);
+        if (size >= 0) {
+          if (level == 1) {
+            *l1_cache_size = size * 1024;
+          } else if (level == 2) {
+            *l2_cache_size = size * 1024;
+          } else if (level == 3) {
+            *l3_cache_size = size * 1024;
+          }
+        }
+      }
+    }
+  }
+}

-int check_online(const std::vector<int>& core_ids) {
+bool check_cpu_online(const std::vector<int>& cpu_ids) {
+  if (cpu_ids.size() == 0) {
+    return false;
+  }
+  char path[256];
+  bool all_online = true;
+  for (int i = 0; i < cpu_ids.size(); ++i) {
+    snprintf(path, sizeof(path), "/sys/devices/system/cpu/cpu%d/online",
+             cpu_ids[i]);
+    FILE* fp = fopen(path, "rb");
+    int is_online = 0;
+    if (fp) {
+      fscanf(fp, "%d", &is_online);
+      fclose(fp);
+    } else {
+      LOG(ERROR) << "Failed to query the online statue of CPU id:"
+                 << cpu_ids[i];
+    }
+    if (is_online == 0) {
+      all_online = false;
+      LOG(ERROR) << "CPU id:" << cpu_ids[i] << " is offine";
+    }
+  }
+  return all_online;
+}

-int set_sched_affinity(const std::vector<int>& cpuids) {
+int set_sched_affinity(const std::vector<int>& cpu_ids) {
   // #define CPU_SETSIZE 1024
   // #define __NCPUBITS  (8 * sizeof (unsigned long))
   // typedef struct
@@ -608,20 +389,569 @@ int set_sched_affinity(const std::vector<int>& cpuids) {
 #endif
   cpu_set_t mask;
   CPU_ZERO(&mask);
-  for (int i = 0; i < cpuids.size(); i++) {
-    CPU_SET(cpuids[i], &mask);
+  for (int i = 0; i < cpu_ids.size(); ++i) {
+    CPU_SET(cpu_ids[i], &mask);
   }
   int syscallret = syscall(__NR_sched_setaffinity, pid, sizeof(mask), &mask);
   if (syscallret) {
+    LOG(ERROR) << "syscall error " << syscallret;
     return -1;
   }
   return 0;
 }

+bool bind_threads(const std::vector<int> cpu_ids) {
+  /* ... the affinity logic previously in Context<TargetType::kARM>::BindDev():
+     with ARM_WITH_OMP it calls omp_set_num_threads(cpu_ids.size()) and runs
+     set_sched_affinity(cpu_ids) from every OpenMP thread, otherwise it binds
+     only cpu_ids[0]; it logs "Set cpu affinity failed, core id: ..." and
+     returns false on failure ... */
+}
+
+#endif  // LITE_WITH_LINUX
+
+// cache_id : 0 -> L1, 1 -> L2, 2 -> L3
+void DeviceInfo::SetCacheInfo(int cache_id, int argc, ...) {
+  va_list arg_ptr;
+  va_start(arg_ptr, argc);
+  std::vector<int>* cache;
+  switch (cache_id) {
+    case 0:
+      cache = &L1_cache_;
+      break;
+    case 1:
+      cache = &L2_cache_;
+      break;
+    case 2:
+      cache = &L3_cache_;
+      break;
+    default:
+      break;
+  }
+  cache->resize(core_num_);
+  if (argc == 1) {
+    int cache_size = va_arg(arg_ptr, int);
+    for (int i = 0; i < core_num_; ++i) {
+      (*cache)[i] = cache_size;
+    }
+  } else {
+    int big_core_num = big_core_ids_.size();
+    int little_core_num = little_core_ids_.size();
+    int big_core_cache_size = va_arg(arg_ptr, int);
+    int little_core_cache_size = va_arg(arg_ptr, int);
+    for (int i = 0; i < big_core_num; ++i) {
+      (*cache)[big_core_ids_[i]] = big_core_cache_size;
+    }
+    for (int i = 0; i < little_core_num; ++i) {
+      (*cache)[little_core_ids_[i]] = little_core_cache_size;
+    }
+  }
+  va_end(arg_ptr);
+}
+
+void DeviceInfo::SetArchInfo(int argc, ...) {
+  /* ... same va_list pattern as SetCacheInfo, filling archs_ either with one
+     ARMArch for every core (argc == 1) or with a big-core and a little-core
+     ARMArch (argc == 2) ... */
+}
+
+bool DeviceInfo::SetCPUInfoByName() {
+  /* Snapdragon */
+  if (dev_name_.find("SM8150") != std::string::npos) {  // 855
+    core_num_ = 8;
+    core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
+    big_core_ids_ = {4, 5, 6, 7};
+    little_core_ids_ = {0, 1, 2, 3};
+    cluster_ids_ = {1, 1, 1, 1, 0, 0, 0, 0};
+    SetArchInfo(2, kA76, kA55);
+    SetCacheInfo(0, 2, 64 * 1024, 32 * 1024);
+    SetCacheInfo(1, 2, 256 * 1024, 128 * 1024);
+    SetCacheInfo(2, 1, 2048 * 1024);
+    return true;
+  } else if (dev_name_.find("SDM845") != std::string::npos) {  // 845
+    core_num_ = 8;
+    core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
+    big_core_ids_ = {4, 5, 6, 7};
+    little_core_ids_ = {0, 1, 2, 3};
+    cluster_ids_ = {1, 1, 1, 1, 0, 0, 0, 0};
+    SetArchInfo(2, kA75, kA55);
+    SetCacheInfo(0, 2, 64 * 1024, 32 * 1024);
+    SetCacheInfo(1, 2, 256 * 1024, 128 * 1024);
+    SetCacheInfo(2, 1, 2048 * 1024);
+    return true;
+  } else if (dev_name_.find("SDM710") != std::string::npos) {  // 710
+    core_num_ = 8;
+    core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
+    big_core_ids_ = {6, 7};
+    little_core_ids_ = {0, 1, 2, 3, 4, 5};
+    cluster_ids_ = {1, 1, 1, 1, 1, 1, 0, 0};
+    SetArchInfo(2, kA75, kA55);
+    SetCacheInfo(0, 2, 64 * 1024, 32 * 1024);
+    SetCacheInfo(1, 2, 256 * 1024, 128 * 1024);
+    SetCacheInfo(2, 1, 1024 * 1024);
+    return true;
+  }
+  /* ... the remaining branches follow the same pattern, carried over from the
+     removed get_cpu_info_from_name():
+     MSM8998 (835): 8 cores, big {4-7}, little {0-3}, kA73/kA53, L1 64K/32K,
+       L2 1M/1M (real L2 is 2M but capped for conv3x3s1/gemm performance)
+     MSM8996 (820): 4 cores, big {2,3}, little {0,1}, kA72, L1 24K, L2 1M/512K
+     SDM660/SDM636: 8 cores, big {4-7}, little {0-3}, kA73, L1 64K/32K, L2 1M
+     MSM8976 (652/653): 8 cores, big {4-7}, little {0-3}, kA72/kA53, L1 32K,
+       L2 1M/512K
+     MSM8953 (625): 8 cores, all big, kA53, L1 32K, L2 1M
+     MSM8939 (615): 8 cores, big {0-3}, little {4-7}, kA53, L1 32K, L2 512K/256K
+     MT6797 (X20/X23/X25/X27): 10 cores, big {8,9}, kA72/kA53, L1 32K, L2 1M/512K
+     MT6799 (X30): 10 cores, big {8,9}, kA73/kA53
+     MT6795/MT6762/MT6755T/MT6755S/MT6753/MT6752/MT6750: 8 cores, all big, kA53
+     MT6758/MT6757/MT6763/MT6755M/MT6755 (P30/P20/P25/P23/P10): 8 cores,
+       big {4-7}, little {0-3}, kA53
+     MT6771 (P60): 8 cores, big {4-7}, little {0-3}, kA73/kA53
+     MT6765/MT6739/MT6738/MT6737: 4 cores, kA53 ... */
+  return false;
+}
+
+void DeviceInfo::SetCPUInfoByProb() {
+#ifdef LITE_WITH_LINUX
+  // get big.LITTLE cores by sorting CPU frequency
+  sort_cpuid_by_max_freq(max_freqs_, &core_ids_, &cluster_ids_);
+  big_core_ids_.clear();
+  little_core_ids_.clear();
+  for (int i = 0; i < cluster_ids_.size(); ++i) {
+    if (cluster_ids_[i] == 0) {
+      big_core_ids_.push_back(core_ids_[i]);
+    } else {
+      little_core_ids_.push_back(core_ids_[i]);
+    }
+  }
+  // get l1, l2, l3 cache size for each core
+  for (int i = 0; i < core_num_; i++) {
+    get_cpu_cache_size(i, &(L1_cache_[i]), &(L2_cache_[i]), &(L3_cache_[i]));
+  }
+#endif  // LITE_WITH_LINUX
+}
+
+void DeviceInfo::RequestPowerFullMode(const int thread_num) {
+  int big_core_size = big_core_ids_.size();
+  int little_core_size = little_core_ids_.size();
+  active_ids_.clear();
+  for (int i = 0; i < thread_num; ++i) {
+    if (i < big_core_size) {
+      active_ids_.push_back(big_core_ids_[i]);
+    } else if (i < big_core_size + little_core_size) {
+      active_ids_.push_back(little_core_ids_[i - big_core_size]);
+    }
+  }
+  mode_ = LITE_POWER_FULL;
+}
+
+void DeviceInfo::RequestPowerHighMode(const int thread_num) {
+  int big_core_size = big_core_ids_.size();
+  int little_core_size = little_core_ids_.size();
+  active_ids_.clear();
+  if (big_core_size > 0) {
+    mode_ = LITE_POWER_HIGH;
+    if (thread_num > big_core_size) {
+      LOG(ERROR) << "Request thread num: " << thread_num
+                 << ", exceed the big cores size: " << big_core_size
+                 << ", truncate thread num to " << big_core_size;
+      active_ids_ = big_core_ids_;
+    } else {
+      for (int i = 0; i < thread_num; ++i) {
+        active_ids_.push_back(big_core_ids_[i]);
+      }
+    }
+  } else {
+    mode_ = LITE_POWER_LOW;
+    LOG(ERROR) << "HIGH POWER MODE is not support, switch to little cores.";
+    if (thread_num > little_core_size) {
+      active_ids_ = little_core_ids_;
+    } else {
+      for (int i = 0; i < thread_num; ++i) {
+        active_ids_.push_back(little_core_ids_[i]);
+      }
+    }
+  }
+}
+
+void DeviceInfo::RequestPowerNoBindMode(const int thread_num) {
+  active_ids_.clear();
+  for (int i = 0; i < thread_num; i++) {
+    active_ids_.push_back(0);
+  }
+  mode_ = LITE_POWER_NO_BIND;
+}
+
+/* ... RequestPowerLowMode(thread_num), RequestPowerRandHighMode(shift_num,
+   thread_num) and RequestPowerRandLowMode(shift_num, thread_num) mirror
+   RequestPowerHighMode: LOW prefers the little cores and falls back to big
+   cores with a WARNING, and the RAND_* variants pick cores starting from
+   (i + shift_num) % cluster size ... */
+
+int DeviceInfo::Setup() {
+  core_num_ = get_cpu_num();
+  mem_size_ = get_mem_size();
+  get_cpu_arch(&archs_, core_num_);
+  // set defalut CPU info
+  SetCacheInfo(0, DEFAULT_L1_CACHE_SIZE);
+  SetCacheInfo(1, DEFAULT_L2_CACHE_SIZE);
+  SetCacheInfo(2, DEFAULT_L3_CACHE_SIZE);
+#ifdef LITE_WITH_LINUX
+  // get max&min freq
+  max_freqs_.resize(core_num_);
+  min_freqs_.resize(core_num_);
+  for (int i = 0; i < core_num_; ++i) {
+    int max_freq, min_freq;
+    get_cpu_max_min_freq(i, &max_freq, &min_freq);
+    max_freqs_[i] = max_freq / 1000;
+    min_freqs_[i] = min_freq / 1000;
+  }
+  // get cache size and big.LITTLE core ids
+  dev_name_ = get_cpu_name();
+  if (!SetCPUInfoByName()) {
+    SetCPUInfoByProb();
+  }
+  // output info
+  LOG(INFO) << "ARM multiprocessors name: " << dev_name_;
+  LOG(INFO) << "ARM multiprocessors number: " << core_num_;
+  for (int i = 0; i < core_num_; ++i) {
+    LOG(INFO) << "ARM multiprocessors ID: " << core_ids_[i]
+              << ", max freq: " << max_freqs_[i]
+              << ", min freq: " << min_freqs_[i]
+              << ", cluster ID: " << cluster_ids_[core_ids_[i]]
+              << ", CPU ARCH: A" << archs_[i];
+  }
+  LOG(INFO) << "L1 DataCache size is: ";
+  for (int i = 0; i < core_num_; ++i) {
+    LOG(INFO) << L1_cache_[i] / 1024 << " KB";
+  }
+  LOG(INFO) << "L2 Cache size is: ";
+  for (int i = 0; i < core_num_; ++i) {
+    LOG(INFO) << L2_cache_[i] / 1024 << " KB";
+  }
+  LOG(INFO) << "Total memory: " << mem_size_ << "KB";
+#endif
+  // set default run mode
+  SetRunMode(LITE_POWER_NO_BIND, 1);  // use single thread by default
+  return 0;
+}
+
+void DeviceInfo::SetRunMode(PowerMode mode, int thread_num) {
+#ifdef ARM_WITH_OMP
+  thread_num = std::min(thread_num, core_num_);
+#else
+  thread_num = 1;  // force thread_num to 1 if OpenMP is disabled
+#endif
+#ifdef LITE_WITH_LINUX
+  int big_core_size = big_core_ids_.size();
+  int little_core_size = little_core_ids_.size();
+  int
big_little_core_size
=
big_core_size
+
little_core_size
;
thread_num
=
std
::
min
(
thread_num
,
big_little_core_size
);
count_
++
;
int
shift_num
=
(
count_
/
10
)
%
big_core_size
;
switch
(
mode
)
{
case
LITE_POWER_FULL
:
RequestPowerFullMode
(
thread_num
);
break
;
case
LITE_POWER_HIGH
:
RequestPowerHighMode
(
thread_num
);
break
;
case
LITE_POWER_LOW
:
RequestPowerLowMode
(
thread_num
);
break
;
case
LITE_POWER_NO_BIND
:
RequestPowerNoBindMode
(
thread_num
);
break
;
case
LITE_POWER_RAND_HIGH
:
RequestPowerRandHighMode
(
shift_num
,
thread_num
);
break
;
case
LITE_POWER_RAND_LOW
:
RequestPowerRandLowMode
(
shift_num
,
thread_num
);
break
;
default:
LOG
(
FATAL
)
<<
"Unsupported power mode: "
<<
mode
;
break
;
}
if
(
active_ids_
.
size
()
==
0
)
{
active_ids_
.
push_back
(
0
);
}
#ifdef ARM_WITH_OMP
omp_set_num_threads
(
active_ids_
.
size
());
#endif
if
(
mode_
!=
LITE_POWER_NO_BIND
)
{
if
(
check_cpu_online
(
active_ids_
))
{
bind_threads
(
active_ids_
);
}
else
{
LOG
(
WARNING
)
<<
"Some cores are offline, switch to NO BIND MODE"
;
mode_
=
LITE_POWER_NO_BIND
;
}
}
#else // LITE_WITH_LINUX
// only LITE_POWER_NO_BIND is supported in other OS
RequestPowerNoBindMode
(
thread_num
);
#ifdef ARM_WITH_OMP
omp_set_num_threads
(
active_ids_
.
size
());
#endif
#endif // LITE_WITH_LINUX
#endif // LITE_WITH_LINUX
//! alloc memory for sgemm in this context
workspace_
.
Resize
(
{
static_cast
<
int64_t
>
(
L2_cache_
[
active_ids_
[
0
]]
/
sizeof
(
float
))});
arch_
=
archs_
[
active_ids_
[
0
]];
}
void
DeviceInfo
::
SetCache
(
int
l1size
,
int
l2size
,
int
l3size
)
{
SetCacheInfo
(
0
,
l1size
);
SetCacheInfo
(
1
,
l2size
);
SetCacheInfo
(
2
,
l3size
);
workspace_
.
Resize
({
2
*
(
l1size
+
l2size
)});
}
bool
DeviceInfo
::
ExtendWorkspace
(
DDimLite
dims
)
{
auto
count
=
dims
.
product
();
auto
old
=
workspace_
.
dims
();
if
(
count
==
old
.
product
())
{
return
false
;
}
workspace_
.
Resize
({
static_cast
<
int64_t
>
(
count
+
L2_cache_
[
active_ids_
[
0
]]
/
sizeof
(
float
))});
return
true
;
}
#endif // LITE_WITH_ARM
#endif // LITE_WITH_ARM
...
...
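A minimal usage sketch of the DeviceInfo flow above, assuming only the interface declared in cpu_info.h (shown next); the wrapper function here is hypothetical and not part of this commit.

#include "paddle/fluid/lite/core/cpu_info.h"

// Hypothetical caller; DeviceInfo::Init(), Global() and SetRunMode() come
// from the patch, the surrounding function is illustration only.
void ConfigureCpuForInference() {
  using paddle::lite::DeviceInfo;
  DeviceInfo::Init();  // probes the CPU once and caches the result
  // Ask for up to 4 threads on the big cluster; the implementation above
  // logs and truncates the request if fewer big cores are available.
  DeviceInfo::Global().SetRunMode(paddle::lite::LITE_POWER_HIGH, 4);
  LOG(INFO) << "active threads: " << DeviceInfo::Global().threads()
            << ", L1 cache of first active core: "
            << DeviceInfo::Global().l1_cache_size() << " bytes";
}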
paddle/fluid/lite/core/cpu_info.h
View file @ 58bf3c48

@@ -14,24 +14,12 @@
 #pragma once

+#include <cstdarg>
 #include <string>
 #include <vector>
+#include "paddle/fluid/lite/core/lite_tensor.h"
 #include "paddle/fluid/lite/utils/cp_logging.h"

-#ifdef LITE_WITH_LINUX
-#include <sys/syscall.h>
-#include <unistd.h>
-#endif
-#if __APPLE__
-#include "TargetConditionals.h"
-#if TARGET_OS_IPHONE
-#include <mach/machine.h>
-#include <sys/sysctl.h>
-#include <sys/types.h>
-#endif  // TARGET_OS_IPHONE
-#endif  // __APPLE__
-
 namespace paddle {
 namespace lite {

@@ -60,64 +48,73 @@ typedef enum {
 class DeviceInfo {
  public:
-  int idx_;
-  int max_freq_;
-  int min_freq_;
-  int generate_arch_;
-  int compute_core_num_;
-  int max_memory_;
-  int sharemem_size_;
-  std::string device_name_;
-  std::string compute_ability_;
-
-  std::vector<int> L1_cache_;
-  std::vector<int> L2_cache_;
-  std::vector<int> L3_cache_;
-  std::vector<int> core_ids_;
-  std::vector<int> big_core_ids_;
-  std::vector<int> little_core_ids_;
-  std::vector<int> cluster_ids_;
-  std::vector<ARMArch> archs_;
-
   static DeviceInfo& Global() {
     static auto* x = new DeviceInfo;
     return *x;
   }

-  static void Init() {
-    auto& info = Global();
-    InitInternal(&info);
+  static int Init() {
+    static int ret = Global().Setup();
+    return ret;
   }

- private:
-  DeviceInfo() = default;
-  static void InitInternal(DeviceInfo* dev);
-};
-
-size_t arm_get_meminfo();
-int arm_get_cpucount();
-void arm_get_cpu_arch(std::vector<ARMArch>* archs);
-bool get_cpu_info_from_name(DeviceInfo* cpu_info, std::string hardware_name);
-
-#ifdef LITE_WITH_LINUX
-void set_default_cache(DeviceInfo* dev);
-std::string arm_get_cpu_name();
-int get_max_freq_khz(int cpuid);
-int arm_sort_cpuid_by_max_frequency(int cpu_count, std::vector<int>* cpuids,
-                                    const std::vector<int>& cpu_freq,
-                                    std::vector<int>* cluster_ids);
-int check_online(const std::vector<int>& core_ids);
-int set_sched_affinity(const std::vector<int>& cpuids);
-#endif  // LITE_WITH_LINUX
+  int Setup();
+
+  void SetRunMode(PowerMode mode, int thread_num);
+  void SetCache(int l1size, int l2size, int l3size);
+  void SetArch(ARMArch arch) { arch_ = arch; }
+
+  PowerMode mode() const { return mode_; }
+  int threads() const { return active_ids_.size(); }
+  ARMArch arch() const { return arch_; }
+  int l1_cache_size() const { return L1_cache_[active_ids_[0]]; }
+  int l2_cache_size() const { return L2_cache_[active_ids_[0]]; }
+  int l3_cache_size() const { return L3_cache_[active_ids_[0]]; }
+
+  template <typename T>
+  T* workspace_data() {
+    return workspace_.mutable_data<T>();
+  }
+  bool ExtendWorkspace(DDimLite dims);
+
+ private:
+  int core_num_;
+  std::vector<int> max_freqs_;
+  std::vector<int> min_freqs_;
+  int mem_size_;
+  std::string dev_name_;
+
+  std::vector<int> L1_cache_;
+  std::vector<int> L2_cache_;
+  std::vector<int> L3_cache_;
+  std::vector<int> core_ids_;
+  std::vector<int> big_core_ids_;
+  std::vector<int> little_core_ids_;
+  std::vector<int> cluster_ids_;
+  std::vector<ARMArch> archs_;
+
+  ARMArch arch_;
+  // LITE_POWER_HIGH stands for using big cores,
+  // LITE_POWER_LOW stands for using small core,
+  // LITE_POWER_FULL stands for using all cores
+  PowerMode mode_;
+  std::vector<int> active_ids_;
+  TensorLite workspace_;
+  int64_t count_{0};
+
+  void SetCacheInfo(int cache_id, int argc, ...);
+  void SetArchInfo(int argc, ...);
+  bool SetCPUInfoByName();
+  void SetCPUInfoByProb();
+  void RequestPowerFullMode(const int thread_num);
+  void RequestPowerHighMode(const int thread_num);
+  void RequestPowerLowMode(const int thread_num);
+  void RequestPowerNoBindMode(const int thread_num);
+  void RequestPowerRandHighMode(const int shift_num, const int thread_num);
+  void RequestPowerRandLowMode(const int shift_num, const int thread_num);
+
+  DeviceInfo() = default;
+};

 #endif  // LITE_WITH_ARM
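The new Init() relies on a function-local static so that Setup() runs at most once per process, no matter how many call sites invoke it. A small self-contained sketch of that idiom; the HardwareInfo type is illustrative only and not part of the commit.

#include <iostream>

// Illustrative stand-in for DeviceInfo; only the init idiom matters here.
struct HardwareInfo {
  static HardwareInfo& Global() {
    static auto* x = new HardwareInfo;  // created on first use, never freed
    return *x;
  }
  static int Init() {
    // The initializer of a local static runs exactly once (thread-safe since
    // C++11); later calls simply return the cached value.
    static int ret = Global().Setup();
    return ret;
  }
  int Setup() {
    std::cout << "probing hardware once" << std::endl;
    return 0;
  }
};

int main() {
  HardwareInfo::Init();
  HardwareInfo::Init();  // Setup() is not run again
  return 0;
}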
paddle/fluid/lite/kernels/arm/conv_compute.cc
View file @ 58bf3c48

@@ -28,8 +28,6 @@ void ConvCompute::PrepareForRun() {
   auto o_dims = param.output->dims();

   auto& ctx = this->ctx_->template As<ARMContext>();
-  // TODO(xxx): make api and expose it
-  ctx.SetRunMode(LITE_POWER_HIGH, 4);

   int win = x_dims[3];  // nchw
   int hin = x_dims[2];
paddle/fluid/lite/kernels/arm/fc_compute.cc
View file @ 58bf3c48

@@ -28,7 +28,6 @@ void FcCompute::PrepareForRun() {
   auto w_dims = param.w->dims();

   auto& ctx = this->ctx_->template As<ARMContext>();
-  ctx.SetRunMode(LITE_POWER_HIGH, 4);

   CHECK_GE(x_dims.size(), 2UL);
   CHECK_EQ(w_dims.size(), 2UL);
paddle/fluid/lite/kernels/arm/mul_compute.cc
View file @ 58bf3c48

@@ -24,7 +24,6 @@ namespace arm {
 void MulCompute::PrepareForRun() {
   auto& ctx = this->ctx_->template As<ARMContext>();
-  ctx.SetRunMode(LITE_POWER_HIGH, 4);
 }

 void MulCompute::Run() {
paddle/fluid/lite/kernels/arm/pool_compute.cc
View file @ 58bf3c48

@@ -26,7 +26,6 @@ namespace arm {
 void PoolCompute::PrepareForRun() {
   auto& ctx = this->ctx_->template As<ARMContext>();
-  ctx.SetRunMode(LITE_POWER_HIGH, 4);
 }

 void PoolCompute::Run() {
paddle/fluid/lite/kernels/x86/CMakeLists.txt
View file @ 58bf3c48

@@ -17,6 +17,7 @@ cc_library(dropout_compute_x86 SRCS dropout_compute.cc DEPS ${lite_kernel_deps}
 cc_library(concat_compute_x86 SRCS concat_compute.cc DEPS ${lite_kernel_deps})
 cc_library(conv_compute_x86 SRCS conv_compute.cc DEPS ${lite_kernel_deps} blas im2col vol2col)
 cc_library(pool_compute_x86 SRCS pool_compute.cc DEPS ${lite_kernel_deps} pooling)
+cc_library(batch_norm_compute_x86 SRCS batch_norm_compute.cc DEPS ${lite_kernel_deps})

 lite_cc_test(test_fc_compute_x86 SRCS fc_compute_test.cc DEPS fc_compute_x86)
 lite_cc_test(test_conv2d_compute_x86 SRCS conv_compute_test.cc DEPS conv_compute_x86)

@@ -28,6 +29,7 @@ lite_cc_test(test_relu_compute_x86 SRCS relu_compute_test.cc DEPS relu_compute_x
 lite_cc_test(test_mul_compute_x86 SRCS mul_compute_test.cc DEPS mul_compute_x86 operator)
 lite_cc_test(test_scale_compute_x86 SRCS scale_compute_test.cc DEPS scale_compute_x86)
 lite_cc_test(test_dropout_compute_x86 SRCS dropout_compute_test.cc DEPS dropout_compute_x86)
+lite_cc_test(test_batch_norm_compute_x86 SRCS batch_norm_compute_test.cc DEPS batch_norm_compute_x86)

 set(x86_kernels

@@ -44,6 +46,7 @@ set(x86_kernels
     concat_compute_x86
     conv_compute_x86
     pool_compute_x86
+    batch_norm_compute_x86
     )

 set(x86_kernels "${x86_kernels}" CACHE INTERNAL "x86 kernels")
paddle/fluid/lite/kernels/x86/batch_norm_compute.cc  (new file, 0 → 100644)
View file @ 58bf3c48

// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/lite/kernels/x86/batch_norm_compute.h"

REGISTER_LITE_KERNEL(batch_norm, kX86, kFloat, kNCHW,
                     paddle::lite::kernels::x86::BatchNormCompute<float>, def)
    .BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))})
    .BindInput("Scale", {LiteType::GetTensorTy(TARGET(kX86))})
    .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kX86))})
    .BindInput("Mean", {LiteType::GetTensorTy(TARGET(kX86))})
    .BindInput("Variance", {LiteType::GetTensorTy(TARGET(kX86))})
    .BindOutput("Y", {LiteType::GetTensorTy(TARGET(kX86))})
    .BindOutput("MeanOut", {LiteType::GetTensorTy(TARGET(kX86))})
    .BindOutput("VarianceOut", {LiteType::GetTensorTy(TARGET(kX86))})
    .BindOutput("SavedMean", {LiteType::GetTensorTy(TARGET(kX86))})
    .BindOutput("SavedVariance", {LiteType::GetTensorTy(TARGET(kX86))})
    .Finalize();
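A short sketch of how this registration is expected to be looked up at runtime; it mirrors the retrive_op test added later in this commit, and the free function wrapping the lookup is hypothetical.

#include "paddle/fluid/lite/core/op_registry.h"

// Pull in the static registration from batch_norm_compute.cc.
USE_LITE_KERNEL(batch_norm, kX86, kFloat, kNCHW, def);

// Hypothetical helper: fetch the x86/float batch_norm kernel by name.
void RetrieveBatchNormKernel() {
  auto kernels = paddle::lite::KernelRegistry::Global()
                     .Create<TARGET(kX86), PRECISION(kFloat)>("batch_norm");
  CHECK(!kernels.empty());  // the kernel registered above must be found
}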
paddle/fluid/lite/kernels/x86/batch_norm_compute.h  (new file, 0 → 100644)
View file @ 58bf3c48

// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include <random>
#include <string>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/lite/core/kernel.h"
#include "paddle/fluid/lite/core/op_registry.h"

namespace paddle {
namespace lite {
namespace kernels {
namespace x86 {

template <typename T>
using EigenArrayMap =
    Eigen::Map<Eigen::Array<T, Eigen::Dynamic, Eigen::Dynamic>>;
template <typename T>
using ConstEigenArrayMap =
    Eigen::Map<const Eigen::Array<T, Eigen::Dynamic, Eigen::Dynamic>>;
template <typename T>
using EigenVectorArrayMap = Eigen::Map<Eigen::Array<T, Eigen::Dynamic, 1>>;
template <typename T>
using ConstEigenVectorArrayMap =
    Eigen::Map<const Eigen::Array<T, Eigen::Dynamic, 1>>;

template <typename T>
class BatchNormCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
 public:
  using param_t = operators::BatchNormParam;

  void Run() override {
    auto &param = *param_.get_mutable<operators::BatchNormParam>();
    bool global_stats = param.is_test || param.use_global_stats;

    const auto *x = param.x;
    const auto &x_dims = x->dims();
    CHECK(x_dims.size() >= 2 && x_dims.size() <= 5);
    const int N = x_dims[0];
    const int C = param.data_layout == DATALAYOUT(kNCHW)
                      ? x_dims[1]
                      : x_dims[x_dims.size() - 1];
    const int sample_size = x->dims().production() / N / C;

    // alloc memory
    param.y->template mutable_data<T>();
    param.mean_out->template mutable_data<T>();
    param.variance_out->template mutable_data<T>();
    param.saved_mean->template mutable_data<T>();
    param.saved_variance->template mutable_data<T>();

    if (!global_stats) {
      // saved_xx is use just in this batch of data
      EigenVectorArrayMap<T> saved_mean_e(param.saved_mean->mutable_data<T>(),
                                          C);
      EigenVectorArrayMap<T> saved_variance_e(
          param.saved_variance->mutable_data<T>(), C);
      saved_mean_e.setZero();
      saved_variance_e.setZero();

      EigenVectorArrayMap<T> running_mean_arr(param.mean_out->mutable_data<T>(),
                                              C);
      EigenVectorArrayMap<T> running_var_arr(
          param.variance_out->mutable_data<T>(), C);

      if ((N * sample_size) == 1) {
        LOG(WARNING) << "Only 1 element in normalization dimension, "
                     << "we skip the batch norm calculation, let y = x.";
        framework::TensorCopy(x->raw_tensor(), platform::CPUPlace(),
                              &param.y->raw_tensor());
        return;
      }

      switch (param.data_layout) {
        case DATALAYOUT(kNCHW): {
          ConstEigenArrayMap<T> x_arr(x->data<T>(), sample_size, N * C);
          for (int nc = 0; nc < N * C; ++nc) {
            saved_mean_e(nc % C) += x_arr.col(nc).sum();
          }
          saved_mean_e /= N * sample_size;
          for (int nc = 0; nc < N * C; ++nc) {
            saved_variance_e(nc % C) +=
                (x_arr.col(nc) - saved_mean_e(nc % C)).matrix().squaredNorm();
          }
          saved_variance_e /= N * sample_size;
          break;
        }
        default:
          LOG(FATAL) << "Unknown storage order: "
                     << DataLayoutToStr(param.data_layout);
          break;
      }
      running_mean_arr = running_mean_arr * param.momentum +
                         saved_mean_e * (1. - param.momentum);
      running_var_arr = running_var_arr * param.momentum +
                        saved_variance_e * (1. - param.momentum);
    }

    // use SavedMean and SavedVariance to do normalize
    Eigen::Array<T, Eigen::Dynamic, 1> inv_std(C);
    if (global_stats) {
      ConstEigenVectorArrayMap<T> var_arr(param.variance->data<T>(), C);
      inv_std = (var_arr + param.epsilon).sqrt().inverse();
    } else {
      EigenVectorArrayMap<T> saved_inv_std(
          param.saved_variance->mutable_data<T>(), C);
      // inverse SavedVariance first, gradient will use it too.
      saved_inv_std = (saved_inv_std + param.epsilon).inverse().sqrt();
      inv_std = saved_inv_std;
    }

    ConstEigenVectorArrayMap<T> mean_arr(
        global_stats ? param.mean->data<T>() : param.saved_mean->data<T>(), C);

    // ((x - est_mean) * (inv_var) * scale + bias
    // formula transform ====>
    // (x * inv_var * scale) + (bias - est_mean * inv_var * scale)
    ConstEigenVectorArrayMap<T> scale_arr(param.scale->data<T>(), C);
    ConstEigenVectorArrayMap<T> bias_arr(param.bias->data<T>(), C);
    Eigen::Array<T, Eigen::Dynamic, 1> new_scale = inv_std * scale_arr;
    Eigen::Array<T, Eigen::Dynamic, 1> new_bias =
        bias_arr - mean_arr * inv_std * scale_arr;

    switch (param.data_layout) {
      case DATALAYOUT(kNCHW): {
        EigenArrayMap<T> y_arr(param.y->mutable_data<T>(), sample_size, N * C);
        ConstEigenArrayMap<T> x_arr(x->data<T>(), sample_size, N * C);
        for (int nc = 0; nc < N * C; ++nc) {
          y_arr.col(nc) = x_arr.col(nc) * new_scale(nc % C) + new_bias(nc % C);
        }
        break;
      }
      default:
        LOG(FATAL) << "Unknown storage order: "
                   << DataLayoutToStr(param.data_layout);
        break;
    }
  }

  virtual ~BatchNormCompute() = default;
};

}  // namespace x86
}  // namespace kernels
}  // namespace lite
}  // namespace paddle
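The "formula transform" comment in Run() folds the mean into the bias so the inner loop is a single multiply-add per element. A scalar sanity check of that identity; the values are arbitrary and not taken from the kernel.

#include <cassert>
#include <cmath>

int main() {
  const float x = 3.0f, est_mean = 0.5f, variance = 4.0f, epsilon = 1e-4f;
  const float scale = 1.5f, bias = 0.25f;
  const float inv_std = 1.0f / std::sqrt(variance + epsilon);

  // textbook form: (x - mean) * inv_std * scale + bias
  const float textbook = (x - est_mean) * inv_std * scale + bias;

  // transformed form: hoist per-channel constants out of the per-element loop
  const float new_scale = inv_std * scale;
  const float new_bias = bias - est_mean * inv_std * scale;
  const float folded = x * new_scale + new_bias;

  assert(std::fabs(textbook - folded) < 1e-6f);  // both forms agree
  return 0;
}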
paddle/fluid/lite/kernels/x86/batch_norm_compute_test.cc  (new file, 0 → 100644)
View file @ 58bf3c48

// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/lite/kernels/x86/batch_norm_compute.h"
#include <gtest/gtest.h>
#include <iostream>
#include <vector>
#include "paddle/fluid/lite/core/op_registry.h"

namespace paddle {
namespace lite {
namespace kernels {
namespace x86 {

TEST(batch_norm_x86, retrive_op) {
  auto batch_norm =
      KernelRegistry::Global().Create<TARGET(kX86), PRECISION(kFloat)>(
          "batch_norm");
  ASSERT_FALSE(batch_norm.empty());
  ASSERT_TRUE(batch_norm.front());
}

TEST(batch_norm_x86, init) {
  BatchNormCompute<float> batch_norm;
  ASSERT_EQ(batch_norm.precision(), PRECISION(kFloat));
  ASSERT_EQ(batch_norm.target(), TARGET(kX86));
}

TEST(batch_norm_x86, run_test) {
  lite::Tensor x, scale, bias, mean, variance, y, mean_out, variance_out,
      saved_mean, saved_variance;
  constexpr int batch_size = 2;
  std::vector<int64_t> x_shape{batch_size, 3, 64, 64};
  x.Resize(lite::DDim(x_shape));
  std::vector<int64_t> scale_shape{3};
  scale.Resize(lite::DDim(scale_shape));
  std::vector<int64_t> bias_shape{3};
  bias.Resize(lite::DDim(bias_shape));
  std::vector<int64_t> mean_shape{3};
  mean.Resize(lite::DDim(mean_shape));
  std::vector<int64_t> variance_shape{3};
  variance.Resize(lite::DDim(variance_shape));
  std::vector<int64_t> y_shape{batch_size, 3, 64, 64};
  y.Resize(lite::DDim(y_shape));
  std::vector<int64_t> mean_out_shape{3};
  mean_out.Resize(lite::DDim(mean_out_shape));
  std::vector<int64_t> variance_out_shape{3};
  variance_out.Resize(lite::DDim(variance_out_shape));
  std::vector<int64_t> saved_mean_shape{3};
  saved_mean.Resize(lite::DDim(saved_mean_shape));
  std::vector<int64_t> saved_variance_shape{3};
  saved_variance.Resize(lite::DDim(saved_variance_shape));

  auto x_data = x.mutable_data<float>();
  auto scale_data = scale.mutable_data<float>();
  auto bias_data = bias.mutable_data<float>();
  auto mean_data = mean.mutable_data<float>();
  auto variance_data = variance.mutable_data<float>();
  y.mutable_data<float>();
  mean_out.mutable_data<float>();
  variance_out.mutable_data<float>();
  saved_mean.mutable_data<float>();
  saved_variance.mutable_data<float>();

  for (int64_t i = 0; i < x.dims().production(); i++) {
    x_data[i] = static_cast<float>(i);
  }
  for (int i = 0; i < scale.dims().production(); i++) {
    scale_data[i] = static_cast<float>(i) * 0.01f + 0.03f;
  }
  for (int i = 0; i < bias.dims().production(); i++) {
    bias_data[i] = static_cast<float>(i) * 0.065f + 0.1f;
  }
  for (int i = 0; i < mean.dims().production(); i++) {
    mean_data[i] = static_cast<float>(i) * 0.0565f;
  }
  for (int i = 0; i < variance.dims().production(); i++) {
    variance_data[i] = static_cast<float>(i) * 2.08f + 1.5f;
  }

  // BatchNormCompute batch_norm;
  BatchNormCompute<float> batch_norm;
  operators::BatchNormParam param;

  param.x = &x;
  param.is_test = false;
  param.scale = &scale;
  param.bias = &bias;
  param.mean = &mean;
  param.variance = &variance;
  param.use_global_stats = false;
  param.epsilon = 1e-4f;
  param.momentum = 0.9f;
  param.y = &y;
  param.mean_out = &mean_out;
  param.variance_out = &variance_out;
  param.saved_mean = &saved_mean;
  param.saved_variance = &saved_variance;

  batch_norm.SetParam(param);
  batch_norm.Run();

  LOG(INFO) << "output: " << y;
  LOG(INFO) << "mean_out: " << mean_out;
  LOG(INFO) << "variance_out: " << variance_out;
  LOG(INFO) << "saved_mean: " << saved_mean;
  LOG(INFO) << "saved_variance: " << saved_variance;
  /*for (int i = 0; i < y.dims().production(); i++) {
    if(i < 5 || i > y.dims().production() - 5)
      LOG(INFO) << y_data[i];
  }*/
}

}  // namespace x86
}  // namespace kernels
}  // namespace lite
}  // namespace paddle

USE_LITE_KERNEL(batch_norm, kX86, kFloat, kNCHW, def);
paddle/fluid/lite/tools/build.sh
View file @ 58bf3c48

@@ -135,8 +135,8 @@ function test_arm_model {
     adb -s emulator-${port} push ${model_dir} ${adb_work_dir}
     adb -s emulator-${port} push ${testpath} ${adb_work_dir}
     adb -s emulator-${port} shell chmod +x "${adb_work_dir}/${test_name}"
-    local adb_model_path="./${adb_work_dir}/`basename ${model_dir}`"
-    adb -s emulator-${port} shell "./${adb_work_dir}/${test_name} --eval_model_dir=$adb_model_path"
+    local adb_model_path="${adb_work_dir}/`basename ${model_dir}`"
+    adb -s emulator-${port} shell "${adb_work_dir}/${test_name} --eval_model_dir=$adb_model_path"
 }

@@ -225,16 +225,11 @@ function test_arm {
     for _test in $(cat $TESTS_FILE); do
         test_arm_android $_test $port
     done
-
-    # TODO(sangoly): refine this
-    test_arm_model "test_cxx_api_lite" $port "./third_party/install/mobilenet_v2_relu"
 }

-# Build the code and run lite arm tests. This is executed in the CI system.
-function build_test_arm {
-    ########################################################################
-    # job 1-4 must be in one runner
-    port_armv8=5554
-    port_armv7=5556
+function prepare_emulator {
+    local port_armv8=$1
+    local port_armv7=$2

     adb kill-server
     adb devices | grep emulator | cut -f1 | while read line; do adb -s $line emu kill; done

@@ -245,6 +240,18 @@ function build_test_arm {
     echo n | avdmanager create avd -f -n paddle-armv7 -k "system-images;android-24;google_apis;armeabi-v7a"
     echo -ne '\n' | ${ANDROID_HOME}/emulator/emulator -avd paddle-armv7 -noaudio -no-window -gpu off -verbose -port ${port_armv7} &
     sleep 1m
+}
+
+# We split the arm unittest into several sub-tasks to parallel and reduce the overall CI time.
+# sub-task1
+function build_test_arm_subtask_android {
+    ########################################################################
+    # job 1-4 must be in one runner
+    port_armv8=5554
+    port_armv7=5556
+
+    prepare_emulator $port_armv8 $port_armv7

     # job 1
     build_arm "android" "armv8" "gcc"

@@ -252,9 +259,9 @@ function build_test_arm {
     cd -
     # job 2
-    build_arm "android" "armv8" "clang"
-    test_arm "android" "armv8" "clang" ${port_armv8}
-    cd -
+    # build_arm "android" "armv8" "clang"
+    # test_arm "android" "armv8" "clang" ${port_armv8}
+    # cd -
     # job 3
     build_arm "android" "armv7" "gcc"

@@ -262,13 +269,22 @@ function build_test_arm {
     cd -
     # job 4
-    build_arm "android" "armv7" "clang"
-    test_arm "android" "armv7" "clang" ${port_armv7}
-    cd -
+    # build_arm "android" "armv7" "clang"
+    # test_arm "android" "armv7" "clang" ${port_armv7}
+    # cd -

     adb devices | grep emulator | cut -f1 | while read line; do adb -s $line emu kill; done
     echo "Done"
+}
+
+# sub-task2
+function build_test_arm_subtask_armlinux {
     ########################################################################
+    # job 1-4 must be in one runner
+    port_armv8=5554
+    port_armv7=5556
+
+    prepare_emulator $port_armv8 $port_armv7

     # job 5
     build_arm "armlinux" "armv8"

@@ -285,9 +301,47 @@ function build_test_arm {
     test_arm "armlinux" "armv7hf"
     cd -

-    adb devices | grep emulator | cut -f1 | while read line; do adb -s $line emu kill; done
     echo "Done"
 }

+# sub-task3
+function build_test_arm_subtask3_mobilenet_v2 {
+    local port_armv8=5554
+    local port_armv7=5556
+    # We just test following single one environment to limit the CI time.
+    local os=android
+    local abi=armv8
+    local lang=gcc
+
+    cur_dir=$(pwd)
+    build_dir=$cur_dir/build.lite.${os}.${abi}.${lang}
+    mkdir -p $build_dir
+    cd $build_dir
+    cmake_arm $os $abi $lang
+    make test_cxx_api_lite -j$NUM_CORES_FOR_COMPILE
+
+    prepare_emulator $port_armv8 $port_armv7
+
+    # just test the model on armv8
+    test_arm_model "test_cxx_api_lite" $port_armv8 "./third_party/install/mobilenet_v2_relu"
+
+    adb devices | grep emulator | cut -f1 | while read line; do adb -s $line emu kill; done
+    echo "Done"
+}
+
+# Build the code and run lite arm tests. This is executed in the CI system.
+function build_test_arm {
+    ########################################################################
+    # job 1-4 must be in one runner
+    port_armv8=5554
+    port_armv7=5556
+
+    build_test_arm_subtask_android
+    build_test_arm_subtask_armlinux
+}
+
 ############################# MAIN #################################
 function print_usage {
     echo -e "\nUSAGE:"

@@ -379,6 +433,18 @@ function main {
             build_test_arm
             shift
             ;;
+        build_test_arm_subtask_android)
+            build_test_arm_subtask_android
+            shift
+            ;;
+        build_test_arm_subtask_armlinux)
+            build_test_arm_subtask_armlinux
+            shift
+            ;;
+        build_test_arm_model1)
+            build_test_arm_subtask3_mobilenet_v2
+            shift
+            ;;
         check_style)
             check_style
             shift

@@ -397,4 +463,3 @@ function main {
 }

 main $@