提交 58bf3c48 编写于 作者: N nhzlx

Merge branch 'incubate/lite' of http://10.87.145.36/inference/paddlelite into xzl/incubate/lite

......@@ -2,6 +2,20 @@ before_script:
- env
- export CI_USER_DIR=$(pwd)
# prepare ccache
- apt install ccache
# for proxy
- export http_proxy=$CI_PROXY
- export https_proxy=$CI_PROXY
# merge the latest code
- git config --global user.email "you@example.com"
- git config --global user.name "Your Name"
- git fetch origin incubate/lite
- git merge --no-ff origin/incubate/lite
image: $SERVER_LITE_DOCKER_IMAGE
stages:
......@@ -14,19 +28,13 @@ check:prebuilt:
- lite
stage: ci
script:
# prepare for pre-commit
- rm -rf ~/.pip
- export http_proxy=$CI_PROXY
- export https_proxy=$CI_PROXY
- pip install pre-commit
- pre-commit install
# merge the latest code
- git config --global user.email "you@example.com"
- git config --global user.name "Your Name"
- git fetch origin incubate/lite
- git merge --no-ff origin/incubate/lite
- ./paddle/fluid/lite/tools/build.sh check_style
cache:
key: check_style
paths:
......@@ -42,17 +50,11 @@ build:server:
paths:
- build/third_party
- ~/.ccache
- $CI_PROJECT_DIR/_build_server_ccache
script:
- apt install ccache
- export http_proxy=$CI_PROXY
- export https_proxy=$CI_PROXY
# merge the latest code
- git config --global user.email "you@example.com"
- git config --global user.name "Your Name"
- git fetch origin incubate/lite
- git merge --no-ff origin/incubate/lite
# customize ccache path for specifying runner cache
- export CCACHE_DIR=$CI_PROJECT_DIR/_build_server_ccache
# run build and test
- mkdir -p build
- cd build
- ../paddle/fluid/lite/tools/build.sh cmake_x86
......@@ -66,7 +68,27 @@ build:server:
dependencies:
- check:prebuilt
build:mobile:
build:mobile_android:
tags:
- lite
stage: build_mobile
image: $MOBILE_LITE_DOCKER_IMAGE
cache:
key: mobile_thirdparty
paths:
- $MOBILE_LITE_CACHE0
- $MOBILE_LITE_CACHE1
- ~/.ccache
- $CI_PROJECT_DIR/build_mobile_ccache
script:
- export CCACHE_DIR=$CI_PROJECT_DIR/build_mobile_ccache
- ./paddle/fluid/lite/tools/build.sh build_test_arm_subtask_android
dependencies:
- build:server
build:mobile_armlinux:
tags:
- lite
stage: build_mobile
......@@ -77,17 +99,43 @@ build:mobile:
- $MOBILE_LITE_CACHE0
- $MOBILE_LITE_CACHE1
- ~/.ccache
- $CI_PROJECT_DIR/build_mobile_ccache2
script:
- apt install ccache
- export http_proxy=$CI_PROXY
- export https_proxy=$CI_PROXY
- export CCACHE_DIR=$CI_PROJECT_DIR/build_mobile_ccache2
- ./paddle/fluid/lite/tools/build.sh build_test_arm_subtask_armlinux
dependencies:
- build:server
# merge the latest code
- git config --global user.email "you@example.com"
- git config --global user.name "Your Name"
- git fetch origin incubate/lite
- git merge --no-ff origin/incubate/lite
cache:
key: mobile_thirdparty
paths:
- $MOBILE_LITE_CACHE0
- $MOBILE_LITE_CACHE1
- ~/.ccache
build:mobile_model_mobilenetv2:
tags:
- lite
stage: build_mobile
image: $MOBILE_LITE_DOCKER_IMAGE
cache:
key: mobile_thirdparty
paths:
- $MOBILE_LITE_CACHE0
- $MOBILE_LITE_CACHE1
- ~/.ccache
script:
- export CCACHE_DIR=$CI_PROJECT_DIR/build_mobile_model1
- ./paddle/fluid/lite/tools/build.sh build_test_arm_model1
- ./paddle/fluid/lite/tools/build.sh build_test_arm
dependencies:
- build:server
cache:
key: mobile_thirdparty
paths:
- $MOBILE_LITE_CACHE0
- $MOBILE_LITE_CACHE1
- ~/.ccache
- $CI_PROJECT_DIR/build_mobile_model1
......@@ -29,9 +29,10 @@ double time_diff(Time t1, Time t2) {
return counter.count() / 1000.0;
}
void Run(const char* model_dir, int repeat) {
void Run(const char* model_dir, int repeat, int thread_num) {
#ifdef LITE_WITH_ARM
DeviceInfo::Init();
DeviceInfo::Global().SetRunMode(LITE_POWER_HIGH, thread_num);
#endif
lite::ExecutorLite predictor;
std::vector<Place> valid_places({Place{TARGET(kHost), PRECISION(kFloat)},
......@@ -67,8 +68,8 @@ void Run(const char* model_dir, int repeat) {
} // namespace paddle
int main(int argc, char** argv) {
CHECK_EQ(argc, 3) << "usage: ./cmd <model_dir> <repeat>";
paddle::lite::Run(argv[1], std::stoi(argv[2]));
CHECK_EQ(argc, 4) << "usage: ./cmd <model_dir> <repeat> <thread_num>";
paddle::lite::Run(argv[1], std::stoi(argv[2]), std::stoi(argv[3]));
return 0;
}
......
......@@ -13,322 +13,7 @@
// limitations under the License.
#include "paddle/fluid/lite/core/context.h"
#include "paddle/fluid/lite/core/cpu_info.h"
#ifdef LITE_WITH_LINUX
#include <sys/syscall.h>
#include <unistd.h>
#endif
#if __APPLE__
#include "TargetConditionals.h"
#if TARGET_OS_IPHONE
#include <mach/machine.h>
#include <sys/sysctl.h>
#include <sys/types.h>
#endif // TARGET_OS_IPHONE
#endif // __APPLE__
#ifdef ARM_WITH_OMP
#include <omp.h>
#endif
namespace paddle {
namespace lite {
#ifdef LITE_WITH_ARM
// Overrides the per-core L1/L2/L3 cache sizes recorded in the global
// DeviceInfo, applying the same three sizes to every core, and resizes
// this context's scratch workspace accordingly.
void Context<TargetType::kARM>::SetCache(int l1size, int l2size, int l3size) {
  DeviceInfo& device = DeviceInfo::Global();
  const int core_num = arm_get_cpucount();
  // assign() == resize-to-core_num + fill with the given size.
  device.L1_cache_.assign(core_num, l1size);
  device.L2_cache_.assign(core_num, l2size);
  device.L3_cache_.assign(core_num, l3size);
  // Scratch buffer holds twice the combined L1+L2 capacity.
  workspace_.Resize({2 * (l1size + l2size)});
}
// Default-constructs an ARM context: runs on core 0 in LITE_POWER_HIGH
// mode, with a scratch workspace sized from core 0's L2 cache (in floats).
Context<TargetType::kARM>::Context() {
active_ids_ = {0};
mode_ = LITE_POWER_HIGH;
DeviceInfo& dev = DeviceInfo::Global();
// Workspace element count = L2 cache bytes of the bound core / sizeof(float).
workspace_.Resize(
{static_cast<int64_t>(dev.L2_cache_[active_ids_[0]] / sizeof(float))});
#ifdef TARGET_IOS
arch_ = APPLE; // use 6x8
#else
// Take the architecture of the first big core when the device reports one;
// otherwise arch_ keeps its previous/default value.
if (dev.big_core_ids_.size() > 0) {
arch_ = dev.archs_[dev.big_core_ids_[0]];
}
#endif
}
// Current power mode of this context.
PowerMode Context<TargetType::kARM>::mode() const { return mode_; }
// Number of cores this context is currently bound to.
int Context<TargetType::kARM>::threads() const { return active_ids_.size(); }
// Copy constructor: duplicates mode, core binding, workspace tensor,
// architecture, and the SetRunMode rotation counter.
Context<TargetType::kARM>::Context(const ARMContext& ctx) {
mode_ = ctx.mode_;
active_ids_ = ctx.active_ids_;
workspace_ = ctx.workspace_;
arch_ = ctx.arch_;
count_ = ctx.count_;
}
// Copy assignment: same field-by-field copy as the copy constructor.
ARMContext& Context<TargetType::kARM>::operator=(const ARMContext& ctx) {
mode_ = ctx.mode_;
active_ids_ = ctx.active_ids_;
workspace_ = ctx.workspace_;
arch_ = ctx.arch_;
count_ = ctx.count_;
return *this;
}
// Pins execution to the cores listed in active_ids_.
// OpenMP build: sets the OMP thread count and applies CPU affinity from
// every worker thread (Linux only). Non-OMP build: binds the single
// thread to the first active core. On other platforms this is a no-op
// beyond setting the OMP thread count.
void Context<TargetType::kARM>::BindDev() {
#ifdef ARM_WITH_OMP
int num_threads = active_ids_.size();
omp_set_num_threads(num_threads);
#ifdef LITE_WITH_LINUX
// One result slot per worker thread.
std::vector<int> ssarets;
for (int j = 0; j < num_threads; ++j) {
ssarets.push_back(0);
}
#pragma omp parallel for
for (int i = 0; i < num_threads; i++) {
// NOTE(review): each thread is handed the full active_ids_ set rather
// than a single core id -- presumably so the scheduler may migrate
// threads within the selected cluster; confirm against set_sched_affinity.
ssarets[i] = set_sched_affinity(active_ids_);
}
// Report the first failed binding, if any, and bail out.
for (int i = 0; i < num_threads; i++) {
if (ssarets[i] != 0) {
LOG(ERROR) << "set cpu affinity failed, cpuID: " << active_ids_[i];
return;
}
}
#endif // LITE_WITH_LINUX
#else // ARM_WITH_OMP
#ifdef LITE_WITH_LINUX
// Single-threaded build: bind this thread to the first active core only.
std::vector<int> cpuid1;
cpuid1.push_back(active_ids_[0]);
int ssaret = set_sched_affinity(cpuid1);
if (ssaret != 0) {
printf("set cpu affinity failed, cpuID: %d\n", active_ids_[0]);
return;
}
#endif // LITE_WITH_LINUX
#endif // ARM_WITH_OMP
}
// Selects the power/affinity policy for subsequent ARM kernels.
// `mode` chooses which cluster(s) to run on (big cores, little cores,
// all cores, rotating RAND variants, or unbound); `threads` is clamped
// to the total core count. When the preferred cluster is empty the mode
// silently degrades to the opposite cluster (HIGH<->LOW). After picking
// active_ids_ this rebinds OMP threads (or falls back to no-bind), then
// resizes the scratch workspace from the first active core's L2 cache.
void Context<TargetType::kARM>::SetRunMode(PowerMode mode, int threads) {
DeviceInfo& dev = DeviceInfo::Global();
int big_core_size = dev.big_core_ids_.size();
int small_core_size = dev.little_core_ids_.size();
// Clamp the requested thread count to the number of available cores.
if (threads > big_core_size + small_core_size) {
threads = big_core_size + small_core_size;
}
#ifdef ARM_WITH_OMP
count_++;
// Rotation offset for the RAND modes: shifts the starting core every
// 10th call.
// NOTE(review): modulo by big_core_size is undefined when the device
// reports zero big cores -- confirm callers guarantee big_core_size > 0
// or guard this.
int shift_num = (count_ / 10) % big_core_size;
switch (mode) {
case LITE_POWER_FULL:
// Use big cores first, then little cores, up to `threads`.
mode_ = mode;
active_ids_.clear();
for (int i = 0; i < threads; ++i) {
if (i < big_core_size) {
active_ids_.push_back(dev.big_core_ids_[i]);
} else {
active_ids_.push_back(dev.little_core_ids_[i - big_core_size]);
}
}
if (active_ids_.size() == 0) {
active_ids_.push_back(0);
}
break;
case LITE_POWER_HIGH:
// Prefer big cores; degrade to LITE_POWER_LOW on little-core-only
// devices.
active_ids_.clear();
if (big_core_size > 0) {
mode_ = LITE_POWER_HIGH;
if (threads > big_core_size) {
LOG(ERROR) << "threads: " << threads
<< ", exceed the big cores size: " << big_core_size;
active_ids_ = dev.big_core_ids_;
} else {
for (int i = 0; i < threads; ++i) {
active_ids_.push_back(dev.big_core_ids_[i]);
}
}
} else {
mode_ = LITE_POWER_LOW;
LOG(ERROR) << "HIGH POWER MODE is not support, switch to little cores";
if (threads > small_core_size) {
active_ids_ = dev.little_core_ids_;
} else {
for (int i = 0; i < threads; ++i) {
active_ids_.push_back(dev.little_core_ids_[i]);
}
}
}
if (active_ids_.size() == 0) {
active_ids_.push_back(0);
}
break;
case LITE_POWER_LOW:
// Prefer little cores; degrade to big cores when there are none.
active_ids_.clear();
if (small_core_size > 0) {
mode_ = LITE_POWER_LOW;
if (threads > small_core_size) {
LOG(WARNING) << "threads: " << threads
<< ", exceed the little cores size: " << small_core_size;
active_ids_ = dev.little_core_ids_;
} else {
for (int i = 0; i < threads; ++i) {
active_ids_.push_back(dev.little_core_ids_[i]);
}
}
} else {
mode_ = LITE_POWER_HIGH;
LOG(WARNING) << "LOW POWER MODE is not support, switch to big cores";
if (threads > big_core_size) {
active_ids_ = dev.big_core_ids_;
} else {
for (int i = 0; i < threads; ++i) {
active_ids_.push_back(dev.big_core_ids_[i]);
}
}
}
if (active_ids_.size() == 0) {
active_ids_.push_back(0);
}
break;
case LITE_POWER_NO_BIND:
// No affinity: only the *count* of active ids matters for OMP below.
// NOTE(review): resize() value-initializes the new ids to 0 rather
// than real core ids, so the workspace/arch lookups at the end of
// this function always index core 0 in this mode -- confirm intended.
mode_ = LITE_POWER_NO_BIND;
active_ids_.clear();
if (threads > dev.core_ids_.size()) {
active_ids_.resize(dev.core_ids_.size());
} else {
active_ids_.resize(threads);
}
break;
case LITE_POWER_RAND_HIGH:
// Like LITE_POWER_HIGH but rotates the starting big core by shift_num.
active_ids_.clear();
if (big_core_size > 0) {
mode_ = LITE_POWER_RAND_HIGH;
if (threads > big_core_size) {
LOG(WARNING) << "threads: " << threads
<< ", exceed the big cores size: " << big_core_size;
active_ids_ = dev.big_core_ids_;
} else {
for (int i = 0; i < threads; ++i) {
active_ids_.push_back(
dev.big_core_ids_[(i + shift_num) % big_core_size]);
}
}
} else {
mode_ = LITE_POWER_LOW;
LOG(WARNING)
<< "HIGH POWER MODE is not support, switch to little cores";
if (threads > small_core_size) {
active_ids_ = dev.little_core_ids_;
} else {
for (int i = 0; i < threads; ++i) {
active_ids_.push_back(dev.little_core_ids_[i]);
}
}
}
if (active_ids_.size() == 0) {
active_ids_.push_back(0);
}
break;
case LITE_POWER_RAND_LOW:
// Like LITE_POWER_LOW but rotates the starting little core by shift_num.
active_ids_.clear();
if (small_core_size > 0) {
mode_ = LITE_POWER_RAND_LOW;
if (threads > small_core_size) {
LOG(WARNING) << "threads: " << threads
<< ", exceed the little cores size: " << small_core_size;
active_ids_ = dev.little_core_ids_;
} else {
for (int i = 0; i < threads; ++i) {
active_ids_.push_back(
dev.little_core_ids_[(i + shift_num) % small_core_size]);
}
}
} else {
mode_ = LITE_POWER_HIGH;
LOG(WARNING) << "LOW POWER MODE is not support, switch to big cores";
if (threads > big_core_size) {
active_ids_ = dev.big_core_ids_;
} else {
for (int i = 0; i < threads; ++i) {
active_ids_.push_back(dev.big_core_ids_[i]);
}
}
}
if (active_ids_.size() == 0) {
active_ids_.push_back(0);
}
break;
}
//! fix multi-threads LITE_POWER_HIGH mode
// Multi-threaded or unbound: just set the OMP thread count, no pinning.
if (mode_ == LITE_POWER_NO_BIND || threads > 1) {
int threads = active_ids_.size();
omp_set_num_threads(threads);
} else {
// Single bound thread: pin it only when the chosen core is online.
if (check_online(active_ids_)) {
BindDev();
} else {
LOG(ERROR) << "core id " << active_ids_[0]
<< " is offline, switch to NO BIND MODE";
int threads = active_ids_.size();
omp_set_num_threads(threads);
}
}
#else
// Non-OMP build: run single-threaded on the first big core (or core 0).
if (big_core_size > 0) {
active_ids_ = {dev.big_core_ids_[0]};
} else {
active_ids_ = {0};
}
#endif
//! alloc memory for sgemm in this context
int temp_mem_size =
DeviceInfo::Global().L2_cache_[active_ids_[0]] / sizeof(float);
workspace_.Resize({temp_mem_size});
arch_ = DeviceInfo::Global().archs_[active_ids_[0]];
}
// Architecture of the cores this context runs on.
ARMArch Context<TargetType::kARM>::arch() const { return arch_; }
// Overrides the recorded core architecture.
void Context<TargetType::kARM>::SetArch(ARMArch arch) { arch_ = arch; }
// L1 data-cache size in bytes of the first active core.
int Context<TargetType::kARM>::l1_cache_size() const {
DeviceInfo& dev = DeviceInfo::Global();
return dev.L1_cache_[active_ids_[0]];
}
// L2 cache size in bytes of the first active core.
int Context<TargetType::kARM>::l2_cache_size() const {
DeviceInfo& dev = DeviceInfo::Global();
return dev.L2_cache_[active_ids_[0]];
}
// L3 cache size in bytes of the first active core.
int Context<TargetType::kARM>::l3_cache_size() const {
DeviceInfo& dev = DeviceInfo::Global();
return dev.L3_cache_[active_ids_[0]];
}
// Resizes the scratch workspace to hold `dims` worth of elements plus
// L2-cache-sized headroom. Returns false when the requested element
// count already equals the current workspace size (no resize), true
// after a resize.
// NOTE(review): only exact equality skips the resize, so a *smaller*
// request still reallocates -- confirm this is intended.
bool Context<TargetType::kARM>::ExtendWorkspace(DDimLite dims) {
  const auto requested = dims.product();
  if (requested == workspace_.dims().product()) {
    return false;
  }
  workspace_.Resize(
      {static_cast<int64_t>(requested + l2_cache_size() / sizeof(float))});
  return true;
}
#endif // LITE_WITH_ARM
} // namespace lite
namespace lite {} // namespace lite
} // namespace paddle
......@@ -61,47 +61,41 @@ class Context<TargetType::kHost> {
template <>
class Context<TargetType::kARM> {
public:
Context();
Context(PowerMode mode, int threads);
Context() {}
explicit Context(const ARMContext& ctx);
ARMContext& operator=(const ARMContext& ctx);
ARMContext& operator=(const ARMContext& ctx) {}
// NOTE: InitOnce should only be used by ContextScheduler
void InitOnce() { DeviceInfo::Init(); }
void CopyShared(const ARMContext* ctx) {}
void SetRunMode(PowerMode mode, int threads);
void SetCache(int l1size, int l2size, int l3size);
void SetArch(ARMArch arch);
void BindDev();
void SetRunMode(PowerMode mode, int threads) {
return DeviceInfo::Global().SetRunMode(mode, threads);
}
void SetCache(int l1size, int l2size, int l3size) {
return DeviceInfo::Global().SetCache(l1size, l2size, l3size);
}
void SetArch(ARMArch arch) { return DeviceInfo::Global().SetArch(arch); }
PowerMode mode() const;
int threads() const;
ARMArch arch() const;
PowerMode mode() const { return DeviceInfo::Global().mode(); }
int threads() const { return DeviceInfo::Global().threads(); }
ARMArch arch() const { return DeviceInfo::Global().arch(); }
int l1_cache_size() const { return DeviceInfo::Global().l1_cache_size(); }
int l2_cache_size() const { return DeviceInfo::Global().l2_cache_size(); }
int l3_cache_size() const { return DeviceInfo::Global().l3_cache_size(); }
template <typename T>
T* workspace_data() {
return workspace_.mutable_data<T>();
return DeviceInfo::Global().workspace_data<T>();
}
int l1_cache_size() const;
int l2_cache_size() const;
int l3_cache_size() const;
bool ExtendWorkspace(DDimLite dims);
bool ExtendWorkspace(DDimLite dims) {
return DeviceInfo::Global().ExtendWorkspace(dims);
}
std::string name() const { return "ARMContext"; }
private:
// LITE_POWER_HIGH stands for using big cores,
// LITE_POWER_LOW stands for using small core,
// LITE_POWER_FULL stands for using all cores
ARMArch arch_;
PowerMode mode_;
std::vector<int> active_ids_;
TensorLite workspace_;
int64_t count_{0};
};
#endif
......
......@@ -12,312 +12,81 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#ifdef LITE_WITH_LINUX
#include <sys/syscall.h>
#include <unistd.h>
#endif
#if __APPLE__
#include "TargetConditionals.h"
#if TARGET_OS_IPHONE
#include <mach/machine.h>
#include <sys/sysctl.h>
#include <sys/types.h>
#endif // TARGET_OS_IPHONE
#endif // __APPLE__
#ifdef ARM_WITH_OMP
#include <omp.h>
#endif
#include <algorithm>
#include <limits>
#include "paddle/fluid/lite/core/cpu_info.h"
#include <cstdarg>
namespace paddle {
namespace lite {
#ifdef LITE_WITH_ARM
void DeviceInfo::InitInternal(DeviceInfo* dev) {
set_default_cache(dev);
dev->compute_core_num_ = arm_get_cpucount();
dev->max_memory_ = arm_get_meminfo();
// get max freq
#ifdef LITE_WITH_LINUX
std::vector<int> max_freq(dev->compute_core_num_);
for (int i = 0; i < dev->compute_core_num_; ++i) {
max_freq[i] = get_max_freq_khz(i) / 1000;
}
std::string cpu_name = arm_get_cpu_name();
if (get_cpu_info_from_name(dev, cpu_name) != true) {
arm_sort_cpuid_by_max_frequency(dev->compute_core_num_, &dev->core_ids_,
max_freq, &dev->cluster_ids_);
dev->big_core_ids_.clear();
dev->little_core_ids_.clear();
for (int i = 0; i < dev->cluster_ids_.size(); ++i) {
if (dev->cluster_ids_[i] == 0) {
dev->big_core_ids_.push_back(dev->core_ids_[i]);
} else {
dev->little_core_ids_.push_back(dev->core_ids_[i]);
}
}
arm_get_cpu_arch(&dev->archs_);
}
LOG(INFO) << "ARM multiprocessors number: " << dev->compute_core_num_;
for (int i = 0; i < dev->compute_core_num_; ++i) {
LOG(INFO) << "ARM multiprocessors ID: " << dev->core_ids_[i]
<< ", frequence: " << max_freq[i]
<< ", cluster ID: " << dev->cluster_ids_[dev->core_ids_[i]]
<< ", CPU ARCH: A" << dev->archs_[i];
}
VLOG(1) << "L1 DataCache size is: ";
for (int i = 0; i < dev->compute_core_num_; ++i) {
VLOG(1) << dev->L1_cache_[i] / 1024 << " KB";
}
VLOG(1) << "L2 Cache size is: ";
for (int i = 0; i < dev->compute_core_num_; ++i) {
VLOG(1) << dev->L2_cache_[i] / 1024 << " KB";
}
VLOG(1) << "Total memory: " << dev->max_memory_ << "KB";
dev->max_freq_ = max_freq[0];
for (int j = 1; j < dev->compute_core_num_; ++j) {
if (dev->max_freq_ < max_freq[j]) {
dev->max_freq_ = max_freq[j];
}
}
#elif defined(TARGET_IOS)
arm_get_cpu_arch(&dev->archs_);
#ifdef TARGET_IOS
const int DEFAULT_L1_CACHE_SIZE = 64 * 1024;
const int DEFAULT_L2_CACHE_SIZE = 2048 * 1024;
const int DEFAULT_L3_CACHE_SIZE = 0;
#else
const int DEFAULT_L1_CACHE_SIZE = 32 * 1024;
const int DEFAULT_L2_CACHE_SIZE = 512 * 1024;
const int DEFAULT_L3_CACHE_SIZE = 0;
#endif
}
// cache_id : 0 -> L1, 1 -> L2, 2 -> L3
void set_cache_info(DeviceInfo* cpu_info, int cache_id, int argc, ...) {
va_list arg_ptr;
va_start(arg_ptr, argc);
std::vector<int>* cache;
switch (cache_id) {
case 0:
cache = &cpu_info->L1_cache_;
break;
case 1:
cache = &cpu_info->L2_cache_;
break;
case 2:
cache = &cpu_info->L3_cache_;
break;
default:
int get_cpu_num() {
#ifdef LITE_WITH_LINUX
// get cpu count from /sys/devices/system/cpu/cpunum/uevent
int max_cpu_num = 20;
int cpu_num = 0;
for (int i = 0; i < max_cpu_num; ++i) {
char path[256];
snprintf(path, sizeof(path), "/sys/devices/system/cpu/cpu%d/uevent", i);
FILE* fp = fopen(path, "rb");
if (!fp) {
break;
}
int core_num = cpu_info->compute_core_num_;
cache->resize(core_num);
if (argc == 1) {
int cache_size = va_arg(arg_ptr, int);
for (int i = 0; i < core_num; ++i) {
(*cache)[i] = cache_size;
}
} else {
int big_core_num = cpu_info->big_core_ids_.size();
int little_core_num = cpu_info->little_core_ids_.size();
int big_core_cache_size = va_arg(arg_ptr, int);
int little_core_cache_size = va_arg(arg_ptr, int);
for (int i = 0; i < big_core_num; ++i) {
(*cache)[cpu_info->big_core_ids_[i]] = big_core_cache_size;
}
for (int i = 0; i < little_core_num; ++i) {
(*cache)[cpu_info->little_core_ids_[i]] = little_core_cache_size;
}
cpu_num++;
fclose(fp);
}
va_end(arg_ptr);
}
void set_arch_info(DeviceInfo* cpu_info, int argc, ...) {
va_list arg_ptr;
va_start(arg_ptr, argc);
int core_num = cpu_info->compute_core_num_;
cpu_info->archs_.resize(core_num);
if (argc == 1) {
ARMArch arch = (ARMArch)va_arg(arg_ptr, int);
for (int i = 0; i < core_num; ++i) {
cpu_info->archs_[i] = arch;
}
} else {
ARMArch big_core_arch = (ARMArch)va_arg(arg_ptr, int);
ARMArch little_core_arch = (ARMArch)va_arg(arg_ptr, int);
int big_core_num = cpu_info->big_core_ids_.size();
int little_core_num = cpu_info->little_core_ids_.size();
for (int i = 0; i < big_core_num; ++i) {
cpu_info->archs_[cpu_info->big_core_ids_[i]] = big_core_arch;
}
for (int i = 0; i < little_core_num; ++i) {
cpu_info->archs_[cpu_info->little_core_ids_[i]] = little_core_arch;
}
if (cpu_num < 1) {
cpu_num = 1;
}
va_end(arg_ptr);
}
bool get_cpu_info_from_name(DeviceInfo* cpu_info, std::string hardware_name) {
/* Snapdragon */
if (hardware_name.find("SDM845") != std::string::npos) { // 845
cpu_info->compute_core_num_ = 8;
cpu_info->core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
cpu_info->big_core_ids_ = {4, 5, 6, 7};
cpu_info->little_core_ids_ = {0, 1, 2, 3};
cpu_info->cluster_ids_ = {1, 1, 1, 1, 0, 0, 0, 0};
set_arch_info(cpu_info, 2, kA75, kA55);
set_cache_info(cpu_info, 0, 1, 32 * 1024);
set_cache_info(cpu_info, 1, 2, 256 * 1024, 128 * 1024);
set_cache_info(cpu_info, 2, 1, 2048 * 1024);
return true;
} else if (hardware_name.find("SDM710") != std::string::npos) { // 710
cpu_info->compute_core_num_ = 8;
cpu_info->core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
cpu_info->big_core_ids_ = {6, 7};
cpu_info->little_core_ids_ = {0, 1, 2, 3, 4, 5};
cpu_info->cluster_ids_ = {1, 1, 1, 1, 1, 1, 0, 0};
set_arch_info(cpu_info, 2, kA75, kA55);
return true;
} else if (hardware_name.find("MSM8998") != std::string::npos) { // 835
cpu_info->compute_core_num_ = 8;
cpu_info->core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
cpu_info->big_core_ids_ = {4, 5, 6, 7};
cpu_info->little_core_ids_ = {0, 1, 2, 3};
cpu_info->cluster_ids_ = {1, 1, 1, 1, 0, 0, 0, 0};
set_arch_info(cpu_info, 2, kA73, kA53);
set_cache_info(cpu_info, 0, 2, 64 * 1024);
set_cache_info(cpu_info, 1, 2, 1024 * 1024,
/*real cache size is 2M, while that will get bad performace
on conv3x3s1 or gemm, set to 1M or 512K*/
1024 * 1024);
return true;
} else if (hardware_name.find("MSM8996") != std::string::npos) { // 820
cpu_info->compute_core_num_ = 4;
cpu_info->core_ids_ = {0, 1, 2, 3};
cpu_info->big_core_ids_ = {2, 3};
cpu_info->little_core_ids_ = {0, 1};
cpu_info->cluster_ids_ = {1, 1, 0, 0};
set_arch_info(cpu_info, 1, kA72);
set_cache_info(cpu_info, 0, 1, 24 * 1024);
set_cache_info(cpu_info, 1, 2, 1024 * 1024, 512 * 1024);
return true;
} else if (hardware_name.find("SDM660") != std::string::npos ||
hardware_name.find("SDM636") != std::string::npos) { // 660, 636
cpu_info->compute_core_num_ = 8;
cpu_info->core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
cpu_info->big_core_ids_ = {4, 5, 6, 7};
cpu_info->little_core_ids_ = {0, 1, 2, 3};
cpu_info->cluster_ids_ = {1, 1, 1, 1, 0, 0, 0, 0};
set_arch_info(cpu_info, 1, kA73);
set_cache_info(cpu_info, 0, 2, 64 * 1024, 32 * 1024);
set_cache_info(cpu_info, 1, 1, 1024 * 1024);
return true;
} else if (hardware_name.find("MSM8976") != std::string::npos) { // 652,653
cpu_info->compute_core_num_ = 8;
cpu_info->core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
cpu_info->big_core_ids_ = {4, 5, 6, 7};
cpu_info->little_core_ids_ = {0, 1, 2, 3};
cpu_info->cluster_ids_ = {1, 1, 1, 1, 0, 0, 0, 0};
set_arch_info(cpu_info, 2, kA72, kA53);
set_cache_info(cpu_info, 0, 1, 32 * 1024);
set_cache_info(cpu_info, 1, 2, 1024 * 1024, 512 * 1024);
return true;
} else if (hardware_name.find("MSM8953") != std::string::npos) { // 625
cpu_info->compute_core_num_ = 8;
cpu_info->core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
cpu_info->big_core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
cpu_info->little_core_ids_ = {};
cpu_info->cluster_ids_ = {0, 0, 0, 0, 0, 0, 0, 0};
set_arch_info(cpu_info, 1, kA53);
set_cache_info(cpu_info, 0, 1, 32 * 1024);
set_cache_info(cpu_info, 1, 1, 1024 * 1024);
return true;
} else if (hardware_name.find("MSM8939") != std::string::npos) { // 615
cpu_info->compute_core_num_ = 8;
cpu_info->core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
cpu_info->big_core_ids_ = {0, 1, 2, 3};
cpu_info->little_core_ids_ = {4, 5, 6, 7};
cpu_info->cluster_ids_ = {0, 0, 0, 0, 1, 1, 1, 1};
set_arch_info(cpu_info, 1, kA53);
set_cache_info(cpu_info, 0, 1, 32 * 1024);
set_cache_info(cpu_info, 1, 2, 512 * 1024, 256 * 1024);
return true;
/* MediaTek */
} else if (hardware_name.find("MT6797") !=
std::string::npos) { // X20/X23/X25/X27
cpu_info->compute_core_num_ = 10;
cpu_info->core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
cpu_info->big_core_ids_ = {8, 9};
cpu_info->little_core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
cpu_info->cluster_ids_ = {1, 1, 1, 1, 1, 1, 1, 1, 0, 0};
set_arch_info(cpu_info, 2, kA72, kA53);
set_cache_info(cpu_info, 0, 1, 32 * 1024);
set_cache_info(cpu_info, 1, 2, 1024 * 1024, 512 * 1024);
return true;
} else if (hardware_name.find("MT6799") != std::string::npos) { // X30
cpu_info->compute_core_num_ = 10;
cpu_info->core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
cpu_info->big_core_ids_ = {8, 9};
cpu_info->little_core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
cpu_info->cluster_ids_ = {1, 1, 1, 1, 1, 1, 1, 1, 0, 0};
set_arch_info(cpu_info, 2, kA73, kA53);
return true;
} else if (hardware_name.find("MT6795") != std::string::npos ||
hardware_name.find("MT6762") != std::string::npos ||
hardware_name.find("MT6755T") != std::string::npos ||
hardware_name.find("MT6755S") != std::string::npos ||
hardware_name.find("MT6753") != std::string::npos ||
hardware_name.find("MT6752") != std::string::npos ||
hardware_name.find("MT6750") != std::string::npos) {
// X10, P22, P15/P18, MT6753, MT6752/MT6752M, MT6750
cpu_info->compute_core_num_ = 8;
cpu_info->core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
cpu_info->big_core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
cpu_info->little_core_ids_ = {};
cpu_info->cluster_ids_ = {0, 0, 0, 0, 0, 0, 0, 0};
set_arch_info(cpu_info, 1, kA53);
return true;
} else if (hardware_name.find("MT6758") != std::string::npos ||
hardware_name.find("MT6757") != std::string::npos ||
hardware_name.find("MT6763") != std::string::npos ||
hardware_name.find("MT6755M") != std::string::npos ||
hardware_name.find("MT6755") !=
std::string::npos) { // P30, P20/P25, P23, P10
cpu_info->compute_core_num_ = 8;
cpu_info->core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
cpu_info->big_core_ids_ = {4, 5, 6, 7};
cpu_info->little_core_ids_ = {0, 1, 2, 3};
cpu_info->cluster_ids_ = {1, 1, 1, 1, 0, 0, 0, 0};
set_arch_info(cpu_info, 1, kA53);
return true;
} else if (hardware_name.find("MT6771") != std::string::npos) { // P60
cpu_info->compute_core_num_ = 8;
cpu_info->core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
cpu_info->big_core_ids_ = {4, 5, 6, 7};
cpu_info->little_core_ids_ = {0, 1, 2, 3};
cpu_info->cluster_ids_ = {1, 1, 1, 1, 0, 0, 0, 0};
set_arch_info(cpu_info, 2, kA73, kA53);
return true;
} else if (hardware_name.find("MT6765") != std::string::npos ||
hardware_name.find("MT6739") != std::string::npos ||
hardware_name.find("MT6738") != std::string::npos ||
hardware_name.find("MT6737") !=
std::string::npos) { // A22, MT6739, MT6738, MT6767
cpu_info->compute_core_num_ = 4;
cpu_info->core_ids_ = {0, 1, 2, 3};
cpu_info->big_core_ids_ = {0, 0, 0, 0};
cpu_info->little_core_ids_ = {};
cpu_info->cluster_ids_ = {0, 0, 0, 0};
set_arch_info(cpu_info, 1, kA53);
return true;
return cpu_num;
#elif defined(TARGET_IOS)
int cpu_num = 0;
size_t len = sizeof(cpu_num);
sysctlbyname("hw.ncpu", &cpu_num, &len, NULL, 0);
if (cpu_num < 1) {
cpu_num = 1;
}
return false;
return cpu_num;
#else
return 1;
#endif
}
size_t arm_get_meminfo() {
size_t get_mem_size() {
#ifdef LITE_WITH_LINUX
// get cpu count from /proc/cpuinfo
FILE* fp = fopen("/proc/meminfo", "rb");
if (!fp) {
return 1;
}
size_t memsize = 0;
char line[1024];
while (!feof(fp)) {
......@@ -327,52 +96,18 @@ size_t arm_get_meminfo() {
}
sscanf(s, "MemTotal: %d kB", &memsize);
}
fclose(fp);
return memsize;
#elif defined(TARGET_IOS)
// to be implemented
printf("not implemented\n");
return 0;
#endif
}
int arm_get_cpucount() {
#ifdef LITE_WITH_LINUX
// get cpu count from /sys/devices/system/cpu/cpunum/uevent
int max_cpu_count = 20;
int count = 0;
for (int i = 0; i < max_cpu_count; ++i) {
char path[256];
snprintf(path, sizeof(path), "/sys/devices/system/cpu/cpu%d/uevent", i);
FILE* fp = fopen(path, "rb");
if (!fp) {
break;
}
count++;
fclose(fp);
}
if (count < 1) {
count = 1;
}
return count;
#elif defined(TARGET_IOS)
int count = 0;
size_t len = sizeof(count);
sysctlbyname("hw.ncpu", &count, &len, NULL, 0);
if (count < 1) {
count = 1;
}
return count;
#else
return 1;
#endif
return 0;
}
void arm_get_cpu_arch(std::vector<ARMArch>* archs) {
#ifdef LITE_WITH_LINUX
void get_cpu_arch(std::vector<ARMArch>* archs, const int cpu_num) {
archs->clear();
#ifdef LITE_WITH_LINUX
//! get CPU ARCH
FILE* fp = fopen("/proc/cpuinfo", "rb");
if (!fp) {
......@@ -406,6 +141,29 @@ void arm_get_cpu_arch(std::vector<ARMArch>* archs) {
case 0xd0a:
archs->push_back(kA75);
break;
case 0xd40:
archs->push_back(kA76);
break;
case 0x804:
// 855
archs->push_back(kA76);
break;
case 0x805:
// 855
archs->push_back(kA55);
break;
case 0x802:
// 845
archs->push_back(kA75);
break;
case 0x803:
// 845
archs->push_back(kA55);
break;
case 0x801:
// 835
archs->push_back(kA73);
break;
case 0x800:
// 835
archs->push_back(kA73);
......@@ -415,49 +173,31 @@ void arm_get_cpu_arch(std::vector<ARMArch>* archs) {
archs->push_back(kA72);
break;
default:
LOG(ERROR) << "unknow type";
LOG(ERROR) << "Unknow cpu arch: " << arch_id;
archs->push_back(kARMArch_UNKOWN);
}
}
}
fclose(fp);
int cpu_count = arm_get_cpucount();
if (archs->size() < cpu_count) {
for (int i = archs->size(); i < cpu_count; ++i) {
if (archs->size() < cpu_num) {
for (int i = archs->size(); i < cpu_num; ++i) {
archs->push_back(archs->at(i - 1));
}
}
#endif
#ifdef TARGET_IOS
int cpu_count = arm_get_cpucount();
for (int i = 0; i < cpu_count; ++i) {
#elif defined(TARGET_IOS)
for (int i = 0; i < cpu_num; ++i) {
archs->push_back(APPLE);
}
#else
for (int i = 0; i < cpu_num; ++i) {
archs->push_back(kARMArch_UNKOWN);
}
#endif
}
#ifdef LITE_WITH_LINUX
void set_default_cache(DeviceInfo* dev) {
int cpu_count = arm_get_cpucount();
dev->L1_cache_.resize(cpu_count);
dev->L2_cache_.resize(cpu_count);
dev->L3_cache_.resize(cpu_count);
#ifdef TARGET_IOS
for (int i = 0; i < cpu_count; ++i) {
dev->L1_cache_[i] = 64 * 1024;
dev->L2_cache_[i] = 2048 * 1024;
dev->L3_cache_[i] = 0;
}
#else
for (int i = 0; i < cpu_count; ++i) {
dev->L1_cache_[i] = 32 * 1024;
dev->L2_cache_[i] = 512 * 1024;
dev->L3_cache_[i] = 0;
}
#endif
}
std::string arm_get_cpu_name() {
std::string get_cpu_name() {
FILE* fp = fopen("/proc/cpuinfo", "rb");
if (!fp) {
return "";
......@@ -477,122 +217,163 @@ std::string arm_get_cpu_name() {
return "";
}
int get_max_freq_khz(int cpuid) {
void get_cpu_max_min_freq(int cpu_id, int* max_freq, int* min_freq) {
*max_freq = 0;
*min_freq = 0;
// first try, for all possible cpu
char path[256];
snprintf(path, sizeof(path),
"/sys/devices/system/cpu/cpufreq/stats/cpu%d/time_in_state", cpuid);
"/sys/devices/system/cpu/cpufreq/stats/cpu%d/time_in_state", cpu_id);
FILE* fp = fopen(path, "rb");
if (!fp) {
// second try, for online cpu
snprintf(path, sizeof(path),
"/sys/devices/system/cpu/cpu%d/cpufreq/stats/time_in_state",
cpuid);
cpu_id);
fp = fopen(path, "rb");
if (!fp) {
// third try, for online cpu
// get max_freq
snprintf(path, sizeof(path),
"/sys/devices/system/cpu/cpu%d/cpufreq/cpuinfo_max_freq", cpuid);
"/sys/devices/system/cpu/cpu%d/cpufreq/cpuinfo_max_freq",
cpu_id);
fp = fopen(path, "rb");
if (!fp) {
return -1;
return;
}
int max_freq_khz = -1;
fscanf(fp, "%d", &max_freq_khz);
fscanf(fp, "%d", max_freq);
fclose(fp);
return max_freq_khz;
// get min_freq
snprintf(path, sizeof(path),
"/sys/devices/system/cpu/cpu%d/cpufreq/cpuinfo_min_freq",
cpu_id);
fp = fopen(path, "rb");
if (!fp) {
return;
}
fscanf(fp, "%d", min_freq);
fclose(fp);
return;
}
}
int max_freq_khz = 0;
*min_freq = std::numeric_limits<int>::max();
while (!feof(fp)) {
int freq_khz = 0;
int nscan = fscanf(fp, "%d %*d", &freq_khz);
int freq = 0;
int nscan = fscanf(fp, "%d %*d", &freq);
if (nscan != 1) {
break;
}
if (freq_khz > max_freq_khz) {
max_freq_khz = freq_khz;
if (freq > *max_freq) {
*max_freq = freq;
}
if (freq < *min_freq) {
*min_freq = freq;
}
}
fclose(fp);
return max_freq_khz;
}
int arm_sort_cpuid_by_max_frequency(int cpu_count, std::vector<int>* cpuids,
const std::vector<int>& cpu_freq,
std::vector<int>* cluster_ids) {
if (cpu_count == 0) {
return 0;
void sort_cpuid_by_max_freq(const std::vector<int>& max_freqs,
std::vector<int>* cpu_ids,
std::vector<int>* cluster_ids) {
int cpu_num = max_freqs.size();
if (cpu_num == 0) {
return;
}
cpuids->resize(cpu_count);
cluster_ids->resize(cpu_count);
for (int i = 0; i < cpu_count; i++) {
cpuids->at(i) = i;
cpu_ids->resize(cpu_num);
cluster_ids->resize(cpu_num);
for (int i = 0; i < cpu_num; i++) {
cpu_ids->at(i) = i;
}
// sort cpuid as big core first
// simple bubble sort
for (int i = 0; i < cpu_count; i++) {
for (int j = i + 1; j < cpu_count; j++) {
if (cpu_freq[i] < cpu_freq[j]) {
for (int i = 0; i < cpu_num; i++) {
for (int j = i + 1; j < cpu_num; j++) {
if (max_freqs[i] < max_freqs[j]) {
// swap
int tmp = cpuids->at(i);
cpuids->at(i) = cpuids->at(j);
cpuids->at(j) = tmp;
int tmp = cpu_ids->at(i);
cpu_ids->at(i) = cpu_ids->at(j);
cpu_ids->at(j) = tmp;
}
}
}
// SMP
int mid_max_freq_khz =
(cpu_freq[cpuids->at(0)] + cpu_freq[cpuids->at(cpu_count - 1)]) / 2;
int mid_max_freq =
(max_freqs[cpu_ids->at(0)] + max_freqs[cpu_ids->at(cpu_num - 1)]) / 2;
for (int i = 0; i < cpu_count; i++) {
cpuids->at(i) = i;
if (cpu_freq[i] >= mid_max_freq_khz) {
for (int i = 0; i < cpu_num; i++) {
cpu_ids->at(i) = i;
if (max_freqs[i] >= mid_max_freq) {
cluster_ids->at(i) = 0;
} else {
cluster_ids->at(i) = 1;
}
}
return 0;
}
int check_online(const std::vector<int>& core_ids) {
if (core_ids.size() == 0) {
return 0;
// Reads the L1/L2/L3 data-cache sizes (in bytes) of one core from the sysfs
// cache-index entries. Any level that is missing or unreadable keeps the
// compiled-in default size.
void get_cpu_cache_size(int cpu_id, int* l1_cache_size, int* l2_cache_size,
                        int* l3_cache_size) {
  *l1_cache_size = DEFAULT_L1_CACHE_SIZE;
  *l2_cache_size = DEFAULT_L2_CACHE_SIZE;
  *l3_cache_size = DEFAULT_L3_CACHE_SIZE;
  const int max_cache_idx_num = 10;
  char path[256];
  for (int index = 0; index < max_cache_idx_num; ++index) {
    snprintf(path, sizeof(path),
             "/sys/devices/system/cpu/cpu%d/cache/index%d/level", cpu_id,
             index);
    FILE* fp = fopen(path, "rb");
    if (!fp) {
      continue;
    }
    int level = -1;
    fscanf(fp, "%d", &level);
    fclose(fp);
    snprintf(path, sizeof(path),
             "/sys/devices/system/cpu/cpu%d/cache/index%d/size", cpu_id,
             index);
    fp = fopen(path, "rb");
    if (!fp) {
      continue;
    }
    int size = -1;
    fscanf(fp, "%d", &size);
    fclose(fp);
    if (size < 0) {
      continue;
    }
    // sysfs reports the size in KB (e.g. "32K"); convert to bytes.
    if (level == 1) {
      *l1_cache_size = size * 1024;
    } else if (level == 2) {
      *l2_cache_size = size * 1024;
    } else if (level == 3) {
      *l3_cache_size = size * 1024;
    }
  }
}
bool check_cpu_online(const std::vector<int>& cpu_ids) {
if (cpu_ids.size() == 0) {
return false;
}
char path[256];
int online = 1;
for (int i = 0; i < core_ids.size(); ++i) {
bool all_online = true;
for (int i = 0; i < cpu_ids.size(); ++i) {
snprintf(path, sizeof(path), "/sys/devices/system/cpu/cpu%d/online",
core_ids[i]);
cpu_ids[i]);
FILE* fp = fopen(path, "rb");
if (!fp) {
return 0;
int is_online = 0;
if (fp) {
fscanf(fp, "%d", &is_online);
fclose(fp);
} else {
LOG(ERROR) << "Failed to query the online statue of CPU id:"
<< cpu_ids[i];
}
if (is_online == 0) {
all_online = false;
LOG(ERROR) << "CPU id:" << cpu_ids[i] << " is offine";
}
int cur_online = 0;
fscanf(fp, "%d", &cur_online);
online &= cur_online;
fclose(fp);
}
return online;
return all_online;
}
int set_sched_affinity(const std::vector<int>& cpuids) {
int set_sched_affinity(const std::vector<int>& cpu_ids) {
// #define CPU_SETSIZE 1024
// #define __NCPUBITS (8 * sizeof (unsigned long))
// typedef struct
......@@ -608,20 +389,569 @@ int set_sched_affinity(const std::vector<int>& cpuids) {
#endif
cpu_set_t mask;
CPU_ZERO(&mask);
for (int i = 0; i < cpuids.size(); i++) {
CPU_SET(cpuids[i], &mask);
for (int i = 0; i < cpu_ids.size(); ++i) {
CPU_SET(cpu_ids[i], &mask);
}
int syscallret = syscall(__NR_sched_setaffinity, pid, sizeof(mask), &mask);
if (syscallret) {
LOG(ERROR) << "syscall error " << syscallret;
return -1;
}
return 0;
}
bool bind_threads(const std::vector<int> cpu_ids) {
#ifdef ARM_WITH_OMP
int thread_num = cpu_ids.size();
omp_set_num_threads(thread_num);
std::vector<int> ssarets;
for (int i = 0; i < thread_num; ++i) {
ssarets.push_back(0);
}
#pragma omp parallel for
for (int i = 0; i < thread_num; i++) {
ssarets[i] = set_sched_affinity(cpu_ids);
}
for (int i = 0; i < thread_num; i++) {
if (ssarets[i] != 0) {
LOG(ERROR) << "Set cpu affinity failed, core id: " << cpu_ids[i];
return false;
}
}
#else // ARM_WITH_OMP
std::vector<int> first_cpu_id;
first_cpu_id.push_back(cpu_ids[0]);
int ssaret = set_sched_affinity(first_cpu_id);
if (ssaret != 0) {
LOG(ERROR) << "Set cpu affinity failed, core id: " << cpu_ids[0];
return false;
}
#endif // ARM_WITH_OMP
}
#endif // LITE_WITH_LINUX
// cache_id : 0 -> L1, 1 -> L2, 2 -> L3
void DeviceInfo::SetCacheInfo(int cache_id, int argc, ...) {
va_list arg_ptr;
va_start(arg_ptr, argc);
std::vector<int>* cache;
switch (cache_id) {
case 0:
cache = &L1_cache_;
break;
case 1:
cache = &L2_cache_;
break;
case 2:
cache = &L3_cache_;
break;
default:
break;
}
cache->resize(core_num_);
if (argc == 1) {
int cache_size = va_arg(arg_ptr, int);
for (int i = 0; i < core_num_; ++i) {
(*cache)[i] = cache_size;
}
} else {
int big_core_num = big_core_ids_.size();
int little_core_num = little_core_ids_.size();
int big_core_cache_size = va_arg(arg_ptr, int);
int little_core_cache_size = va_arg(arg_ptr, int);
for (int i = 0; i < big_core_num; ++i) {
(*cache)[big_core_ids_[i]] = big_core_cache_size;
}
for (int i = 0; i < little_core_num; ++i) {
(*cache)[little_core_ids_[i]] = little_core_cache_size;
}
}
va_end(arg_ptr);
}
void DeviceInfo::SetArchInfo(int argc, ...) {
va_list arg_ptr;
va_start(arg_ptr, argc);
archs_.resize(core_num_);
if (argc == 1) {
ARMArch arch = (ARMArch)va_arg(arg_ptr, int);
for (int i = 0; i < core_num_; ++i) {
archs_[i] = arch;
}
} else {
ARMArch big_core_arch = (ARMArch)va_arg(arg_ptr, int);
ARMArch little_core_arch = (ARMArch)va_arg(arg_ptr, int);
int big_core_num = big_core_ids_.size();
int little_core_num = little_core_ids_.size();
for (int i = 0; i < big_core_num; ++i) {
archs_[big_core_ids_[i]] = big_core_arch;
}
for (int i = 0; i < little_core_num; ++i) {
archs_[little_core_ids_[i]] = little_core_arch;
}
}
va_end(arg_ptr);
}
// Fills core topology, architecture and cache sizes from a hard-coded table
// keyed on the SoC name in dev_name_ (as read from /proc/cpuinfo).
// Returns true when the SoC was recognized, false when the caller should
// fall back to probing (SetCPUInfoByProb).
// Convention in this table: cluster id 0 = big cluster, 1 = little cluster.
bool DeviceInfo::SetCPUInfoByName() {
  /* Snapdragon */
  if (dev_name_.find("SM8150") != std::string::npos) {  // 855
    core_num_ = 8;
    core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
    big_core_ids_ = {4, 5, 6, 7};
    little_core_ids_ = {0, 1, 2, 3};
    cluster_ids_ = {1, 1, 1, 1, 0, 0, 0, 0};
    SetArchInfo(2, kA76, kA55);
    SetCacheInfo(0, 2, 64 * 1024, 32 * 1024);
    SetCacheInfo(1, 2, 256 * 1024, 128 * 1024);
    SetCacheInfo(2, 1, 2048 * 1024);
    return true;
  } else if (dev_name_.find("SDM845") != std::string::npos) {  // 845
    core_num_ = 8;
    core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
    big_core_ids_ = {4, 5, 6, 7};
    little_core_ids_ = {0, 1, 2, 3};
    cluster_ids_ = {1, 1, 1, 1, 0, 0, 0, 0};
    SetArchInfo(2, kA75, kA55);
    SetCacheInfo(0, 2, 64 * 1024, 32 * 1024);
    SetCacheInfo(1, 2, 256 * 1024, 128 * 1024);
    SetCacheInfo(2, 1, 2048 * 1024);
    return true;
  } else if (dev_name_.find("SDM710") != std::string::npos) {  // 710
    core_num_ = 8;
    core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
    big_core_ids_ = {6, 7};
    little_core_ids_ = {0, 1, 2, 3, 4, 5};
    cluster_ids_ = {1, 1, 1, 1, 1, 1, 0, 0};
    SetArchInfo(2, kA75, kA55);
    SetCacheInfo(0, 2, 64 * 1024, 32 * 1024);
    SetCacheInfo(1, 2, 256 * 1024, 128 * 1024);
    SetCacheInfo(2, 1, 1024 * 1024);
    return true;
  } else if (dev_name_.find("MSM8998") != std::string::npos) {  // 835
    core_num_ = 8;
    core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
    big_core_ids_ = {4, 5, 6, 7};
    little_core_ids_ = {0, 1, 2, 3};
    cluster_ids_ = {1, 1, 1, 1, 0, 0, 0, 0};
    SetArchInfo(2, kA73, kA53);
    SetCacheInfo(0, 2, 64 * 1024, 32 * 1024);
    SetCacheInfo(1, 2, 1024 * 1024,
                 /* real cache size is 2M, but that gives bad performance
                    on conv3x3s1 or gemm, so report 1M or 512K instead */
                 1024 * 1024);
    return true;
  } else if (dev_name_.find("MSM8996") != std::string::npos) {  // 820
    core_num_ = 4;
    core_ids_ = {0, 1, 2, 3};
    big_core_ids_ = {2, 3};
    little_core_ids_ = {0, 1};
    cluster_ids_ = {1, 1, 0, 0};
    SetArchInfo(1, kA72);
    SetCacheInfo(0, 1, 24 * 1024);
    SetCacheInfo(1, 2, 1024 * 1024, 512 * 1024);
    return true;
  } else if (dev_name_.find("SDM660") != std::string::npos ||
             dev_name_.find("SDM636") != std::string::npos) {  // 660, 636
    core_num_ = 8;
    core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
    big_core_ids_ = {4, 5, 6, 7};
    little_core_ids_ = {0, 1, 2, 3};
    cluster_ids_ = {1, 1, 1, 1, 0, 0, 0, 0};
    SetArchInfo(1, kA73);
    SetCacheInfo(0, 2, 64 * 1024, 32 * 1024);
    SetCacheInfo(1, 1, 1024 * 1024);
    return true;
  } else if (dev_name_.find("MSM8976") != std::string::npos) {  // 652,653
    core_num_ = 8;
    core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
    big_core_ids_ = {4, 5, 6, 7};
    little_core_ids_ = {0, 1, 2, 3};
    cluster_ids_ = {1, 1, 1, 1, 0, 0, 0, 0};
    SetArchInfo(2, kA72, kA53);
    SetCacheInfo(0, 1, 32 * 1024);
    SetCacheInfo(1, 2, 1024 * 1024, 512 * 1024);
    return true;
  } else if (dev_name_.find("MSM8953") != std::string::npos) {  // 625
    // Homogeneous SoC: all cores are treated as the "big" cluster.
    core_num_ = 8;
    core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
    big_core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
    little_core_ids_ = {};
    cluster_ids_ = {0, 0, 0, 0, 0, 0, 0, 0};
    SetArchInfo(1, kA53);
    SetCacheInfo(0, 1, 32 * 1024);
    SetCacheInfo(1, 1, 1024 * 1024);
    return true;
  } else if (dev_name_.find("MSM8939") != std::string::npos) {  // 615
    core_num_ = 8;
    core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
    big_core_ids_ = {0, 1, 2, 3};
    little_core_ids_ = {4, 5, 6, 7};
    cluster_ids_ = {0, 0, 0, 0, 1, 1, 1, 1};
    SetArchInfo(1, kA53);
    SetCacheInfo(0, 1, 32 * 1024);
    SetCacheInfo(1, 2, 512 * 1024, 256 * 1024);
    return true;
    /* MediaTek */
  } else if (dev_name_.find("MT6797") !=
             std::string::npos) {  // X20/X23/X25/X27
    core_num_ = 10;
    core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
    big_core_ids_ = {8, 9};
    little_core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
    cluster_ids_ = {1, 1, 1, 1, 1, 1, 1, 1, 0, 0};
    SetArchInfo(2, kA72, kA53);
    SetCacheInfo(0, 1, 32 * 1024);
    SetCacheInfo(1, 2, 1024 * 1024, 512 * 1024);
    return true;
  } else if (dev_name_.find("MT6799") != std::string::npos) {  // X30
    // NOTE: no cache info in this entry and below — those SoCs keep the
    // defaults set by Setup().
    core_num_ = 10;
    core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
    big_core_ids_ = {8, 9};
    little_core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
    cluster_ids_ = {1, 1, 1, 1, 1, 1, 1, 1, 0, 0};
    SetArchInfo(2, kA73, kA53);
    return true;
  } else if (dev_name_.find("MT6795") != std::string::npos ||
             dev_name_.find("MT6762") != std::string::npos ||
             dev_name_.find("MT6755T") != std::string::npos ||
             dev_name_.find("MT6755S") != std::string::npos ||
             dev_name_.find("MT6753") != std::string::npos ||
             dev_name_.find("MT6752") != std::string::npos ||
             dev_name_.find("MT6750") != std::string::npos) {
    // X10, P22, P15/P18, MT6753, MT6752/MT6752M, MT6750
    core_num_ = 8;
    core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
    big_core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
    little_core_ids_ = {};
    cluster_ids_ = {0, 0, 0, 0, 0, 0, 0, 0};
    SetArchInfo(1, kA53);
    return true;
  } else if (dev_name_.find("MT6758") != std::string::npos ||
             dev_name_.find("MT6757") != std::string::npos ||
             dev_name_.find("MT6763") != std::string::npos ||
             dev_name_.find("MT6755M") != std::string::npos ||
             dev_name_.find("MT6755") !=
                 std::string::npos) {  // P30, P20/P25, P23, P10
    core_num_ = 8;
    core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
    big_core_ids_ = {4, 5, 6, 7};
    little_core_ids_ = {0, 1, 2, 3};
    cluster_ids_ = {1, 1, 1, 1, 0, 0, 0, 0};
    SetArchInfo(1, kA53);
    return true;
  } else if (dev_name_.find("MT6771") != std::string::npos) {  // P60
    core_num_ = 8;
    core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
    big_core_ids_ = {4, 5, 6, 7};
    little_core_ids_ = {0, 1, 2, 3};
    cluster_ids_ = {1, 1, 1, 1, 0, 0, 0, 0};
    SetArchInfo(2, kA73, kA53);
    return true;
  } else if (dev_name_.find("MT6765") != std::string::npos ||
             dev_name_.find("MT6739") != std::string::npos ||
             dev_name_.find("MT6738") != std::string::npos ||
             dev_name_.find("MT6737") !=
                 std::string::npos) {  // A22, MT6739, MT6738, MT6767
    core_num_ = 4;
    core_ids_ = {0, 1, 2, 3};
    big_core_ids_ = {0, 1, 2, 3};
    little_core_ids_ = {};
    cluster_ids_ = {0, 0, 0, 0};
    SetArchInfo(1, kA53);
    return true;
  }
  // Unknown SoC: caller falls back to frequency-based probing.
  return false;
}
// Probes the big.LITTLE topology when the SoC name is unknown: cores are
// clustered by their max frequency, then per-core cache sizes are read
// from sysfs. Linux-only; a no-op elsewhere.
void DeviceInfo::SetCPUInfoByProb() {
#ifdef LITE_WITH_LINUX
  // Cluster id 0 = big cluster, 1 = little cluster.
  sort_cpuid_by_max_freq(max_freqs_, &core_ids_, &cluster_ids_);
  big_core_ids_.clear();
  little_core_ids_.clear();
  for (size_t idx = 0; idx < cluster_ids_.size(); ++idx) {
    if (cluster_ids_[idx] == 0) {
      big_core_ids_.push_back(core_ids_[idx]);
    } else {
      little_core_ids_.push_back(core_ids_[idx]);
    }
  }
  // Fill the per-core L1/L2/L3 cache tables from sysfs.
  for (int core = 0; core < core_num_; ++core) {
    get_cpu_cache_size(core, &L1_cache_[core], &L2_cache_[core],
                       &L3_cache_[core]);
  }
#endif  // LITE_WITH_LINUX
}
void DeviceInfo::RequestPowerFullMode(const int thread_num) {
int big_core_size = big_core_ids_.size();
int little_core_size = little_core_ids_.size();
active_ids_.clear();
for (int i = 0; i < thread_num; ++i) {
if (i < big_core_size) {
active_ids_.push_back(big_core_ids_[i]);
} else if (i < big_core_size + little_core_size) {
active_ids_.push_back(little_core_ids_[i - big_core_size]);
}
}
mode_ = LITE_POWER_FULL;
}
void DeviceInfo::RequestPowerHighMode(const int thread_num) {
int big_core_size = big_core_ids_.size();
int little_core_size = little_core_ids_.size();
active_ids_.clear();
if (big_core_size > 0) {
mode_ = LITE_POWER_HIGH;
if (thread_num > big_core_size) {
LOG(ERROR) << "Request thread num: " << thread_num
<< ", exceed the big cores size: " << big_core_size
<< ", truncate thread num to " << big_core_size;
active_ids_ = big_core_ids_;
} else {
for (int i = 0; i < thread_num; ++i) {
active_ids_.push_back(big_core_ids_[i]);
}
}
} else {
mode_ = LITE_POWER_LOW;
LOG(ERROR) << "HIGH POWER MODE is not support, switch to little cores.";
if (thread_num > little_core_size) {
active_ids_ = little_core_ids_;
} else {
for (int i = 0; i < thread_num; ++i) {
active_ids_.push_back(little_core_ids_[i]);
}
}
}
}
void DeviceInfo::RequestPowerLowMode(const int thread_num) {
int big_core_size = big_core_ids_.size();
int little_core_size = little_core_ids_.size();
active_ids_.clear();
if (little_core_size > 0) {
mode_ = LITE_POWER_LOW;
if (thread_num > little_core_size) {
LOG(WARNING) << "Request thread num: " << thread_num
<< ", exceed the little cores size: " << little_core_size
<< ", truncate thread num to " << little_core_size;
active_ids_ = little_core_ids_;
} else {
for (int i = 0; i < thread_num; i++) {
active_ids_.push_back(little_core_ids_[i]);
}
}
} else {
mode_ = LITE_POWER_HIGH;
LOG(WARNING) << "LOW POWER MODE is not support, switch to big cores";
if (thread_num > big_core_size) {
active_ids_ = big_core_ids_;
} else {
for (int i = 0; i < thread_num; i++) {
active_ids_.push_back(big_core_ids_[i]);
}
}
}
}
// LITE_POWER_NO_BIND: no CPU affinity is applied; active_ids_ only encodes
// the requested thread count (each slot holds placeholder core 0).
void DeviceInfo::RequestPowerNoBindMode(const int thread_num) {
  active_ids_.assign(thread_num, 0);
  mode_ = LITE_POWER_NO_BIND;
}
// LITE_POWER_RAND_HIGH: bind to big cores, rotating the starting core by
// shift_num to spread wear/heat across the cluster; degrades to LOW mode on
// the little cores when there is no big cluster.
void DeviceInfo::RequestPowerRandHighMode(const int shift_num,
                                          const int thread_num) {
  int big_core_size = big_core_ids_.size();
  int little_core_size = little_core_ids_.size();
  // BUGFIX: every other RequestPower*Mode clears active_ids_ first; without
  // this, repeated SetRunMode calls kept appending core ids.
  active_ids_.clear();
  if (big_core_size > 0) {
    mode_ = LITE_POWER_RAND_HIGH;
    if (thread_num > big_core_size) {
      LOG(WARNING) << "Request thread num: " << thread_num
                   << ", exceed the big cores size: " << big_core_size
                   << ", truncate thread num to " << big_core_size;
      active_ids_ = big_core_ids_;
    } else {
      for (int i = 0; i < thread_num; ++i) {
        active_ids_.push_back(big_core_ids_[(i + shift_num) % big_core_size]);
      }
    }
  } else {
    mode_ = LITE_POWER_LOW;
    LOG(WARNING) << "HIGH POWER MODE is not support, switch to little cores.";
    if (thread_num > little_core_size) {
      active_ids_ = little_core_ids_;
    } else {
      for (int i = 0; i < thread_num; ++i) {
        active_ids_.push_back(little_core_ids_[i]);
      }
    }
  }
}
void DeviceInfo::RequestPowerRandLowMode(const int shift_num,
const int thread_num) {
int big_core_size = big_core_ids_.size();
int little_core_size = little_core_ids_.size();
active_ids_.clear();
if (little_core_size > 0) {
mode_ = LITE_POWER_RAND_LOW;
if (thread_num > little_core_size) {
LOG(WARNING) << "Request thread num: " << thread_num
<< ", exceed the little cores size: " << little_core_size
<< ", truncate thread num to " << little_core_size;
active_ids_ = little_core_ids_;
} else {
for (int i = 0; i < thread_num; ++i) {
active_ids_.push_back(
little_core_ids_[(i + shift_num) % little_core_size]);
}
}
} else {
mode_ = LITE_POWER_HIGH;
LOG(WARNING) << "LOW POWER MODE is not support, switch to big cores.";
if (thread_num > big_core_size) {
active_ids_ = big_core_ids_;
} else {
for (int i = 0; i < thread_num; ++i) {
active_ids_.push_back(big_core_ids_[i]);
}
}
}
}
// One-time device probing: core count, memory size, per-core arch,
// max/min frequencies, cache sizes and big/little topology.
// Always returns 0 (kept for the static-init idiom in DeviceInfo::Init).
int DeviceInfo::Setup() {
  core_num_ = get_cpu_num();
  mem_size_ = get_mem_size();
  get_cpu_arch(&archs_, core_num_);
  // Set default CPU cache info (argc == 1 -> one uniform size per level).
  // BUGFIX: the size was previously passed in the argc slot, e.g.
  // SetCacheInfo(0, DEFAULT_L1_CACHE_SIZE), which made SetCacheInfo take the
  // argc != 1 branch and read two non-existent varargs (undefined behavior).
  SetCacheInfo(0, 1, DEFAULT_L1_CACHE_SIZE);
  SetCacheInfo(1, 1, DEFAULT_L2_CACHE_SIZE);
  SetCacheInfo(2, 1, DEFAULT_L3_CACHE_SIZE);
#ifdef LITE_WITH_LINUX
  // Get max & min frequency of every core (sysfs reports kHz; store MHz).
  max_freqs_.resize(core_num_);
  min_freqs_.resize(core_num_);
  for (int i = 0; i < core_num_; ++i) {
    int max_freq, min_freq;
    get_cpu_max_min_freq(i, &max_freq, &min_freq);
    max_freqs_[i] = max_freq / 1000;
    min_freqs_[i] = min_freq / 1000;
  }
  // Fill cache sizes and big.LITTLE core ids: prefer the known-SoC table,
  // fall back to frequency-based probing.
  dev_name_ = get_cpu_name();
  if (!SetCPUInfoByName()) {
    SetCPUInfoByProb();
  }
  // Log the detected topology.
  LOG(INFO) << "ARM multiprocessors name: " << dev_name_;
  LOG(INFO) << "ARM multiprocessors number: " << core_num_;
  for (int i = 0; i < core_num_; ++i) {
    LOG(INFO) << "ARM multiprocessors ID: " << core_ids_[i]
              << ", max freq: " << max_freqs_[i]
              << ", min freq: " << min_freqs_[i]
              << ", cluster ID: " << cluster_ids_[core_ids_[i]]
              << ", CPU ARCH: A" << archs_[i];
  }
  LOG(INFO) << "L1 DataCache size is: ";
  for (int i = 0; i < core_num_; ++i) {
    LOG(INFO) << L1_cache_[i] / 1024 << " KB";
  }
  LOG(INFO) << "L2 Cache size is: ";
  for (int i = 0; i < core_num_; ++i) {
    LOG(INFO) << L2_cache_[i] / 1024 << " KB";
  }
  LOG(INFO) << "Total memory: " << mem_size_ << "KB";
#endif
  // Set default run mode: single thread, no core binding.
  SetRunMode(LITE_POWER_NO_BIND, 1);  // use single thread by default
  return 0;
}
// Selects the power mode and thread count, binds threads to cores where
// supported (Linux + OpenMP), and sizes the sgemm workspace for the active
// core's L2 cache. Falls back to NO_BIND when any active core is offline.
void DeviceInfo::SetRunMode(PowerMode mode, int thread_num) {
#ifdef ARM_WITH_OMP
  thread_num = std::min(thread_num, core_num_);
#else
  thread_num = 1;  // force thread_num to 1 if OpenMP is disabled
#endif
#ifdef LITE_WITH_LINUX
  int big_core_size = big_core_ids_.size();
  int little_core_size = little_core_ids_.size();
  int big_little_core_size = big_core_size + little_core_size;
  thread_num = std::min(thread_num, big_little_core_size);
  count_++;
  // Rotate the RAND_* start core every 10 calls.
  // BUGFIX: guard against big_core_size == 0 (SoCs without a big cluster)
  // which previously divided by zero.
  int shift_num = big_core_size > 0 ? (count_ / 10) % big_core_size : 0;
  switch (mode) {
    case LITE_POWER_FULL:
      RequestPowerFullMode(thread_num);
      break;
    case LITE_POWER_HIGH:
      RequestPowerHighMode(thread_num);
      break;
    case LITE_POWER_LOW:
      RequestPowerLowMode(thread_num);
      break;
    case LITE_POWER_NO_BIND:
      RequestPowerNoBindMode(thread_num);
      break;
    case LITE_POWER_RAND_HIGH:
      RequestPowerRandHighMode(shift_num, thread_num);
      break;
    case LITE_POWER_RAND_LOW:
      RequestPowerRandLowMode(shift_num, thread_num);
      break;
    default:
      LOG(FATAL) << "Unsupported power mode: " << mode;
      break;
  }
  // Always keep at least one active core so workspace sizing below is valid.
  if (active_ids_.size() == 0) {
    active_ids_.push_back(0);
  }
#ifdef ARM_WITH_OMP
  omp_set_num_threads(active_ids_.size());
#endif
  if (mode_ != LITE_POWER_NO_BIND) {
    if (check_cpu_online(active_ids_)) {
      bind_threads(active_ids_);
    } else {
      LOG(WARNING) << "Some cores are offline, switch to NO BIND MODE";
      mode_ = LITE_POWER_NO_BIND;
    }
  }
#else   // LITE_WITH_LINUX
  // only LITE_POWER_NO_BIND is supported in other OS
  RequestPowerNoBindMode(thread_num);
#ifdef ARM_WITH_OMP
  omp_set_num_threads(active_ids_.size());
#endif
#endif  // LITE_WITH_LINUX
  //! alloc memory for sgemm in this context
  workspace_.Resize(
      {static_cast<int64_t>(L2_cache_[active_ids_[0]] / sizeof(float))});
  arch_ = archs_[active_ids_[0]];
}
// Overrides the L1/L2/L3 cache sizes (bytes) for all cores with
// user-provided values and resizes the shared workspace accordingly.
void DeviceInfo::SetCache(int l1size, int l2size, int l3size) {
  // BUGFIX: the sizes were previously passed in SetCacheInfo's argc slot
  // (SetCacheInfo(0, l1size)), making it take the argc != 1 branch and read
  // two non-existent varargs (undefined behavior). argc must be 1 here.
  SetCacheInfo(0, 1, l1size);
  SetCacheInfo(1, 1, l2size);
  SetCacheInfo(2, 1, l3size);
  workspace_.Resize({2 * (l1size + l2size)});
}
// Grows the shared workspace to hold `dims` elements plus headroom for the
// active core's L2 cache. Returns false when the workspace already has
// exactly that element count, true after resizing.
bool DeviceInfo::ExtendWorkspace(DDimLite dims) {
  const auto requested = dims.product();
  if (requested == workspace_.dims().product()) {
    return false;
  }
  workspace_.Resize({static_cast<int64_t>(
      requested + L2_cache_[active_ids_[0]] / sizeof(float))});
  return true;
}
#endif // LITE_WITH_ARM
......
......@@ -14,24 +14,12 @@
#pragma once
#include <cstdarg>
#include <string>
#include <vector>
#include "paddle/fluid/lite/core/lite_tensor.h"
#include "paddle/fluid/lite/utils/cp_logging.h"
#ifdef LITE_WITH_LINUX
#include <sys/syscall.h>
#include <unistd.h>
#endif
#if __APPLE__
#include "TargetConditionals.h"
#if TARGET_OS_IPHONE
#include <mach/machine.h>
#include <sys/sysctl.h>
#include <sys/types.h>
#endif // TARGET_OS_IPHONE
#endif // __APPLE__
namespace paddle {
namespace lite {
......@@ -60,64 +48,73 @@ typedef enum {
class DeviceInfo {
public:
int idx_;
int max_freq_;
int min_freq_;
int generate_arch_;
int compute_core_num_;
int max_memory_;
int sharemem_size_;
std::string device_name_;
std::string compute_ability_;
std::vector<int> L1_cache_;
std::vector<int> L2_cache_;
std::vector<int> L3_cache_;
std::vector<int> core_ids_;
std::vector<int> big_core_ids_;
std::vector<int> little_core_ids_;
std::vector<int> cluster_ids_;
std::vector<ARMArch> archs_;
static DeviceInfo& Global() {
static auto* x = new DeviceInfo;
return *x;
}
static void Init() {
auto& info = Global();
InitInternal(&info);
static int Init() {
static int ret = Global().Setup();
return ret;
}
private:
DeviceInfo() = default;
static void InitInternal(DeviceInfo* dev);
};
int Setup();
size_t arm_get_meminfo();
void SetRunMode(PowerMode mode, int thread_num);
void SetCache(int l1size, int l2size, int l3size);
void SetArch(ARMArch arch) { arch_ = arch; }
int arm_get_cpucount();
PowerMode mode() const { return mode_; }
int threads() const { return active_ids_.size(); }
ARMArch arch() const { return arch_; }
int l1_cache_size() const { return L1_cache_[active_ids_[0]]; }
int l2_cache_size() const { return L2_cache_[active_ids_[0]]; }
int l3_cache_size() const { return L3_cache_[active_ids_[0]]; }
void arm_get_cpu_arch(std::vector<ARMArch>* archs);
bool get_cpu_info_from_name(DeviceInfo* cpu_info, std::string hardware_name);
#ifdef LITE_WITH_LINUX
void set_default_cache(DeviceInfo* dev);
template <typename T>
T* workspace_data() {
return workspace_.mutable_data<T>();
}
bool ExtendWorkspace(DDimLite dims);
std::string arm_get_cpu_name();
private:
int core_num_;
std::vector<int> max_freqs_;
std::vector<int> min_freqs_;
int mem_size_;
std::string dev_name_;
int get_max_freq_khz(int cpuid);
std::vector<int> L1_cache_;
std::vector<int> L2_cache_;
std::vector<int> L3_cache_;
std::vector<int> core_ids_;
std::vector<int> big_core_ids_;
std::vector<int> little_core_ids_;
std::vector<int> cluster_ids_;
std::vector<ARMArch> archs_;
int arm_sort_cpuid_by_max_frequency(int cpu_count, std::vector<int>* cpuids,
const std::vector<int>& cpu_freq,
std::vector<int>* cluster_ids);
int check_online(const std::vector<int>& core_ids);
int set_sched_affinity(const std::vector<int>& cpuids);
ARMArch arch_;
// LITE_POWER_HIGH stands for using big cores,
// LITE_POWER_LOW stands for using small core,
// LITE_POWER_FULL stands for using all cores
PowerMode mode_;
std::vector<int> active_ids_;
TensorLite workspace_;
int64_t count_{0};
void SetCacheInfo(int cache_id, int argc, ...);
void SetArchInfo(int argc, ...);
bool SetCPUInfoByName();
void SetCPUInfoByProb();
void RequestPowerFullMode(const int thread_num);
void RequestPowerHighMode(const int thread_num);
void RequestPowerLowMode(const int thread_num);
void RequestPowerNoBindMode(const int thread_num);
void RequestPowerRandHighMode(const int shift_num, const int thread_num);
void RequestPowerRandLowMode(const int shift_num, const int thread_num);
#endif // LITE_WITH_LINUX
DeviceInfo() = default;
};
#endif // LITE_WITH_ARM
......
......@@ -28,8 +28,6 @@ void ConvCompute::PrepareForRun() {
auto o_dims = param.output->dims();
auto& ctx = this->ctx_->template As<ARMContext>();
// TODO(xxx): make api and expose it
ctx.SetRunMode(LITE_POWER_HIGH, 4);
int win = x_dims[3]; // nchw
int hin = x_dims[2];
......
......@@ -28,7 +28,6 @@ void FcCompute::PrepareForRun() {
auto w_dims = param.w->dims();
auto& ctx = this->ctx_->template As<ARMContext>();
ctx.SetRunMode(LITE_POWER_HIGH, 4);
CHECK_GE(x_dims.size(), 2UL);
CHECK_EQ(w_dims.size(), 2UL);
......
......@@ -24,7 +24,6 @@ namespace arm {
void MulCompute::PrepareForRun() {
auto& ctx = this->ctx_->template As<ARMContext>();
ctx.SetRunMode(LITE_POWER_HIGH, 4);
}
void MulCompute::Run() {
......
......@@ -26,7 +26,6 @@ namespace arm {
void PoolCompute::PrepareForRun() {
auto& ctx = this->ctx_->template As<ARMContext>();
ctx.SetRunMode(LITE_POWER_HIGH, 4);
}
void PoolCompute::Run() {
......
......@@ -17,6 +17,7 @@ cc_library(dropout_compute_x86 SRCS dropout_compute.cc DEPS ${lite_kernel_deps}
cc_library(concat_compute_x86 SRCS concat_compute.cc DEPS ${lite_kernel_deps} )
cc_library(conv_compute_x86 SRCS conv_compute.cc DEPS ${lite_kernel_deps} blas im2col vol2col)
cc_library(pool_compute_x86 SRCS pool_compute.cc DEPS ${lite_kernel_deps} pooling)
cc_library(batch_norm_compute_x86 SRCS batch_norm_compute.cc DEPS ${lite_kernel_deps})
lite_cc_test(test_fc_compute_x86 SRCS fc_compute_test.cc DEPS fc_compute_x86)
lite_cc_test(test_conv2d_compute_x86 SRCS conv_compute_test.cc DEPS conv_compute_x86)
......@@ -28,6 +29,7 @@ lite_cc_test(test_relu_compute_x86 SRCS relu_compute_test.cc DEPS relu_compute_x
lite_cc_test(test_mul_compute_x86 SRCS mul_compute_test.cc DEPS mul_compute_x86 operator)
lite_cc_test(test_scale_compute_x86 SRCS scale_compute_test.cc DEPS scale_compute_x86)
lite_cc_test(test_dropout_compute_x86 SRCS dropout_compute_test.cc DEPS dropout_compute_x86)
lite_cc_test(test_batch_norm_compute_x86 SRCS batch_norm_compute_test.cc DEPS batch_norm_compute_x86)
set(x86_kernels
......@@ -44,6 +46,7 @@ set(x86_kernels
concat_compute_x86
conv_compute_x86
pool_compute_x86
batch_norm_compute_x86
)
set(x86_kernels "${x86_kernels}" CACHE INTERNAL "x86 kernels")
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/lite/kernels/x86/batch_norm_compute.h"
// Registers the x86 float NCHW batch_norm kernel and declares its
// input/output tensor bindings.
REGISTER_LITE_KERNEL(batch_norm, kX86, kFloat, kNCHW,
                     paddle::lite::kernels::x86::BatchNormCompute<float>, def)
    .BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))})
    .BindInput("Scale", {LiteType::GetTensorTy(TARGET(kX86))})
    .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kX86))})
    .BindInput("Mean", {LiteType::GetTensorTy(TARGET(kX86))})
    .BindInput("Variance", {LiteType::GetTensorTy(TARGET(kX86))})
    .BindOutput("Y", {LiteType::GetTensorTy(TARGET(kX86))})
    // BUGFIX: "MeanOut" was previously bound twice; keep a single binding.
    .BindOutput("MeanOut", {LiteType::GetTensorTy(TARGET(kX86))})
    .BindOutput("VarianceOut", {LiteType::GetTensorTy(TARGET(kX86))})
    .BindOutput("SavedMean", {LiteType::GetTensorTy(TARGET(kX86))})
    .BindOutput("SavedVariance", {LiteType::GetTensorTy(TARGET(kX86))})
    .Finalize();
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <random>
#include <string>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/lite/core/kernel.h"
#include "paddle/fluid/lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace x86 {
template <typename T>
using EigenArrayMap =
Eigen::Map<Eigen::Array<T, Eigen::Dynamic, Eigen::Dynamic>>;
template <typename T>
using ConstEigenArrayMap =
Eigen::Map<const Eigen::Array<T, Eigen::Dynamic, Eigen::Dynamic>>;
template <typename T>
using EigenVectorArrayMap = Eigen::Map<Eigen::Array<T, Eigen::Dynamic, 1>>;
template <typename T>
using ConstEigenVectorArrayMap =
Eigen::Map<const Eigen::Array<T, Eigen::Dynamic, 1>>;
// Batch-normalization kernel for x86 (float).
// Training path (!is_test && !use_global_stats): computes per-channel batch
// mean/variance into SavedMean/SavedVariance and updates the running
// statistics with momentum. Inference path (global_stats): normalizes with
// the provided running Mean/Variance.
// Only the NCHW layout is implemented; other layouts abort via LOG(FATAL).
template <typename T>
class BatchNormCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
 public:
  using param_t = operators::BatchNormParam;
  void Run() override {
    auto &param = *param_.get_mutable<operators::BatchNormParam>();
    bool global_stats = param.is_test || param.use_global_stats;
    const auto *x = param.x;
    const auto &x_dims = x->dims();
    CHECK(x_dims.size() >= 2 && x_dims.size() <= 5);
    const int N = x_dims[0];
    // Channel axis depends on layout: dim 1 for NCHW, last dim otherwise.
    const int C = param.data_layout == DATALAYOUT(kNCHW)
                      ? x_dims[1]
                      : x_dims[x_dims.size() - 1];
    // Elements per (n, c) slice, e.g. H*W for 4-D NCHW input.
    const int sample_size = x->dims().production() / N / C;
    // alloc memory for all outputs
    param.y->template mutable_data<T>();
    param.mean_out->template mutable_data<T>();
    param.variance_out->template mutable_data<T>();
    param.saved_mean->template mutable_data<T>();
    param.saved_variance->template mutable_data<T>();
    if (!global_stats) {
      // saved_xx holds statistics of just this batch of data
      EigenVectorArrayMap<T> saved_mean_e(param.saved_mean->mutable_data<T>(),
                                          C);
      EigenVectorArrayMap<T> saved_variance_e(
          param.saved_variance->mutable_data<T>(), C);
      saved_mean_e.setZero();
      saved_variance_e.setZero();
      EigenVectorArrayMap<T> running_mean_arr(param.mean_out->mutable_data<T>(),
                                              C);
      EigenVectorArrayMap<T> running_var_arr(
          param.variance_out->mutable_data<T>(), C);
      if ((N * sample_size) == 1) {
        // A single element per channel has zero variance; normalizing would
        // be meaningless, so pass the input through unchanged.
        LOG(WARNING) << "Only 1 element in normalization dimension, "
                     << "we skip the batch norm calculation, let y = x.";
        framework::TensorCopy(x->raw_tensor(), platform::CPUPlace(),
                              &param.y->raw_tensor());
        return;
      }
      switch (param.data_layout) {
        case DATALAYOUT(kNCHW): {
          // View x as (sample_size, N*C); column nc belongs to channel nc % C.
          ConstEigenArrayMap<T> x_arr(x->data<T>(), sample_size, N * C);
          for (int nc = 0; nc < N * C; ++nc) {
            saved_mean_e(nc % C) += x_arr.col(nc).sum();
          }
          saved_mean_e /= N * sample_size;
          for (int nc = 0; nc < N * C; ++nc) {
            saved_variance_e(nc % C) +=
                (x_arr.col(nc) - saved_mean_e(nc % C)).matrix().squaredNorm();
          }
          // Biased (population) variance: divide by N*sample_size.
          saved_variance_e /= N * sample_size;
          break;
        }
        default:
          LOG(FATAL) << "Unknown storage order: "
                     << DataLayoutToStr(param.data_layout);
          break;
      }
      // Exponential moving average update of the running statistics.
      running_mean_arr = running_mean_arr * param.momentum +
                         saved_mean_e * (1. - param.momentum);
      running_var_arr = running_var_arr * param.momentum +
                        saved_variance_e * (1. - param.momentum);
    }
    // use SavedMean and SavedVariance to do normalize
    Eigen::Array<T, Eigen::Dynamic, 1> inv_std(C);
    if (global_stats) {
      ConstEigenVectorArrayMap<T> var_arr(param.variance->data<T>(), C);
      inv_std = (var_arr + param.epsilon).sqrt().inverse();
    } else {
      EigenVectorArrayMap<T> saved_inv_std(
          param.saved_variance->mutable_data<T>(), C);
      // inverse SavedVariance first, gradient will use it too.
      // (1/(v+eps))^0.5 equals 1/sqrt(v+eps) for v+eps > 0, matching the
      // global_stats branch above; SavedVariance is overwritten in place.
      saved_inv_std = (saved_inv_std + param.epsilon).inverse().sqrt();
      inv_std = saved_inv_std;
    }
    ConstEigenVectorArrayMap<T> mean_arr(
        global_stats ? param.mean->data<T>() : param.saved_mean->data<T>(), C);
    // (x - est_mean) * inv_var * scale + bias
    // formula transform ====>
    // (x * inv_var * scale) + (bias - est_mean * inv_var * scale)
    ConstEigenVectorArrayMap<T> scale_arr(param.scale->data<T>(), C);
    ConstEigenVectorArrayMap<T> bias_arr(param.bias->data<T>(), C);
    Eigen::Array<T, Eigen::Dynamic, 1> new_scale = inv_std * scale_arr;
    Eigen::Array<T, Eigen::Dynamic, 1> new_bias =
        bias_arr - mean_arr * inv_std * scale_arr;
    switch (param.data_layout) {
      case DATALAYOUT(kNCHW): {
        EigenArrayMap<T> y_arr(param.y->mutable_data<T>(), sample_size, N * C);
        ConstEigenArrayMap<T> x_arr(x->data<T>(), sample_size, N * C);
        for (int nc = 0; nc < N * C; ++nc) {
          y_arr.col(nc) = x_arr.col(nc) * new_scale(nc % C) + new_bias(nc % C);
        }
        break;
      }
      default:
        LOG(FATAL) << "Unknown storage order: "
                   << DataLayoutToStr(param.data_layout);
        break;
    }
  }
  virtual ~BatchNormCompute() = default;
};
} // namespace x86
} // namespace kernels
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/lite/kernels/x86/batch_norm_compute.h"
#include <gtest/gtest.h>
#include <iostream>
#include <vector>
#include "paddle/fluid/lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace x86 {
// Verifies that the batch_norm kernel is discoverable via the registry.
TEST(batch_norm_x86, retrive_op) {
  auto kernels =
      KernelRegistry::Global().Create<TARGET(kX86), PRECISION(kFloat)>(
          "batch_norm");
  ASSERT_FALSE(kernels.empty());
  ASSERT_TRUE(kernels.front());
}
TEST(batch_norm_x86, init) {
  // A default-constructed kernel reports the precision/target it was
  // registered with.
  BatchNormCompute<float> bn_kernel;
  ASSERT_EQ(bn_kernel.precision(), PRECISION(kFloat));
  ASSERT_EQ(bn_kernel.target(), TARGET(kX86));
}
TEST(batch_norm_x86, run_test) {
  // End-to-end smoke test of the x86 batch_norm kernel in training mode
  // (is_test = false, use_global_stats = false): fill deterministic inputs,
  // run the kernel, and log every output tensor.
  lite::Tensor x, scale, bias, mean, variance, y, mean_out, variance_out,
      saved_mean, saved_variance;
  constexpr int batch_size = 2;
  constexpr int channels = 3;
  const std::vector<int64_t> x_shape{batch_size, channels, 64, 64};
  // Every per-channel tensor (scale/bias/mean/variance and all the
  // statistic outputs) shares the same 1-D {channels} shape.
  const std::vector<int64_t> channel_shape{channels};

  x.Resize(lite::DDim(x_shape));
  y.Resize(lite::DDim(x_shape));
  scale.Resize(lite::DDim(channel_shape));
  bias.Resize(lite::DDim(channel_shape));
  mean.Resize(lite::DDim(channel_shape));
  variance.Resize(lite::DDim(channel_shape));
  mean_out.Resize(lite::DDim(channel_shape));
  variance_out.Resize(lite::DDim(channel_shape));
  saved_mean.Resize(lite::DDim(channel_shape));
  saved_variance.Resize(lite::DDim(channel_shape));

  auto x_data = x.mutable_data<float>();
  auto scale_data = scale.mutable_data<float>();
  auto bias_data = bias.mutable_data<float>();
  auto mean_data = mean.mutable_data<float>();
  auto variance_data = variance.mutable_data<float>();
  // Output tensors only need their buffers allocated; the kernel fills them.
  y.mutable_data<float>();
  mean_out.mutable_data<float>();
  variance_out.mutable_data<float>();
  saved_mean.mutable_data<float>();
  saved_variance.mutable_data<float>();

  // Deterministic input data so any failure is reproducible.
  for (int64_t i = 0; i < x.dims().production(); i++) {
    x_data[i] = static_cast<float>(i);
  }
  for (int i = 0; i < scale.dims().production(); i++) {
    scale_data[i] = static_cast<float>(i) * 0.01f + 0.03f;
  }
  for (int i = 0; i < bias.dims().production(); i++) {
    bias_data[i] = static_cast<float>(i) * 0.065f + 0.1f;
  }
  for (int i = 0; i < mean.dims().production(); i++) {
    mean_data[i] = static_cast<float>(i) * 0.0565f;
  }
  for (int i = 0; i < variance.dims().production(); i++) {
    variance_data[i] = static_cast<float>(i) * 2.08f + 1.5f;
  }

  BatchNormCompute<float> batch_norm;
  operators::BatchNormParam param;
  param.x = &x;
  param.is_test = false;
  param.scale = &scale;
  param.bias = &bias;
  param.mean = &mean;
  param.variance = &variance;
  param.use_global_stats = false;
  param.epsilon = 1e-4f;
  param.momentum = 0.9f;
  param.y = &y;
  param.mean_out = &mean_out;
  param.variance_out = &variance_out;
  param.saved_mean = &saved_mean;
  param.saved_variance = &saved_variance;

  batch_norm.SetParam(param);
  batch_norm.Run();

  LOG(INFO) << "output: " << y;
  LOG(INFO) << "mean_out: " << mean_out;
  // Fixed: this line previously streamed mean_out under the
  // "variance_out" label, hiding the real variance_out contents.
  LOG(INFO) << "variance_out: " << variance_out;
  LOG(INFO) << "saved_mean: " << saved_mean;
  LOG(INFO) << "saved_variance: " << saved_variance;
}
} // namespace x86
} // namespace kernels
} // namespace lite
} // namespace paddle
USE_LITE_KERNEL(batch_norm, kX86, kFloat, kNCHW, def);
......@@ -135,8 +135,8 @@ function test_arm_model {
adb -s emulator-${port} push ${model_dir} ${adb_work_dir}
adb -s emulator-${port} push ${testpath} ${adb_work_dir}
adb -s emulator-${port} shell chmod +x "${adb_work_dir}/${test_name}"
local adb_model_path="./${adb_work_dir}/`basename ${model_dir}`"
adb -s emulator-${port} shell "./${adb_work_dir}/${test_name} --eval_model_dir=$adb_model_path"
local adb_model_path="${adb_work_dir}/`basename ${model_dir}`"
adb -s emulator-${port} shell "${adb_work_dir}/${test_name} --eval_model_dir=$adb_model_path"
}
......@@ -225,16 +225,11 @@ function test_arm {
for _test in $(cat $TESTS_FILE); do
test_arm_android $_test $port
done
# TODO(sangoly): refine this
test_arm_model "test_cxx_api_lite" $port "./third_party/install/mobilenet_v2_relu"
}
# Build the code and run lite arm tests. This is executed in the CI system.
function build_test_arm {
########################################################################
# job 1-4 must be in one runner
port_armv8=5554
port_armv7=5556
function prepare_emulator {
local port_armv8=$1
local port_armv7=$2
adb kill-server
adb devices | grep emulator | cut -f1 | while read line; do adb -s $line emu kill; done
......@@ -245,6 +240,18 @@ function build_test_arm {
echo n | avdmanager create avd -f -n paddle-armv7 -k "system-images;android-24;google_apis;armeabi-v7a"
echo -ne '\n' | ${ANDROID_HOME}/emulator/emulator -avd paddle-armv7 -noaudio -no-window -gpu off -verbose -port ${port_armv7} &
sleep 1m
}
# We split the arm unittest into several sub-tasks to parallel and reduce the overall CI timetime.
# sub-task1
function build_test_arm_subtask_android {
########################################################################
# job 1-4 must be in one runner
port_armv8=5554
port_armv7=5556
prepare_emulator $port_armv8 $port_armv7
# job 1
build_arm "android" "armv8" "gcc"
......@@ -252,9 +259,9 @@ function build_test_arm {
cd -
# job 2
build_arm "android" "armv8" "clang"
test_arm "android" "armv8" "clang" ${port_armv8}
cd -
#build_arm "android" "armv8" "clang"
#test_arm "android" "armv8" "clang" ${port_armv8}
#cd -
# job 3
build_arm "android" "armv7" "gcc"
......@@ -262,13 +269,22 @@ function build_test_arm {
cd -
# job 4
build_arm "android" "armv7" "clang"
test_arm "android" "armv7" "clang" ${port_armv7}
cd -
#build_arm "android" "armv7" "clang"
#test_arm "android" "armv7" "clang" ${port_armv7}
#cd -
adb devices | grep emulator | cut -f1 | while read line; do adb -s $line emu kill; done
echo "Done"
}
# sub-task2
function build_test_arm_subtask_armlinux {
########################################################################
# job 1-4 must be in one runner
port_armv8=5554
port_armv7=5556
prepare_emulator $port_armv8 $port_armv7
# job 5
build_arm "armlinux" "armv8"
......@@ -285,9 +301,47 @@ function build_test_arm {
test_arm "armlinux" "armv7hf"
cd -
adb devices | grep emulator | cut -f1 | while read line; do adb -s $line emu kill; done
echo "Done"
}
# sub-task3: build the C++ API test binary for a single Android/armv8/gcc
# configuration and run the mobilenet_v2_relu model on an armv8 emulator.
function build_test_arm_subtask3_mobilenet_v2 {
    local port_armv8=5554
    local port_armv7=5556
    # We just test following single one environment to limit the CI time.
    local os=android
    local abi=armv8
    local lang=gcc

    # NOTE(review): cur_dir/build_dir are not declared `local`, so they become
    # globals — presumably other functions in this script rely on that; confirm
    # before making them local.
    cur_dir=$(pwd)
    build_dir=$cur_dir/build.lite.${os}.${abi}.${lang}
    mkdir -p $build_dir
    cd $build_dir

    cmake_arm $os $abi $lang
    # Only build the single test binary needed for the model run.
    make test_cxx_api_lite -j$NUM_CORES_FOR_COMPILE

    # Boots both emulators (armv8 + armv7) even though only armv8 is used below.
    prepare_emulator $port_armv8 $port_armv7

    # just test the model on armv8
    test_arm_model "test_cxx_api_lite" $port_armv8 "./third_party/install/mobilenet_v2_relu"

    # Kill every running emulator so the runner is left clean for the next job.
    adb devices | grep emulator | cut -f1 | while read line; do adb -s $line emu kill; done
    echo "Done"
}
# Build the code and run lite arm tests. This is executed in the CI system.
# Aggregates the Android and armlinux sub-tasks so a single CI entry point
# still runs the full arm suite.
function build_test_arm {
    ########################################################################
    # job 1-4 must be in one runner
    # NOTE(review): these port globals appear redundant — both sub-tasks set
    # their own ports — but they are left in place in case other code reads
    # them; confirm before removing.
    port_armv8=5554
    port_armv7=5556

    build_test_arm_subtask_android
    build_test_arm_subtask_armlinux
}
############################# MAIN #################################
function print_usage {
echo -e "\nUSAGE:"
......@@ -379,6 +433,18 @@ function main {
build_test_arm
shift
;;
build_test_arm_subtask_android)
build_test_arm_subtask_android
shift
;;
build_test_arm_subtask_armlinux)
build_test_arm_subtask_armlinux
shift
;;
build_test_arm_model1)
build_test_arm_subtask3_mobilenet_v2
shift
;;
check_style)
check_style
shift
......@@ -397,4 +463,3 @@ function main {
}
main $@
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册