diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index a2f0644f218db3891339b9ad672fbb1a3989f5cf..d3f5df342e6b512d5de835ed9f4f7502a60ae15b 100755 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -2,6 +2,20 @@ before_script: - env - export CI_USER_DIR=$(pwd) + # prepare ccache + - apt install ccache + + # for proxy + - export http_proxy=$CI_PROXY + - export https_proxy=$CI_PROXY + + # merge the latest code + - git config --global user.email "you@example.com" + - git config --global user.name "Your Name" + - git fetch origin incubate/lite + - git merge --no-ff origin/incubate/lite + + image: $SERVER_LITE_DOCKER_IMAGE stages: @@ -14,19 +28,13 @@ check:prebuilt: - lite stage: ci script: + # prepare for pre-commit - rm -rf ~/.pip - - export http_proxy=$CI_PROXY - - export https_proxy=$CI_PROXY - pip install pre-commit - pre-commit install - # merge the latest code - - git config --global user.email "you@example.com" - - git config --global user.name "Your Name" - - git fetch origin incubate/lite - - git merge --no-ff origin/incubate/lite - - ./paddle/fluid/lite/tools/build.sh check_style + cache: key: check_style paths: @@ -42,17 +50,11 @@ build:server: paths: - build/third_party - ~/.ccache + - $CI_PROJECT_DIR/_build_server_ccache script: - - apt install ccache - - export http_proxy=$CI_PROXY - - export https_proxy=$CI_PROXY - - # merge the latest code - - git config --global user.email "you@example.com" - - git config --global user.name "Your Name" - - git fetch origin incubate/lite - - git merge --no-ff origin/incubate/lite - + # customize ccache path for specifying runner cache + - export CCACHE_DIR=$CI_PROJECT_DIR/_build_server_ccache + # run build and test - mkdir -p build - cd build - ../paddle/fluid/lite/tools/build.sh cmake_x86 @@ -66,7 +68,27 @@ build:server: dependencies: - check:prebuilt -build:mobile: +build:mobile_android: + tags: + - lite + stage: build_mobile + image: $MOBILE_LITE_DOCKER_IMAGE + cache: + key: mobile_thirdparty + paths: + - $MOBILE_LITE_CACHE0 + - $MOBILE_LITE_CACHE1 + - ~/.ccache + - $CI_PROJECT_DIR/build_mobile_ccache + script: + - export CCACHE_DIR=$CI_PROJECT_DIR/build_mobile_ccache + - ./paddle/fluid/lite/tools/build.sh build_test_arm_subtask_android + + dependencies: + - build:server + + +build:mobile_armlinux: tags: - lite stage: build_mobile @@ -77,17 +99,43 @@ build:mobile: - $MOBILE_LITE_CACHE0 - $MOBILE_LITE_CACHE1 - ~/.ccache + - $CI_PROJECT_DIR/build_mobile_ccache2 script: - - apt install ccache - - export http_proxy=$CI_PROXY - - export https_proxy=$CI_PROXY + - export CCACHE_DIR=$CI_PROJECT_DIR/build_mobile_ccache2 + - ./paddle/fluid/lite/tools/build.sh build_test_arm_subtask_armlinux + + dependencies: + - build:server - # merge the latest code - - git config --global user.email "you@example.com" - - git config --global user.name "Your Name" - - git fetch origin incubate/lite - - git merge --no-ff origin/incubate/lite + cache: + key: mobile_thirdparty + paths: + - $MOBILE_LITE_CACHE0 + - $MOBILE_LITE_CACHE1 + - ~/.ccache + +build:mobile_model_mobilenetv2: + tags: + - lite + stage: build_mobile + image: $MOBILE_LITE_DOCKER_IMAGE + cache: + key: mobile_thirdparty + paths: + - $MOBILE_LITE_CACHE0 + - $MOBILE_LITE_CACHE1 + - ~/.ccache + script: + - export CCACHE_DIR=$CI_PROJECT_DIR/build_mobile_model1 + - ./paddle/fluid/lite/tools/build.sh build_test_arm_model1 - - ./paddle/fluid/lite/tools/build.sh build_test_arm dependencies: - build:server + + cache: + key: mobile_thirdparty + paths: + - $MOBILE_LITE_CACHE0 + - $MOBILE_LITE_CACHE1 + - ~/.ccache + - $CI_PROJECT_DIR/build_mobile_model1 diff --git a/paddle/fluid/lite/api/cxx_api_bin.cc b/paddle/fluid/lite/api/cxx_api_bin.cc index ae3d77e296c2ae98fa0aa50f13d296f5e8b60c4b..58cf5dd785efc5de02e746e0ef1d5609a7c120a5 100644 --- a/paddle/fluid/lite/api/cxx_api_bin.cc +++ b/paddle/fluid/lite/api/cxx_api_bin.cc @@ -29,9 +29,10 @@ double time_diff(Time t1, Time t2) { return counter.count() / 1000.0; } -void Run(const char* model_dir, int repeat) { +void Run(const char* model_dir, int repeat, int thread_num) { #ifdef LITE_WITH_ARM DeviceInfo::Init(); + DeviceInfo::Global().SetRunMode(LITE_POWER_HIGH, thread_num); #endif lite::ExecutorLite predictor; std::vector valid_places({Place{TARGET(kHost), PRECISION(kFloat)}, @@ -67,8 +68,8 @@ void Run(const char* model_dir, int repeat) { } // namespace paddle int main(int argc, char** argv) { - CHECK_EQ(argc, 3) << "usage: ./cmd "; - paddle::lite::Run(argv[1], std::stoi(argv[2])); + CHECK_EQ(argc, 4) << "usage: ./cmd "; + paddle::lite::Run(argv[1], std::stoi(argv[2]), std::stoi(argv[3])); return 0; } diff --git a/paddle/fluid/lite/core/context.cc b/paddle/fluid/lite/core/context.cc index 89ec7278c1aaf8e372c45f24a32525df4f223418..aae36b4a23133cef5b6a477e9144d5a14c90e45f 100644 --- a/paddle/fluid/lite/core/context.cc +++ b/paddle/fluid/lite/core/context.cc @@ -13,322 +13,7 @@ // limitations under the License. #include "paddle/fluid/lite/core/context.h" -#include "paddle/fluid/lite/core/cpu_info.h" - -#ifdef LITE_WITH_LINUX -#include -#include -#endif -#if __APPLE__ -#include "TargetConditionals.h" -#if TARGET_OS_IPHONE -#include -#include -#include -#endif // TARGET_OS_IPHONE -#endif // __APPLE__ - -#ifdef ARM_WITH_OMP -#include -#endif namespace paddle { -namespace lite { - -#ifdef LITE_WITH_ARM - -void Context::SetCache(int l1size, int l2size, int l3size) { - DeviceInfo& dev = DeviceInfo::Global(); - int cpu_count = arm_get_cpucount(); - dev.L1_cache_.resize(cpu_count); - dev.L2_cache_.resize(cpu_count); - dev.L3_cache_.resize(cpu_count); - for (int i = 0; i < cpu_count; ++i) { - dev.L1_cache_[i] = l1size; - dev.L2_cache_[i] = l2size; - dev.L3_cache_[i] = l3size; - } - workspace_.Resize({2 * (l1size + l2size)}); -} - -Context::Context() { - active_ids_ = {0}; - mode_ = LITE_POWER_HIGH; - DeviceInfo& dev = DeviceInfo::Global(); - workspace_.Resize( - {static_cast(dev.L2_cache_[active_ids_[0]] / sizeof(float))}); -#ifdef TARGET_IOS - arch_ = APPLE; // use 6x8 -#else - if (dev.big_core_ids_.size() > 0) { - arch_ = dev.archs_[dev.big_core_ids_[0]]; - } -#endif -} - -PowerMode Context::mode() const { return mode_; } - -int Context::threads() const { return active_ids_.size(); } - -Context::Context(const ARMContext& ctx) { - mode_ = ctx.mode_; - active_ids_ = ctx.active_ids_; - workspace_ = ctx.workspace_; - arch_ = ctx.arch_; - count_ = ctx.count_; -} - -ARMContext& Context::operator=(const ARMContext& ctx) { - mode_ = ctx.mode_; - active_ids_ = ctx.active_ids_; - workspace_ = ctx.workspace_; - arch_ = ctx.arch_; - count_ = ctx.count_; - return *this; -} - -void Context::BindDev() { -#ifdef ARM_WITH_OMP - int num_threads = active_ids_.size(); - omp_set_num_threads(num_threads); -#ifdef LITE_WITH_LINUX - std::vector ssarets; - for (int j = 0; j < num_threads; ++j) { - ssarets.push_back(0); - } -#pragma omp parallel for - for (int i = 0; i < num_threads; i++) { - ssarets[i] = set_sched_affinity(active_ids_); - } - for (int i = 0; i < num_threads; i++) { - if (ssarets[i] != 0) { - LOG(ERROR) << "set cpu affinity failed, cpuID: " << active_ids_[i]; - return; - } - } -#endif // LITE_WITH_LINUX -#else // ARM_WITH_OMP -#ifdef LITE_WITH_LINUX - std::vector cpuid1; - cpuid1.push_back(active_ids_[0]); - int ssaret = set_sched_affinity(cpuid1); - if (ssaret != 0) { - printf("set cpu affinity failed, cpuID: %d\n", active_ids_[0]); - return; - } -#endif // LITE_WITH_LINUX -#endif // ARM_WITH_OMP -} - -void Context::SetRunMode(PowerMode mode, int threads) { - DeviceInfo& dev = DeviceInfo::Global(); - int big_core_size = dev.big_core_ids_.size(); - int small_core_size = dev.little_core_ids_.size(); - if (threads > big_core_size + small_core_size) { - threads = big_core_size + small_core_size; - } -#ifdef ARM_WITH_OMP - count_++; - int shift_num = (count_ / 10) % big_core_size; - switch (mode) { - case LITE_POWER_FULL: - mode_ = mode; - active_ids_.clear(); - for (int i = 0; i < threads; ++i) { - if (i < big_core_size) { - active_ids_.push_back(dev.big_core_ids_[i]); - } else { - active_ids_.push_back(dev.little_core_ids_[i - big_core_size]); - } - } - if (active_ids_.size() == 0) { - active_ids_.push_back(0); - } - break; - case LITE_POWER_HIGH: - active_ids_.clear(); - if (big_core_size > 0) { - mode_ = LITE_POWER_HIGH; - if (threads > big_core_size) { - LOG(ERROR) << "threads: " << threads - << ", exceed the big cores size: " << big_core_size; - active_ids_ = dev.big_core_ids_; - } else { - for (int i = 0; i < threads; ++i) { - active_ids_.push_back(dev.big_core_ids_[i]); - } - } - } else { - mode_ = LITE_POWER_LOW; - LOG(ERROR) << "HIGH POWER MODE is not support, switch to little cores"; - if (threads > small_core_size) { - active_ids_ = dev.little_core_ids_; - } else { - for (int i = 0; i < threads; ++i) { - active_ids_.push_back(dev.little_core_ids_[i]); - } - } - } - if (active_ids_.size() == 0) { - active_ids_.push_back(0); - } - break; - case LITE_POWER_LOW: - active_ids_.clear(); - if (small_core_size > 0) { - mode_ = LITE_POWER_LOW; - if (threads > small_core_size) { - LOG(WARNING) << "threads: " << threads - << ", exceed the little cores size: " << small_core_size; - active_ids_ = dev.little_core_ids_; - } else { - for (int i = 0; i < threads; ++i) { - active_ids_.push_back(dev.little_core_ids_[i]); - } - } - } else { - mode_ = LITE_POWER_HIGH; - LOG(WARNING) << "LOW POWER MODE is not support, switch to big cores"; - if (threads > big_core_size) { - active_ids_ = dev.big_core_ids_; - } else { - for (int i = 0; i < threads; ++i) { - active_ids_.push_back(dev.big_core_ids_[i]); - } - } - } - if (active_ids_.size() == 0) { - active_ids_.push_back(0); - } - break; - case LITE_POWER_NO_BIND: - mode_ = LITE_POWER_NO_BIND; - active_ids_.clear(); - if (threads > dev.core_ids_.size()) { - active_ids_.resize(dev.core_ids_.size()); - } else { - active_ids_.resize(threads); - } - break; - case LITE_POWER_RAND_HIGH: - active_ids_.clear(); - if (big_core_size > 0) { - mode_ = LITE_POWER_RAND_HIGH; - if (threads > big_core_size) { - LOG(WARNING) << "threads: " << threads - << ", exceed the big cores size: " << big_core_size; - active_ids_ = dev.big_core_ids_; - } else { - for (int i = 0; i < threads; ++i) { - active_ids_.push_back( - dev.big_core_ids_[(i + shift_num) % big_core_size]); - } - } - } else { - mode_ = LITE_POWER_LOW; - LOG(WARNING) - << "HIGH POWER MODE is not support, switch to little cores"; - if (threads > small_core_size) { - active_ids_ = dev.little_core_ids_; - } else { - for (int i = 0; i < threads; ++i) { - active_ids_.push_back(dev.little_core_ids_[i]); - } - } - } - if (active_ids_.size() == 0) { - active_ids_.push_back(0); - } - break; - case LITE_POWER_RAND_LOW: - active_ids_.clear(); - if (small_core_size > 0) { - mode_ = LITE_POWER_RAND_LOW; - if (threads > small_core_size) { - LOG(WARNING) << "threads: " << threads - << ", exceed the little cores size: " << small_core_size; - active_ids_ = dev.little_core_ids_; - } else { - for (int i = 0; i < threads; ++i) { - active_ids_.push_back( - dev.little_core_ids_[(i + shift_num) % small_core_size]); - } - } - } else { - mode_ = LITE_POWER_HIGH; - LOG(WARNING) << "LOW POWER MODE is not support, switch to big cores"; - if (threads > big_core_size) { - active_ids_ = dev.big_core_ids_; - } else { - for (int i = 0; i < threads; ++i) { - active_ids_.push_back(dev.big_core_ids_[i]); - } - } - } - if (active_ids_.size() == 0) { - active_ids_.push_back(0); - } - break; - } - //! fix multi-threads LITE_POWER_HIGH mode - if (mode_ == LITE_POWER_NO_BIND || threads > 1) { - int threads = active_ids_.size(); - omp_set_num_threads(threads); - } else { - if (check_online(active_ids_)) { - BindDev(); - } else { - LOG(ERROR) << "core id " << active_ids_[0] - << " is offline, switch to NO BIND MODE"; - int threads = active_ids_.size(); - omp_set_num_threads(threads); - } - } -#else - if (big_core_size > 0) { - active_ids_ = {dev.big_core_ids_[0]}; - } else { - active_ids_ = {0}; - } -#endif - //! alloc memory for sgemm in this context - int temp_mem_size = - DeviceInfo::Global().L2_cache_[active_ids_[0]] / sizeof(float); - workspace_.Resize({temp_mem_size}); - arch_ = DeviceInfo::Global().archs_[active_ids_[0]]; -} - -ARMArch Context::arch() const { return arch_; } - -void Context::SetArch(ARMArch arch) { arch_ = arch; } - -int Context::l1_cache_size() const { - DeviceInfo& dev = DeviceInfo::Global(); - return dev.L1_cache_[active_ids_[0]]; -} - -int Context::l2_cache_size() const { - DeviceInfo& dev = DeviceInfo::Global(); - return dev.L2_cache_[active_ids_[0]]; -} - -int Context::l3_cache_size() const { - DeviceInfo& dev = DeviceInfo::Global(); - return dev.L3_cache_[active_ids_[0]]; -} - -bool Context::ExtendWorkspace(DDimLite dims) { - auto count = dims.product(); - auto old = workspace_.dims(); - if (count == old.product()) { - return false; - } - - workspace_.Resize( - {static_cast(count + l2_cache_size() / sizeof(float))}); - return true; -} -#endif // LITE_WITH_ARM - -} // namespace lite +namespace lite {} // namespace lite } // namespace paddle diff --git a/paddle/fluid/lite/core/context.h b/paddle/fluid/lite/core/context.h index 483f51541440fe51e1ee998f07f2e5e12f2441fd..a79ce04fab3146f840b44670eb5a6980a4775372 100644 --- a/paddle/fluid/lite/core/context.h +++ b/paddle/fluid/lite/core/context.h @@ -61,47 +61,41 @@ class Context { template <> class Context { public: - Context(); - Context(PowerMode mode, int threads); + Context() {} explicit Context(const ARMContext& ctx); - ARMContext& operator=(const ARMContext& ctx); + ARMContext& operator=(const ARMContext& ctx) {} // NOTE: InitOnce should only be used by ContextScheduler void InitOnce() { DeviceInfo::Init(); } void CopyShared(const ARMContext* ctx) {} - void SetRunMode(PowerMode mode, int threads); - void SetCache(int l1size, int l2size, int l3size); - void SetArch(ARMArch arch); - void BindDev(); + void SetRunMode(PowerMode mode, int threads) { + return DeviceInfo::Global().SetRunMode(mode, threads); + } + void SetCache(int l1size, int l2size, int l3size) { + return DeviceInfo::Global().SetCache(l1size, l2size, l3size); + } + void SetArch(ARMArch arch) { return DeviceInfo::Global().SetArch(arch); } - PowerMode mode() const; - int threads() const; - ARMArch arch() const; + PowerMode mode() const { return DeviceInfo::Global().mode(); } + int threads() const { return DeviceInfo::Global().threads(); } + ARMArch arch() const { return DeviceInfo::Global().arch(); } + int l1_cache_size() const { return DeviceInfo::Global().l1_cache_size(); } + int l2_cache_size() const { return DeviceInfo::Global().l2_cache_size(); } + int l3_cache_size() const { return DeviceInfo::Global().l3_cache_size(); } template T* workspace_data() { - return workspace_.mutable_data(); + return DeviceInfo::Global().workspace_data(); } - int l1_cache_size() const; - int l2_cache_size() const; - int l3_cache_size() const; - bool ExtendWorkspace(DDimLite dims); + bool ExtendWorkspace(DDimLite dims) { + return DeviceInfo::Global().ExtendWorkspace(dims); + } std::string name() const { return "ARMContext"; } - - private: - // LITE_POWER_HIGH stands for using big cores, - // LITE_POWER_LOW stands for using small core, - // LITE_POWER_FULL stands for using all cores - ARMArch arch_; - PowerMode mode_; - std::vector active_ids_; - TensorLite workspace_; - int64_t count_{0}; }; #endif diff --git a/paddle/fluid/lite/core/cpu_info.cc b/paddle/fluid/lite/core/cpu_info.cc index ab1968295813006d5d11fc4fbf416b4f9c3a3215..40353631f20765dd0a2744a7f2520c51d11be624 100644 --- a/paddle/fluid/lite/core/cpu_info.cc +++ b/paddle/fluid/lite/core/cpu_info.cc @@ -12,312 +12,81 @@ // See the License for the specific language governing permissions and // limitations under the License. +#ifdef LITE_WITH_LINUX +#include +#include +#endif +#if __APPLE__ +#include "TargetConditionals.h" +#if TARGET_OS_IPHONE +#include +#include +#include +#endif // TARGET_OS_IPHONE +#endif // __APPLE__ + +#ifdef ARM_WITH_OMP +#include +#endif + +#include +#include #include "paddle/fluid/lite/core/cpu_info.h" -#include namespace paddle { namespace lite { #ifdef LITE_WITH_ARM -void DeviceInfo::InitInternal(DeviceInfo* dev) { - set_default_cache(dev); - dev->compute_core_num_ = arm_get_cpucount(); - dev->max_memory_ = arm_get_meminfo(); - -// get max freq -#ifdef LITE_WITH_LINUX - std::vector max_freq(dev->compute_core_num_); - for (int i = 0; i < dev->compute_core_num_; ++i) { - max_freq[i] = get_max_freq_khz(i) / 1000; - } - std::string cpu_name = arm_get_cpu_name(); - if (get_cpu_info_from_name(dev, cpu_name) != true) { - arm_sort_cpuid_by_max_frequency(dev->compute_core_num_, &dev->core_ids_, - max_freq, &dev->cluster_ids_); - dev->big_core_ids_.clear(); - dev->little_core_ids_.clear(); - for (int i = 0; i < dev->cluster_ids_.size(); ++i) { - if (dev->cluster_ids_[i] == 0) { - dev->big_core_ids_.push_back(dev->core_ids_[i]); - } else { - dev->little_core_ids_.push_back(dev->core_ids_[i]); - } - } - arm_get_cpu_arch(&dev->archs_); - } - - LOG(INFO) << "ARM multiprocessors number: " << dev->compute_core_num_; - for (int i = 0; i < dev->compute_core_num_; ++i) { - LOG(INFO) << "ARM multiprocessors ID: " << dev->core_ids_[i] - << ", frequence: " << max_freq[i] - << ", cluster ID: " << dev->cluster_ids_[dev->core_ids_[i]] - << ", CPU ARCH: A" << dev->archs_[i]; - } - VLOG(1) << "L1 DataCache size is: "; - for (int i = 0; i < dev->compute_core_num_; ++i) { - VLOG(1) << dev->L1_cache_[i] / 1024 << " KB"; - } - VLOG(1) << "L2 Cache size is: "; - for (int i = 0; i < dev->compute_core_num_; ++i) { - VLOG(1) << dev->L2_cache_[i] / 1024 << " KB"; - } - VLOG(1) << "Total memory: " << dev->max_memory_ << "KB"; - - dev->max_freq_ = max_freq[0]; - for (int j = 1; j < dev->compute_core_num_; ++j) { - if (dev->max_freq_ < max_freq[j]) { - dev->max_freq_ = max_freq[j]; - } - } -#elif defined(TARGET_IOS) - arm_get_cpu_arch(&dev->archs_); +#ifdef TARGET_IOS +const int DEFAULT_L1_CACHE_SIZE = 64 * 1024; +const int DEFAULT_L2_CACHE_SIZE = 2048 * 1024; +const int DEFAULT_L3_CACHE_SIZE = 0; +#else +const int DEFAULT_L1_CACHE_SIZE = 32 * 1024; +const int DEFAULT_L2_CACHE_SIZE = 512 * 1024; +const int DEFAULT_L3_CACHE_SIZE = 0; #endif -} -// cache_id : 0 -> L1, 1 -> L2, 2 -> L3 -void set_cache_info(DeviceInfo* cpu_info, int cache_id, int argc, ...) { - va_list arg_ptr; - va_start(arg_ptr, argc); - std::vector* cache; - switch (cache_id) { - case 0: - cache = &cpu_info->L1_cache_; - break; - case 1: - cache = &cpu_info->L2_cache_; - break; - case 2: - cache = &cpu_info->L3_cache_; - break; - default: +int get_cpu_num() { +#ifdef LITE_WITH_LINUX + // get cpu count from /sys/devices/system/cpu/cpunum/uevent + int max_cpu_num = 20; + int cpu_num = 0; + for (int i = 0; i < max_cpu_num; ++i) { + char path[256]; + snprintf(path, sizeof(path), "/sys/devices/system/cpu/cpu%d/uevent", i); + FILE* fp = fopen(path, "rb"); + if (!fp) { break; - } - int core_num = cpu_info->compute_core_num_; - cache->resize(core_num); - if (argc == 1) { - int cache_size = va_arg(arg_ptr, int); - for (int i = 0; i < core_num; ++i) { - (*cache)[i] = cache_size; - } - } else { - int big_core_num = cpu_info->big_core_ids_.size(); - int little_core_num = cpu_info->little_core_ids_.size(); - int big_core_cache_size = va_arg(arg_ptr, int); - int little_core_cache_size = va_arg(arg_ptr, int); - for (int i = 0; i < big_core_num; ++i) { - (*cache)[cpu_info->big_core_ids_[i]] = big_core_cache_size; - } - for (int i = 0; i < little_core_num; ++i) { - (*cache)[cpu_info->little_core_ids_[i]] = little_core_cache_size; } + cpu_num++; + fclose(fp); } - va_end(arg_ptr); -} - -void set_arch_info(DeviceInfo* cpu_info, int argc, ...) { - va_list arg_ptr; - va_start(arg_ptr, argc); - int core_num = cpu_info->compute_core_num_; - cpu_info->archs_.resize(core_num); - if (argc == 1) { - ARMArch arch = (ARMArch)va_arg(arg_ptr, int); - for (int i = 0; i < core_num; ++i) { - cpu_info->archs_[i] = arch; - } - } else { - ARMArch big_core_arch = (ARMArch)va_arg(arg_ptr, int); - ARMArch little_core_arch = (ARMArch)va_arg(arg_ptr, int); - int big_core_num = cpu_info->big_core_ids_.size(); - int little_core_num = cpu_info->little_core_ids_.size(); - for (int i = 0; i < big_core_num; ++i) { - cpu_info->archs_[cpu_info->big_core_ids_[i]] = big_core_arch; - } - for (int i = 0; i < little_core_num; ++i) { - cpu_info->archs_[cpu_info->little_core_ids_[i]] = little_core_arch; - } + if (cpu_num < 1) { + cpu_num = 1; } - va_end(arg_ptr); -} - -bool get_cpu_info_from_name(DeviceInfo* cpu_info, std::string hardware_name) { - /* Snapdragon */ - if (hardware_name.find("SDM845") != std::string::npos) { // 845 - cpu_info->compute_core_num_ = 8; - cpu_info->core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7}; - cpu_info->big_core_ids_ = {4, 5, 6, 7}; - cpu_info->little_core_ids_ = {0, 1, 2, 3}; - cpu_info->cluster_ids_ = {1, 1, 1, 1, 0, 0, 0, 0}; - set_arch_info(cpu_info, 2, kA75, kA55); - set_cache_info(cpu_info, 0, 1, 32 * 1024); - set_cache_info(cpu_info, 1, 2, 256 * 1024, 128 * 1024); - set_cache_info(cpu_info, 2, 1, 2048 * 1024); - return true; - - } else if (hardware_name.find("SDM710") != std::string::npos) { // 710 - cpu_info->compute_core_num_ = 8; - cpu_info->core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7}; - cpu_info->big_core_ids_ = {6, 7}; - cpu_info->little_core_ids_ = {0, 1, 2, 3, 4, 5}; - cpu_info->cluster_ids_ = {1, 1, 1, 1, 1, 1, 0, 0}; - set_arch_info(cpu_info, 2, kA75, kA55); - return true; - } else if (hardware_name.find("MSM8998") != std::string::npos) { // 835 - cpu_info->compute_core_num_ = 8; - cpu_info->core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7}; - cpu_info->big_core_ids_ = {4, 5, 6, 7}; - cpu_info->little_core_ids_ = {0, 1, 2, 3}; - cpu_info->cluster_ids_ = {1, 1, 1, 1, 0, 0, 0, 0}; - set_arch_info(cpu_info, 2, kA73, kA53); - set_cache_info(cpu_info, 0, 2, 64 * 1024); - set_cache_info(cpu_info, 1, 2, 1024 * 1024, - /*real cache size is 2M, while that will get bad performace - on conv3x3s1 or gemm, set to 1M or 512K*/ - 1024 * 1024); - return true; - - } else if (hardware_name.find("MSM8996") != std::string::npos) { // 820 - cpu_info->compute_core_num_ = 4; - cpu_info->core_ids_ = {0, 1, 2, 3}; - cpu_info->big_core_ids_ = {2, 3}; - cpu_info->little_core_ids_ = {0, 1}; - cpu_info->cluster_ids_ = {1, 1, 0, 0}; - set_arch_info(cpu_info, 1, kA72); - set_cache_info(cpu_info, 0, 1, 24 * 1024); - set_cache_info(cpu_info, 1, 2, 1024 * 1024, 512 * 1024); - return true; - - } else if (hardware_name.find("SDM660") != std::string::npos || - hardware_name.find("SDM636") != std::string::npos) { // 660, 636 - cpu_info->compute_core_num_ = 8; - cpu_info->core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7}; - cpu_info->big_core_ids_ = {4, 5, 6, 7}; - cpu_info->little_core_ids_ = {0, 1, 2, 3}; - cpu_info->cluster_ids_ = {1, 1, 1, 1, 0, 0, 0, 0}; - set_arch_info(cpu_info, 1, kA73); - set_cache_info(cpu_info, 0, 2, 64 * 1024, 32 * 1024); - set_cache_info(cpu_info, 1, 1, 1024 * 1024); - return true; - - } else if (hardware_name.find("MSM8976") != std::string::npos) { // 652,653 - cpu_info->compute_core_num_ = 8; - cpu_info->core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7}; - cpu_info->big_core_ids_ = {4, 5, 6, 7}; - cpu_info->little_core_ids_ = {0, 1, 2, 3}; - cpu_info->cluster_ids_ = {1, 1, 1, 1, 0, 0, 0, 0}; - set_arch_info(cpu_info, 2, kA72, kA53); - set_cache_info(cpu_info, 0, 1, 32 * 1024); - set_cache_info(cpu_info, 1, 2, 1024 * 1024, 512 * 1024); - return true; - - } else if (hardware_name.find("MSM8953") != std::string::npos) { // 625 - cpu_info->compute_core_num_ = 8; - cpu_info->core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7}; - cpu_info->big_core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7}; - cpu_info->little_core_ids_ = {}; - cpu_info->cluster_ids_ = {0, 0, 0, 0, 0, 0, 0, 0}; - set_arch_info(cpu_info, 1, kA53); - set_cache_info(cpu_info, 0, 1, 32 * 1024); - set_cache_info(cpu_info, 1, 1, 1024 * 1024); - return true; - - } else if (hardware_name.find("MSM8939") != std::string::npos) { // 615 - cpu_info->compute_core_num_ = 8; - cpu_info->core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7}; - cpu_info->big_core_ids_ = {0, 1, 2, 3}; - cpu_info->little_core_ids_ = {4, 5, 6, 7}; - cpu_info->cluster_ids_ = {0, 0, 0, 0, 1, 1, 1, 1}; - set_arch_info(cpu_info, 1, kA53); - set_cache_info(cpu_info, 0, 1, 32 * 1024); - set_cache_info(cpu_info, 1, 2, 512 * 1024, 256 * 1024); - return true; - - /* MediaTek */ - - } else if (hardware_name.find("MT6797") != - std::string::npos) { // X20/X23/X25/X27 - cpu_info->compute_core_num_ = 10; - cpu_info->core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10}; - cpu_info->big_core_ids_ = {8, 9}; - cpu_info->little_core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7}; - cpu_info->cluster_ids_ = {1, 1, 1, 1, 1, 1, 1, 1, 0, 0}; - set_arch_info(cpu_info, 2, kA72, kA53); - set_cache_info(cpu_info, 0, 1, 32 * 1024); - set_cache_info(cpu_info, 1, 2, 1024 * 1024, 512 * 1024); - return true; - - } else if (hardware_name.find("MT6799") != std::string::npos) { // X30 - cpu_info->compute_core_num_ = 10; - cpu_info->core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10}; - cpu_info->big_core_ids_ = {8, 9}; - cpu_info->little_core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7}; - cpu_info->cluster_ids_ = {1, 1, 1, 1, 1, 1, 1, 1, 0, 0}; - set_arch_info(cpu_info, 2, kA73, kA53); - return true; - - } else if (hardware_name.find("MT6795") != std::string::npos || - hardware_name.find("MT6762") != std::string::npos || - hardware_name.find("MT6755T") != std::string::npos || - hardware_name.find("MT6755S") != std::string::npos || - hardware_name.find("MT6753") != std::string::npos || - hardware_name.find("MT6752") != std::string::npos || - hardware_name.find("MT6750") != std::string::npos) { - // X10, P22, P15/P18, MT6753, MT6752/MT6752M, MT6750 - cpu_info->compute_core_num_ = 8; - cpu_info->core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7}; - cpu_info->big_core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7}; - cpu_info->little_core_ids_ = {}; - cpu_info->cluster_ids_ = {0, 0, 0, 0, 0, 0, 0, 0}; - set_arch_info(cpu_info, 1, kA53); - return true; - - } else if (hardware_name.find("MT6758") != std::string::npos || - hardware_name.find("MT6757") != std::string::npos || - hardware_name.find("MT6763") != std::string::npos || - hardware_name.find("MT6755M") != std::string::npos || - hardware_name.find("MT6755") != - std::string::npos) { // P30, P20/P25, P23, P10 - cpu_info->compute_core_num_ = 8; - cpu_info->core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7}; - cpu_info->big_core_ids_ = {4, 5, 6, 7}; - cpu_info->little_core_ids_ = {0, 1, 2, 3}; - cpu_info->cluster_ids_ = {1, 1, 1, 1, 0, 0, 0, 0}; - set_arch_info(cpu_info, 1, kA53); - return true; - - } else if (hardware_name.find("MT6771") != std::string::npos) { // P60 - cpu_info->compute_core_num_ = 8; - cpu_info->core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7}; - cpu_info->big_core_ids_ = {4, 5, 6, 7}; - cpu_info->little_core_ids_ = {0, 1, 2, 3}; - cpu_info->cluster_ids_ = {1, 1, 1, 1, 0, 0, 0, 0}; - set_arch_info(cpu_info, 2, kA73, kA53); - return true; - - } else if (hardware_name.find("MT6765") != std::string::npos || - hardware_name.find("MT6739") != std::string::npos || - hardware_name.find("MT6738") != std::string::npos || - hardware_name.find("MT6737") != - std::string::npos) { // A22, MT6739, MT6738, MT6767 - cpu_info->compute_core_num_ = 4; - cpu_info->core_ids_ = {0, 1, 2, 3}; - cpu_info->big_core_ids_ = {0, 0, 0, 0}; - cpu_info->little_core_ids_ = {}; - cpu_info->cluster_ids_ = {0, 0, 0, 0}; - set_arch_info(cpu_info, 1, kA53); - return true; + return cpu_num; +#elif defined(TARGET_IOS) + int cpu_num = 0; + size_t len = sizeof(cpu_num); + sysctlbyname("hw.ncpu", &cpu_num, &len, NULL, 0); + if (cpu_num < 1) { + cpu_num = 1; } - return false; + return cpu_num; +#else + return 1; +#endif } -size_t arm_get_meminfo() { +size_t get_mem_size() { #ifdef LITE_WITH_LINUX // get cpu count from /proc/cpuinfo FILE* fp = fopen("/proc/meminfo", "rb"); if (!fp) { return 1; } - size_t memsize = 0; char line[1024]; while (!feof(fp)) { @@ -327,52 +96,18 @@ size_t arm_get_meminfo() { } sscanf(s, "MemTotal: %d kB", &memsize); } - fclose(fp); - return memsize; #elif defined(TARGET_IOS) // to be implemented printf("not implemented\n"); - return 0; -#endif -} - -int arm_get_cpucount() { -#ifdef LITE_WITH_LINUX - // get cpu count from /sys/devices/system/cpu/cpunum/uevent - int max_cpu_count = 20; - int count = 0; - for (int i = 0; i < max_cpu_count; ++i) { - char path[256]; - snprintf(path, sizeof(path), "/sys/devices/system/cpu/cpu%d/uevent", i); - FILE* fp = fopen(path, "rb"); - if (!fp) { - break; - } - count++; - fclose(fp); - } - if (count < 1) { - count = 1; - } - return count; -#elif defined(TARGET_IOS) - int count = 0; - size_t len = sizeof(count); - sysctlbyname("hw.ncpu", &count, &len, NULL, 0); - if (count < 1) { - count = 1; - } - return count; -#else - return 1; #endif + return 0; } -void arm_get_cpu_arch(std::vector* archs) { -#ifdef LITE_WITH_LINUX +void get_cpu_arch(std::vector* archs, const int cpu_num) { archs->clear(); +#ifdef LITE_WITH_LINUX //! get CPU ARCH FILE* fp = fopen("/proc/cpuinfo", "rb"); if (!fp) { @@ -406,6 +141,29 @@ void arm_get_cpu_arch(std::vector* archs) { case 0xd0a: archs->push_back(kA75); break; + case 0xd40: + archs->push_back(kA76); + break; + case 0x804: + // 855 + archs->push_back(kA76); + break; + case 0x805: + // 855 + archs->push_back(kA55); + break; + case 0x802: + // 845 + archs->push_back(kA75); + break; + case 0x803: + // 845 + archs->push_back(kA55); + break; + case 0x801: + // 835 + archs->push_back(kA73); + break; case 0x800: // 835 archs->push_back(kA73); @@ -415,49 +173,31 @@ void arm_get_cpu_arch(std::vector* archs) { archs->push_back(kA72); break; default: - LOG(ERROR) << "unknow type"; + LOG(ERROR) << "Unknow cpu arch: " << arch_id; archs->push_back(kARMArch_UNKOWN); } } } fclose(fp); - int cpu_count = arm_get_cpucount(); - if (archs->size() < cpu_count) { - for (int i = archs->size(); i < cpu_count; ++i) { + if (archs->size() < cpu_num) { + for (int i = archs->size(); i < cpu_num; ++i) { archs->push_back(archs->at(i - 1)); } } -#endif -#ifdef TARGET_IOS - int cpu_count = arm_get_cpucount(); - for (int i = 0; i < cpu_count; ++i) { +#elif defined(TARGET_IOS) + for (int i = 0; i < cpu_num; ++i) { archs->push_back(APPLE); } +#else + for (int i = 0; i < cpu_num; ++i) { + archs->push_back(kARMArch_UNKOWN); + } #endif } #ifdef LITE_WITH_LINUX -void set_default_cache(DeviceInfo* dev) { - int cpu_count = arm_get_cpucount(); - dev->L1_cache_.resize(cpu_count); - dev->L2_cache_.resize(cpu_count); - dev->L3_cache_.resize(cpu_count); -#ifdef TARGET_IOS - for (int i = 0; i < cpu_count; ++i) { - dev->L1_cache_[i] = 64 * 1024; - dev->L2_cache_[i] = 2048 * 1024; - dev->L3_cache_[i] = 0; - } -#else - for (int i = 0; i < cpu_count; ++i) { - dev->L1_cache_[i] = 32 * 1024; - dev->L2_cache_[i] = 512 * 1024; - dev->L3_cache_[i] = 0; - } -#endif -} -std::string arm_get_cpu_name() { +std::string get_cpu_name() { FILE* fp = fopen("/proc/cpuinfo", "rb"); if (!fp) { return ""; @@ -477,122 +217,163 @@ std::string arm_get_cpu_name() { return ""; } -int get_max_freq_khz(int cpuid) { +void get_cpu_max_min_freq(int cpu_id, int* max_freq, int* min_freq) { + *max_freq = 0; + *min_freq = 0; // first try, for all possible cpu char path[256]; snprintf(path, sizeof(path), - "/sys/devices/system/cpu/cpufreq/stats/cpu%d/time_in_state", cpuid); - + "/sys/devices/system/cpu/cpufreq/stats/cpu%d/time_in_state", cpu_id); FILE* fp = fopen(path, "rb"); - if (!fp) { // second try, for online cpu snprintf(path, sizeof(path), "/sys/devices/system/cpu/cpu%d/cpufreq/stats/time_in_state", - cpuid); + cpu_id); fp = fopen(path, "rb"); - if (!fp) { // third try, for online cpu + // get max_freq snprintf(path, sizeof(path), - "/sys/devices/system/cpu/cpu%d/cpufreq/cpuinfo_max_freq", cpuid); + "/sys/devices/system/cpu/cpu%d/cpufreq/cpuinfo_max_freq", + cpu_id); fp = fopen(path, "rb"); - if (!fp) { - return -1; + return; } - - int max_freq_khz = -1; - fscanf(fp, "%d", &max_freq_khz); - + fscanf(fp, "%d", max_freq); fclose(fp); - - return max_freq_khz; + // get min_freq + snprintf(path, sizeof(path), + "/sys/devices/system/cpu/cpu%d/cpufreq/cpuinfo_min_freq", + cpu_id); + fp = fopen(path, "rb"); + if (!fp) { + return; + } + fscanf(fp, "%d", min_freq); + fclose(fp); + return; } } - - int max_freq_khz = 0; + *min_freq = std::numeric_limits::max(); while (!feof(fp)) { - int freq_khz = 0; - int nscan = fscanf(fp, "%d %*d", &freq_khz); + int freq = 0; + int nscan = fscanf(fp, "%d %*d", &freq); if (nscan != 1) { break; } - - if (freq_khz > max_freq_khz) { - max_freq_khz = freq_khz; + if (freq > *max_freq) { + *max_freq = freq; + } + if (freq < *min_freq) { + *min_freq = freq; } } - fclose(fp); - - return max_freq_khz; } -int arm_sort_cpuid_by_max_frequency(int cpu_count, std::vector* cpuids, - const std::vector& cpu_freq, - std::vector* cluster_ids) { - if (cpu_count == 0) { - return 0; +void sort_cpuid_by_max_freq(const std::vector& max_freqs, + std::vector* cpu_ids, + std::vector* cluster_ids) { + int cpu_num = max_freqs.size(); + if (cpu_num == 0) { + return; } - - cpuids->resize(cpu_count); - cluster_ids->resize(cpu_count); - - for (int i = 0; i < cpu_count; i++) { - cpuids->at(i) = i; + cpu_ids->resize(cpu_num); + cluster_ids->resize(cpu_num); + for (int i = 0; i < cpu_num; i++) { + cpu_ids->at(i) = i; } - // sort cpuid as big core first // simple bubble sort - - for (int i = 0; i < cpu_count; i++) { - for (int j = i + 1; j < cpu_count; j++) { - if (cpu_freq[i] < cpu_freq[j]) { + for (int i = 0; i < cpu_num; i++) { + for (int j = i + 1; j < cpu_num; j++) { + if (max_freqs[i] < max_freqs[j]) { // swap - int tmp = cpuids->at(i); - cpuids->at(i) = cpuids->at(j); - cpuids->at(j) = tmp; + int tmp = cpu_ids->at(i); + cpu_ids->at(i) = cpu_ids->at(j); + cpu_ids->at(j) = tmp; } } } // SMP - int mid_max_freq_khz = - (cpu_freq[cpuids->at(0)] + cpu_freq[cpuids->at(cpu_count - 1)]) / 2; + int mid_max_freq = + (max_freqs[cpu_ids->at(0)] + max_freqs[cpu_ids->at(cpu_num - 1)]) / 2; - for (int i = 0; i < cpu_count; i++) { - cpuids->at(i) = i; - if (cpu_freq[i] >= mid_max_freq_khz) { + for (int i = 0; i < cpu_num; i++) { + cpu_ids->at(i) = i; + if (max_freqs[i] >= mid_max_freq) { cluster_ids->at(i) = 0; } else { cluster_ids->at(i) = 1; } } - return 0; } -int check_online(const std::vector& core_ids) { - if (core_ids.size() == 0) { - return 0; +void get_cpu_cache_size(int cpu_id, int* l1_cache_size, int* l2_cache_size, + int* l3_cache_size) { + int max_cache_idx_num = 10; + *l1_cache_size = DEFAULT_L1_CACHE_SIZE; + *l2_cache_size = DEFAULT_L2_CACHE_SIZE; + *l3_cache_size = DEFAULT_L3_CACHE_SIZE; + for (int i = 0; i < max_cache_idx_num; i++) { + char path[256]; + snprintf(path, sizeof(path), + "/sys/devices/system/cpu/cpu%d/cache/index%d/level", cpu_id, i); + FILE* fp = fopen(path, "rb"); + if (fp) { + int level = -1; + fscanf(fp, "%d", &level); + fclose(fp); + snprintf(path, sizeof(path), + "/sys/devices/system/cpu/cpu%d/cache/index%d/size", cpu_id, i); + fp = fopen(path, "rb"); + if (fp) { + int size = -1; + fscanf(fp, "%d", &size); + fclose(fp); + if (size >= 0) { + if (level == 1) { + *l1_cache_size = size * 1024; + } else if (level == 2) { + *l2_cache_size = size * 1024; + } else if (level == 3) { + *l3_cache_size = size * 1024; + } + } + } + } + } +} + +bool check_cpu_online(const std::vector& cpu_ids) { + if (cpu_ids.size() == 0) { + return false; } char path[256]; - int online = 1; - for (int i = 0; i < core_ids.size(); ++i) { + bool all_online = true; + for (int i = 0; i < cpu_ids.size(); ++i) { snprintf(path, sizeof(path), "/sys/devices/system/cpu/cpu%d/online", - core_ids[i]); + cpu_ids[i]); FILE* fp = fopen(path, "rb"); - if (!fp) { - return 0; + int is_online = 0; + if (fp) { + fscanf(fp, "%d", &is_online); + fclose(fp); + } else { + LOG(ERROR) << "Failed to query the online statue of CPU id:" + << cpu_ids[i]; + } + if (is_online == 0) { + all_online = false; + LOG(ERROR) << "CPU id:" << cpu_ids[i] << " is offine"; } - int cur_online = 0; - fscanf(fp, "%d", &cur_online); - online &= cur_online; - fclose(fp); } - return online; + return all_online; } -int set_sched_affinity(const std::vector& cpuids) { +int set_sched_affinity(const std::vector& cpu_ids) { // #define CPU_SETSIZE 1024 // #define __NCPUBITS (8 * sizeof (unsigned long)) // typedef struct @@ -608,20 +389,569 @@ int set_sched_affinity(const std::vector& cpuids) { #endif cpu_set_t mask; CPU_ZERO(&mask); - for (int i = 0; i < cpuids.size(); i++) { - CPU_SET(cpuids[i], &mask); + for (int i = 0; i < cpu_ids.size(); ++i) { + CPU_SET(cpu_ids[i], &mask); } - int syscallret = syscall(__NR_sched_setaffinity, pid, sizeof(mask), &mask); if (syscallret) { - LOG(ERROR) << "syscall error " << syscallret; return -1; } + return 0; +} + +bool bind_threads(const std::vector cpu_ids) { +#ifdef ARM_WITH_OMP + int thread_num = cpu_ids.size(); + omp_set_num_threads(thread_num); + std::vector ssarets; + for (int i = 0; i < thread_num; ++i) { + ssarets.push_back(0); + } +#pragma omp parallel for + for (int i = 0; i < thread_num; i++) { + ssarets[i] = set_sched_affinity(cpu_ids); + } + for (int i = 0; i < thread_num; i++) { + if (ssarets[i] != 0) { + LOG(ERROR) << "Set cpu affinity failed, core id: " << cpu_ids[i]; + return false; + } + } +#else // ARM_WITH_OMP + std::vector first_cpu_id; + first_cpu_id.push_back(cpu_ids[0]); + int ssaret = set_sched_affinity(first_cpu_id); + if (ssaret != 0) { + LOG(ERROR) << "Set cpu affinity failed, core id: " << cpu_ids[0]; + return false; + } +#endif // ARM_WITH_OMP +} + +#endif // LITE_WITH_LINUX + +// cache_id : 0 -> L1, 1 -> L2, 2 -> L3 +void DeviceInfo::SetCacheInfo(int cache_id, int argc, ...) { + va_list arg_ptr; + va_start(arg_ptr, argc); + std::vector* cache; + switch (cache_id) { + case 0: + cache = &L1_cache_; + break; + case 1: + cache = &L2_cache_; + break; + case 2: + cache = &L3_cache_; + break; + default: + break; + } + cache->resize(core_num_); + if (argc == 1) { + int cache_size = va_arg(arg_ptr, int); + for (int i = 0; i < core_num_; ++i) { + (*cache)[i] = cache_size; + } + } else { + int big_core_num = big_core_ids_.size(); + int little_core_num = little_core_ids_.size(); + int big_core_cache_size = va_arg(arg_ptr, int); + int little_core_cache_size = va_arg(arg_ptr, int); + for (int i = 0; i < big_core_num; ++i) { + (*cache)[big_core_ids_[i]] = big_core_cache_size; + } + for (int i = 0; i < little_core_num; ++i) { + (*cache)[little_core_ids_[i]] = little_core_cache_size; + } + } + va_end(arg_ptr); +} + +void DeviceInfo::SetArchInfo(int argc, ...) { + va_list arg_ptr; + va_start(arg_ptr, argc); + archs_.resize(core_num_); + if (argc == 1) { + ARMArch arch = (ARMArch)va_arg(arg_ptr, int); + for (int i = 0; i < core_num_; ++i) { + archs_[i] = arch; + } + } else { + ARMArch big_core_arch = (ARMArch)va_arg(arg_ptr, int); + ARMArch little_core_arch = (ARMArch)va_arg(arg_ptr, int); + int big_core_num = big_core_ids_.size(); + int little_core_num = little_core_ids_.size(); + for (int i = 0; i < big_core_num; ++i) { + archs_[big_core_ids_[i]] = big_core_arch; + } + for (int i = 0; i < little_core_num; ++i) { + archs_[little_core_ids_[i]] = little_core_arch; + } + } + va_end(arg_ptr); +} + +bool DeviceInfo::SetCPUInfoByName() { + /* Snapdragon */ + if (dev_name_.find("SM8150") != std::string::npos) { // 855 + core_num_ = 8; + core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7}; + big_core_ids_ = {4, 5, 6, 7}; + little_core_ids_ = {0, 1, 2, 3}; + cluster_ids_ = {1, 1, 1, 1, 0, 0, 0, 0}; + SetArchInfo(2, kA76, kA55); + SetCacheInfo(0, 2, 64 * 1024, 32 * 1024); + SetCacheInfo(1, 2, 256 * 1024, 128 * 1024); + SetCacheInfo(2, 1, 2048 * 1024); + return true; + } else if (dev_name_.find("SDM845") != std::string::npos) { // 845 + core_num_ = 8; + core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7}; + big_core_ids_ = {4, 5, 6, 7}; + little_core_ids_ = {0, 1, 2, 3}; + cluster_ids_ = {1, 1, 1, 1, 0, 0, 0, 0}; + SetArchInfo(2, kA75, kA55); + SetCacheInfo(0, 2, 64 * 1024, 32 * 1024); + SetCacheInfo(1, 2, 256 * 1024, 128 * 1024); + SetCacheInfo(2, 1, 2048 * 1024); + return true; + } else if (dev_name_.find("SDM710") != std::string::npos) { // 710 + core_num_ = 8; + core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7}; + big_core_ids_ = {6, 7}; + little_core_ids_ = {0, 1, 2, 3, 4, 5}; + cluster_ids_ = {1, 1, 1, 1, 1, 1, 0, 0}; + SetArchInfo(2, kA75, kA55); + SetCacheInfo(0, 2, 64 * 1024, 32 * 1024); + SetCacheInfo(1, 2, 256 * 1024, 128 * 1024); + SetCacheInfo(2, 1, 1024 * 1024); + return true; + } else if (dev_name_.find("MSM8998") != std::string::npos) { // 835 + core_num_ = 8; + core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7}; + big_core_ids_ = {4, 5, 6, 7}; + little_core_ids_ = {0, 1, 2, 3}; + cluster_ids_ = {1, 1, 1, 1, 0, 0, 0, 0}; + SetArchInfo(2, kA73, kA53); + SetCacheInfo(0, 2, 64 * 1024, 32 * 1024); + SetCacheInfo(1, 2, 1024 * 1024, + /*real cache size is 2M, while that will get bad performace + on conv3x3s1 or gemm, set to 1M or 512K*/ + 1024 * 1024); + return true; + } else if (dev_name_.find("MSM8996") != std::string::npos) { // 820 + core_num_ = 4; + core_ids_ = {0, 1, 2, 3}; + big_core_ids_ = {2, 3}; + little_core_ids_ = {0, 1}; + cluster_ids_ = {1, 1, 0, 0}; + SetArchInfo(1, kA72); + SetCacheInfo(0, 1, 24 * 1024); + SetCacheInfo(1, 2, 1024 * 1024, 512 * 1024); + return true; + } else if (dev_name_.find("SDM660") != std::string::npos || + dev_name_.find("SDM636") != std::string::npos) { // 660, 636 + core_num_ = 8; + core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7}; + big_core_ids_ = {4, 5, 6, 7}; + little_core_ids_ = {0, 1, 2, 3}; + cluster_ids_ = {1, 1, 1, 1, 0, 0, 0, 0}; + SetArchInfo(1, kA73); + SetCacheInfo(0, 2, 64 * 1024, 32 * 1024); + SetCacheInfo(1, 1, 1024 * 1024); + return true; + } else if (dev_name_.find("MSM8976") != std::string::npos) { // 652,653 + core_num_ = 8; + core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7}; + big_core_ids_ = {4, 5, 6, 7}; + little_core_ids_ = {0, 1, 2, 3}; + cluster_ids_ = {1, 1, 1, 1, 0, 0, 0, 0}; + SetArchInfo(2, kA72, kA53); + SetCacheInfo(0, 1, 32 * 1024); + SetCacheInfo(1, 2, 1024 * 1024, 512 * 1024); + return true; + } else if (dev_name_.find("MSM8953") != std::string::npos) { // 625 + core_num_ = 8; + core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7}; + big_core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7}; + little_core_ids_ = {}; + cluster_ids_ = {0, 0, 0, 0, 0, 0, 0, 0}; + SetArchInfo(1, kA53); + SetCacheInfo(0, 1, 32 * 1024); + SetCacheInfo(1, 1, 1024 * 1024); + return true; + } else if (dev_name_.find("MSM8939") != std::string::npos) { // 615 + core_num_ = 8; + core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7}; + big_core_ids_ = {0, 1, 2, 3}; + little_core_ids_ = {4, 5, 6, 7}; + cluster_ids_ = {0, 0, 0, 0, 1, 1, 1, 1}; + SetArchInfo(1, kA53); + SetCacheInfo(0, 1, 32 * 1024); + SetCacheInfo(1, 2, 512 * 1024, 256 * 1024); + return true; + /* MediaTek */ + } else if (dev_name_.find("MT6797") != + std::string::npos) { // X20/X23/X25/X27 + core_num_ = 10; + core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; + big_core_ids_ = {8, 9}; + little_core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7}; + cluster_ids_ = {1, 1, 1, 1, 1, 1, 1, 1, 0, 0}; + SetArchInfo(2, kA72, kA53); + SetCacheInfo(0, 1, 32 * 1024); + SetCacheInfo(1, 2, 1024 * 1024, 512 * 1024); + return true; + } else if (dev_name_.find("MT6799") != std::string::npos) { // X30 + core_num_ = 10; + core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; + big_core_ids_ = {8, 9}; + little_core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7}; + cluster_ids_ = {1, 1, 1, 1, 1, 1, 1, 1, 0, 0}; + SetArchInfo(2, kA73, kA53); + return true; + } else if (dev_name_.find("MT6795") != std::string::npos || + dev_name_.find("MT6762") != std::string::npos || + dev_name_.find("MT6755T") != std::string::npos || + dev_name_.find("MT6755S") != std::string::npos || + dev_name_.find("MT6753") != std::string::npos || + dev_name_.find("MT6752") != std::string::npos || + dev_name_.find("MT6750") != std::string::npos) { + // X10, P22, P15/P18, MT6753, MT6752/MT6752M, MT6750 + core_num_ = 8; + core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7}; + big_core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7}; + little_core_ids_ = {}; + cluster_ids_ = {0, 0, 0, 0, 0, 0, 0, 0}; + SetArchInfo(1, kA53); + return true; + } else if (dev_name_.find("MT6758") != std::string::npos || + dev_name_.find("MT6757") != std::string::npos || + dev_name_.find("MT6763") != std::string::npos || + dev_name_.find("MT6755M") != std::string::npos || + dev_name_.find("MT6755") != + std::string::npos) { // P30, P20/P25, P23, P10 + core_num_ = 8; + core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7}; + big_core_ids_ = {4, 5, 6, 7}; + little_core_ids_ = {0, 1, 2, 3}; + cluster_ids_ = {1, 1, 1, 1, 0, 0, 0, 0}; + SetArchInfo(1, kA53); + return true; + } else if (dev_name_.find("MT6771") != std::string::npos) { // P60 + core_num_ = 8; + core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7}; + big_core_ids_ = {4, 5, 6, 7}; + little_core_ids_ = {0, 1, 2, 3}; + cluster_ids_ = {1, 1, 1, 1, 0, 0, 0, 0}; + SetArchInfo(2, kA73, kA53); + return true; + } else if (dev_name_.find("MT6765") != std::string::npos || + dev_name_.find("MT6739") != std::string::npos || + dev_name_.find("MT6738") != std::string::npos || + dev_name_.find("MT6737") != + std::string::npos) { // A22, MT6739, MT6738, MT6767 + core_num_ = 4; + core_ids_ = {0, 1, 2, 3}; + big_core_ids_ = {0, 1, 2, 3}; + little_core_ids_ = {}; + cluster_ids_ = {0, 0, 0, 0}; + SetArchInfo(1, kA53); + return true; + } + return false; +} + +void DeviceInfo::SetCPUInfoByProb() { +#ifdef LITE_WITH_LINUX + // get big.LITTLE cores by sorting CPU frequency + sort_cpuid_by_max_freq(max_freqs_, &core_ids_, &cluster_ids_); + big_core_ids_.clear(); + little_core_ids_.clear(); + for (int i = 0; i < cluster_ids_.size(); ++i) { + if (cluster_ids_[i] == 0) { + big_core_ids_.push_back(core_ids_[i]); + } else { + little_core_ids_.push_back(core_ids_[i]); + } + } + // get l1, l2, l3 cache size for each core + for (int i = 0; i < core_num_; i++) { + get_cpu_cache_size(i, &(L1_cache_[i]), &(L2_cache_[i]), &(L3_cache_[i])); + } +#endif // LITE_WITH_LINUX +} + +void DeviceInfo::RequestPowerFullMode(const int thread_num) { + int big_core_size = big_core_ids_.size(); + int little_core_size = little_core_ids_.size(); + active_ids_.clear(); + for (int i = 0; i < thread_num; ++i) { + if (i < big_core_size) { + active_ids_.push_back(big_core_ids_[i]); + } else if (i < big_core_size + little_core_size) { + active_ids_.push_back(little_core_ids_[i - big_core_size]); + } + } + mode_ = LITE_POWER_FULL; +} + +void DeviceInfo::RequestPowerHighMode(const int thread_num) { + int big_core_size = big_core_ids_.size(); + int little_core_size = little_core_ids_.size(); + active_ids_.clear(); + if (big_core_size > 0) { + mode_ = LITE_POWER_HIGH; + if (thread_num > big_core_size) { + LOG(ERROR) << "Request thread num: " << thread_num + << ", exceed the big cores size: " << big_core_size + << ", truncate thread num to " << big_core_size; + active_ids_ = big_core_ids_; + } else { + for (int i = 0; i < thread_num; ++i) { + active_ids_.push_back(big_core_ids_[i]); + } + } + } else { + mode_ = LITE_POWER_LOW; + LOG(ERROR) << "HIGH POWER MODE is not support, switch to little cores."; + if (thread_num > little_core_size) { + active_ids_ = little_core_ids_; + } else { + for (int i = 0; i < thread_num; ++i) { + active_ids_.push_back(little_core_ids_[i]); + } + } + } +} + +void DeviceInfo::RequestPowerLowMode(const int thread_num) { + int big_core_size = big_core_ids_.size(); + int little_core_size = little_core_ids_.size(); + active_ids_.clear(); + if (little_core_size > 0) { + mode_ = LITE_POWER_LOW; + if (thread_num > little_core_size) { + LOG(WARNING) << "Request thread num: " << thread_num + << ", exceed the little cores size: " << little_core_size + << ", truncate thread num to " << little_core_size; + active_ids_ = little_core_ids_; + } else { + for (int i = 0; i < thread_num; i++) { + active_ids_.push_back(little_core_ids_[i]); + } + } + } else { + mode_ = LITE_POWER_HIGH; + LOG(WARNING) << "LOW POWER MODE is not support, switch to big cores"; + if (thread_num > big_core_size) { + active_ids_ = big_core_ids_; + } else { + for (int i = 0; i < thread_num; i++) { + active_ids_.push_back(big_core_ids_[i]); + } + } + } +} + +void DeviceInfo::RequestPowerNoBindMode(const int thread_num) { + active_ids_.clear(); + for (int i = 0; i < thread_num; i++) { + active_ids_.push_back(0); + } + mode_ = LITE_POWER_NO_BIND; +} + +void DeviceInfo::RequestPowerRandHighMode(const int shift_num, + const int thread_num) { + int big_core_size = big_core_ids_.size(); + int little_core_size = little_core_ids_.size(); + if (big_core_size > 0) { + mode_ = LITE_POWER_RAND_HIGH; + if (thread_num > big_core_size) { + LOG(WARNING) << "Request thread num: " << thread_num + << ", exceed the big cores size: " << big_core_size + << ", truncate thread num to " << big_core_size; + active_ids_ = big_core_ids_; + } else { + for (int i = 0; i < thread_num; ++i) { + active_ids_.push_back(big_core_ids_[(i + shift_num) % big_core_size]); + } + } + } else { + mode_ = LITE_POWER_LOW; + LOG(WARNING) << "HIGH POWER MODE is not support, switch to little cores."; + if (thread_num > little_core_size) { + active_ids_ = little_core_ids_; + } else { + for (int i = 0; i < thread_num; ++i) { + active_ids_.push_back(little_core_ids_[i]); + } + } + } +} + +void DeviceInfo::RequestPowerRandLowMode(const int shift_num, + const int thread_num) { + int big_core_size = big_core_ids_.size(); + int little_core_size = little_core_ids_.size(); + active_ids_.clear(); + if (little_core_size > 0) { + mode_ = LITE_POWER_RAND_LOW; + if (thread_num > little_core_size) { + LOG(WARNING) << "Request thread num: " << thread_num + << ", exceed the little cores size: " << little_core_size + << ", truncate thread num to " << little_core_size; + active_ids_ = little_core_ids_; + } else { + for (int i = 0; i < thread_num; ++i) { + active_ids_.push_back( + little_core_ids_[(i + shift_num) % little_core_size]); + } + } + } else { + mode_ = LITE_POWER_HIGH; + LOG(WARNING) << "LOW POWER MODE is not support, switch to big cores."; + if (thread_num > big_core_size) { + active_ids_ = big_core_ids_; + } else { + for (int i = 0; i < thread_num; ++i) { + active_ids_.push_back(big_core_ids_[i]); + } + } + } +} +int DeviceInfo::Setup() { + core_num_ = get_cpu_num(); + mem_size_ = get_mem_size(); + get_cpu_arch(&archs_, core_num_); + // set defalut CPU info + SetCacheInfo(0, DEFAULT_L1_CACHE_SIZE); + SetCacheInfo(1, DEFAULT_L2_CACHE_SIZE); + SetCacheInfo(2, DEFAULT_L3_CACHE_SIZE); +#ifdef LITE_WITH_LINUX + // get max&min freq + max_freqs_.resize(core_num_); + min_freqs_.resize(core_num_); + for (int i = 0; i < core_num_; ++i) { + int max_freq, min_freq; + get_cpu_max_min_freq(i, &max_freq, &min_freq); + max_freqs_[i] = max_freq / 1000; + min_freqs_[i] = min_freq / 1000; + } + // get cache size and big.LITTLE core ids + dev_name_ = get_cpu_name(); + if (!SetCPUInfoByName()) { + SetCPUInfoByProb(); + } + // output info + LOG(INFO) << "ARM multiprocessors name: " << dev_name_; + LOG(INFO) << "ARM multiprocessors number: " << core_num_; + for (int i = 0; i < core_num_; ++i) { + LOG(INFO) << "ARM multiprocessors ID: " << core_ids_[i] + << ", max freq: " << max_freqs_[i] + << ", min freq: " << min_freqs_[i] + << ", cluster ID: " << cluster_ids_[core_ids_[i]] + << ", CPU ARCH: A" << archs_[i]; + } + LOG(INFO) << "L1 DataCache size is: "; + for (int i = 0; i < core_num_; ++i) { + LOG(INFO) << L1_cache_[i] / 1024 << " KB"; + } + LOG(INFO) << "L2 Cache size is: "; + for (int i = 0; i < core_num_; ++i) { + LOG(INFO) << L2_cache_[i] / 1024 << " KB"; + } + LOG(INFO) << "Total memory: " << mem_size_ << "KB"; +#endif + // set default run mode + SetRunMode(LITE_POWER_NO_BIND, 1); // use single thread by default return 0; } +void DeviceInfo::SetRunMode(PowerMode mode, int thread_num) { +#ifdef ARM_WITH_OMP + thread_num = std::min(thread_num, core_num_); +#else + thread_num = 1; // force thread_num to 1 if OpenMP is disabled +#endif +#ifdef LITE_WITH_LINUX + int big_core_size = big_core_ids_.size(); + int little_core_size = little_core_ids_.size(); + int big_little_core_size = big_core_size + little_core_size; + thread_num = std::min(thread_num, big_little_core_size); + count_++; + int shift_num = (count_ / 10) % big_core_size; + switch (mode) { + case LITE_POWER_FULL: + RequestPowerFullMode(thread_num); + break; + case LITE_POWER_HIGH: + RequestPowerHighMode(thread_num); + break; + case LITE_POWER_LOW: + RequestPowerLowMode(thread_num); + break; + case LITE_POWER_NO_BIND: + RequestPowerNoBindMode(thread_num); + break; + case LITE_POWER_RAND_HIGH: + RequestPowerRandHighMode(shift_num, thread_num); + break; + case LITE_POWER_RAND_LOW: + RequestPowerRandLowMode(shift_num, thread_num); + break; + default: + LOG(FATAL) << "Unsupported power mode: " << mode; + break; + } + if (active_ids_.size() == 0) { + active_ids_.push_back(0); + } +#ifdef ARM_WITH_OMP + omp_set_num_threads(active_ids_.size()); +#endif + if (mode_ != LITE_POWER_NO_BIND) { + if (check_cpu_online(active_ids_)) { + bind_threads(active_ids_); + } else { + LOG(WARNING) << "Some cores are offline, switch to NO BIND MODE"; + mode_ = LITE_POWER_NO_BIND; + } + } +#else // LITE_WITH_LINUX + // only LITE_POWER_NO_BIND is supported in other OS + RequestPowerNoBindMode(thread_num); +#ifdef ARM_WITH_OMP + omp_set_num_threads(active_ids_.size()); +#endif #endif // LITE_WITH_LINUX + //! alloc memory for sgemm in this context + workspace_.Resize( + {static_cast(L2_cache_[active_ids_[0]] / sizeof(float))}); + arch_ = archs_[active_ids_[0]]; +} + +void DeviceInfo::SetCache(int l1size, int l2size, int l3size) { + SetCacheInfo(0, l1size); + SetCacheInfo(1, l2size); + SetCacheInfo(2, l3size); + workspace_.Resize({2 * (l1size + l2size)}); +} + +bool DeviceInfo::ExtendWorkspace(DDimLite dims) { + auto count = dims.product(); + auto old = workspace_.dims(); + if (count == old.product()) { + return false; + } + workspace_.Resize({static_cast( + count + L2_cache_[active_ids_[0]] / sizeof(float))}); + return true; +} #endif // LITE_WITH_ARM diff --git a/paddle/fluid/lite/core/cpu_info.h b/paddle/fluid/lite/core/cpu_info.h index 385954e6d8e480cbc17fc2ec467f88d7d24331fb..3c89d5eb43b972ea1e4ad071b2a9d8718d28c35a 100644 --- a/paddle/fluid/lite/core/cpu_info.h +++ b/paddle/fluid/lite/core/cpu_info.h @@ -14,24 +14,12 @@ #pragma once +#include #include #include +#include "paddle/fluid/lite/core/lite_tensor.h" #include "paddle/fluid/lite/utils/cp_logging.h" -#ifdef LITE_WITH_LINUX -#include -#include -#endif - -#if __APPLE__ -#include "TargetConditionals.h" -#if TARGET_OS_IPHONE -#include -#include -#include -#endif // TARGET_OS_IPHONE -#endif // __APPLE__ - namespace paddle { namespace lite { @@ -60,64 +48,73 @@ typedef enum { class DeviceInfo { public: - int idx_; - int max_freq_; - int min_freq_; - int generate_arch_; - int compute_core_num_; - int max_memory_; - int sharemem_size_; - - std::string device_name_; - std::string compute_ability_; - - std::vector L1_cache_; - std::vector L2_cache_; - std::vector L3_cache_; - std::vector core_ids_; - std::vector big_core_ids_; - std::vector little_core_ids_; - std::vector cluster_ids_; - std::vector archs_; - static DeviceInfo& Global() { static auto* x = new DeviceInfo; return *x; } - static void Init() { - auto& info = Global(); - InitInternal(&info); + static int Init() { + static int ret = Global().Setup(); + return ret; } - private: - DeviceInfo() = default; - static void InitInternal(DeviceInfo* dev); -}; + int Setup(); -size_t arm_get_meminfo(); + void SetRunMode(PowerMode mode, int thread_num); + void SetCache(int l1size, int l2size, int l3size); + void SetArch(ARMArch arch) { arch_ = arch; } -int arm_get_cpucount(); + PowerMode mode() const { return mode_; } + int threads() const { return active_ids_.size(); } + ARMArch arch() const { return arch_; } + int l1_cache_size() const { return L1_cache_[active_ids_[0]]; } + int l2_cache_size() const { return L2_cache_[active_ids_[0]]; } + int l3_cache_size() const { return L3_cache_[active_ids_[0]]; } -void arm_get_cpu_arch(std::vector* archs); - -bool get_cpu_info_from_name(DeviceInfo* cpu_info, std::string hardware_name); - -#ifdef LITE_WITH_LINUX - -void set_default_cache(DeviceInfo* dev); + template + T* workspace_data() { + return workspace_.mutable_data(); + } + bool ExtendWorkspace(DDimLite dims); -std::string arm_get_cpu_name(); + private: + int core_num_; + std::vector max_freqs_; + std::vector min_freqs_; + int mem_size_; + std::string dev_name_; -int get_max_freq_khz(int cpuid); + std::vector L1_cache_; + std::vector L2_cache_; + std::vector L3_cache_; + std::vector core_ids_; + std::vector big_core_ids_; + std::vector little_core_ids_; + std::vector cluster_ids_; + std::vector archs_; -int arm_sort_cpuid_by_max_frequency(int cpu_count, std::vector* cpuids, - const std::vector& cpu_freq, - std::vector* cluster_ids); -int check_online(const std::vector& core_ids); -int set_sched_affinity(const std::vector& cpuids); + ARMArch arch_; + // LITE_POWER_HIGH stands for using big cores, + // LITE_POWER_LOW stands for using small core, + // LITE_POWER_FULL stands for using all cores + PowerMode mode_; + std::vector active_ids_; + TensorLite workspace_; + int64_t count_{0}; + + void SetCacheInfo(int cache_id, int argc, ...); + void SetArchInfo(int argc, ...); + bool SetCPUInfoByName(); + void SetCPUInfoByProb(); + void RequestPowerFullMode(const int thread_num); + void RequestPowerHighMode(const int thread_num); + void RequestPowerLowMode(const int thread_num); + void RequestPowerNoBindMode(const int thread_num); + void RequestPowerRandHighMode(const int shift_num, const int thread_num); + void RequestPowerRandLowMode(const int shift_num, const int thread_num); -#endif // LITE_WITH_LINUX + DeviceInfo() = default; +}; #endif // LITE_WITH_ARM diff --git a/paddle/fluid/lite/kernels/arm/conv_compute.cc b/paddle/fluid/lite/kernels/arm/conv_compute.cc index 4ac6cd4b76121dca1ba9dc2fde541d32f1b377c0..af8f8e1242a32f58727ad1658b7db2cefbc1b653 100644 --- a/paddle/fluid/lite/kernels/arm/conv_compute.cc +++ b/paddle/fluid/lite/kernels/arm/conv_compute.cc @@ -28,8 +28,6 @@ void ConvCompute::PrepareForRun() { auto o_dims = param.output->dims(); auto& ctx = this->ctx_->template As(); - // TODO(xxx): make api and expose it - ctx.SetRunMode(LITE_POWER_HIGH, 4); int win = x_dims[3]; // nchw int hin = x_dims[2]; diff --git a/paddle/fluid/lite/kernels/arm/fc_compute.cc b/paddle/fluid/lite/kernels/arm/fc_compute.cc index c7a9269b5f9af40e89a8e58e1363c1b131f81ac4..2e6f46a0e07e422bb118834214fee3fc43ae1d61 100644 --- a/paddle/fluid/lite/kernels/arm/fc_compute.cc +++ b/paddle/fluid/lite/kernels/arm/fc_compute.cc @@ -28,7 +28,6 @@ void FcCompute::PrepareForRun() { auto w_dims = param.w->dims(); auto& ctx = this->ctx_->template As(); - ctx.SetRunMode(LITE_POWER_HIGH, 4); CHECK_GE(x_dims.size(), 2UL); CHECK_EQ(w_dims.size(), 2UL); diff --git a/paddle/fluid/lite/kernels/arm/mul_compute.cc b/paddle/fluid/lite/kernels/arm/mul_compute.cc index a176086a4cae61e2dc4ab2dec035c25a6df4b512..57c28e63bbf3bfdacf861d60ba2ab25436b61b42 100644 --- a/paddle/fluid/lite/kernels/arm/mul_compute.cc +++ b/paddle/fluid/lite/kernels/arm/mul_compute.cc @@ -24,7 +24,6 @@ namespace arm { void MulCompute::PrepareForRun() { auto& ctx = this->ctx_->template As(); - ctx.SetRunMode(LITE_POWER_HIGH, 4); } void MulCompute::Run() { diff --git a/paddle/fluid/lite/kernels/arm/pool_compute.cc b/paddle/fluid/lite/kernels/arm/pool_compute.cc index ea3d47a268588f7d593f0c3ac58f3421d9456fa8..3ee82ae6303f849a11d8685aae09b267bb991604 100644 --- a/paddle/fluid/lite/kernels/arm/pool_compute.cc +++ b/paddle/fluid/lite/kernels/arm/pool_compute.cc @@ -26,7 +26,6 @@ namespace arm { void PoolCompute::PrepareForRun() { auto& ctx = this->ctx_->template As(); - ctx.SetRunMode(LITE_POWER_HIGH, 4); } void PoolCompute::Run() { diff --git a/paddle/fluid/lite/kernels/x86/CMakeLists.txt b/paddle/fluid/lite/kernels/x86/CMakeLists.txt index 35c61376153e64690f40836812079a20c6c4dc49..f66818b2e9dacd8e8aae3506a2f4f12b1b117cdb 100644 --- a/paddle/fluid/lite/kernels/x86/CMakeLists.txt +++ b/paddle/fluid/lite/kernels/x86/CMakeLists.txt @@ -17,6 +17,7 @@ cc_library(dropout_compute_x86 SRCS dropout_compute.cc DEPS ${lite_kernel_deps} cc_library(concat_compute_x86 SRCS concat_compute.cc DEPS ${lite_kernel_deps} ) cc_library(conv_compute_x86 SRCS conv_compute.cc DEPS ${lite_kernel_deps} blas im2col vol2col) cc_library(pool_compute_x86 SRCS pool_compute.cc DEPS ${lite_kernel_deps} pooling) +cc_library(batch_norm_compute_x86 SRCS batch_norm_compute.cc DEPS ${lite_kernel_deps}) lite_cc_test(test_fc_compute_x86 SRCS fc_compute_test.cc DEPS fc_compute_x86) lite_cc_test(test_conv2d_compute_x86 SRCS conv_compute_test.cc DEPS conv_compute_x86) @@ -28,6 +29,7 @@ lite_cc_test(test_relu_compute_x86 SRCS relu_compute_test.cc DEPS relu_compute_x lite_cc_test(test_mul_compute_x86 SRCS mul_compute_test.cc DEPS mul_compute_x86 operator) lite_cc_test(test_scale_compute_x86 SRCS scale_compute_test.cc DEPS scale_compute_x86) lite_cc_test(test_dropout_compute_x86 SRCS dropout_compute_test.cc DEPS dropout_compute_x86) +lite_cc_test(test_batch_norm_compute_x86 SRCS batch_norm_compute_test.cc DEPS batch_norm_compute_x86) set(x86_kernels @@ -44,6 +46,7 @@ set(x86_kernels concat_compute_x86 conv_compute_x86 pool_compute_x86 + batch_norm_compute_x86 ) set(x86_kernels "${x86_kernels}" CACHE INTERNAL "x86 kernels") diff --git a/paddle/fluid/lite/kernels/x86/batch_norm_compute.cc b/paddle/fluid/lite/kernels/x86/batch_norm_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..008d239801405b13dadbee82d08405d9283d4ac0 --- /dev/null +++ b/paddle/fluid/lite/kernels/x86/batch_norm_compute.cc @@ -0,0 +1,30 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/lite/kernels/x86/batch_norm_compute.h" + +REGISTER_LITE_KERNEL(batch_norm, kX86, kFloat, kNCHW, + paddle::lite::kernels::x86::BatchNormCompute, def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindInput("Scale", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindInput("Mean", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindInput("Variance", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindOutput("Y", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindOutput("MeanOut", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindOutput("VarianceOut", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindOutput("MeanOut", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindOutput("SavedMean", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindOutput("SavedVariance", {LiteType::GetTensorTy(TARGET(kX86))}) + .Finalize(); diff --git a/paddle/fluid/lite/kernels/x86/batch_norm_compute.h b/paddle/fluid/lite/kernels/x86/batch_norm_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..e9cf55d208d017c17896731105d3c0d60283dfe7 --- /dev/null +++ b/paddle/fluid/lite/kernels/x86/batch_norm_compute.h @@ -0,0 +1,158 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +#include +#include +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/lite/core/kernel.h" +#include "paddle/fluid/lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +template +using EigenArrayMap = + Eigen::Map>; +template +using ConstEigenArrayMap = + Eigen::Map>; +template +using EigenVectorArrayMap = Eigen::Map>; +template +using ConstEigenVectorArrayMap = + Eigen::Map>; + +template +class BatchNormCompute : public KernelLite { + public: + using param_t = operators::BatchNormParam; + void Run() override { + auto ¶m = *param_.get_mutable(); + bool global_stats = param.is_test || param.use_global_stats; + + const auto *x = param.x; + const auto &x_dims = x->dims(); + CHECK(x_dims.size() >= 2 && x_dims.size() <= 5); + const int N = x_dims[0]; + const int C = param.data_layout == DATALAYOUT(kNCHW) + ? x_dims[1] + : x_dims[x_dims.size() - 1]; + const int sample_size = x->dims().production() / N / C; + + // alloc memory + param.y->template mutable_data(); + param.mean_out->template mutable_data(); + param.variance_out->template mutable_data(); + param.saved_mean->template mutable_data(); + param.saved_variance->template mutable_data(); + + if (!global_stats) { + // saved_xx is use just in this batch of data + EigenVectorArrayMap saved_mean_e(param.saved_mean->mutable_data(), + C); + EigenVectorArrayMap saved_variance_e( + param.saved_variance->mutable_data(), C); + saved_mean_e.setZero(); + saved_variance_e.setZero(); + + EigenVectorArrayMap running_mean_arr(param.mean_out->mutable_data(), + C); + EigenVectorArrayMap running_var_arr( + param.variance_out->mutable_data(), C); + + if ((N * sample_size) == 1) { + LOG(WARNING) << "Only 1 element in normalization dimension, " + << "we skip the batch norm calculation, let y = x."; + framework::TensorCopy(x->raw_tensor(), platform::CPUPlace(), + ¶m.y->raw_tensor()); + return; + } + + switch (param.data_layout) { + case DATALAYOUT(kNCHW): { + ConstEigenArrayMap x_arr(x->data(), sample_size, N * C); + for (int nc = 0; nc < N * C; ++nc) { + saved_mean_e(nc % C) += x_arr.col(nc).sum(); + } + saved_mean_e /= N * sample_size; + for (int nc = 0; nc < N * C; ++nc) { + saved_variance_e(nc % C) += + (x_arr.col(nc) - saved_mean_e(nc % C)).matrix().squaredNorm(); + } + saved_variance_e /= N * sample_size; + break; + } + default: + LOG(FATAL) << "Unknown storage order: " + << DataLayoutToStr(param.data_layout); + break; + } + running_mean_arr = running_mean_arr * param.momentum + + saved_mean_e * (1. - param.momentum); + running_var_arr = running_var_arr * param.momentum + + saved_variance_e * (1. - param.momentum); + } + + // use SavedMean and SavedVariance to do normalize + Eigen::Array inv_std(C); + if (global_stats) { + ConstEigenVectorArrayMap var_arr(param.variance->data(), C); + inv_std = (var_arr + param.epsilon).sqrt().inverse(); + } else { + EigenVectorArrayMap saved_inv_std( + param.saved_variance->mutable_data(), C); + // inverse SavedVariance first, gradient will use it too. + saved_inv_std = (saved_inv_std + param.epsilon).inverse().sqrt(); + inv_std = saved_inv_std; + } + + ConstEigenVectorArrayMap mean_arr( + global_stats ? param.mean->data() : param.saved_mean->data(), C); + + // ((x - est_mean) * (inv_var) * scale + bias + // formula transform ====> + // (x * inv_var * scale) + (bias - est_mean * inv_var * scale) + + ConstEigenVectorArrayMap scale_arr(param.scale->data(), C); + ConstEigenVectorArrayMap bias_arr(param.bias->data(), C); + Eigen::Array new_scale = inv_std * scale_arr; + Eigen::Array new_bias = + bias_arr - mean_arr * inv_std * scale_arr; + + switch (param.data_layout) { + case DATALAYOUT(kNCHW): { + EigenArrayMap y_arr(param.y->mutable_data(), sample_size, N * C); + ConstEigenArrayMap x_arr(x->data(), sample_size, N * C); + for (int nc = 0; nc < N * C; ++nc) { + y_arr.col(nc) = x_arr.col(nc) * new_scale(nc % C) + new_bias(nc % C); + } + break; + } + default: + LOG(FATAL) << "Unknown storage order: " + << DataLayoutToStr(param.data_layout); + break; + } + } + virtual ~BatchNormCompute() = default; +}; + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/paddle/fluid/lite/kernels/x86/batch_norm_compute_test.cc b/paddle/fluid/lite/kernels/x86/batch_norm_compute_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..d9c53035db1c73fce642358018dc7b7db139f7c8 --- /dev/null +++ b/paddle/fluid/lite/kernels/x86/batch_norm_compute_test.cc @@ -0,0 +1,139 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/lite/kernels/x86/batch_norm_compute.h" +#include +#include +#include +#include "paddle/fluid/lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +TEST(batch_norm_x86, retrive_op) { + auto batch_norm = + KernelRegistry::Global().Create( + "batch_norm"); + ASSERT_FALSE(batch_norm.empty()); + ASSERT_TRUE(batch_norm.front()); +} + +TEST(batch_norm_x86, init) { + BatchNormCompute batch_norm; + ASSERT_EQ(batch_norm.precision(), PRECISION(kFloat)); + ASSERT_EQ(batch_norm.target(), TARGET(kX86)); +} + +TEST(batch_norm_x86, run_test) { + lite::Tensor x, scale, bias, mean, variance, y, mean_out, variance_out, + saved_mean, saved_variance; + constexpr int batch_size = 2; + std::vector x_shape{batch_size, 3, 64, 64}; + x.Resize(lite::DDim(x_shape)); + + std::vector scale_shape{3}; + scale.Resize(lite::DDim(scale_shape)); + + std::vector bias_shape{3}; + bias.Resize(lite::DDim(bias_shape)); + + std::vector mean_shape{3}; + mean.Resize(lite::DDim(mean_shape)); + + std::vector variance_shape{3}; + variance.Resize(lite::DDim(variance_shape)); + + std::vector y_shape{batch_size, 3, 64, 64}; + y.Resize(lite::DDim(y_shape)); + + std::vector mean_out_shape{3}; + mean_out.Resize(lite::DDim(mean_out_shape)); + + std::vector variance_out_shape{3}; + variance_out.Resize(lite::DDim(variance_out_shape)); + + std::vector saved_mean_shape{3}; + saved_mean.Resize(lite::DDim(saved_mean_shape)); + + std::vector saved_variance_shape{3}; + saved_variance.Resize(lite::DDim(saved_variance_shape)); + + auto x_data = x.mutable_data(); + auto scale_data = scale.mutable_data(); + auto bias_data = bias.mutable_data(); + auto mean_data = mean.mutable_data(); + auto variance_data = variance.mutable_data(); + y.mutable_data(); + mean_out.mutable_data(); + variance_out.mutable_data(); + saved_mean.mutable_data(); + saved_variance.mutable_data(); + + for (int64_t i = 0; i < x.dims().production(); i++) { + x_data[i] = static_cast(i); + } + for (int i = 0; i < scale.dims().production(); i++) { + scale_data[i] = static_cast(i) * 0.01f + 0.03f; + } + for (int i = 0; i < bias.dims().production(); i++) { + bias_data[i] = static_cast(i) * 0.065f + 0.1f; + } + for (int i = 0; i < mean.dims().production(); i++) { + mean_data[i] = static_cast(i) * 0.0565f; + } + for (int i = 0; i < variance.dims().production(); i++) { + variance_data[i] = static_cast(i) * 2.08f + 1.5f; + } + // BatchNormCompute batch_norm; + BatchNormCompute batch_norm; + operators::BatchNormParam param; + + param.x = &x; + param.is_test = false; + param.scale = &scale; + param.bias = &bias; + param.mean = &mean; + param.variance = &variance; + param.use_global_stats = false; + param.epsilon = 1e-4f; + param.momentum = 0.9f; + param.y = &y; + param.mean_out = &mean_out; + param.variance_out = &variance_out; + param.saved_mean = &saved_mean; + param.saved_variance = &saved_variance; + + batch_norm.SetParam(param); + batch_norm.Run(); + + LOG(INFO) << "output: " << y; + LOG(INFO) << "mean_out: " << mean_out; + LOG(INFO) << "variance_out: " << mean_out; + LOG(INFO) << "saved_mean: " << saved_mean; + LOG(INFO) << "saved_variance: " << saved_variance; + + /*for (int i = 0; i < y.dims().production(); i++) { + if(i < 5 || i > y.dims().production() - 5) + LOG(INFO) << y_data[i]; + }*/ +} + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle + +USE_LITE_KERNEL(batch_norm, kX86, kFloat, kNCHW, def); diff --git a/paddle/fluid/lite/tools/build.sh b/paddle/fluid/lite/tools/build.sh index 70a23ecf691ccb9667509868ea774ddc6b0659a0..b66efe8959e9a2ab7bfb5dabee73243b2d4fac1b 100755 --- a/paddle/fluid/lite/tools/build.sh +++ b/paddle/fluid/lite/tools/build.sh @@ -135,8 +135,8 @@ function test_arm_model { adb -s emulator-${port} push ${model_dir} ${adb_work_dir} adb -s emulator-${port} push ${testpath} ${adb_work_dir} adb -s emulator-${port} shell chmod +x "${adb_work_dir}/${test_name}" - local adb_model_path="./${adb_work_dir}/`basename ${model_dir}`" - adb -s emulator-${port} shell "./${adb_work_dir}/${test_name} --eval_model_dir=$adb_model_path" + local adb_model_path="${adb_work_dir}/`basename ${model_dir}`" + adb -s emulator-${port} shell "${adb_work_dir}/${test_name} --eval_model_dir=$adb_model_path" } @@ -225,16 +225,11 @@ function test_arm { for _test in $(cat $TESTS_FILE); do test_arm_android $_test $port done - # TODO(sangoly): refine this - test_arm_model "test_cxx_api_lite" $port "./third_party/install/mobilenet_v2_relu" } -# Build the code and run lite arm tests. This is executed in the CI system. -function build_test_arm { - ######################################################################## - # job 1-4 must be in one runner - port_armv8=5554 - port_armv7=5556 +function prepare_emulator { + local port_armv8=$1 + local port_armv7=$2 adb kill-server adb devices | grep emulator | cut -f1 | while read line; do adb -s $line emu kill; done @@ -245,6 +240,18 @@ function build_test_arm { echo n | avdmanager create avd -f -n paddle-armv7 -k "system-images;android-24;google_apis;armeabi-v7a" echo -ne '\n' | ${ANDROID_HOME}/emulator/emulator -avd paddle-armv7 -noaudio -no-window -gpu off -verbose -port ${port_armv7} & sleep 1m +} + + +# We split the arm unittest into several sub-tasks to parallel and reduce the overall CI timetime. +# sub-task1 +function build_test_arm_subtask_android { + ######################################################################## + # job 1-4 must be in one runner + port_armv8=5554 + port_armv7=5556 + + prepare_emulator $port_armv8 $port_armv7 # job 1 build_arm "android" "armv8" "gcc" @@ -252,9 +259,9 @@ function build_test_arm { cd - # job 2 - build_arm "android" "armv8" "clang" - test_arm "android" "armv8" "clang" ${port_armv8} - cd - + #build_arm "android" "armv8" "clang" + #test_arm "android" "armv8" "clang" ${port_armv8} + #cd - # job 3 build_arm "android" "armv7" "gcc" @@ -262,13 +269,22 @@ function build_test_arm { cd - # job 4 - build_arm "android" "armv7" "clang" - test_arm "android" "armv7" "clang" ${port_armv7} - cd - + #build_arm "android" "armv7" "clang" + #test_arm "android" "armv7" "clang" ${port_armv7} + #cd - adb devices | grep emulator | cut -f1 | while read line; do adb -s $line emu kill; done echo "Done" +} + +# sub-task2 +function build_test_arm_subtask_armlinux { ######################################################################## + # job 1-4 must be in one runner + port_armv8=5554 + port_armv7=5556 + + prepare_emulator $port_armv8 $port_armv7 # job 5 build_arm "armlinux" "armv8" @@ -285,9 +301,47 @@ function build_test_arm { test_arm "armlinux" "armv7hf" cd - + adb devices | grep emulator | cut -f1 | while read line; do adb -s $line emu kill; done + echo "Done" +} + +# sub-task3 +function build_test_arm_subtask3_mobilenet_v2 { + local port_armv8=5554 + local port_armv7=5556 + # We just test following single one environment to limit the CI time. + local os=android + local abi=armv8 + local lang=gcc + + cur_dir=$(pwd) + build_dir=$cur_dir/build.lite.${os}.${abi}.${lang} + mkdir -p $build_dir + cd $build_dir + cmake_arm $os $abi $lang + make test_cxx_api_lite -j$NUM_CORES_FOR_COMPILE + + prepare_emulator $port_armv8 $port_armv7 + + # just test the model on armv8 + test_arm_model "test_cxx_api_lite" $port_armv8 "./third_party/install/mobilenet_v2_relu" + + adb devices | grep emulator | cut -f1 | while read line; do adb -s $line emu kill; done echo "Done" } +# Build the code and run lite arm tests. This is executed in the CI system. +function build_test_arm { + ######################################################################## + # job 1-4 must be in one runner + port_armv8=5554 + port_armv7=5556 + + build_test_arm_subtask_android + build_test_arm_subtask_armlinux +} + + ############################# MAIN ################################# function print_usage { echo -e "\nUSAGE:" @@ -379,6 +433,18 @@ function main { build_test_arm shift ;; + build_test_arm_subtask_android) + build_test_arm_subtask_android + shift + ;; + build_test_arm_subtask_armlinux) + build_test_arm_subtask_armlinux + shift + ;; + build_test_arm_model1) + build_test_arm_subtask3_mobilenet_v2 + shift + ;; check_style) check_style shift @@ -397,4 +463,3 @@ function main { } main $@ -