From 9654b2d8db6d9907420c062e6659a25b7a9e2500 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Tue, 28 May 2019 16:47:08 +0800 Subject: [PATCH] [Lite] enable fc kernel (#17674) * add fc unit test * refine eigen fc add cpu info, arm context init packed sgemm * enable packed sgemm * add arm math * pass fc ut * follow comments --- cmake/cross_compiling/android.cmake | 2 + paddle/fluid/lite/CMakeLists.txt | 1 + paddle/fluid/lite/arm/CMakeLists.txt | 2 + paddle/fluid/lite/arm/math/CMakeLists.txt | 2 + paddle/fluid/lite/core/CMakeLists.txt | 3 +- paddle/fluid/lite/core/context.cc | 317 ++++++++- paddle/fluid/lite/core/context.h | 41 +- paddle/fluid/lite/core/cpu_info.cc | 629 ++++++++++++++++++ paddle/fluid/lite/core/cpu_info.h | 125 ++++ paddle/fluid/lite/core/kernel.h | 12 +- paddle/fluid/lite/core/lite_tensor.h | 7 + .../core/mir/runtime_context_assign_pass.cc | 17 +- paddle/fluid/lite/kernels/CMakeLists.txt | 2 +- paddle/fluid/lite/kernels/arm/CMakeLists.txt | 4 +- paddle/fluid/lite/kernels/arm/fc_compute.cc | 46 +- paddle/fluid/lite/kernels/arm/fc_compute.h | 47 -- .../fluid/lite/kernels/arm/fc_compute_test.cc | 133 ++-- paddle/fluid/lite/kernels/cuda/mul_compute.h | 4 +- .../lite/kernels/x86/activation_compute.cc | 4 +- .../lite/kernels/x86/elementwise_compute.cc | 2 +- paddle/fluid/lite/tools/build.sh | 2 + 21 files changed, 1260 insertions(+), 142 deletions(-) create mode 100644 paddle/fluid/lite/arm/CMakeLists.txt create mode 100644 paddle/fluid/lite/arm/math/CMakeLists.txt create mode 100644 paddle/fluid/lite/core/cpu_info.cc create mode 100644 paddle/fluid/lite/core/cpu_info.h diff --git a/cmake/cross_compiling/android.cmake b/cmake/cross_compiling/android.cmake index c36057544..a12ecdccc 100644 --- a/cmake/cross_compiling/android.cmake +++ b/cmake/cross_compiling/android.cmake @@ -16,6 +16,8 @@ if(NOT ANDROID) return() endif() +add_definitions(-DLITE_WITH_ANDROID) + if(NOT DEFINED ANDROID_NDK) set(ANDROID_NDK $ENV{NDK_ROOT}) if(NOT ANDROID_NDK) diff --git a/paddle/fluid/lite/CMakeLists.txt b/paddle/fluid/lite/CMakeLists.txt index ba05973a8..93c3d9167 100644 --- a/paddle/fluid/lite/CMakeLists.txt +++ b/paddle/fluid/lite/CMakeLists.txt @@ -118,6 +118,7 @@ endfunction() add_subdirectory(core) add_subdirectory(x86) +add_subdirectory(arm) add_subdirectory(host) add_subdirectory(cuda) add_subdirectory(operators) diff --git a/paddle/fluid/lite/arm/CMakeLists.txt b/paddle/fluid/lite/arm/CMakeLists.txt new file mode 100644 index 000000000..8abd04b52 --- /dev/null +++ b/paddle/fluid/lite/arm/CMakeLists.txt @@ -0,0 +1,2 @@ + +add_subdirectory(math) diff --git a/paddle/fluid/lite/arm/math/CMakeLists.txt b/paddle/fluid/lite/arm/math/CMakeLists.txt new file mode 100644 index 000000000..278cb54a4 --- /dev/null +++ b/paddle/fluid/lite/arm/math/CMakeLists.txt @@ -0,0 +1,2 @@ + +cc_library(math_arm SRCS funcs.cc packed_sgemm.cc DEPS ${lite_kernel_deps} eigen3) diff --git a/paddle/fluid/lite/core/CMakeLists.txt b/paddle/fluid/lite/core/CMakeLists.txt index 25fdf32c1..4e55ba74f 100644 --- a/paddle/fluid/lite/core/CMakeLists.txt +++ b/paddle/fluid/lite/core/CMakeLists.txt @@ -23,7 +23,8 @@ cc_library(kernel_lite SRCS kernel.cc DEPS type_system target_wrapper_lite any_l cc_library(variable_lite SRCS variable.cc) cc_library(op_registry_lite SRCS op_registry.cc DEPS framework_proto_lite) cc_library(scope_lite SRCS scope.cc) -cc_library(context_lite SRCS context.cc DEPS any_lite) +cc_library(cpu_info_lite SRCS cpu_info.cc) +cc_library(context_lite SRCS context.cc DEPS ${tensor_lite} any_lite cpu_info_lite) 
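# cpu_info_lite carries no lite DEPS of its own, so context_lite can layer on
# top of it, and ARM kernels pick both up transitively through
# ${lite_kernel_deps}. A kernel target is then declared along these lines
# (the actual fc target appears later in this patch):
#   cc_library(fc_compute_arm SRCS fc_compute.cc DEPS ${lite_kernel_deps} math_arm)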
cc_library(op_lite SRCS op_lite.cc DEPS scope_lite op_registry_lite compatible_pb_lite target_wrapper_lite ${tensor_lite}) cc_library(types_lite SRCS types.cc) cc_library(type_system SRCS type_system.cc DEPS ${tensor_lite} target_wrapper_lite) diff --git a/paddle/fluid/lite/core/context.cc b/paddle/fluid/lite/core/context.cc index fa01f1d3e..c2dfe2aba 100644 --- a/paddle/fluid/lite/core/context.cc +++ b/paddle/fluid/lite/core/context.cc @@ -12,8 +12,317 @@ // See the License for the specific language governing permissions and // limitations under the License. -// -// Created by chunwei on 19-2-22. -// - #include "paddle/fluid/lite/core/context.h" +#include "paddle/fluid/lite/core/cpu_info.h" + +#ifdef LITE_WITH_ANDROID +#include +#include +#endif +#if __APPLE__ +#include "TargetConditionals.h" +#if TARGET_OS_IPHONE +#include +#include +#include +#endif // TARGET_OS_IPHONE +#endif // __APPLE__ + +namespace paddle { +namespace lite { + +#ifdef LITE_WITH_ARM + +void ARMContext::SetCache(int l1size, int l2size, int l3size) { + DeviceInfo& dev = DeviceInfo::Global(); + int cpu_count = arm_get_cpucount(); + dev.L1_cache_.resize(cpu_count); + dev.L2_cache_.resize(cpu_count); + dev.L3_cache_.resize(cpu_count); + for (int i = 0; i < cpu_count; ++i) { + dev.L1_cache_[i] = l1size; + dev.L2_cache_[i] = l2size; + dev.L3_cache_[i] = l3size; + } + workspace_.Resize({2 * (l1size + l2size)}); +} + +ARMContext::ARMContext() { + active_ids_ = {0}; + mode_ = LITE_POWER_HIGH; + DeviceInfo& dev = DeviceInfo::Global(); + workspace_.Resize( + {static_cast(dev.L2_cache_[active_ids_[0]] / sizeof(float))}); +#ifdef TARGET_IOS + arch_ = APPLE; // use 6x8 +#else + if (dev.big_core_ids_.size() > 0) { + arch_ = dev.archs_[dev.big_core_ids_[0]]; + } +#endif +} + +PowerMode ARMContext::mode() const { return mode_; } + +int ARMContext::threads() const { return active_ids_.size(); } + +ARMContext::ARMContext(const ARMContext& ctx) { + mode_ = ctx.mode_; + active_ids_ = ctx.active_ids_; + workspace_ = ctx.workspace_; + arch_ = ctx.arch_; + count_ = ctx.count_; +} + +ARMContext& ARMContext::operator=(const ARMContext& ctx) { + mode_ = ctx.mode_; + active_ids_ = ctx.active_ids_; + workspace_ = ctx.workspace_; + arch_ = ctx.arch_; + count_ = ctx.count_; + return *this; +} + +void ARMContext::BindDev() { +#ifdef USE_OPENMP + int num_threads = active_ids_.size(); + omp_set_num_threads(num_threads); +#ifdef LITE_WITH_ANDROID + std::vector ssarets; + for (int j = 0; j < num_threads; ++j) { + ssarets.push_back(0); + } +#pragma omp parallel for + for (int i = 0; i < num_threads; i++) { + ssarets[i] = set_sched_affinity(active_ids_); + } + for (int i = 0; i < num_threads; i++) { + if (ssarets[i] != 0) { + LOGE("set cpu affinity failed, cpuID: %d\n", active_ids_[i]); + return; + } + } +#endif // LITE_WITH_ANDROID +#else // USE_OPENMP +#ifdef LITE_WITH_ANDROID + std::vector cpuid1; + cpuid1.push_back(active_ids_[0]); + int ssaret = set_sched_affinity(cpuid1); + if (ssaret != 0) { + printf("set cpu affinity failed, cpuID: %d\n", active_ids_[0]); + return; + } +#endif // LITE_WITH_ANDROID +#endif // USE_OPENMP +} + +void ARMContext::SetRunMode(PowerMode mode, int threads) { + DeviceInfo& dev = DeviceInfo::Global(); + int big_core_size = dev.big_core_ids_.size(); + int small_core_size = dev.little_core_ids_.size(); + if (threads > big_core_size + small_core_size) { + threads = big_core_size + small_core_size; + } +#ifdef USE_OPENMP + count_++; + int shift_num = (count_ / 10) % big_core_size; + switch (mode) { + case LITE_POWER_FULL: + 
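      // Reading aid for the dispatch below: FULL fills active_ids_ with big
      // cores first and then little cores; HIGH/LOW prefer the big/little
      // cluster and fall back to the other one (logging that the requested
      // mode is unsupported) when that cluster is empty; NO_BIND only fixes
      // the OpenMP thread count; the RAND_* variants rotate the starting core
      // by shift_num so repeated calls spread work across cores.
      // Typical caller usage (sketch): ctx.SetRunMode(LITE_POWER_HIGH, 2);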
mode_ = mode; + active_ids_.clear(); + for (int i = 0; i < threads; ++i) { + if (i < big_core_size) { + active_ids_.push_back(dev.big_core_ids_[i]); + } else { + active_ids_.push_back(dev.little_core_ids_[i - big_core_size]); + } + } + if (active_ids_.size() == 0) { + active_ids_.push_back(0); + } + break; + case LITE_POWER_HIGH: + active_ids_.clear(); + if (big_core_size > 0) { + mode_ = LITE_POWER_HIGH; + if (threads > big_core_size) { + LOGE("threads: %d, exceed the big cores size: %d\n", threads, + big_core_size); + active_ids_ = dev.big_core_ids_; + } else { + for (int i = 0; i < threads; ++i) { + active_ids_.push_back(dev.big_core_ids_[i]); + } + } + } else { + mode_ = LITE_POWER_LOW; + LOGE("HIGH POWER MODE is not support, switch to little cores\n"); + if (threads > small_core_size) { + active_ids_ = dev.little_core_ids_; + } else { + for (int i = 0; i < threads; ++i) { + active_ids_.push_back(dev.little_core_ids_[i]); + } + } + } + if (active_ids_.size() == 0) { + active_ids_.push_back(0); + } + break; + case LITE_POWER_LOW: + active_ids_.clear(); + if (small_core_size > 0) { + mode_ = LITE_POWER_LOW; + if (threads > small_core_size) { + LOGW("threads: %d, exceed the little cores size: %d\n", threads, + small_core_size); + active_ids_ = dev.little_core_ids_; + } else { + for (int i = 0; i < threads; ++i) { + active_ids_.push_back(dev.little_core_ids_[i]); + } + } + } else { + mode_ = LITE_POWER_HIGH; + LOGW("LOW POWER MODE is not support, switch to big cores\n"); + if (threads > big_core_size) { + active_ids_ = dev.big_core_ids_; + } else { + for (int i = 0; i < threads; ++i) { + active_ids_.push_back(dev.big_core_ids_[i]); + } + } + } + if (active_ids_.size() == 0) { + active_ids_.push_back(0); + } + break; + case LITE_POWER_NO_BIND: + mode_ = LITE_POWER_NO_BIND; + active_ids_.clear(); + if (threads > dev.core_ids_.size()) { + active_ids_.resize(dev.core_ids_.size()); + } else { + active_ids_.resize(threads); + } + break; + case LITE_POWER_RAND_HIGH: + active_ids_.clear(); + if (big_core_size > 0) { + mode_ = LITE_POWER_RAND_HIGH; + if (threads > big_core_size) { + LOGW("threads: %d, exceed the big cores size: %d\n", threads, + big_core_size); + active_ids_ = dev.big_core_ids_; + } else { + for (int i = 0; i < threads; ++i) { + active_ids_.push_back( + dev.big_core_ids_[(i + shift_num) % big_core_size]); + } + } + } else { + mode_ = LITE_POWER_LOW; + LOGW("HIGH POWER MODE is not support, switch to little cores\n"); + if (threads > small_core_size) { + active_ids_ = dev.little_core_ids_; + } else { + for (int i = 0; i < threads; ++i) { + active_ids_.push_back(dev.little_core_ids_[i]); + } + } + } + if (active_ids_.size() == 0) { + active_ids_.push_back(0); + } + break; + case LITE_POWER_RAND_LOW: + active_ids_.clear(); + if (small_core_size > 0) { + mode_ = LITE_POWER_RAND_LOW; + if (threads > small_core_size) { + LOGW("threads: %d, exceed the little cores size: %d\n", threads, + small_core_size); + active_ids_ = dev.little_core_ids_; + } else { + for (int i = 0; i < threads; ++i) { + active_ids_.push_back( + dev.little_core_ids_[(i + shift_num) % small_core_size]); + } + } + } else { + mode_ = LITE_POWER_HIGH; + LOGW("LOW POWER MODE is not support, switch to big cores\n"); + if (threads > big_core_size) { + active_ids_ = dev.big_core_ids_; + } else { + for (int i = 0; i < threads; ++i) { + active_ids_.push_back(dev.big_core_ids_[i]); + } + } + } + if (active_ids_.size() == 0) { + active_ids_.push_back(0); + } + break; + } + //! 
fix multi-threads LITE_POWER_HIGH mode + if (mode_ == LITE_POWER_NO_BIND || threads > 1) { + int threads = active_ids_.size(); + omp_set_num_threads(threads); + } else { + if (check_online(active_ids_)) { + BindDev(); + } else { + LOG(ERROR) << "core id " << active_ids_[0] + << " is offline, switch to NO BIND MODE"; + int threads = active_ids_.size(); + omp_set_num_threads(threads); + } + } +#else + if (big_core_size > 0) { + active_ids_ = {dev.big_core_ids_[0]}; + } else { + active_ids_ = {0}; + } +#endif + //! alloc memory for sgemm in this context + int temp_mem_size = + DeviceInfo::Global().L2_cache_[active_ids_[0]] / sizeof(float); + workspace_.Resize({temp_mem_size}); + arch_ = DeviceInfo::Global().archs_[active_ids_[0]]; +} + +ARMArch ARMContext::arch() const { return arch_; } + +void ARMContext::SetArch(ARMArch arch) { arch_ = arch; } + +int ARMContext::l1_cache_size() const { + DeviceInfo& dev = DeviceInfo::Global(); + return dev.L1_cache_[active_ids_[0]]; +} + +int ARMContext::l2_cache_size() const { + DeviceInfo& dev = DeviceInfo::Global(); + return dev.L2_cache_[active_ids_[0]]; +} + +int ARMContext::l3_cache_size() const { + DeviceInfo& dev = DeviceInfo::Global(); + return dev.L3_cache_[active_ids_[0]]; +} + +bool ARMContext::ExtendWorkspace(DDimLite dims) { + auto count = dims.product(); + auto old = workspace_.dims(); + if (count == old.product()) { + return false; + } + + workspace_.Resize( + {static_cast(count + l2_cache_size() / sizeof(float))}); + return true; +} +#endif // LITE_WITH_ARM +} // namespace lite +} // namespace paddle diff --git a/paddle/fluid/lite/core/context.h b/paddle/fluid/lite/core/context.h index 01253e0de..e09a03f55 100644 --- a/paddle/fluid/lite/core/context.h +++ b/paddle/fluid/lite/core/context.h @@ -26,6 +26,8 @@ #include #include #include +#include "paddle/fluid/lite/core/cpu_info.h" +#include "paddle/fluid/lite/core/lite_tensor.h" #include "paddle/fluid/lite/core/target_wrapper.h" namespace paddle { @@ -34,7 +36,44 @@ namespace lite { struct HostContext {}; #ifdef LITE_WITH_ARM -struct ARMContext {}; + +struct ARMContext { + public: + ARMContext(); + ARMContext(PowerMode mode, int threads); + ARMContext(const ARMContext& ctx); + + ARMContext& operator=(const ARMContext& ctx); + + void SetRunMode(PowerMode mode, int threads); + void SetCache(int l1size, int l2size, int l3size); + void SetArch(ARMArch arch); + void BindDev(); + + PowerMode mode() const; + int threads() const; + ARMArch arch() const; + + template + T* workspace_data() { + return workspace_.mutable_data(); + } + + int l1_cache_size() const; + int l2_cache_size() const; + int l3_cache_size() const; + bool ExtendWorkspace(DDimLite dims); + + private: + // LITE_POWER_HIGH stands for using big cores, + // LITE_POWER_LOW stands for using small core, + // LITE_POWER_FULL stands for using all cores + ARMArch arch_; + PowerMode mode_; + std::vector active_ids_; + TensorLite workspace_; + int64_t count_{0}; +}; #endif #ifdef LITE_WITH_CUDA diff --git a/paddle/fluid/lite/core/cpu_info.cc b/paddle/fluid/lite/core/cpu_info.cc new file mode 100644 index 000000000..0336c2d7a --- /dev/null +++ b/paddle/fluid/lite/core/cpu_info.cc @@ -0,0 +1,629 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/lite/core/cpu_info.h" +#include + +namespace paddle { +namespace lite { + +#ifdef LITE_WITH_ARM + +void DeviceInfo::get_info(DeviceInfo* dev) { + set_default_cache(dev); + dev->compute_core_num_ = arm_get_cpucount(); + dev->max_memory_ = arm_get_meminfo(); + +// get max freq +#ifdef LITE_WITH_ANDROID + std::vector max_freq(dev->compute_core_num_); + for (int i = 0; i < dev->compute_core_num_; ++i) { + max_freq[i] = get_max_freq_khz(i) / 1000; + } + std::string cpu_name = arm_get_cpu_name(); + if (get_cpu_info_from_name(dev, cpu_name) != true) { + arm_sort_cpuid_by_max_frequency(dev->compute_core_num_, &dev->core_ids_, + max_freq, &dev->cluster_ids_); + dev->big_core_ids_.clear(); + dev->little_core_ids_.clear(); + for (int i = 0; i < dev->cluster_ids_.size(); ++i) { + if (dev->cluster_ids_[i] == 0) { + dev->big_core_ids_.push_back(dev->core_ids_[i]); + } else { + dev->little_core_ids_.push_back(dev->core_ids_[i]); + } + } + arm_get_cpu_arch(&dev->archs_); + } + + LOG(INFO) << "ARM multiprocessors number: " << dev->compute_core_num_; + for (int i = 0; i < dev->compute_core_num_; ++i) { + LOG(INFO) << "ARM multiprocessors ID: " << dev->core_ids_[i] + << ", frequence: " << max_freq[i] + << ", cluster ID: " << dev->cluster_ids_[dev->core_ids_[i]] + << ", CPU ARCH: A" << dev->archs_[i]; + } + LOG(INFO) << "L1 DataCache size is: "; + for (int i = 0; i < dev->compute_core_num_; ++i) { + LOG(INFO) << dev->L1_cache_[i] / 1024 << " KB"; + } + LOG(INFO) << "L2 Cache size is: "; + for (int i = 0; i < dev->compute_core_num_; ++i) { + LOG(INFO) << dev->L2_cache_[i] / 1024 << " KB"; + } + LOG(INFO) << "Total memory: " << dev->max_memory_ << "KB"; + + dev->max_freq_ = max_freq[0]; + for (int j = 1; j < dev->compute_core_num_; ++j) { + if (dev->max_freq_ < max_freq[j]) { + dev->max_freq_ = max_freq[j]; + } + } +#elif defined(TARGET_IOS) + arm_get_cpu_arch(&dev->archs_); +#endif +} + +// cache_id : 0 -> L1, 1 -> L2, 2 -> L3 +void set_cache_info(DeviceInfo* cpu_info, int cache_id, int argc, ...) { + va_list arg_ptr; + va_start(arg_ptr, argc); + std::vector* cache; + switch (cache_id) { + case 0: + cache = &cpu_info->L1_cache_; + break; + case 1: + cache = &cpu_info->L2_cache_; + break; + case 2: + cache = &cpu_info->L3_cache_; + break; + default: + break; + } + int core_num = cpu_info->compute_core_num_; + cache->resize(core_num); + if (argc == 1) { + int cache_size = va_arg(arg_ptr, int); + for (int i = 0; i < core_num; ++i) { + (*cache)[i] = cache_size; + } + } else { + int big_core_num = cpu_info->big_core_ids_.size(); + int little_core_num = cpu_info->little_core_ids_.size(); + int big_core_cache_size = va_arg(arg_ptr, int); + int little_core_cache_size = va_arg(arg_ptr, int); + for (int i = 0; i < big_core_num; ++i) { + (*cache)[cpu_info->big_core_ids_[i]] = big_core_cache_size; + } + for (int i = 0; i < little_core_num; ++i) { + (*cache)[cpu_info->little_core_ids_[i]] = little_core_cache_size; + } + } + va_end(arg_ptr); +} + +void set_arch_info(DeviceInfo* cpu_info, int argc, ...) 
{ + va_list arg_ptr; + va_start(arg_ptr, argc); + int core_num = cpu_info->compute_core_num_; + cpu_info->archs_.resize(core_num); + if (argc == 1) { + ARMArch arch = (ARMArch)va_arg(arg_ptr, int); + for (int i = 0; i < core_num; ++i) { + cpu_info->archs_[i] = arch; + } + } else { + ARMArch big_core_arch = (ARMArch)va_arg(arg_ptr, int); + ARMArch little_core_arch = (ARMArch)va_arg(arg_ptr, int); + int big_core_num = cpu_info->big_core_ids_.size(); + int little_core_num = cpu_info->little_core_ids_.size(); + for (int i = 0; i < big_core_num; ++i) { + cpu_info->archs_[cpu_info->big_core_ids_[i]] = big_core_arch; + } + for (int i = 0; i < little_core_num; ++i) { + cpu_info->archs_[cpu_info->little_core_ids_[i]] = little_core_arch; + } + } + va_end(arg_ptr); +} + +bool get_cpu_info_from_name(DeviceInfo* cpu_info, std::string hardware_name) { + /* Snapdragon */ + if (hardware_name.find("SDM845") != std::string::npos) { // 845 + cpu_info->compute_core_num_ = 8; + cpu_info->core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7}; + cpu_info->big_core_ids_ = {4, 5, 6, 7}; + cpu_info->little_core_ids_ = {0, 1, 2, 3}; + cpu_info->cluster_ids_ = {1, 1, 1, 1, 0, 0, 0, 0}; + set_arch_info(cpu_info, 2, kA75, kA55); + set_cache_info(cpu_info, 0, 1, 32 * 1024); + set_cache_info(cpu_info, 1, 2, 256 * 1024, 128 * 1024); + set_cache_info(cpu_info, 2, 1, 2048 * 1024); + return true; + + } else if (hardware_name.find("SDM710") != std::string::npos) { // 710 + cpu_info->compute_core_num_ = 8; + cpu_info->core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7}; + cpu_info->big_core_ids_ = {6, 7}; + cpu_info->little_core_ids_ = {0, 1, 2, 3, 4, 5}; + cpu_info->cluster_ids_ = {1, 1, 1, 1, 1, 1, 0, 0}; + set_arch_info(cpu_info, 2, kA75, kA55); + return true; + } else if (hardware_name.find("MSM8998") != std::string::npos) { // 835 + cpu_info->compute_core_num_ = 8; + cpu_info->core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7}; + cpu_info->big_core_ids_ = {4, 5, 6, 7}; + cpu_info->little_core_ids_ = {0, 1, 2, 3}; + cpu_info->cluster_ids_ = {1, 1, 1, 1, 0, 0, 0, 0}; + set_arch_info(cpu_info, 2, kA73, kA53); + set_cache_info(cpu_info, 0, 2, 64 * 1024); + set_cache_info(cpu_info, 1, 2, 1024 * 1024, + /*real cache size is 2M, while that will get bad performace + on conv3x3s1 or gemm, set to 1M or 512K*/ + 1024 * 1024); + return true; + + } else if (hardware_name.find("MSM8996") != std::string::npos) { // 820 + cpu_info->compute_core_num_ = 4; + cpu_info->core_ids_ = {0, 1, 2, 3}; + cpu_info->big_core_ids_ = {2, 3}; + cpu_info->little_core_ids_ = {0, 1}; + cpu_info->cluster_ids_ = {1, 1, 0, 0}; + set_arch_info(cpu_info, 1, kA72); + set_cache_info(cpu_info, 0, 1, 24 * 1024); + set_cache_info(cpu_info, 1, 2, 1024 * 1024, 512 * 1024); + return true; + + } else if (hardware_name.find("SDM660") != std::string::npos || + hardware_name.find("SDM636") != std::string::npos) { // 660, 636 + cpu_info->compute_core_num_ = 8; + cpu_info->core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7}; + cpu_info->big_core_ids_ = {4, 5, 6, 7}; + cpu_info->little_core_ids_ = {0, 1, 2, 3}; + cpu_info->cluster_ids_ = {1, 1, 1, 1, 0, 0, 0, 0}; + set_arch_info(cpu_info, 1, kA73); + set_cache_info(cpu_info, 0, 2, 64 * 1024, 32 * 1024); + set_cache_info(cpu_info, 1, 1, 1024 * 1024); + return true; + + } else if (hardware_name.find("MSM8976") != std::string::npos) { // 652,653 + cpu_info->compute_core_num_ = 8; + cpu_info->core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7}; + cpu_info->big_core_ids_ = {4, 5, 6, 7}; + cpu_info->little_core_ids_ = {0, 1, 2, 3}; + cpu_info->cluster_ids_ = {1, 1, 1, 1, 0, 0, 0, 0}; + 
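    // Reminder on the vararg helpers used throughout this table:
    // set_arch_info(dev, 1, arch) applies one arch to every core, while
    // set_arch_info(dev, 2, big_arch, little_arch) splits by cluster.
    // set_cache_info(dev, cache_id, argc, ...) works the same way, with
    // cache_id 0/1/2 selecting the L1/L2/L3 size vectors.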
set_arch_info(cpu_info, 2, kA72, kA53); + set_cache_info(cpu_info, 0, 1, 32 * 1024); + set_cache_info(cpu_info, 1, 2, 1024 * 1024, 512 * 1024); + return true; + + } else if (hardware_name.find("MSM8953") != std::string::npos) { // 625 + cpu_info->compute_core_num_ = 8; + cpu_info->core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7}; + cpu_info->big_core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7}; + cpu_info->little_core_ids_ = {}; + cpu_info->cluster_ids_ = {0, 0, 0, 0, 0, 0, 0, 0}; + set_arch_info(cpu_info, 1, kA53); + set_cache_info(cpu_info, 0, 1, 32 * 1024); + set_cache_info(cpu_info, 1, 1, 1024 * 1024); + return true; + + } else if (hardware_name.find("MSM8939") != std::string::npos) { // 615 + cpu_info->compute_core_num_ = 8; + cpu_info->core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7}; + cpu_info->big_core_ids_ = {0, 1, 2, 3}; + cpu_info->little_core_ids_ = {4, 5, 6, 7}; + cpu_info->cluster_ids_ = {0, 0, 0, 0, 1, 1, 1, 1}; + set_arch_info(cpu_info, 1, kA53); + set_cache_info(cpu_info, 0, 1, 32 * 1024); + set_cache_info(cpu_info, 1, 2, 512 * 1024, 256 * 1024); + return true; + + /* MediaTek */ + + } else if (hardware_name.find("MT6797") != + std::string::npos) { // X20/X23/X25/X27 + cpu_info->compute_core_num_ = 10; + cpu_info->core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10}; + cpu_info->big_core_ids_ = {8, 9}; + cpu_info->little_core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7}; + cpu_info->cluster_ids_ = {1, 1, 1, 1, 1, 1, 1, 1, 0, 0}; + set_arch_info(cpu_info, 2, kA72, kA53); + set_cache_info(cpu_info, 0, 1, 32 * 1024); + set_cache_info(cpu_info, 1, 2, 1024 * 1024, 512 * 1024); + return true; + + } else if (hardware_name.find("MT6799") != std::string::npos) { // X30 + cpu_info->compute_core_num_ = 10; + cpu_info->core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10}; + cpu_info->big_core_ids_ = {8, 9}; + cpu_info->little_core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7}; + cpu_info->cluster_ids_ = {1, 1, 1, 1, 1, 1, 1, 1, 0, 0}; + set_arch_info(cpu_info, 2, kA73, kA53); + return true; + + } else if (hardware_name.find("MT6795") != std::string::npos || + hardware_name.find("MT6762") != std::string::npos || + hardware_name.find("MT6755T") != std::string::npos || + hardware_name.find("MT6755S") != std::string::npos || + hardware_name.find("MT6753") != std::string::npos || + hardware_name.find("MT6752") != std::string::npos || + hardware_name.find("MT6750") != std::string::npos) { + // X10, P22, P15/P18, MT6753, MT6752/MT6752M, MT6750 + cpu_info->compute_core_num_ = 8; + cpu_info->core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7}; + cpu_info->big_core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7}; + cpu_info->little_core_ids_ = {}; + cpu_info->cluster_ids_ = {0, 0, 0, 0, 0, 0, 0, 0}; + set_arch_info(cpu_info, 1, kA53); + return true; + + } else if (hardware_name.find("MT6758") != std::string::npos || + hardware_name.find("MT6757") != std::string::npos || + hardware_name.find("MT6763") != std::string::npos || + hardware_name.find("MT6755M") != std::string::npos || + hardware_name.find("MT6755") != + std::string::npos) { // P30, P20/P25, P23, P10 + cpu_info->compute_core_num_ = 8; + cpu_info->core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7}; + cpu_info->big_core_ids_ = {4, 5, 6, 7}; + cpu_info->little_core_ids_ = {0, 1, 2, 3}; + cpu_info->cluster_ids_ = {1, 1, 1, 1, 0, 0, 0, 0}; + set_arch_info(cpu_info, 1, kA53); + return true; + + } else if (hardware_name.find("MT6771") != std::string::npos) { // P60 + cpu_info->compute_core_num_ = 8; + cpu_info->core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7}; + cpu_info->big_core_ids_ = {4, 5, 6, 7}; + cpu_info->little_core_ids_ = {0, 1, 2, 3}; + cpu_info->cluster_ids_ 
= {1, 1, 1, 1, 0, 0, 0, 0}; + set_arch_info(cpu_info, 2, kA73, kA53); + return true; + + } else if (hardware_name.find("MT6765") != std::string::npos || + hardware_name.find("MT6739") != std::string::npos || + hardware_name.find("MT6738") != std::string::npos || + hardware_name.find("MT6737") != + std::string::npos) { // A22, MT6739, MT6738, MT6767 + cpu_info->compute_core_num_ = 4; + cpu_info->core_ids_ = {0, 1, 2, 3}; + cpu_info->big_core_ids_ = {0, 0, 0, 0}; + cpu_info->little_core_ids_ = {}; + cpu_info->cluster_ids_ = {0, 0, 0, 0}; + set_arch_info(cpu_info, 1, kA53); + return true; + } + return false; +} + +size_t arm_get_meminfo() { +#ifdef LITE_WITH_ANDROID + // get cpu count from /proc/cpuinfo + FILE* fp = fopen("/proc/meminfo", "rb"); + if (!fp) { + return 1; + } + + size_t memsize = 0; + char line[1024]; + while (!feof(fp)) { + char* s = fgets(line, 1024, fp); + if (!s) { + break; + } + sscanf(s, "MemTotal: %d kB", &memsize); + } + + fclose(fp); + + return memsize; +#elif defined(TARGET_IOS) + // to be implemented + printf("not implemented\n"); + return 0; +#endif +} + +int arm_get_cpucount() { +#ifdef LITE_WITH_ANDROID + // get cpu count from /sys/devices/system/cpu/cpunum/uevent + int max_cpu_count = 20; + int count = 0; + for (int i = 0; i < max_cpu_count; ++i) { + char path[256]; + snprintf(path, sizeof(path), "/sys/devices/system/cpu/cpu%d/uevent", i); + FILE* fp = fopen(path, "rb"); + if (!fp) { + break; + } + count++; + fclose(fp); + } + if (count < 1) { + count = 1; + } + return count; +#elif defined(TARGET_IOS) + int count = 0; + size_t len = sizeof(count); + sysctlbyname("hw.ncpu", &count, &len, NULL, 0); + if (count < 1) { + count = 1; + } + return count; +#else + return 1; +#endif +} + +void arm_get_cpu_arch(std::vector* archs) { +#ifdef LITE_WITH_ANDROID + archs->clear(); + //! 
get CPU ARCH + FILE* fp = fopen("/proc/cpuinfo", "rb"); + if (!fp) { + return; + } + char line[1024]; + while (!feof(fp)) { + char* s = fgets(line, 1024, fp); + if (!s) { + break; + } + if (strstr(line, "part") != NULL) { + int arch_id = 0; + sscanf(s, "CPU part\t: %x", &arch_id); + switch (arch_id) { + case 0xd03: + archs->push_back(kA53); + break; + case 0xd05: + archs->push_back(kA55); + break; + case 0xd07: + archs->push_back(kA57); + break; + case 0xd08: + archs->push_back(kA72); + break; + case 0xd09: + archs->push_back(kA73); + break; + case 0xd0a: + archs->push_back(kA75); + break; + case 0x800: + // 835 + archs->push_back(kA73); + break; + case 0x205: + // 820 + archs->push_back(kA72); + break; + default: + LOG(ERROR) << "unknow type"; + archs->push_back(kARMArch_UNKOWN); + } + } + } + fclose(fp); + int cpu_count = arm_get_cpucount(); + if (archs->size() < cpu_count) { + for (int i = archs->size(); i < cpu_count; ++i) { + archs->push_back(archs->at(i - 1)); + } + } +#endif +#ifdef TARGET_IOS + int cpu_count = arm_get_cpucount(); + for (int i = 0; i < cpu_count; ++i) { + archs->push_back(APPLE); + } +#endif +} + +#ifdef LITE_WITH_ANDROID + +void set_default_cache(DeviceInfo* dev) { + int cpu_count = arm_get_cpucount(); + dev->L1_cache_.resize(cpu_count); + dev->L2_cache_.resize(cpu_count); + dev->L3_cache_.resize(cpu_count); +#ifdef TARGET_IOS + for (int i = 0; i < cpu_count; ++i) { + dev->L1_cache_[i] = 64 * 1024; + dev->L2_cache_[i] = 2048 * 1024; + dev->L3_cache_[i] = 0; + } +#else + for (int i = 0; i < cpu_count; ++i) { + dev->L1_cache_[i] = 32 * 1024; + dev->L2_cache_[i] = 512 * 1024; + dev->L3_cache_[i] = 0; + } +#endif +} +std::string arm_get_cpu_name() { + FILE* fp = fopen("/proc/cpuinfo", "rb"); + if (!fp) { + return ""; + } + char line[1024]; + while (!feof(fp)) { + char* s = fgets(line, 1024, fp); + if (!s) { + break; + } + if (strstr(line, "Hardware") != NULL) { + fclose(fp); + return std::string(line); + } + } + fclose(fp); + return ""; +} + +int get_max_freq_khz(int cpuid) { + // first try, for all possible cpu + char path[256]; + snprintf(path, sizeof(path), + "/sys/devices/system/cpu/cpufreq/stats/cpu%d/time_in_state", cpuid); + + FILE* fp = fopen(path, "rb"); + + if (!fp) { + // second try, for online cpu + snprintf(path, sizeof(path), + "/sys/devices/system/cpu/cpu%d/cpufreq/stats/time_in_state", + cpuid); + fp = fopen(path, "rb"); + + if (!fp) { + // third try, for online cpu + snprintf(path, sizeof(path), + "/sys/devices/system/cpu/cpu%d/cpufreq/cpuinfo_max_freq", cpuid); + fp = fopen(path, "rb"); + + if (!fp) { + return -1; + } + + int max_freq_khz = -1; + fscanf(fp, "%d", &max_freq_khz); + + fclose(fp); + + return max_freq_khz; + } + } + + int max_freq_khz = 0; + while (!feof(fp)) { + int freq_khz = 0; + int nscan = fscanf(fp, "%d %*d", &freq_khz); + if (nscan != 1) { + break; + } + + if (freq_khz > max_freq_khz) { + max_freq_khz = freq_khz; + } + } + + fclose(fp); + + return max_freq_khz; +} + +int arm_sort_cpuid_by_max_frequency(int cpu_count, std::vector* cpuids, + const std::vector& cpu_freq, + std::vector* cluster_ids) { + if (cpu_count == 0) { + return 0; + } + + cpuids->resize(cpu_count); + cluster_ids->resize(cpu_count); + + for (int i = 0; i < cpu_count; i++) { + cpuids->at(i) = i; + } + + // sort cpuid as big core first + // simple bubble sort + + for (int i = 0; i < cpu_count; i++) { + for (int j = i + 1; j < cpu_count; j++) { + if (cpu_freq[i] < cpu_freq[j]) { + // swap + int tmp = cpuids->at(i); + cpuids->at(i) = cpuids->at(j); + cpuids->at(j) = 
tmp; + } + } + } + // SMP + int mid_max_freq_khz = + (cpu_freq[cpuids->at(0)] + cpu_freq[cpuids->at(cpu_count - 1)]) / 2; + + for (int i = 0; i < cpu_count; i++) { + cpuids->at(i) = i; + if (cpu_freq[i] >= mid_max_freq_khz) { + cluster_ids->at(i) = 0; + } else { + cluster_ids->at(i) = 1; + } + } + return 0; +} + +int check_online(const std::vector& core_ids) { + if (core_ids.size() == 0) { + return 0; + } + char path[256]; + int online = 1; + for (int i = 0; i < core_ids.size(); ++i) { + snprintf(path, sizeof(path), "/sys/devices/system/cpu/cpu%d/online", + core_ids[i]); + FILE* fp = fopen(path, "rb"); + if (!fp) { + return 0; + } + int cur_online = 0; + fscanf(fp, "%d", &cur_online); + online &= cur_online; + fclose(fp); + } + return online; +} + +int set_sched_affinity(const std::vector& cpuids) { +// #define CPU_SETSIZE 1024 +// #define __NCPUBITS (8 * sizeof (unsigned long)) +// typedef struct +// { +// unsigned long __bits[CPU_SETSIZE / __NCPUBITS]; +// } cpu_set_t; + +// set affinity for thread +#ifdef __GLIBC__ + pid_t pid = syscall(SYS_gettid); +#else + pid_t pid = gettid(); +#endif + cpu_set_t mask; + CPU_ZERO(&mask); + for (int i = 0; i < cpuids.size(); i++) { + CPU_SET(cpuids[i], &mask); + } + + int syscallret = syscall(__NR_sched_setaffinity, pid, sizeof(mask), &mask); + if (syscallret) { + LOG(ERROR) << "syscall error " << syscallret; + return -1; + } + + return 0; +} + +#endif // LITE_WITH_ANDROID + +#endif // LITE_WITH_ARM + +} // namespace lite +} // namespace paddle diff --git a/paddle/fluid/lite/core/cpu_info.h b/paddle/fluid/lite/core/cpu_info.h new file mode 100644 index 000000000..23a996f80 --- /dev/null +++ b/paddle/fluid/lite/core/cpu_info.h @@ -0,0 +1,125 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
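// Minimal usage sketch: DeviceInfo::init_info() fills the core/cache tables
// once; afterwards the global singleton can be queried, e.g.
//   DeviceInfo::init_info();
//   auto& dev = DeviceInfo::Global();
//   LOG(INFO) << "big cores: " << dev.big_core_ids_.size()
//             << ", L2 of core 0: " << dev.L2_cache_[0] / 1024 << " KB";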
+ +#pragma once + +#include +#include +#include "paddle/fluid/lite/utils/cp_logging.h" + +#ifdef LITE_WITH_ANDROID +#include +#include +#endif + +#if __APPLE__ +#include "TargetConditionals.h" +#if TARGET_OS_IPHONE +#include +#include +#include +#endif // TARGET_OS_IPHONE +#endif // __APPLE__ + +namespace paddle { +namespace lite { + +#ifdef LITE_WITH_ARM + +typedef enum { + LITE_POWER_HIGH = 0, + LITE_POWER_LOW = 1, + LITE_POWER_FULL = 2, + LITE_POWER_NO_BIND = 3, + LITE_POWER_RAND_HIGH = 4, + LITE_POWER_RAND_LOW = 5 +} PowerMode; + +typedef enum { + kAPPLE = 0, + kA53 = 53, + kA55 = 55, + kA57 = 57, + kA72 = 72, + kA73 = 73, + kA75 = 75, + kA76 = 76, + kARMArch_UNKOWN = -1 +} ARMArch; + +class DeviceInfo { + public: + int idx_; + int max_freq_; + int min_freq_; + int generate_arch_; + int compute_core_num_; + int max_memory_; + int sharemem_size_; + + std::string device_name_; + std::string compute_ability_; + + std::vector L1_cache_; + std::vector L2_cache_; + std::vector L3_cache_; + std::vector core_ids_; + std::vector big_core_ids_; + std::vector little_core_ids_; + std::vector cluster_ids_; + std::vector archs_; + + static DeviceInfo& Global() { + static auto* x = new DeviceInfo; + return *x; + } + + static void init_info() { + auto& info = Global(); + get_info(&info); + } + + private: + DeviceInfo() = default; + static void get_info(DeviceInfo* dev); +}; + +size_t arm_get_meminfo(); + +int arm_get_cpucount(); + +void arm_get_cpu_arch(std::vector* archs); + +bool get_cpu_info_from_name(DeviceInfo* cpu_info, std::string hardware_name); + +#ifdef LITE_WITH_ANDROID + +void set_default_cache(DeviceInfo* dev); + +std::string arm_get_cpu_name(); + +int get_max_freq_khz(int cpuid); + +int arm_sort_cpuid_by_max_frequency(int cpu_count, std::vector* cpuids, + const std::vector& cpu_freq, + std::vector* cluster_ids); +int check_online(const std::vector& core_ids); +int set_sched_affinity(const std::vector& cpuids); + +#endif // LITE_WITH_ANDROID + +#endif // LITE_WITH_ARM + +} // namespace lite +} // namespace paddle diff --git a/paddle/fluid/lite/core/kernel.h b/paddle/fluid/lite/core/kernel.h index 6846dbb92..2eee83bd4 100644 --- a/paddle/fluid/lite/core/kernel.h +++ b/paddle/fluid/lite/core/kernel.h @@ -44,7 +44,7 @@ class KernelBase { virtual void Run() = 0; void SetContext(std::unique_ptr&& ctx) { - context_ = std::move(ctx); + ctx_ = std::move(ctx); } template void SetParam(T param) { @@ -86,7 +86,7 @@ class KernelBase { virtual TargetType target() const = 0; virtual PrecisionType precision() const = 0; virtual DataLayoutType layout() const = 0; - const KernelContext* context() const { return context_.get(); } + const KernelContext* context() const { return ctx_.get(); } virtual std::string name() const = 0; // Short human-readable document. @@ -134,7 +134,7 @@ class KernelBase { void Torch() {} protected: - std::unique_ptr context_; + std::unique_ptr ctx_; mutable operators::param_t param_; // The corresponding op type. std::string op_type_{}; @@ -152,9 +152,6 @@ template class KernelLite : public KernelBase { public: - // Set runtime context. - void SetContext(std::unique_ptr&& ctx) { ctx_ = ctx; } - // Run the kernel. 
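  // (Kernels obtain their runtime context through the base-class ctx_ member
  // that SetContext() fills; a typical ARM kernel body starts with something
  // like `auto& ctx = this->ctx_->template As<ARMContext>();` and can use
  // `ctx.workspace_data<float>()` for scratch memory, as FcCompute does below.)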
virtual void Run() { CHECK(false) << "Not Implemented"; } @@ -168,9 +165,6 @@ class KernelLite : public KernelBase { KernelLite() = default; virtual ~KernelLite() = default; - - protected: - std::unique_ptr ctx_; }; template diff --git a/paddle/fluid/lite/core/lite_tensor.h b/paddle/fluid/lite/core/lite_tensor.h index 3fe29cc33..433bc6911 100644 --- a/paddle/fluid/lite/core/lite_tensor.h +++ b/paddle/fluid/lite/core/lite_tensor.h @@ -14,6 +14,7 @@ #pragma once #include +#include // for multiplies #include #include #include @@ -40,6 +41,10 @@ class DDimLite : public DDimBase { size_t size() const { return data_.size(); } bool empty() const { return data_.empty(); } + value_type product() const { + return std::accumulate(std::begin(data_), std::end(data_), 1, + std::multiplies()); + } const std::vector &data() const { return data_; } private: @@ -61,8 +66,10 @@ class TensorLite : public TensorBase { } void Resize(const DDimLite &ddim) { dims_ = ddim; } + void Resize(const std::vector &x) { dims_ = DDimLite(x); } const DDimLite &dims() const { return dims_; } + int64_t numel() const { return dims_.product(); } const LoD &lod() const { return lod_; } LoD *mutable_lod() { return &lod_; } diff --git a/paddle/fluid/lite/core/mir/runtime_context_assign_pass.cc b/paddle/fluid/lite/core/mir/runtime_context_assign_pass.cc index 3d2012306..1852fc2fc 100644 --- a/paddle/fluid/lite/core/mir/runtime_context_assign_pass.cc +++ b/paddle/fluid/lite/core/mir/runtime_context_assign_pass.cc @@ -32,7 +32,6 @@ class RuntimeContextAssignPass : public StmtPass { if (!node.IsStmt()) continue; auto& inst = node.AsStmt(); - switch (inst.picked_kernel().target()) { case TARGET(kHost): case TARGET(kX86): @@ -42,6 +41,11 @@ class RuntimeContextAssignPass : public StmtPass { case TARGET(kCUDA): inst.picked_kernel().SetContext(NewCudaContext()); break; +#endif +#ifdef LITE_WITH_ARM + case TARGET(kARM): + inst.picked_kernel().SetContext(NewARMContext()); + break; #endif default: LOG(FATAL) << "unsupported target " @@ -54,9 +58,18 @@ class RuntimeContextAssignPass : public StmtPass { std::unique_ptr ctx(new KernelContext); ctx->As(); // Some initialization here. 
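    // Contexts follow the same pattern for every target: default-construct a
    // KernelContext and select the variant with ctx->As<T>(). The ARM path
    // added below additionally calls DeviceInfo::init_info() so the core and
    // cache tables are populated before any ARM kernel runs.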
+ return ctx; } +#ifdef LITE_WITH_ARM + std::unique_ptr NewARMContext() { + DeviceInfo::init_info(); + std::unique_ptr ctx(new KernelContext); + ctx->As(); + return ctx; + } +#endif #ifdef LITE_WITH_CUDA std::unique_ptr NewCudaContext() { std::unique_ptr ctx(new KernelContext); @@ -66,9 +79,7 @@ class RuntimeContextAssignPass : public StmtPass { cuda.blas_fp32 = cublas_fp32_; return ctx; } -#endif -#ifdef LITE_WITH_CUDA void InitCudaBlas() { cublas_fp32_ = std::make_shared>(); } diff --git a/paddle/fluid/lite/kernels/CMakeLists.txt b/paddle/fluid/lite/kernels/CMakeLists.txt index 0708e7d9a..ce22ba121 100644 --- a/paddle/fluid/lite/kernels/CMakeLists.txt +++ b/paddle/fluid/lite/kernels/CMakeLists.txt @@ -1,5 +1,5 @@ message(STATUS "add lite kernels") -set(lite_kernel_deps type_system kernel_lite op_lite op_registry_lite ${tensor_lite}) +set(lite_kernel_deps type_system kernel_lite op_lite op_registry_lite context_lite ${tensor_lite}) add_subdirectory(host) add_subdirectory(arm) add_subdirectory(cuda) diff --git a/paddle/fluid/lite/kernels/arm/CMakeLists.txt b/paddle/fluid/lite/kernels/arm/CMakeLists.txt index b5fc0bdea..75dc9fe43 100644 --- a/paddle/fluid/lite/kernels/arm/CMakeLists.txt +++ b/paddle/fluid/lite/kernels/arm/CMakeLists.txt @@ -4,11 +4,13 @@ endif() message(STATUS "compile with lite ARM kernels") -cc_library(fc_compute_arm SRCS fc_compute.cc DEPS ${lite_kernel_deps} eigen3) +cc_library(fc_compute_arm SRCS fc_compute.cc DEPS ${lite_kernel_deps} math_arm) cc_library(relu_compute_arm SRCS relu_compute.cc DEPS ${lite_kernel_deps}) cc_library(mul_compute_arm SRCS mul_compute.cc DEPS ${lite_kernel_deps} eigen3) cc_library(scale_compute_arm SRCS scale_compute.cc DEPS ${lite_kernel_deps} eigen3) +lite_cc_test(test_fc_compute_arm SRCS fc_compute_test.cc DEPS fc_compute_arm eigen3) + set(arm_kernels fc_compute_arm relu_compute_arm diff --git a/paddle/fluid/lite/kernels/arm/fc_compute.cc b/paddle/fluid/lite/kernels/arm/fc_compute.cc index 6b7060227..b26551e05 100644 --- a/paddle/fluid/lite/kernels/arm/fc_compute.cc +++ b/paddle/fluid/lite/kernels/arm/fc_compute.cc @@ -13,7 +13,7 @@ // limitations under the License. #include "paddle/fluid/lite/kernels/arm/fc_compute.h" -#include +#include "paddle/fluid/lite/arm/math/funcs.h" #include "paddle/fluid/lite/core/op_registry.h" #include "paddle/fluid/lite/core/type_system.h" @@ -22,24 +22,42 @@ namespace lite { namespace kernels { namespace arm { -// NOTE should use pure std C++ implementation. void FcCompute::Run() { auto& param = this->Param(); + auto x_dims = param.input->dims(); + auto w_dims = param.w->dims(); - CHECK_GE(param.input->dims().size(), 2UL); + CHECK_GE(x_dims.size(), 2UL); + CHECK_EQ(w_dims.size(), 2UL); CHECK_EQ(param.output->dims().size(), 2UL); - fc_compute_eigen( - param.input->data(), // x - param.input->dims().Slice(0, param.in_num_col_dims).production(), - param.input->dims() - .Slice(param.in_num_col_dims, param.input->dims().size()) - .production(), - param.w->data(), // w - param.w->dims()[1], // w_w - param.w->dims()[0], // w_h - param.bias->data(), // b - param.output->mutable_data()); + const auto* i_data = param.input->data(); + const auto* w_data = param.w->data(); + const auto* b_data = param.bias ? 
param.bias->data() : nullptr; + auto* o_data = param.output->mutable_data(); + + int x_h = x_dims.Slice(0, param.in_num_col_dims).production(); + int x_w = x_dims.Slice(param.in_num_col_dims, x_dims.size()).production(); + int n = w_dims[1]; + CHECK_EQ(x_w, static_cast(w_dims[0])); + auto& ctx = this->ctx_->template As(); + if (x_h > 1) { + float* packed_in = static_cast(ctx.workspace_data()) + + ctx.l2_cache_size() / sizeof(float); + lite::arm::math::prepackA(packed_in, i_data, x_w, 0, x_h, 0, x_w, false, + &ctx); + lite::arm::math::sgemm_prepack(packed_in, w_data, b_data, o_data, x_h, n, + x_w, false, false, false, &ctx); + + if (param.bias) { + CHECK_EQ(param.bias->numel(), n); + lite::arm::math::fill_bias_fc(o_data, b_data, x_h, n); + } + } else { + // use sgemmv + // sgemv((const float*)weights, (const float*)din, (float*)dout, + // false, n, x_w, _param->_flag_bias, (float*)bias, false); + } } TargetType FcCompute::target() const { return TARGET(kARM); } diff --git a/paddle/fluid/lite/kernels/arm/fc_compute.h b/paddle/fluid/lite/kernels/arm/fc_compute.h index 36f3e0723..414517843 100644 --- a/paddle/fluid/lite/kernels/arm/fc_compute.h +++ b/paddle/fluid/lite/kernels/arm/fc_compute.h @@ -13,7 +13,6 @@ // limitations under the License. #pragma once -#include #include "paddle/fluid/lite/core/kernel.h" #include "paddle/fluid/lite/operators/fc_op.h" @@ -34,52 +33,6 @@ class FcCompute : public KernelLite { virtual ~FcCompute() = default; }; -template -void fc_compute_eigen(const T* x, int x_w, int x_h, // - const T* w, int w_w, int w_h, // - const T* b, // - T* out) { - using matrix_t = - Eigen::Matrix; - - Eigen::Map X(x, x_h, x_w); - Eigen::Map W(w, w_h, w_w); - Eigen::Map Out(out, x_h, w_h); - - Out = X * W.transpose(); - - if (b) { - Eigen::Map> B(b, w_h); - Out = Out.array().rowwise() + B.transpose().array(); - } -} - -template -__attribute__((optimize("unroll-loops"))) // -T dot(const T* x, const T* y, int dim) { - T out{}; - for (int i = 0; i < dim; i++) { - out += x[i] * y[i]; - } - return out; -} - -template -void fc_compute_naive(const T* x, int x_w, int x_h, // - const T* w, int w_w, int w_h, // - const T* b, // - T* out) { - CHECK_EQ(x_w, w_w); - // out shape: (x_h, w_w) - memset(out, 0, x_h * w_h * sizeof(T)); - - for (int r = 0; r < x_h; r++) { - for (int c = 0; c < w_h; c++) { - out[r * w_h + c] = dot(&x[r * x_w], &w[c * w_w], w_w) + b[c]; - } - } -} - } // namespace arm } // namespace kernels } // namespace lite diff --git a/paddle/fluid/lite/kernels/arm/fc_compute_test.cc b/paddle/fluid/lite/kernels/arm/fc_compute_test.cc index 5f5de8a89..1949a3a1e 100644 --- a/paddle/fluid/lite/kernels/arm/fc_compute_test.cc +++ b/paddle/fluid/lite/kernels/arm/fc_compute_test.cc @@ -15,6 +15,7 @@ #include "paddle/fluid/lite/kernels/arm/fc_compute.h" #include #include +#include "paddle/fluid/lite/arm/math/funcs.h" #include "paddle/fluid/lite/core/op_registry.h" namespace paddle { @@ -22,60 +23,79 @@ namespace lite { namespace kernels { namespace arm { -TEST(fc_compute_naive, test) { - lite::Tensor x, w, b, out, out1; - const int batch_size = 2; +TEST(fc_arm, retrive_op) { + auto fc = + KernelRegistry::Global().Create("fc"); + ASSERT_FALSE(fc.empty()); + ASSERT_TRUE(fc.front()); +} + +TEST(fc_arm, init) { + FcCompute fc; + ASSERT_EQ(fc.precision(), PRECISION(kFloat)); + ASSERT_EQ(fc.target(), TARGET(kARM)); +} + +TEST(fc_arm, compare_test) { + lite::Tensor x, w, b, out, ref; + constexpr int batch_size = 2; x.Resize({batch_size, 3}); - w.Resize({4, 3}); + w.Resize({3, 4}); b.Resize({1, 4}); 
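  // Shapes under test: x is [2, 3], w is [3, 4] (stored K x N, not
  // transposed), bias is [1, 4] but left unused here (see the TODO below);
  // the kernel should therefore produce out = x * w of shape [2, 4], matching
  // the fc_compute_eigen reference filled into `ref`.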
out.Resize({batch_size, 4}); - out1.Resize({batch_size, 4}); + ref.Resize({batch_size, 4}); auto x_data = x.mutable_data(); auto w_data = w.mutable_data(); auto b_data = b.mutable_data(); auto out_data = out.mutable_data(); - auto out_data1 = out1.mutable_data(); + auto ref_data = ref.mutable_data(); - for (int i = 0; i < product(x.dims()); i++) x_data[i] = i; - for (int i = 0; i < product(w.dims()); i++) w_data[i] = i; - for (int i = 0; i < product(b.dims()); i++) b_data[i] = i; - - fc_compute_naive(x_data, 3, batch_size, // - w_data, 3, 4, // - b_data, out_data); - fc_compute_eigen(x_data, 3, batch_size, // - w_data, 3, 4, // - b_data, out_data1); - - for (int i = 0; i < product(out.dims()); i++) { - EXPECT_NEAR(out_data[0], out_data1[0], 1e-6); + for (int64_t i = 0; i < x.dims().product(); i++) { + x_data[i] = static_cast(i); + } + for (int64_t i = 0; i < w.dims().product(); i++) { + w_data[i] = static_cast(i); + } + for (int64_t i = 0; i < b.dims().product(); i++) { + b_data[i] = static_cast(i); } -} -TEST(fc_arm, init) { + // TODO(TJ): enable bias soon + b_data = nullptr; + lite::arm::math::fc_compute_eigen(x_data, batch_size, 3, // + w_data, 3, 4, // + b_data, ref_data); + + // fc compute kernel FcCompute fc; - ASSERT_EQ(fc.precision(), PRECISION(kFloat)); - ASSERT_EQ(fc.target(), TARGET(kARM)); -} + operators::FcParam param; -TEST(fc_arm, algorithm) { - using matrix_t = Eigen::Matrix; - using matrix_map_t = Eigen::Map; + param.in_num_col_dims = 1; + param.input = &x; + param.w = &w; + param.bias = nullptr; + param.output = &out; + param.in_mat_dims = x.dims(); - // dim 10, 20 - std::vector input(10 * 20); - std::vector w(20 * 20); - std::vector output(10 * 20); + DeviceInfo::init_info(); + std::unique_ptr ctx(new KernelContext); + ctx->As(); + fc.SetParam(param); + fc.SetContext(std::move(ctx)); + fc.Run(); - Eigen::Map input_mat(input.data(), 10, 20); - Eigen::Map weight_mat(w.data(), 20, 20); - matrix_map_t output_mat(output.data(), 10, 20); + VLOG(3) << "output vs ref"; + for (int i = 0; i < out.dims().product(); i++) { + VLOG(3) << out_data[i] << " vs " << ref_data[i]; + } - output_mat = weight_mat.transpose() * input_mat; + for (int i = 0; i < out.dims().product(); ++i) { + EXPECT_NEAR(out_data[i], ref_data[i], 1e-5); + } } -TEST(fc_arm, compute) { +TEST(fc_arm, num_col_dims) { FcCompute fc; operators::FcParam param; @@ -84,20 +104,28 @@ TEST(fc_arm, compute) { lite::Tensor bias; lite::Tensor output; - x.Resize(DDim(std::vector({1, 10, 20}))); - w.Resize(DDim(std::vector({20, 20}))); - bias.Resize(DDim(std::vector({1, 10}))); - output.Resize(DDim(std::vector({10, 20}))); + x.Resize({1, 2, 3}); + w.Resize({3, 4}); + bias.Resize({1, 4}); + output.Resize({2, 4}); auto* x_data = x.mutable_data(); auto* w_data = w.mutable_data(); auto* bias_data = bias.mutable_data(); auto* output_data = output.mutable_data(); - for (int i = 0; i < 10 * 20; i++) x_data[i] = i; - for (int i = 0; i < 20 * 20; i++) w_data[i] = i; - for (int i = 0; i < 10; i++) bias_data[i] = i; - for (int i = 0; i < 10 * 20; i++) output_data[i] = 0; + for (int64_t i = 0; i < x.dims().product(); i++) { + x_data[i] = static_cast(i); + } + for (int64_t i = 0; i < w.dims().product(); i++) { + w_data[i] = static_cast(i); + } + for (int64_t i = 0; i < bias.dims().product(); i++) { + bias_data[i] = static_cast(i); + } + for (int64_t i = 0; i < output.dims().product(); i++) { + output_data[i] = static_cast(i); + } param.in_num_col_dims = 2; param.input = &x; @@ -106,20 +134,13 @@ TEST(fc_arm, compute) { param.output = &output; 
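  // With in_num_col_dims = 2, the kernel flattens x [1, 2, 3] into a
  // (1*2) x 3 matrix, i.e. 2 rows of 3 columns, which makes the 3 x 4 weight
  // and the [2, 4] output shape above consistent.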
param.in_mat_dims = x.dims(); + std::unique_ptr ctx(new KernelContext); + ctx->As(); + DeviceInfo::init_info(); + fc.SetParam(param); + fc.SetContext(std::move(ctx)); fc.Run(); - - LOG(INFO) << "x"; - for (int i = 0; i < 10 * 20; i++) LOG(INFO) << x_data[i]; - - LOG(INFO) << "output:"; - for (int i = 0; i < 10 * 20; i++) LOG(INFO) << output.data()[i]; -} - -TEST(fc, retrive_op) { - auto fc = - KernelRegistry::Global().Create("fc"); - ASSERT_TRUE(fc); } } // namespace arm diff --git a/paddle/fluid/lite/kernels/cuda/mul_compute.h b/paddle/fluid/lite/kernels/cuda/mul_compute.h index 597d84683..5eb30bf8d 100644 --- a/paddle/fluid/lite/kernels/cuda/mul_compute.h +++ b/paddle/fluid/lite/kernels/cuda/mul_compute.h @@ -35,8 +35,8 @@ class MulCompute : public KernelLite { using param_t = operators::MulParam; void Run() override { - CHECK(context_) << "running context should be set first"; - auto& context = context_->As(); + CHECK(ctx_) << "running context should be set first"; + auto& context = ctx_->As(); CHECK(context.blas_fp32) << "blas should init first"; /* auto& blas = *context.blas_fp32; diff --git a/paddle/fluid/lite/kernels/x86/activation_compute.cc b/paddle/fluid/lite/kernels/x86/activation_compute.cc index 3001a98da..79f3829b6 100644 --- a/paddle/fluid/lite/kernels/x86/activation_compute.cc +++ b/paddle/fluid/lite/kernels/x86/activation_compute.cc @@ -60,7 +60,7 @@ class SquareCompute : public KernelLite { using param_t = operators::ActivationParam; void Run() override { - auto& context = context_->As(); + auto& context = ctx_->As(); auto& param = *param_.get_mutable(); CHECK(context.x86_device_context); @@ -82,7 +82,7 @@ class SquareGradCompute : public KernelLite { using param_t = operators::ActivationGradParam; void Run() override { - auto& context = context_->As(); + auto& context = ctx_->As(); auto& param = *param_.get_mutable(); CHECK(context.x86_device_context); param.X_grad->template mutable_data(); diff --git a/paddle/fluid/lite/kernels/x86/elementwise_compute.cc b/paddle/fluid/lite/kernels/x86/elementwise_compute.cc index d4ead92e4..e2ca9a52d 100644 --- a/paddle/fluid/lite/kernels/x86/elementwise_compute.cc +++ b/paddle/fluid/lite/kernels/x86/elementwise_compute.cc @@ -38,7 +38,7 @@ class ElementwiseSubCompute void Run() override { auto& param = *param_.get_mutable(); - auto& context = context_->As(); + auto& context = ctx_->As(); CHECK(context.x86_device_context); param.Out->template mutable_data(); diff --git a/paddle/fluid/lite/tools/build.sh b/paddle/fluid/lite/tools/build.sh index e3c639f18..37a04f901 100755 --- a/paddle/fluid/lite/tools/build.sh +++ b/paddle/fluid/lite/tools/build.sh @@ -22,7 +22,9 @@ function cmake_arm { -DLITE_WITH_CUDA=OFF \ -DLITE_WITH_LIGHT_WEIGHT_FRAMEWORK=ON \ -DWITH_TESTING=ON \ + -DWITH_MKL=OFF \ -DWITH_MKLDNN=OFF + make cxx_api_lite_bin -j8 } function build { -- GitLab
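For readers tracing the new FcCompute::Run path, here is a minimal reference of the arithmetic the packed path is expected to produce, assuming row-major storage, x of shape M x K, w stored as K x N (the layout the updated unit test uses), and an optional length-N bias. This is only an illustrative sketch of the expected numerics; the kernel itself runs lite::arm::math::prepackA and sgemm_prepack over the ARMContext workspace, with fill_bias_fc handling the bias.

// Reference FC: out[m][n] = sum_k x[m][k] * w[k][n] (+ bias[n] if present).
// All names here are local to this sketch.
static void fc_reference(const float* x, const float* w, const float* bias,
                         float* out, int M, int K, int N) {
  for (int m = 0; m < M; ++m) {
    for (int n = 0; n < N; ++n) {
      float acc = bias ? bias[n] : 0.f;
      for (int k = 0; k < K; ++k) {
        acc += x[m * K + k] * w[k * N + n];
      }
      out[m * N + n] = acc;
    }
  }
}

// Matching the compare_test shapes: x [2, 3], w [3, 4], no bias.
//   float x[6], w[12], out[8];
//   fc_reference(x, w, nullptr, out, /*M=*/2, /*K=*/3, /*N=*/4);

Note that the patch leaves the single-row case (x_h == 1) as a commented-out sgemv call in FcCompute::Run, so only the M > 1 path exercises the packed sgemm above.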