提交 58bf3c48 编写于 作者: N nhzlx

Merge branch 'incubate/lite' of http://10.87.145.36/inference/paddlelite into xzl/incubate/lite

...@@ -2,6 +2,20 @@ before_script: ...@@ -2,6 +2,20 @@ before_script:
- env - env
- export CI_USER_DIR=$(pwd) - export CI_USER_DIR=$(pwd)
# prepare ccache
- apt install ccache
# for proxy
- export http_proxy=$CI_PROXY
- export https_proxy=$CI_PROXY
# merge the latest code
- git config --global user.email "you@example.com"
- git config --global user.name "Your Name"
- git fetch origin incubate/lite
- git merge --no-ff origin/incubate/lite
image: $SERVER_LITE_DOCKER_IMAGE image: $SERVER_LITE_DOCKER_IMAGE
stages: stages:
...@@ -14,19 +28,13 @@ check:prebuilt: ...@@ -14,19 +28,13 @@ check:prebuilt:
- lite - lite
stage: ci stage: ci
script: script:
# prepare for pre-commit
- rm -rf ~/.pip - rm -rf ~/.pip
- export http_proxy=$CI_PROXY
- export https_proxy=$CI_PROXY
- pip install pre-commit - pip install pre-commit
- pre-commit install - pre-commit install
# merge the latest code
- git config --global user.email "you@example.com"
- git config --global user.name "Your Name"
- git fetch origin incubate/lite
- git merge --no-ff origin/incubate/lite
- ./paddle/fluid/lite/tools/build.sh check_style - ./paddle/fluid/lite/tools/build.sh check_style
cache: cache:
key: check_style key: check_style
paths: paths:
...@@ -42,17 +50,11 @@ build:server: ...@@ -42,17 +50,11 @@ build:server:
paths: paths:
- build/third_party - build/third_party
- ~/.ccache - ~/.ccache
- $CI_PROJECT_DIR/_build_server_ccache
script: script:
- apt install ccache # customize ccache path for specifying runner cache
- export http_proxy=$CI_PROXY - export CCACHE_DIR=$CI_PROJECT_DIR/_build_server_ccache
- export https_proxy=$CI_PROXY # run build and test
# merge the latest code
- git config --global user.email "you@example.com"
- git config --global user.name "Your Name"
- git fetch origin incubate/lite
- git merge --no-ff origin/incubate/lite
- mkdir -p build - mkdir -p build
- cd build - cd build
- ../paddle/fluid/lite/tools/build.sh cmake_x86 - ../paddle/fluid/lite/tools/build.sh cmake_x86
...@@ -66,7 +68,27 @@ build:server: ...@@ -66,7 +68,27 @@ build:server:
dependencies: dependencies:
- check:prebuilt - check:prebuilt
build:mobile: build:mobile_android:
tags:
- lite
stage: build_mobile
image: $MOBILE_LITE_DOCKER_IMAGE
cache:
key: mobile_thirdparty
paths:
- $MOBILE_LITE_CACHE0
- $MOBILE_LITE_CACHE1
- ~/.ccache
- $CI_PROJECT_DIR/build_mobile_ccache
script:
- export CCACHE_DIR=$CI_PROJECT_DIR/build_mobile_ccache
- ./paddle/fluid/lite/tools/build.sh build_test_arm_subtask_android
dependencies:
- build:server
build:mobile_armlinux:
tags: tags:
- lite - lite
stage: build_mobile stage: build_mobile
...@@ -77,17 +99,43 @@ build:mobile: ...@@ -77,17 +99,43 @@ build:mobile:
- $MOBILE_LITE_CACHE0 - $MOBILE_LITE_CACHE0
- $MOBILE_LITE_CACHE1 - $MOBILE_LITE_CACHE1
- ~/.ccache - ~/.ccache
- $CI_PROJECT_DIR/build_mobile_ccache2
script: script:
- apt install ccache - export CCACHE_DIR=$CI_PROJECT_DIR/build_mobile_ccache2
- export http_proxy=$CI_PROXY - ./paddle/fluid/lite/tools/build.sh build_test_arm_subtask_armlinux
- export https_proxy=$CI_PROXY
dependencies:
- build:server
# merge the latest code cache:
- git config --global user.email "you@example.com" key: mobile_thirdparty
- git config --global user.name "Your Name" paths:
- git fetch origin incubate/lite - $MOBILE_LITE_CACHE0
- git merge --no-ff origin/incubate/lite - $MOBILE_LITE_CACHE1
- ~/.ccache
build:mobile_model_mobilenetv2:
tags:
- lite
stage: build_mobile
image: $MOBILE_LITE_DOCKER_IMAGE
cache:
key: mobile_thirdparty
paths:
- $MOBILE_LITE_CACHE0
- $MOBILE_LITE_CACHE1
- ~/.ccache
script:
- export CCACHE_DIR=$CI_PROJECT_DIR/build_mobile_model1
- ./paddle/fluid/lite/tools/build.sh build_test_arm_model1
- ./paddle/fluid/lite/tools/build.sh build_test_arm
dependencies: dependencies:
- build:server - build:server
cache:
key: mobile_thirdparty
paths:
- $MOBILE_LITE_CACHE0
- $MOBILE_LITE_CACHE1
- ~/.ccache
- $CI_PROJECT_DIR/build_mobile_model1
...@@ -29,9 +29,10 @@ double time_diff(Time t1, Time t2) { ...@@ -29,9 +29,10 @@ double time_diff(Time t1, Time t2) {
return counter.count() / 1000.0; return counter.count() / 1000.0;
} }
void Run(const char* model_dir, int repeat) { void Run(const char* model_dir, int repeat, int thread_num) {
#ifdef LITE_WITH_ARM #ifdef LITE_WITH_ARM
DeviceInfo::Init(); DeviceInfo::Init();
DeviceInfo::Global().SetRunMode(LITE_POWER_HIGH, thread_num);
#endif #endif
lite::ExecutorLite predictor; lite::ExecutorLite predictor;
std::vector<Place> valid_places({Place{TARGET(kHost), PRECISION(kFloat)}, std::vector<Place> valid_places({Place{TARGET(kHost), PRECISION(kFloat)},
...@@ -67,8 +68,8 @@ void Run(const char* model_dir, int repeat) { ...@@ -67,8 +68,8 @@ void Run(const char* model_dir, int repeat) {
} // namespace paddle } // namespace paddle
int main(int argc, char** argv) { int main(int argc, char** argv) {
CHECK_EQ(argc, 3) << "usage: ./cmd <model_dir> <repeat>"; CHECK_EQ(argc, 4) << "usage: ./cmd <model_dir> <repeat> <thread_num>";
paddle::lite::Run(argv[1], std::stoi(argv[2])); paddle::lite::Run(argv[1], std::stoi(argv[2]), std::stoi(argv[3]));
return 0; return 0;
} }
......
...@@ -13,322 +13,7 @@ ...@@ -13,322 +13,7 @@
// limitations under the License. // limitations under the License.
#include "paddle/fluid/lite/core/context.h" #include "paddle/fluid/lite/core/context.h"
#include "paddle/fluid/lite/core/cpu_info.h"
#ifdef LITE_WITH_LINUX
#include <sys/syscall.h>
#include <unistd.h>
#endif
#if __APPLE__
#include "TargetConditionals.h"
#if TARGET_OS_IPHONE
#include <mach/machine.h>
#include <sys/sysctl.h>
#include <sys/types.h>
#endif // TARGET_OS_IPHONE
#endif // __APPLE__
#ifdef ARM_WITH_OMP
#include <omp.h>
#endif
namespace paddle { namespace paddle {
namespace lite { namespace lite {} // namespace lite
#ifdef LITE_WITH_ARM
// Overrides the per-core cache sizes recorded in the global DeviceInfo with
// the given values (one entry per core reported by arm_get_cpucount()) and
// resizes this context's workspace to 2 * (l1size + l2size) elements.
void Context<TargetType::kARM>::SetCache(int l1size, int l2size, int l3size) {
  DeviceInfo& info = DeviceInfo::Global();
  const int core_total = arm_get_cpucount();
  // assign(n, v) leaves each table with exactly n entries, all equal to v.
  info.L1_cache_.assign(static_cast<size_t>(core_total), l1size);
  info.L2_cache_.assign(static_cast<size_t>(core_total), l2size);
  info.L3_cache_.assign(static_cast<size_t>(core_total), l3size);
  workspace_.Resize({2 * (l1size + l2size)});
}
// Default context: high-power mode on core 0, with a workspace sized from
// that core's L2 cache entry in the global DeviceInfo.
Context<TargetType::kARM>::Context() {
  mode_ = LITE_POWER_HIGH;
  active_ids_ = {0};
  DeviceInfo& info = DeviceInfo::Global();
  const auto l2_floats = info.L2_cache_[active_ids_[0]] / sizeof(float);
  workspace_.Resize({static_cast<int64_t>(l2_floats)});
#ifdef TARGET_IOS
  arch_ = APPLE;  // use 6x8
#else
  // Take the architecture of the first big core when one is known; otherwise
  // arch_ is left as it was.
  if (!info.big_core_ids_.empty()) {
    arch_ = info.archs_[info.big_core_ids_.front()];
  }
#endif
}
// Returns the power mode currently stored in this context.
PowerMode Context<TargetType::kARM>::mode() const { return mode_; }
// Returns the number of active core ids, i.e. the thread count in use.
int Context<TargetType::kARM>::threads() const { return active_ids_.size(); }
// Copy constructor: takes over mode, active core ids, workspace, arch and
// call counter from `ctx` by reusing the member-wise copy assignment.
Context<TargetType::kARM>::Context(const ARMContext& ctx) { *this = ctx; }
// Copy assignment: copies every data member of `ctx` into this context and
// returns a reference to this context.
ARMContext& Context<TargetType::kARM>::operator=(const ARMContext& ctx) {
  // The members are independent of each other, so copy order is irrelevant.
  arch_ = ctx.arch_;
  mode_ = ctx.mode_;
  count_ = ctx.count_;
  active_ids_ = ctx.active_ids_;
  workspace_ = ctx.workspace_;
  return *this;
}
// Binds the calling thread(s) to the cores listed in active_ids_.
// With OpenMP, the thread count is set to the number of active cores and, on
// Linux, every OpenMP thread sets its affinity to the whole active set.
// Without OpenMP, on Linux, the current thread is bound to the first active
// core only. A failure is reported and the function returns.
void Context<TargetType::kARM>::BindDev() {
#ifdef ARM_WITH_OMP
  int thread_total = active_ids_.size();
  omp_set_num_threads(thread_total);
#ifdef LITE_WITH_LINUX
  // One result slot per thread, pre-filled with 0 (success).
  std::vector<int> results(thread_total, 0);
#pragma omp parallel for
  for (int t = 0; t < thread_total; t++) {
    results[t] = set_sched_affinity(active_ids_);
  }
  for (int t = 0; t < thread_total; t++) {
    if (results[t] == 0) {
      continue;
    }
    LOG(ERROR) << "set cpu affinity failed, cpuID: " << active_ids_[t];
    return;
  }
#endif  // LITE_WITH_LINUX
#else   // ARM_WITH_OMP
#ifdef LITE_WITH_LINUX
  std::vector<int> first_core;
  first_core.push_back(active_ids_[0]);
  if (set_sched_affinity(first_core) != 0) {
    printf("set cpu affinity failed, cpuID: %d\n", active_ids_[0]);
    return;
  }
#endif  // LITE_WITH_LINUX
#endif  // ARM_WITH_OMP
}
// Chooses the cores this context runs on for the requested power mode and
// thread count, then sizes the workspace and picks the arch from the first
// selected core.
//
// mode:    requested PowerMode. HIGH/RAND_HIGH prefer big cores, LOW/RAND_LOW
//          prefer little cores, FULL fills big cores first and then little
//          ones, NO_BIND only fixes the thread count. When the preferred
//          cluster has no cores the other cluster is used and mode_ is
//          switched accordingly.
// threads: requested thread count; clamped to the number of detected cores.
//
// Without ARM_WITH_OMP the mode is ignored and a single core is selected
// (the first big core if one exists, otherwise core 0).
void Context<TargetType::kARM>::SetRunMode(PowerMode mode, int threads) {
  DeviceInfo& dev = DeviceInfo::Global();
  const int big_core_size = dev.big_core_ids_.size();
  const int small_core_size = dev.little_core_ids_.size();
  if (threads > big_core_size + small_core_size) {
    threads = big_core_size + small_core_size;
  }
#ifdef ARM_WITH_OMP
  count_++;
  // Rotation offset used by the RAND modes; it advances every 10 calls.
  // Guarded because a device without big cores would otherwise evaluate
  // `x % 0`, which is undefined behavior.
  const int shift_num =
      big_core_size > 0 ? static_cast<int>((count_ / 10) % big_core_size) : 0;

  // Appends `threads` ids from `ids` to active_ids_, starting at offset
  // `shift` and wrapping around. When more threads are requested than `ids`
  // holds, the whole list is taken unshifted.
  auto pick_cores = [this, threads](const std::vector<int>& ids, int shift) {
    const int available = ids.size();
    if (threads > available) {
      active_ids_ = ids;
      return;
    }
    for (int i = 0; i < threads; ++i) {
      active_ids_.push_back(ids[(i + shift) % available]);
    }
  };

  switch (mode) {
    case LITE_POWER_FULL:
      mode_ = mode;
      active_ids_.clear();
      for (int i = 0; i < threads; ++i) {
        if (i < big_core_size) {
          active_ids_.push_back(dev.big_core_ids_[i]);
        } else {
          active_ids_.push_back(dev.little_core_ids_[i - big_core_size]);
        }
      }
      break;
    case LITE_POWER_HIGH:
      active_ids_.clear();
      if (big_core_size > 0) {
        mode_ = LITE_POWER_HIGH;
        if (threads > big_core_size) {
          LOG(ERROR) << "threads: " << threads
                     << ", exceed the big cores size: " << big_core_size;
        }
        pick_cores(dev.big_core_ids_, 0);
      } else {
        mode_ = LITE_POWER_LOW;
        LOG(ERROR) << "HIGH POWER MODE is not support, switch to little cores";
        pick_cores(dev.little_core_ids_, 0);
      }
      break;
    case LITE_POWER_LOW:
      active_ids_.clear();
      if (small_core_size > 0) {
        mode_ = LITE_POWER_LOW;
        if (threads > small_core_size) {
          LOG(WARNING) << "threads: " << threads
                       << ", exceed the little cores size: " << small_core_size;
        }
        pick_cores(dev.little_core_ids_, 0);
      } else {
        mode_ = LITE_POWER_HIGH;
        LOG(WARNING) << "LOW POWER MODE is not support, switch to big cores";
        pick_cores(dev.big_core_ids_, 0);
      }
      break;
    case LITE_POWER_NO_BIND: {
      mode_ = LITE_POWER_NO_BIND;
      active_ids_.clear();
      const size_t core_total = dev.core_ids_.size();
      // Same clamp as the former int-vs-size_t comparison, where a negative
      // request compared as a huge unsigned value.
      const bool over_limit =
          threads < 0 || static_cast<size_t>(threads) > core_total;
      active_ids_.resize(over_limit ? core_total
                                    : static_cast<size_t>(threads));
      break;
    }
    case LITE_POWER_RAND_HIGH:
      active_ids_.clear();
      if (big_core_size > 0) {
        mode_ = LITE_POWER_RAND_HIGH;
        if (threads > big_core_size) {
          LOG(WARNING) << "threads: " << threads
                       << ", exceed the big cores size: " << big_core_size;
        }
        pick_cores(dev.big_core_ids_, shift_num);
      } else {
        mode_ = LITE_POWER_LOW;
        LOG(WARNING)
            << "HIGH POWER MODE is not support, switch to little cores";
        pick_cores(dev.little_core_ids_, 0);
      }
      break;
    case LITE_POWER_RAND_LOW:
      active_ids_.clear();
      if (small_core_size > 0) {
        mode_ = LITE_POWER_RAND_LOW;
        if (threads > small_core_size) {
          LOG(WARNING) << "threads: " << threads
                       << ", exceed the little cores size: " << small_core_size;
        }
        pick_cores(dev.little_core_ids_, shift_num);
      } else {
        mode_ = LITE_POWER_HIGH;
        LOG(WARNING) << "LOW POWER MODE is not support, switch to big cores";
        pick_cores(dev.big_core_ids_, 0);
      }
      break;
  }
  // Everything below indexes active_ids_[0], so always keep at least one
  // entry. This also covers NO_BIND with a zero thread count, which used to
  // leave the list empty.
  if (active_ids_.empty()) {
    active_ids_.push_back(0);
  }

  //! fix multi-threads LITE_POWER_HIGH mode
  const int active_threads = active_ids_.size();
  if (mode_ == LITE_POWER_NO_BIND || threads > 1) {
    omp_set_num_threads(active_threads);
  } else if (check_online(active_ids_)) {
    BindDev();
  } else {
    LOG(ERROR) << "core id " << active_ids_[0]
               << " is offline, switch to NO BIND MODE";
    omp_set_num_threads(active_threads);
  }
#else
  if (big_core_size > 0) {
    active_ids_ = {dev.big_core_ids_[0]};
  } else {
    active_ids_ = {0};
  }
#endif
  //! alloc memory for sgemm in this context
  int temp_mem_size = dev.L2_cache_[active_ids_[0]] / sizeof(float);
  workspace_.Resize({temp_mem_size});
  arch_ = dev.archs_[active_ids_[0]];
}
// Returns the ARM architecture currently stored in this context.
ARMArch Context<TargetType::kARM>::arch() const { return arch_; }
// Overrides the stored ARM architecture with `arch`.
void Context<TargetType::kARM>::SetArch(ARMArch arch) { arch_ = arch; }
// Cache-size accessors: each returns the global DeviceInfo table entry for
// the first active core of this context.
int Context<TargetType::kARM>::l1_cache_size() const {
  return DeviceInfo::Global().L1_cache_[active_ids_[0]];
}
int Context<TargetType::kARM>::l2_cache_size() const {
  return DeviceInfo::Global().L2_cache_[active_ids_[0]];
}
int Context<TargetType::kARM>::l3_cache_size() const {
  return DeviceInfo::Global().L3_cache_[active_ids_[0]];
}
// Resizes the workspace to hold dims.product() elements plus
// l2_cache_size() / sizeof(float) extra elements.
// Returns false, without resizing, when the requested element count equals
// the workspace's current element count; returns true after a resize.
bool Context<TargetType::kARM>::ExtendWorkspace(DDimLite dims) {
  auto requested = dims.product();
  if (requested == workspace_.dims().product()) {
    return false;
  }
  auto padded = requested + l2_cache_size() / sizeof(float);
  workspace_.Resize({static_cast<int64_t>(padded)});
  return true;
}
#endif // LITE_WITH_ARM
} // namespace lite
} // namespace paddle } // namespace paddle
...@@ -61,47 +61,41 @@ class Context<TargetType::kHost> { ...@@ -61,47 +61,41 @@ class Context<TargetType::kHost> {
template <> template <>
class Context<TargetType::kARM> { class Context<TargetType::kARM> {
public: public:
Context(); Context() {}
Context(PowerMode mode, int threads);
explicit Context(const ARMContext& ctx); explicit Context(const ARMContext& ctx);
ARMContext& operator=(const ARMContext& ctx); ARMContext& operator=(const ARMContext& ctx) {}
// NOTE: InitOnce should only be used by ContextScheduler // NOTE: InitOnce should only be used by ContextScheduler
void InitOnce() { DeviceInfo::Init(); } void InitOnce() { DeviceInfo::Init(); }
void CopyShared(const ARMContext* ctx) {} void CopyShared(const ARMContext* ctx) {}
void SetRunMode(PowerMode mode, int threads); void SetRunMode(PowerMode mode, int threads) {
void SetCache(int l1size, int l2size, int l3size); return DeviceInfo::Global().SetRunMode(mode, threads);
void SetArch(ARMArch arch); }
void BindDev(); void SetCache(int l1size, int l2size, int l3size) {
return DeviceInfo::Global().SetCache(l1size, l2size, l3size);
}
void SetArch(ARMArch arch) { return DeviceInfo::Global().SetArch(arch); }
PowerMode mode() const; PowerMode mode() const { return DeviceInfo::Global().mode(); }
int threads() const; int threads() const { return DeviceInfo::Global().threads(); }
ARMArch arch() const; ARMArch arch() const { return DeviceInfo::Global().arch(); }
int l1_cache_size() const { return DeviceInfo::Global().l1_cache_size(); }
int l2_cache_size() const { return DeviceInfo::Global().l2_cache_size(); }
int l3_cache_size() const { return DeviceInfo::Global().l3_cache_size(); }
template <typename T> template <typename T>
T* workspace_data() { T* workspace_data() {
return workspace_.mutable_data<T>(); return DeviceInfo::Global().workspace_data<T>();
} }
int l1_cache_size() const; bool ExtendWorkspace(DDimLite dims) {
int l2_cache_size() const; return DeviceInfo::Global().ExtendWorkspace(dims);
int l3_cache_size() const; }
bool ExtendWorkspace(DDimLite dims);
std::string name() const { return "ARMContext"; } std::string name() const { return "ARMContext"; }
private:
// LITE_POWER_HIGH stands for using big cores,
// LITE_POWER_LOW stands for using small core,
// LITE_POWER_FULL stands for using all cores
ARMArch arch_;
PowerMode mode_;
std::vector<int> active_ids_;
TensorLite workspace_;
int64_t count_{0};
}; };
#endif #endif
......
此差异已折叠。
...@@ -14,24 +14,12 @@ ...@@ -14,24 +14,12 @@
#pragma once #pragma once
#include <cstdarg>
#include <string> #include <string>
#include <vector> #include <vector>
#include "paddle/fluid/lite/core/lite_tensor.h"
#include "paddle/fluid/lite/utils/cp_logging.h" #include "paddle/fluid/lite/utils/cp_logging.h"
#ifdef LITE_WITH_LINUX
#include <sys/syscall.h>
#include <unistd.h>
#endif
#if __APPLE__
#include "TargetConditionals.h"
#if TARGET_OS_IPHONE
#include <mach/machine.h>
#include <sys/sysctl.h>
#include <sys/types.h>
#endif // TARGET_OS_IPHONE
#endif // __APPLE__
namespace paddle { namespace paddle {
namespace lite { namespace lite {
...@@ -60,64 +48,73 @@ typedef enum { ...@@ -60,64 +48,73 @@ typedef enum {
class DeviceInfo { class DeviceInfo {
public: public:
int idx_;
int max_freq_;
int min_freq_;
int generate_arch_;
int compute_core_num_;
int max_memory_;
int sharemem_size_;
std::string device_name_;
std::string compute_ability_;
std::vector<int> L1_cache_;
std::vector<int> L2_cache_;
std::vector<int> L3_cache_;
std::vector<int> core_ids_;
std::vector<int> big_core_ids_;
std::vector<int> little_core_ids_;
std::vector<int> cluster_ids_;
std::vector<ARMArch> archs_;
static DeviceInfo& Global() { static DeviceInfo& Global() {
static auto* x = new DeviceInfo; static auto* x = new DeviceInfo;
return *x; return *x;
} }
static void Init() { static int Init() {
auto& info = Global(); static int ret = Global().Setup();
InitInternal(&info); return ret;
} }
private: int Setup();
DeviceInfo() = default;
static void InitInternal(DeviceInfo* dev);
};
size_t arm_get_meminfo(); void SetRunMode(PowerMode mode, int thread_num);
void SetCache(int l1size, int l2size, int l3size);
void SetArch(ARMArch arch) { arch_ = arch; }
int arm_get_cpucount(); PowerMode mode() const { return mode_; }
int threads() const { return active_ids_.size(); }
ARMArch arch() const { return arch_; }
int l1_cache_size() const { return L1_cache_[active_ids_[0]]; }
int l2_cache_size() const { return L2_cache_[active_ids_[0]]; }
int l3_cache_size() const { return L3_cache_[active_ids_[0]]; }
void arm_get_cpu_arch(std::vector<ARMArch>* archs); template <typename T>
T* workspace_data() {
bool get_cpu_info_from_name(DeviceInfo* cpu_info, std::string hardware_name); return workspace_.mutable_data<T>();
}
#ifdef LITE_WITH_LINUX bool ExtendWorkspace(DDimLite dims);
void set_default_cache(DeviceInfo* dev);
std::string arm_get_cpu_name(); private:
int core_num_;
std::vector<int> max_freqs_;
std::vector<int> min_freqs_;
int mem_size_;
std::string dev_name_;
int get_max_freq_khz(int cpuid); std::vector<int> L1_cache_;
std::vector<int> L2_cache_;
std::vector<int> L3_cache_;
std::vector<int> core_ids_;
std::vector<int> big_core_ids_;
std::vector<int> little_core_ids_;
std::vector<int> cluster_ids_;
std::vector<ARMArch> archs_;
int arm_sort_cpuid_by_max_frequency(int cpu_count, std::vector<int>* cpuids, ARMArch arch_;
const std::vector<int>& cpu_freq, // LITE_POWER_HIGH stands for using big cores,
std::vector<int>* cluster_ids); // LITE_POWER_LOW stands for using small core,
int check_online(const std::vector<int>& core_ids); // LITE_POWER_FULL stands for using all cores
int set_sched_affinity(const std::vector<int>& cpuids); PowerMode mode_;
std::vector<int> active_ids_;
TensorLite workspace_;
int64_t count_{0};
void SetCacheInfo(int cache_id, int argc, ...);
void SetArchInfo(int argc, ...);
bool SetCPUInfoByName();
void SetCPUInfoByProb();
void RequestPowerFullMode(const int thread_num);
void RequestPowerHighMode(const int thread_num);
void RequestPowerLowMode(const int thread_num);
void RequestPowerNoBindMode(const int thread_num);
void RequestPowerRandHighMode(const int shift_num, const int thread_num);
void RequestPowerRandLowMode(const int shift_num, const int thread_num);
#endif // LITE_WITH_LINUX DeviceInfo() = default;
};
#endif // LITE_WITH_ARM #endif // LITE_WITH_ARM
......
...@@ -28,8 +28,6 @@ void ConvCompute::PrepareForRun() { ...@@ -28,8 +28,6 @@ void ConvCompute::PrepareForRun() {
auto o_dims = param.output->dims(); auto o_dims = param.output->dims();
auto& ctx = this->ctx_->template As<ARMContext>(); auto& ctx = this->ctx_->template As<ARMContext>();
// TODO(xxx): make api and expose it
ctx.SetRunMode(LITE_POWER_HIGH, 4);
int win = x_dims[3]; // nchw int win = x_dims[3]; // nchw
int hin = x_dims[2]; int hin = x_dims[2];
......
...@@ -28,7 +28,6 @@ void FcCompute::PrepareForRun() { ...@@ -28,7 +28,6 @@ void FcCompute::PrepareForRun() {
auto w_dims = param.w->dims(); auto w_dims = param.w->dims();
auto& ctx = this->ctx_->template As<ARMContext>(); auto& ctx = this->ctx_->template As<ARMContext>();
ctx.SetRunMode(LITE_POWER_HIGH, 4);
CHECK_GE(x_dims.size(), 2UL); CHECK_GE(x_dims.size(), 2UL);
CHECK_EQ(w_dims.size(), 2UL); CHECK_EQ(w_dims.size(), 2UL);
......
...@@ -24,7 +24,6 @@ namespace arm { ...@@ -24,7 +24,6 @@ namespace arm {
void MulCompute::PrepareForRun() { void MulCompute::PrepareForRun() {
auto& ctx = this->ctx_->template As<ARMContext>(); auto& ctx = this->ctx_->template As<ARMContext>();
ctx.SetRunMode(LITE_POWER_HIGH, 4);
} }
void MulCompute::Run() { void MulCompute::Run() {
......
...@@ -26,7 +26,6 @@ namespace arm { ...@@ -26,7 +26,6 @@ namespace arm {
void PoolCompute::PrepareForRun() { void PoolCompute::PrepareForRun() {
auto& ctx = this->ctx_->template As<ARMContext>(); auto& ctx = this->ctx_->template As<ARMContext>();
ctx.SetRunMode(LITE_POWER_HIGH, 4);
} }
void PoolCompute::Run() { void PoolCompute::Run() {
......
...@@ -17,6 +17,7 @@ cc_library(dropout_compute_x86 SRCS dropout_compute.cc DEPS ${lite_kernel_deps} ...@@ -17,6 +17,7 @@ cc_library(dropout_compute_x86 SRCS dropout_compute.cc DEPS ${lite_kernel_deps}
cc_library(concat_compute_x86 SRCS concat_compute.cc DEPS ${lite_kernel_deps} ) cc_library(concat_compute_x86 SRCS concat_compute.cc DEPS ${lite_kernel_deps} )
cc_library(conv_compute_x86 SRCS conv_compute.cc DEPS ${lite_kernel_deps} blas im2col vol2col) cc_library(conv_compute_x86 SRCS conv_compute.cc DEPS ${lite_kernel_deps} blas im2col vol2col)
cc_library(pool_compute_x86 SRCS pool_compute.cc DEPS ${lite_kernel_deps} pooling) cc_library(pool_compute_x86 SRCS pool_compute.cc DEPS ${lite_kernel_deps} pooling)
cc_library(batch_norm_compute_x86 SRCS batch_norm_compute.cc DEPS ${lite_kernel_deps})
lite_cc_test(test_fc_compute_x86 SRCS fc_compute_test.cc DEPS fc_compute_x86) lite_cc_test(test_fc_compute_x86 SRCS fc_compute_test.cc DEPS fc_compute_x86)
lite_cc_test(test_conv2d_compute_x86 SRCS conv_compute_test.cc DEPS conv_compute_x86) lite_cc_test(test_conv2d_compute_x86 SRCS conv_compute_test.cc DEPS conv_compute_x86)
...@@ -28,6 +29,7 @@ lite_cc_test(test_relu_compute_x86 SRCS relu_compute_test.cc DEPS relu_compute_x ...@@ -28,6 +29,7 @@ lite_cc_test(test_relu_compute_x86 SRCS relu_compute_test.cc DEPS relu_compute_x
lite_cc_test(test_mul_compute_x86 SRCS mul_compute_test.cc DEPS mul_compute_x86 operator) lite_cc_test(test_mul_compute_x86 SRCS mul_compute_test.cc DEPS mul_compute_x86 operator)
lite_cc_test(test_scale_compute_x86 SRCS scale_compute_test.cc DEPS scale_compute_x86) lite_cc_test(test_scale_compute_x86 SRCS scale_compute_test.cc DEPS scale_compute_x86)
lite_cc_test(test_dropout_compute_x86 SRCS dropout_compute_test.cc DEPS dropout_compute_x86) lite_cc_test(test_dropout_compute_x86 SRCS dropout_compute_test.cc DEPS dropout_compute_x86)
lite_cc_test(test_batch_norm_compute_x86 SRCS batch_norm_compute_test.cc DEPS batch_norm_compute_x86)
set(x86_kernels set(x86_kernels
...@@ -44,6 +46,7 @@ set(x86_kernels ...@@ -44,6 +46,7 @@ set(x86_kernels
concat_compute_x86 concat_compute_x86
conv_compute_x86 conv_compute_x86
pool_compute_x86 pool_compute_x86
batch_norm_compute_x86
) )
set(x86_kernels "${x86_kernels}" CACHE INTERNAL "x86 kernels") set(x86_kernels "${x86_kernels}" CACHE INTERNAL "x86 kernels")
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/lite/kernels/x86/batch_norm_compute.h"
// Registers the float / NCHW batch_norm kernel for the x86 target and declares
// the tensor type of each input and output slot. Every slot is bound exactly
// once ("MeanOut" was previously listed twice).
REGISTER_LITE_KERNEL(batch_norm, kX86, kFloat, kNCHW,
                     paddle::lite::kernels::x86::BatchNormCompute<float>, def)
    .BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))})
    .BindInput("Scale", {LiteType::GetTensorTy(TARGET(kX86))})
    .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kX86))})
    .BindInput("Mean", {LiteType::GetTensorTy(TARGET(kX86))})
    .BindInput("Variance", {LiteType::GetTensorTy(TARGET(kX86))})
    .BindOutput("Y", {LiteType::GetTensorTy(TARGET(kX86))})
    .BindOutput("MeanOut", {LiteType::GetTensorTy(TARGET(kX86))})
    .BindOutput("VarianceOut", {LiteType::GetTensorTy(TARGET(kX86))})
    .BindOutput("SavedMean", {LiteType::GetTensorTy(TARGET(kX86))})
    .BindOutput("SavedVariance", {LiteType::GetTensorTy(TARGET(kX86))})
    .Finalize();
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <random>
#include <string>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/lite/core/kernel.h"
#include "paddle/fluid/lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace x86 {
// Eigen views over raw buffers used by the batch-norm kernel. They wrap
// existing memory and do not allocate.

// Mutable view of a buffer as a dynamically sized 2-D array.
template <typename T>
using EigenArrayMap =
Eigen::Map<Eigen::Array<T, Eigen::Dynamic, Eigen::Dynamic>>;
// Read-only view of a buffer as a dynamically sized 2-D array.
template <typename T>
using ConstEigenArrayMap =
Eigen::Map<const Eigen::Array<T, Eigen::Dynamic, Eigen::Dynamic>>;
// Mutable view of a buffer as a dynamically sized column vector.
template <typename T>
using EigenVectorArrayMap = Eigen::Map<Eigen::Array<T, Eigen::Dynamic, 1>>;
// Read-only view of a buffer as a dynamically sized column vector.
template <typename T>
using ConstEigenVectorArrayMap =
Eigen::Map<const Eigen::Array<T, Eigen::Dynamic, 1>>;
// x86 batch-normalization kernel.
//
// Run() normalizes param.x per channel into param.y:
//   y = (x - mean) / sqrt(variance + epsilon) * scale + bias
// When param.is_test or param.use_global_stats is set, mean and variance come
// from param.mean / param.variance. Otherwise they are computed from the
// current batch, stored in param.saved_mean / param.saved_variance, and
// blended into param.mean_out / param.variance_out using param.momentum.
// Only the NCHW layout is implemented; any other layout hits LOG(FATAL).
template <typename T>
class BatchNormCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
public:
using param_t = operators::BatchNormParam;
void Run() override {
auto &param = *param_.get_mutable<operators::BatchNormParam>();
// True when the stored (global) statistics are used instead of batch ones.
bool global_stats = param.is_test || param.use_global_stats;
const auto *x = param.x;
const auto &x_dims = x->dims();
CHECK(x_dims.size() >= 2 && x_dims.size() <= 5);
// N: first dimension of x. C: channel count, taken from dim 1 for NCHW and
// from the last dim otherwise.
const int N = x_dims[0];
const int C = param.data_layout == DATALAYOUT(kNCHW)
? x_dims[1]
: x_dims[x_dims.size() - 1];
// Number of elements per (sample, channel) pair.
const int sample_size = x->dims().production() / N / C;
// alloc memory
param.y->template mutable_data<T>();
param.mean_out->template mutable_data<T>();
param.variance_out->template mutable_data<T>();
param.saved_mean->template mutable_data<T>();
param.saved_variance->template mutable_data<T>();
if (!global_stats) {
// saved_mean / saved_variance hold the statistics of this batch only.
EigenVectorArrayMap<T> saved_mean_e(param.saved_mean->mutable_data<T>(),
C);
EigenVectorArrayMap<T> saved_variance_e(
param.saved_variance->mutable_data<T>(), C);
saved_mean_e.setZero();
saved_variance_e.setZero();
// The running statistics are read from and written back to mean_out /
// variance_out.
// NOTE(review): this presumably expects those tensors to already hold the
// previous running values (e.g. by sharing storage with mean / variance)
// - confirm against the op definition.
EigenVectorArrayMap<T> running_mean_arr(param.mean_out->mutable_data<T>(),
C);
EigenVectorArrayMap<T> running_var_arr(
param.variance_out->mutable_data<T>(), C);
if ((N * sample_size) == 1) {
LOG(WARNING) << "Only 1 element in normalization dimension, "
<< "we skip the batch norm calculation, let y = x.";
framework::TensorCopy(x->raw_tensor(), platform::CPUPlace(),
&param.y->raw_tensor());
return;
}
switch (param.data_layout) {
case DATALAYOUT(kNCHW): {
// View x as a (sample_size x N*C) array: column nc holds the elements of
// one (sample, channel) pair, and nc % C is its channel index.
ConstEigenArrayMap<T> x_arr(x->data<T>(), sample_size, N * C);
// First pass: per-channel sum, then divide to get the mean.
for (int nc = 0; nc < N * C; ++nc) {
saved_mean_e(nc % C) += x_arr.col(nc).sum();
}
saved_mean_e /= N * sample_size;
// Second pass: per-channel sum of squared deviations, then divide to get
// the variance.
for (int nc = 0; nc < N * C; ++nc) {
saved_variance_e(nc % C) +=
(x_arr.col(nc) - saved_mean_e(nc % C)).matrix().squaredNorm();
}
saved_variance_e /= N * sample_size;
break;
}
default:
LOG(FATAL) << "Unknown storage order: "
<< DataLayoutToStr(param.data_layout);
break;
}
// Blend the batch statistics into the running ones:
// running = running * momentum + batch * (1 - momentum).
running_mean_arr = running_mean_arr * param.momentum +
saved_mean_e * (1. - param.momentum);
running_var_arr = running_var_arr * param.momentum +
saved_variance_e * (1. - param.momentum);
}
// use SavedMean and SavedVariance to do normalize
Eigen::Array<T, Eigen::Dynamic, 1> inv_std(C);
if (global_stats) {
ConstEigenVectorArrayMap<T> var_arr(param.variance->data<T>(), C);
inv_std = (var_arr + param.epsilon).sqrt().inverse();
} else {
// This map aliases the saved_variance buffer, so the assignment below
// overwrites the batch variance in place with 1 / sqrt(variance + epsilon).
EigenVectorArrayMap<T> saved_inv_std(
param.saved_variance->mutable_data<T>(), C);
// inverse SavedVariance first, gradient will use it too.
saved_inv_std = (saved_inv_std + param.epsilon).inverse().sqrt();
inv_std = saved_inv_std;
}
ConstEigenVectorArrayMap<T> mean_arr(
global_stats ? param.mean->data<T>() : param.saved_mean->data<T>(), C);
// ((x - est_mean) * (inv_var) * scale + bias
// formula transform ====>
// (x * inv_var * scale) + (bias - est_mean * inv_var * scale)
ConstEigenVectorArrayMap<T> scale_arr(param.scale->data<T>(), C);
ConstEigenVectorArrayMap<T> bias_arr(param.bias->data<T>(), C);
// Fold everything into one per-channel multiplier and one per-channel offset.
Eigen::Array<T, Eigen::Dynamic, 1> new_scale = inv_std * scale_arr;
Eigen::Array<T, Eigen::Dynamic, 1> new_bias =
bias_arr - mean_arr * inv_std * scale_arr;
switch (param.data_layout) {
case DATALAYOUT(kNCHW): {
EigenArrayMap<T> y_arr(param.y->mutable_data<T>(), sample_size, N * C);
ConstEigenArrayMap<T> x_arr(x->data<T>(), sample_size, N * C);
// Apply the channel's multiplier and offset to each (sample, channel) column.
for (int nc = 0; nc < N * C; ++nc) {
y_arr.col(nc) = x_arr.col(nc) * new_scale(nc % C) + new_bias(nc % C);
}
break;
}
default:
LOG(FATAL) << "Unknown storage order: "
<< DataLayoutToStr(param.data_layout);
break;
}
}
virtual ~BatchNormCompute() = default;
};
} // namespace x86
} // namespace kernels
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/lite/kernels/x86/batch_norm_compute.h"
#include <gtest/gtest.h>
#include <iostream>
#include <vector>
#include "paddle/fluid/lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace x86 {
// The kernel registry must be able to create at least one x86 float
// "batch_norm" kernel, and the first one must be non-null.
TEST(batch_norm_x86, retrive_op) {
  auto kernels =
      KernelRegistry::Global().Create<TARGET(kX86), PRECISION(kFloat)>(
          "batch_norm");
  ASSERT_FALSE(kernels.empty());
  ASSERT_TRUE(kernels.front());
}
// A default-constructed kernel must report float precision and the x86
// target.
TEST(batch_norm_x86, init) {
  BatchNormCompute<float> kernel;
  ASSERT_EQ(kernel.precision(), PRECISION(kFloat));
  ASSERT_EQ(kernel.target(), TARGET(kX86));
}
// Smoke test: runs the kernel in training mode (batch statistics) on a
// 2x3x64x64 input and logs the results. It checks that the kernel runs; it
// does not compare the output against reference values.
TEST(batch_norm_x86, run_test) {
  lite::Tensor x, scale, bias, mean, variance, y, mean_out, variance_out,
      saved_mean, saved_variance;
  constexpr int batch_size = 2;
  std::vector<int64_t> x_shape{batch_size, 3, 64, 64};
  x.Resize(lite::DDim(x_shape));
  std::vector<int64_t> scale_shape{3};
  scale.Resize(lite::DDim(scale_shape));
  std::vector<int64_t> bias_shape{3};
  bias.Resize(lite::DDim(bias_shape));
  std::vector<int64_t> mean_shape{3};
  mean.Resize(lite::DDim(mean_shape));
  std::vector<int64_t> variance_shape{3};
  variance.Resize(lite::DDim(variance_shape));
  std::vector<int64_t> y_shape{batch_size, 3, 64, 64};
  y.Resize(lite::DDim(y_shape));
  std::vector<int64_t> mean_out_shape{3};
  mean_out.Resize(lite::DDim(mean_out_shape));
  std::vector<int64_t> variance_out_shape{3};
  variance_out.Resize(lite::DDim(variance_out_shape));
  std::vector<int64_t> saved_mean_shape{3};
  saved_mean.Resize(lite::DDim(saved_mean_shape));
  std::vector<int64_t> saved_variance_shape{3};
  saved_variance.Resize(lite::DDim(saved_variance_shape));

  auto x_data = x.mutable_data<float>();
  auto scale_data = scale.mutable_data<float>();
  auto bias_data = bias.mutable_data<float>();
  auto mean_data = mean.mutable_data<float>();
  auto variance_data = variance.mutable_data<float>();
  auto mean_out_data = mean_out.mutable_data<float>();
  auto variance_out_data = variance_out.mutable_data<float>();
  y.mutable_data<float>();
  saved_mean.mutable_data<float>();
  saved_variance.mutable_data<float>();

  for (int64_t i = 0; i < x.dims().production(); i++) {
    x_data[i] = static_cast<float>(i);
  }
  for (int64_t i = 0; i < scale.dims().production(); i++) {
    scale_data[i] = static_cast<float>(i) * 0.01f + 0.03f;
  }
  for (int64_t i = 0; i < bias.dims().production(); i++) {
    bias_data[i] = static_cast<float>(i) * 0.065f + 0.1f;
  }
  for (int64_t i = 0; i < mean.dims().production(); i++) {
    mean_data[i] = static_cast<float>(i) * 0.0565f;
  }
  for (int64_t i = 0; i < variance.dims().production(); i++) {
    variance_data[i] = static_cast<float>(i) * 2.08f + 1.5f;
  }
  // In training mode the kernel reads mean_out / variance_out as the running
  // statistics before updating them, so seed them with the incoming values
  // instead of leaving the buffers uninitialized.
  for (int64_t i = 0; i < mean_out.dims().production(); i++) {
    mean_out_data[i] = mean_data[i];
  }
  for (int64_t i = 0; i < variance_out.dims().production(); i++) {
    variance_out_data[i] = variance_data[i];
  }

  BatchNormCompute<float> batch_norm;
  operators::BatchNormParam param;

  param.x = &x;
  param.is_test = false;
  param.scale = &scale;
  param.bias = &bias;
  param.mean = &mean;
  param.variance = &variance;
  param.use_global_stats = false;
  param.epsilon = 1e-4f;
  param.momentum = 0.9f;
  param.y = &y;
  param.mean_out = &mean_out;
  param.variance_out = &variance_out;
  param.saved_mean = &saved_mean;
  param.saved_variance = &saved_variance;

  batch_norm.SetParam(param);
  batch_norm.Run();

  LOG(INFO) << "output: " << y;
  LOG(INFO) << "mean_out: " << mean_out;
  // Previously this line printed mean_out under the variance_out label.
  LOG(INFO) << "variance_out: " << variance_out;
  LOG(INFO) << "saved_mean: " << saved_mean;
  LOG(INFO) << "saved_variance: " << saved_variance;
  // TODO: compare y against reference values instead of only logging it.
}
} // namespace x86
} // namespace kernels
} // namespace lite
} // namespace paddle
// NOTE(review): presumably references the x86 / float / NCHW "def" batch_norm
// kernel registration so it is available to the registry lookup in the tests
// above -- confirm against the USE_LITE_KERNEL macro definition.
USE_LITE_KERNEL(batch_norm, kX86, kFloat, kNCHW, def);
...@@ -135,8 +135,8 @@ function test_arm_model { ...@@ -135,8 +135,8 @@ function test_arm_model {
adb -s emulator-${port} push ${model_dir} ${adb_work_dir} adb -s emulator-${port} push ${model_dir} ${adb_work_dir}
adb -s emulator-${port} push ${testpath} ${adb_work_dir} adb -s emulator-${port} push ${testpath} ${adb_work_dir}
adb -s emulator-${port} shell chmod +x "${adb_work_dir}/${test_name}" adb -s emulator-${port} shell chmod +x "${adb_work_dir}/${test_name}"
local adb_model_path="./${adb_work_dir}/`basename ${model_dir}`" local adb_model_path="${adb_work_dir}/`basename ${model_dir}`"
adb -s emulator-${port} shell "./${adb_work_dir}/${test_name} --eval_model_dir=$adb_model_path" adb -s emulator-${port} shell "${adb_work_dir}/${test_name} --eval_model_dir=$adb_model_path"
} }
...@@ -225,16 +225,11 @@ function test_arm { ...@@ -225,16 +225,11 @@ function test_arm {
for _test in $(cat $TESTS_FILE); do for _test in $(cat $TESTS_FILE); do
test_arm_android $_test $port test_arm_android $_test $port
done done
# TODO(sangoly): refine this
test_arm_model "test_cxx_api_lite" $port "./third_party/install/mobilenet_v2_relu"
} }
# Build the code and run lite arm tests. This is executed in the CI system. function prepare_emulator {
function build_test_arm { local port_armv8=$1
######################################################################## local port_armv7=$2
# job 1-4 must be in one runner
port_armv8=5554
port_armv7=5556
adb kill-server adb kill-server
adb devices | grep emulator | cut -f1 | while read line; do adb -s $line emu kill; done adb devices | grep emulator | cut -f1 | while read line; do adb -s $line emu kill; done
...@@ -245,6 +240,18 @@ function build_test_arm { ...@@ -245,6 +240,18 @@ function build_test_arm {
echo n | avdmanager create avd -f -n paddle-armv7 -k "system-images;android-24;google_apis;armeabi-v7a" echo n | avdmanager create avd -f -n paddle-armv7 -k "system-images;android-24;google_apis;armeabi-v7a"
echo -ne '\n' | ${ANDROID_HOME}/emulator/emulator -avd paddle-armv7 -noaudio -no-window -gpu off -verbose -port ${port_armv7} & echo -ne '\n' | ${ANDROID_HOME}/emulator/emulator -avd paddle-armv7 -noaudio -no-window -gpu off -verbose -port ${port_armv7} &
sleep 1m sleep 1m
}
# We split the arm unit tests into several sub-tasks so they can run in parallel and reduce the overall CI time.
# sub-task1
function build_test_arm_subtask_android {
########################################################################
# job 1-4 must be in one runner
port_armv8=5554
port_armv7=5556
prepare_emulator $port_armv8 $port_armv7
# job 1 # job 1
build_arm "android" "armv8" "gcc" build_arm "android" "armv8" "gcc"
...@@ -252,9 +259,9 @@ function build_test_arm { ...@@ -252,9 +259,9 @@ function build_test_arm {
cd - cd -
# job 2 # job 2
build_arm "android" "armv8" "clang" #build_arm "android" "armv8" "clang"
test_arm "android" "armv8" "clang" ${port_armv8} #test_arm "android" "armv8" "clang" ${port_armv8}
cd - #cd -
# job 3 # job 3
build_arm "android" "armv7" "gcc" build_arm "android" "armv7" "gcc"
...@@ -262,13 +269,22 @@ function build_test_arm { ...@@ -262,13 +269,22 @@ function build_test_arm {
cd - cd -
# job 4 # job 4
build_arm "android" "armv7" "clang" #build_arm "android" "armv7" "clang"
test_arm "android" "armv7" "clang" ${port_armv7} #test_arm "android" "armv7" "clang" ${port_armv7}
cd - #cd -
adb devices | grep emulator | cut -f1 | while read line; do adb -s $line emu kill; done adb devices | grep emulator | cut -f1 | while read line; do adb -s $line emu kill; done
echo "Done" echo "Done"
}
# sub-task2
function build_test_arm_subtask_armlinux {
######################################################################## ########################################################################
# job 1-4 must be in one runner
port_armv8=5554
port_armv7=5556
prepare_emulator $port_armv8 $port_armv7
# job 5 # job 5
build_arm "armlinux" "armv8" build_arm "armlinux" "armv8"
...@@ -285,9 +301,47 @@ function build_test_arm { ...@@ -285,9 +301,47 @@ function build_test_arm {
test_arm "armlinux" "armv7hf" test_arm "armlinux" "armv7hf"
cd - cd -
adb devices | grep emulator | cut -f1 | while read line; do adb -s $line emu kill; done
echo "Done"
}
# sub-task3
function build_test_arm_subtask3_mobilenet_v2 {
local port_armv8=5554
local port_armv7=5556
# We just test following single one environment to limit the CI time.
local os=android
local abi=armv8
local lang=gcc
cur_dir=$(pwd)
build_dir=$cur_dir/build.lite.${os}.${abi}.${lang}
mkdir -p $build_dir
cd $build_dir
cmake_arm $os $abi $lang
make test_cxx_api_lite -j$NUM_CORES_FOR_COMPILE
prepare_emulator $port_armv8 $port_armv7
# just test the model on armv8
test_arm_model "test_cxx_api_lite" $port_armv8 "./third_party/install/mobilenet_v2_relu"
adb devices | grep emulator | cut -f1 | while read line; do adb -s $line emu kill; done
echo "Done" echo "Done"
} }
# Full ARM CI entry point: builds the code and runs the lite arm tests by
# executing the android sub-task followed by the armlinux sub-task.
function build_test_arm {
    # Emulator ports, assigned as (non-local) shell variables before the
    # sub-tasks run.
    port_armv8=5554
    port_armv7=5556

    # Run the sub-tasks one after another, android first.
    local subtask
    for subtask in build_test_arm_subtask_android build_test_arm_subtask_armlinux; do
        "$subtask"
    done
}
############################# MAIN ################################# ############################# MAIN #################################
function print_usage { function print_usage {
echo -e "\nUSAGE:" echo -e "\nUSAGE:"
...@@ -379,6 +433,18 @@ function main { ...@@ -379,6 +433,18 @@ function main {
build_test_arm build_test_arm
shift shift
;; ;;
build_test_arm_subtask_android)
build_test_arm_subtask_android
shift
;;
build_test_arm_subtask_armlinux)
build_test_arm_subtask_armlinux
shift
;;
build_test_arm_model1)
build_test_arm_subtask3_mobilenet_v2
shift
;;
check_style) check_style)
check_style check_style
shift shift
...@@ -397,4 +463,3 @@ function main { ...@@ -397,4 +463,3 @@ function main {
} }
main $@ main $@
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册