1.update CPUContext to probe CPU info 2.improve the performance of...

1.update CPUContext to probe CPU info 2.improve the performance of SlidingwindowConv3x3s1 and SlidingwindowConv3x3s2 (#1655)

1.update CPUContext to probe CPU info 2.improve the performance of...
1.update CPUContext to probe CPU info 2.improve the performance of SlidingwindowConv3x3s1 and SlidingwindowConv3x3s2 (#1655)
b025553b · hong19860320 · GitHub · b53a20aa · b025553b · b025553b
14 changed files
--- a/src/common/types.h
+++ b/src/common/types.h
@@ -111,6 +111,26 @@ enum PoolingType {
  FIRST = 3,
 };
+enum PowerMode {
+  PERFORMANCE_PRIORITY = 0,  // run threads on big cores if
+                             // thread_num <= big_cores_num;
+                             // otherwise the power mode falls
+                             // back to AUTO and all threads are
+                             // scheduled by the system
+  EFFICIENCY_PRIORITY = 1,   // run threads on little cores if
+                             // thread_num <= little_cores_num;
+                             // otherwise the power mode falls
+                             // back to AUTO and all threads are
+                             // scheduled by the system
+  PERFORMANCE_ONLY = 2,      // force threads onto big cores; the
+                             // thread count is capped at the
+                             // number of big cores
+  EFFICIENCY_ONLY = 3,       // force threads onto little cores; the
+                             // thread count is capped at the
+                             // number of little cores
+  AUTO = 4,                  // scheduled by the system
+};
 struct PaddleMobileConfigInternal {
  bool load_when_predict = false;
 };

--- a/src/framework/context.cpp
+++ b/src/framework/context.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "framework/context.h"
+#include <iostream>
+#include <string>
+#include "common/log.h"
+#ifdef __APPLE__
+#include "TargetConditionals.h"
+#ifdef TARGET_OS_IPHONE
+// iOS
+#elif TARGET_OS_MAC
+// Mac OS
+#else
+// Unsupported platform
+#endif
+#include <mach/machine.h>
+#include <sys/sysctl.h>
+#include <sys/types.h>
+#else  // Linux or Android
+#include <sys/syscall.h>
+#include <unistd.h>
+#endif
+namespace paddle_mobile {
+namespace framework {
+const int DEFAULT_L1_CACHE_SIZE = 32 * 1024;  // default L1 size; used when no probed value overrides it
+const int DEFAULT_L2_CACHE_SIZE = 2048 * 1024;  // default L2 size; used when no probed value overrides it
+const int DEFAULT_L3_CACHE_SIZE = 0;  // default L3 size (0); used when no probed value overrides it
+// Writes `value` into the per-core table `cpu_cache_sizes`.
+//   cpu_cache_sizes: table indexed by cpu id; a null pointer is ignored.
+//   value:           the cache size to store.
+//   cpu_ids:         ids of the entries to update; when empty (the default)
+//                    every entry of the table is updated.
+// Ids that do not index into the table are skipped rather than written out of
+// bounds.
+void fill_cpu_cache_size(std::vector<int> *cpu_cache_sizes, int value,
+                         const std::vector<int> &cpu_ids = {}) {
+  if (cpu_cache_sizes == nullptr) {
+    return;
+  }
+  if (cpu_ids.empty()) {
+    // no explicit core list: apply the value to the whole table
+    for (size_t i = 0; i < cpu_cache_sizes->size(); i++) {
+      (*cpu_cache_sizes)[i] = value;
+    }
+    return;
+  }
+  const int table_size = static_cast<int>(cpu_cache_sizes->size());
+  for (size_t i = 0; i < cpu_ids.size(); i++) {
+    const int cpu_id = cpu_ids[i];
+    // an id outside [0, table_size) has no slot in the table
+    if (cpu_id < 0 || cpu_id >= table_size) {
+      continue;
+    }
+    (*cpu_cache_sizes)[cpu_id] = value;
+  }
+}
+// Returns the number of CPU cores, never less than 1.
+int get_cpu_num() {
+  int cpu_count = 0;
+#ifdef __APPLE__
+  // query the "hw.ncpu" sysctl
+  size_t count_len = sizeof(cpu_count);
+  sysctlbyname("hw.ncpu", &cpu_count, &count_len, NULL, 0);
+#else  // Linux or Android
+  // count consecutive /sys/devices/system/cpu/cpu<N>/uevent files, stopping
+  // at the first one that cannot be opened; at most 20 ids are tried
+  const int max_cpu_num = 20;
+  char path[256];
+  for (int cpu_id = 0; cpu_id < max_cpu_num; ++cpu_id) {
+    snprintf(path, sizeof(path), "/sys/devices/system/cpu/cpu%d/uevent",
+             cpu_id);
+    FILE *fp = fopen(path, "rb");
+    if (fp == NULL) {
+      break;
+    }
+    fclose(fp);
+    ++cpu_count;
+  }
+#endif
+  return cpu_count < 1 ? 1 : cpu_count;
+}
+#if !defined(__APPLE__)  // Linux or Android
+// Scans /proc/cpuinfo and returns the first line containing "Hardware",
+// exactly as read by fgets. Returns an empty string when the file cannot be
+// opened or no such line is found.
+std::string get_cpu_name() {
+  FILE *cpuinfo = fopen("/proc/cpuinfo", "rb");
+  if (cpuinfo == NULL) {
+    return "";
+  }
+  std::string hardware_line;
+  char buf[1024];
+  while (fgets(buf, sizeof(buf), cpuinfo) != NULL) {
+    if (strstr(buf, "Hardware") != NULL) {
+      hardware_line = buf;
+      break;
+    }
+  }
+  fclose(cpuinfo);
+  return hardware_line;
+}
+// Reads the maximum frequency of core `cpu_id` from sysfs.
+// Returns the value that was read, or 0 when no source file can be opened or
+// parsed.
+int get_cpu_max_freq_khz(int cpu_id) {
+  char path[256];
+  FILE *fp = NULL;
+  int result_khz = 0;
+#ifdef __ANDROID__
+  // first try, for all possible cpu
+  snprintf(path, sizeof(path),
+           "/sys/devices/system/cpu/cpufreq/stats/cpu%d/time_in_state", cpu_id);
+  fp = fopen(path, "rb");
+  if (fp == NULL) {
+    // second try, for online cpu
+    snprintf(path, sizeof(path),
+             "/sys/devices/system/cpu/cpu%d/cpufreq/stats/time_in_state",
+             cpu_id);
+    fp = fopen(path, "rb");
+  }
+  if (fp != NULL) {
+    // each record is an integer frequency followed by a second integer that
+    // is skipped; keep the largest frequency seen
+    while (!feof(fp)) {
+      int row_khz = 0;
+      if (fscanf(fp, "%d %*d", &row_khz) != 1) {
+        break;
+      }
+      if (row_khz > result_khz) {
+        result_khz = row_khz;
+      }
+    }
+    fclose(fp);
+    return result_khz;
+  }
+  // third try, for online cpu: a file holding a single value
+  snprintf(path, sizeof(path),
+           "/sys/devices/system/cpu/cpu%d/cpufreq/cpuinfo_max_freq", cpu_id);
+  fp = fopen(path, "rb");
+#else
+  snprintf(path, sizeof(path),
+           "/sys/devices/system/cpu/cpu%d/cpufreq/scaling_max_freq", cpu_id);
+  fp = fopen(path, "r");
+#endif
+  // shared tail: parse one integer from the opened file
+  if (fp == NULL) {
+    return 0;
+  }
+  if (fscanf(fp, "%d", &result_khz) <= 0) {
+    result_khz = 0;
+  }
+  fclose(fp);
+  return result_khz;
+}
+// Reads the L1/L2/L3 cache sizes of core `cpu_id` from
+// /sys/devices/system/cpu/cpu<id>/cache/index<N>/{level,size}.
+// Each output starts at its DEFAULT_L*_CACHE_SIZE value and is replaced by
+// (size read from sysfs) * 1024 whenever a matching level entry is parsed.
+void get_cpu_cache_size(int cpu_id, int *l1_cache_size, int *l2_cache_size,
+                        int *l3_cache_size) {
+  *l1_cache_size = DEFAULT_L1_CACHE_SIZE;
+  *l2_cache_size = DEFAULT_L2_CACHE_SIZE;
+  *l3_cache_size = DEFAULT_L3_CACHE_SIZE;
+  const int max_cache_idx_num = 10;
+  char path[256];
+  for (int index = 0; index < max_cache_idx_num; ++index) {
+    // which cache level does this index describe?
+    snprintf(path, sizeof(path),
+             "/sys/devices/system/cpu/cpu%d/cache/index%d/level", cpu_id,
+             index);
+    FILE *level_fp = fopen(path, "rb");
+    if (level_fp == NULL) {
+      continue;
+    }
+    int level = -1;
+    fscanf(level_fp, "%d", &level);
+    fclose(level_fp);
+    // and how large is it?
+    snprintf(path, sizeof(path),
+             "/sys/devices/system/cpu/cpu%d/cache/index%d/size", cpu_id,
+             index);
+    FILE *size_fp = fopen(path, "rb");
+    if (size_fp == NULL) {
+      continue;
+    }
+    int size = -1;
+    fscanf(size_fp, "%d", &size);
+    fclose(size_fp);
+    if (size < 0) {
+      continue;
+    }
+    switch (level) {
+      case 1:
+        *l1_cache_size = size * 1024;
+        break;
+      case 2:
+        *l2_cache_size = size * 1024;
+        break;
+      case 3:
+        *l3_cache_size = size * 1024;
+        break;
+      default:
+        break;
+    }
+  }
+}
+// Filters `cpu_ids` in place, keeping only the ids whose
+// /sys/devices/system/cpu/cpu<id>/online file can be opened and holds a
+// non-zero value. Returns the number of ids that remain.
+int check_online(std::vector<int> *cpu_ids) {
+  if (cpu_ids->empty()) {
+    return 0;
+  }
+  std::vector<int> still_online;
+  char path[256];
+  for (size_t idx = 0; idx < cpu_ids->size(); ++idx) {
+    const int cpu_id = (*cpu_ids)[idx];
+    snprintf(path, sizeof(path), "/sys/devices/system/cpu/cpu%d/online",
+             cpu_id);
+    FILE *fp = fopen(path, "rb");
+    if (fp == NULL) {
+      // open failed(Permission denied): the id is dropped from the list
+      continue;
+    }
+    int is_online = 0;
+    fscanf(fp, "%d", &is_online);
+    fclose(fp);
+    if (is_online != 0) {
+      still_online.push_back(cpu_id);
+    }
+  }
+  cpu_ids->swap(still_online);
+  return cpu_ids->size();
+}
+int set_sched_affinity(const std::vector<int> &cpu_ids) {  // returns 0 on success, -1 when the syscall reports an error
+// Local cpu_set_t definition plus the CPU_SET / CPU_ZERO helpers used below.
+// ref http://stackoverflow.com/questions/16319725/android-set-thread-affinity
+#define CPU_SETSIZE 1024  // number of cpu bits the mask can hold
+#define __NCPUBITS (8 * sizeof(unsigned long))  // bits per mask word
+  typedef struct {
+    unsigned long __bits[CPU_SETSIZE / __NCPUBITS];
+  } cpu_set_t;
+#define CPU_SET(cpu, cpusetp) \
+  ((cpusetp)->__bits[(cpu) / __NCPUBITS] |= (1UL << ((cpu) % __NCPUBITS)))
+#define CPU_ZERO(cpusetp) memset((cpusetp), 0, sizeof(cpu_set_t))
+  // look up the thread id, then apply the affinity mask to that thread
+#ifdef __GLIBC__
+  pid_t pid = syscall(SYS_gettid);
+#else
+  pid_t pid = gettid();
+#endif
+  cpu_set_t mask;
+  CPU_ZERO(&mask);  // start from an empty mask
+  for (int i = 0; i < cpu_ids.size(); i++) {
+    CPU_SET(cpu_ids[i], &mask);  // NOTE(review): the id is not range-checked against CPU_SETSIZE
+  }
+  int syscallret = syscall(__NR_sched_setaffinity, pid, sizeof(mask), &mask);
+  if (syscallret) {  // any non-zero return is treated as failure
+    LOG(kLOG_WARNING) << "invoke syscall(__NR_sched_setaffinity) error(ret="
+                      << syscallret << ")";
+    return -1;
+  }
+  return 0;
+}
+// Fills the CPU description from a built-in table selected by a substring of
+// `hardware_name`. Recognised names: SDM845, SDM710, MSM8998 and MSM8976.
+// On a match, *cpu_num, the big/little core id lists and the three per-core
+// cache-size tables are overwritten and 0 is returned. Otherwise the outputs
+// are left untouched and -1 is returned.
+int get_cpu_info_by_name(int *cpu_num, std::vector<int> *big_core_ids,
+                         std::vector<int> *little_core_ids,
+                         std::vector<int> *l1_cache_sizes,
+                         std::vector<int> *l2_cache_sizes,
+                         std::vector<int> *l3_cache_sizes,
+                         std::string hardware_name) {
+  const auto name_has = [&hardware_name](const char *soc) {
+    return hardware_name.find(soc) != std::string::npos;
+  };
+  // every entry of the table describes an 8-core part
+  const auto init_eight_cores = [&]() {
+    *cpu_num = 8;
+    l1_cache_sizes->resize(*cpu_num);
+    l2_cache_sizes->resize(*cpu_num);
+    l3_cache_sizes->resize(*cpu_num);
+  };
+
+  /* Snapdragon */
+  if (name_has("SDM845")) {  // 845
+    init_eight_cores();
+    *big_core_ids = {4, 5, 6, 7};
+    *little_core_ids = {0, 1, 2, 3};
+    fill_cpu_cache_size(l1_cache_sizes, 64 * 1024);
+    fill_cpu_cache_size(l2_cache_sizes, 256 * 1024, *big_core_ids);
+    fill_cpu_cache_size(l2_cache_sizes, 128 * 1024, *little_core_ids);
+    fill_cpu_cache_size(l3_cache_sizes, 2048 * 1024);
+    return 0;
+  }
+  if (name_has("SDM710")) {  // 710
+    init_eight_cores();
+    *big_core_ids = {6, 7};
+    *little_core_ids = {0, 1, 2, 3, 4, 5};
+    fill_cpu_cache_size(l1_cache_sizes, 64 * 1024, *big_core_ids);
+    fill_cpu_cache_size(l1_cache_sizes, 32 * 1024, *little_core_ids);
+    fill_cpu_cache_size(l2_cache_sizes, 256 * 1024, *big_core_ids);
+    fill_cpu_cache_size(l2_cache_sizes, 128 * 1024, *little_core_ids);
+    fill_cpu_cache_size(l3_cache_sizes, 1024 * 1024);
+    return 0;
+  }
+  if (name_has("MSM8998")) {  // 835
+    init_eight_cores();
+    *big_core_ids = {4, 5, 6, 7};
+    *little_core_ids = {0, 1, 2, 3};
+    fill_cpu_cache_size(l1_cache_sizes, 64 * 1024, *big_core_ids);
+    fill_cpu_cache_size(l1_cache_sizes, 32 * 1024, *little_core_ids);
+    // The real L2 cache size is 2M, but using it gives bad performance on
+    // conv3x3s1 or gemm, so a smaller value (1M here) is reported for all
+    // cores instead of 2048K for big cores and 1024K for little cores.
+    fill_cpu_cache_size(l2_cache_sizes, 1024 * 1024);
+    fill_cpu_cache_size(l3_cache_sizes, 0);
+    return 0;
+  }
+  if (name_has("MSM8976")) {  // 652,653
+    init_eight_cores();
+    *big_core_ids = {0, 1, 2, 3, 4, 5, 6, 7};
+    *little_core_ids = {};
+    fill_cpu_cache_size(l1_cache_sizes, 32 * 1024);
+    fill_cpu_cache_size(l2_cache_sizes, 1024 * 1024);
+    fill_cpu_cache_size(l3_cache_sizes, 0);
+    return 0;
+  }
+  return -1;
+}
+// Divides the cpu cores into big and little clusters by max frequency and
+// probes the cache sizes of every core.
+//   cpu_num:          number of cores to probe (ids 0 .. cpu_num - 1).
+//   big_core_ids:     receives the ids whose max frequency is at or above the
+//                     midpoint between the highest and lowest max frequency.
+//   little_core_ids:  receives the remaining ids.
+//   l*_cache_sizes:   resized to cpu_num and filled by get_cpu_cache_size.
+// When all cores report the same max frequency they are all classified as
+// big cores. A non-positive cpu_num leaves every output empty.
+void get_cpu_info_by_probe(int cpu_num, std::vector<int> *big_core_ids,
+                           std::vector<int> *little_core_ids,
+                           std::vector<int> *l1_cache_sizes,
+                           std::vector<int> *l2_cache_sizes,
+                           std::vector<int> *l3_cache_sizes) {
+  big_core_ids->clear();
+  little_core_ids->clear();
+  if (cpu_num <= 0) {
+    // nothing to probe; avoid reading cpu_max_freqs[0] of an empty vector
+    l1_cache_sizes->clear();
+    l2_cache_sizes->clear();
+    l3_cache_sizes->clear();
+    return;
+  }
+  // get the maximum & minimum of the per-core max frequencies (in MHz)
+  std::vector<int> cpu_max_freqs(cpu_num);
+  for (int i = 0; i < cpu_num; i++) {
+    cpu_max_freqs[i] = get_cpu_max_freq_khz(i) / 1000;
+  }
+  int max_cpu_max_freq = cpu_max_freqs[0];
+  int min_cpu_max_freq = cpu_max_freqs[0];
+  for (int i = 1; i < cpu_num; i++) {
+    int cur_cpu_max_freq = cpu_max_freqs[i];
+    if (cur_cpu_max_freq < min_cpu_max_freq) {
+      min_cpu_max_freq = cur_cpu_max_freq;
+    } else if (cur_cpu_max_freq > max_cpu_max_freq) {
+      max_cpu_max_freq = cur_cpu_max_freq;
+    }
+  }
+  // cores at or above the midpoint are big, the rest are little
+  int mid_max_freq_mhz = (max_cpu_max_freq + min_cpu_max_freq) / 2;
+  for (int i = 0; i < cpu_num; i++) {
+    if (cpu_max_freqs[i] >= mid_max_freq_mhz) {
+      big_core_ids->push_back(i);
+    } else {
+      little_core_ids->push_back(i);
+    }
+  }
+  /* get l1, l2, l3 cache size for each core */
+  l1_cache_sizes->resize(cpu_num);
+  l2_cache_sizes->resize(cpu_num);
+  l3_cache_sizes->resize(cpu_num);
+  for (int i = 0; i < cpu_num; i++) {
+    get_cpu_cache_size(i, &((*l1_cache_sizes)[i]), &((*l2_cache_sizes)[i]),
+                       &((*l3_cache_sizes)[i]));
+  }
+}
+// Applies the affinity mask `cpu_ids` via set_sched_affinity and logs a
+// warning for the first failure found. With OpenMP, set_sched_affinity is
+// invoked from a parallel loop of omp_get_max_threads() iterations; without
+// it, it is invoked once from the current thread.
+void bind_threads(const std::vector<int> &cpu_ids) {
+#ifdef _OPENMP
+  const int num_threads = omp_get_max_threads();
+  std::vector<int> results(num_threads, 0);
+#pragma omp parallel for
+  for (int i = 0; i < num_threads; i++) {
+    results[i] = set_sched_affinity(cpu_ids);
+  }
+  for (int i = 0; i < num_threads; i++) {
+    if (results[i] == 0) {
+      continue;
+    }
+    LOG(kLOG_WARNING) << "set cpu affinity failed, thread idx: " << i;
+    return;
+  }
+#else
+  if (set_sched_affinity(cpu_ids) != 0) {
+    LOG(kLOG_WARNING) << "set cpu affinity failed, thread idx: 0 ";
+  }
+#endif
+}
+#endif
+// Builds the CPU description: core count, big/little core ids and per-core
+// L1/L2/L3 cache sizes, then selects a single thread in PERFORMANCE_PRIORITY
+// mode.
+CPUContext::CPUContext() {
+  _cpu_num = get_cpu_num();
+  _big_core_ids.clear();
+  _little_core_ids.clear();
+#ifdef __APPLE__
+  // set default L1, L2 and L3 cache sizes
+  _l1_cache_sizes.resize(_cpu_num);
+  _l2_cache_sizes.resize(_cpu_num);
+  _l3_cache_sizes.resize(_cpu_num);
+  fill_cpu_cache_size(&_l1_cache_sizes, DEFAULT_L1_CACHE_SIZE);
+  fill_cpu_cache_size(&_l2_cache_sizes, DEFAULT_L2_CACHE_SIZE);
+  fill_cpu_cache_size(&_l3_cache_sizes, DEFAULT_L3_CACHE_SIZE);
+#else  // Linux or Android
+  // probe cpu info, and set big&little clusters, L1, L2 and L3 cache sizes:
+  // try the built-in table keyed by the hardware name first and fall back to
+  // probing sysfs when the name is not recognised
+  std::string cpu_name = get_cpu_name();
+  bool failed =
+      get_cpu_info_by_name(&_cpu_num, &_big_core_ids, &_little_core_ids,
+                           &_l1_cache_sizes, &_l2_cache_sizes, &_l3_cache_sizes,
+                           cpu_name) != 0;
+  if (failed) {
+    get_cpu_info_by_probe(_cpu_num, &_big_core_ids, &_little_core_ids,
+                          &_l1_cache_sizes, &_l2_cache_sizes, &_l3_cache_sizes);
+  }
+  LOG(kLOG_INFO) << "CPU num: " << _cpu_num;
+  for (int i = 0; i < _cpu_num; i++) {
+    // the tables hold byte counts (e.g. 32 * 1024), so divide by 1024 to
+    // match the "KB" label
+    LOG(kLOG_INFO) << i << " L1 Cache: " << _l1_cache_sizes[i] / 1024 << "KB"
+                   << " L2 Cache: " << _l2_cache_sizes[i] / 1024 << "KB"
+                   << " L3 Cache: " << _l3_cache_sizes[i] / 1024 << "KB";
+  }
+  LOG(kLOG_INFO) << "Big cores: ";
+  for (size_t i = 0; i < _big_core_ids.size(); i++) {
+    LOG(kLOG_INFO) << _big_core_ids[i];
+  }
+  LOG(kLOG_INFO) << "Little cores: ";
+  for (size_t i = 0; i < _little_core_ids.size(); i++) {
+    LOG(kLOG_INFO) << _little_core_ids[i];
+  }
+#endif
+  // use single thread by default
+  set_thread_num(1, PERFORMANCE_PRIORITY);
+}
+// Selects the number of worker threads and, where possible, binds them to a
+// core cluster.
+//   thread_num: requested thread count. With OpenMP it is clamped to
+//               [1, _cpu_num]; without OpenMP it is always 1.
+//   power_mode: requested PowerMode. PERFORMANCE_* selects the big cores and
+//               EFFICIENCY_* the little cores; the *_ONLY modes additionally
+//               cap thread_num at the size of that cluster.
+// _power_mode ends up equal to power_mode only when the selected cluster is
+// non-empty and has at least thread_num online cores (non-Apple builds);
+// otherwise it is AUTO and no affinity is set.
+void CPUContext::set_thread_num(int thread_num, PowerMode power_mode) {
+  int big_core_num = _big_core_ids.size();
+  int little_core_num = _little_core_ids.size();
+#ifdef _OPENMP
+  if (thread_num > _cpu_num) {
+    thread_num = _cpu_num;
+  }
+  // omp_set_num_threads requires a positive value
+  if (thread_num < 1) {
+    thread_num = 1;
+  }
+#else
+  thread_num = 1;
+#endif
+  std::vector<int> bind_core_ids;
+  if (power_mode == PERFORMANCE_PRIORITY || power_mode == PERFORMANCE_ONLY) {
+    if (big_core_num > 0) {
+      bind_core_ids = _big_core_ids;
+      if (power_mode == PERFORMANCE_ONLY && thread_num > big_core_num) {
+        LOG(kLOG_ERROR) << "thread_num(" << thread_num
+                        << ") exceed the big cores num (" << big_core_num << ")"
+                        << ", force to set thread_num = " << big_core_num;
+        thread_num = big_core_num;
+      }
+    }
+  } else if (power_mode == EFFICIENCY_PRIORITY ||
+             power_mode == EFFICIENCY_ONLY) {
+    if (little_core_num > 0) {
+      bind_core_ids = _little_core_ids;
+      if (power_mode == EFFICIENCY_ONLY && thread_num > little_core_num) {
+        LOG(kLOG_ERROR) << "thread_num(" << thread_num
+                        << ") exceed the little cores num (" << little_core_num
+                        << ")"
+                        << ", force to set thread_num = " << little_core_num;
+        thread_num = little_core_num;
+      }
+    }
+  }
+  // AUTO unless the binding below succeeds in being requested
+  _power_mode = AUTO;
+#ifdef _OPENMP
+  omp_set_num_threads(thread_num);
+  thread_num = omp_get_max_threads();
+#endif
+#if !defined(__APPLE__)  // Linux or Android
+  // bind only when enough cores of the chosen cluster are online
+  if (bind_core_ids.size() > 0 && check_online(&bind_core_ids) >= thread_num) {
+    bind_threads(bind_core_ids);
+    _power_mode = power_mode;
+  }
+#endif
+  LOG(kLOG_INFO) << "thread num: " << thread_num
+                 << " power mode: " << _power_mode;
+}
+// Returns omp_get_max_threads() when built with OpenMP, otherwise 1.
+int CPUContext::get_thread_num() {
+#ifdef _OPENMP
+  return omp_get_max_threads();
+#else
+  return 1;
+#endif
+}
+// Returns the cache size recorded for the given cache `level` (1, 2 or 3).
+// The value is taken from the first big core in PERFORMANCE_* modes, from the
+// first little core in EFFICIENCY_* modes and from core 0 otherwise.
+// Returns 0 for any other level or when the table for that level is empty.
+// If the cluster for the current mode is empty, or its first id does not
+// index into the table, the entry of core 0 is used instead of reading out
+// of bounds.
+int CPUContext::get_cache_size(int level) {
+  const std::vector<int> *sizes = nullptr;
+  switch (level) {
+    case 1:
+      sizes = &_l1_cache_sizes;
+      break;
+    case 2:
+      sizes = &_l2_cache_sizes;
+      break;
+    case 3:
+      sizes = &_l3_cache_sizes;
+      break;
+    default:
+      return 0;
+  }
+  if (sizes->empty()) {
+    return 0;
+  }
+  int core_id = 0;  // AUTO
+  if (_power_mode == PERFORMANCE_PRIORITY || _power_mode == PERFORMANCE_ONLY) {
+    if (!_big_core_ids.empty()) {
+      core_id = _big_core_ids[0];
+    }
+  } else if (_power_mode == EFFICIENCY_PRIORITY ||
+             _power_mode == EFFICIENCY_ONLY) {
+    if (!_little_core_ids.empty()) {
+      core_id = _little_core_ids[0];
+    }
+  }
+  if (core_id < 0 || core_id >= static_cast<int>(sizes->size())) {
+    core_id = 0;
+  }
+  return (*sizes)[core_id];
+}
+// Returns the int8_t data buffer of the _workspace tensor, requested with a
+// one-dimensional shape of `size_in_byte` elements.
+void *CPUContext::get_work_space(int size_in_byte) {
+  auto buffer = _workspace.mutable_data<int8_t>(make_ddim({size_in_byte}));
+  return reinterpret_cast<void *>(buffer);
+}
+}  // namespace framework
+}  // namespace paddle_mobile
--- a/src/framework/context.h
+++ b/src/framework/context.h
@@ -18,63 +18,45 @@ limitations under the License. */
 #include <omp.h>
 #endif
-#define MOBILE_MAX_CPU_NUM 8
+#include <vector>
+#include "framework/tensor.h"
 namespace paddle_mobile {
 namespace framework {
 struct CPUContext {
 private:
-  CPUContext() : num_cpus(4), num_threads(1) {
+  CPUContext();
-    // TODO(hjchen2)
-    for (int i = 0; i < num_cpus; ++i) {
-      cpu_frequencies[i] = 2400;      // 2400 MHz
-      max_cpu_frequencies[i] = 2400;  // 2400 MHz
-    }
-    //    L1_cache = 32000;    // 32K
-    L1_cache = 32 * 1024;
-    L2_cache = 2000000;  // 2M
-                         //    L2_cache = 512000;
-  }
- public:
-  void set_num_threads(int threads) {
-#if _ONENMP
-    omp_set_num_threads(threads);
-    if (threads <= omp_get_max_threads()) {
-      num_threads = threads;
-    } else {
-      num_threads = omp_get_max_threads();
-    }
-#endif
-    num_threads = (num_threads > 1) ? num_threads : 1;
-  }
  virtual ~CPUContext() {}
 public:
  static CPUContext* Context() {
-    static CPUContext* ctx = new CPUContext;
+    // Returns the process-wide CPUContext singleton. The instance is created
+    // in the static initializer: since C++11 a function-local static is
+    // initialized exactly once even when several threads call Context()
+    // concurrently, which a separate "if (ctx == nullptr)" check does not
+    // guarantee.
+    static CPUContext* ctx = new CPUContext();
    return ctx;
  }
-  int num_cpus;
+  void set_thread_num(int thread_num,
-  int num_threads;
+                      PowerMode power_mode = PERFORMANCE_PRIORITY);
-  int cpu_frequencies[MOBILE_MAX_CPU_NUM];
+  int get_thread_num();
-  int max_cpu_frequencies[MOBILE_MAX_CPU_NUM];
+  PowerMode get_power_mode() const { return _power_mode; }
+  int get_cache_size(int level);
-  int L1_cache;
+  int get_l1_cache_size() { return get_cache_size(1); }
-  int L2_cache;
+  int get_l2_cache_size() { return get_cache_size(2); }
+  int get_l3_cache_size() { return get_cache_size(3); }
+  void* get_work_space(int size_in_byte);
+  int _cpu_num;
+  PowerMode _power_mode;
+  std::vector<int> _big_core_ids;
+  std::vector<int> _little_core_ids;
+  std::vector<int> _l1_cache_sizes;
+  std::vector<int> _l2_cache_sizes;
+  std::vector<int> _l3_cache_sizes;
+  Tensor _workspace;
 };
-inline void set_global_num_threads(int threads) {
-  // CPUContext::Context()->set_num_threads(threads);
-  CPUContext::Context()->num_threads = threads;
-}
-inline int get_global_num_threads() {
-  return CPUContext::Context()->num_threads;
-}
 }  // namespace framework
 }  // namespace paddle_mobile
--- a/src/framework/executor.cpp
+++ b/src/framework/executor.cpp
@@ -40,8 +40,8 @@ namespace framework {
 #pragma mark - executor
 template <typename Device, typename T>
-void Executor<Device, T>::SetThreadNum(int threads) {
+void Executor<Device, T>::SetThreadNum(int thread_num, PowerMode power_mode) {
-  set_global_num_threads(threads);
+  CPUContext::Context()->set_thread_num(thread_num, power_mode);
 }
 template <typename Device, typename T>
@@ -440,7 +440,7 @@ std::shared_ptr<LoDTensor> Executor<Device, T>::GetOutput(
 template <typename Device, typename T>
 PMStatus Executor<Device, T>::Predict() {
 #if _OPENMP
-  omp_set_num_threads(get_global_num_threads());
+  omp_set_num_threads(CPUContext::Context()->get_thread_num());
 #endif
  // clear all no persistable tensor array since write_to_array
  // is always push back a new tensor in the array

--- a/src/framework/executor.h
+++ b/src/framework/executor.h
@@ -17,6 +17,7 @@ limitations under the License. */
 #include <map>
 #include <memory>
 #include <string>
+#include <unordered_map>
 #include <utility>
 #include <vector>
 #include "common/types.h"
@@ -37,7 +38,8 @@ class Executor {
           paddle_mobile::PaddleMobileConfigInternal config, int batch_size = 1,
           const bool use_optimize = true, const bool lod_mode = false);
-  void SetThreadNum(int threads);
+  void SetThreadNum(int thread_num,
+                    PowerMode power_mode = PERFORMANCE_PRIORITY);
  PMStatus Predict(const std::vector<std::pair<std::string, Tensor>> &inputs);
  PMStatus Predict(

--- a/src/io/paddle_mobile.cpp
+++ b/src/io/paddle_mobile.cpp
@@ -29,8 +29,9 @@ limitations under the License. */
 namespace paddle_mobile {
 template <typename Device, typename T>
-void PaddleMobile<Device, T>::SetThreadNum(int num) {
+void PaddleMobile<Device, T>::SetThreadNum(int thread_num,
-  executor_->SetThreadNum(num);
+                                           PowerMode power_mode) {
+  executor_->SetThreadNum(thread_num, power_mode);
 }
 template <typename Device, typename T>

--- a/src/io/paddle_mobile.h
+++ b/src/io/paddle_mobile.h
@@ -83,7 +83,8 @@ class PaddleMobile {
                          bool quantification = false, int batch_size = 1,
                          bool lod_mode = false);
-  void SetThreadNum(int count);
+  void SetThreadNum(int thread_num,
+                    PowerMode power_mode = PERFORMANCE_PRIORITY);
  void Clear();
  double GetPredictTime();

--- a/src/operators/kernel/arm/convolution/conv_common.cpp
+++ b/src/operators/kernel/arm/convolution/conv_common.cpp
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include "operators/kernel/arm/convolution/conv_common.h"
+#include "operators/math/slidingwindow_utils.h"
 #include "operators/math/winograd/winograd_transform.h"
 namespace paddle_mobile {
@@ -56,38 +57,31 @@ void InitBaseConvKernel(ConvParam<CPU> *param) {
    } else if (conv3x3 && param->Groups() == 1 &&
               param->Strides()[0] == param->Strides()[1] &&
               param->Dilations()[0] == param->Dilations()[1] &&
-               param->Strides()[0] == 1 && param->Dilations()[0] == 1
+               param->Strides()[0] == 1 && param->Dilations()[0] == 1) {
-#if 1
-               && (param->Input()->dims()[1] >= 8 &&
-                   param->Output()->dims()[1] >= 8)
-#endif
-    ) {
-      param->ExecMode() = ConvParam<CPU>::EXEC_WINOGRAD3X3_FLOAT;
      // transform weight
      Variable *transformed_var = param->GetScope()->Var();
      param->transformed_filter_ =
          transformed_var->GetMutable<framework::LoDTensor>();
-      operators::math::winograd_transform_weight<8, 3>(
+      if (param->Input()->dims()[1] >= 32 && param->Output()->dims()[1] >= 32 &&
-          *param->Filter(), param->transformed_filter_);
+          param->Output()->dims()[2] > 16 && param->Output()->dims()[3] > 16) {
-    } else if (conv3x3 && param->Groups() == 1 &&
+        math::winograd_transform_weight<8, 3>(*param->Filter(),
-               param->Strides()[0] == param->Strides()[1] &&
+                                              param->transformed_filter_);
-               param->Dilations()[0] == param->Dilations()[1] &&
+        param->ExecMode() = ConvParam<CPU>::EXEC_WINOGRAD3X3_FLOAT;
-               param->Strides()[0] == 1 && param->Dilations()[0] == 1
+      } else {
-#if 1
+        math::slidingwindow_transform_weight<float>(*param->Filter(),
-               && (param->Input()->dims()[2] >= 48 &&
+                                                    param->transformed_filter_);
-                   param->Output()->dims()[1] <= 24)
-#endif
-    ) {
        param->ExecMode() = ConvParam<CPU>::EXEC_SLIDINGWINDOW3x3S1_FLOAT;
+      }
    } else if (conv3x3 && param->Groups() == 1 &&
               param->Strides()[0] == param->Strides()[1] &&
               param->Dilations()[0] == param->Dilations()[1] &&
-               param->Strides()[0] == 2 && param->Dilations()[0] == 1
+               param->Strides()[0] == 2 && param->Dilations()[0] == 1) {
-#if 1
+      // transform weight
-               && (param->Input()->dims()[2] >= 48 &&
+      Variable *transformed_var = param->GetScope()->Var();
-                   param->Output()->dims()[1] <= 24)
+      param->transformed_filter_ =
-#endif
+          transformed_var->GetMutable<framework::LoDTensor>();
-    ) {
+      math::slidingwindow_transform_weight<float>(*param->Filter(),
+                                                  param->transformed_filter_);
      param->ExecMode() = ConvParam<CPU>::EXEC_SLIDINGWINDOW3x3S2_FLOAT;
    } else {
      param->ExecMode() = ConvParam<CPU>::EXEC_GEMM_FLOAT;

--- a/src/operators/kernel/central-arm-func/conv_arm_func.cpp
+++ b/src/operators/kernel/central-arm-func/conv_arm_func.cpp
@@ -243,9 +243,15 @@ void SlidingwindowConv3x3(const ConvParam<CPU> &param) {
  output->mutable_data<Otype>();
  if (strides[0] == 1) {
-    math::SlidingwindowConv3x3s1<Itype, Otype>(input, filter, paddings, output);
+    // math::SlidingwindowConv3x3s1<Itype, Otype>(input, filter, paddings,
+    // output);
+    math::SlidingwindowConv3x3s1Faster<Itype, Otype>(
+        input, param.transformed_filter_, paddings, output);
  } else if (strides[0] == 2) {
-    math::SlidingwindowConv3x3s2<Itype, Otype>(input, filter, paddings, output);
+    // math::SlidingwindowConv3x3s2<Itype, Otype>(input, filter, paddings,
+    // output);
+    math::SlidingwindowConv3x3s2Faster<Itype, Otype>(
+        input, param.transformed_filter_, paddings, output);
  } else {
    GemmConv<Itype, Otype>(param);
  }

--- a/src/operators/math/gemm/executor.h
+++ b/src/operators/math/gemm/executor.h
@@ -29,8 +29,6 @@ namespace paddle_mobile {
 namespace operators {
 namespace math {
-static framework::CPUContext *g_cpu_ctx = framework::CPUContext::Context();
 int CeilDiv(const int &x, const int &y) { return (x + y - 1) / y; }
 unsigned int ResetL1Cache(const unsigned int L1_size, const int thread_num,
                          const int N, const int K) {
@@ -70,11 +68,15 @@ class GemmExecutor : public Executor {
    unsigned int L1_size = 0;
    unsigned int L2_size = 0;
    if (M_ > N_) {
-      L2_size = ResetL1Cache(g_cpu_ctx->L1_cache, num_threads_, M_, K_);
+      L2_size =
-      L1_size = g_cpu_ctx->L2_cache;
+          ResetL1Cache(framework::CPUContext::Context()->get_l1_cache_size(),
+                       num_threads_, M_, K_);
+      L1_size = framework::CPUContext::Context()->get_l2_cache_size();
    } else {
-      L1_size = ResetL1Cache(g_cpu_ctx->L1_cache, num_threads_, N_, K_);
+      L1_size =
-      L2_size = g_cpu_ctx->L2_cache;
+          ResetL1Cache(framework::CPUContext::Context()->get_l1_cache_size(),
+                       num_threads_, N_, K_);
+      L2_size = framework::CPUContext::Context()->get_l2_cache_size();
    }
    rhs_tile_num_ = L1_size / (K_ * sizeof(Itype));

--- a/src/operators/math/slidingwindow_conv3x3.cpp
+++ b/src/operators/math/slidingwindow_conv3x3.cpp
--- a/src/operators/math/slidingwindow_conv3x3.h
+++ b/src/operators/math/slidingwindow_conv3x3.h
@@ -33,6 +33,17 @@ void SlidingwindowConv3x3s2(const framework::Tensor *input,
                            const std::vector<int> &paddings,
                            framework::Tensor *output);
+// Faster variant of SlidingwindowConv3x3s1 (3x3 kernel, stride 1).
+// `filter` is taken by non-const pointer, unlike the const pointers/references
+// of the other parameters.
+// NOTE(review): the conv kernel passes param.transformed_filter_ here, so this
+// presumably expects the pre-transformed filter layout - confirm against the
+// definition in slidingwindow_conv3x3.cpp.
+template <typename Itype, typename Otype>
+void SlidingwindowConv3x3s1Faster(const framework::Tensor *input,
+                                  framework::Tensor *filter,
+                                  const std::vector<int> &paddings,
+                                  framework::Tensor *output);
+// Faster variant of SlidingwindowConv3x3s2 (3x3 kernel, stride 2); the conv
+// kernel selects it when strides[0] == 2.
+// NOTE(review): the conv kernel passes param.transformed_filter_ as `filter`,
+// so this presumably expects the pre-transformed filter layout - confirm
+// against the definition in slidingwindow_conv3x3.cpp.
+template <typename Itype, typename Otype>
+void SlidingwindowConv3x3s2Faster(const framework::Tensor *input,
+                                  framework::Tensor *filter,
+                                  const std::vector<int> &paddings,
+                                  framework::Tensor *output);
 }  // namespace math
 }  // namespace operators
 }  // namespace paddle_mobile
--- a/src/operators/math/slidingwindow_utils.cpp
+++ b/src/operators/math/slidingwindow_utils.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "operators/math/slidingwindow_utils.h"
+namespace paddle_mobile {
+namespace operators {
+namespace math {
+// Fills ch_num consecutive channels of dout, each ch_size floats long, with
+// that channel's bias value bias[ch].
+void slidingwindow_fill_bias(float* dout, const float* bias, int ch_num,
+                             int ch_size) {
+  float* channel_out = dout;
+  for (int ch = 0; ch < ch_num; ++ch) {
+    const float32x4_t splat = vdupq_n_f32(bias[ch]);
+    int pos = 0;
+    // four floats per store while a whole vector still fits
+    while (pos + 4 <= ch_size) {
+      vst1q_f32(channel_out + pos, splat);
+      pos += 4;
+    }
+    // scalar tail for the last ch_size % 4 floats
+    while (pos < ch_size) {
+      channel_out[pos] = bias[ch];
+      ++pos;
+    }
+    channel_out += ch_size;
+  }
+}
+/* Write one channel of a packed result tile into the output tensor.
+ * input din: [n, c, h, w], output dout: [n, c, h, w]
+ *
+ * din        packed tile, (we - ws) floats per row
+ * dout       output base; row i of the tile goes to channel cs, row hs + i,
+ *            starting at column ws
+ * cs         output channel index (ce is not used by this function)
+ * hs, he     row range of the tile; he is clamped to height
+ * ws, we     column range of the tile; columns at or beyond width are dropped
+ * flag_relu  clamp negative values to zero while copying
+ * trash_ptr  not used by this function
+ *
+ * NOTE(review): only cs > channel is rejected; cs == channel presumably never
+ * occurs - confirm with the callers.
+ * NOTE(review): every asm loop pre-loads the next 4 floats before testing the
+ * counter, so it reads one vector beyond the last one it stores; when a row
+ * has no scalar tail din presumably has that much slack - confirm.
+ */
+void slidingwindow_writeout_c1_fp32(const float* din, float* dout, int cs,
+                                    int ce, int hs, int he, int ws, int we,
+                                    int channel, int height, int width,
+                                    bool flag_relu, float* trash_ptr) {
+  if (cs > channel) {
+    return;
+  }
+  const int c1 = 1;
+  const int w4 = 4;
+  int size_c_out = width * height;
+  float* doutc0r0 = dout + cs * size_c_out + hs * width + ws;
+  const float* ptr_din = din;
+  int size_h = (he > height ? height : he) - hs;  // size_h == hei_n
+  int w_round = we - ws;
+  // number of whole 4-float vectors that fit in the visible part of a row
+  int cnt = (width - ws) / w4;
+  for (int i = 0; i < size_h; i++) {
+    int size_w = i * width;
+    float* doutc0_ptr = doutc0r0 + size_w;  // doutc0r0 + width;
+    const float* din_hei_ptr = ptr_din + i * w_round * c1;
+    if (cnt > 0) {
+      int cnt_loop = cnt;
+      // The asm blocks post-increment doutc0_ptr, so after them it points at
+      // the first column the scalar tail below has to write.
+      // "cc" and "memory" are clobbered: subs sets the flags and the loops
+      // load/store through pointers that are not memory operands.
+      if (flag_relu) {
+#ifdef __aarch64__
+        asm volatile(
+            "ldr q0, [%[ptr_din]], #16      \n" /* load c0r0..c0r3 */
+            "movi v20.4s, #0                \n" /* for relu */
+            "1:                             \n" /* main loop */
+            "fmax   v1.4s, v0.4s, v20.4s    \n" /* relu */
+            "ldr q0, [%[ptr_din]], #16      \n" /* load c0r0..c0r3 */
+            "subs   %w[cnt], %w[cnt], #1    \n" /* loop count -1 */
+            "str    q1, [%[doutc0r0]], #16  \n" /* store c0r0 */
+            "bne    1b                      \n" /* jump to main loop */
+            : [doutc0r0] "+r"(doutc0_ptr), [cnt] "+r"(cnt_loop),
+              [ptr_din] "+r"(din_hei_ptr)
+            :
+            : "cc", "memory", "v0", "v1", "v20");
+#else
+        asm volatile(
+            "vld1.32 {d0-d1}, [%[ptr_din]]!         @ load data, c0r0, c1r0, "
+            "c0r1, c1r1, , c0r2, c1r2, c0r3, c1r3\n"
+            "vmov.u32 q15, #0                       @ dump zero\n"
+            "1:                                     @ main loop\n"
+            "vmax.f32   q1, q0, q15                 @ relu\n"
+            "vld1.32 {d0-d1}, [%[ptr_din]]!         @ load data \n"
+            "vst1.32  {d2-d3}, [%[doutc0r0]]!       @ store result, add "
+            "pointer\n"
+            "subs   %[cnt], %[cnt], #1              @ loop count - 1\n"
+            "bne    1b                              @ jump to main loop\n"
+            : [doutc0r0] "+r"(doutc0_ptr), [ptr_din] "+r"(din_hei_ptr),
+              [cnt] "+r"(cnt_loop)
+            :
+            : "cc", "memory", "q0", "q1", "q15");
+#endif
+      } else {
+#ifdef __aarch64__
+        asm volatile(
+            "ldr q0, [%[ptr_din]], #16      \n" /* load c0r0..c0r3 */
+            "1:                             \n" /* main loop */
+            "str    q0, [%[doutc0r0]], #16  \n" /* store c0r0 */
+            "subs   %w[cnt], %w[cnt], #1    \n" /* loop count -1 */
+            "ldr q0, [%[ptr_din]], #16      \n" /* load c0r0..c0r3 */
+            "bne    1b                      \n" /* jump to main loop */
+            : [doutc0r0] "+r"(doutc0_ptr), [cnt] "+r"(cnt_loop),
+              [ptr_din] "+r"(din_hei_ptr)
+            :
+            : "cc", "memory", "v0");
+#else
+        asm volatile(
+            "vld1.32 {d0-d1}, [%[ptr_din]]!         @ load data, c0r0, c0r1, "
+            "c0r2, c0r3\n"
+            "1:                                     @ main loop\n"
+            "vst1.32  {d0-d1}, [%[doutc0r0]]!       @ store result, add "
+            "pointer\n"
+            "subs   %[cnt], %[cnt], #1              @ loop count - 1\n"
+            "vld1.32 {d0-d1}, [%[ptr_din]]!         @ load data \n"
+            "bne    1b                              @ jump to main loop\n"
+            : [doutc0r0] "+r"(doutc0_ptr), [ptr_din] "+r"(din_hei_ptr),
+              [cnt] "+r"(cnt_loop)
+            :
+            : "cc", "memory", "q0");
+#endif
+      }
+    }
+    // Scalar tail: the tile is wider than the output row, so the columns of
+    // the last, partial vector that still lie inside the row are copied one
+    // by one.
+    if (we > width) {
+      // the asm pre-load moved din_hei_ptr too far; recompute it
+      int offset = i * w_round * c1 + c1 * w4 * cnt;
+      din_hei_ptr = ptr_din + offset;
+      int j = we - w4;
+      if (flag_relu) {
+        for (; j < width; ++j) {
+          *(doutc0_ptr++) = std::max(din_hei_ptr[0], 0.f);
+          din_hei_ptr++;
+        }
+      } else {
+        for (; j < width; ++j) {
+          *(doutc0_ptr++) = *(din_hei_ptr++);
+        }
+      }
+    }
+  }
+}
+/* Un-interleave a 4-channel packed result tile into the output tensor.
+ * input din: [n, c / 4, h, w * 4], output dout: [n, c, h, w]
+ *
+ * din        packed tile; every column holds 4 consecutive floats, one per
+ *            channel cs .. cs + 3
+ * dout       output base; row i of the tile goes to rows hs + i of channels
+ *            cs .. cs + 3, starting at column ws
+ * cs, ce     channel range of the tile; when ce > channel the last
+ *            (ce - channel) channels are written to trash_ptr instead
+ * hs, he     row range of the tile; he is clamped to height
+ * ws, we     column range of the tile; columns at or beyond width are dropped
+ * flag_relu  clamp negative values to zero while copying
+ * trash_ptr  scratch destination for channels that do not exist in dout
+ *
+ * NOTE(review): the row stride uses ch_n = ce - cs while the tail offset uses
+ * the constant 4; the two agree only when ce - cs == 4 - confirm callers
+ * always pass 4-channel tiles.
+ * NOTE(review): trash_ptr is re-used from its start for every row, so it
+ * presumably needs room for one output row - confirm with the callers.
+ * NOTE(review): every asm loop pre-loads the next 16 floats before testing the
+ * counter, so it reads one 4x4 block beyond the last one it stores.
+ */
+void slidingwindow_writeout_c4_fp32(const float* din, float* dout, int cs,
+                                    int ce, int hs, int he, int ws, int we,
+                                    int channel, int height, int width,
+                                    bool flag_relu, float* trash_ptr) {
+  const int c4 = 4;
+  const int w4 = 4;
+  const int w_round = we - ws;
+  const int ch_n = ce - cs;
+  int size_c_out = width * height;
+  float* doutc0r0 = dout + cs * size_c_out + hs * width + ws;
+  float* doutc1r0 = doutc0r0 + size_c_out;
+  float* doutc2r0 = doutc1r0 + size_c_out;
+  float* doutc3r0 = doutc2r0 + size_c_out;
+  const float* ptr_din = din;
+  int size_h = (he > height ? height : he) - hs;  // size_h == hei_n
+  // number of whole 4-column blocks that fit in the visible part of a row
+  int cnt = (width - ws) / w4;
+  for (int i = 0; i < size_h; i++) {
+    int size_w = i * width;
+    float* doutc0_ptr = doutc0r0 + size_w;  // doutc0r0 + width;
+    float* doutc1_ptr = doutc1r0 + size_w;
+    float* doutc2_ptr = doutc2r0 + size_w;
+    float* doutc3_ptr = doutc3r0 + size_w;
+    if (ce > channel) {
+      // Redirect the channels that lie beyond `channel`; the cases fall
+      // through on purpose so that 3 missing channels redirect c1, c2 and c3.
+      switch (ce - channel) {
+        case 3:
+          doutc1_ptr = trash_ptr;
+          // fall through
+        case 2:
+          doutc2_ptr = trash_ptr;
+          // fall through
+        case 1:
+          doutc3_ptr = trash_ptr;
+          // fall through
+        default:
+          break;
+      }
+    }
+    const float* din_hei_ptr = ptr_din + i * w_round * ch_n;
+    if (cnt > 0) {
+      int cnt_loop = cnt;
+      // Each iteration transposes a 4 (columns) x 4 (channels) block. On
+      // aarch64 the two-stage trn1/trn2 leaves channel 2 in v17 and channel 1
+      // in v18, hence the store order below.
+      // "cc" and "memory" are clobbered: subs sets the flags and the loops
+      // load/store through pointers that are not memory operands.
+      if (flag_relu) {
+#ifdef __aarch64__
+        asm volatile(
+            "ldp q0, q1, [%[ptr_din]], #32  \n" /* load r00, r01 to q0, q1 */
+            "ldp q2, q3, [%[ptr_din]], #32  \n" /* load r02, r03 to q2, q3 */
+            "movi v20.4s, #0                \n" /* for relu */
+            "1:                             \n" /* main loop */
+            "trn1   v8.4s, v0.4s, v1.4s     \n" /* trans q0, q1 */
+            "trn2   v9.4s, v0.4s, v1.4s     \n" /* trans q0, q1 */
+            "ldp q0, q1, [%[ptr_din]], #32  \n" /* load r00, r01 to q0, q1 */
+            "trn1   v10.4s, v2.4s, v3.4s    \n" /* trans q2, q3 */
+            "trn2   v11.4s, v2.4s, v3.4s    \n" /* trans q2, q3 */
+            "ldp q2, q3, [%[ptr_din]], #32  \n" /* load r02, r03 to q2, q3 */
+            "trn1   v16.2d, v8.2d, v10.2d   \n" /* trans q8, q10 */
+            "trn2   v17.2d, v8.2d, v10.2d   \n" /* trans q8, q10 */
+            "trn1   v18.2d, v9.2d, v11.2d   \n" /* trans q9, q11 */
+            "trn2   v19.2d, v9.2d, v11.2d   \n" /* trans q9, q11 */
+            "fmax   v16.4s, v16.4s, v20.4s  \n" /* relu */
+            "fmax   v17.4s, v17.4s, v20.4s  \n" /* relu */
+            "fmax   v18.4s, v18.4s, v20.4s  \n" /* relu */
+            "fmax   v19.4s, v19.4s, v20.4s  \n" /* relu */
+            "str    q16, [%[doutc0r0]], #16 \n" /* store c0r0 */
+            "str    q17, [%[doutc2r0]], #16 \n" /* store c2r0 */
+            "str    q18, [%[doutc1r0]], #16 \n" /* store c1r0 */
+            "str    q19, [%[doutc3r0]], #16 \n" /* store c3r0 */
+            "subs   %w[cnt], %w[cnt], #1    \n" /* loop count -1 */
+            "bne    1b                      \n" /* jump to main loop */
+            : [doutc0r0] "+r"(doutc0_ptr), [doutc1r0] "+r"(doutc1_ptr),
+              [doutc2r0] "+r"(doutc2_ptr), [doutc3r0] "+r"(doutc3_ptr),
+              [cnt] "+r"(cnt_loop), [ptr_din] "+r"(din_hei_ptr)
+            :
+            : "cc", "memory", "v0", "v1", "v2", "v3", "v8", "v9", "v10", "v11",
+              "v16", "v17", "v18", "v19", "v20");
+#else
+        asm volatile(
+            "vld1.32 {d0-d3}, [%[ptr_din]]!       @ load data \n"
+            "vld1.32 {d4-d7}, [%[ptr_din]]!       @ load data \n"
+            "vmov.u32 q15, #0                     @ dump zero \n"
+            "1:                                   @ main loop \n"
+            "vtrn.32 q0, q1                       @ trans data:c00c01c20c21 "
+            "\n"
+            "vtrn.32 q2, q3                       @ trans data:c02c03c22c23 "
+            "\n"
+            "vswp   d1, d4                        @ swap data\n"
+            "vswp   d3, d6                        @ swap data\n"
+            "vmax.f32   q0, q0, q15               @ relu\n"
+            "vmax.f32   q1, q1, q15               @ relu\n"
+            "vmax.f32   q2, q2, q15               @ relu\n"
+            "vmax.f32   q3, q3, q15               @ relu\n"
+            "vst1.32  {d0-d1}, [%[doutc0r0]]!     @ store result, add pointer\n"
+            "vst1.32  {d2-d3}, [%[doutc1r0]]!     @ store result, add pointer\n"
+            "vst1.32  {d4-d5}, [%[doutc2r0]]!     @ store result, add pointer\n"
+            "vst1.32  {d6-d7}, [%[doutc3r0]]!     @ store result, add pointer\n"
+            "subs   %[cnt], %[cnt], #1            @ loop count - 1\n"
+            "vld1.32 {d0-d3}, [%[ptr_din]]!       @ load data \n"
+            "vld1.32 {d4-d7}, [%[ptr_din]]!       @ load data \n"
+            "bne    1b                            @ jump to main loop\n"
+            : [doutc0r0] "+r"(doutc0_ptr), [doutc1r0] "+r"(doutc1_ptr),
+              [doutc2r0] "+r"(doutc2_ptr), [doutc3r0] "+r"(doutc3_ptr),
+              [ptr_din] "+r"(din_hei_ptr), [cnt] "+r"(cnt_loop)
+            :
+            : "cc", "memory", "q0", "q1", "q2", "q3", "q15");
+#endif
+      } else {
+#ifdef __aarch64__
+        asm volatile(
+            "ldp q0, q1, [%[ptr_din]], #32  \n" /* load r00, r01 to q0, q1 */
+            "ldp q2, q3, [%[ptr_din]], #32  \n" /* load r02, r03 to q2, q3 */
+            "1:                             \n" /* main loop */
+            "trn1   v8.4s, v0.4s, v1.4s     \n" /* trans q0, q1 */
+            "trn2   v9.4s, v0.4s, v1.4s     \n" /* trans q0, q1 */
+            "ldp q0, q1, [%[ptr_din]], #32  \n" /* load r00, r01 to q0, q1 */
+            "trn1   v10.4s, v2.4s, v3.4s    \n" /* trans q2, q3 */
+            "trn2   v11.4s, v2.4s, v3.4s    \n" /* trans q2, q3 */
+            "ldp q2, q3, [%[ptr_din]], #32  \n" /* load r02, r03 to q2, q3 */
+            "trn1   v16.2d, v8.2d, v10.2d   \n" /* trans q8, q10 */
+            "trn2   v17.2d, v8.2d, v10.2d   \n" /* trans q8, q10 */
+            "trn1   v18.2d, v9.2d, v11.2d   \n" /* trans q9, q11 */
+            "trn2   v19.2d, v9.2d, v11.2d   \n" /* trans q9, q11 */
+            "str    q16, [%[doutc0r0]], #16 \n" /* store c0r0 */
+            "str    q17, [%[doutc2r0]], #16 \n" /* store c2r0 */
+            "str    q18, [%[doutc1r0]], #16 \n" /* store c1r0 */
+            "str    q19, [%[doutc3r0]], #16 \n" /* store c3r0 */
+            "subs   %w[cnt], %w[cnt], #1    \n" /* loop count -1 */
+            "bne    1b                      \n" /* jump to main loop */
+            : [doutc0r0] "+r"(doutc0_ptr), [doutc1r0] "+r"(doutc1_ptr),
+              [doutc2r0] "+r"(doutc2_ptr), [doutc3r0] "+r"(doutc3_ptr),
+              [cnt] "+r"(cnt_loop), [ptr_din] "+r"(din_hei_ptr)
+            :
+            : "cc", "memory", "v0", "v1", "v2", "v3", "v8", "v9", "v10", "v11",
+              "v16", "v17", "v18", "v19");
+#else
+        asm volatile(
+            "vld1.32 {d0-d3}, [%[ptr_din]]!       @ load data \n"
+            "vld1.32 {d4-d7}, [%[ptr_din]]!       @ load data \n"
+            "1:                                   @ main loop \n"
+            "vtrn.32 q0, q1                       @ trans data:c00c01c20c21 "
+            "\n"
+            "vtrn.32 q2, q3                       @ trans data:c02c03c22c23 "
+            "\n"
+            "vswp   d1, d4                        @ swap data\n"
+            "vswp   d3, d6                        @ swap data\n"
+            "vst1.32  {d0-d1}, [%[doutc0r0]]!     @ store result, add pointer\n"
+            "vst1.32  {d2-d3}, [%[doutc1r0]]!     @ store result, add pointer\n"
+            "vst1.32  {d4-d5}, [%[doutc2r0]]!     @ store result, add pointer\n"
+            "vst1.32  {d6-d7}, [%[doutc3r0]]!     @ store result, add pointer\n"
+            "subs   %[cnt], %[cnt], #1            @ loop count - 1\n"
+            "vld1.32 {d0-d3}, [%[ptr_din]]!       @ load data \n"
+            "vld1.32 {d4-d7}, [%[ptr_din]]!       @ load data \n"
+            "bne    1b                            @ jump to main loop\n"
+            : [doutc0r0] "+r"(doutc0_ptr), [doutc1r0] "+r"(doutc1_ptr),
+              [doutc2r0] "+r"(doutc2_ptr), [doutc3r0] "+r"(doutc3_ptr),
+              [ptr_din] "+r"(din_hei_ptr), [cnt] "+r"(cnt_loop)
+            :
+            : "cc", "memory", "q0", "q1", "q2", "q3");
+#endif
+      }
+    }
+    // Scalar tail: the tile is wider than the output row, so the columns of
+    // the last, partial block that still lie inside the row are copied one
+    // column (4 interleaved channel values) at a time.
+    if (we > width) {
+      // the asm pre-load moved din_hei_ptr too far; recompute it
+      int offset = i * w_round * c4 + c4 * w4 * cnt;
+      din_hei_ptr = ptr_din + offset;
+      int j = we - w4;
+      if (flag_relu) {
+        for (; j < width; ++j) {
+          *(doutc0_ptr++) = std::max(din_hei_ptr[0], 0.f);
+          *(doutc1_ptr++) = std::max(din_hei_ptr[1], 0.f);
+          *(doutc2_ptr++) = std::max(din_hei_ptr[2], 0.f);
+          *(doutc3_ptr++) = std::max(din_hei_ptr[3], 0.f);
+          din_hei_ptr += w4;
+        }
+      } else {
+        for (; j < width; ++j) {
+          *(doutc0_ptr++) = din_hei_ptr[0];
+          *(doutc1_ptr++) = din_hei_ptr[1];
+          *(doutc2_ptr++) = din_hei_ptr[2];
+          *(doutc3_ptr++) = din_hei_ptr[3];
+          din_hei_ptr += w4;
+        }
+      }
+    }
+  }
+}
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle_mobile
--- a/src/operators/math/slidingwindow_utils.h
+++ b/src/operators/math/slidingwindow_utils.h
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include <algorithm>
+#include <cstring>
+#include "framework/tensor.h"
+#if __ARM_NEON
+#include <arm_neon.h>
+#endif
+namespace paddle_mobile {
+namespace operators {
+namespace math {
+/* Repack convolution weights so that n = 4 output channels are interleaved.
+ * input weights: [chout, chin / group, kh, kw] --> output weights:
+ * [chout / n, chin / group, kh, kw, n]
+ * When chout is not a multiple of n, the missing lanes of the last group are
+ * filled with a copy of that group's first channel. The output tensor is
+ * always allocated as {round_up(chout, n), chin, 3, 3}.
+ */
+template <typename dtype>
+void slidingwindow_transform_weight(const framework::Tensor& weight,
+                                    framework::Tensor* output) {
+  const int n = 4;
+  int chout = weight.dims()[0];
+  int chin = weight.dims()[1];
+  int kernel_size = weight.dims()[2] * weight.dims()[3];
+  const int group_count = (chout + n - 1) / n;
+  int cround = group_count * n;
+  const dtype* src = weight.data<dtype>();
+  dtype* dst = output->mutable_data<dtype>({cround, chin, 3, 3});
+  // number of weights belonging to one output channel
+  const int per_channel = chin * kernel_size;
+  for (int g = 0; g < group_count; ++g) {
+    const int first_ch = g * n;
+    // one read cursor per lane; lanes beyond chout re-read the first channel
+    // of this group
+    const dtype* lanes[n];
+    for (int lane = 0; lane < n; ++lane) {
+      const int ch = first_ch + lane;
+      lanes[lane] = src + (ch < chout ? ch : first_ch) * per_channel;
+    }
+    dtype* packed = dst + first_ch * per_channel;
+    for (int e = 0; e < per_channel; ++e) {
+      for (int lane = 0; lane < n; ++lane) {
+        *(packed++) = lanes[lane][e];
+      }
+    }
+  }
+}
+/* Pack a zero-padded input window row by row.
+ * input din: [1, chin, he-hs, we - ws] --> outputs dout: [n, chin, 1, we - ws]
+ * n = he - hs
+ *
+ * din       first channel of the source image, laid out [channel][height][width]
+ * dout      destination; output row j holds, for every channel in turn, the
+ *           (we - ws) values of source row hs + j
+ * cs, ce    not used by this function
+ * hs, he    row range of the window; rows outside [0, height) are read from
+ *           zero_ptr
+ * ws, we    column range of the window; columns outside [0, width) are
+ *           written as 0
+ * zero_ptr  source for out-of-range rows
+ *
+ * NOTE(review): zero_ptr presumably points at a zero-filled buffer of at
+ * least min(we, width) - max(ws, 0) elements - confirm with the callers.
+ * NOTE(review): the window is assumed to overlap [0, width); a window lying
+ * entirely outside it gives a negative copy length.
+ */
+template <typename dtype>
+void slidingwindow_prepack_input(const dtype* din, dtype* dout, int cs, int ce,
+                                 int hs, int he, int ws, int we, int channel,
+                                 int width, int height, dtype* zero_ptr) {
+  const int w0 = ws < 0 ? 0 : ws;
+  const int w1 = we > width ? width : we;
+  const int size_w = we - ws;
+  const int size_wc_len = size_w * channel;
+  const int size_c = width * height;
+  const int valid_w = w1 - w0;
+  const size_t valid_w_byte = valid_w * sizeof(dtype);
+  for (int c = 0; c < channel; ++c) {
+    // valid height
+    for (int i = hs; i < he; i++) {
+      // Every channel contributes exactly size_w values to each output row,
+      // so the write position can be computed directly instead of keeping a
+      // runtime-sized array of cursors.
+      dtype* out_row = dout + (i - hs) * size_wc_len + c * size_w;
+      // Start reading at column w0 so that a window with ws > 0 copies
+      // columns [w0, w1) rather than [0, valid_w).
+      const dtype* in_row =
+          (i < 0 || i >= height) ? zero_ptr : din + i * width + w0;
+      for (int w = ws; w < w0; ++w) {
+        *(out_row++) = 0.f;
+      }
+      std::memcpy(out_row, in_row, valid_w_byte);
+      out_row += valid_w;
+      for (int w = w1; w < we; ++w) {
+        *(out_row++) = 0.f;
+      }
+    }
+    din += size_c;
+  }
+}
+// Tiles the four floats at `bias` across dout, one 128-bit store per group of
+// four floats; the trailing size % 4 floats of dout are left untouched.
+inline void slidingwindow_fill_bias(float* dout, const float* bias, int size) {
+  const float32x4_t bias_quad = vld1q_f32(bias);
+  float* cursor = dout;
+  for (int groups_left = size / 4; groups_left > 0; --groups_left) {
+    vst1q_f32(cursor, bias_quad);
+    cursor += 4;
+  }
+}
+// Fills ch_num consecutive channels of dout, each ch_size floats long, with
+// the per-channel value bias[j]. Defined in slidingwindow_utils.cpp.
+void slidingwindow_fill_bias(float* dout, const float* bias, int ch_num,
+                             int ch_size);
+// Copies a single-channel packed tile (din, we - ws floats per row) into
+// channel cs of dout, rows hs .. min(he, height) - 1, starting at column ws;
+// applies ReLU when flag_relu is set. ce and trash_ptr are not used by the
+// definition in slidingwindow_utils.cpp.
+void slidingwindow_writeout_c1_fp32(const float* din, float* dout, int cs,
+                                    int ce, int hs, int he, int ws, int we,
+                                    int channel, int height, int width,
+                                    bool flag_relu, float* trash_ptr);
+// Un-interleaves a 4-channel packed tile (din, 4 floats per column) into
+// channels cs .. cs + 3 of dout; applies ReLU when flag_relu is set. When
+// ce > channel the last (ce - channel) channels are written to trash_ptr
+// instead of dout.
+void slidingwindow_writeout_c4_fp32(const float* din, float* dout, int cs,
+                                    int ce, int hs, int he, int ws, int we,
+                                    int channel, int height, int width,
+                                    bool flag_relu, float* trash_ptr);
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle_mobile