diff --git a/mace/core/runtime/cpu/cpu_runtime.cc b/mace/core/runtime/cpu/cpu_runtime.cc
index 789f2a5c1e2ccf0f87f8fbf03c71a22d2dec76cf..ce50595412c7b24a148c02b7b261d20f344a9c72 100644
--- a/mace/core/runtime/cpu/cpu_runtime.cc
+++ b/mace/core/runtime/cpu/cpu_runtime.cc
@@ -36,45 +36,98 @@ namespace mace {
 
 int MaceOpenMPThreadCount = 1;
 
-namespace {
+struct CPUFreq {
+  size_t core_id;
+  float freq;
+};
 
+namespace {
+#if defined(__ANDROID__)
 int GetCPUCount() {
-  char path[64];
   int cpu_count = 0;
-  int result = 0;
-
-  while (true) {
-    snprintf(path, sizeof(path), "/sys/devices/system/cpu/cpu%d", cpu_count);
-    result = access(path, F_OK);
-    if (result != 0) {
-      if (errno != ENOENT) {
-        LOG(ERROR) << "Access " << path << " failed: " << strerror(errno);
-      }
-      return cpu_count;
+  std::string cpu_sys_conf = "/proc/cpuinfo";
+  std::ifstream f(cpu_sys_conf);
+  if (!f.is_open()) {
+    LOG(ERROR) << "failed to open " << cpu_sys_conf;
+    return -1;
+  }
+  std::string line;
+  const std::string processor_key = "processor";
+  while (std::getline(f, line)) {
+    if (line.size() >= processor_key.size()
+        && line.compare(0, processor_key.size(), processor_key) == 0) {
+      ++cpu_count;
     }
-    cpu_count++;
   }
+  if (f.bad()) {
+    LOG(ERROR) << "failed to read " << cpu_sys_conf;
+  }
+  if (!f.eof()) {
+    LOG(ERROR) << "failed to read end of " << cpu_sys_conf;
+  }
+  f.close();
+  VLOG(2) << "CPU cores: " << cpu_count;
+  return cpu_count;
 }
+#endif
 
-int GetCPUMaxFreq(int cpu_id) {
-  char path[64];
-  snprintf(path, sizeof(path),
-           "/sys/devices/system/cpu/cpu%d/cpufreq/cpuinfo_max_freq",
-           cpu_id);
-
-  FILE *fp = fopen(path, "rb");
-  if (!fp) {
-    LOG(WARNING) << "File: " << path << " not exists.";
-    return 0;
+int GetCPUMaxFreq(std::vector<float> *max_freqs) {
+#if defined(__ANDROID__)
+  int cpu_count = GetCPUCount();
+  for (int cpu_id = 0; cpu_id < cpu_count; ++cpu_id) {
+    std::string cpuinfo_max_freq_sys_conf = MakeString(
+        "/sys/devices/system/cpu/cpu",
+        cpu_id,
+        "/cpufreq/cpuinfo_max_freq");
+    std::ifstream f(cpuinfo_max_freq_sys_conf);
+    if (!f.is_open()) {
+      LOG(ERROR) << "failed to open " << cpuinfo_max_freq_sys_conf;
+      return -1;
+    }
+    std::string line;
+    if (std::getline(f, line)) {
+      float freq = atof(line.c_str());
+      max_freqs->push_back(freq);
+    }
+    if (f.bad()) {
+      LOG(ERROR) << "failed to read " << cpuinfo_max_freq_sys_conf;
+    }
+    f.close();
+  }
+#else
+  std::string cpu_sys_conf = "/proc/cpuinfo";
+  std::ifstream f(cpu_sys_conf);
+  if (!f.is_open()) {
+    LOG(ERROR) << "failed to open " << cpu_sys_conf;
+    return -1;
   }
+  std::string line;
+  const std::string freq_key = "cpu MHz";
+  while (std::getline(f, line)) {
+    if (line.size() >= freq_key.size()
+        && line.compare(0, freq_key.size(), freq_key) == 0) {
+      size_t pos = line.find(":");
+      if (pos != std::string::npos) {
+        std::string freq_str = line.substr(pos + 1);
+        float freq = atof(freq_str.c_str());
+        max_freqs->push_back(freq);
+      }
+    }
+  }
+  if (f.bad()) {
+    LOG(ERROR) << "failed to read " << cpu_sys_conf;
+  }
+  if (!f.eof()) {
+    LOG(ERROR) << "failed to read end of " << cpu_sys_conf;
+  }
+  f.close();
+#endif
 
-  int freq = 0;
-  int items_read = fscanf(fp, "%d", &freq);
-  if (items_read != 1) {
-    LOG(WARNING) << "Read file: " << path << " failed.";
+  for (float freq : *max_freqs) {
+    VLOG(2) << "CPU freq: " << freq;
   }
-  fclose(fp);
-  return freq;
+
+  return 0;
 }
 
 MaceStatus SetThreadAffinity(cpu_set_t mask) {
@@ -93,51 +146,14 @@ MaceStatus SetThreadAffinity(cpu_set_t mask) {
   }
 }
 
-MaceStatus GetCPUBigLittleCoreIDs(std::vector<int> *big_core_ids,
-                                  std::vector<int> *little_core_ids) {
-  MACE_CHECK_NOTNULL(big_core_ids);
-  MACE_CHECK_NOTNULL(little_core_ids);
-  int cpu_count = GetCPUCount();
-  std::vector<int> cpu_max_freq(cpu_count);
-
-  // set cpu max frequency
-  for (int i = 0; i < cpu_count; ++i) {
-    cpu_max_freq[i] = GetCPUMaxFreq(i);
-    if (cpu_max_freq[i] == 0) {
-      LOG(WARNING) << "Cannot get CPU" << i
-                   << "'s max frequency info, maybe it is offline.";
-      return MaceStatus(MaceStatus::MACE_INVALID_ARGS,
-                        "Cannot get CPU's max frequency info,"
-                        " maybe it is offline.");
-    }
-  }
-
-  int big_core_freq =
-      *(std::max_element(cpu_max_freq.begin(), cpu_max_freq.end()));
-  int little_core_freq =
-      *(std::min_element(cpu_max_freq.begin(), cpu_max_freq.end()));
-
-  big_core_ids->reserve(cpu_count);
-  little_core_ids->reserve(cpu_count);
-  for (int i = 0; i < cpu_count; ++i) {
-    if (cpu_max_freq[i] == little_core_freq) {
-      little_core_ids->push_back(i);
-    }
-    if (cpu_max_freq[i] == big_core_freq) {
-      big_core_ids->push_back(i);
-    }
-  }
-
-  return MaceStatus::MACE_SUCCESS;
-}
-
 MaceStatus SetOpenMPThreadsAndAffinityCPUs(int omp_num_threads,
-                                           const std::vector<int> &cpu_ids) {
+                                           const std::vector<size_t> &cpu_ids) {
   MaceOpenMPThreadCount = omp_num_threads;
 
 #ifdef MACE_ENABLE_OPENMP
   VLOG(1) << "Set OpenMP threads number: " << omp_num_threads
           << ", CPU core IDs: " << MakeString(cpu_ids);
+  omp_set_schedule(omp_sched_guided, 1);
   omp_set_num_threads(omp_num_threads);
 #else
   MACE_UNUSED(omp_num_threads);
@@ -174,55 +190,90 @@ MaceStatus SetOpenMPThreadsAndAffinityCPUs(int omp_num_threads,
 }  // namespace
 
 MaceStatus CPURuntime::SetOpenMPThreadsAndAffinityPolicy(
-    int omp_num_threads_hint,
+    int num_threads_hint,
     CPUAffinityPolicy policy,
     void *gemm_context) {
+  // get cpu frequency info
+  std::vector<float> cpu_max_freqs;
+  if (GetCPUMaxFreq(&cpu_max_freqs) == -1 || cpu_max_freqs.size() == 0) {
+    return MaceStatus::MACE_INVALID_ARGS;
+  }
+
+  std::vector<CPUFreq> cpu_freq(cpu_max_freqs.size());
+  for (size_t i = 0; i < cpu_max_freqs.size(); ++i) {
+    cpu_freq[i].core_id = i;
+    cpu_freq[i].freq = cpu_max_freqs[i];
+  }
+  if (policy == CPUAffinityPolicy::AFFINITY_POWER_SAVE ||
+      policy == CPUAffinityPolicy::AFFINITY_LITTLE_ONLY) {
+    std::sort(cpu_freq.begin(),
+              cpu_freq.end(),
+              [=](const CPUFreq &lhs, const CPUFreq &rhs) {
+                return lhs.freq < rhs.freq;
+              });
+  } else if (policy == CPUAffinityPolicy::AFFINITY_HIGH_PERFORMANCE ||
+             policy == CPUAffinityPolicy::AFFINITY_BIG_ONLY) {
+    std::sort(cpu_freq.begin(),
+              cpu_freq.end(),
+              [](const CPUFreq &lhs, const CPUFreq &rhs) {
+                return lhs.freq > rhs.freq;
+              });
+  }
+
+  int cpu_count = static_cast<int>(cpu_freq.size());
+  if (num_threads_hint <= 0 || num_threads_hint > cpu_count) {
+    num_threads_hint = cpu_count;
+  }
+
   if (policy == CPUAffinityPolicy::AFFINITY_NONE) {
 #ifdef MACE_ENABLE_QUANTIZE
     if (gemm_context) {
       static_cast<gemmlowp::GemmContext*>(gemm_context)->set_max_num_threads(
-          std::max(0, omp_num_threads_hint));
+          num_threads_hint);
     }
 #else
     MACE_UNUSED(gemm_context);
 #endif  // MACE_ENABLE_QUANTIZE
 
 #ifdef MACE_ENABLE_OPENMP
-    if (omp_num_threads_hint > 0) {
-      omp_set_num_threads(std::min(omp_num_threads_hint, omp_get_num_procs()));
-    }
+    omp_set_num_threads(num_threads_hint);
 #else
     LOG(WARNING) << "Set OpenMP threads number failed: OpenMP not enabled.";
 #endif
     return MaceStatus::MACE_SUCCESS;
   }
 
-  std::vector<int> big_core_ids;
-  std::vector<int> little_core_ids;
-  MaceStatus res = GetCPUBigLittleCoreIDs(&big_core_ids, &little_core_ids);
-  if (res != MaceStatus::MACE_SUCCESS) {
-    return res;
-  }
-
-  std::vector<int> use_cpu_ids;
-  if (policy == CPUAffinityPolicy::AFFINITY_BIG_ONLY) {
-    use_cpu_ids = std::move(big_core_ids);
+  // decide num of cores to use
+  int cores_to_use = 0;
+  if (policy == CPUAffinityPolicy::AFFINITY_BIG_ONLY
+      || policy == CPUAffinityPolicy::AFFINITY_LITTLE_ONLY) {
+    for (size_t i = 0; i < cpu_max_freqs.size(); ++i) {
+      if (cpu_freq[i].freq != cpu_freq[0].freq) {
+        break;
+      }
+      ++cores_to_use;
+    }
+    num_threads_hint = cores_to_use;
   } else {
-    use_cpu_ids = std::move(little_core_ids);
+    cores_to_use = num_threads_hint;
   }
 
-  if (omp_num_threads_hint <= 0 ||
-      omp_num_threads_hint > static_cast<int>(use_cpu_ids.size())) {
-    omp_num_threads_hint = use_cpu_ids.size();
+  VLOG(2) << "Use " << num_threads_hint << " threads";
+  std::vector<size_t> cpu_ids(cores_to_use);
+  for (int i = 0; i < cores_to_use; ++i) {
+    VLOG(2) << "Bind thread to core: " << cpu_freq[i].core_id << " with freq "
+            << cpu_freq[i].freq;
+    cpu_ids[i] = cpu_freq[i].core_id;
   }
 
 #ifdef MACE_ENABLE_QUANTIZE
   if (gemm_context) {
     static_cast<gemmlowp::GemmContext*>(gemm_context)->set_max_num_threads(
-        omp_num_threads_hint);
+        num_threads_hint);
   }
 #endif  // MACE_ENABLE_QUANTIZE
 
-  return SetOpenMPThreadsAndAffinityCPUs(omp_num_threads_hint, use_cpu_ids);
+  return SetOpenMPThreadsAndAffinityCPUs(num_threads_hint, cpu_ids);
 }
 
 }  // namespace mace
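Note on the cpu_runtime.cc rewrite above: GetCPUBigLittleCoreIDs is gone. The runtime now gathers one max frequency per core (cpuinfo_max_freq under sysfs on Android, the "cpu MHz" fields of /proc/cpuinfo elsewhere), sorts the cores by that frequency, and for the BIG_ONLY/LITTLE_ONLY policies keeps the leading run of cores that share the first (highest or lowest) frequency. A minimal standalone sketch of that selection step, using an invented eight-core big.LITTLE frequency table instead of real sysfs data:

#include <algorithm>
#include <cstdio>
#include <vector>

struct CPUFreq {
  size_t core_id;
  float freq;
};

int main() {
  // Hypothetical big.LITTLE layout: cores 0-3 at 1.8 GHz, 4-7 at 2.4 GHz
  // (cpuinfo_max_freq reports kHz, hence the e6 values).
  std::vector<CPUFreq> cpu_freq;
  for (size_t i = 0; i < 8; ++i) {
    cpu_freq.push_back({i, i < 4 ? 1.8e6f : 2.4e6f});
  }
  // AFFINITY_BIG_ONLY: sort descending by max frequency...
  std::sort(cpu_freq.begin(), cpu_freq.end(),
            [](const CPUFreq &lhs, const CPUFreq &rhs) {
              return lhs.freq > rhs.freq;
            });
  // ...then take the leading run of cores that share the top frequency.
  int cores_to_use = 0;
  for (size_t i = 0; i < cpu_freq.size(); ++i) {
    if (cpu_freq[i].freq != cpu_freq[0].freq) break;
    ++cores_to_use;
  }
  std::printf("using %d big cores:", cores_to_use);
  for (int i = 0; i < cores_to_use; ++i) {
    std::printf(" %zu", cpu_freq[i].core_id);
  }
  std::printf("\n");
  return 0;
}

One implication worth noting: on a device whose clusters happen to report the same top frequency, the equal-frequency run spans all cores, so BIG_ONLY degenerates to using every core.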
diff --git a/mace/ops/activation.h b/mace/ops/activation.h
index 2c9a18618da776e5d004e7c01012117b4a94afb0..36fb45d6bdeef39eb9214d398a5cd33fea7c4a07 100644
--- a/mace/ops/activation.h
+++ b/mace/ops/activation.h
@@ -66,26 +66,26 @@ void DoActivation(const T *input_ptr,
     case NOOP:
       break;
     case RELU:
-#pragma omp parallel for
+#pragma omp parallel for schedule(runtime)
       for (index_t i = 0; i < size; ++i) {
         output_ptr[i] = std::max(input_ptr[i], static_cast<T>(0));
       }
       break;
     case RELUX:
-#pragma omp parallel for
+#pragma omp parallel for schedule(runtime)
       for (index_t i = 0; i < size; ++i) {
         output_ptr[i] = std::min(std::max(input_ptr[i], static_cast<T>(0)),
                                  static_cast<T>(relux_max_limit));
       }
       break;
     case TANH:
-#pragma omp parallel for
+#pragma omp parallel for schedule(runtime)
       for (index_t i = 0; i < size; ++i) {
         output_ptr[i] = std::tanh(input_ptr[i]);
       }
       break;
     case SIGMOID:
-#pragma omp parallel for
+#pragma omp parallel for schedule(runtime)
       for (index_t i = 0; i < size; ++i) {
         output_ptr[i] = 1 / (1 + std::exp(-input_ptr[i]));
       }
@@ -111,13 +111,13 @@ inline void DoActivation(const float *input_ptr,
       ReluxNeon(input_ptr, relux_max_limit, size, output_ptr);
       break;
     case TANH:
-#pragma omp parallel for
+#pragma omp parallel for schedule(runtime)
       for (index_t i = 0; i < size; ++i) {
         output_ptr[i] = std::tanh(input_ptr[i]);
       }
       break;
     case SIGMOID:
-#pragma omp parallel for
+#pragma omp parallel for schedule(runtime)
       for (index_t i = 0; i < size; ++i) {
         output_ptr[i] = 1 / (1 + std::exp(-input_ptr[i]));
       }
@@ -134,7 +134,7 @@ void PReLUActivation(const T *input_ptr,
                      const index_t inner_size,
                      const T *alpha_ptr,
                      T *output_ptr) {
-#pragma omp parallel for collapse(3)
+#pragma omp parallel for collapse(3) schedule(runtime)
   for (index_t i = 0; i < outer_size; ++i) {
     for (index_t chan_idx = 0; chan_idx < input_chan; ++chan_idx) {
       for (index_t j = 0; j < inner_size; ++j) {
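Note: every #pragma omp parallel for in the operator hunks below gains schedule(runtime), which defers the scheduling policy to whatever omp_set_schedule() installed; combined with the omp_set_schedule(omp_sched_guided, 1) call added above, all of these loops switch from the implementation default (usually static) to guided self-scheduling. A toy, self-contained illustration of the mechanism (not MACE code; compile with -fopenmp):

#include <omp.h>
#include <cstdio>
#include <vector>

int main() {
  // The one-time call made in SetOpenMPThreadsAndAffinityCPUs().
  omp_set_schedule(omp_sched_guided, 1);

  std::vector<float> data(1024, 1.0f);
  // schedule(runtime) resolves to "guided, chunk 1" at this point.
#pragma omp parallel for schedule(runtime)
  for (int i = 0; i < 1024; ++i) {
    data[i] *= 2.0f;
  }

  omp_sched_t kind;
  int chunk;
  omp_get_schedule(&kind, &chunk);
  std::printf("kind=%d chunk=%d data[0]=%g\n",
              static_cast<int>(kind), chunk, data[0]);
  return 0;
}

Guided scheduling hands out progressively smaller chunks, which tolerates the uneven per-iteration cost and asymmetric cores of big.LITTLE parts better than a fixed static split.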
diff --git a/mace/ops/argmax.cc b/mace/ops/argmax.cc
index 8f8419b7c839dfb5fcae4500b6b109fdc30d1b9a..2b3e2f0be6aa223ef4eb8d0c47aa5733bd13cac6 100644
--- a/mace/ops/argmax.cc
+++ b/mace/ops/argmax.cc
@@ -59,7 +59,7 @@ class ArgMaxOp : public Operation {
     index_t outer_size = output->size();
     index_t inner_size = input->dim(axis_value);
 
-#pragma omp parallel for
+#pragma omp parallel for schedule(runtime)
     for (index_t i = 0; i < outer_size; ++i) {
       int idx = 0;
       T max_value = std::numeric_limits<T>::lowest();
diff --git a/mace/ops/arm/activation_neon.cc b/mace/ops/arm/activation_neon.cc
index 44b492a42d7351867391410eb31fd9aaab5ffe35..ec9ba357425ac9c6603b08bac604b6d7f79c57f4 100644
--- a/mace/ops/arm/activation_neon.cc
+++ b/mace/ops/arm/activation_neon.cc
@@ -25,7 +25,7 @@ namespace ops {
 void ReluNeon(const float *input, const index_t size, float *output) {
 #if defined(MACE_ENABLE_NEON)
   float32x4_t vzero = vdupq_n_f32(0.f);
-#pragma omp parallel for
+#pragma omp parallel for schedule(runtime)
   for (index_t i = 0; i <= size - 4; i += 4) {
     float32x4_t v = vld1q_f32(input + i);
     v = vmaxq_f32(v, vzero);
@@ -36,7 +36,7 @@ void ReluNeon(const float *input, const index_t size, float *output) {
     output[i] = std::max(input[i], 0.f);
   }
 #else
-#pragma omp parallel for
+#pragma omp parallel for schedule(runtime)
   for (index_t i = 0; i < size; ++i) {
     output[i] = std::max(input[i], 0.f);
   }
@@ -48,7 +48,7 @@ void ReluxNeon(const float *input, const float limit,
 #if defined(MACE_ENABLE_NEON)
   float32x4_t vzero = vdupq_n_f32(0.f);
   float32x4_t vlimit = vdupq_n_f32(limit);
-#pragma omp parallel for
+#pragma omp parallel for schedule(runtime)
   for (index_t i = 0; i <= size - 4; i += 4) {
     float32x4_t v = vld1q_f32(input + i);
     v = vmaxq_f32(v, vzero);
@@ -60,7 +60,7 @@ void ReluxNeon(const float *input, const float limit,
     output[i] = std::min(std::max(input[i], 0.f), limit);
   }
 #else
-#pragma omp parallel for
+#pragma omp parallel for schedule(runtime)
   for (index_t i = 0; i < size; ++i) {
     output[i] = std::min(std::max(input[i], 0.f), limit);
   }
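Note: the NEON kernels keep their two-phase shape, a vector loop striding four floats at a time followed by a scalar tail for the remainder, and only the vector loop carries the OpenMP pragma. A self-contained sketch of the ReluNeon pattern above, assuming an ARM toolchain with NEON and OpenMP enabled (index_t here is a stand-in for MACE's signed index type):

#include <arm_neon.h>
#include <algorithm>
#include <cstdint>
#include <cstdio>

typedef int64_t index_t;  // signed, so size - 4 is safe when size < 4

void ReluSketch(const float *input, const index_t size, float *output) {
  const float32x4_t vzero = vdupq_n_f32(0.f);
#pragma omp parallel for schedule(runtime)
  for (index_t i = 0; i <= size - 4; i += 4) {  // vector body, 4 lanes
    float32x4_t v = vld1q_f32(input + i);
    v = vmaxq_f32(v, vzero);
    vst1q_f32(output + i, v);
  }
  for (index_t i = size - (size % 4); i < size; ++i) {  // scalar tail
    output[i] = std::max(input[i], 0.f);
  }
}

int main() {
  float in[6] = {-1.f, 2.f, -3.f, 4.f, -5.f, 6.f};
  float out[6] = {0.f};
  ReluSketch(in, 6, out);
  for (float v : out) std::printf("%g ", v);  // 0 2 0 4 0 6
  std::printf("\n");
  return 0;
}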
diff --git a/mace/ops/arm/conv_2d_neon_15x1.cc b/mace/ops/arm/conv_2d_neon_15x1.cc
index a4bae4e9835f571066b0c53ed9a9ddda647f4c4d..553de92e6cb28ba492919dcc5bb6e93c7ba2f6bf 100644
--- a/mace/ops/arm/conv_2d_neon_15x1.cc
+++ b/mace/ops/arm/conv_2d_neon_15x1.cc
@@ -60,7 +60,7 @@ void Conv2dNeonK15x1S1(const float *input,
   const index_t tile_width =
       out_shape[1] < 4 ? RoundUpDiv4(out_shape[3]) : out_shape[3];
 
-#pragma omp parallel for collapse(3)
+#pragma omp parallel for collapse(3) schedule(runtime)
   for (index_t b = 0; b < out_shape[0]; ++b) {
     for (index_t m = 0; m < out_shape[1]; ++m) {
       for (index_t w = 0; w < out_shape[3]; w += tile_width) {
diff --git a/mace/ops/arm/conv_2d_neon_1x15.cc b/mace/ops/arm/conv_2d_neon_1x15.cc
index 06c40e2902cf7f05bdbacd2b32c241957f36a8c7..07deca05abc32a98a058194718a46493e4327f42 100644
--- a/mace/ops/arm/conv_2d_neon_1x15.cc
+++ b/mace/ops/arm/conv_2d_neon_1x15.cc
@@ -61,7 +61,7 @@ void Conv2dNeonK1x15S1(const float *input,
   const index_t tile_height = out_shape[1] < 4 ?
RoundUpDiv4(out_shape[2]) : out_shape[2]; -#pragma omp parallel for collapse(3) +#pragma omp parallel for collapse(3) schedule(runtime) for (index_t b = 0; b < out_shape[0]; ++b) { for (index_t m = 0; m < out_shape[1]; ++m) { for (index_t h = 0; h < out_shape[2]; h += tile_height) { diff --git a/mace/ops/arm/conv_2d_neon_1x7.cc b/mace/ops/arm/conv_2d_neon_1x7.cc index 39321e0fbee05388a876871cf64040da3dc938d4..09061e0550da4742ce04d2d3e35c41f73115f32d 100644 --- a/mace/ops/arm/conv_2d_neon_1x7.cc +++ b/mace/ops/arm/conv_2d_neon_1x7.cc @@ -32,7 +32,7 @@ void Conv2dNeonK1x7S1(const float *input, const index_t in_batch_size = in_shape[1] * in_image_size; const index_t out_batch_size = out_shape[1] * out_image_size; -#pragma omp parallel for collapse(2) +#pragma omp parallel for collapse(2) schedule(runtime) for (index_t b = 0; b < out_shape[0]; ++b) { for (index_t m = 0; m < out_shape[1]; m += 4) { const index_t out_channels = out_shape[1]; diff --git a/mace/ops/arm/conv_2d_neon_3x3.cc b/mace/ops/arm/conv_2d_neon_3x3.cc index 33653a424c926af4a89469396abb4fee20c9091f..6213a208b0b663d95fc33d3d069898830544db63 100644 --- a/mace/ops/arm/conv_2d_neon_3x3.cc +++ b/mace/ops/arm/conv_2d_neon_3x3.cc @@ -33,7 +33,7 @@ void Conv2dNeonK3x3S1(const float *input, const index_t in_batch_size = in_shape[1] * in_image_size; const index_t out_batch_size = out_shape[1] * out_image_size; -#pragma omp parallel for collapse(2) +#pragma omp parallel for collapse(2) schedule(runtime) for (index_t b = 0; b < out_shape[0]; ++b) { for (index_t m = 0; m < out_shape[1]; m += 2) { const index_t out_channels = out_shape[1]; @@ -515,7 +515,7 @@ void Conv2dNeonK3x3S2(const float *input, const index_t in_batch_size = in_shape[1] * in_image_size; const index_t out_batch_size = out_shape[1] * out_image_size; -#pragma omp parallel for collapse(2) +#pragma omp parallel for collapse(2) schedule(runtime) for (index_t b = 0; b < out_shape[0]; ++b) { for (index_t m = 0; m < out_shape[1]; ++m) { for (index_t c = 0; c < in_shape[1]; ++c) { diff --git a/mace/ops/arm/conv_2d_neon_5x5.cc b/mace/ops/arm/conv_2d_neon_5x5.cc index 7803a89ef9d6ffeb2090d3334703f0672bf3b71f..87b997c60fef51763be46403ccb1993ad3dee57a 100644 --- a/mace/ops/arm/conv_2d_neon_5x5.cc +++ b/mace/ops/arm/conv_2d_neon_5x5.cc @@ -87,7 +87,7 @@ void Conv2dNeonK5x5S1(const float *input, const index_t in_batch_size = in_shape[1] * in_image_size; const index_t out_batch_size = out_shape[1] * out_image_size; -#pragma omp parallel for collapse(2) +#pragma omp parallel for collapse(2) schedule(runtime) for (index_t b = 0; b < out_shape[0]; ++b) { for (index_t m = 0; m < out_shape[1]; m += 4) { const index_t out_channels = out_shape[1]; diff --git a/mace/ops/arm/conv_2d_neon_7x1.cc b/mace/ops/arm/conv_2d_neon_7x1.cc index 37d9ec9deadd20c2f9aef9b45c95bb012c9e19b5..78025de68f5cfd81685f621487b7b25aa77efb08 100644 --- a/mace/ops/arm/conv_2d_neon_7x1.cc +++ b/mace/ops/arm/conv_2d_neon_7x1.cc @@ -32,7 +32,7 @@ void Conv2dNeonK7x1S1(const float *input, const index_t in_batch_size = in_shape[1] * in_image_size; const index_t out_batch_size = out_shape[1] * out_image_size; -#pragma omp parallel for collapse(2) +#pragma omp parallel for collapse(2) schedule(runtime) for (index_t b = 0; b < out_shape[0]; ++b) { for (index_t m = 0; m < out_shape[1]; m += 4) { const index_t out_channels = out_shape[1]; diff --git a/mace/ops/arm/conv_2d_neon_7x7.cc b/mace/ops/arm/conv_2d_neon_7x7.cc index 4e1c0041d178df71787a5f75511dcb4d218a66fc..04c8323f3fddc1edf419cbc0ddd9a713fa647f7d 100644 --- 
a/mace/ops/arm/conv_2d_neon_7x7.cc +++ b/mace/ops/arm/conv_2d_neon_7x7.cc @@ -164,7 +164,7 @@ void Conv2dNeonK7x7S1(const float *input, const index_t in_batch_size = in_shape[1] * in_image_size; const index_t out_batch_size = out_shape[1] * out_image_size; -#pragma omp parallel for collapse(2) +#pragma omp parallel for collapse(2) schedule(runtime) for (index_t b = 0; b < out_shape[0]; ++b) { for (index_t m = 0; m < out_shape[1]; m += 4) { const index_t out_channels = out_shape[1]; @@ -319,7 +319,7 @@ void Conv2dNeonK7x7S2(const float *input, const index_t in_batch_size = in_shape[1] * in_image_size; const index_t out_batch_size = out_shape[1] * out_image_size; -#pragma omp parallel for collapse(2) +#pragma omp parallel for collapse(2) schedule(runtime) for (index_t b = 0; b < out_shape[0]; ++b) { for (index_t m = 0; m < out_shape[1]; m += 4) { const index_t out_channels = out_shape[1]; @@ -484,7 +484,7 @@ void Conv2dNeonK7x7S3(const float *input, const index_t in_batch_size = in_shape[1] * in_image_size; const index_t out_batch_size = out_shape[1] * out_image_size; -#pragma omp parallel for collapse(2) +#pragma omp parallel for collapse(2) schedule(runtime) for (index_t b = 0; b < out_shape[0]; ++b) { for (index_t m = 0; m < out_shape[1]; m += 4) { const index_t out_channels = out_shape[1]; diff --git a/mace/ops/arm/conv_winograd.cc b/mace/ops/arm/conv_winograd.cc index 2f6207fd9a194eb216bc64b7ef267892252ecea5..748cc694ed7d7f9c7d7f2d1dd145c12b1f87bb5e 100644 --- a/mace/ops/arm/conv_winograd.cc +++ b/mace/ops/arm/conv_winograd.cc @@ -34,7 +34,7 @@ void TransformInput4x4(const float *input, const index_t input_batch_size = in_height_width * in_channels; const index_t output_batch_size = 16 * in_channels * tile_count; -#pragma omp parallel for collapse(2) +#pragma omp parallel for collapse(2) schedule(runtime) for (index_t n = 0; n < batch; ++n) { for (index_t c = 0; c < in_channels; ++c) { index_t tile_index = 0; @@ -155,7 +155,7 @@ void TransformInput8x8(const float *input, const index_t input_batch_size = in_height_width * in_channels; const index_t output_batch_size = 64 * in_channels * tile_count; -#pragma omp parallel for collapse(2) +#pragma omp parallel for collapse(2) schedule(runtime) for (index_t n = 0; n < batch; ++n) { for (index_t c = 0; c < in_channels; ++c) { index_t tile_index = 0; @@ -292,7 +292,7 @@ void TransformOutput4x4(const float *input, const index_t out_image_size = out_height * out_width; const index_t output_batch_size = out_channels * out_image_size; -#pragma omp parallel for collapse(2) +#pragma omp parallel for collapse(2) schedule(runtime) for (index_t n = 0; n < batch; ++n) { for (index_t m = 0; m < out_channels; ++m) { index_t tile_offset = 0; @@ -388,7 +388,7 @@ void TransformOutput8x8(const float *input, const index_t out_image_size = out_height * out_width; const index_t output_batch_size = out_channels * out_image_size; -#pragma omp parallel for collapse(2) +#pragma omp parallel for collapse(2) schedule(runtime) for (index_t n = 0; n < batch; ++n) { for (index_t m = 0; m < out_channels; ++m) { index_t tile_offset = 0; @@ -471,7 +471,7 @@ void TransformFilter4x4(const float *filter, float *output) { const index_t stride = out_channels * in_channels; -#pragma omp parallel for collapse(2) +#pragma omp parallel for collapse(2) schedule(runtime) for (index_t m = 0; m < out_channels; ++m) { for (index_t c = 0; c < in_channels; ++c) { float g0, g1, g2, g3, g4, g5, g6, g7, g8; @@ -573,7 +573,7 @@ void TransformFilter8x8(const float *filter, {1.0f / 45, -1.0f / 
90, 1.0f / 180}, {0.0f, 0.0f, 1.0f}}; -#pragma omp parallel for collapse(2) +#pragma omp parallel for collapse(2) schedule(runtime) for (index_t m = 0; m < out_channels; ++m) { for (index_t c = 0; c < in_channels; ++c) { // load filter @@ -720,7 +720,7 @@ void ConvRef3x3s1(const float *input, index_t out_height = in_height - 2; index_t out_width = in_width - 2; -#pragma omp parallel for collapse(4) +#pragma omp parallel for collapse(4) schedule(runtime) for (index_t b = 0; b < batch; ++b) { for (index_t m = 0; m < out_channels; ++m) { for (index_t h = 0; h < out_height; ++h) { diff --git a/mace/ops/arm/deconv_2d_neon_2x2.cc b/mace/ops/arm/deconv_2d_neon_2x2.cc index 001ab01be369f4b3f880c457073be754b7ef1eb9..39f1f3304192348dba0c39fc7f5f586a413f3232 100644 --- a/mace/ops/arm/deconv_2d_neon_2x2.cc +++ b/mace/ops/arm/deconv_2d_neon_2x2.cc @@ -33,7 +33,7 @@ void Deconv2dNeonK2x2S1(const float *input, const index_t out_img_size = outh * outw; -#pragma omp parallel for collapse(2) +#pragma omp parallel for collapse(2) schedule(runtime) for (index_t b = 0; b < out_shape[0]; ++b) { for (index_t oc = 0; oc < outch; oc += 2) { if (oc + 1 < outch) { @@ -199,7 +199,7 @@ void Deconv2dNeonK2x2S2(const float *input, const index_t outw = out_shape[3]; const index_t out_img_size = outh * outw; -#pragma omp parallel for collapse(2) +#pragma omp parallel for collapse(2) schedule(runtime) for (index_t b = 0; b < out_shape[0]; ++b) { for (index_t oc = 0; oc < outch; ++oc) { float *out_base = output + (b * outch + oc) * out_img_size; diff --git a/mace/ops/arm/deconv_2d_neon_3x3.cc b/mace/ops/arm/deconv_2d_neon_3x3.cc index 6df0c7badfee33aadbc385068bd1f781a63ab2b3..da4d1d885b6572e47bac978cf0c0f150373d7d4c 100644 --- a/mace/ops/arm/deconv_2d_neon_3x3.cc +++ b/mace/ops/arm/deconv_2d_neon_3x3.cc @@ -33,7 +33,7 @@ void Deconv2dNeonK3x3S1(const float *input, const index_t out_img_size = outh * outw; -#pragma omp parallel for collapse(2) +#pragma omp parallel for collapse(2) schedule(runtime) for (index_t b = 0; b < out_shape[0]; ++b) { for (index_t oc = 0; oc < outch; oc += 2) { if (oc + 1 < outch) { @@ -293,7 +293,7 @@ void Deconv2dNeonK3x3S2(const float *input, const index_t outw = out_shape[3]; const index_t out_img_size = outh * outw; -#pragma omp parallel for collapse(2) +#pragma omp parallel for collapse(2) schedule(runtime) for (index_t b = 0; b < out_shape[0]; ++b) { for (index_t oc = 0; oc < outch; ++oc) { float *out_base = output + (b * outch + oc) * out_img_size; diff --git a/mace/ops/arm/deconv_2d_neon_4x4.cc b/mace/ops/arm/deconv_2d_neon_4x4.cc index dd85896095d0922e02f3079809edd8972380f223..39389e229f1a9c72be1fbbc0766ad2908f139e3c 100644 --- a/mace/ops/arm/deconv_2d_neon_4x4.cc +++ b/mace/ops/arm/deconv_2d_neon_4x4.cc @@ -31,7 +31,7 @@ void Deconv2dNeonK4x4S1(const float *input, const index_t outw = out_shape[3]; const index_t outch = out_shape[1]; const index_t out_img_size = outh * outw; -#pragma omp parallel for collapse(2) +#pragma omp parallel for collapse(2) schedule(runtime) for (index_t b = 0; b < out_shape[0]; ++b) { for (index_t oc = 0; oc < outch; oc += 2) { if (oc + 1 < outch) { @@ -386,7 +386,7 @@ void Deconv2dNeonK4x4S2(const float *input, const index_t outch = out_shape[1]; const index_t out_img_size = outh * outw; -#pragma omp parallel for collapse(2) +#pragma omp parallel for collapse(2) schedule(runtime) for (index_t b = 0; b < out_shape[0]; ++b) { for (index_t p = 0; p < outch; p++) { float *out_base = output + (b * outch + p) * out_img_size; diff --git 
a/mace/ops/arm/depthwise_conv2d_neon_3x3.cc b/mace/ops/arm/depthwise_conv2d_neon_3x3.cc index 2e997912f7e096e42278cc025657803353fec84a..3166c9238d47ceba50559efc7ebd3f1cf3ddfcf4 100644 --- a/mace/ops/arm/depthwise_conv2d_neon_3x3.cc +++ b/mace/ops/arm/depthwise_conv2d_neon_3x3.cc @@ -70,7 +70,7 @@ void DepthwiseConv2dNeonK3x3S1(const float *input, const index_t in_batch_size = in_shape[1] * in_image_size; const index_t out_batch_size = out_shape[1] * out_image_size; -#pragma omp parallel for collapse(2) +#pragma omp parallel for collapse(2) schedule(runtime) for (index_t b = 0; b < in_shape[0]; ++b) { for (index_t m = 0; m < out_shape[1]; ++m) { index_t c = m / multiplier; @@ -250,7 +250,7 @@ void DepthwiseConv2dNeonK3x3S2(const float *input, const index_t in_batch_size = in_shape[1] * in_image_size; const index_t out_batch_size = out_shape[1] * out_image_size; -#pragma omp parallel for collapse(2) +#pragma omp parallel for collapse(2) schedule(runtime) for (index_t b = 0; b < in_shape[0]; ++b) { for (index_t m = 0; m < out_shape[1]; ++m) { index_t c = m / multiplier; diff --git a/mace/ops/arm/depthwise_deconv2d_neon_3x3.cc b/mace/ops/arm/depthwise_deconv2d_neon_3x3.cc index 4296fb407ad24bd1e5cda017b36847616061627e..1f138ca68af19648ebaf0d634aeff6a17e3d5d0b 100644 --- a/mace/ops/arm/depthwise_deconv2d_neon_3x3.cc +++ b/mace/ops/arm/depthwise_deconv2d_neon_3x3.cc @@ -32,7 +32,7 @@ void DepthwiseDeconv2dNeonK3x3S1(const float *input, const index_t outw = out_shape[3]; const index_t out_img_size = outh * outw; -#pragma omp parallel for collapse(2) +#pragma omp parallel for collapse(2) schedule(runtime) for (index_t b = 0; b < out_shape[0]; ++b) { for (index_t c = 0; c < channels; ++c) { const index_t offset = b * channels + c; @@ -137,7 +137,7 @@ void DepthwiseDeconv2dNeonK3x3S2(const float *input, const index_t outw = out_shape[3]; const index_t out_img_size = outh * outw; -#pragma omp parallel for collapse(2) +#pragma omp parallel for collapse(2) schedule(runtime) for (index_t b = 0; b < out_shape[0]; ++b) { for (index_t c = 0; c < channels; ++c) { const index_t offset = b * channels + c; @@ -251,7 +251,7 @@ void GroupDeconv2dNeonK3x3S1(const float *input, const index_t inch_g = inch / group; const index_t outch_g = outch / group; -#pragma omp parallel for collapse(3) +#pragma omp parallel for collapse(3) schedule(runtime) for (index_t b = 0; b < out_shape[0]; ++b) { for (int g = 0; g < group; ++g) { for (index_t oc = 0; oc < outch_g; oc += 2) { @@ -525,7 +525,7 @@ void GroupDeconv2dNeonK3x3S2(const float *input, const index_t inch_g = inch / group; const index_t outch_g = outch / group; -#pragma omp parallel for collapse(3) +#pragma omp parallel for collapse(3) schedule(runtime) for (index_t b = 0; b < out_shape[0]; ++b) { for (int g = 0; g < group; ++g) { for (index_t oc = 0; oc < outch_g; ++oc) { diff --git a/mace/ops/arm/depthwise_deconv2d_neon_4x4.cc b/mace/ops/arm/depthwise_deconv2d_neon_4x4.cc index 744e70243652c11036f8e992877e6ee3627f35f7..b859bf436aebe07a9b03a8a1adbf6dbf263f2570 100644 --- a/mace/ops/arm/depthwise_deconv2d_neon_4x4.cc +++ b/mace/ops/arm/depthwise_deconv2d_neon_4x4.cc @@ -33,7 +33,7 @@ void DepthwiseDeconv2dNeonK4x4S1(const float *input, const index_t outw = out_shape[3]; const index_t out_img_size = outh * outw; -#pragma omp parallel for collapse(2) +#pragma omp parallel for collapse(2) schedule(runtime) for (index_t b = 0; b < batch; ++b) { for (index_t c = 0; c < channels; ++c) { const index_t offset = b * channels + c; @@ -169,7 +169,7 @@ void 
DepthwiseDeconv2dNeonK4x4S2(const float *input, const index_t outw = out_shape[3]; const index_t out_img_size = outh * outw; -#pragma omp parallel for collapse(2) +#pragma omp parallel for collapse(2) schedule(runtime) for (index_t b = 0; b < out_shape[0]; ++b) { for (index_t c = 0; c < channels; ++c) { const index_t offset = b * channels + c; @@ -304,7 +304,7 @@ void GroupDeconv2dNeonK4x4S1(const float *input, const index_t inch_g = inch / group; const index_t outch_g = outch / group; -#pragma omp parallel for collapse(3) +#pragma omp parallel for collapse(3) schedule(runtime) for (index_t b = 0; b < out_shape[0]; ++b) { for (int g = 0; g < group; ++g) { for (index_t oc = 0; oc < outch_g; oc += 2) { @@ -679,7 +679,7 @@ void GroupDeconv2dNeonK4x4S2(const float *input, const index_t inch_g = inch / group; const index_t outch_g = outch / group; -#pragma omp parallel for collapse(3) +#pragma omp parallel for collapse(3) schedule(runtime) for (index_t b = 0; b < out_shape[0]; ++b) { for (int g = 0; g < group; ++g) { for (index_t oc = 0; oc < outch_g; oc++) { diff --git a/mace/ops/batch_to_space.cc b/mace/ops/batch_to_space.cc index 529a900b955dac46cfcb1033fa94f238fb8ffaaf..5cc6a1e025c54b755a61d3e0c5331d0f38aa5450 100644 --- a/mace/ops/batch_to_space.cc +++ b/mace/ops/batch_to_space.cc @@ -124,7 +124,7 @@ class BatchToSpaceNDOp : public BatchToSpaceOpBase { std::max(static_cast<index_t>(1), 8 * 1024 / block_shape_w / out_width); // make channel outter loop so we can make best use of cache -#pragma omp parallel for collapse(3) +#pragma omp parallel for collapse(3) schedule(runtime) for (index_t c = 0; c < channels; ++c) { for (index_t block_h = 0; block_h < in_height; block_h += block_h_size) { @@ -213,7 +213,7 @@ class BatchToSpaceNDOp : public BatchToSpaceOpBase { index_t out_width = space_tensor->dim(2); index_t channels = space_tensor->dim(3); -#pragma omp parallel for +#pragma omp parallel for schedule(runtime) for (index_t in_b = 0; in_b < in_batches; ++in_b) { const index_t b = in_b % out_batches; const index_t tile_index = in_b / out_batches; diff --git a/mace/ops/channel_shuffle.cc b/mace/ops/channel_shuffle.cc index 78e6f7ad583205cc21458592b25e3aa69c3a980c..04c6a88dc99c06ac9f401a1839205d349b32ff90 100644 --- a/mace/ops/channel_shuffle.cc +++ b/mace/ops/channel_shuffle.cc @@ -55,7 +55,7 @@ class ChannelShuffleOp : public Operation { index_t batch_size = channels * image_size; index_t channels_per_group = channels / groups_; -#pragma omp parallel for collapse(2) +#pragma omp parallel for collapse(2) schedule(runtime) for (index_t b = 0; b < batch; ++b) { for (index_t c = 0; c < channels; ++c) { const T *input_base = input_ptr + b * batch_size;
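Note: in the hunks above and below, loops over small outer dimensions (batch, channel, group) keep their existing collapse(N) clause alongside the new schedule(runtime); collapse fuses the perfectly nested loops into one iteration space, so the guided schedule distributes (b, c) pairs rather than whole outer iterations. A small illustration with invented bounds (not MACE code; compile with -fopenmp):

#include <omp.h>
#include <cstdio>

int main() {
  omp_set_schedule(omp_sched_guided, 1);
  const int batch = 2, channels = 32;
  double sum = 0.0;
  // collapse(2) turns the 2 x 32 nest into a single 64-iteration space.
#pragma omp parallel for collapse(2) schedule(runtime) reduction(+ : sum)
  for (int b = 0; b < batch; ++b) {
    for (int c = 0; c < channels; ++c) {
      sum += b * channels + c;  // stand-in for per-image work
    }
  }
  std::printf("sum=%g\n", sum);  // 2016 = 0 + 1 + ... + 63
  return 0;
}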
diff --git a/mace/ops/conv_2d.cc b/mace/ops/conv_2d.cc index af459b2bb3e9d730dc111f7a46615f5c452e405d..7bb213c0bc70f9a47d0b7c3964b050b76bcccdba 100644 --- a/mace/ops/conv_2d.cc +++ b/mace/ops/conv_2d.cc @@ -475,7 +475,7 @@ class Conv2dOp : public ConvPool2dOpBase { // unpack output if (extra_output_height != height || extra_output_width != width) { -#pragma omp parallel for collapse(2) +#pragma omp parallel for collapse(2) schedule(runtime) for (index_t b = 0; b < batch; ++b) { for (index_t c = 0; c < channels; ++c) { for (index_t h = 0; h < height; ++h) { @@ -494,7 +494,7 @@ class Conv2dOp : public ConvPool2dOpBase { if (bias_data != nullptr) { const index_t image_size = height * width; -#pragma omp parallel for collapse(2) +#pragma omp parallel for collapse(2) schedule(runtime) for (index_t b = 0; b < batch; ++b) { for (index_t c = 0; c < channels; ++c) {
float *output_ptr = output_data + (b * channels + c) * image_size; @@ -539,7 +539,7 @@ class Conv2dOp : public ConvPool2dOpBase { const index_t out_batch_size = filter_shape[0] * out_image_size; const index_t filter_size = filter_shape[2] * filter_shape[3]; -#pragma omp parallel for collapse(2) +#pragma omp parallel for collapse(2) schedule(runtime) for (index_t b = 0; b < in_shape[0]; b++) { for (index_t m = 0; m < filter_shape[0]; m += 4) { const index_t in_width = in_shape[3]; @@ -867,7 +867,7 @@ class Conv2dOp : public ConvPool2dOpBase { const index_t input_row_size = in_shape[2] * in_shape[3]; const index_t patch_row_size = filter_w * in_shape[3]; -#pragma omp parallel for collapse(3) +#pragma omp parallel for collapse(3) schedule(runtime) for (index_t b = 0; b < out_shape[0]; ++b) { for (index_t h = 0; h < out_shape[1]; ++h) { for (index_t w = 0; w < out_shape[2]; ++w) { diff --git a/mace/ops/conv_pool_2d_util.cc b/mace/ops/conv_pool_2d_util.cc index 6ec025b9eb52773dff3b309b663945ca0e1a7e74..a056743e85af91b562781d9821aebad87115221d 100644 --- a/mace/ops/conv_pool_2d_util.cc +++ b/mace/ops/conv_pool_2d_util.cc @@ -395,7 +395,7 @@ MaceStatus ConstructNCHWInputWithSpecificPadding(const Tensor *input_tensor, const index_t in_batch_size = channels * in_image_size; const index_t out_batch_size = channels * out_image_size; -#pragma omp parallel for collapse(2) +#pragma omp parallel for collapse(2) schedule(runtime) for (int i = 0; i < batch; ++i) { for (int j = 0; j < channels; ++j) { for (int k = 0; k < height; ++k) { @@ -443,7 +443,7 @@ MaceStatus ConstructNHWCInputWithPadding(const Tensor *input_tensor, if (padding_same_value) { LOG(FATAL) << "Not implemented"; } else { -#pragma omp parallel for collapse(3) +#pragma omp parallel for collapse(3) schedule(runtime) for (int n = 0; n < batch; ++n) { for (int h = 0; h < height; ++h) { for (int w = 0; w < width; ++w) { diff --git a/mace/ops/deconv_2d.cc b/mace/ops/deconv_2d.cc index 183885295f8ac780ad14b170d0c298e9902c4b48..c9113439536746e9ce05d33e4b20feb35a075060 100644 --- a/mace/ops/deconv_2d.cc +++ b/mace/ops/deconv_2d.cc @@ -276,7 +276,7 @@ class Deconv2dOp : public Deconv2dOpBase { const index_t batch = out_shape[0]; const index_t channels = out_shape[1]; const index_t img_size = out_shape[2] * out_shape[3]; -#pragma omp parallel for collapse(3) +#pragma omp parallel for collapse(3) schedule(runtime) for (index_t b = 0; b < batch; ++b) { for (index_t c = 0; c < channels; ++c) { for (index_t i = 0; i < img_size; ++i) { @@ -324,7 +324,7 @@ class Deconv2dOp : public Deconv2dOpBase { const index_t out_channels = out_shape[1]; const index_t in_channels = in_shape[1]; -#pragma omp parallel for collapse(2) +#pragma omp parallel for collapse(2) schedule(runtime) for (int b = 0; b < batch; ++b) { for (int oc = 0; oc < out_channels; ++oc) { float *out_base = diff --git a/mace/ops/depth_to_space.cc b/mace/ops/depth_to_space.cc index be7a2f82361955490118a909c821bec042e77a33..e18cc106f4fba10c4f054cd7d8c219b0ef032118 100644 --- a/mace/ops/depth_to_space.cc +++ b/mace/ops/depth_to_space.cc @@ -57,7 +57,7 @@ class DepthToSpaceOp : public Operation { const T *input_ptr = input->data<T>(); T *output_ptr = output->mutable_data<T>(); -#pragma omp parallel for +#pragma omp parallel for schedule(runtime) for (index_t b = 0; b < batch_size; ++b) { for (index_t d = 0; d < output_depth; ++d) { for (index_t h = 0; h < output_height; ++h) { diff --git a/mace/ops/eltwise.cc b/mace/ops/eltwise.cc index
94622ac3f16625837f3336e90ba1d663982ab33a..29f0c5a7ed29834fbd43d3a8951959fa20f1524d 100644 --- a/mace/ops/eltwise.cc +++ b/mace/ops/eltwise.cc @@ -201,7 +201,7 @@ inline void TensorBroadcastEltwise(const EltwiseType type, switch (type) { case SUM: if (coeff.empty()) { -#pragma omp parallel for collapse(2) +#pragma omp parallel for collapse(2) schedule(runtime) for (index_t d = 0; d < diff_size; ++d) { for (index_t i = 0; i < common_size; ++i) { output[i + d * common_size] = @@ -213,7 +213,7 @@ inline void TensorBroadcastEltwise(const EltwiseType type, if (swapped) { std::swap(coeff_copy[0], coeff_copy[1]); } -#pragma omp parallel for collapse(2) +#pragma omp parallel for collapse(2) schedule(runtime) for (index_t d = 0; d < diff_size; ++d) { for (index_t i = 0; i < common_size; ++i) { output[i + d * common_size] = @@ -225,7 +225,7 @@ inline void TensorBroadcastEltwise(const EltwiseType type, break; case SUB: if (!swapped) { -#pragma omp parallel for collapse(2) +#pragma omp parallel for collapse(2) schedule(runtime) for (index_t d = 0; d < diff_size; ++d) { for (index_t i = 0; i < common_size; ++i) { output[i + d * common_size] = @@ -233,7 +233,7 @@ inline void TensorBroadcastEltwise(const EltwiseType type, } } } else { -#pragma omp parallel for collapse(2) +#pragma omp parallel for collapse(2) schedule(runtime) for (index_t d = 0; d < diff_size; ++d) { for (index_t i = 0; i < common_size; ++i) { output[i + d * common_size] = @@ -243,7 +243,7 @@ inline void TensorBroadcastEltwise(const EltwiseType type, } break; case PROD: -#pragma omp parallel for collapse(2) +#pragma omp parallel for collapse(2) schedule(runtime) for (index_t d = 0; d < diff_size; ++d) { for (index_t i = 0; i < common_size; ++i) { output[i + d * common_size] = input0[i + d * common_size] * input1[i]; @@ -252,7 +252,7 @@ inline void TensorBroadcastEltwise(const EltwiseType type, break; case DIV: if (!swapped) { -#pragma omp parallel for collapse(2) +#pragma omp parallel for collapse(2) schedule(runtime) for (index_t d = 0; d < diff_size; ++d) { for (index_t i = 0; i < common_size; ++i) { output[i + d * common_size] = @@ -260,7 +260,7 @@ inline void TensorBroadcastEltwise(const EltwiseType type, } } } else { -#pragma omp parallel for collapse(2) +#pragma omp parallel for collapse(2) schedule(runtime) for (index_t d = 0; d < diff_size; ++d) { for (index_t i = 0; i < common_size; ++i) { output[i + d * common_size] = @@ -270,7 +270,7 @@ inline void TensorBroadcastEltwise(const EltwiseType type, } break; case MIN: -#pragma omp parallel for collapse(2) +#pragma omp parallel for collapse(2) schedule(runtime) for (index_t d = 0; d < diff_size; ++d) { for (index_t i = 0; i < common_size; ++i) { output[i + d * common_size] = @@ -279,7 +279,7 @@ inline void TensorBroadcastEltwise(const EltwiseType type, } break; case MAX: -#pragma omp parallel for collapse(2) +#pragma omp parallel for collapse(2) schedule(runtime) for (index_t d = 0; d < diff_size; ++d) { for (index_t i = 0; i < common_size; ++i) { output[i + d * common_size] = @@ -288,7 +288,7 @@ inline void TensorBroadcastEltwise(const EltwiseType type, } break; case SQR_DIFF: -#pragma omp parallel for collapse(2) +#pragma omp parallel for collapse(2) schedule(runtime) for (index_t d = 0; d < diff_size; ++d) { for (index_t i = 0; i < common_size; ++i) { output[i + d * common_size] = @@ -298,7 +298,7 @@ inline void TensorBroadcastEltwise(const EltwiseType type, break; case POW: if (!swapped) { -#pragma omp parallel for collapse(2) +#pragma omp parallel for collapse(2) 
schedule(runtime) for (index_t d = 0; d < diff_size; ++d) { for (index_t i = 0; i < common_size; ++i) { output[i + d * common_size] = @@ -306,7 +306,7 @@ inline void TensorBroadcastEltwise(const EltwiseType type, } } } else { -#pragma omp parallel for collapse(2) +#pragma omp parallel for collapse(2) schedule(runtime) for (index_t d = 0; d < diff_size; ++d) { for (index_t i = 0; i < common_size; ++i) { output[i + d * common_size] = @@ -316,19 +316,19 @@ inline void TensorBroadcastEltwise(const EltwiseType type, } break; case NEG: -#pragma omp parallel for +#pragma omp parallel for schedule(runtime) for (index_t i = 0; i < diff_size * common_size; ++i) { output[i] = -input0[i]; } break; case ABS: -#pragma omp parallel for +#pragma omp parallel for schedule(runtime) for (index_t i = 0; i < diff_size * common_size; ++i) { output[i] = std::fabs(input0[i]); } break; case EQUAL: -#pragma omp parallel for collapse(2) +#pragma omp parallel for collapse(2) schedule(runtime) for (index_t d = 0; d < diff_size; ++d) { for (index_t i = 0; i < common_size; ++i) { output[i + d * common_size] = @@ -353,7 +353,7 @@ inline void TensorEltwise(const EltwiseType type, switch (type) { case SUM: if (coeff.empty()) { -#pragma omp parallel for +#pragma omp parallel for schedule(runtime) for (index_t i = 0; i < size; ++i) { output[i] = input0[i] + input1[i]; } @@ -363,7 +363,7 @@ inline void TensorEltwise(const EltwiseType type, if (swapped) { std::swap(coeff_copy[0], coeff_copy[1]); } -#pragma omp parallel for +#pragma omp parallel for schedule(runtime) for (index_t i = 0; i < size; ++i) { output[i] = input0[i] * coeff_copy[0] + input1[i] * coeff_copy[1]; } @@ -371,20 +371,20 @@ inline void TensorEltwise(const EltwiseType type, break; case SUB: if (!swapped) { -#pragma omp parallel for +#pragma omp parallel for schedule(runtime) for (index_t i = 0; i < size; ++i) { output[i] = input0[i] - input1[i]; } } else { -#pragma omp parallel for +#pragma omp parallel for schedule(runtime) for (index_t i = 0; i < size; ++i) { output[i] = input1[i] - input0[i]; } } break; case PROD: -#pragma omp parallel for +#pragma omp parallel for schedule(runtime) for (index_t i = 0; i < size; ++i) { output[i] = input0[i] * input1[i]; } @@ -392,34 +392,34 @@ inline void TensorEltwise(const EltwiseType type, break; case DIV: if (!swapped) { -#pragma omp parallel for +#pragma omp parallel for schedule(runtime) for (index_t i = 0; i < size; ++i) { output[i] = input0[i] / input1[i]; } } else { -#pragma omp parallel for +#pragma omp parallel for schedule(runtime) for (index_t i = 0; i < size; ++i) { output[i] = input1[i] / input0[i]; } } break; case MIN: -#pragma omp parallel for +#pragma omp parallel for schedule(runtime) for (index_t i = 0; i < size; ++i) { output[i] = std::min(input0[i], input1[i]); } break; case MAX: -#pragma omp parallel for +#pragma omp parallel for schedule(runtime) for (index_t i = 0; i < size; ++i) { output[i] = std::max(input0[i], input1[i]); } break; case SQR_DIFF: -#pragma omp parallel for +#pragma omp parallel for schedule(runtime) for (index_t i = 0; i < size; ++i) { output[i] = std::pow(input0[i] - input1[i], 2.f); } @@ -427,7 +427,7 @@ inline void TensorEltwise(const EltwiseType type, break; case POW: if (!swapped) { -#pragma omp parallel for +#pragma omp parallel for schedule(runtime) for (index_t i = 0; i < size; ++i) { output[i] = std::pow(input0[i], input1[i]); } @@ -438,19 +438,19 @@ inline void TensorEltwise(const EltwiseType type, } break; case NEG: -#pragma omp parallel for +#pragma omp parallel for 
schedule(runtime) for (index_t i = 0; i < size; ++i) { output[i] = -input0[i]; } break; case ABS: -#pragma omp parallel for +#pragma omp parallel for schedule(runtime) for (index_t i = 0; i < size; ++i) { output[i] = std::fabs(input0[i]); } break; case EQUAL: -#pragma omp parallel for +#pragma omp parallel for schedule(runtime) for (index_t i = 0; i < size; ++i) { output[i] = input0[i] == input1[i]; } @@ -472,7 +472,7 @@ inline void TensorScalarEltwise(const EltwiseType type, switch (type) { case SUM: if (coeff.empty()) { -#pragma omp parallel for +#pragma omp parallel for schedule(runtime) for (index_t i = 0; i < size; ++i) { output[i] = input0[i] + input1; } @@ -482,7 +482,7 @@ inline void TensorScalarEltwise(const EltwiseType type, if (swapped) { std::swap(coeff_copy[0], coeff_copy[1]); } -#pragma omp parallel for +#pragma omp parallel for schedule(runtime) for (index_t i = 0; i < size; ++i) { output[i] = input0[i] * coeff_copy[0] + input1 * coeff_copy[1]; } @@ -490,20 +490,20 @@ inline void TensorScalarEltwise(const EltwiseType type, break; case SUB: if (!swapped) { -#pragma omp parallel for +#pragma omp parallel for schedule(runtime) for (index_t i = 0; i < size; ++i) { output[i] = input0[i] - input1; } } else { -#pragma omp parallel for +#pragma omp parallel for schedule(runtime) for (index_t i = 0; i < size; ++i) { output[i] = input1 - input0[i]; } } break; case PROD: -#pragma omp parallel for +#pragma omp parallel for schedule(runtime) for (index_t i = 0; i < size; ++i) { output[i] = input0[i] * input1; } @@ -511,34 +511,34 @@ inline void TensorScalarEltwise(const EltwiseType type, break; case DIV: if (!swapped) { -#pragma omp parallel for +#pragma omp parallel for schedule(runtime) for (index_t i = 0; i < size; ++i) { output[i] = input0[i] / input1; } } else { -#pragma omp parallel for +#pragma omp parallel for schedule(runtime) for (index_t i = 0; i < size; ++i) { output[i] = input1 / input0[i]; } } break; case MIN: -#pragma omp parallel for +#pragma omp parallel for schedule(runtime) for (index_t i = 0; i < size; ++i) { output[i] = std::min(input0[i], input1); } break; case MAX: -#pragma omp parallel for +#pragma omp parallel for schedule(runtime) for (index_t i = 0; i < size; ++i) { output[i] = std::max(input0[i], input1); } break; case SQR_DIFF: -#pragma omp parallel for +#pragma omp parallel for schedule(runtime) for (index_t i = 0; i < size; ++i) { output[i] = std::pow(input0[i] - input1, 2.f); } @@ -546,7 +546,7 @@ inline void TensorScalarEltwise(const EltwiseType type, break; case POW: if (!swapped) { -#pragma omp parallel for +#pragma omp parallel for schedule(runtime) for (index_t i = 0; i < size; ++i) { output[i] = std::pow(input0[i], input1); } @@ -557,19 +557,19 @@ inline void TensorScalarEltwise(const EltwiseType type, } break; case NEG: -#pragma omp parallel for +#pragma omp parallel for schedule(runtime) for (index_t i = 0; i < size; ++i) { output[i] = -input0[i]; } break; case ABS: -#pragma omp parallel for +#pragma omp parallel for schedule(runtime) for (index_t i = 0; i < size; ++i) { output[i] = std::fabs(input0[i]); } break; case EQUAL: -#pragma omp parallel for +#pragma omp parallel for schedule(runtime) for (index_t i = 0; i < size; ++i) { output[i] = input0[i] == input1; } @@ -594,7 +594,7 @@ inline void TensorEltwisePerChannel(const EltwiseType type, switch (type) { case SUM: if (coeff.empty()) { -#pragma omp parallel for collapse(2) +#pragma omp parallel for collapse(2) schedule(runtime) for (index_t b = 0; b < batch0; ++b) { for (index_t c = 0; c < 
channel; ++c) { const T *in0_ptr = input0 + ((b * channel) + c) * image_size; @@ -610,7 +610,7 @@ inline void TensorEltwisePerChannel(const EltwiseType type, if (swapped) { std::swap(coeff_copy[0], coeff_copy[1]); } -#pragma omp parallel for collapse(2) +#pragma omp parallel for collapse(2) schedule(runtime) for (index_t b = 0; b < batch0; ++b) { for (index_t c = 0; c < channel; ++c) { const T *in0_ptr = input0 + ((b * channel) + c) * image_size; @@ -626,7 +626,7 @@ inline void TensorEltwisePerChannel(const EltwiseType type, break; case SUB: if (!swapped) { -#pragma omp parallel for collapse(2) +#pragma omp parallel for collapse(2) schedule(runtime) for (index_t b = 0; b < batch0; ++b) { for (index_t c = 0; c < channel; ++c) { const T *in0_ptr = input0 + ((b * channel) + c) * image_size; @@ -638,7 +638,7 @@ inline void TensorEltwisePerChannel(const EltwiseType type, } } } else { -#pragma omp parallel for collapse(2) +#pragma omp parallel for collapse(2) schedule(runtime) for (index_t b = 0; b < batch0; ++b) { for (index_t c = 0; c < channel; ++c) { const T *in0_ptr = input0 + ((b * channel) + c) * image_size; @@ -652,7 +652,7 @@ inline void TensorEltwisePerChannel(const EltwiseType type, } break; case PROD: -#pragma omp parallel for collapse(2) +#pragma omp parallel for collapse(2) schedule(runtime) for (index_t b = 0; b < batch0; ++b) { for (index_t c = 0; c < channel; ++c) { const T *in0_ptr = input0 + ((b * channel) + c) * image_size; @@ -666,7 +666,7 @@ inline void TensorEltwisePerChannel(const EltwiseType type, break; case DIV: if (!swapped) { -#pragma omp parallel for collapse(2) +#pragma omp parallel for collapse(2) schedule(runtime) for (index_t b = 0; b < batch0; ++b) { for (index_t c = 0; c < channel; ++c) { const T *in0_ptr = input0 + ((b * channel) + c) * image_size; @@ -678,7 +678,7 @@ inline void TensorEltwisePerChannel(const EltwiseType type, } } } else { -#pragma omp parallel for collapse(2) +#pragma omp parallel for collapse(2) schedule(runtime) for (index_t b = 0; b < batch0; ++b) { for (index_t c = 0; c < channel; ++c) { const T *in0_ptr = input0 + ((b * channel) + c) * image_size; @@ -692,7 +692,7 @@ inline void TensorEltwisePerChannel(const EltwiseType type, } break; case MIN: -#pragma omp parallel for collapse(2) +#pragma omp parallel for collapse(2) schedule(runtime) for (index_t b = 0; b < batch0; ++b) { for (index_t c = 0; c < channel; ++c) { const T *in0_ptr = input0 + ((b * channel) + c) * image_size; @@ -705,7 +705,7 @@ inline void TensorEltwisePerChannel(const EltwiseType type, } break; case MAX: -#pragma omp parallel for collapse(2) +#pragma omp parallel for collapse(2) schedule(runtime) for (index_t b = 0; b < batch0; ++b) { for (index_t c = 0; c < channel; ++c) { const T *in0_ptr = input0 + ((b * channel) + c) * image_size; @@ -718,7 +718,7 @@ inline void TensorEltwisePerChannel(const EltwiseType type, } break; case SQR_DIFF: -#pragma omp parallel for collapse(2) +#pragma omp parallel for collapse(2) schedule(runtime) for (index_t b = 0; b < batch0; ++b) { for (index_t c = 0; c < channel; ++c) { const T *in0_ptr = input0 + ((b * channel) + c) * image_size; @@ -732,7 +732,7 @@ inline void TensorEltwisePerChannel(const EltwiseType type, break; case POW: if (!swapped) { -#pragma omp parallel for collapse(2) +#pragma omp parallel for collapse(2) schedule(runtime) for (index_t b = 0; b < batch0; ++b) { for (index_t c = 0; c < channel; ++c) { const T *in0_ptr = input0 + ((b * channel) + c) * image_size; @@ -744,7 +744,7 @@ inline void TensorEltwisePerChannel(const 
EltwiseType type, } } } else { -#pragma omp parallel for collapse(2) +#pragma omp parallel for collapse(2) schedule(runtime) for (index_t b = 0; b < batch0; ++b) { for (index_t c = 0; c < channel; ++c) { const T *in0_ptr = input0 + ((b * channel) + c) * image_size; @@ -758,19 +758,19 @@ inline void TensorEltwisePerChannel(const EltwiseType type, } break; case NEG: -#pragma omp parallel for +#pragma omp parallel for schedule(runtime) for (index_t i = 0; i < batch0 * channel * image_size; ++i) { output[i] = -input0[i]; } break; case ABS: -#pragma omp parallel for +#pragma omp parallel for schedule(runtime) for (index_t i = 0; i < batch0 * channel * image_size; ++i) { output[i] = std::fabs(input0[i]); } break; case EQUAL: -#pragma omp parallel for collapse(2) +#pragma omp parallel for collapse(2) schedule(runtime) for (index_t b = 0; b < batch0; ++b) { for (index_t c = 0; c < channel; ++c) { const T *in0_ptr = input0 + ((b * channel) + c) * image_size; @@ -989,7 +989,7 @@ class EltwiseOp : public Operation { index_t handled_output_size = 0; #ifdef MACE_ENABLE_NEON - #pragma omp parallel for + #pragma omp parallel for schedule(runtime) for (index_t i = handled_output_size; i <= output->size() - 8; i += 8) { const auto input0_val = vld1_u8(input0_ptr + i); const auto input1_val = vld1_u8(input1_ptr + i); @@ -1035,7 +1035,7 @@ class EltwiseOp : public Operation { } handled_output_size = output->size() - output->size() % 8; #endif // NEON -#pragma omp parallel for +#pragma omp parallel for schedule(runtime) for (index_t i = handled_output_size; i < output->size(); ++i) { const int32_t offset_input0 = input0_ptr[i] - input0->zero_point(); const int32_t offset_input1 = input1_ptr[i] - input1->zero_point(); diff --git a/mace/ops/gather.cc b/mace/ops/gather.cc index 1af56d7edf8bd11c53c7db816ea636b2d0ff06fa..60ca2856abb0ca9519fe0f63ba946e881cfac142 100644 --- a/mace/ops/gather.cc +++ b/mace/ops/gather.cc @@ -62,7 +62,7 @@ class GatherOp : public Operation { params->shape().end(), 1, std::multiplies<index_t>()); index_t index_size = indices->size(); -#pragma omp parallel for collapse(2) +#pragma omp parallel for collapse(2) schedule(runtime) for (index_t l = 0; l < lhs_size; ++l) { for (index_t idx = 0; idx < index_size; ++idx) { MACE_ASSERT(indices_data[idx] < axis_dim_size, "idx out of bound: ", diff --git a/mace/ops/local_response_norm.cc b/mace/ops/local_response_norm.cc index 16828baa2d3419cfa1b4af81d83c20b305983fea..fb0cda7cf0dd993e8cd29e0b99251a21cc896758 100644 --- a/mace/ops/local_response_norm.cc +++ b/mace/ops/local_response_norm.cc @@ -53,7 +53,7 @@ class LocalResponseNormOp : public Operation { index_t image_size = height * width; index_t batch_size = channels * image_size; -#pragma omp parallel for collapse(2) +#pragma omp parallel for collapse(2) schedule(runtime) for (index_t b = 0; b < batch; ++b) { for (index_t c = 0; c < channels; ++c) { const int begin_input_c = std::max(static_cast<int>(0), diff --git a/mace/ops/pooling.cc b/mace/ops/pooling.cc index 5f9d0e0d7641217e156e3d071e8f521eda95af8b..2ce9d6acb6ac535311b5dc77e6161721a6c716cd 100644 --- a/mace/ops/pooling.cc +++ b/mace/ops/pooling.cc @@ -133,7 +133,7 @@ class PoolingOp : public PoolingOpBase { const index_t in_batch_size = in_shape[1] * in_image_size; const index_t out_batch_size = out_shape[1] * out_image_size; -#pragma omp parallel for collapse(2) +#pragma omp parallel for collapse(2) schedule(runtime) for (index_t b = 0; b < out_shape[0]; ++b) { for (index_t c = 0; c < out_shape[1]; ++c) { const index_t out_base = b * out_batch_size + c
* out_image_size; @@ -179,7 +179,7 @@ class PoolingOp : public PoolingOpBase { const index_t in_batch_size = in_shape[1] * in_image_size; const index_t out_batch_size = out_shape[1] * out_image_size; -#pragma omp parallel for collapse(2) +#pragma omp parallel for collapse(2) schedule(runtime) for (index_t b = 0; b < out_shape[0]; ++b) { for (index_t c = 0; c < out_shape[1]; ++c) { const index_t out_base = b * out_batch_size + c * out_image_size; @@ -301,7 +301,7 @@ class PoolingOp : public PoolingOpBase { const int *stride_hw, const int *pad_hw, uint8_t *output) { -#pragma omp parallel for collapse(3) +#pragma omp parallel for collapse(3) schedule(runtime) for (index_t b = 0; b < out_shape[0]; ++b) { for (index_t h = 0; h < out_shape[1]; ++h) { for (index_t w = 0; w < out_shape[2]; ++w) { @@ -358,7 +358,7 @@ class PoolingOp : public PoolingOpBase { const int *stride_hw, const int *pad_hw, uint8_t *output) { -#pragma omp parallel for collapse(3) +#pragma omp parallel for collapse(3) schedule(runtime) for (index_t b = 0; b < out_shape[0]; ++b) { for (index_t h = 0; h < out_shape[1]; ++h) { for (index_t w = 0; w < out_shape[2]; ++w) { diff --git a/mace/ops/reduce_mean.cc b/mace/ops/reduce_mean.cc index 0857eb3e8ba5949fdce5f86c1d35b278585bd85d..9364146f267cabd203dc75989c129c58ba466b76 100644 --- a/mace/ops/reduce_mean.cc +++ b/mace/ops/reduce_mean.cc @@ -134,7 +134,7 @@ class ReduceMeanOp : public ReduceMeanOpBase { } output_ptr[0] = sum / data_reshape_[0]; } else { -#pragma omp parallel for +#pragma omp parallel for schedule(runtime) for (int i = 0; i < data_reshape_[0]; ++i) { output_ptr[i] = input_ptr[i]; } @@ -142,7 +142,7 @@ class ReduceMeanOp : public ReduceMeanOpBase { break; case 2: if (reduce_first_axis_) { -#pragma omp parallel for +#pragma omp parallel for schedule(runtime) for (int i = 0; i < data_reshape_[1]; ++i) { for (int j = 0; j < data_reshape_[0]; ++j) { output_ptr[i] += input_ptr[j * data_reshape_[1] + i]; @@ -150,7 +150,7 @@ class ReduceMeanOp : public ReduceMeanOpBase { output_ptr[i] /= data_reshape_[0]; } } else { -#pragma omp parallel for +#pragma omp parallel for schedule(runtime) for (int i = 0; i < data_reshape_[0]; ++i) { for (int j = 0; j < data_reshape_[1]; ++j) { output_ptr[i] += input_ptr[i * data_reshape_[1] + j]; @@ -161,7 +161,7 @@ class ReduceMeanOp : public ReduceMeanOpBase { break; case 3: if (reduce_first_axis_) { -#pragma omp parallel for +#pragma omp parallel for schedule(runtime) for (int i = 0; i < data_reshape_[1]; ++i) { for (int j = 0; j < data_reshape_[2]; ++j) { for (int k = 0; k < data_reshape_[0]; ++k) { @@ -173,7 +173,7 @@ class ReduceMeanOp : public ReduceMeanOpBase { output_ptr[i] /= (data_reshape_[0] * data_reshape_[2]); } } else { -#pragma omp parallel for collapse(2) +#pragma omp parallel for collapse(2) schedule(runtime) for (int i = 0; i < data_reshape_[0]; ++i) { for (int j = 0; j < data_reshape_[2]; ++j) { for (int k = 0; k < data_reshape_[1]; ++k) { @@ -188,7 +188,7 @@ class ReduceMeanOp : public ReduceMeanOpBase { break; case 4: if (reduce_first_axis_) { -#pragma omp parallel for collapse(2) +#pragma omp parallel for collapse(2) schedule(runtime) for (int i = 0; i < data_reshape_[1]; ++i) { for (int j = 0; j < data_reshape_[3]; ++j) { for (int k = 0; k < data_reshape_[2]; ++k) { @@ -203,7 +203,7 @@ class ReduceMeanOp : public ReduceMeanOpBase { } } } else { -#pragma omp parallel for collapse(2) +#pragma omp parallel for collapse(2) schedule(runtime) for (int i = 0; i < data_reshape_[0]; ++i) { for (int j = 0; j < data_reshape_[2]; 
++j) { for (int k = 0; k < data_reshape_[1]; ++k) { diff --git a/mace/ops/resize_bicubic.cc b/mace/ops/resize_bicubic.cc index 28912faef0fb77a147aa1601c43bca1d566b96b4..403300607cfcb929169a18946eff79085d6c534c 100644 --- a/mace/ops/resize_bicubic.cc +++ b/mace/ops/resize_bicubic.cc @@ -85,7 +85,7 @@ inline void ResizeImage(const float *images, const float height_scale, const float width_scale, float *output) { -#pragma omp parallel for collapse(2) +#pragma omp parallel for collapse(2) schedule(runtime) for (index_t b = 0; b < batch_size; ++b) { for (index_t y = 0; y < out_height; ++y) { std::vector<float> y_weights; diff --git a/mace/ops/resize_bilinear.cc b/mace/ops/resize_bilinear.cc index 91f6c3e5ccf491755d98cae03f8bf32910fde31e..5ce6ef4a44a4bdb2f9d3b11057e9b317867d62d5 100644 --- a/mace/ops/resize_bilinear.cc +++ b/mace/ops/resize_bilinear.cc @@ -95,7 +95,7 @@ inline void ResizeImageNCHW(const T *images, T *output) { const CachedInterpolation *xs = xs_vec.data(); -#pragma omp parallel for collapse(2) +#pragma omp parallel for collapse(2) schedule(runtime) for (index_t b = 0; b < batch_size; ++b) { for (index_t c = 0; c < channels; ++c) { const T @@ -141,7 +141,7 @@ inline void ResizeImageNHWC(const T *images, for (index_t b = 0; b < batch_size; ++b) { const T *input_base = images + b * channels * in_height * in_width; T *output_base = output + b * channels * out_height * out_width; -#pragma omp parallel for +#pragma omp parallel for schedule(runtime) for (index_t y = 0; y < out_height; ++y) { const T *y_lower_input_ptr = input_base + ys[y].lower * in_width * channels; diff --git a/mace/ops/sgemm.cc b/mace/ops/sgemm.cc index 5dd1de2d4cec8991683fe303ca458ff2111279d4..cdb95bc2ab529681b70ceca3481313a40bf0c5ba 100644 --- a/mace/ops/sgemm.cc +++ b/mace/ops/sgemm.cc @@ -283,7 +283,7 @@ void SGemm::RunInternal(const PackedBlock &lhs, } if (batch >= MaceOpenMPThreadCount) { -#pragma omp parallel for +#pragma omp parallel for schedule(runtime) MACE_SGEMM_RUN_PER_BATCH } else { MACE_SGEMM_RUN_PER_BATCH } @@ -310,7 +310,7 @@ void SGemm::RunPerBatch(const float *lhs_data, // as possible to cache, by tiling lhs by height and rhs by width.
diff --git a/mace/ops/sgemm.cc b/mace/ops/sgemm.cc
index 5dd1de2d4cec8991683fe303ca458ff2111279d4..cdb95bc2ab529681b70ceca3481313a40bf0c5ba 100644
--- a/mace/ops/sgemm.cc
+++ b/mace/ops/sgemm.cc
@@ -283,7 +283,7 @@ void SGemm::RunInternal(const PackedBlock &lhs,
   }
 
   if (batch >= MaceOpenMPThreadCount) {
-#pragma omp parallel for
+#pragma omp parallel for schedule(runtime)
     MACE_SGEMM_RUN_PER_BATCH
   } else {
     MACE_SGEMM_RUN_PER_BATCH
@@ -310,7 +310,7 @@ void SGemm::RunPerBatch(const float *lhs_data,
   // as possible to cache, by tiling lhs by height and rhs by width.
 
   // w: 4
-#pragma omp parallel for
+#pragma omp parallel for schedule(runtime)
   for (index_t bw = 0; bw < block_w; ++bw) {
     index_t remain_h = height;
     index_t block_h = 0;
@@ -733,7 +733,7 @@ void SGemm::RunPerBatch(const float *lhs_data,
     rhs_data += (width - remain_w) * depth;
 
     // w: 1
-#pragma omp parallel for
+#pragma omp parallel for schedule(runtime)
     for (index_t bw = 0; bw < remain_w; ++bw) {
       index_t remain_h = height;
@@ -954,7 +954,7 @@ void SGemm::Pack(const MatrixMap<const float> &src,
     PackPerBatch(src, order, b, packed_data + b * height * width); \
   }
   if (src.batch() >= MaceOpenMPThreadCount) {
-#pragma omp parallel for
+#pragma omp parallel for schedule(runtime)
    MACE_SGEMM_PACK_PER_BATCH
   } else {
     MACE_SGEMM_PACK_PER_BATCH
@@ -976,7 +976,7 @@ void SGemm::UnPack(const PackedBlock &packed_result,
   }
 
   if (matrix_map->batch() >= MaceOpenMPThreadCount) {
-#pragma omp parallel for
+#pragma omp parallel for schedule(runtime)
     MACE_SGEMM_UNPACK_PER_BATCH
   } else {
     MACE_SGEMM_UNPACK_PER_BATCH
@@ -999,7 +999,7 @@ void SGemm::PackPerBatch(const MatrixMap<const float> &src,
     index_t h = 0;
 #if defined(MACE_ENABLE_NEON)
 #if defined(__aarch64__)
-#pragma omp parallel for
+#pragma omp parallel for schedule(runtime)
     for (index_t ih = h; ih <= height - 8; ih += 8) {
       const float *src_data_ptr = src_data + ih * width;
       float *packed_data_ptr = packed_data + ih * width;
@@ -1020,7 +1020,7 @@ void SGemm::PackPerBatch(const MatrixMap<const float> &src,
     }
     h += (height - h) / 8 * 8;
 #endif
-#pragma omp parallel for
+#pragma omp parallel for schedule(runtime)
     for (index_t ih = h; ih <= height - 4; ih += 4) {
       const float *src_data_ptr = src_data + ih * width;
       float *packed_data_ptr = packed_data + ih * width;
@@ -1036,7 +1036,7 @@ void SGemm::PackPerBatch(const MatrixMap<const float> &src,
     }
     h += (height - h) / 4 * 4;
 #endif
-#pragma omp parallel for
+#pragma omp parallel for schedule(runtime)
     for (index_t ih = h; ih < height; ++ih) {
       std::copy_n(src_data + ih * width, width, packed_data + ih * width);
     }
@@ -1046,7 +1046,7 @@ void SGemm::PackPerBatch(const MatrixMap<const float> &src,
     index_t h = 0;
 #if defined(MACE_ENABLE_NEON)
 #if defined(__aarch64__)
-#pragma omp parallel for
+#pragma omp parallel for schedule(runtime)
     for (index_t ih = h; ih <= height - 8; ih += 8) {
       const float *src_data_ptr = src_data + ih;
       float *packed_data_ptr = packed_data + ih * width;
@@ -1061,7 +1061,7 @@ void SGemm::PackPerBatch(const MatrixMap<const float> &src,
     }
     h += (height - h) / 8 * 8;
 #endif
-#pragma omp parallel for
+#pragma omp parallel for schedule(runtime)
     for (index_t ih = h; ih <= height - 4; ih += 4) {
       const float *src_data_ptr = src_data + ih;
       float *packed_data_ptr = packed_data + ih * width;
@@ -1074,7 +1074,7 @@ void SGemm::PackPerBatch(const MatrixMap<const float> &src,
     }
     h += (height - h) / 4 * 4;
 #endif
-#pragma omp parallel for
+#pragma omp parallel for schedule(runtime)
     for (index_t ih = h; ih < height; ++ih) {
       const float *src_data_ptr = src_data + ih;
       float *packed_data_ptr = packed_data + ih * width;
@@ -1087,7 +1087,7 @@ void SGemm::PackPerBatch(const MatrixMap<const float> &src,
     // This is for packing no-transpose rhs.
     index_t w = 0;
 #if defined(MACE_ENABLE_NEON)
-#pragma omp parallel for
+#pragma omp parallel for schedule(runtime)
     for (index_t iw = w; iw <= width - 4; iw += 4) {
       const float *src_data_ptr = src_data + iw;
       float *packed_data_ptr = packed_data + iw * height;
@@ -1100,7 +1100,7 @@ void SGemm::PackPerBatch(const MatrixMap<const float> &src,
     }
     w += (width - w) / 4 * 4;
 #endif
-#pragma omp parallel for
+#pragma omp parallel for schedule(runtime)
     for (index_t iw = w; iw < width; ++iw) {
       const float *src_data_ptr = src_data + iw;
       float *packed_data_ptr = packed_data + iw * height;
@@ -1113,7 +1113,7 @@ void SGemm::PackPerBatch(const MatrixMap<const float> &src,
     // This is for packing transpose-needed rhs.
     index_t w = 0;
 #if defined(MACE_ENABLE_NEON)
-#pragma omp parallel for
+#pragma omp parallel for schedule(runtime)
     for (index_t iw = w; iw <= width - 4; iw += 4) {
       const float *src_data_ptr = src_data + iw * height;
       float *packed_data_ptr = packed_data + iw * height;
@@ -1129,7 +1129,7 @@ void SGemm::PackPerBatch(const MatrixMap<const float> &src,
     }
     w += (width - w) / 4 * 4;
 #endif
-#pragma omp parallel for
+#pragma omp parallel for schedule(runtime)
     for (index_t iw = w; iw < width; ++iw) {
       std::copy_n(src_data + iw * height, height, packed_data + iw * height);
     }
@@ -1149,7 +1149,7 @@ void SGemm::UnPackPerBatch(const float *packed_data,
     // This is for non-transposed result
     index_t w = 0;
 #if defined(MACE_ENABLE_NEON)
-#pragma omp parallel for
+#pragma omp parallel for schedule(runtime)
     for (index_t iw = w; iw <= width - 4; iw += 4) {
       const float *packed_data_ptr = packed_data + iw * height;
       float *unpacked_data_ptr = unpacked_data + iw;
@@ -1162,7 +1162,7 @@ void SGemm::UnPackPerBatch(const float *packed_data,
     }
     w += (width - w) / 4 * 4;
 #endif
-#pragma omp parallel for
+#pragma omp parallel for schedule(runtime)
     for (index_t iw = w; iw < width; ++iw) {
       const float *packed_data_ptr = packed_data + iw * height;
       float *unpacked_data_ptr = unpacked_data + iw;
@@ -1174,7 +1174,7 @@ void SGemm::UnPackPerBatch(const float *packed_data,
     // This is for transposed result
     index_t w = 0;
 #if defined(MACE_ENABLE_NEON)
-#pragma omp parallel for
+#pragma omp parallel for schedule(runtime)
     for (index_t iw = w; iw <= width - 4; iw += 4) {
       const float *packed_data_ptr = packed_data + iw * height;
       float *unpacked_data_ptr = unpacked_data + iw * height;
@@ -1190,7 +1190,7 @@ void SGemm::UnPackPerBatch(const float *packed_data,
     }
     w += (width - w) / 4 * 4;
 #endif
-#pragma omp parallel for
+#pragma omp parallel for schedule(runtime)
     for (index_t iw = w; iw < width; ++iw) {
       std::copy_n(
           packed_data + iw * height, height, unpacked_data + iw * height);
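Note that sgemm.cc only parallelizes across batches when there are at least as many batches as worker threads; otherwise the batch loop runs serially and the per-batch kernels parallelize their own inner loops. A hedged sketch of that dispatch pattern, with hypothetical names (RunBatches, run_one) standing in for the MACE_SGEMM_RUN_PER_BATCH machinery:

// Sketch of the batch-level dispatch above: parallelize over batches only
// when they can occupy every thread, to avoid nesting parallel regions.
void RunBatches(int batch, int num_threads, void (*run_one)(int)) {
  if (batch >= num_threads) {
#pragma omp parallel for schedule(runtime)
    for (int b = 0; b < batch; ++b) {
      run_one(b);  // each batch is independent, so this loop is safe to split
    }
  } else {
    for (int b = 0; b < batch; ++b) {
      run_one(b);  // too few batches: let run_one parallelize internally
    }
  }
}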
diff --git a/mace/ops/softmax.cc b/mace/ops/softmax.cc
index 6d62fabc9781838007b9a6d8db8a629b47cfdb40..bf06114430be46dfd37046921f09afa33ce3fe5d 100644
--- a/mace/ops/softmax.cc
+++ b/mace/ops/softmax.cc
@@ -59,7 +59,7 @@ class SoftmaxOp : public Operation {
       const index_t batch_size = class_count * class_size;
 
       for (index_t b = 0; b < batch; ++b) {
-#pragma omp parallel for
+#pragma omp parallel for schedule(runtime)
         for (index_t k = 0; k < class_size; ++k) {
           const float *input_ptr = input_data + b * batch_size + k;
           float *output_ptr = output_data + b * batch_size + k;
@@ -94,7 +94,7 @@ class SoftmaxOp : public Operation {
     } else if (input->dim_size() == 2) {  // normal 2d softmax
       const index_t class_size = input->dim(0);
       const index_t class_count = input->dim(1);
-#pragma omp parallel for
+#pragma omp parallel for schedule(runtime)
       for (index_t k = 0; k < class_size; ++k) {
         const float *input_ptr = input_data + k * class_count;
         float *output_ptr = output_data + k * class_count;
@@ -172,7 +172,7 @@ class SoftmaxOp : public Operation {
       // If depth is short, do it using float32. Float computation should not
       // be here, but as long as it is on CPU, it is fine.
       if (depth < 32) {
-#pragma omp parallel for
+#pragma omp parallel for schedule(runtime)
         for (index_t b = 0; b < batch; ++b) {
           const uint8_t *input_ptr = input_data + b * depth;
           uint8_t *output_ptr = output_data + b * depth;
@@ -201,7 +201,7 @@ class SoftmaxOp : public Operation {
                                      (1ll << 31) - 1.0));
         int32_t input_delta_limit = -((1ll << 31) - 1) / scale_q;
 
-#pragma omp parallel for
+#pragma omp parallel for schedule(runtime)
         for (index_t b = 0; b < batch; ++b) {
           const uint8_t *input_ptr = input_data + b * depth;
           uint8_t *output_ptr = output_data + b * depth;
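For reference, the float path these hunks touch is a row-wise, numerically stable softmax. This simplified standalone version (Softmax2D is a hypothetical name, not the MACE kernel) mirrors that structure with the row loop under the runtime schedule:

#include <algorithm>
#include <cmath>

// Standalone sketch of a stable 2-D softmax: subtract the row max before
// exponentiating so exp() cannot overflow, then normalize by the row sum.
void Softmax2D(const float *input, int rows, int cols, float *output) {
#pragma omp parallel for schedule(runtime)
  for (int r = 0; r < rows; ++r) {
    const float *in = input + r * cols;
    float *out = output + r * cols;
    const float max_v = *std::max_element(in, in + cols);
    float sum = 0.f;
    for (int c = 0; c < cols; ++c) {
      out[c] = std::exp(in[c] - max_v);
      sum += out[c];
    }
    for (int c = 0; c < cols; ++c) {
      out[c] /= sum;
    }
  }
}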
diff --git a/mace/ops/space_to_batch.cc b/mace/ops/space_to_batch.cc
index a023ae897b98fca66c7502b82f40bef8fcc94959..7d422938c77516f3e11ef3cf5e9f8b7bc7c5db15 100644
--- a/mace/ops/space_to_batch.cc
+++ b/mace/ops/space_to_batch.cc
@@ -129,7 +129,7 @@ class SpaceToBatchNDOp : public SpaceToBatchOpBase {
           std::max(static_cast<index_t>(1), 8 * 1024 / block_shape_w / in_width);
 
       // make channel outter loop so we can make best use of cache
-#pragma omp parallel for collapse(3)
+#pragma omp parallel for collapse(3) schedule(runtime)
       for (index_t c = 0; c < channels; ++c) {
         for (index_t block_h = 0; block_h < out_height;
             block_h += block_h_size) {
@@ -238,7 +238,7 @@ class SpaceToBatchNDOp : public SpaceToBatchOpBase {
       index_t out_width = batch_tensor->dim(2);
       index_t channels = batch_tensor->dim(3);
 
-#pragma omp parallel for
+#pragma omp parallel for schedule(runtime)
       for (index_t b = 0; b < out_batches; ++b) {
        const index_t in_b = b % in_batches;
         const index_t tile_index = b / in_batches;
diff --git a/mace/ops/sqrdiff_mean.cc b/mace/ops/sqrdiff_mean.cc
index f25d66c1118e3319c9e5e58d33d78f11e90500fb..7927da3b9a321d417386e2c76c8494e45a3417f2 100644
--- a/mace/ops/sqrdiff_mean.cc
+++ b/mace/ops/sqrdiff_mean.cc
@@ -64,7 +64,7 @@ class SqrDiffMeanOp : public Operation {
     const index_t img_size = input0->dim(2) * input0->dim(3);
     const index_t bc = input0->dim(0) * input0->dim(1);
 
-#pragma omp parallel for
+#pragma omp parallel for schedule(runtime)
     for (int i = 0; i < bc; ++i) {
       for (int j = 0; j < img_size; ++j) {
         T diff = input_ptr0[i * img_size + j] - input_ptr1[i];
diff --git a/mace/public/mace.h b/mace/public/mace.h
index ef8fb35d5d781a401b0be58c8a59f03c48a3bd16..9e7f568638cc71a9cf358f141b4c0ed46853ab34 100644
--- a/mace/public/mace.h
+++ b/mace/public/mace.h
@@ -48,10 +48,28 @@ enum GPUPriorityHint {
   PRIORITY_HIGH = 3
 };
 
+// AFFINITY_NONE: initiate 'num_threads_hint' threads with no affinity
+// scheduled.
+// If 'num_threads_hint' is -1 or greater than the number of available cores,
+// 'num_threads_hint' will be reset to the number of available cores.
+// AFFINITY_BIG_ONLY: all available big cores are used, and the number of
+// threads equals the number of available big cores.
+// AFFINITY_LITTLE_ONLY: all available little cores are used, and the number
+// of threads equals the number of available little cores.
+// AFFINITY_HIGH_PERFORMANCE: initiate 'num_threads_hint' threads on the
+// distinct cores with the 'num_threads_hint' highest max frequencies.
+// If 'num_threads_hint' is -1 or greater than the number of available cores,
+// 'num_threads_hint' will be reset to the number of available cores.
+// AFFINITY_POWER_SAVE: initiate 'num_threads_hint' threads on the distinct
+// cores with the 'num_threads_hint' lowest max frequencies.
+// If 'num_threads_hint' is -1 or greater than the number of available cores,
+// 'num_threads_hint' will be reset to the number of available cores.
 enum CPUAffinityPolicy {
   AFFINITY_NONE = 0,
   AFFINITY_BIG_ONLY = 1,
   AFFINITY_LITTLE_ONLY = 2,
+  AFFINITY_HIGH_PERFORMANCE = 3,
+  AFFINITY_POWER_SAVE = 4,
 };
 
 struct CallStats {
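The two new policies select distinct cores by their maximum frequency. A standalone sketch of the selection rule the comment block describes — PickCores is a hypothetical helper for illustration, not MACE API:

#include <algorithm>
#include <cstddef>
#include <vector>

// Order core IDs by max frequency, then keep the num_threads fastest cores
// (AFFINITY_HIGH_PERFORMANCE) or slowest cores (AFFINITY_POWER_SAVE);
// num_threads == -1, or a value above the core count, selects all cores.
std::vector<std::size_t> PickCores(const std::vector<float> &max_freqs,
                                   int num_threads,
                                   bool high_performance) {
  std::vector<std::size_t> ids(max_freqs.size());
  for (std::size_t i = 0; i < ids.size(); ++i) ids[i] = i;
  std::sort(ids.begin(), ids.end(),
            [&](std::size_t a, std::size_t b) {
              return high_performance ? max_freqs[a] > max_freqs[b]
                                      : max_freqs[a] < max_freqs[b];
            });
  if (num_threads < 0 || num_threads > static_cast<int>(ids.size())) {
    num_threads = static_cast<int>(ids.size());
  }
  ids.resize(static_cast<std::size_t>(num_threads));
  return ids;
}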
diff --git a/mace/utils/quantize.h b/mace/utils/quantize.h
index dfaaff1560925c6d1674958ea8f9ae55f4842dd6..0755e70819f092ecc2541851ab3aff909dfbbeef 100644
--- a/mace/utils/quantize.h
+++ b/mace/utils/quantize.h
@@ -99,7 +99,7 @@ inline void QuantizeWithScaleAndZeropoint(const float *input,
                                           int32_t zero_point,
                                           T *output) {
   float recip_scale = 1 / scale;
-#pragma omp parallel for
+#pragma omp parallel for schedule(runtime)
   for (int i = 0; i < size; ++i) {
     output[i] = Saturate<T>(roundf(zero_point + recip_scale * input[i]));
   }
@@ -128,7 +128,7 @@ inline void Dequantize(const T *input,
                        const float scale,
                        const int32_t zero_point,
                        float *output) {
-#pragma omp parallel for
+#pragma omp parallel for schedule(runtime)
   for (int i = 0; i < size; ++i) {
     output[i] = scale * (input[i] - zero_point);
   }
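These two quantize.h loops implement the usual affine mapping, q = round(zero_point + real / scale) and real = scale * (q - zero_point). A standalone round-trip sketch, with std::fmin/std::fmax standing in for the Saturate<T> clamp that quantize.h applies:

#include <cmath>
#include <cstdint>
#include <cstdio>

int main() {
  const float scale = 0.05f;
  const std::int32_t zero_point = 128;
  const float real = 1.7f;

  // Quantize: q = round(zero_point + real / scale), clamped to uint8 range.
  const float q_raw = std::roundf(zero_point + real / scale);
  const std::uint8_t q =
      static_cast<std::uint8_t>(std::fmax(0.f, std::fmin(255.f, q_raw)));

  // Dequantize: real ~= scale * (q - zero_point); here 0.05 * (162 - 128) = 1.7.
  const float back = scale * (static_cast<std::int32_t>(q) - zero_point);
  std::printf("q = %u, dequantized = %f\n", static_cast<unsigned>(q), back);
  return 0;
}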