Commit 69583cd6 authored by 李寅

Using guided OpenMP scheduler

Parent 0102ad55
......@@ -36,45 +36,98 @@ namespace mace {
int MaceOpenMPThreadCount = 1;
namespace {
struct CPUFreq {
size_t core_id;
float freq;
};
namespace {
#if defined(__ANDROID__)
int GetCPUCount() {
char path[64];
int cpu_count = 0;
int result = 0;
while (true) {
snprintf(path, sizeof(path), "/sys/devices/system/cpu/cpu%d", cpu_count);
result = access(path, F_OK);
if (result != 0) {
if (errno != ENOENT) {
LOG(ERROR) << "Access " << path << " failed: " << strerror(errno);
}
return cpu_count;
std::string cpu_sys_conf = "/proc/cpuinfo";
std::ifstream f(cpu_sys_conf);
if (!f.is_open()) {
LOG(ERROR) << "failed to open " << cpu_sys_conf;
return -1;
}
std::string line;
const std::string processor_key = "processor";
while (std::getline(f, line)) {
if (line.size() >= processor_key.size()
&& line.compare(0, processor_key.size(), processor_key) == 0) {
++cpu_count;
}
cpu_count++;
}
if (f.bad()) {
LOG(ERROR) << "failed to read " << cpu_sys_conf;
}
if (!f.eof()) {
LOG(ERROR) << "failed to read end of " << cpu_sys_conf;
}
f.close();
VLOG(2) << "CPU cores: " << cpu_count;
return cpu_count;
}
#endif
int GetCPUMaxFreq(int cpu_id) {
char path[64];
snprintf(path, sizeof(path),
"/sys/devices/system/cpu/cpu%d/cpufreq/cpuinfo_max_freq",
cpu_id);
FILE *fp = fopen(path, "rb");
if (!fp) {
LOG(WARNING) << "File: " << path << " not exists.";
return 0;
int GetCPUMaxFreq(std::vector<float> *max_freqs) {
#if defined(__ANDROID__)
int cpu_count = GetCPUCount();
for (int cpu_id = 0; cpu_id < cpu_count; ++cpu_id) {
std::string cpuinfo_max_freq_sys_conf = MakeString(
"/sys/devices/system/cpu/cpu",
cpu_id,
"/cpufreq/cpuinfo_max_freq");
std::ifstream f(cpuinfo_max_freq_sys_conf);
if (!f.is_open()) {
LOG(ERROR) << "failed to open " << cpuinfo_max_freq_sys_conf;
return -1;
}
std::string line;
if (std::getline(f, line)) {
float freq = atof(line.c_str());
max_freqs->push_back(freq);
}
if (f.bad()) {
LOG(ERROR) << "failed to read " << cpuinfo_max_freq_sys_conf;
}
f.close();
}
#else
std::string cpu_sys_conf = "/proc/cpuinfo";
std::ifstream f(cpu_sys_conf);
if (!f.is_open()) {
LOG(ERROR) << "failed to open " << cpu_sys_conf;
return -1;
}
std::string line;
const std::string freq_key = "cpu MHz";
while (std::getline(f, line)) {
if (line.size() >= freq_key.size()
&& line.compare(0, freq_key.size(), freq_key) == 0) {
size_t pos = line.find(":");
if (pos != std::string::npos) {
std::string freq_str = line.substr(pos + 1);
float freq = atof(freq_str.c_str());
max_freqs->push_back(freq);
}
}
}
if (f.bad()) {
LOG(ERROR) << "failed to read " << cpu_sys_conf;
}
if (!f.eof()) {
LOG(ERROR) << "failed to read end of " << cpu_sys_conf;
}
f.close();
#endif
int freq = 0;
int items_read = fscanf(fp, "%d", &freq);
if (items_read != 1) {
LOG(WARNING) << "Read file: " << path << " failed.";
for (float freq : *max_freqs) {
VLOG(2) << "CPU freq: " << freq;
}
fclose(fp);
return freq;
return 0;
}
MaceStatus SetThreadAffinity(cpu_set_t mask) {
......@@ -93,51 +146,14 @@ MaceStatus SetThreadAffinity(cpu_set_t mask) {
}
}
MaceStatus GetCPUBigLittleCoreIDs(std::vector<int> *big_core_ids,
std::vector<int> *little_core_ids) {
MACE_CHECK_NOTNULL(big_core_ids);
MACE_CHECK_NOTNULL(little_core_ids);
int cpu_count = GetCPUCount();
std::vector<int> cpu_max_freq(cpu_count);
// set cpu max frequency
for (int i = 0; i < cpu_count; ++i) {
cpu_max_freq[i] = GetCPUMaxFreq(i);
if (cpu_max_freq[i] == 0) {
LOG(WARNING) << "Cannot get CPU" << i
<< "'s max frequency info, maybe it is offline.";
return MaceStatus(MaceStatus::MACE_INVALID_ARGS,
"Cannot get CPU's max frequency info,"
" maybe it is offline.");
}
}
int big_core_freq =
*(std::max_element(cpu_max_freq.begin(), cpu_max_freq.end()));
int little_core_freq =
*(std::min_element(cpu_max_freq.begin(), cpu_max_freq.end()));
big_core_ids->reserve(cpu_count);
little_core_ids->reserve(cpu_count);
for (int i = 0; i < cpu_count; ++i) {
if (cpu_max_freq[i] == little_core_freq) {
little_core_ids->push_back(i);
}
if (cpu_max_freq[i] == big_core_freq) {
big_core_ids->push_back(i);
}
}
return MaceStatus::MACE_SUCCESS;
}
MaceStatus SetOpenMPThreadsAndAffinityCPUs(int omp_num_threads,
const std::vector<int> &cpu_ids) {
const std::vector<size_t> &cpu_ids) {
MaceOpenMPThreadCount = omp_num_threads;
#ifdef MACE_ENABLE_OPENMP
VLOG(1) << "Set OpenMP threads number: " << omp_num_threads
<< ", CPU core IDs: " << MakeString(cpu_ids);
omp_set_schedule(omp_sched_guided, 1);
omp_set_num_threads(omp_num_threads);
#else
MACE_UNUSED(omp_num_threads);
......@@ -174,55 +190,90 @@ MaceStatus SetOpenMPThreadsAndAffinityCPUs(int omp_num_threads,
} // namespace
MaceStatus CPURuntime::SetOpenMPThreadsAndAffinityPolicy(
int omp_num_threads_hint,
int num_threads_hint,
CPUAffinityPolicy policy,
void *gemm_context) {
// get cpu frequency info
std::vector<float> cpu_max_freqs;
if (GetCPUMaxFreq(&cpu_max_freqs) == -1 || cpu_max_freqs.size() == 0) {
return MaceStatus::MACE_INVALID_ARGS;
}
std::vector<CPUFreq> cpu_freq(cpu_max_freqs.size());
for (size_t i = 0; i < cpu_max_freqs.size(); ++i) {
cpu_freq[i].core_id = i;
cpu_freq[i].freq = cpu_max_freqs[i];
}
if (policy == CPUAffinityPolicy::AFFINITY_POWER_SAVE ||
policy == CPUAffinityPolicy::AFFINITY_LITTLE_ONLY) {
std::sort(cpu_freq.begin(),
cpu_freq.end(),
[=](const CPUFreq &lhs, const CPUFreq &rhs) {
return lhs.freq < rhs.freq;
});
} else if (policy == CPUAffinityPolicy::AFFINITY_HIGH_PERFORMANCE ||
policy == CPUAffinityPolicy::AFFINITY_BIG_ONLY) {
std::sort(cpu_freq.begin(),
cpu_freq.end(),
[](const CPUFreq &lhs, const CPUFreq &rhs) {
return lhs.freq > rhs.freq;
});
}
int cpu_count = static_cast<int>(cpu_freq.size());
if (num_threads_hint <= 0 || num_threads_hint > cpu_count) {
num_threads_hint = cpu_count;
}
if (policy == CPUAffinityPolicy::AFFINITY_NONE) {
#ifdef MACE_ENABLE_QUANTIZE
if (gemm_context) {
static_cast<gemmlowp::GemmContext*>(gemm_context)->set_max_num_threads(
std::max(0, omp_num_threads_hint));
num_threads_hint);
}
#else
MACE_UNUSED(gemm_context);
#endif // MACE_ENABLE_QUANTIZE
#ifdef MACE_ENABLE_OPENMP
if (omp_num_threads_hint > 0) {
omp_set_num_threads(std::min(omp_num_threads_hint, omp_get_num_procs()));
}
omp_set_num_threads(num_threads_hint);
#else
LOG(WARNING) << "Set OpenMP threads number failed: OpenMP not enabled.";
#endif
return MaceStatus::MACE_SUCCESS;
}
std::vector<int> big_core_ids;
std::vector<int> little_core_ids;
MaceStatus res = GetCPUBigLittleCoreIDs(&big_core_ids, &little_core_ids);
if (res != MaceStatus::MACE_SUCCESS) {
return res;
}
std::vector<int> use_cpu_ids;
if (policy == CPUAffinityPolicy::AFFINITY_BIG_ONLY) {
use_cpu_ids = std::move(big_core_ids);
// decide num of cores to use
int cores_to_use = 0;
if (policy == CPUAffinityPolicy::AFFINITY_BIG_ONLY
|| policy == CPUAffinityPolicy::AFFINITY_LITTLE_ONLY) {
for (size_t i = 0; i < cpu_max_freqs.size(); ++i) {
if (cpu_freq[i].freq != cpu_freq[0].freq) {
break;
}
++cores_to_use;
}
num_threads_hint = cores_to_use;
} else {
use_cpu_ids = std::move(little_core_ids);
cores_to_use = num_threads_hint;
}
if (omp_num_threads_hint <= 0 ||
omp_num_threads_hint > static_cast<int>(use_cpu_ids.size())) {
omp_num_threads_hint = use_cpu_ids.size();
VLOG(2) << "Use " << num_threads_hint << " threads";
std::vector<size_t> cpu_ids(cores_to_use);
for (int i = 0; i < cores_to_use; ++i) {
VLOG(2) << "Bind thread to core: " << cpu_freq[i].core_id << " with freq "
<< cpu_freq[i].freq;
cpu_ids[i] = cpu_freq[i].core_id;
}
#ifdef MACE_ENABLE_QUANTIZE
if (gemm_context) {
static_cast<gemmlowp::GemmContext*>(gemm_context)->set_max_num_threads(
omp_num_threads_hint);
num_threads_hint);
}
#endif // MACE_ENABLE_QUANTIZE
return SetOpenMPThreadsAndAffinityCPUs(omp_num_threads_hint, use_cpu_ids);
return SetOpenMPThreadsAndAffinityCPUs(num_threads_hint, cpu_ids);
}
} // namespace mace
......
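The heart of this commit is the pairing above: SetOpenMPThreadsAndAffinityCPUs now calls omp_set_schedule(omp_sched_guided, 1), and the kernel loops below switch to schedule(runtime), which picks up whatever schedule the runtime currently holds, so a single call moves every kernel from the default (typically static) to guided chunking. A minimal standalone sketch of that interaction, not part of the commit (loop body and sizes are made up for illustration; build with -fopenmp):

#include <omp.h>
#include <cstdio>
#include <vector>

int main() {
  // Mirror the commit: ask the OpenMP runtime for guided scheduling, chunk size 1.
  omp_set_schedule(omp_sched_guided, 1);
  omp_set_num_threads(4);

  std::vector<float> data(1 << 20, 1.0f);
  float sum = 0.0f;

  // schedule(runtime) defers to the schedule installed above (or to the
  // OMP_SCHEDULE environment variable if omp_set_schedule was never called).
  #pragma omp parallel for schedule(runtime) reduction(+ : sum)
  for (long i = 0; i < static_cast<long>(data.size()); ++i) {
    sum += data[i];
  }

  std::printf("sum = %.1f\n", sum);
  return 0;
}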
......@@ -66,26 +66,26 @@ void DoActivation(const T *input_ptr,
case NOOP:
break;
case RELU:
#pragma omp parallel for
#pragma omp parallel for schedule(runtime)
for (index_t i = 0; i < size; ++i) {
output_ptr[i] = std::max(input_ptr[i], static_cast<T>(0));
}
break;
case RELUX:
#pragma omp parallel for
#pragma omp parallel for schedule(runtime)
for (index_t i = 0; i < size; ++i) {
output_ptr[i] = std::min(std::max(input_ptr[i], static_cast<T>(0)),
static_cast<T>(relux_max_limit));
}
break;
case TANH:
#pragma omp parallel for
#pragma omp parallel for schedule(runtime)
for (index_t i = 0; i < size; ++i) {
output_ptr[i] = std::tanh(input_ptr[i]);
}
break;
case SIGMOID:
#pragma omp parallel for
#pragma omp parallel for schedule(runtime)
for (index_t i = 0; i < size; ++i) {
output_ptr[i] = 1 / (1 + std::exp(-input_ptr[i]));
}
......@@ -111,13 +111,13 @@ inline void DoActivation(const float *input_ptr,
ReluxNeon(input_ptr, relux_max_limit, size, output_ptr);
break;
case TANH:
#pragma omp parallel for
#pragma omp parallel for schedule(runtime)
for (index_t i = 0; i < size; ++i) {
output_ptr[i] = std::tanh(input_ptr[i]);
}
break;
case SIGMOID:
#pragma omp parallel for
#pragma omp parallel for schedule(runtime)
for (index_t i = 0; i < size; ++i) {
output_ptr[i] = 1 / (1 + std::exp(-input_ptr[i]));
}
......@@ -134,7 +134,7 @@ void PReLUActivation(const T *input_ptr,
const index_t inner_size,
const T *alpha_ptr,
T *output_ptr) {
#pragma omp parallel for collapse(3)
#pragma omp parallel for collapse(3) schedule(runtime)
for (index_t i = 0; i < outer_size; ++i) {
for (index_t chan_idx = 0; chan_idx < input_chan; ++chan_idx) {
for (index_t j = 0; j < inner_size; ++j) {
......
......@@ -59,7 +59,7 @@ class ArgMaxOp : public Operation {
index_t outer_size = output->size();
index_t inner_size = input->dim(axis_value);
#pragma omp parallel for
#pragma omp parallel for schedule(runtime)
for (index_t i = 0; i < outer_size; ++i) {
int idx = 0;
T max_value = std::numeric_limits<T>::lowest();
......
......@@ -25,7 +25,7 @@ namespace ops {
void ReluNeon(const float *input, const index_t size, float *output) {
#if defined(MACE_ENABLE_NEON)
float32x4_t vzero = vdupq_n_f32(0.f);
#pragma omp parallel for
#pragma omp parallel for schedule(runtime)
for (index_t i = 0; i <= size - 4; i += 4) {
float32x4_t v = vld1q_f32(input + i);
v = vmaxq_f32(v, vzero);
......@@ -36,7 +36,7 @@ void ReluNeon(const float *input, const index_t size, float *output) {
output[i] = std::max(input[i], 0.f);
}
#else
#pragma omp parallel for
#pragma omp parallel for schedule(runtime)
for (index_t i = 0; i < size; ++i) {
output[i] = std::max(input[i], 0.f);
}
......@@ -48,7 +48,7 @@ void ReluxNeon(const float *input, const float limit,
#if defined(MACE_ENABLE_NEON)
float32x4_t vzero = vdupq_n_f32(0.f);
float32x4_t vlimit = vdupq_n_f32(limit);
#pragma omp parallel for
#pragma omp parallel for schedule(runtime)
for (index_t i = 0; i <= size - 4; i += 4) {
float32x4_t v = vld1q_f32(input + i);
v = vmaxq_f32(v, vzero);
......@@ -60,7 +60,7 @@ void ReluxNeon(const float *input, const float limit,
output[i] = std::min(std::max(input[i], 0.f), limit);
}
#else
#pragma omp parallel for
#pragma omp parallel for schedule(runtime)
for (index_t i = 0; i < size; ++i) {
output[i] = std::min(std::max(input[i], 0.f), limit);
}
......
......@@ -60,7 +60,7 @@ void Conv2dNeonK15x1S1(const float *input,
const index_t tile_width =
out_shape[1] < 4 ? RoundUpDiv4(out_shape[3]) : out_shape[3];
#pragma omp parallel for collapse(3)
#pragma omp parallel for collapse(3) schedule(runtime)
for (index_t b = 0; b < out_shape[0]; ++b) {
for (index_t m = 0; m < out_shape[1]; ++m) {
for (index_t w = 0; w < out_shape[3]; w += tile_width) {
......
......@@ -61,7 +61,7 @@ void Conv2dNeonK1x15S1(const float *input,
const index_t tile_height =
out_shape[1] < 4 ? RoundUpDiv4(out_shape[2]) : out_shape[2];
#pragma omp parallel for collapse(3)
#pragma omp parallel for collapse(3) schedule(runtime)
for (index_t b = 0; b < out_shape[0]; ++b) {
for (index_t m = 0; m < out_shape[1]; ++m) {
for (index_t h = 0; h < out_shape[2]; h += tile_height) {
......
......@@ -32,7 +32,7 @@ void Conv2dNeonK1x7S1(const float *input,
const index_t in_batch_size = in_shape[1] * in_image_size;
const index_t out_batch_size = out_shape[1] * out_image_size;
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2) schedule(runtime)
for (index_t b = 0; b < out_shape[0]; ++b) {
for (index_t m = 0; m < out_shape[1]; m += 4) {
const index_t out_channels = out_shape[1];
......
......@@ -33,7 +33,7 @@ void Conv2dNeonK3x3S1(const float *input,
const index_t in_batch_size = in_shape[1] * in_image_size;
const index_t out_batch_size = out_shape[1] * out_image_size;
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2) schedule(runtime)
for (index_t b = 0; b < out_shape[0]; ++b) {
for (index_t m = 0; m < out_shape[1]; m += 2) {
const index_t out_channels = out_shape[1];
......@@ -515,7 +515,7 @@ void Conv2dNeonK3x3S2(const float *input,
const index_t in_batch_size = in_shape[1] * in_image_size;
const index_t out_batch_size = out_shape[1] * out_image_size;
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2) schedule(runtime)
for (index_t b = 0; b < out_shape[0]; ++b) {
for (index_t m = 0; m < out_shape[1]; ++m) {
for (index_t c = 0; c < in_shape[1]; ++c) {
......
......@@ -87,7 +87,7 @@ void Conv2dNeonK5x5S1(const float *input,
const index_t in_batch_size = in_shape[1] * in_image_size;
const index_t out_batch_size = out_shape[1] * out_image_size;
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2) schedule(runtime)
for (index_t b = 0; b < out_shape[0]; ++b) {
for (index_t m = 0; m < out_shape[1]; m += 4) {
const index_t out_channels = out_shape[1];
......
......@@ -32,7 +32,7 @@ void Conv2dNeonK7x1S1(const float *input,
const index_t in_batch_size = in_shape[1] * in_image_size;
const index_t out_batch_size = out_shape[1] * out_image_size;
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2) schedule(runtime)
for (index_t b = 0; b < out_shape[0]; ++b) {
for (index_t m = 0; m < out_shape[1]; m += 4) {
const index_t out_channels = out_shape[1];
......
......@@ -164,7 +164,7 @@ void Conv2dNeonK7x7S1(const float *input,
const index_t in_batch_size = in_shape[1] * in_image_size;
const index_t out_batch_size = out_shape[1] * out_image_size;
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2) schedule(runtime)
for (index_t b = 0; b < out_shape[0]; ++b) {
for (index_t m = 0; m < out_shape[1]; m += 4) {
const index_t out_channels = out_shape[1];
......@@ -319,7 +319,7 @@ void Conv2dNeonK7x7S2(const float *input,
const index_t in_batch_size = in_shape[1] * in_image_size;
const index_t out_batch_size = out_shape[1] * out_image_size;
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2) schedule(runtime)
for (index_t b = 0; b < out_shape[0]; ++b) {
for (index_t m = 0; m < out_shape[1]; m += 4) {
const index_t out_channels = out_shape[1];
......@@ -484,7 +484,7 @@ void Conv2dNeonK7x7S3(const float *input,
const index_t in_batch_size = in_shape[1] * in_image_size;
const index_t out_batch_size = out_shape[1] * out_image_size;
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2) schedule(runtime)
for (index_t b = 0; b < out_shape[0]; ++b) {
for (index_t m = 0; m < out_shape[1]; m += 4) {
const index_t out_channels = out_shape[1];
......
......@@ -34,7 +34,7 @@ void TransformInput4x4(const float *input,
const index_t input_batch_size = in_height_width * in_channels;
const index_t output_batch_size = 16 * in_channels * tile_count;
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2) schedule(runtime)
for (index_t n = 0; n < batch; ++n) {
for (index_t c = 0; c < in_channels; ++c) {
index_t tile_index = 0;
......@@ -155,7 +155,7 @@ void TransformInput8x8(const float *input,
const index_t input_batch_size = in_height_width * in_channels;
const index_t output_batch_size = 64 * in_channels * tile_count;
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2) schedule(runtime)
for (index_t n = 0; n < batch; ++n) {
for (index_t c = 0; c < in_channels; ++c) {
index_t tile_index = 0;
......@@ -292,7 +292,7 @@ void TransformOutput4x4(const float *input,
const index_t out_image_size = out_height * out_width;
const index_t output_batch_size = out_channels * out_image_size;
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2) schedule(runtime)
for (index_t n = 0; n < batch; ++n) {
for (index_t m = 0; m < out_channels; ++m) {
index_t tile_offset = 0;
......@@ -388,7 +388,7 @@ void TransformOutput8x8(const float *input,
const index_t out_image_size = out_height * out_width;
const index_t output_batch_size = out_channels * out_image_size;
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2) schedule(runtime)
for (index_t n = 0; n < batch; ++n) {
for (index_t m = 0; m < out_channels; ++m) {
index_t tile_offset = 0;
......@@ -471,7 +471,7 @@ void TransformFilter4x4(const float *filter,
float *output) {
const index_t stride = out_channels * in_channels;
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2) schedule(runtime)
for (index_t m = 0; m < out_channels; ++m) {
for (index_t c = 0; c < in_channels; ++c) {
float g0, g1, g2, g3, g4, g5, g6, g7, g8;
......@@ -573,7 +573,7 @@ void TransformFilter8x8(const float *filter,
{1.0f / 45, -1.0f / 90, 1.0f / 180},
{0.0f, 0.0f, 1.0f}};
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2) schedule(runtime)
for (index_t m = 0; m < out_channels; ++m) {
for (index_t c = 0; c < in_channels; ++c) {
// load filter
......@@ -720,7 +720,7 @@ void ConvRef3x3s1(const float *input,
index_t out_height = in_height - 2;
index_t out_width = in_width - 2;
#pragma omp parallel for collapse(4)
#pragma omp parallel for collapse(4) schedule(runtime)
for (index_t b = 0; b < batch; ++b) {
for (index_t m = 0; m < out_channels; ++m) {
for (index_t h = 0; h < out_height; ++h) {
......
......@@ -33,7 +33,7 @@ void Deconv2dNeonK2x2S1(const float *input,
const index_t out_img_size = outh * outw;
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2) schedule(runtime)
for (index_t b = 0; b < out_shape[0]; ++b) {
for (index_t oc = 0; oc < outch; oc += 2) {
if (oc + 1 < outch) {
......@@ -199,7 +199,7 @@ void Deconv2dNeonK2x2S2(const float *input,
const index_t outw = out_shape[3];
const index_t out_img_size = outh * outw;
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2) schedule(runtime)
for (index_t b = 0; b < out_shape[0]; ++b) {
for (index_t oc = 0; oc < outch; ++oc) {
float *out_base = output + (b * outch + oc) * out_img_size;
......
......@@ -33,7 +33,7 @@ void Deconv2dNeonK3x3S1(const float *input,
const index_t out_img_size = outh * outw;
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2) schedule(runtime)
for (index_t b = 0; b < out_shape[0]; ++b) {
for (index_t oc = 0; oc < outch; oc += 2) {
if (oc + 1 < outch) {
......@@ -293,7 +293,7 @@ void Deconv2dNeonK3x3S2(const float *input,
const index_t outw = out_shape[3];
const index_t out_img_size = outh * outw;
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2) schedule(runtime)
for (index_t b = 0; b < out_shape[0]; ++b) {
for (index_t oc = 0; oc < outch; ++oc) {
float *out_base = output + (b * outch + oc) * out_img_size;
......
......@@ -31,7 +31,7 @@ void Deconv2dNeonK4x4S1(const float *input,
const index_t outw = out_shape[3];
const index_t outch = out_shape[1];
const index_t out_img_size = outh * outw;
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2) schedule(runtime)
for (index_t b = 0; b < out_shape[0]; ++b) {
for (index_t oc = 0; oc < outch; oc += 2) {
if (oc + 1 < outch) {
......@@ -386,7 +386,7 @@ void Deconv2dNeonK4x4S2(const float *input,
const index_t outch = out_shape[1];
const index_t out_img_size = outh * outw;
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2) schedule(runtime)
for (index_t b = 0; b < out_shape[0]; ++b) {
for (index_t p = 0; p < outch; p++) {
float *out_base = output + (b * outch + p) * out_img_size;
......
......@@ -70,7 +70,7 @@ void DepthwiseConv2dNeonK3x3S1(const float *input,
const index_t in_batch_size = in_shape[1] * in_image_size;
const index_t out_batch_size = out_shape[1] * out_image_size;
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2) schedule(runtime)
for (index_t b = 0; b < in_shape[0]; ++b) {
for (index_t m = 0; m < out_shape[1]; ++m) {
index_t c = m / multiplier;
......@@ -250,7 +250,7 @@ void DepthwiseConv2dNeonK3x3S2(const float *input,
const index_t in_batch_size = in_shape[1] * in_image_size;
const index_t out_batch_size = out_shape[1] * out_image_size;
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2) schedule(runtime)
for (index_t b = 0; b < in_shape[0]; ++b) {
for (index_t m = 0; m < out_shape[1]; ++m) {
index_t c = m / multiplier;
......
......@@ -32,7 +32,7 @@ void DepthwiseDeconv2dNeonK3x3S1(const float *input,
const index_t outw = out_shape[3];
const index_t out_img_size = outh * outw;
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2) schedule(runtime)
for (index_t b = 0; b < out_shape[0]; ++b) {
for (index_t c = 0; c < channels; ++c) {
const index_t offset = b * channels + c;
......@@ -137,7 +137,7 @@ void DepthwiseDeconv2dNeonK3x3S2(const float *input,
const index_t outw = out_shape[3];
const index_t out_img_size = outh * outw;
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2) schedule(runtime)
for (index_t b = 0; b < out_shape[0]; ++b) {
for (index_t c = 0; c < channels; ++c) {
const index_t offset = b * channels + c;
......@@ -251,7 +251,7 @@ void GroupDeconv2dNeonK3x3S1(const float *input,
const index_t inch_g = inch / group;
const index_t outch_g = outch / group;
#pragma omp parallel for collapse(3)
#pragma omp parallel for collapse(3) schedule(runtime)
for (index_t b = 0; b < out_shape[0]; ++b) {
for (int g = 0; g < group; ++g) {
for (index_t oc = 0; oc < outch_g; oc += 2) {
......@@ -525,7 +525,7 @@ void GroupDeconv2dNeonK3x3S2(const float *input,
const index_t inch_g = inch / group;
const index_t outch_g = outch / group;
#pragma omp parallel for collapse(3)
#pragma omp parallel for collapse(3) schedule(runtime)
for (index_t b = 0; b < out_shape[0]; ++b) {
for (int g = 0; g < group; ++g) {
for (index_t oc = 0; oc < outch_g; ++oc) {
......
......@@ -33,7 +33,7 @@ void DepthwiseDeconv2dNeonK4x4S1(const float *input,
const index_t outw = out_shape[3];
const index_t out_img_size = outh * outw;
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2) schedule(runtime)
for (index_t b = 0; b < batch; ++b) {
for (index_t c = 0; c < channels; ++c) {
const index_t offset = b * channels + c;
......@@ -169,7 +169,7 @@ void DepthwiseDeconv2dNeonK4x4S2(const float *input,
const index_t outw = out_shape[3];
const index_t out_img_size = outh * outw;
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2) schedule(runtime)
for (index_t b = 0; b < out_shape[0]; ++b) {
for (index_t c = 0; c < channels; ++c) {
const index_t offset = b * channels + c;
......@@ -304,7 +304,7 @@ void GroupDeconv2dNeonK4x4S1(const float *input,
const index_t inch_g = inch / group;
const index_t outch_g = outch / group;
#pragma omp parallel for collapse(3)
#pragma omp parallel for collapse(3) schedule(runtime)
for (index_t b = 0; b < out_shape[0]; ++b) {
for (int g = 0; g < group; ++g) {
for (index_t oc = 0; oc < outch_g; oc += 2) {
......@@ -679,7 +679,7 @@ void GroupDeconv2dNeonK4x4S2(const float *input,
const index_t inch_g = inch / group;
const index_t outch_g = outch / group;
#pragma omp parallel for collapse(3)
#pragma omp parallel for collapse(3) schedule(runtime)
for (index_t b = 0; b < out_shape[0]; ++b) {
for (int g = 0; g < group; ++g) {
for (index_t oc = 0; oc < outch_g; oc++) {
......
......@@ -124,7 +124,7 @@ class BatchToSpaceNDOp<DeviceType::CPU, float> : public BatchToSpaceOpBase {
std::max(static_cast<index_t>(1), 8 * 1024 / block_shape_w / out_width);
// make channel outter loop so we can make best use of cache
#pragma omp parallel for collapse(3)
#pragma omp parallel for collapse(3) schedule(runtime)
for (index_t c = 0; c < channels; ++c) {
for (index_t block_h = 0; block_h < in_height;
block_h += block_h_size) {
......@@ -213,7 +213,7 @@ class BatchToSpaceNDOp<DeviceType::CPU, uint8_t> : public BatchToSpaceOpBase {
index_t out_width = space_tensor->dim(2);
index_t channels = space_tensor->dim(3);
#pragma omp parallel for
#pragma omp parallel for schedule(runtime)
for (index_t in_b = 0; in_b < in_batches; ++in_b) {
const index_t b = in_b % out_batches;
const index_t tile_index = in_b / out_batches;
......
......@@ -55,7 +55,7 @@ class ChannelShuffleOp<DeviceType::CPU, T> : public Operation {
index_t batch_size = channels * image_size;
index_t channels_per_group = channels / groups_;
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2) schedule(runtime)
for (index_t b = 0; b < batch; ++b) {
for (index_t c = 0; c < channels; ++c) {
const T *input_base = input_ptr + b * batch_size;
......
......@@ -475,7 +475,7 @@ class Conv2dOp<DeviceType::CPU, float> : public ConvPool2dOpBase {
// unpack output
if (extra_output_height != height || extra_output_width != width) {
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2) schedule(runtime)
for (index_t b = 0; b < batch; ++b) {
for (index_t c = 0; c < channels; ++c) {
for (index_t h = 0; h < height; ++h) {
......@@ -494,7 +494,7 @@ class Conv2dOp<DeviceType::CPU, float> : public ConvPool2dOpBase {
if (bias_data != nullptr) {
const index_t image_size = height * width;
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2) schedule(runtime)
for (index_t b = 0; b < batch; ++b) {
for (index_t c = 0; c < channels; ++c) {
float *output_ptr = output_data + (b * channels + c) * image_size;
......@@ -539,7 +539,7 @@ class Conv2dOp<DeviceType::CPU, float> : public ConvPool2dOpBase {
const index_t out_batch_size = filter_shape[0] * out_image_size;
const index_t filter_size = filter_shape[2] * filter_shape[3];
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2) schedule(runtime)
for (index_t b = 0; b < in_shape[0]; b++) {
for (index_t m = 0; m < filter_shape[0]; m += 4) {
const index_t in_width = in_shape[3];
......@@ -867,7 +867,7 @@ class Conv2dOp<DeviceType::CPU, uint8_t> : public ConvPool2dOpBase {
const index_t input_row_size = in_shape[2] * in_shape[3];
const index_t patch_row_size = filter_w * in_shape[3];
#pragma omp parallel for collapse(3)
#pragma omp parallel for collapse(3) schedule(runtime)
for (index_t b = 0; b < out_shape[0]; ++b) {
for (index_t h = 0; h < out_shape[1]; ++h) {
for (index_t w = 0; w < out_shape[2]; ++w) {
......
......@@ -395,7 +395,7 @@ MaceStatus ConstructNCHWInputWithSpecificPadding(const Tensor *input_tensor,
const index_t in_batch_size = channels * in_image_size;
const index_t out_batch_size = channels * out_image_size;
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2) schedule(runtime)
for (int i = 0; i < batch; ++i) {
for (int j = 0; j < channels; ++j) {
for (int k = 0; k < height; ++k) {
......@@ -443,7 +443,7 @@ MaceStatus ConstructNHWCInputWithPadding(const Tensor *input_tensor,
if (padding_same_value) {
LOG(FATAL) << "Not implemented";
} else {
#pragma omp parallel for collapse(3)
#pragma omp parallel for collapse(3) schedule(runtime)
for (int n = 0; n < batch; ++n) {
for (int h = 0; h < height; ++h) {
for (int w = 0; w < width; ++w) {
......
......@@ -276,7 +276,7 @@ class Deconv2dOp<DeviceType::CPU, float> : public Deconv2dOpBase {
const index_t batch = out_shape[0];
const index_t channels = out_shape[1];
const index_t img_size = out_shape[2] * out_shape[3];
#pragma omp parallel for collapse(3)
#pragma omp parallel for collapse(3) schedule(runtime)
for (index_t b = 0; b < batch; ++b) {
for (index_t c = 0; c < channels; ++c) {
for (index_t i = 0; i < img_size; ++i) {
......@@ -324,7 +324,7 @@ class Deconv2dOp<DeviceType::CPU, float> : public Deconv2dOpBase {
const index_t out_channels = out_shape[1];
const index_t in_channels = in_shape[1];
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2) schedule(runtime)
for (int b = 0; b < batch; ++b) {
for (int oc = 0; oc < out_channels; ++oc) {
float *out_base =
......
......@@ -57,7 +57,7 @@ class DepthToSpaceOp : public Operation {
const T *input_ptr = input->data<T>();
T *output_ptr = output->mutable_data<T>();
#pragma omp parallel for
#pragma omp parallel for schedule(runtime)
for (index_t b = 0; b < batch_size; ++b) {
for (index_t d = 0; d < output_depth; ++d) {
for (index_t h = 0; h < output_height; ++h) {
......
......@@ -201,7 +201,7 @@ inline void TensorBroadcastEltwise(const EltwiseType type,
switch (type) {
case SUM:
if (coeff.empty()) {
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2) schedule(runtime)
for (index_t d = 0; d < diff_size; ++d) {
for (index_t i = 0; i < common_size; ++i) {
output[i + d * common_size] =
......@@ -213,7 +213,7 @@ inline void TensorBroadcastEltwise(const EltwiseType type,
if (swapped) {
std::swap(coeff_copy[0], coeff_copy[1]);
}
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2) schedule(runtime)
for (index_t d = 0; d < diff_size; ++d) {
for (index_t i = 0; i < common_size; ++i) {
output[i + d * common_size] =
......@@ -225,7 +225,7 @@ inline void TensorBroadcastEltwise(const EltwiseType type,
break;
case SUB:
if (!swapped) {
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2) schedule(runtime)
for (index_t d = 0; d < diff_size; ++d) {
for (index_t i = 0; i < common_size; ++i) {
output[i + d * common_size] =
......@@ -233,7 +233,7 @@ inline void TensorBroadcastEltwise(const EltwiseType type,
}
}
} else {
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2) schedule(runtime)
for (index_t d = 0; d < diff_size; ++d) {
for (index_t i = 0; i < common_size; ++i) {
output[i + d * common_size] =
......@@ -243,7 +243,7 @@ inline void TensorBroadcastEltwise(const EltwiseType type,
}
break;
case PROD:
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2) schedule(runtime)
for (index_t d = 0; d < diff_size; ++d) {
for (index_t i = 0; i < common_size; ++i) {
output[i + d * common_size] = input0[i + d * common_size] * input1[i];
......@@ -252,7 +252,7 @@ inline void TensorBroadcastEltwise(const EltwiseType type,
break;
case DIV:
if (!swapped) {
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2) schedule(runtime)
for (index_t d = 0; d < diff_size; ++d) {
for (index_t i = 0; i < common_size; ++i) {
output[i + d * common_size] =
......@@ -260,7 +260,7 @@ inline void TensorBroadcastEltwise(const EltwiseType type,
}
}
} else {
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2) schedule(runtime)
for (index_t d = 0; d < diff_size; ++d) {
for (index_t i = 0; i < common_size; ++i) {
output[i + d * common_size] =
......@@ -270,7 +270,7 @@ inline void TensorBroadcastEltwise(const EltwiseType type,
}
break;
case MIN:
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2) schedule(runtime)
for (index_t d = 0; d < diff_size; ++d) {
for (index_t i = 0; i < common_size; ++i) {
output[i + d * common_size] =
......@@ -279,7 +279,7 @@ inline void TensorBroadcastEltwise(const EltwiseType type,
}
break;
case MAX:
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2) schedule(runtime)
for (index_t d = 0; d < diff_size; ++d) {
for (index_t i = 0; i < common_size; ++i) {
output[i + d * common_size] =
......@@ -288,7 +288,7 @@ inline void TensorBroadcastEltwise(const EltwiseType type,
}
break;
case SQR_DIFF:
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2) schedule(runtime)
for (index_t d = 0; d < diff_size; ++d) {
for (index_t i = 0; i < common_size; ++i) {
output[i + d * common_size] =
......@@ -298,7 +298,7 @@ inline void TensorBroadcastEltwise(const EltwiseType type,
break;
case POW:
if (!swapped) {
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2) schedule(runtime)
for (index_t d = 0; d < diff_size; ++d) {
for (index_t i = 0; i < common_size; ++i) {
output[i + d * common_size] =
......@@ -306,7 +306,7 @@ inline void TensorBroadcastEltwise(const EltwiseType type,
}
}
} else {
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2) schedule(runtime)
for (index_t d = 0; d < diff_size; ++d) {
for (index_t i = 0; i < common_size; ++i) {
output[i + d * common_size] =
......@@ -316,19 +316,19 @@ inline void TensorBroadcastEltwise(const EltwiseType type,
}
break;
case NEG:
#pragma omp parallel for
#pragma omp parallel for schedule(runtime)
for (index_t i = 0; i < diff_size * common_size; ++i) {
output[i] = -input0[i];
}
break;
case ABS:
#pragma omp parallel for
#pragma omp parallel for schedule(runtime)
for (index_t i = 0; i < diff_size * common_size; ++i) {
output[i] = std::fabs(input0[i]);
}
break;
case EQUAL:
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2) schedule(runtime)
for (index_t d = 0; d < diff_size; ++d) {
for (index_t i = 0; i < common_size; ++i) {
output[i + d * common_size] =
......@@ -353,7 +353,7 @@ inline void TensorEltwise(const EltwiseType type,
switch (type) {
case SUM:
if (coeff.empty()) {
#pragma omp parallel for
#pragma omp parallel for schedule(runtime)
for (index_t i = 0; i < size; ++i) {
output[i] = input0[i] + input1[i];
}
......@@ -363,7 +363,7 @@ inline void TensorEltwise(const EltwiseType type,
if (swapped) {
std::swap(coeff_copy[0], coeff_copy[1]);
}
#pragma omp parallel for
#pragma omp parallel for schedule(runtime)
for (index_t i = 0; i < size; ++i) {
output[i] = input0[i] * coeff_copy[0] + input1[i] * coeff_copy[1];
}
......@@ -371,20 +371,20 @@ inline void TensorEltwise(const EltwiseType type,
break;
case SUB:
if (!swapped) {
#pragma omp parallel for
#pragma omp parallel for schedule(runtime)
for (index_t i = 0; i < size; ++i) {
output[i] = input0[i] - input1[i];
}
} else {
#pragma omp parallel for
#pragma omp parallel for schedule(runtime)
for (index_t i = 0; i < size; ++i) {
output[i] = input1[i] - input0[i];
}
}
break;
case PROD:
#pragma omp parallel for
#pragma omp parallel for schedule(runtime)
for (index_t i = 0; i < size; ++i) {
output[i] = input0[i] * input1[i];
}
......@@ -392,34 +392,34 @@ inline void TensorEltwise(const EltwiseType type,
break;
case DIV:
if (!swapped) {
#pragma omp parallel for
#pragma omp parallel for schedule(runtime)
for (index_t i = 0; i < size; ++i) {
output[i] = input0[i] / input1[i];
}
} else {
#pragma omp parallel for
#pragma omp parallel for schedule(runtime)
for (index_t i = 0; i < size; ++i) {
output[i] = input1[i] / input0[i];
}
}
break;
case MIN:
#pragma omp parallel for
#pragma omp parallel for schedule(runtime)
for (index_t i = 0; i < size; ++i) {
output[i] = std::min(input0[i], input1[i]);
}
break;
case MAX:
#pragma omp parallel for
#pragma omp parallel for schedule(runtime)
for (index_t i = 0; i < size; ++i) {
output[i] = std::max(input0[i], input1[i]);
}
break;
case SQR_DIFF:
#pragma omp parallel for
#pragma omp parallel for schedule(runtime)
for (index_t i = 0; i < size; ++i) {
output[i] = std::pow(input0[i] - input1[i], 2.f);
}
......@@ -427,7 +427,7 @@ inline void TensorEltwise(const EltwiseType type,
break;
case POW:
if (!swapped) {
#pragma omp parallel for
#pragma omp parallel for schedule(runtime)
for (index_t i = 0; i < size; ++i) {
output[i] = std::pow(input0[i], input1[i]);
}
......@@ -438,19 +438,19 @@ inline void TensorEltwise(const EltwiseType type,
}
break;
case NEG:
#pragma omp parallel for
#pragma omp parallel for schedule(runtime)
for (index_t i = 0; i < size; ++i) {
output[i] = -input0[i];
}
break;
case ABS:
#pragma omp parallel for
#pragma omp parallel for schedule(runtime)
for (index_t i = 0; i < size; ++i) {
output[i] = std::fabs(input0[i]);
}
break;
case EQUAL:
#pragma omp parallel for
#pragma omp parallel for schedule(runtime)
for (index_t i = 0; i < size; ++i) {
output[i] = input0[i] == input1[i];
}
......@@ -472,7 +472,7 @@ inline void TensorScalarEltwise(const EltwiseType type,
switch (type) {
case SUM:
if (coeff.empty()) {
#pragma omp parallel for
#pragma omp parallel for schedule(runtime)
for (index_t i = 0; i < size; ++i) {
output[i] = input0[i] + input1;
}
......@@ -482,7 +482,7 @@ inline void TensorScalarEltwise(const EltwiseType type,
if (swapped) {
std::swap(coeff_copy[0], coeff_copy[1]);
}
#pragma omp parallel for
#pragma omp parallel for schedule(runtime)
for (index_t i = 0; i < size; ++i) {
output[i] = input0[i] * coeff_copy[0] + input1 * coeff_copy[1];
}
......@@ -490,20 +490,20 @@ inline void TensorScalarEltwise(const EltwiseType type,
break;
case SUB:
if (!swapped) {
#pragma omp parallel for
#pragma omp parallel for schedule(runtime)
for (index_t i = 0; i < size; ++i) {
output[i] = input0[i] - input1;
}
} else {
#pragma omp parallel for
#pragma omp parallel for schedule(runtime)
for (index_t i = 0; i < size; ++i) {
output[i] = input1 - input0[i];
}
}
break;
case PROD:
#pragma omp parallel for
#pragma omp parallel for schedule(runtime)
for (index_t i = 0; i < size; ++i) {
output[i] = input0[i] * input1;
}
......@@ -511,34 +511,34 @@ inline void TensorScalarEltwise(const EltwiseType type,
break;
case DIV:
if (!swapped) {
#pragma omp parallel for
#pragma omp parallel for schedule(runtime)
for (index_t i = 0; i < size; ++i) {
output[i] = input0[i] / input1;
}
} else {
#pragma omp parallel for
#pragma omp parallel for schedule(runtime)
for (index_t i = 0; i < size; ++i) {
output[i] = input1 / input0[i];
}
}
break;
case MIN:
#pragma omp parallel for
#pragma omp parallel for schedule(runtime)
for (index_t i = 0; i < size; ++i) {
output[i] = std::min(input0[i], input1);
}
break;
case MAX:
#pragma omp parallel for
#pragma omp parallel for schedule(runtime)
for (index_t i = 0; i < size; ++i) {
output[i] = std::max(input0[i], input1);
}
break;
case SQR_DIFF:
#pragma omp parallel for
#pragma omp parallel for schedule(runtime)
for (index_t i = 0; i < size; ++i) {
output[i] = std::pow(input0[i] - input1, 2.f);
}
......@@ -546,7 +546,7 @@ inline void TensorScalarEltwise(const EltwiseType type,
break;
case POW:
if (!swapped) {
#pragma omp parallel for
#pragma omp parallel for schedule(runtime)
for (index_t i = 0; i < size; ++i) {
output[i] = std::pow(input0[i], input1);
}
......@@ -557,19 +557,19 @@ inline void TensorScalarEltwise(const EltwiseType type,
}
break;
case NEG:
#pragma omp parallel for
#pragma omp parallel for schedule(runtime)
for (index_t i = 0; i < size; ++i) {
output[i] = -input0[i];
}
break;
case ABS:
#pragma omp parallel for
#pragma omp parallel for schedule(runtime)
for (index_t i = 0; i < size; ++i) {
output[i] = std::fabs(input0[i]);
}
break;
case EQUAL:
#pragma omp parallel for
#pragma omp parallel for schedule(runtime)
for (index_t i = 0; i < size; ++i) {
output[i] = input0[i] == input1;
}
......@@ -594,7 +594,7 @@ inline void TensorEltwisePerChannel(const EltwiseType type,
switch (type) {
case SUM:
if (coeff.empty()) {
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2) schedule(runtime)
for (index_t b = 0; b < batch0; ++b) {
for (index_t c = 0; c < channel; ++c) {
const T *in0_ptr = input0 + ((b * channel) + c) * image_size;
......@@ -610,7 +610,7 @@ inline void TensorEltwisePerChannel(const EltwiseType type,
if (swapped) {
std::swap(coeff_copy[0], coeff_copy[1]);
}
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2) schedule(runtime)
for (index_t b = 0; b < batch0; ++b) {
for (index_t c = 0; c < channel; ++c) {
const T *in0_ptr = input0 + ((b * channel) + c) * image_size;
......@@ -626,7 +626,7 @@ inline void TensorEltwisePerChannel(const EltwiseType type,
break;
case SUB:
if (!swapped) {
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2) schedule(runtime)
for (index_t b = 0; b < batch0; ++b) {
for (index_t c = 0; c < channel; ++c) {
const T *in0_ptr = input0 + ((b * channel) + c) * image_size;
......@@ -638,7 +638,7 @@ inline void TensorEltwisePerChannel(const EltwiseType type,
}
}
} else {
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2) schedule(runtime)
for (index_t b = 0; b < batch0; ++b) {
for (index_t c = 0; c < channel; ++c) {
const T *in0_ptr = input0 + ((b * channel) + c) * image_size;
......@@ -652,7 +652,7 @@ inline void TensorEltwisePerChannel(const EltwiseType type,
}
break;
case PROD:
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2) schedule(runtime)
for (index_t b = 0; b < batch0; ++b) {
for (index_t c = 0; c < channel; ++c) {
const T *in0_ptr = input0 + ((b * channel) + c) * image_size;
......@@ -666,7 +666,7 @@ inline void TensorEltwisePerChannel(const EltwiseType type,
break;
case DIV:
if (!swapped) {
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2) schedule(runtime)
for (index_t b = 0; b < batch0; ++b) {
for (index_t c = 0; c < channel; ++c) {
const T *in0_ptr = input0 + ((b * channel) + c) * image_size;
......@@ -678,7 +678,7 @@ inline void TensorEltwisePerChannel(const EltwiseType type,
}
}
} else {
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2) schedule(runtime)
for (index_t b = 0; b < batch0; ++b) {
for (index_t c = 0; c < channel; ++c) {
const T *in0_ptr = input0 + ((b * channel) + c) * image_size;
......@@ -692,7 +692,7 @@ inline void TensorEltwisePerChannel(const EltwiseType type,
}
break;
case MIN:
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2) schedule(runtime)
for (index_t b = 0; b < batch0; ++b) {
for (index_t c = 0; c < channel; ++c) {
const T *in0_ptr = input0 + ((b * channel) + c) * image_size;
......@@ -705,7 +705,7 @@ inline void TensorEltwisePerChannel(const EltwiseType type,
}
break;
case MAX:
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2) schedule(runtime)
for (index_t b = 0; b < batch0; ++b) {
for (index_t c = 0; c < channel; ++c) {
const T *in0_ptr = input0 + ((b * channel) + c) * image_size;
......@@ -718,7 +718,7 @@ inline void TensorEltwisePerChannel(const EltwiseType type,
}
break;
case SQR_DIFF:
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2) schedule(runtime)
for (index_t b = 0; b < batch0; ++b) {
for (index_t c = 0; c < channel; ++c) {
const T *in0_ptr = input0 + ((b * channel) + c) * image_size;
......@@ -732,7 +732,7 @@ inline void TensorEltwisePerChannel(const EltwiseType type,
break;
case POW:
if (!swapped) {
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2) schedule(runtime)
for (index_t b = 0; b < batch0; ++b) {
for (index_t c = 0; c < channel; ++c) {
const T *in0_ptr = input0 + ((b * channel) + c) * image_size;
......@@ -744,7 +744,7 @@ inline void TensorEltwisePerChannel(const EltwiseType type,
}
}
} else {
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2) schedule(runtime)
for (index_t b = 0; b < batch0; ++b) {
for (index_t c = 0; c < channel; ++c) {
const T *in0_ptr = input0 + ((b * channel) + c) * image_size;
......@@ -758,19 +758,19 @@ inline void TensorEltwisePerChannel(const EltwiseType type,
}
break;
case NEG:
#pragma omp parallel for
#pragma omp parallel for schedule(runtime)
for (index_t i = 0; i < batch0 * channel * image_size; ++i) {
output[i] = -input0[i];
}
break;
case ABS:
#pragma omp parallel for
#pragma omp parallel for schedule(runtime)
for (index_t i = 0; i < batch0 * channel * image_size; ++i) {
output[i] = std::fabs(input0[i]);
}
break;
case EQUAL:
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2) schedule(runtime)
for (index_t b = 0; b < batch0; ++b) {
for (index_t c = 0; c < channel; ++c) {
const T *in0_ptr = input0 + ((b * channel) + c) * image_size;
......@@ -991,7 +991,7 @@ class EltwiseOp<DeviceType::CPU, uint8_t> : public Operation {
index_t handled_output_size = 0;
#ifdef MACE_ENABLE_NEON
#pragma omp parallel for
#pragma omp parallel for schedule(runtime)
for (index_t i = handled_output_size; i <= output->size() - 8; i += 8) {
const auto input0_val = vld1_u8(input0_ptr + i);
const auto input1_val = vld1_u8(input1_ptr + i);
......@@ -1037,7 +1037,7 @@ class EltwiseOp<DeviceType::CPU, uint8_t> : public Operation {
}
handled_output_size = output->size() - output->size() % 8;
#endif // NEON
#pragma omp parallel for
#pragma omp parallel for schedule(runtime)
for (index_t i = handled_output_size; i < output->size(); ++i) {
const int32_t offset_input0 = input0_ptr[i] - input0->zero_point();
const int32_t offset_input1 = input1_ptr[i] - input1->zero_point();
......
......@@ -62,7 +62,7 @@ class GatherOp : public Operation {
params->shape().end(), 1, std::multiplies<index_t>());
index_t index_size = indices->size();
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2) schedule(runtime)
for (index_t l = 0; l < lhs_size; ++l) {
for (index_t idx = 0; idx < index_size; ++idx) {
MACE_ASSERT(indices_data[idx] < axis_dim_size, "idx out of bound: ",
......
......@@ -53,7 +53,7 @@ class LocalResponseNormOp<DeviceType::CPU, float> : public Operation {
index_t image_size = height * width;
index_t batch_size = channels * image_size;
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2) schedule(runtime)
for (index_t b = 0; b < batch; ++b) {
for (index_t c = 0; c < channels; ++c) {
const int begin_input_c = std::max(static_cast<index_t>(0),
......
......@@ -133,7 +133,7 @@ class PoolingOp<DeviceType::CPU, float> : public PoolingOpBase {
const index_t in_batch_size = in_shape[1] * in_image_size;
const index_t out_batch_size = out_shape[1] * out_image_size;
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2) schedule(runtime)
for (index_t b = 0; b < out_shape[0]; ++b) {
for (index_t c = 0; c < out_shape[1]; ++c) {
const index_t out_base = b * out_batch_size + c * out_image_size;
......@@ -179,7 +179,7 @@ class PoolingOp<DeviceType::CPU, float> : public PoolingOpBase {
const index_t in_batch_size = in_shape[1] * in_image_size;
const index_t out_batch_size = out_shape[1] * out_image_size;
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2) schedule(runtime)
for (index_t b = 0; b < out_shape[0]; ++b) {
for (index_t c = 0; c < out_shape[1]; ++c) {
const index_t out_base = b * out_batch_size + c * out_image_size;
......@@ -301,7 +301,7 @@ class PoolingOp<DeviceType::CPU, uint8_t> : public PoolingOpBase {
const int *stride_hw,
const int *pad_hw,
uint8_t *output) {
#pragma omp parallel for collapse(3)
#pragma omp parallel for collapse(3) schedule(runtime)
for (index_t b = 0; b < out_shape[0]; ++b) {
for (index_t h = 0; h < out_shape[1]; ++h) {
for (index_t w = 0; w < out_shape[2]; ++w) {
......@@ -358,7 +358,7 @@ class PoolingOp<DeviceType::CPU, uint8_t> : public PoolingOpBase {
const int *stride_hw,
const int *pad_hw,
uint8_t *output) {
#pragma omp parallel for collapse(3)
#pragma omp parallel for collapse(3) schedule(runtime)
for (index_t b = 0; b < out_shape[0]; ++b) {
for (index_t h = 0; h < out_shape[1]; ++h) {
for (index_t w = 0; w < out_shape[2]; ++w) {
......
......@@ -134,7 +134,7 @@ class ReduceMeanOp<DeviceType::CPU, T> : public ReduceMeanOpBase {
}
output_ptr[0] = sum / data_reshape_[0];
} else {
#pragma omp parallel for
#pragma omp parallel for schedule(runtime)
for (int i = 0; i < data_reshape_[0]; ++i) {
output_ptr[i] = input_ptr[i];
}
......@@ -142,7 +142,7 @@ class ReduceMeanOp<DeviceType::CPU, T> : public ReduceMeanOpBase {
break;
case 2:
if (reduce_first_axis_) {
#pragma omp parallel for
#pragma omp parallel for schedule(runtime)
for (int i = 0; i < data_reshape_[1]; ++i) {
for (int j = 0; j < data_reshape_[0]; ++j) {
output_ptr[i] += input_ptr[j * data_reshape_[1] + i];
......@@ -150,7 +150,7 @@ class ReduceMeanOp<DeviceType::CPU, T> : public ReduceMeanOpBase {
output_ptr[i] /= data_reshape_[0];
}
} else {
#pragma omp parallel for
#pragma omp parallel for schedule(runtime)
for (int i = 0; i < data_reshape_[0]; ++i) {
for (int j = 0; j < data_reshape_[1]; ++j) {
output_ptr[i] += input_ptr[i * data_reshape_[1] + j];
......@@ -161,7 +161,7 @@ class ReduceMeanOp<DeviceType::CPU, T> : public ReduceMeanOpBase {
break;
case 3:
if (reduce_first_axis_) {
#pragma omp parallel for
#pragma omp parallel for schedule(runtime)
for (int i = 0; i < data_reshape_[1]; ++i) {
for (int j = 0; j < data_reshape_[2]; ++j) {
for (int k = 0; k < data_reshape_[0]; ++k) {
......@@ -173,7 +173,7 @@ class ReduceMeanOp<DeviceType::CPU, T> : public ReduceMeanOpBase {
output_ptr[i] /= (data_reshape_[0] * data_reshape_[2]);
}
} else {
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2) schedule(runtime)
for (int i = 0; i < data_reshape_[0]; ++i) {
for (int j = 0; j < data_reshape_[2]; ++j) {
for (int k = 0; k < data_reshape_[1]; ++k) {
......@@ -188,7 +188,7 @@ class ReduceMeanOp<DeviceType::CPU, T> : public ReduceMeanOpBase {
break;
case 4:
if (reduce_first_axis_) {
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2) schedule(runtime)
for (int i = 0; i < data_reshape_[1]; ++i) {
for (int j = 0; j < data_reshape_[3]; ++j) {
for (int k = 0; k < data_reshape_[2]; ++k) {
......@@ -203,7 +203,7 @@ class ReduceMeanOp<DeviceType::CPU, T> : public ReduceMeanOpBase {
}
}
} else {
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2) schedule(runtime)
for (int i = 0; i < data_reshape_[0]; ++i) {
for (int j = 0; j < data_reshape_[2]; ++j) {
for (int k = 0; k < data_reshape_[1]; ++k) {
......
......@@ -85,7 +85,7 @@ inline void ResizeImage(const float *images,
const float height_scale,
const float width_scale,
float *output) {
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2) schedule(runtime)
for (index_t b = 0; b < batch_size; ++b) {
for (index_t y = 0; y < out_height; ++y) {
std::vector<float> y_weights;
......
......@@ -95,7 +95,7 @@ inline void ResizeImageNCHW(const T *images,
T *output) {
const CachedInterpolation *xs = xs_vec.data();
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2) schedule(runtime)
for (index_t b = 0; b < batch_size; ++b) {
for (index_t c = 0; c < channels; ++c) {
const T
......@@ -141,7 +141,7 @@ inline void ResizeImageNHWC(const T *images,
for (index_t b = 0; b < batch_size; ++b) {
const T *input_base = images + b * channels * in_height * in_width;
T *output_base = output + b * channels * out_height * out_width;
#pragma omp parallel for
#pragma omp parallel for schedule(runtime)
for (index_t y = 0; y < out_height; ++y) {
const T
*y_lower_input_ptr = input_base + ys[y].lower * in_width * channels;
......
......@@ -252,7 +252,7 @@ void SGemm::RunInternal(const PackedBlock &lhs,
}
if (batch >= MaceOpenMPThreadCount) {
#pragma omp parallel for
#pragma omp parallel for schedule(runtime)
MACE_SGEMM_RUN_PER_BATCH
} else {
MACE_SGEMM_RUN_PER_BATCH
......@@ -279,7 +279,7 @@ void SGemm::RunPerBatch(const float *lhs_data,
// as possible to cache, by tiling lhs by height and rhs by width.
// w: 4
#pragma omp parallel for
#pragma omp parallel for schedule(runtime)
for (index_t bw = 0; bw < block_w; ++bw) {
index_t remain_h = height;
index_t block_h = 0;
......@@ -702,7 +702,7 @@ void SGemm::RunPerBatch(const float *lhs_data,
rhs_data += (width - remain_w) * depth;
// w: 1
#pragma omp parallel for
#pragma omp parallel for schedule(runtime)
for (index_t bw = 0; bw < remain_w; ++bw) {
index_t remain_h = height;
......@@ -923,7 +923,7 @@ void SGemm::Pack(const MatrixMap<const float> &src,
PackPerBatch(src, order, b, packed_data + b * height * width); \
}
if (src.batch() >= MaceOpenMPThreadCount) {
#pragma omp parallel for
#pragma omp parallel for schedule(runtime)
MACE_SGEMM_PACK_PER_BATCH
} else {
MACE_SGEMM_PACK_PER_BATCH
......@@ -945,7 +945,7 @@ void SGemm::UnPack(const PackedBlock &packed_result,
}
if (matrix_map->batch() >= MaceOpenMPThreadCount) {
#pragma omp parallel for
#pragma omp parallel for schedule(runtime)
MACE_SGEMM_UNPACK_PER_BATCH
} else {
MACE_SGEMM_UNPACK_PER_BATCH
......@@ -968,7 +968,7 @@ void SGemm::PackPerBatch(const MatrixMap<const float> &src,
index_t h = 0;
#if defined(MACE_ENABLE_NEON)
#if defined(__aarch64__)
#pragma omp parallel for
#pragma omp parallel for schedule(runtime)
for (index_t ih = h; ih <= height - 8; ih += 8) {
const float *src_data_ptr = src_data + ih * width;
float *packed_data_ptr = packed_data + ih * width;
......@@ -989,7 +989,7 @@ void SGemm::PackPerBatch(const MatrixMap<const float> &src,
}
h += (height - h) / 8 * 8;
#endif
#pragma omp parallel for
#pragma omp parallel for schedule(runtime)
for (index_t ih = h; ih <= height - 4; ih += 4) {
const float *src_data_ptr = src_data + ih * width;
float *packed_data_ptr = packed_data + ih * width;
......@@ -1005,7 +1005,7 @@ void SGemm::PackPerBatch(const MatrixMap<const float> &src,
}
h += (height - h) / 4 * 4;
#endif
#pragma omp parallel for
#pragma omp parallel for schedule(runtime)
for (index_t ih = h; ih < height; ++ih) {
std::copy_n(src_data + ih * width, width, packed_data + ih * width);
}
......@@ -1015,7 +1015,7 @@ void SGemm::PackPerBatch(const MatrixMap<const float> &src,
index_t h = 0;
#if defined(MACE_ENABLE_NEON)
#if defined(__aarch64__)
#pragma omp parallel for
#pragma omp parallel for schedule(runtime)
for (index_t ih = h; ih <= height - 8; ih += 8) {
const float *src_data_ptr = src_data + ih;
float *packed_data_ptr = packed_data + ih * width;
......@@ -1030,7 +1030,7 @@ void SGemm::PackPerBatch(const MatrixMap<const float> &src,
}
h += (height - h) / 8 * 8;
#endif
#pragma omp parallel for
#pragma omp parallel for schedule(runtime)
for (index_t ih = h; ih <= height - 4; ih += 4) {
const float *src_data_ptr = src_data + ih;
float *packed_data_ptr = packed_data + ih * width;
......@@ -1043,7 +1043,7 @@ void SGemm::PackPerBatch(const MatrixMap<const float> &src,
}
h += (height - h) / 4 * 4;
#endif
#pragma omp parallel for
#pragma omp parallel for schedule(runtime)
for (index_t ih = h; ih < height; ++ih) {
const float *src_data_ptr = src_data + ih;
float *packed_data_ptr = packed_data + ih * width;
......@@ -1056,7 +1056,7 @@ void SGemm::PackPerBatch(const MatrixMap<const float> &src,
// This is for packing no-transpose rhs.
index_t w = 0;
#if defined(MACE_ENABLE_NEON)
#pragma omp parallel for
#pragma omp parallel for schedule(runtime)
for (index_t iw = w; iw <= width - 4; iw += 4) {
const float *src_data_ptr = src_data + iw;
float *packed_data_ptr = packed_data + iw * height;
......@@ -1069,7 +1069,7 @@ void SGemm::PackPerBatch(const MatrixMap<const float> &src,
}
w += (width - w) / 4 * 4;
#endif
#pragma omp parallel for
#pragma omp parallel for schedule(runtime)
for (index_t iw = w; iw < width; ++iw) {
const float *src_data_ptr = src_data + iw;
float *packed_data_ptr = packed_data + iw * height;
......@@ -1082,7 +1082,7 @@ void SGemm::PackPerBatch(const MatrixMap<const float> &src,
// This is for packing transpose-needed rhs.
index_t w = 0;
#if defined(MACE_ENABLE_NEON)
#pragma omp parallel for
#pragma omp parallel for schedule(runtime)
for (index_t iw = w; iw <= width - 4; iw += 4) {
const float *src_data_ptr = src_data + iw * height;
float *packed_data_ptr = packed_data + iw * height;
......@@ -1098,7 +1098,7 @@ void SGemm::PackPerBatch(const MatrixMap<const float> &src,
}
w += (width - w) / 4 * 4;
#endif
#pragma omp parallel for
#pragma omp parallel for schedule(runtime)
for (index_t iw = w; iw < width; ++iw) {
std::copy_n(src_data + iw * height, height, packed_data + iw * height);
}
......@@ -1118,7 +1118,7 @@ void SGemm::UnPackPerBatch(const float *packed_data,
// This is for non-transposed result
index_t w = 0;
#if defined(MACE_ENABLE_NEON)
#pragma omp parallel for
#pragma omp parallel for schedule(runtime)
for (index_t iw = w; iw <= width - 4; iw += 4) {
const float *packed_data_ptr = packed_data + iw * height;
float *unpacked_data_ptr = unpacked_data + iw;
......@@ -1131,7 +1131,7 @@ void SGemm::UnPackPerBatch(const float *packed_data,
}
w += (width - w) / 4 * 4;
#endif
#pragma omp parallel for
#pragma omp parallel for schedule(runtime)
for (index_t iw = w; iw < width; ++iw) {
const float *packed_data_ptr = packed_data + iw * height;
float *unpacked_data_ptr = unpacked_data + iw;
......@@ -1143,7 +1143,7 @@ void SGemm::UnPackPerBatch(const float *packed_data,
// This is for transposed result
index_t w = 0;
#if defined(MACE_ENABLE_NEON)
#pragma omp parallel for
#pragma omp parallel for schedule(runtime)
for (index_t iw = w; iw <= width - 4; iw += 4) {
const float *packed_data_ptr = packed_data + iw * height;
float *unpacked_data_ptr = unpacked_data + iw * height;
......@@ -1159,7 +1159,7 @@ void SGemm::UnPackPerBatch(const float *packed_data,
}
w += (width - w) / 4 * 4;
#endif
#pragma omp parallel for
#pragma omp parallel for schedule(runtime)
for (index_t iw = w; iw < width; ++iw) {
std::copy_n(
packed_data + iw * height, height, unpacked_data + iw * height);
......
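The SGemm comments in this file describe keeping operands cache-resident by tiling the lhs by height and the rhs by width (strips of 4 plus a scalar tail). A minimal cache-blocking sketch of that idea, not MACE's actual kernel (function name, layouts, and block size here are illustrative assumptions):

#include <algorithm>
#include <cstddef>
#include <vector>

// C (height x width) += A (height x depth) * B (depth x width), row-major.
// B is walked in width strips of 4 so one narrow strip stays in cache while
// every row of A streams past it, echoing the "rhs by width" tiling above.
void BlockedGemm(const std::vector<float> &A, const std::vector<float> &B,
                 std::vector<float> *C, std::size_t height, std::size_t depth,
                 std::size_t width) {
  const std::size_t kWidthBlock = 4;
  for (std::size_t bw = 0; bw < width; bw += kWidthBlock) {
    const std::size_t w_end = std::min(width, bw + kWidthBlock);
    for (std::size_t h = 0; h < height; ++h) {
      for (std::size_t k = 0; k < depth; ++k) {
        const float a = A[h * depth + k];
        for (std::size_t w = bw; w < w_end; ++w) {
          (*C)[h * width + w] += a * B[k * width + w];
        }
      }
    }
  }
}

Walking B in narrow width strips means the strip loaded for one row of A is reused by every following row before it is evicted.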
......@@ -59,7 +59,7 @@ class SoftmaxOp<DeviceType::CPU, float> : public Operation {
const index_t batch_size = class_count * class_size;
for (index_t b = 0; b < batch; ++b) {
#pragma omp parallel for
#pragma omp parallel for schedule(runtime)
for (index_t k = 0; k < class_size; ++k) {
const float *input_ptr = input_data + b * batch_size + k;
float *output_ptr = output_data + b * batch_size + k;
......@@ -94,7 +94,7 @@ class SoftmaxOp<DeviceType::CPU, float> : public Operation {
} else if (input->dim_size() == 2) { // normal 2d softmax
const index_t class_size = input->dim(0);
const index_t class_count = input->dim(1);
#pragma omp parallel for
#pragma omp parallel for schedule(runtime)
for (index_t k = 0; k < class_size; ++k) {
const float *input_ptr = input_data + k * class_count;
float *output_ptr = output_data + k * class_count;
......@@ -172,7 +172,7 @@ class SoftmaxOp<DeviceType::CPU, uint8_t> : public Operation {
// If depth is short, do it using float32. Float computation should not
// be here, but as long as it is on CPU, it is fine.
if (depth < 32) {
#pragma omp parallel for
#pragma omp parallel for schedule(runtime)
for (index_t b = 0; b < batch; ++b) {
const uint8_t *input_ptr = input_data + b * depth;
uint8_t *output_ptr = output_data + b * depth;
......@@ -201,7 +201,7 @@ class SoftmaxOp<DeviceType::CPU, uint8_t> : public Operation {
(1ll << 31) - 1.0));
int32_t input_delta_limit = -((1ll << 31) - 1) / scale_q;
#pragma omp parallel for
#pragma omp parallel for schedule(runtime)
for (index_t b = 0; b < batch; ++b) {
const uint8_t *input_ptr = input_data + b * depth;
uint8_t *output_ptr = output_data + b * depth;
......
......@@ -129,7 +129,7 @@ class SpaceToBatchNDOp<DeviceType::CPU, float> : public SpaceToBatchOpBase {
std::max(static_cast<index_t>(1), 8 * 1024 / block_shape_w / in_width);
// make channel outter loop so we can make best use of cache
#pragma omp parallel for collapse(3)
#pragma omp parallel for collapse(3) schedule(runtime)
for (index_t c = 0; c < channels; ++c) {
for (index_t block_h = 0; block_h < out_height;
block_h += block_h_size) {
......@@ -238,7 +238,7 @@ class SpaceToBatchNDOp<DeviceType::CPU, uint8_t> : public SpaceToBatchOpBase {
index_t out_width = batch_tensor->dim(2);
index_t channels = batch_tensor->dim(3);
#pragma omp parallel for
#pragma omp parallel for schedule(runtime)
for (index_t b = 0; b < out_batches; ++b) {
const index_t in_b = b % in_batches;
const index_t tile_index = b / in_batches;
......
......@@ -64,7 +64,7 @@ class SqrDiffMeanOp : public Operation {
const index_t img_size = input0->dim(2) * input0->dim(3);
const index_t bc = input0->dim(0) * input0->dim(1);
#pragma omp parallel for
#pragma omp parallel for schedule(runtime)
for (int i = 0; i < bc; ++i) {
for (int j = 0; j < img_size; ++j) {
T diff = input_ptr0[i * img_size + j] - input_ptr1[i];
......
......@@ -48,10 +48,28 @@ enum GPUPriorityHint {
PRIORITY_HIGH = 3
};
// AFFINITY_NONE: initiate 'num_threads_hint' threads with no affinity
// scheduled.
// If 'num_threads_hint' is -1 or greater than number of available cores,
// 'num_threads_hint' will be reset to number of available cores.
// AFFINITY_BIG_ONLY: all available big cores are used, and number of threads
// is equal to numbers of available big cores.
// AFFINITY_LITTLE_ONLY: all available little cores are used, and number of
// threads is equal to numbers of available little cores.
// AFFINITY_HIGH_PERFORMANCE: initiate 'num_threads_hint' threads on different
// cores with top-num_threads_hint frequencies.
// If 'num_threads_hint' is -1 or greater than number of available cores,
// 'num_threads_hint' will be reset to number of available cores.
// AFFINITY_POWER_SAVE: initiate 'num_threads_hint' threads on different
// cores with bottom-num_threads_hint frequencies.
// If 'num_threads_hint' is -1 or greater than number of available cores,
// 'num_threads_hint' will be reset to number of available cores.
enum CPUAffinityPolicy {
AFFINITY_NONE = 0,
AFFINITY_BIG_ONLY = 1,
AFFINITY_LITTLE_ONLY = 2,
AFFINITY_HIGH_PERFORMANCE = 3,
AFFINITY_POWER_SAVE = 4,
};
struct CallStats {
......
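These policy comments correspond to the new selection logic in cpu_runtime.cc: cores are sorted by max frequency, HIGH_PERFORMANCE / POWER_SAVE take the top or bottom num_threads_hint cores, and BIG_ONLY / LITTLE_ONLY take every core in the fastest or slowest cluster. A small standalone sketch of the cluster-counting step (illustrative only, not the library code):

#include <algorithm>
#include <cstddef>
#include <vector>

struct CoreFreq {
  std::size_t core_id;
  float max_freq;
};

// Return the ids of every core in the fastest (want_big) or slowest cluster,
// i.e. all cores whose max frequency equals the extreme one after sorting.
std::vector<std::size_t> SelectCluster(std::vector<CoreFreq> cores,
                                       bool want_big) {
  std::sort(cores.begin(), cores.end(),
            [want_big](const CoreFreq &lhs, const CoreFreq &rhs) {
              return want_big ? lhs.max_freq > rhs.max_freq
                              : lhs.max_freq < rhs.max_freq;
            });
  std::vector<std::size_t> ids;
  for (const CoreFreq &c : cores) {
    if (c.max_freq != cores[0].max_freq) break;  // left the first cluster
    ids.push_back(c.core_id);
  }
  return ids;
}

For example, cores {(0, 1800), (1, 1800), (4, 2400), (5, 2400)} with want_big = true yield ids {4, 5}.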
......@@ -99,7 +99,7 @@ inline void QuantizeWithScaleAndZeropoint(const float *input,
int32_t zero_point,
T *output) {
float recip_scale = 1 / scale;
#pragma omp parallel for
#pragma omp parallel for schedule(runtime)
for (int i = 0; i < size; ++i) {
output[i] = Saturate<T>(roundf(zero_point + recip_scale * input[i]));
}
......@@ -128,7 +128,7 @@ inline void Dequantize(const T *input,
const float scale,
const int32_t zero_point,
float *output) {
#pragma omp parallel for
#pragma omp parallel for schedule(runtime)
for (int i = 0; i < size; ++i) {
output[i] = scale * (input[i] - zero_point);
}
......
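The helpers above implement the usual affine quantization scheme: q = Saturate(round(zero_point + input / scale)) on the way in, and input ≈ scale * (q - zero_point) on the way out. A tiny worked sketch with made-up uint8 parameters (scale and zero point are illustrative, not taken from the diff):

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>

int main() {
  const float scale = 0.05f;        // illustrative values
  const int32_t zero_point = 128;

  const float real = 1.0f;
  // Quantize: roundf(zero_point + (1 / scale) * input), then saturate to uint8.
  int32_t q = static_cast<int32_t>(std::roundf(zero_point + real / scale));
  q = std::min<int32_t>(255, std::max<int32_t>(0, q));
  // Dequantize: scale * (q - zero_point).
  const float back = scale * (q - zero_point);

  std::printf("real=%.3f -> q=%d -> dequantized=%.3f\n", real, q, back);
  return 0;
}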