diff --git a/mace/core/runtime/cpu/cpu_runtime.cc b/mace/core/runtime/cpu/cpu_runtime.cc
index 789f2a5c1e2ccf0f87f8fbf03c71a22d2dec76cf..ce50595412c7b24a148c02b7b261d20f344a9c72 100644
--- a/mace/core/runtime/cpu/cpu_runtime.cc
+++ b/mace/core/runtime/cpu/cpu_runtime.cc
@@ -36,45 +36,98 @@ namespace mace {
 
 int MaceOpenMPThreadCount = 1;
 
-namespace {
+struct CPUFreq {
+  size_t core_id;
+  float freq;
+};
 
+namespace {
+#if defined(__ANDROID__)
 int GetCPUCount() {
-  char path[64];
   int cpu_count = 0;
-  int result = 0;
-
-  while (true) {
-    snprintf(path, sizeof(path), "/sys/devices/system/cpu/cpu%d", cpu_count);
-    result = access(path, F_OK);
-    if (result != 0) {
-      if (errno != ENOENT) {
-        LOG(ERROR) << "Access " << path << " failed: " << strerror(errno);
-      }
-      return cpu_count;
+  std::string cpu_sys_conf = "/proc/cpuinfo";
+  std::ifstream f(cpu_sys_conf);
+  if (!f.is_open()) {
+    LOG(ERROR) << "failed to open " << cpu_sys_conf;
+    return -1;
+  }
+  std::string line;
+  const std::string processor_key = "processor";
+  while (std::getline(f, line)) {
+    if (line.size() >= processor_key.size()
+        && line.compare(0, processor_key.size(), processor_key) == 0) {
+      ++cpu_count;
     }
-    cpu_count++;
   }
+  if (f.bad()) {
+    LOG(ERROR) << "failed to read " << cpu_sys_conf;
+  }
+  if (!f.eof()) {
+    LOG(ERROR) << "failed to read end of " << cpu_sys_conf;
+  }
+  f.close();
+  VLOG(2) << "CPU cores: " << cpu_count;
+  return cpu_count;
 }
+#endif
 
-int GetCPUMaxFreq(int cpu_id) {
-  char path[64];
-  snprintf(path, sizeof(path),
-           "/sys/devices/system/cpu/cpu%d/cpufreq/cpuinfo_max_freq",
-           cpu_id);
-
-  FILE *fp = fopen(path, "rb");
-  if (!fp) {
-    LOG(WARNING) << "File: " << path << " not exists.";
-    return 0;
+int GetCPUMaxFreq(std::vector<float> *max_freqs) {
+#if defined(__ANDROID__)
+  int cpu_count = GetCPUCount();
+  for (int cpu_id = 0; cpu_id < cpu_count; ++cpu_id) {
+    std::string cpuinfo_max_freq_sys_conf = MakeString(
+        "/sys/devices/system/cpu/cpu",
+        cpu_id,
+        "/cpufreq/cpuinfo_max_freq");
+    std::ifstream f(cpuinfo_max_freq_sys_conf);
+    if (!f.is_open()) {
+      LOG(ERROR) << "failed to open " << cpuinfo_max_freq_sys_conf;
+      return -1;
+    }
+    std::string line;
+    if (std::getline(f, line)) {
+      float freq = atof(line.c_str());
+      max_freqs->push_back(freq);
+    }
+    if (f.bad()) {
+      LOG(ERROR) << "failed to read " << cpuinfo_max_freq_sys_conf;
+    }
+    f.close();
+  }
+#else
+  std::string cpu_sys_conf = "/proc/cpuinfo";
+  std::ifstream f(cpu_sys_conf);
+  if (!f.is_open()) {
+    LOG(ERROR) << "failed to open " << cpu_sys_conf;
+    return -1;
   }
+  std::string line;
+  const std::string freq_key = "cpu MHz";
+  while (std::getline(f, line)) {
+    if (line.size() >= freq_key.size()
+        && line.compare(0, freq_key.size(), freq_key) == 0) {
+      size_t pos = line.find(":");
+      if (pos != std::string::npos) {
+        std::string freq_str = line.substr(pos + 1);
+        float freq = atof(freq_str.c_str());
+        max_freqs->push_back(freq);
+      }
+    }
+  }
+  if (f.bad()) {
+    LOG(ERROR) << "failed to read " << cpu_sys_conf;
+  }
+  if (!f.eof()) {
+    LOG(ERROR) << "failed to read end of " << cpu_sys_conf;
+  }
+  f.close();
+#endif
 
-  int freq = 0;
-  int items_read = fscanf(fp, "%d", &freq);
-  if (items_read != 1) {
-    LOG(WARNING) << "Read file: " << path << " failed.";
+  for (float freq : *max_freqs) {
+    VLOG(2) << "CPU freq: " << freq;
   }
-  fclose(fp);
-  return freq;
+
+  return 0;
 }
 
 MaceStatus SetThreadAffinity(cpu_set_t mask) {
@@ -93,51 +146,14 @@ MaceStatus SetThreadAffinity(cpu_set_t mask) {
   }
 }
 
-MaceStatus GetCPUBigLittleCoreIDs(std::vector<int> *big_core_ids,
-                                  std::vector<int> *little_core_ids) {
-  MACE_CHECK_NOTNULL(big_core_ids);
-  MACE_CHECK_NOTNULL(little_core_ids);
-  int cpu_count = GetCPUCount();
-  std::vector<int> cpu_max_freq(cpu_count);
-
-  // set cpu max frequency
-  for (int i = 0; i < cpu_count; ++i) {
-    cpu_max_freq[i] = GetCPUMaxFreq(i);
-    if (cpu_max_freq[i] == 0) {
-      LOG(WARNING) << "Cannot get CPU" << i
-                   << "'s max frequency info, maybe it is offline.";
-      return MaceStatus(MaceStatus::MACE_INVALID_ARGS,
-                        "Cannot get CPU's max frequency info,"
-                        " maybe it is offline.");
-    }
-  }
-
-  int big_core_freq =
-      *(std::max_element(cpu_max_freq.begin(), cpu_max_freq.end()));
-  int little_core_freq =
-      *(std::min_element(cpu_max_freq.begin(), cpu_max_freq.end()));
-
-  big_core_ids->reserve(cpu_count);
-  little_core_ids->reserve(cpu_count);
-  for (int i = 0; i < cpu_count; ++i) {
-    if (cpu_max_freq[i] == little_core_freq) {
-      little_core_ids->push_back(i);
-    }
-    if (cpu_max_freq[i] == big_core_freq) {
-      big_core_ids->push_back(i);
-    }
-  }
-
-  return MaceStatus::MACE_SUCCESS;
-}
-
 MaceStatus SetOpenMPThreadsAndAffinityCPUs(int omp_num_threads,
-                                           const std::vector<int> &cpu_ids) {
+                                           const std::vector<size_t> &cpu_ids) {
   MaceOpenMPThreadCount = omp_num_threads;
 
 #ifdef MACE_ENABLE_OPENMP
   VLOG(1) << "Set OpenMP threads number: " << omp_num_threads
           << ", CPU core IDs: " << MakeString(cpu_ids);
+  omp_set_schedule(omp_sched_guided, 1);
   omp_set_num_threads(omp_num_threads);
 #else
   MACE_UNUSED(omp_num_threads);
@@ -174,55 +190,90 @@ MaceStatus SetOpenMPThreadsAndAffinityCPUs(int omp_num_threads,
 }  // namespace
 
 MaceStatus CPURuntime::SetOpenMPThreadsAndAffinityPolicy(
-    int omp_num_threads_hint,
+    int num_threads_hint,
     CPUAffinityPolicy policy,
     void *gemm_context) {
+  // get cpu frequency info
+  std::vector<float> cpu_max_freqs;
+  if (GetCPUMaxFreq(&cpu_max_freqs) == -1 || cpu_max_freqs.size() == 0) {
+    return MaceStatus::MACE_INVALID_ARGS;
+  }
+
+  std::vector<CPUFreq> cpu_freq(cpu_max_freqs.size());
+  for (size_t i = 0; i < cpu_max_freqs.size(); ++i) {
+    cpu_freq[i].core_id = i;
+    cpu_freq[i].freq = cpu_max_freqs[i];
+  }
+  if (policy == CPUAffinityPolicy::AFFINITY_POWER_SAVE ||
+      policy == CPUAffinityPolicy::AFFINITY_LITTLE_ONLY) {
+    std::sort(cpu_freq.begin(),
+              cpu_freq.end(),
+              [=](const CPUFreq &lhs, const CPUFreq &rhs) {
+                return lhs.freq < rhs.freq;
+              });
+  } else if (policy == CPUAffinityPolicy::AFFINITY_HIGH_PERFORMANCE ||
+             policy == CPUAffinityPolicy::AFFINITY_BIG_ONLY) {
+    std::sort(cpu_freq.begin(),
+              cpu_freq.end(),
+              [](const CPUFreq &lhs, const CPUFreq &rhs) {
+                return lhs.freq > rhs.freq;
+              });
+  }
+
+  int cpu_count = static_cast<int>(cpu_freq.size());
+  if (num_threads_hint <= 0 || num_threads_hint > cpu_count) {
+    num_threads_hint = cpu_count;
+  }
+
   if (policy == CPUAffinityPolicy::AFFINITY_NONE) {
 #ifdef MACE_ENABLE_QUANTIZE
     if (gemm_context) {
       static_cast<gemmlowp::GemmContext*>(gemm_context)->set_max_num_threads(
-          std::max(0, omp_num_threads_hint));
+          num_threads_hint);
     }
 #else
     MACE_UNUSED(gemm_context);
 #endif  // MACE_ENABLE_QUANTIZE
 
 #ifdef MACE_ENABLE_OPENMP
-    if (omp_num_threads_hint > 0) {
-      omp_set_num_threads(std::min(omp_num_threads_hint, omp_get_num_procs()));
-    }
+    omp_set_num_threads(num_threads_hint);
 #else
     LOG(WARNING) << "Set OpenMP threads number failed: OpenMP not enabled.";
 #endif
     return MaceStatus::MACE_SUCCESS;
   }
 
-  std::vector<int> big_core_ids;
-  std::vector<int> little_core_ids;
-  MaceStatus res = GetCPUBigLittleCoreIDs(&big_core_ids, &little_core_ids);
-  if (res != MaceStatus::MACE_SUCCESS) {
-    return res;
-  }
-
-  std::vector<int> use_cpu_ids;
-  if (policy == CPUAffinityPolicy::AFFINITY_BIG_ONLY) {
-    use_cpu_ids = std::move(big_core_ids);
+  // decide num of cores to use
+  int cores_to_use = 0;
+  if (policy == CPUAffinityPolicy::AFFINITY_BIG_ONLY
+      || policy == CPUAffinityPolicy::AFFINITY_LITTLE_ONLY) {
+    for (size_t i = 0; i < cpu_max_freqs.size(); ++i) {
+      if (cpu_freq[i].freq != cpu_freq[0].freq) {
+        break;
+      }
+      ++cores_to_use;
+    }
+    num_threads_hint = cores_to_use;
   } else {
-    use_cpu_ids = std::move(little_core_ids);
+    cores_to_use = num_threads_hint;
   }
 
-  if (omp_num_threads_hint <= 0 ||
-      omp_num_threads_hint > static_cast<int>(use_cpu_ids.size())) {
-    omp_num_threads_hint = use_cpu_ids.size();
+  VLOG(2) << "Use " << num_threads_hint << " threads";
+  std::vector<size_t> cpu_ids(cores_to_use);
+  for (int i = 0; i < cores_to_use; ++i) {
+    VLOG(2) << "Bind thread to core: " << cpu_freq[i].core_id << " with freq "
+            << cpu_freq[i].freq;
+    cpu_ids[i] = cpu_freq[i].core_id;
   }
 
 #ifdef MACE_ENABLE_QUANTIZE
   if (gemm_context) {
     static_cast<gemmlowp::GemmContext*>(gemm_context)->set_max_num_threads(
-        omp_num_threads_hint);
+        num_threads_hint);
   }
 #endif  // MACE_ENABLE_QUANTIZE
 
-  return SetOpenMPThreadsAndAffinityCPUs(omp_num_threads_hint, use_cpu_ids);
+  return SetOpenMPThreadsAndAffinityCPUs(num_threads_hint, cpu_ids);
 }
 
 }  // namespace mace
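Note on the cpu_runtime.cc rewrite above: GetCPUBigLittleCoreIDs is gone. The runtime now gathers one max frequency per core (cpuinfo_max_freq under sysfs on Android, the "cpu MHz" fields of /proc/cpuinfo elsewhere), sorts the cores by that frequency, and for the BIG_ONLY/LITTLE_ONLY policies keeps the leading run of cores that share the first (highest or lowest) frequency. A minimal standalone sketch of that selection step, using an invented eight-core big.LITTLE frequency table instead of real sysfs data:

#include <algorithm>
#include <cstdio>
#include <vector>

struct CPUFreq {
  size_t core_id;
  float freq;
};

int main() {
  // Hypothetical big.LITTLE layout: cores 0-3 at 1.8 GHz, 4-7 at 2.4 GHz
  // (cpuinfo_max_freq reports kHz, hence the e6 values).
  std::vector<CPUFreq> cpu_freq;
  for (size_t i = 0; i < 8; ++i) {
    cpu_freq.push_back({i, i < 4 ? 1.8e6f : 2.4e6f});
  }
  // AFFINITY_BIG_ONLY: sort descending by max frequency...
  std::sort(cpu_freq.begin(), cpu_freq.end(),
            [](const CPUFreq &lhs, const CPUFreq &rhs) {
              return lhs.freq > rhs.freq;
            });
  // ...then take the leading run of cores that share the top frequency.
  int cores_to_use = 0;
  for (size_t i = 0; i < cpu_freq.size(); ++i) {
    if (cpu_freq[i].freq != cpu_freq[0].freq) break;
    ++cores_to_use;
  }
  std::printf("using %d big cores:", cores_to_use);
  for (int i = 0; i < cores_to_use; ++i) {
    std::printf(" %zu", cpu_freq[i].core_id);
  }
  std::printf("\n");
  return 0;
}

One implication worth noting: on a device whose clusters happen to report the same top frequency, the equal-frequency run spans all cores, so BIG_ONLY degenerates to using every core.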
diff --git a/mace/ops/activation.h b/mace/ops/activation.h
index 2c9a18618da776e5d004e7c01012117b4a94afb0..36fb45d6bdeef39eb9214d398a5cd33fea7c4a07 100644
--- a/mace/ops/activation.h
+++ b/mace/ops/activation.h
@@ -66,26 +66,26 @@ void DoActivation(const T *input_ptr,
     case NOOP:
       break;
     case RELU:
-#pragma omp parallel for
+#pragma omp parallel for schedule(runtime)
       for (index_t i = 0; i < size; ++i) {
         output_ptr[i] = std::max(input_ptr[i], static_cast<T>(0));
       }
       break;
     case RELUX:
-#pragma omp parallel for
+#pragma omp parallel for schedule(runtime)
       for (index_t i = 0; i < size; ++i) {
         output_ptr[i] = std::min(std::max(input_ptr[i], static_cast<T>(0)),
                                  static_cast<T>(relux_max_limit));
       }
       break;
     case TANH:
-#pragma omp parallel for
+#pragma omp parallel for schedule(runtime)
       for (index_t i = 0; i < size; ++i) {
         output_ptr[i] = std::tanh(input_ptr[i]);
       }
       break;
     case SIGMOID:
-#pragma omp parallel for
+#pragma omp parallel for schedule(runtime)
       for (index_t i = 0; i < size; ++i) {
         output_ptr[i] = 1 / (1 + std::exp(-input_ptr[i]));
       }
@@ -111,13 +111,13 @@ inline void DoActivation(const float *input_ptr,
       ReluxNeon(input_ptr, relux_max_limit, size, output_ptr);
       break;
     case TANH:
-#pragma omp parallel for
+#pragma omp parallel for schedule(runtime)
       for (index_t i = 0; i < size; ++i) {
         output_ptr[i] = std::tanh(input_ptr[i]);
       }
       break;
     case SIGMOID:
-#pragma omp parallel for
+#pragma omp parallel for schedule(runtime)
       for (index_t i = 0; i < size; ++i) {
         output_ptr[i] = 1 / (1 + std::exp(-input_ptr[i]));
       }
@@ -134,7 +134,7 @@ void PReLUActivation(const T *input_ptr,
                      const index_t inner_size,
                      const T *alpha_ptr,
                      T *output_ptr) {
-#pragma omp parallel for collapse(3)
+#pragma omp parallel for collapse(3) schedule(runtime)
   for (index_t i = 0; i < outer_size; ++i) {
     for (index_t chan_idx = 0; chan_idx < input_chan; ++chan_idx) {
       for (index_t j = 0; j < inner_size; ++j) {
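Note: every #pragma omp parallel for in the operator hunks below gains schedule(runtime), which defers the scheduling policy to whatever omp_set_schedule() installed; combined with the omp_set_schedule(omp_sched_guided, 1) call added above, all of these loops switch from the implementation default (usually static) to guided self-scheduling. A toy, self-contained illustration of the mechanism (not MACE code; compile with -fopenmp):

#include <omp.h>
#include <cstdio>
#include <vector>

int main() {
  // The one-time call made in SetOpenMPThreadsAndAffinityCPUs().
  omp_set_schedule(omp_sched_guided, 1);

  std::vector<float> data(1024, 1.0f);
  // schedule(runtime) resolves to "guided, chunk 1" at this point.
#pragma omp parallel for schedule(runtime)
  for (int i = 0; i < 1024; ++i) {
    data[i] *= 2.0f;
  }

  omp_sched_t kind;
  int chunk;
  omp_get_schedule(&kind, &chunk);
  std::printf("kind=%d chunk=%d data[0]=%g\n",
              static_cast<int>(kind), chunk, data[0]);
  return 0;
}

Guided scheduling hands out progressively smaller chunks, which tolerates the uneven per-iteration cost and asymmetric cores of big.LITTLE parts better than a fixed static split.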
diff --git a/mace/ops/argmax.cc b/mace/ops/argmax.cc
index 8f8419b7c839dfb5fcae4500b6b109fdc30d1b9a..2b3e2f0be6aa223ef4eb8d0c47aa5733bd13cac6 100644
--- a/mace/ops/argmax.cc
+++ b/mace/ops/argmax.cc
@@ -59,7 +59,7 @@ class ArgMaxOp : public Operation {
     index_t outer_size = output->size();
     index_t inner_size = input->dim(axis_value);
 
-#pragma omp parallel for
+#pragma omp parallel for schedule(runtime)
     for (index_t i = 0; i < outer_size; ++i) {
       int idx = 0;
       T max_value = std::numeric_limits<T>::lowest();
diff --git a/mace/ops/arm/activation_neon.cc b/mace/ops/arm/activation_neon.cc
index 44b492a42d7351867391410eb31fd9aaab5ffe35..ec9ba357425ac9c6603b08bac604b6d7f79c57f4 100644
--- a/mace/ops/arm/activation_neon.cc
+++ b/mace/ops/arm/activation_neon.cc
@@ -25,7 +25,7 @@ namespace ops {
 void ReluNeon(const float *input, const index_t size, float *output) {
 #if defined(MACE_ENABLE_NEON)
   float32x4_t vzero = vdupq_n_f32(0.f);
-#pragma omp parallel for
+#pragma omp parallel for schedule(runtime)
   for (index_t i = 0; i <= size - 4; i += 4) {
     float32x4_t v = vld1q_f32(input + i);
     v = vmaxq_f32(v, vzero);
@@ -36,7 +36,7 @@ void ReluNeon(const float *input, const index_t size, float *output) {
     output[i] = std::max(input[i], 0.f);
   }
 #else
-#pragma omp parallel for
+#pragma omp parallel for schedule(runtime)
   for (index_t i = 0; i < size; ++i) {
     output[i] = std::max(input[i], 0.f);
   }
@@ -48,7 +48,7 @@ void ReluxNeon(const float *input, const float limit,
 #if defined(MACE_ENABLE_NEON)
   float32x4_t vzero = vdupq_n_f32(0.f);
   float32x4_t vlimit = vdupq_n_f32(limit);
-#pragma omp parallel for
+#pragma omp parallel for schedule(runtime)
   for (index_t i = 0; i <= size - 4; i += 4) {
     float32x4_t v = vld1q_f32(input + i);
     v = vmaxq_f32(v, vzero);
@@ -60,7 +60,7 @@ void ReluxNeon(const float *input, const float limit,
     output[i] = std::min(std::max(input[i], 0.f), limit);
   }
 #else
-#pragma omp parallel for
+#pragma omp parallel for schedule(runtime)
   for (index_t i = 0; i < size; ++i) {
     output[i] = std::min(std::max(input[i], 0.f), limit);
   }
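Note: the NEON kernels keep their two-phase shape, a vector loop striding four floats at a time followed by a scalar tail for the remainder, and only the vector loop carries the OpenMP pragma. A self-contained sketch of the ReluNeon pattern above, assuming an ARM toolchain with NEON and OpenMP enabled (index_t here is a stand-in for MACE's signed index type):

#include <arm_neon.h>
#include <algorithm>
#include <cstdint>
#include <cstdio>

typedef int64_t index_t;  // signed, so size - 4 is safe when size < 4

void ReluSketch(const float *input, const index_t size, float *output) {
  const float32x4_t vzero = vdupq_n_f32(0.f);
#pragma omp parallel for schedule(runtime)
  for (index_t i = 0; i <= size - 4; i += 4) {  // vector body, 4 lanes
    float32x4_t v = vld1q_f32(input + i);
    v = vmaxq_f32(v, vzero);
    vst1q_f32(output + i, v);
  }
  for (index_t i = size - (size % 4); i < size; ++i) {  // scalar tail
    output[i] = std::max(input[i], 0.f);
  }
}

int main() {
  float in[6] = {-1.f, 2.f, -3.f, 4.f, -5.f, 6.f};
  float out[6] = {0.f};
  ReluSketch(in, 6, out);
  for (float v : out) std::printf("%g ", v);  // 0 2 0 4 0 6
  std::printf("\n");
  return 0;
}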
diff --git a/mace/ops/arm/conv_2d_neon_15x1.cc b/mace/ops/arm/conv_2d_neon_15x1.cc
index a4bae4e9835f571066b0c53ed9a9ddda647f4c4d..553de92e6cb28ba492919dcc5bb6e93c7ba2f6bf 100644
--- a/mace/ops/arm/conv_2d_neon_15x1.cc
+++ b/mace/ops/arm/conv_2d_neon_15x1.cc
@@ -60,7 +60,7 @@ void Conv2dNeonK15x1S1(const float *input,
   const index_t tile_width =
       out_shape[1] < 4 ? RoundUpDiv4(out_shape[3]) : out_shape[3];
 
-#pragma omp parallel for collapse(3)
+#pragma omp parallel for collapse(3) schedule(runtime)
   for (index_t b = 0; b < out_shape[0]; ++b) {
     for (index_t m = 0; m < out_shape[1]; ++m) {
       for (index_t w = 0; w < out_shape[3]; w += tile_width) {
diff --git a/mace/ops/arm/conv_2d_neon_1x15.cc b/mace/ops/arm/conv_2d_neon_1x15.cc
index 06c40e2902cf7f05bdbacd2b32c241957f36a8c7..07deca05abc32a98a058194718a46493e4327f42 100644
--- a/mace/ops/arm/conv_2d_neon_1x15.cc
+++ b/mace/ops/arm/conv_2d_neon_1x15.cc
@@ -61,7 +61,7 @@ void Conv2dNeonK1x15S1(const float *input,
   const index_t tile_height = out_shape[1] < 4 ?
RoundUpDiv4(out_shape[2]) : out_shape[2]; -#pragma omp parallel for collapse(3) +#pragma omp parallel for collapse(3) schedule(runtime) for (index_t b = 0; b < out_shape[0]; ++b) { for (index_t m = 0; m < out_shape[1]; ++m) { for (index_t h = 0; h < out_shape[2]; h += tile_height) { diff --git a/mace/ops/arm/conv_2d_neon_1x7.cc b/mace/ops/arm/conv_2d_neon_1x7.cc index 39321e0fbee05388a876871cf64040da3dc938d4..09061e0550da4742ce04d2d3e35c41f73115f32d 100644 --- a/mace/ops/arm/conv_2d_neon_1x7.cc +++ b/mace/ops/arm/conv_2d_neon_1x7.cc @@ -32,7 +32,7 @@ void Conv2dNeonK1x7S1(const float *input, const index_t in_batch_size = in_shape[1] * in_image_size; const index_t out_batch_size = out_shape[1] * out_image_size; -#pragma omp parallel for collapse(2) +#pragma omp parallel for collapse(2) schedule(runtime) for (index_t b = 0; b < out_shape[0]; ++b) { for (index_t m = 0; m < out_shape[1]; m += 4) { const index_t out_channels = out_shape[1]; diff --git a/mace/ops/arm/conv_2d_neon_3x3.cc b/mace/ops/arm/conv_2d_neon_3x3.cc index 33653a424c926af4a89469396abb4fee20c9091f..6213a208b0b663d95fc33d3d069898830544db63 100644 --- a/mace/ops/arm/conv_2d_neon_3x3.cc +++ b/mace/ops/arm/conv_2d_neon_3x3.cc @@ -33,7 +33,7 @@ void Conv2dNeonK3x3S1(const float *input, const index_t in_batch_size = in_shape[1] * in_image_size; const index_t out_batch_size = out_shape[1] * out_image_size; -#pragma omp parallel for collapse(2) +#pragma omp parallel for collapse(2) schedule(runtime) for (index_t b = 0; b < out_shape[0]; ++b) { for (index_t m = 0; m < out_shape[1]; m += 2) { const index_t out_channels = out_shape[1]; @@ -515,7 +515,7 @@ void Conv2dNeonK3x3S2(const float *input, const index_t in_batch_size = in_shape[1] * in_image_size; const index_t out_batch_size = out_shape[1] * out_image_size; -#pragma omp parallel for collapse(2) +#pragma omp parallel for collapse(2) schedule(runtime) for (index_t b = 0; b < out_shape[0]; ++b) { for (index_t m = 0; m < out_shape[1]; ++m) { for (index_t c = 0; c < in_shape[1]; ++c) { diff --git a/mace/ops/arm/conv_2d_neon_5x5.cc b/mace/ops/arm/conv_2d_neon_5x5.cc index 7803a89ef9d6ffeb2090d3334703f0672bf3b71f..87b997c60fef51763be46403ccb1993ad3dee57a 100644 --- a/mace/ops/arm/conv_2d_neon_5x5.cc +++ b/mace/ops/arm/conv_2d_neon_5x5.cc @@ -87,7 +87,7 @@ void Conv2dNeonK5x5S1(const float *input, const index_t in_batch_size = in_shape[1] * in_image_size; const index_t out_batch_size = out_shape[1] * out_image_size; -#pragma omp parallel for collapse(2) +#pragma omp parallel for collapse(2) schedule(runtime) for (index_t b = 0; b < out_shape[0]; ++b) { for (index_t m = 0; m < out_shape[1]; m += 4) { const index_t out_channels = out_shape[1]; diff --git a/mace/ops/arm/conv_2d_neon_7x1.cc b/mace/ops/arm/conv_2d_neon_7x1.cc index 37d9ec9deadd20c2f9aef9b45c95bb012c9e19b5..78025de68f5cfd81685f621487b7b25aa77efb08 100644 --- a/mace/ops/arm/conv_2d_neon_7x1.cc +++ b/mace/ops/arm/conv_2d_neon_7x1.cc @@ -32,7 +32,7 @@ void Conv2dNeonK7x1S1(const float *input, const index_t in_batch_size = in_shape[1] * in_image_size; const index_t out_batch_size = out_shape[1] * out_image_size; -#pragma omp parallel for collapse(2) +#pragma omp parallel for collapse(2) schedule(runtime) for (index_t b = 0; b < out_shape[0]; ++b) { for (index_t m = 0; m < out_shape[1]; m += 4) { const index_t out_channels = out_shape[1]; diff --git a/mace/ops/arm/conv_2d_neon_7x7.cc b/mace/ops/arm/conv_2d_neon_7x7.cc index 4e1c0041d178df71787a5f75511dcb4d218a66fc..04c8323f3fddc1edf419cbc0ddd9a713fa647f7d 100644 --- 
a/mace/ops/arm/conv_2d_neon_7x7.cc +++ b/mace/ops/arm/conv_2d_neon_7x7.cc @@ -164,7 +164,7 @@ void Conv2dNeonK7x7S1(const float *input, const index_t in_batch_size = in_shape[1] * in_image_size; const index_t out_batch_size = out_shape[1] * out_image_size; -#pragma omp parallel for collapse(2) +#pragma omp parallel for collapse(2) schedule(runtime) for (index_t b = 0; b < out_shape[0]; ++b) { for (index_t m = 0; m < out_shape[1]; m += 4) { const index_t out_channels = out_shape[1]; @@ -319,7 +319,7 @@ void Conv2dNeonK7x7S2(const float *input, const index_t in_batch_size = in_shape[1] * in_image_size; const index_t out_batch_size = out_shape[1] * out_image_size; -#pragma omp parallel for collapse(2) +#pragma omp parallel for collapse(2) schedule(runtime) for (index_t b = 0; b < out_shape[0]; ++b) { for (index_t m = 0; m < out_shape[1]; m += 4) { const index_t out_channels = out_shape[1]; @@ -484,7 +484,7 @@ void Conv2dNeonK7x7S3(const float *input, const index_t in_batch_size = in_shape[1] * in_image_size; const index_t out_batch_size = out_shape[1] * out_image_size; -#pragma omp parallel for collapse(2) +#pragma omp parallel for collapse(2) schedule(runtime) for (index_t b = 0; b < out_shape[0]; ++b) { for (index_t m = 0; m < out_shape[1]; m += 4) { const index_t out_channels = out_shape[1]; diff --git a/mace/ops/arm/conv_winograd.cc b/mace/ops/arm/conv_winograd.cc index 2f6207fd9a194eb216bc64b7ef267892252ecea5..748cc694ed7d7f9c7d7f2d1dd145c12b1f87bb5e 100644 --- a/mace/ops/arm/conv_winograd.cc +++ b/mace/ops/arm/conv_winograd.cc @@ -34,7 +34,7 @@ void TransformInput4x4(const float *input, const index_t input_batch_size = in_height_width * in_channels; const index_t output_batch_size = 16 * in_channels * tile_count; -#pragma omp parallel for collapse(2) +#pragma omp parallel for collapse(2) schedule(runtime) for (index_t n = 0; n < batch; ++n) { for (index_t c = 0; c < in_channels; ++c) { index_t tile_index = 0; @@ -155,7 +155,7 @@ void TransformInput8x8(const float *input, const index_t input_batch_size = in_height_width * in_channels; const index_t output_batch_size = 64 * in_channels * tile_count; -#pragma omp parallel for collapse(2) +#pragma omp parallel for collapse(2) schedule(runtime) for (index_t n = 0; n < batch; ++n) { for (index_t c = 0; c < in_channels; ++c) { index_t tile_index = 0; @@ -292,7 +292,7 @@ void TransformOutput4x4(const float *input, const index_t out_image_size = out_height * out_width; const index_t output_batch_size = out_channels * out_image_size; -#pragma omp parallel for collapse(2) +#pragma omp parallel for collapse(2) schedule(runtime) for (index_t n = 0; n < batch; ++n) { for (index_t m = 0; m < out_channels; ++m) { index_t tile_offset = 0; @@ -388,7 +388,7 @@ void TransformOutput8x8(const float *input, const index_t out_image_size = out_height * out_width; const index_t output_batch_size = out_channels * out_image_size; -#pragma omp parallel for collapse(2) +#pragma omp parallel for collapse(2) schedule(runtime) for (index_t n = 0; n < batch; ++n) { for (index_t m = 0; m < out_channels; ++m) { index_t tile_offset = 0; @@ -471,7 +471,7 @@ void TransformFilter4x4(const float *filter, float *output) { const index_t stride = out_channels * in_channels; -#pragma omp parallel for collapse(2) +#pragma omp parallel for collapse(2) schedule(runtime) for (index_t m = 0; m < out_channels; ++m) { for (index_t c = 0; c < in_channels; ++c) { float g0, g1, g2, g3, g4, g5, g6, g7, g8; @@ -573,7 +573,7 @@ void TransformFilter8x8(const float *filter, {1.0f / 45, -1.0f / 
90, 1.0f / 180}, {0.0f, 0.0f, 1.0f}}; -#pragma omp parallel for collapse(2) +#pragma omp parallel for collapse(2) schedule(runtime) for (index_t m = 0; m < out_channels; ++m) { for (index_t c = 0; c < in_channels; ++c) { // load filter @@ -720,7 +720,7 @@ void ConvRef3x3s1(const float *input, index_t out_height = in_height - 2; index_t out_width = in_width - 2; -#pragma omp parallel for collapse(4) +#pragma omp parallel for collapse(4) schedule(runtime) for (index_t b = 0; b < batch; ++b) { for (index_t m = 0; m < out_channels; ++m) { for (index_t h = 0; h < out_height; ++h) { diff --git a/mace/ops/arm/deconv_2d_neon_2x2.cc b/mace/ops/arm/deconv_2d_neon_2x2.cc index 001ab01be369f4b3f880c457073be754b7ef1eb9..39f1f3304192348dba0c39fc7f5f586a413f3232 100644 --- a/mace/ops/arm/deconv_2d_neon_2x2.cc +++ b/mace/ops/arm/deconv_2d_neon_2x2.cc @@ -33,7 +33,7 @@ void Deconv2dNeonK2x2S1(const float *input, const index_t out_img_size = outh * outw; -#pragma omp parallel for collapse(2) +#pragma omp parallel for collapse(2) schedule(runtime) for (index_t b = 0; b < out_shape[0]; ++b) { for (index_t oc = 0; oc < outch; oc += 2) { if (oc + 1 < outch) { @@ -199,7 +199,7 @@ void Deconv2dNeonK2x2S2(const float *input, const index_t outw = out_shape[3]; const index_t out_img_size = outh * outw; -#pragma omp parallel for collapse(2) +#pragma omp parallel for collapse(2) schedule(runtime) for (index_t b = 0; b < out_shape[0]; ++b) { for (index_t oc = 0; oc < outch; ++oc) { float *out_base = output + (b * outch + oc) * out_img_size; diff --git a/mace/ops/arm/deconv_2d_neon_3x3.cc b/mace/ops/arm/deconv_2d_neon_3x3.cc index 6df0c7badfee33aadbc385068bd1f781a63ab2b3..da4d1d885b6572e47bac978cf0c0f150373d7d4c 100644 --- a/mace/ops/arm/deconv_2d_neon_3x3.cc +++ b/mace/ops/arm/deconv_2d_neon_3x3.cc @@ -33,7 +33,7 @@ void Deconv2dNeonK3x3S1(const float *input, const index_t out_img_size = outh * outw; -#pragma omp parallel for collapse(2) +#pragma omp parallel for collapse(2) schedule(runtime) for (index_t b = 0; b < out_shape[0]; ++b) { for (index_t oc = 0; oc < outch; oc += 2) { if (oc + 1 < outch) { @@ -293,7 +293,7 @@ void Deconv2dNeonK3x3S2(const float *input, const index_t outw = out_shape[3]; const index_t out_img_size = outh * outw; -#pragma omp parallel for collapse(2) +#pragma omp parallel for collapse(2) schedule(runtime) for (index_t b = 0; b < out_shape[0]; ++b) { for (index_t oc = 0; oc < outch; ++oc) { float *out_base = output + (b * outch + oc) * out_img_size; diff --git a/mace/ops/arm/deconv_2d_neon_4x4.cc b/mace/ops/arm/deconv_2d_neon_4x4.cc index dd85896095d0922e02f3079809edd8972380f223..39389e229f1a9c72be1fbbc0766ad2908f139e3c 100644 --- a/mace/ops/arm/deconv_2d_neon_4x4.cc +++ b/mace/ops/arm/deconv_2d_neon_4x4.cc @@ -31,7 +31,7 @@ void Deconv2dNeonK4x4S1(const float *input, const index_t outw = out_shape[3]; const index_t outch = out_shape[1]; const index_t out_img_size = outh * outw; -#pragma omp parallel for collapse(2) +#pragma omp parallel for collapse(2) schedule(runtime) for (index_t b = 0; b < out_shape[0]; ++b) { for (index_t oc = 0; oc < outch; oc += 2) { if (oc + 1 < outch) { @@ -386,7 +386,7 @@ void Deconv2dNeonK4x4S2(const float *input, const index_t outch = out_shape[1]; const index_t out_img_size = outh * outw; -#pragma omp parallel for collapse(2) +#pragma omp parallel for collapse(2) schedule(runtime) for (index_t b = 0; b < out_shape[0]; ++b) { for (index_t p = 0; p < outch; p++) { float *out_base = output + (b * outch + p) * out_img_size; diff --git 
a/mace/ops/arm/depthwise_conv2d_neon_3x3.cc b/mace/ops/arm/depthwise_conv2d_neon_3x3.cc index 2e997912f7e096e42278cc025657803353fec84a..3166c9238d47ceba50559efc7ebd3f1cf3ddfcf4 100644 --- a/mace/ops/arm/depthwise_conv2d_neon_3x3.cc +++ b/mace/ops/arm/depthwise_conv2d_neon_3x3.cc @@ -70,7 +70,7 @@ void DepthwiseConv2dNeonK3x3S1(const float *input, const index_t in_batch_size = in_shape[1] * in_image_size; const index_t out_batch_size = out_shape[1] * out_image_size; -#pragma omp parallel for collapse(2) +#pragma omp parallel for collapse(2) schedule(runtime) for (index_t b = 0; b < in_shape[0]; ++b) { for (index_t m = 0; m < out_shape[1]; ++m) { index_t c = m / multiplier; @@ -250,7 +250,7 @@ void DepthwiseConv2dNeonK3x3S2(const float *input, const index_t in_batch_size = in_shape[1] * in_image_size; const index_t out_batch_size = out_shape[1] * out_image_size; -#pragma omp parallel for collapse(2) +#pragma omp parallel for collapse(2) schedule(runtime) for (index_t b = 0; b < in_shape[0]; ++b) { for (index_t m = 0; m < out_shape[1]; ++m) { index_t c = m / multiplier; diff --git a/mace/ops/arm/depthwise_deconv2d_neon_3x3.cc b/mace/ops/arm/depthwise_deconv2d_neon_3x3.cc index 4296fb407ad24bd1e5cda017b36847616061627e..1f138ca68af19648ebaf0d634aeff6a17e3d5d0b 100644 --- a/mace/ops/arm/depthwise_deconv2d_neon_3x3.cc +++ b/mace/ops/arm/depthwise_deconv2d_neon_3x3.cc @@ -32,7 +32,7 @@ void DepthwiseDeconv2dNeonK3x3S1(const float *input, const index_t outw = out_shape[3]; const index_t out_img_size = outh * outw; -#pragma omp parallel for collapse(2) +#pragma omp parallel for collapse(2) schedule(runtime) for (index_t b = 0; b < out_shape[0]; ++b) { for (index_t c = 0; c < channels; ++c) { const index_t offset = b * channels + c; @@ -137,7 +137,7 @@ void DepthwiseDeconv2dNeonK3x3S2(const float *input, const index_t outw = out_shape[3]; const index_t out_img_size = outh * outw; -#pragma omp parallel for collapse(2) +#pragma omp parallel for collapse(2) schedule(runtime) for (index_t b = 0; b < out_shape[0]; ++b) { for (index_t c = 0; c < channels; ++c) { const index_t offset = b * channels + c; @@ -251,7 +251,7 @@ void GroupDeconv2dNeonK3x3S1(const float *input, const index_t inch_g = inch / group; const index_t outch_g = outch / group; -#pragma omp parallel for collapse(3) +#pragma omp parallel for collapse(3) schedule(runtime) for (index_t b = 0; b < out_shape[0]; ++b) { for (int g = 0; g < group; ++g) { for (index_t oc = 0; oc < outch_g; oc += 2) { @@ -525,7 +525,7 @@ void GroupDeconv2dNeonK3x3S2(const float *input, const index_t inch_g = inch / group; const index_t outch_g = outch / group; -#pragma omp parallel for collapse(3) +#pragma omp parallel for collapse(3) schedule(runtime) for (index_t b = 0; b < out_shape[0]; ++b) { for (int g = 0; g < group; ++g) { for (index_t oc = 0; oc < outch_g; ++oc) { diff --git a/mace/ops/arm/depthwise_deconv2d_neon_4x4.cc b/mace/ops/arm/depthwise_deconv2d_neon_4x4.cc index 744e70243652c11036f8e992877e6ee3627f35f7..b859bf436aebe07a9b03a8a1adbf6dbf263f2570 100644 --- a/mace/ops/arm/depthwise_deconv2d_neon_4x4.cc +++ b/mace/ops/arm/depthwise_deconv2d_neon_4x4.cc @@ -33,7 +33,7 @@ void DepthwiseDeconv2dNeonK4x4S1(const float *input, const index_t outw = out_shape[3]; const index_t out_img_size = outh * outw; -#pragma omp parallel for collapse(2) +#pragma omp parallel for collapse(2) schedule(runtime) for (index_t b = 0; b < batch; ++b) { for (index_t c = 0; c < channels; ++c) { const index_t offset = b * channels + c; @@ -169,7 +169,7 @@ void 
DepthwiseDeconv2dNeonK4x4S2(const float *input, const index_t outw = out_shape[3]; const index_t out_img_size = outh * outw; -#pragma omp parallel for collapse(2) +#pragma omp parallel for collapse(2) schedule(runtime) for (index_t b = 0; b < out_shape[0]; ++b) { for (index_t c = 0; c < channels; ++c) { const index_t offset = b * channels + c; @@ -304,7 +304,7 @@ void GroupDeconv2dNeonK4x4S1(const float *input, const index_t inch_g = inch / group; const index_t outch_g = outch / group; -#pragma omp parallel for collapse(3) +#pragma omp parallel for collapse(3) schedule(runtime) for (index_t b = 0; b < out_shape[0]; ++b) { for (int g = 0; g < group; ++g) { for (index_t oc = 0; oc < outch_g; oc += 2) { @@ -679,7 +679,7 @@ void GroupDeconv2dNeonK4x4S2(const float *input, const index_t inch_g = inch / group; const index_t outch_g = outch / group; -#pragma omp parallel for collapse(3) +#pragma omp parallel for collapse(3) schedule(runtime) for (index_t b = 0; b < out_shape[0]; ++b) { for (int g = 0; g < group; ++g) { for (index_t oc = 0; oc < outch_g; oc++) { diff --git a/mace/ops/batch_to_space.cc b/mace/ops/batch_to_space.cc index 529a900b955dac46cfcb1033fa94f238fb8ffaaf..5cc6a1e025c54b755a61d3e0c5331d0f38aa5450 100644 --- a/mace/ops/batch_to_space.cc +++ b/mace/ops/batch_to_space.cc @@ -124,7 +124,7 @@ class BatchToSpaceNDOp : public BatchToSpaceOpBase { std::max(static_cast<index_t>(1), 8 * 1024 / block_shape_w / out_width); // make channel outter loop so we can make best use of cache -#pragma omp parallel for collapse(3) +#pragma omp parallel for collapse(3) schedule(runtime) for (index_t c = 0; c < channels; ++c) { for (index_t block_h = 0; block_h < in_height; block_h += block_h_size) { @@ -213,7 +213,7 @@ class BatchToSpaceNDOp : public BatchToSpaceOpBase { index_t out_width = space_tensor->dim(2); index_t channels = space_tensor->dim(3); -#pragma omp parallel for +#pragma omp parallel for schedule(runtime) for (index_t in_b = 0; in_b < in_batches; ++in_b) { const index_t b = in_b % out_batches; const index_t tile_index = in_b / out_batches; diff --git a/mace/ops/channel_shuffle.cc b/mace/ops/channel_shuffle.cc index 78e6f7ad583205cc21458592b25e3aa69c3a980c..04c6a88dc99c06ac9f401a1839205d349b32ff90 100644 --- a/mace/ops/channel_shuffle.cc +++ b/mace/ops/channel_shuffle.cc @@ -55,7 +55,7 @@ class ChannelShuffleOp : public Operation { index_t batch_size = channels * image_size; index_t channels_per_group = channels / groups_; -#pragma omp parallel for collapse(2) +#pragma omp parallel for collapse(2) schedule(runtime) for (index_t b = 0; b < batch; ++b) { for (index_t c = 0; c < channels; ++c) { const T *input_base = input_ptr + b * batch_size;
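Note: in the hunks above and below, loops over small outer dimensions (batch, channel, group) keep their existing collapse(N) clause alongside the new schedule(runtime); collapse fuses the perfectly nested loops into one iteration space, so the guided schedule distributes (b, c) pairs rather than whole outer iterations. A small illustration with invented bounds (not MACE code; compile with -fopenmp):

#include <omp.h>
#include <cstdio>

int main() {
  omp_set_schedule(omp_sched_guided, 1);
  const int batch = 2, channels = 32;
  double sum = 0.0;
  // collapse(2) turns the 2 x 32 nest into a single 64-iteration space.
#pragma omp parallel for collapse(2) schedule(runtime) reduction(+ : sum)
  for (int b = 0; b < batch; ++b) {
    for (int c = 0; c < channels; ++c) {
      sum += b * channels + c;  // stand-in for per-image work
    }
  }
  std::printf("sum=%g\n", sum);  // 2016 = 0 + 1 + ... + 63
  return 0;
}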
diff --git a/mace/ops/conv_2d.cc b/mace/ops/conv_2d.cc index af459b2bb3e9d730dc111f7a46615f5c452e405d..7bb213c0bc70f9a47d0b7c3964b050b76bcccdba 100644 --- a/mace/ops/conv_2d.cc +++ b/mace/ops/conv_2d.cc @@ -475,7 +475,7 @@ class Conv2dOp : public ConvPool2dOpBase { // unpack output if (extra_output_height != height || extra_output_width != width) { -#pragma omp parallel for collapse(2) +#pragma omp parallel for collapse(2) schedule(runtime) for (index_t b = 0; b < batch; ++b) { for (index_t c = 0; c < channels; ++c) { for (index_t h = 0; h < height; ++h) { @@ -494,7 +494,7 @@ class Conv2dOp : public ConvPool2dOpBase { if (bias_data != nullptr) { const index_t image_size = height * width; -#pragma omp parallel for collapse(2) +#pragma omp parallel for collapse(2) schedule(runtime) for (index_t b = 0; b < batch; ++b) { for (index_t c = 0; c < channels; ++c) {
float *output_ptr = output_data + (b * channels + c) * image_size; @@ -539,7 +539,7 @@ class Conv2dOp : public ConvPool2dOpBase { const index_t out_batch_size = filter_shape[0] * out_image_size; const index_t filter_size = filter_shape[2] * filter_shape[3]; -#pragma omp parallel for collapse(2) +#pragma omp parallel for collapse(2) schedule(runtime) for (index_t b = 0; b < in_shape[0]; b++) { for (index_t m = 0; m < filter_shape[0]; m += 4) { const index_t in_width = in_shape[3]; @@ -867,7 +867,7 @@ class Conv2dOp : public ConvPool2dOpBase { const index_t input_row_size = in_shape[2] * in_shape[3]; const index_t patch_row_size = filter_w * in_shape[3]; -#pragma omp parallel for collapse(3) +#pragma omp parallel for collapse(3) schedule(runtime) for (index_t b = 0; b < out_shape[0]; ++b) { for (index_t h = 0; h < out_shape[1]; ++h) { for (index_t w = 0; w < out_shape[2]; ++w) { diff --git a/mace/ops/conv_pool_2d_util.cc b/mace/ops/conv_pool_2d_util.cc index 6ec025b9eb52773dff3b309b663945ca0e1a7e74..a056743e85af91b562781d9821aebad87115221d 100644 --- a/mace/ops/conv_pool_2d_util.cc +++ b/mace/ops/conv_pool_2d_util.cc @@ -395,7 +395,7 @@ MaceStatus ConstructNCHWInputWithSpecificPadding(const Tensor *input_tensor, const index_t in_batch_size = channels * in_image_size; const index_t out_batch_size = channels * out_image_size; -#pragma omp parallel for collapse(2) +#pragma omp parallel for collapse(2) schedule(runtime) for (int i = 0; i < batch; ++i) { for (int j = 0; j < channels; ++j) { for (int k = 0; k < height; ++k) { @@ -443,7 +443,7 @@ MaceStatus ConstructNHWCInputWithPadding(const Tensor *input_tensor, if (padding_same_value) { LOG(FATAL) << "Not implemented"; } else { -#pragma omp parallel for collapse(3) +#pragma omp parallel for collapse(3) schedule(runtime) for (int n = 0; n < batch; ++n) { for (int h = 0; h < height; ++h) { for (int w = 0; w < width; ++w) { diff --git a/mace/ops/deconv_2d.cc b/mace/ops/deconv_2d.cc index 183885295f8ac780ad14b170d0c298e9902c4b48..c9113439536746e9ce05d33e4b20feb35a075060 100644 --- a/mace/ops/deconv_2d.cc +++ b/mace/ops/deconv_2d.cc @@ -276,7 +276,7 @@ class Deconv2dOp : public Deconv2dOpBase { const index_t batch = out_shape[0]; const index_t channels = out_shape[1]; const index_t img_size = out_shape[2] * out_shape[3]; -#pragma omp parallel for collapse(3) +#pragma omp parallel for collapse(3) schedule(runtime) for (index_t b = 0; b < batch; ++b) { for (index_t c = 0; c < channels; ++c) { for (index_t i = 0; i < img_size; ++i) { @@ -324,7 +324,7 @@ class Deconv2dOp : public Deconv2dOpBase { const index_t out_channels = out_shape[1]; const index_t in_channels = in_shape[1]; -#pragma omp parallel for collapse(2) +#pragma omp parallel for collapse(2) schedule(runtime) for (int b = 0; b < batch; ++b) { for (int oc = 0; oc < out_channels; ++oc) { float *out_base = diff --git a/mace/ops/depth_to_space.cc b/mace/ops/depth_to_space.cc index be7a2f82361955490118a909c821bec042e77a33..e18cc106f4fba10c4f054cd7d8c219b0ef032118 100644 --- a/mace/ops/depth_to_space.cc +++ b/mace/ops/depth_to_space.cc @@ -57,7 +57,7 @@ class DepthToSpaceOp : public Operation { const T *input_ptr = input->data<T>(); T *output_ptr = output->mutable_data<T>(); -#pragma omp parallel for +#pragma omp parallel for schedule(runtime) for (index_t b = 0; b < batch_size; ++b) { for (index_t d = 0; d < output_depth; ++d) { for (index_t h = 0; h < output_height; ++h) { diff --git a/mace/ops/eltwise.cc b/mace/ops/eltwise.cc index
94622ac3f16625837f3336e90ba1d663982ab33a..29f0c5a7ed29834fbd43d3a8951959fa20f1524d 100644 --- a/mace/ops/eltwise.cc +++ b/mace/ops/eltwise.cc @@ -201,7 +201,7 @@ inline void TensorBroadcastEltwise(const EltwiseType type, switch (type) { case SUM: if (coeff.empty()) { -#pragma omp parallel for collapse(2) +#pragma omp parallel for collapse(2) schedule(runtime) for (index_t d = 0; d < diff_size; ++d) { for (index_t i = 0; i < common_size; ++i) { output[i + d * common_size] = @@ -213,7 +213,7 @@ inline void TensorBroadcastEltwise(const EltwiseType type, if (swapped) { std::swap(coeff_copy[0], coeff_copy[1]); } -#pragma omp parallel for collapse(2) +#pragma omp parallel for collapse(2) schedule(runtime) for (index_t d = 0; d < diff_size; ++d) { for (index_t i = 0; i < common_size; ++i) { output[i + d * common_size] = @@ -225,7 +225,7 @@ inline void TensorBroadcastEltwise(const EltwiseType type, break; case SUB: if (!swapped) { -#pragma omp parallel for collapse(2) +#pragma omp parallel for collapse(2) schedule(runtime) for (index_t d = 0; d < diff_size; ++d) { for (index_t i = 0; i < common_size; ++i) { output[i + d * common_size] = @@ -233,7 +233,7 @@ inline void TensorBroadcastEltwise(const EltwiseType type, } } } else { -#pragma omp parallel for collapse(2) +#pragma omp parallel for collapse(2) schedule(runtime) for (index_t d = 0; d < diff_size; ++d) { for (index_t i = 0; i < common_size; ++i) { output[i + d * common_size] = @@ -243,7 +243,7 @@ inline void TensorBroadcastEltwise(const EltwiseType type, } break; case PROD: -#pragma omp parallel for collapse(2) +#pragma omp parallel for collapse(2) schedule(runtime) for (index_t d = 0; d < diff_size; ++d) { for (index_t i = 0; i < common_size; ++i) { output[i + d * common_size] = input0[i + d * common_size] * input1[i]; @@ -252,7 +252,7 @@ inline void TensorBroadcastEltwise(const EltwiseType type, break; case DIV: if (!swapped) { -#pragma omp parallel for collapse(2) +#pragma omp parallel for collapse(2) schedule(runtime) for (index_t d = 0; d < diff_size; ++d) { for (index_t i = 0; i < common_size; ++i) { output[i + d * common_size] = @@ -260,7 +260,7 @@ inline void TensorBroadcastEltwise(const EltwiseType type, } } } else { -#pragma omp parallel for collapse(2) +#pragma omp parallel for collapse(2) schedule(runtime) for (index_t d = 0; d < diff_size; ++d) { for (index_t i = 0; i < common_size; ++i) { output[i + d * common_size] = @@ -270,7 +270,7 @@ inline void TensorBroadcastEltwise(const EltwiseType type, } break; case MIN: -#pragma omp parallel for collapse(2) +#pragma omp parallel for collapse(2) schedule(runtime) for (index_t d = 0; d < diff_size; ++d) { for (index_t i = 0; i < common_size; ++i) { output[i + d * common_size] = @@ -279,7 +279,7 @@ inline void TensorBroadcastEltwise(const EltwiseType type, } break; case MAX: -#pragma omp parallel for collapse(2) +#pragma omp parallel for collapse(2) schedule(runtime) for (index_t d = 0; d < diff_size; ++d) { for (index_t i = 0; i < common_size; ++i) { output[i + d * common_size] = @@ -288,7 +288,7 @@ inline void TensorBroadcastEltwise(const EltwiseType type, } break; case SQR_DIFF: -#pragma omp parallel for collapse(2) +#pragma omp parallel for collapse(2) schedule(runtime) for (index_t d = 0; d < diff_size; ++d) { for (index_t i = 0; i < common_size; ++i) { output[i + d * common_size] = @@ -298,7 +298,7 @@ inline void TensorBroadcastEltwise(const EltwiseType type, break; case POW: if (!swapped) { -#pragma omp parallel for collapse(2) +#pragma omp parallel for collapse(2) 
schedule(runtime) for (index_t d = 0; d < diff_size; ++d) { for (index_t i = 0; i < common_size; ++i) { output[i + d * common_size] = @@ -306,7 +306,7 @@ inline void TensorBroadcastEltwise(const EltwiseType type, } } } else { -#pragma omp parallel for collapse(2) +#pragma omp parallel for collapse(2) schedule(runtime) for (index_t d = 0; d < diff_size; ++d) { for (index_t i = 0; i < common_size; ++i) { output[i + d * common_size] = @@ -316,19 +316,19 @@ inline void TensorBroadcastEltwise(const EltwiseType type, } break; case NEG: -#pragma omp parallel for +#pragma omp parallel for schedule(runtime) for (index_t i = 0; i < diff_size * common_size; ++i) { output[i] = -input0[i]; } break; case ABS: -#pragma omp parallel for +#pragma omp parallel for schedule(runtime) for (index_t i = 0; i < diff_size * common_size; ++i) { output[i] = std::fabs(input0[i]); } break; case EQUAL: -#pragma omp parallel for collapse(2) +#pragma omp parallel for collapse(2) schedule(runtime) for (index_t d = 0; d < diff_size; ++d) { for (index_t i = 0; i < common_size; ++i) { output[i + d * common_size] = @@ -353,7 +353,7 @@ inline void TensorEltwise(const EltwiseType type, switch (type) { case SUM: if (coeff.empty()) { -#pragma omp parallel for +#pragma omp parallel for schedule(runtime) for (index_t i = 0; i < size; ++i) { output[i] = input0[i] + input1[i]; } @@ -363,7 +363,7 @@ inline void TensorEltwise(const EltwiseType type, if (swapped) { std::swap(coeff_copy[0], coeff_copy[1]); } -#pragma omp parallel for +#pragma omp parallel for schedule(runtime) for (index_t i = 0; i < size; ++i) { output[i] = input0[i] * coeff_copy[0] + input1[i] * coeff_copy[1]; } @@ -371,20 +371,20 @@ inline void TensorEltwise(const EltwiseType type, break; case SUB: if (!swapped) { -#pragma omp parallel for +#pragma omp parallel for schedule(runtime) for (index_t i = 0; i < size; ++i) { output[i] = input0[i] - input1[i]; } } else { -#pragma omp parallel for +#pragma omp parallel for schedule(runtime) for (index_t i = 0; i < size; ++i) { output[i] = input1[i] - input0[i]; } } break; case PROD: -#pragma omp parallel for +#pragma omp parallel for schedule(runtime) for (index_t i = 0; i < size; ++i) { output[i] = input0[i] * input1[i]; } @@ -392,34 +392,34 @@ inline void TensorEltwise(const EltwiseType type, break; case DIV: if (!swapped) { -#pragma omp parallel for +#pragma omp parallel for schedule(runtime) for (index_t i = 0; i < size; ++i) { output[i] = input0[i] / input1[i]; } } else { -#pragma omp parallel for +#pragma omp parallel for schedule(runtime) for (index_t i = 0; i < size; ++i) { output[i] = input1[i] / input0[i]; } } break; case MIN: -#pragma omp parallel for +#pragma omp parallel for schedule(runtime) for (index_t i = 0; i < size; ++i) { output[i] = std::min(input0[i], input1[i]); } break; case MAX: -#pragma omp parallel for +#pragma omp parallel for schedule(runtime) for (index_t i = 0; i < size; ++i) { output[i] = std::max(input0[i], input1[i]); } break; case SQR_DIFF: -#pragma omp parallel for +#pragma omp parallel for schedule(runtime) for (index_t i = 0; i < size; ++i) { output[i] = std::pow(input0[i] - input1[i], 2.f); } @@ -427,7 +427,7 @@ inline void TensorEltwise(const EltwiseType type, break; case POW: if (!swapped) { -#pragma omp parallel for +#pragma omp parallel for schedule(runtime) for (index_t i = 0; i < size; ++i) { output[i] = std::pow(input0[i], input1[i]); } @@ -438,19 +438,19 @@ inline void TensorEltwise(const EltwiseType type, } break; case NEG: -#pragma omp parallel for +#pragma omp parallel for 
schedule(runtime) for (index_t i = 0; i < size; ++i) { output[i] = -input0[i]; } break; case ABS: -#pragma omp parallel for +#pragma omp parallel for schedule(runtime) for (index_t i = 0; i < size; ++i) { output[i] = std::fabs(input0[i]); } break; case EQUAL: -#pragma omp parallel for +#pragma omp parallel for schedule(runtime) for (index_t i = 0; i < size; ++i) { output[i] = input0[i] == input1[i]; } @@ -472,7 +472,7 @@ inline void TensorScalarEltwise(const EltwiseType type, switch (type) { case SUM: if (coeff.empty()) { -#pragma omp parallel for +#pragma omp parallel for schedule(runtime) for (index_t i = 0; i < size; ++i) { output[i] = input0[i] + input1; } @@ -482,7 +482,7 @@ inline void TensorScalarEltwise(const EltwiseType type, if (swapped) { std::swap(coeff_copy[0], coeff_copy[1]); } -#pragma omp parallel for +#pragma omp parallel for schedule(runtime) for (index_t i = 0; i < size; ++i) { output[i] = input0[i] * coeff_copy[0] + input1 * coeff_copy[1]; } @@ -490,20 +490,20 @@ inline void TensorScalarEltwise(const EltwiseType type, break; case SUB: if (!swapped) { -#pragma omp parallel for +#pragma omp parallel for schedule(runtime) for (index_t i = 0; i < size; ++i) { output[i] = input0[i] - input1; } } else { -#pragma omp parallel for +#pragma omp parallel for schedule(runtime) for (index_t i = 0; i < size; ++i) { output[i] = input1 - input0[i]; } } break; case PROD: -#pragma omp parallel for +#pragma omp parallel for schedule(runtime) for (index_t i = 0; i < size; ++i) { output[i] = input0[i] * input1; } @@ -511,34 +511,34 @@ inline void TensorScalarEltwise(const EltwiseType type, break; case DIV: if (!swapped) { -#pragma omp parallel for +#pragma omp parallel for schedule(runtime) for (index_t i = 0; i < size; ++i) { output[i] = input0[i] / input1; } } else { -#pragma omp parallel for +#pragma omp parallel for schedule(runtime) for (index_t i = 0; i < size; ++i) { output[i] = input1 / input0[i]; } } break; case MIN: -#pragma omp parallel for +#pragma omp parallel for schedule(runtime) for (index_t i = 0; i < size; ++i) { output[i] = std::min(input0[i], input1); } break; case MAX: -#pragma omp parallel for +#pragma omp parallel for schedule(runtime) for (index_t i = 0; i < size; ++i) { output[i] = std::max(input0[i], input1); } break; case SQR_DIFF: -#pragma omp parallel for +#pragma omp parallel for schedule(runtime) for (index_t i = 0; i < size; ++i) { output[i] = std::pow(input0[i] - input1, 2.f); } @@ -546,7 +546,7 @@ inline void TensorScalarEltwise(const EltwiseType type, break; case POW: if (!swapped) { -#pragma omp parallel for +#pragma omp parallel for schedule(runtime) for (index_t i = 0; i < size; ++i) { output[i] = std::pow(input0[i], input1); } @@ -557,19 +557,19 @@ inline void TensorScalarEltwise(const EltwiseType type, } break; case NEG: -#pragma omp parallel for +#pragma omp parallel for schedule(runtime) for (index_t i = 0; i < size; ++i) { output[i] = -input0[i]; } break; case ABS: -#pragma omp parallel for +#pragma omp parallel for schedule(runtime) for (index_t i = 0; i < size; ++i) { output[i] = std::fabs(input0[i]); } break; case EQUAL: -#pragma omp parallel for +#pragma omp parallel for schedule(runtime) for (index_t i = 0; i < size; ++i) { output[i] = input0[i] == input1; } @@ -594,7 +594,7 @@ inline void TensorEltwisePerChannel(const EltwiseType type, switch (type) { case SUM: if (coeff.empty()) { -#pragma omp parallel for collapse(2) +#pragma omp parallel for collapse(2) schedule(runtime) for (index_t b = 0; b < batch0; ++b) { for (index_t c = 0; c < 
channel; ++c) { const T *in0_ptr = input0 + ((b * channel) + c) * image_size; @@ -610,7 +610,7 @@ inline void TensorEltwisePerChannel(const EltwiseType type, if (swapped) { std::swap(coeff_copy[0], coeff_copy[1]); } -#pragma omp parallel for collapse(2) +#pragma omp parallel for collapse(2) schedule(runtime) for (index_t b = 0; b < batch0; ++b) { for (index_t c = 0; c < channel; ++c) { const T *in0_ptr = input0 + ((b * channel) + c) * image_size; @@ -626,7 +626,7 @@ inline void TensorEltwisePerChannel(const EltwiseType type, break; case SUB: if (!swapped) { -#pragma omp parallel for collapse(2) +#pragma omp parallel for collapse(2) schedule(runtime) for (index_t b = 0; b < batch0; ++b) { for (index_t c = 0; c < channel; ++c) { const T *in0_ptr = input0 + ((b * channel) + c) * image_size; @@ -638,7 +638,7 @@ inline void TensorEltwisePerChannel(const EltwiseType type, } } } else { -#pragma omp parallel for collapse(2) +#pragma omp parallel for collapse(2) schedule(runtime) for (index_t b = 0; b < batch0; ++b) { for (index_t c = 0; c < channel; ++c) { const T *in0_ptr = input0 + ((b * channel) + c) * image_size; @@ -652,7 +652,7 @@ inline void TensorEltwisePerChannel(const EltwiseType type, } break; case PROD: -#pragma omp parallel for collapse(2) +#pragma omp parallel for collapse(2) schedule(runtime) for (index_t b = 0; b < batch0; ++b) { for (index_t c = 0; c < channel; ++c) { const T *in0_ptr = input0 + ((b * channel) + c) * image_size; @@ -666,7 +666,7 @@ inline void TensorEltwisePerChannel(const EltwiseType type, break; case DIV: if (!swapped) { -#pragma omp parallel for collapse(2) +#pragma omp parallel for collapse(2) schedule(runtime) for (index_t b = 0; b < batch0; ++b) { for (index_t c = 0; c < channel; ++c) { const T *in0_ptr = input0 + ((b * channel) + c) * image_size; @@ -678,7 +678,7 @@ inline void TensorEltwisePerChannel(const EltwiseType type, } } } else { -#pragma omp parallel for collapse(2) +#pragma omp parallel for collapse(2) schedule(runtime) for (index_t b = 0; b < batch0; ++b) { for (index_t c = 0; c < channel; ++c) { const T *in0_ptr = input0 + ((b * channel) + c) * image_size; @@ -692,7 +692,7 @@ inline void TensorEltwisePerChannel(const EltwiseType type, } break; case MIN: -#pragma omp parallel for collapse(2) +#pragma omp parallel for collapse(2) schedule(runtime) for (index_t b = 0; b < batch0; ++b) { for (index_t c = 0; c < channel; ++c) { const T *in0_ptr = input0 + ((b * channel) + c) * image_size; @@ -705,7 +705,7 @@ inline void TensorEltwisePerChannel(const EltwiseType type, } break; case MAX: -#pragma omp parallel for collapse(2) +#pragma omp parallel for collapse(2) schedule(runtime) for (index_t b = 0; b < batch0; ++b) { for (index_t c = 0; c < channel; ++c) { const T *in0_ptr = input0 + ((b * channel) + c) * image_size; @@ -718,7 +718,7 @@ inline void TensorEltwisePerChannel(const EltwiseType type, } break; case SQR_DIFF: -#pragma omp parallel for collapse(2) +#pragma omp parallel for collapse(2) schedule(runtime) for (index_t b = 0; b < batch0; ++b) { for (index_t c = 0; c < channel; ++c) { const T *in0_ptr = input0 + ((b * channel) + c) * image_size; @@ -732,7 +732,7 @@ inline void TensorEltwisePerChannel(const EltwiseType type, break; case POW: if (!swapped) { -#pragma omp parallel for collapse(2) +#pragma omp parallel for collapse(2) schedule(runtime) for (index_t b = 0; b < batch0; ++b) { for (index_t c = 0; c < channel; ++c) { const T *in0_ptr = input0 + ((b * channel) + c) * image_size; @@ -744,7 +744,7 @@ inline void TensorEltwisePerChannel(const 
EltwiseType type, } } } else { -#pragma omp parallel for collapse(2) +#pragma omp parallel for collapse(2) schedule(runtime) for (index_t b = 0; b < batch0; ++b) { for (index_t c = 0; c < channel; ++c) { const T *in0_ptr = input0 + ((b * channel) + c) * image_size; @@ -758,19 +758,19 @@ inline void TensorEltwisePerChannel(const EltwiseType type, } break; case NEG: -#pragma omp parallel for +#pragma omp parallel for schedule(runtime) for (index_t i = 0; i < batch0 * channel * image_size; ++i) { output[i] = -input0[i]; } break; case ABS: -#pragma omp parallel for +#pragma omp parallel for schedule(runtime) for (index_t i = 0; i < batch0 * channel * image_size; ++i) { output[i] = std::fabs(input0[i]); } break; case EQUAL: -#pragma omp parallel for collapse(2) +#pragma omp parallel for collapse(2) schedule(runtime) for (index_t b = 0; b < batch0; ++b) { for (index_t c = 0; c < channel; ++c) { const T *in0_ptr = input0 + ((b * channel) + c) * image_size; @@ -989,7 +989,7 @@ class EltwiseOp : public Operation { index_t handled_output_size = 0; #ifdef MACE_ENABLE_NEON - #pragma omp parallel for + #pragma omp parallel for schedule(runtime) for (index_t i = handled_output_size; i <= output->size() - 8; i += 8) { const auto input0_val = vld1_u8(input0_ptr + i); const auto input1_val = vld1_u8(input1_ptr + i); @@ -1035,7 +1035,7 @@ class EltwiseOp : public Operation { } handled_output_size = output->size() - output->size() % 8; #endif // NEON -#pragma omp parallel for +#pragma omp parallel for schedule(runtime) for (index_t i = handled_output_size; i < output->size(); ++i) { const int32_t offset_input0 = input0_ptr[i] - input0->zero_point(); const int32_t offset_input1 = input1_ptr[i] - input1->zero_point(); diff --git a/mace/ops/gather.cc b/mace/ops/gather.cc index 1af56d7edf8bd11c53c7db816ea636b2d0ff06fa..60ca2856abb0ca9519fe0f63ba946e881cfac142 100644 --- a/mace/ops/gather.cc +++ b/mace/ops/gather.cc @@ -62,7 +62,7 @@ class GatherOp : public Operation { params->shape().end(), 1, std::multiplies<index_t>()); index_t index_size = indices->size(); -#pragma omp parallel for collapse(2) +#pragma omp parallel for collapse(2) schedule(runtime) for (index_t l = 0; l < lhs_size; ++l) { for (index_t idx = 0; idx < index_size; ++idx) { MACE_ASSERT(indices_data[idx] < axis_dim_size, "idx out of bound: ", diff --git a/mace/ops/local_response_norm.cc b/mace/ops/local_response_norm.cc index 16828baa2d3419cfa1b4af81d83c20b305983fea..fb0cda7cf0dd993e8cd29e0b99251a21cc896758 100644 --- a/mace/ops/local_response_norm.cc +++ b/mace/ops/local_response_norm.cc @@ -53,7 +53,7 @@ class LocalResponseNormOp : public Operation { index_t image_size = height * width; index_t batch_size = channels * image_size; -#pragma omp parallel for collapse(2) +#pragma omp parallel for collapse(2) schedule(runtime) for (index_t b = 0; b < batch; ++b) { for (index_t c = 0; c < channels; ++c) { const int begin_input_c = std::max(static_cast<int>(0), diff --git a/mace/ops/pooling.cc b/mace/ops/pooling.cc index 5f9d0e0d7641217e156e3d071e8f521eda95af8b..2ce9d6acb6ac535311b5dc77e6161721a6c716cd 100644 --- a/mace/ops/pooling.cc +++ b/mace/ops/pooling.cc @@ -133,7 +133,7 @@ class PoolingOp : public PoolingOpBase { const index_t in_batch_size = in_shape[1] * in_image_size; const index_t out_batch_size = out_shape[1] * out_image_size; -#pragma omp parallel for collapse(2) +#pragma omp parallel for collapse(2) schedule(runtime) for (index_t b = 0; b < out_shape[0]; ++b) { for (index_t c = 0; c < out_shape[1]; ++c) { const index_t out_base = b * out_batch_size + c
* out_image_size; @@ -179,7 +179,7 @@ class PoolingOp : public PoolingOpBase { const index_t in_batch_size = in_shape[1] * in_image_size; const index_t out_batch_size = out_shape[1] * out_image_size; -#pragma omp parallel for collapse(2) +#pragma omp parallel for collapse(2) schedule(runtime) for (index_t b = 0; b < out_shape[0]; ++b) { for (index_t c = 0; c < out_shape[1]; ++c) { const index_t out_base = b * out_batch_size + c * out_image_size; @@ -301,7 +301,7 @@ class PoolingOp : public PoolingOpBase { const int *stride_hw, const int *pad_hw, uint8_t *output) { -#pragma omp parallel for collapse(3) +#pragma omp parallel for collapse(3) schedule(runtime) for (index_t b = 0; b < out_shape[0]; ++b) { for (index_t h = 0; h < out_shape[1]; ++h) { for (index_t w = 0; w < out_shape[2]; ++w) { @@ -358,7 +358,7 @@ class PoolingOp : public PoolingOpBase { const int *stride_hw, const int *pad_hw, uint8_t *output) { -#pragma omp parallel for collapse(3) +#pragma omp parallel for collapse(3) schedule(runtime) for (index_t b = 0; b < out_shape[0]; ++b) { for (index_t h = 0; h < out_shape[1]; ++h) { for (index_t w = 0; w < out_shape[2]; ++w) { diff --git a/mace/ops/reduce_mean.cc b/mace/ops/reduce_mean.cc index 0857eb3e8ba5949fdce5f86c1d35b278585bd85d..9364146f267cabd203dc75989c129c58ba466b76 100644 --- a/mace/ops/reduce_mean.cc +++ b/mace/ops/reduce_mean.cc @@ -134,7 +134,7 @@ class ReduceMeanOp : public ReduceMeanOpBase { } output_ptr[0] = sum / data_reshape_[0]; } else { -#pragma omp parallel for +#pragma omp parallel for schedule(runtime) for (int i = 0; i < data_reshape_[0]; ++i) { output_ptr[i] = input_ptr[i]; } @@ -142,7 +142,7 @@ class ReduceMeanOp : public ReduceMeanOpBase { break; case 2: if (reduce_first_axis_) { -#pragma omp parallel for +#pragma omp parallel for schedule(runtime) for (int i = 0; i < data_reshape_[1]; ++i) { for (int j = 0; j < data_reshape_[0]; ++j) { output_ptr[i] += input_ptr[j * data_reshape_[1] + i]; @@ -150,7 +150,7 @@ class ReduceMeanOp : public ReduceMeanOpBase { output_ptr[i] /= data_reshape_[0]; } } else { -#pragma omp parallel for +#pragma omp parallel for schedule(runtime) for (int i = 0; i < data_reshape_[0]; ++i) { for (int j = 0; j < data_reshape_[1]; ++j) { output_ptr[i] += input_ptr[i * data_reshape_[1] + j]; @@ -161,7 +161,7 @@ class ReduceMeanOp : public ReduceMeanOpBase { break; case 3: if (reduce_first_axis_) { -#pragma omp parallel for +#pragma omp parallel for schedule(runtime) for (int i = 0; i < data_reshape_[1]; ++i) { for (int j = 0; j < data_reshape_[2]; ++j) { for (int k = 0; k < data_reshape_[0]; ++k) { @@ -173,7 +173,7 @@ class ReduceMeanOp : public ReduceMeanOpBase { output_ptr[i] /= (data_reshape_[0] * data_reshape_[2]); } } else { -#pragma omp parallel for collapse(2) +#pragma omp parallel for collapse(2) schedule(runtime) for (int i = 0; i < data_reshape_[0]; ++i) { for (int j = 0; j < data_reshape_[2]; ++j) { for (int k = 0; k < data_reshape_[1]; ++k) { @@ -188,7 +188,7 @@ class ReduceMeanOp : public ReduceMeanOpBase { break; case 4: if (reduce_first_axis_) { -#pragma omp parallel for collapse(2) +#pragma omp parallel for collapse(2) schedule(runtime) for (int i = 0; i < data_reshape_[1]; ++i) { for (int j = 0; j < data_reshape_[3]; ++j) { for (int k = 0; k < data_reshape_[2]; ++k) { @@ -203,7 +203,7 @@ class ReduceMeanOp : public ReduceMeanOpBase { } } } else { -#pragma omp parallel for collapse(2) +#pragma omp parallel for collapse(2) schedule(runtime) for (int i = 0; i < data_reshape_[0]; ++i) { for (int j = 0; j < data_reshape_[2]; 
++j) { for (int k = 0; k < data_reshape_[1]; ++k) { diff --git a/mace/ops/resize_bicubic.cc b/mace/ops/resize_bicubic.cc index 28912faef0fb77a147aa1601c43bca1d566b96b4..403300607cfcb929169a18946eff79085d6c534c 100644 --- a/mace/ops/resize_bicubic.cc +++ b/mace/ops/resize_bicubic.cc @@ -85,7 +85,7 @@ inline void ResizeImage(const float *images, const float height_scale, const float width_scale, float *output) { -#pragma omp parallel for collapse(2) +#pragma omp parallel for collapse(2) schedule(runtime) for (index_t b = 0; b < batch_size; ++b) { for (index_t y = 0; y < out_height; ++y) { std::vector<float> y_weights; diff --git a/mace/ops/resize_bilinear.cc b/mace/ops/resize_bilinear.cc index 91f6c3e5ccf491755d98cae03f8bf32910fde31e..5ce6ef4a44a4bdb2f9d3b11057e9b317867d62d5 100644 --- a/mace/ops/resize_bilinear.cc +++ b/mace/ops/resize_bilinear.cc @@ -95,7 +95,7 @@ inline void ResizeImageNCHW(const T *images, T *output) { const CachedInterpolation *xs = xs_vec.data(); -#pragma omp parallel for collapse(2) +#pragma omp parallel for collapse(2) schedule(runtime) for (index_t b = 0; b < batch_size; ++b) { for (index_t c = 0; c < channels; ++c) { const T @@ -141,7 +141,7 @@ inline void ResizeImageNHWC(const T *images, for (index_t b = 0; b < batch_size; ++b) { const T *input_base = images + b * channels * in_height * in_width; T *output_base = output + b * channels * out_height * out_width; -#pragma omp parallel for +#pragma omp parallel for schedule(runtime) for (index_t y = 0; y < out_height; ++y) { const T *y_lower_input_ptr = input_base + ys[y].lower * in_width * channels; diff --git a/mace/ops/sgemm.cc b/mace/ops/sgemm.cc index 5dd1de2d4cec8991683fe303ca458ff2111279d4..cdb95bc2ab529681b70ceca3481313a40bf0c5ba 100644 --- a/mace/ops/sgemm.cc +++ b/mace/ops/sgemm.cc @@ -283,7 +283,7 @@ void SGemm::RunInternal(const PackedBlock &lhs, } if (batch >= MaceOpenMPThreadCount) { -#pragma omp parallel for +#pragma omp parallel for schedule(runtime) MACE_SGEMM_RUN_PER_BATCH } else { MACE_SGEMM_RUN_PER_BATCH } @@ -310,7 +310,7 @@ void SGemm::RunPerBatch(const float *lhs_data, // as possible to cache, by tiling lhs by height and rhs by width.
diff --git a/mace/ops/sgemm.cc b/mace/ops/sgemm.cc
index 5dd1de2d4cec8991683fe303ca458ff2111279d4..cdb95bc2ab529681b70ceca3481313a40bf0c5ba 100644
--- a/mace/ops/sgemm.cc
+++ b/mace/ops/sgemm.cc
@@ -283,7 +283,7 @@ void SGemm::RunInternal(const PackedBlock &lhs,
   }
 
   if (batch >= MaceOpenMPThreadCount) {
-#pragma omp parallel for
+#pragma omp parallel for schedule(runtime)
     MACE_SGEMM_RUN_PER_BATCH
   } else {
     MACE_SGEMM_RUN_PER_BATCH
@@ -310,7 +310,7 @@ void SGemm::RunPerBatch(const float *lhs_data,
   // as possible to cache, by tiling lhs by height and rhs by width.
 
   // w: 4
-#pragma omp parallel for
+#pragma omp parallel for schedule(runtime)
   for (index_t bw = 0; bw < block_w; ++bw) {
     index_t remain_h = height;
     index_t block_h = 0;
@@ -733,7 +733,7 @@ void SGemm::RunPerBatch(const float *lhs_data,
     rhs_data += (width - remain_w) * depth;
 
     // w: 1
-#pragma omp parallel for
+#pragma omp parallel for schedule(runtime)
     for (index_t bw = 0; bw < remain_w; ++bw) {
       index_t remain_h = height;
@@ -954,7 +954,7 @@ void SGemm::Pack(const MatrixMap<const float> &src,
     PackPerBatch(src, order, b, packed_data + b * height * width); \
   }
   if (src.batch() >= MaceOpenMPThreadCount) {
-#pragma omp parallel for
+#pragma omp parallel for schedule(runtime)
    MACE_SGEMM_PACK_PER_BATCH
   } else {
     MACE_SGEMM_PACK_PER_BATCH
@@ -976,7 +976,7 @@ void SGemm::UnPack(const PackedBlock &packed_result,
   }
 
   if (matrix_map->batch() >= MaceOpenMPThreadCount) {
-#pragma omp parallel for
+#pragma omp parallel for schedule(runtime)
     MACE_SGEMM_UNPACK_PER_BATCH
   } else {
     MACE_SGEMM_UNPACK_PER_BATCH
@@ -999,7 +999,7 @@ void SGemm::PackPerBatch(const MatrixMap<const float> &src,
     index_t h = 0;
 #if defined(MACE_ENABLE_NEON)
 #if defined(__aarch64__)
-#pragma omp parallel for
+#pragma omp parallel for schedule(runtime)
     for (index_t ih = h; ih <= height - 8; ih += 8) {
       const float *src_data_ptr = src_data + ih * width;
       float *packed_data_ptr = packed_data + ih * width;
@@ -1020,7 +1020,7 @@ void SGemm::PackPerBatch(const MatrixMap<const float> &src,
     }
     h += (height - h) / 8 * 8;
 #endif
-#pragma omp parallel for
+#pragma omp parallel for schedule(runtime)
     for (index_t ih = h; ih <= height - 4; ih += 4) {
       const float *src_data_ptr = src_data + ih * width;
       float *packed_data_ptr = packed_data + ih * width;
@@ -1036,7 +1036,7 @@ void SGemm::PackPerBatch(const MatrixMap<const float> &src,
     }
     h += (height - h) / 4 * 4;
 #endif
-#pragma omp parallel for
+#pragma omp parallel for schedule(runtime)
     for (index_t ih = h; ih < height; ++ih) {
       std::copy_n(src_data + ih * width, width, packed_data + ih * width);
     }
@@ -1046,7 +1046,7 @@ void SGemm::PackPerBatch(const MatrixMap<const float> &src,
     index_t h = 0;
 #if defined(MACE_ENABLE_NEON)
 #if defined(__aarch64__)
-#pragma omp parallel for
+#pragma omp parallel for schedule(runtime)
     for (index_t ih = h; ih <= height - 8; ih += 8) {
       const float *src_data_ptr = src_data + ih;
       float *packed_data_ptr = packed_data + ih * width;
@@ -1061,7 +1061,7 @@ void SGemm::PackPerBatch(const MatrixMap<const float> &src,
     }
     h += (height - h) / 8 * 8;
 #endif
-#pragma omp parallel for
+#pragma omp parallel for schedule(runtime)
     for (index_t ih = h; ih <= height - 4; ih += 4) {
       const float *src_data_ptr = src_data + ih;
       float *packed_data_ptr = packed_data + ih * width;
@@ -1074,7 +1074,7 @@ void SGemm::PackPerBatch(const MatrixMap<const float> &src,
     }
     h += (height - h) / 4 * 4;
 #endif
-#pragma omp parallel for
+#pragma omp parallel for schedule(runtime)
     for (index_t ih = h; ih < height; ++ih) {
       const float *src_data_ptr = src_data + ih;
       float *packed_data_ptr = packed_data + ih * width;
@@ -1087,7 +1087,7 @@ void SGemm::PackPerBatch(const MatrixMap<const float> &src,
     // This is for packing no-transpose rhs.
     index_t w = 0;
 #if defined(MACE_ENABLE_NEON)
-#pragma omp parallel for
+#pragma omp parallel for schedule(runtime)
     for (index_t iw = w; iw <= width - 4; iw += 4) {
       const float *src_data_ptr = src_data + iw;
       float *packed_data_ptr = packed_data + iw * height;
@@ -1100,7 +1100,7 @@ void SGemm::PackPerBatch(const MatrixMap<const float> &src,
     }
     w += (width - w) / 4 * 4;
 #endif
-#pragma omp parallel for
+#pragma omp parallel for schedule(runtime)
     for (index_t iw = w; iw < width; ++iw) {
       const float *src_data_ptr = src_data + iw;
       float *packed_data_ptr = packed_data + iw * height;
@@ -1113,7 +1113,7 @@ void SGemm::PackPerBatch(const MatrixMap<const float> &src,
     // This is for packing transpose-needed rhs.
     index_t w = 0;
 #if defined(MACE_ENABLE_NEON)
-#pragma omp parallel for
+#pragma omp parallel for schedule(runtime)
     for (index_t iw = w; iw <= width - 4; iw += 4) {
       const float *src_data_ptr = src_data + iw * height;
       float *packed_data_ptr = packed_data + iw * height;
@@ -1129,7 +1129,7 @@ void SGemm::PackPerBatch(const MatrixMap<const float> &src,
     }
     w += (width - w) / 4 * 4;
 #endif
-#pragma omp parallel for
+#pragma omp parallel for schedule(runtime)
     for (index_t iw = w; iw < width; ++iw) {
       std::copy_n(src_data + iw * height, height, packed_data + iw * height);
     }
@@ -1149,7 +1149,7 @@ void SGemm::UnPackPerBatch(const float *packed_data,
     // This is for non-transposed result
     index_t w = 0;
 #if defined(MACE_ENABLE_NEON)
-#pragma omp parallel for
+#pragma omp parallel for schedule(runtime)
     for (index_t iw = w; iw <= width - 4; iw += 4) {
       const float *packed_data_ptr = packed_data + iw * height;
       float *unpacked_data_ptr = unpacked_data + iw;
@@ -1162,7 +1162,7 @@ void SGemm::UnPackPerBatch(const float *packed_data,
     }
     w += (width - w) / 4 * 4;
 #endif
-#pragma omp parallel for
+#pragma omp parallel for schedule(runtime)
     for (index_t iw = w; iw < width; ++iw) {
       const float *packed_data_ptr = packed_data + iw * height;
       float *unpacked_data_ptr = unpacked_data + iw;
@@ -1174,7 +1174,7 @@ void SGemm::UnPackPerBatch(const float *packed_data,
     // This is for transposed result
     index_t w = 0;
 #if defined(MACE_ENABLE_NEON)
-#pragma omp parallel for
+#pragma omp parallel for schedule(runtime)
     for (index_t iw = w; iw <= width - 4; iw += 4) {
       const float *packed_data_ptr = packed_data + iw * height;
       float *unpacked_data_ptr = unpacked_data + iw * height;
@@ -1190,7 +1190,7 @@ void SGemm::UnPackPerBatch(const float *packed_data,
     }
     w += (width - w) / 4 * 4;
 #endif
-#pragma omp parallel for
+#pragma omp parallel for schedule(runtime)
     for (index_t iw = w; iw < width; ++iw) {
       std::copy_n(
           packed_data + iw * height, height, unpacked_data + iw * height);
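Note that sgemm.cc only parallelizes across batches when there are at least as many batches as worker threads; otherwise the batch loop runs serially and the per-batch kernels parallelize their own inner loops. A hedged sketch of that dispatch pattern, with hypothetical names (RunBatches, run_one) standing in for the MACE_SGEMM_RUN_PER_BATCH machinery:

// Sketch of the batch-level dispatch above: parallelize over batches only
// when they can occupy every thread, to avoid nesting parallel regions.
void RunBatches(int batch, int num_threads, void (*run_one)(int)) {
  if (batch >= num_threads) {
#pragma omp parallel for schedule(runtime)
    for (int b = 0; b < batch; ++b) {
      run_one(b);  // each batch is independent, so this loop is safe to split
    }
  } else {
    for (int b = 0; b < batch; ++b) {
      run_one(b);  // too few batches: let run_one parallelize internally
    }
  }
}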
diff --git a/mace/ops/softmax.cc b/mace/ops/softmax.cc
index 6d62fabc9781838007b9a6d8db8a629b47cfdb40..bf06114430be46dfd37046921f09afa33ce3fe5d 100644
--- a/mace/ops/softmax.cc
+++ b/mace/ops/softmax.cc
@@ -59,7 +59,7 @@ class SoftmaxOp : public Operation {
       const index_t batch_size = class_count * class_size;
 
       for (index_t b = 0; b < batch; ++b) {
-#pragma omp parallel for
+#pragma omp parallel for schedule(runtime)
         for (index_t k = 0; k < class_size; ++k) {
           const float *input_ptr = input_data + b * batch_size + k;
           float *output_ptr = output_data + b * batch_size + k;
@@ -94,7 +94,7 @@ class SoftmaxOp : public Operation {
     } else if (input->dim_size() == 2) {  // normal 2d softmax
       const index_t class_size = input->dim(0);
       const index_t class_count = input->dim(1);
-#pragma omp parallel for
+#pragma omp parallel for schedule(runtime)
       for (index_t k = 0; k < class_size; ++k) {
         const float *input_ptr = input_data + k * class_count;
         float *output_ptr = output_data + k * class_count;
@@ -172,7 +172,7 @@ class SoftmaxOp : public Operation {
       // If depth is short, do it using float32. Float computation should not
       // be here, but as long as it is on CPU, it is fine.
       if (depth < 32) {
-#pragma omp parallel for
+#pragma omp parallel for schedule(runtime)
         for (index_t b = 0; b < batch; ++b) {
           const uint8_t *input_ptr = input_data + b * depth;
           uint8_t *output_ptr = output_data + b * depth;
@@ -201,7 +201,7 @@ class SoftmaxOp : public Operation {
                                      (1ll << 31) - 1.0));
         int32_t input_delta_limit = -((1ll << 31) - 1) / scale_q;
 
-#pragma omp parallel for
+#pragma omp parallel for schedule(runtime)
         for (index_t b = 0; b < batch; ++b) {
           const uint8_t *input_ptr = input_data + b * depth;
           uint8_t *output_ptr = output_data + b * depth;
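For reference, the float path these hunks touch is a row-wise, numerically stable softmax. This simplified standalone version (Softmax2D is a hypothetical name, not the MACE kernel) mirrors that structure with the row loop under the runtime schedule:

#include <algorithm>
#include <cmath>

// Standalone sketch of a stable 2-D softmax: subtract the row max before
// exponentiating so exp() cannot overflow, then normalize by the row sum.
void Softmax2D(const float *input, int rows, int cols, float *output) {
#pragma omp parallel for schedule(runtime)
  for (int r = 0; r < rows; ++r) {
    const float *in = input + r * cols;
    float *out = output + r * cols;
    const float max_v = *std::max_element(in, in + cols);
    float sum = 0.f;
    for (int c = 0; c < cols; ++c) {
      out[c] = std::exp(in[c] - max_v);
      sum += out[c];
    }
    for (int c = 0; c < cols; ++c) {
      out[c] /= sum;
    }
  }
}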
diff --git a/mace/ops/space_to_batch.cc b/mace/ops/space_to_batch.cc
index a023ae897b98fca66c7502b82f40bef8fcc94959..7d422938c77516f3e11ef3cf5e9f8b7bc7c5db15 100644
--- a/mace/ops/space_to_batch.cc
+++ b/mace/ops/space_to_batch.cc
@@ -129,7 +129,7 @@ class SpaceToBatchNDOp : public SpaceToBatchOpBase {
           std::max(static_cast<index_t>(1), 8 * 1024 / block_shape_w / in_width);
 
       // make channel outter loop so we can make best use of cache
-#pragma omp parallel for collapse(3)
+#pragma omp parallel for collapse(3) schedule(runtime)
       for (index_t c = 0; c < channels; ++c) {
         for (index_t block_h = 0; block_h < out_height;
             block_h += block_h_size) {
@@ -238,7 +238,7 @@ class SpaceToBatchNDOp : public SpaceToBatchOpBase {
       index_t out_width = batch_tensor->dim(2);
       index_t channels = batch_tensor->dim(3);
 
-#pragma omp parallel for
+#pragma omp parallel for schedule(runtime)
       for (index_t b = 0; b < out_batches; ++b) {
        const index_t in_b = b % in_batches;
         const index_t tile_index = b / in_batches;
diff --git a/mace/ops/sqrdiff_mean.cc b/mace/ops/sqrdiff_mean.cc
index f25d66c1118e3319c9e5e58d33d78f11e90500fb..7927da3b9a321d417386e2c76c8494e45a3417f2 100644
--- a/mace/ops/sqrdiff_mean.cc
+++ b/mace/ops/sqrdiff_mean.cc
@@ -64,7 +64,7 @@ class SqrDiffMeanOp : public Operation {
     const index_t img_size = input0->dim(2) * input0->dim(3);
     const index_t bc = input0->dim(0) * input0->dim(1);
 
-#pragma omp parallel for
+#pragma omp parallel for schedule(runtime)
     for (int i = 0; i < bc; ++i) {
       for (int j = 0; j < img_size; ++j) {
         T diff = input_ptr0[i * img_size + j] - input_ptr1[i];
diff --git a/mace/public/mace.h b/mace/public/mace.h
index ef8fb35d5d781a401b0be58c8a59f03c48a3bd16..9e7f568638cc71a9cf358f141b4c0ed46853ab34 100644
--- a/mace/public/mace.h
+++ b/mace/public/mace.h
@@ -48,10 +48,28 @@ enum GPUPriorityHint {
   PRIORITY_HIGH = 3
 };
 
+// AFFINITY_NONE: initiate 'num_threads_hint' threads with no affinity
+// scheduled.
+// If 'num_threads_hint' is -1 or greater than the number of available cores,
+// 'num_threads_hint' will be reset to the number of available cores.
+// AFFINITY_BIG_ONLY: all available big cores are used, and the number of
+// threads equals the number of available big cores.
+// AFFINITY_LITTLE_ONLY: all available little cores are used, and the number
+// of threads equals the number of available little cores.
+// AFFINITY_HIGH_PERFORMANCE: initiate 'num_threads_hint' threads on the
+// distinct cores with the 'num_threads_hint' highest max frequencies.
+// If 'num_threads_hint' is -1 or greater than the number of available cores,
+// 'num_threads_hint' will be reset to the number of available cores.
+// AFFINITY_POWER_SAVE: initiate 'num_threads_hint' threads on the distinct
+// cores with the 'num_threads_hint' lowest max frequencies.
+// If 'num_threads_hint' is -1 or greater than the number of available cores,
+// 'num_threads_hint' will be reset to the number of available cores.
 enum CPUAffinityPolicy {
   AFFINITY_NONE = 0,
   AFFINITY_BIG_ONLY = 1,
   AFFINITY_LITTLE_ONLY = 2,
+  AFFINITY_HIGH_PERFORMANCE = 3,
+  AFFINITY_POWER_SAVE = 4,
 };
 
 struct CallStats {
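The two new policies select distinct cores by their maximum frequency. A standalone sketch of the selection rule the comment block describes — PickCores is a hypothetical helper for illustration, not MACE API:

#include <algorithm>
#include <cstddef>
#include <vector>

// Order core IDs by max frequency, then keep the num_threads fastest cores
// (AFFINITY_HIGH_PERFORMANCE) or slowest cores (AFFINITY_POWER_SAVE);
// num_threads == -1, or a value above the core count, selects all cores.
std::vector<std::size_t> PickCores(const std::vector<float> &max_freqs,
                                   int num_threads,
                                   bool high_performance) {
  std::vector<std::size_t> ids(max_freqs.size());
  for (std::size_t i = 0; i < ids.size(); ++i) ids[i] = i;
  std::sort(ids.begin(), ids.end(),
            [&](std::size_t a, std::size_t b) {
              return high_performance ? max_freqs[a] > max_freqs[b]
                                      : max_freqs[a] < max_freqs[b];
            });
  if (num_threads < 0 || num_threads > static_cast<int>(ids.size())) {
    num_threads = static_cast<int>(ids.size());
  }
  ids.resize(static_cast<std::size_t>(num_threads));
  return ids;
}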
diff --git a/mace/utils/quantize.h b/mace/utils/quantize.h
index dfaaff1560925c6d1674958ea8f9ae55f4842dd6..0755e70819f092ecc2541851ab3aff909dfbbeef 100644
--- a/mace/utils/quantize.h
+++ b/mace/utils/quantize.h
@@ -99,7 +99,7 @@ inline void QuantizeWithScaleAndZeropoint(const float *input,
                                           int32_t zero_point,
                                           T *output) {
   float recip_scale = 1 / scale;
-#pragma omp parallel for
+#pragma omp parallel for schedule(runtime)
   for (int i = 0; i < size; ++i) {
     output[i] = Saturate<T>(roundf(zero_point + recip_scale * input[i]));
   }
@@ -128,7 +128,7 @@ inline void Dequantize(const T *input,
                        const float scale,
                        const int32_t zero_point,
                        float *output) {
-#pragma omp parallel for
+#pragma omp parallel for schedule(runtime)
   for (int i = 0; i < size; ++i) {
     output[i] = scale * (input[i] - zero_point);
   }
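These two quantize.h loops implement the usual affine mapping, q = round(zero_point + real / scale) and real = scale * (q - zero_point). A standalone round-trip sketch, with std::fmin/std::fmax standing in for the Saturate<T> clamp that quantize.h applies:

#include <cmath>
#include <cstdint>
#include <cstdio>

int main() {
  const float scale = 0.05f;
  const std::int32_t zero_point = 128;
  const float real = 1.7f;

  // Quantize: q = round(zero_point + real / scale), clamped to uint8 range.
  const float q_raw = std::roundf(zero_point + real / scale);
  const std::uint8_t q =
      static_cast<std::uint8_t>(std::fmax(0.f, std::fmin(255.f, q_raw)));

  // Dequantize: real ~= scale * (q - zero_point); here 0.05 * (162 - 128) = 1.7.
  const float back = scale * (static_cast<std::int32_t>(q) - zero_point);
  std::printf("q = %u, dequantized = %f\n", static_cast<unsigned>(q), back);
  return 0;
}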