Commit 69583cd6 authored by 李寅

Using guided OpenMP scheduler

Parent 0102ad55
......@@ -36,45 +36,98 @@ namespace mace {
int MaceOpenMPThreadCount = 1;
namespace {
struct CPUFreq {
size_t core_id;
float freq;
};
namespace {
#if defined(__ANDROID__)
int GetCPUCount() {
char path[64];
int cpu_count = 0;
int result = 0;
while (true) {
snprintf(path, sizeof(path), "/sys/devices/system/cpu/cpu%d", cpu_count);
result = access(path, F_OK);
if (result != 0) {
if (errno != ENOENT) {
LOG(ERROR) << "Access " << path << " failed: " << strerror(errno);
}
return cpu_count;
std::string cpu_sys_conf = "/proc/cpuinfo";
std::ifstream f(cpu_sys_conf);
if (!f.is_open()) {
LOG(ERROR) << "failed to open " << cpu_sys_conf;
return -1;
}
std::string line;
const std::string processor_key = "processor";
while (std::getline(f, line)) {
if (line.size() >= processor_key.size()
&& line.compare(0, processor_key.size(), processor_key) == 0) {
++cpu_count;
}
cpu_count++;
}
if (f.bad()) {
LOG(ERROR) << "failed to read " << cpu_sys_conf;
}
if (!f.eof()) {
LOG(ERROR) << "failed to read end of " << cpu_sys_conf;
}
f.close();
VLOG(2) << "CPU cores: " << cpu_count;
return cpu_count;
}
#endif
int GetCPUMaxFreq(int cpu_id) {
char path[64];
snprintf(path, sizeof(path),
"/sys/devices/system/cpu/cpu%d/cpufreq/cpuinfo_max_freq",
cpu_id);
FILE *fp = fopen(path, "rb");
if (!fp) {
LOG(WARNING) << "File: " << path << " not exists.";
return 0;
int GetCPUMaxFreq(std::vector<float> *max_freqs) {
#if defined(__ANDROID__)
int cpu_count = GetCPUCount();
for (int cpu_id = 0; cpu_id < cpu_count; ++cpu_id) {
std::string cpuinfo_max_freq_sys_conf = MakeString(
"/sys/devices/system/cpu/cpu",
cpu_id,
"/cpufreq/cpuinfo_max_freq");
std::ifstream f(cpuinfo_max_freq_sys_conf);
if (!f.is_open()) {
LOG(ERROR) << "failed to open " << cpuinfo_max_freq_sys_conf;
return -1;
}
std::string line;
if (std::getline(f, line)) {
float freq = atof(line.c_str());
max_freqs->push_back(freq);
}
if (f.bad()) {
LOG(ERROR) << "failed to read " << cpuinfo_max_freq_sys_conf;
}
f.close();
}
#else
std::string cpu_sys_conf = "/proc/cpuinfo";
std::ifstream f(cpu_sys_conf);
if (!f.is_open()) {
LOG(ERROR) << "failed to open " << cpu_sys_conf;
return -1;
}
std::string line;
const std::string freq_key = "cpu MHz";
while (std::getline(f, line)) {
if (line.size() >= freq_key.size()
&& line.compare(0, freq_key.size(), freq_key) == 0) {
size_t pos = line.find(":");
if (pos != std::string::npos) {
std::string freq_str = line.substr(pos + 1);
float freq = atof(freq_str.c_str());
max_freqs->push_back(freq);
}
}
}
if (f.bad()) {
LOG(ERROR) << "failed to read " << cpu_sys_conf;
}
if (!f.eof()) {
LOG(ERROR) << "failed to read end of " << cpu_sys_conf;
}
f.close();
#endif
int freq = 0;
int items_read = fscanf(fp, "%d", &freq);
if (items_read != 1) {
LOG(WARNING) << "Read file: " << path << " failed.";
for (float freq : *max_freqs) {
VLOG(2) << "CPU freq: " << freq;
}
fclose(fp);
return freq;
return 0;
}
MaceStatus SetThreadAffinity(cpu_set_t mask) {
......@@ -93,51 +146,14 @@ MaceStatus SetThreadAffinity(cpu_set_t mask) {
}
}
MaceStatus GetCPUBigLittleCoreIDs(std::vector<int> *big_core_ids,
std::vector<int> *little_core_ids) {
MACE_CHECK_NOTNULL(big_core_ids);
MACE_CHECK_NOTNULL(little_core_ids);
int cpu_count = GetCPUCount();
std::vector<int> cpu_max_freq(cpu_count);
// set cpu max frequency
for (int i = 0; i < cpu_count; ++i) {
cpu_max_freq[i] = GetCPUMaxFreq(i);
if (cpu_max_freq[i] == 0) {
LOG(WARNING) << "Cannot get CPU" << i
<< "'s max frequency info, maybe it is offline.";
return MaceStatus(MaceStatus::MACE_INVALID_ARGS,
"Cannot get CPU's max frequency info,"
" maybe it is offline.");
}
}
int big_core_freq =
*(std::max_element(cpu_max_freq.begin(), cpu_max_freq.end()));
int little_core_freq =
*(std::min_element(cpu_max_freq.begin(), cpu_max_freq.end()));
big_core_ids->reserve(cpu_count);
little_core_ids->reserve(cpu_count);
for (int i = 0; i < cpu_count; ++i) {
if (cpu_max_freq[i] == little_core_freq) {
little_core_ids->push_back(i);
}
if (cpu_max_freq[i] == big_core_freq) {
big_core_ids->push_back(i);
}
}
return MaceStatus::MACE_SUCCESS;
}
MaceStatus SetOpenMPThreadsAndAffinityCPUs(int omp_num_threads,
const std::vector<int> &cpu_ids) {
const std::vector<size_t> &cpu_ids) {
MaceOpenMPThreadCount = omp_num_threads;
#ifdef MACE_ENABLE_OPENMP
VLOG(1) << "Set OpenMP threads number: " << omp_num_threads
<< ", CPU core IDs: " << MakeString(cpu_ids);
omp_set_schedule(omp_sched_guided, 1);
omp_set_num_threads(omp_num_threads);
#else
MACE_UNUSED(omp_num_threads);
......@@ -174,55 +190,90 @@ MaceStatus SetOpenMPThreadsAndAffinityCPUs(int omp_num_threads,
} // namespace
MaceStatus CPURuntime::SetOpenMPThreadsAndAffinityPolicy(
int omp_num_threads_hint,
int num_threads_hint,
CPUAffinityPolicy policy,
void *gemm_context) {
// get cpu frequency info
std::vector<float> cpu_max_freqs;
if (GetCPUMaxFreq(&cpu_max_freqs) == -1 || cpu_max_freqs.size() == 0) {
return MaceStatus::MACE_INVALID_ARGS;
}
std::vector<CPUFreq> cpu_freq(cpu_max_freqs.size());
for (size_t i = 0; i < cpu_max_freqs.size(); ++i) {
cpu_freq[i].core_id = i;
cpu_freq[i].freq = cpu_max_freqs[i];
}
if (policy == CPUAffinityPolicy::AFFINITY_POWER_SAVE ||
policy == CPUAffinityPolicy::AFFINITY_LITTLE_ONLY) {
std::sort(cpu_freq.begin(),
cpu_freq.end(),
[=](const CPUFreq &lhs, const CPUFreq &rhs) {
return lhs.freq < rhs.freq;
});
} else if (policy == CPUAffinityPolicy::AFFINITY_HIGH_PERFORMANCE ||
policy == CPUAffinityPolicy::AFFINITY_BIG_ONLY) {
std::sort(cpu_freq.begin(),
cpu_freq.end(),
[](const CPUFreq &lhs, const CPUFreq &rhs) {
return lhs.freq > rhs.freq;
});
}
int cpu_count = static_cast<int>(cpu_freq.size());
if (num_threads_hint <= 0 || num_threads_hint > cpu_count) {
num_threads_hint = cpu_count;
}
if (policy == CPUAffinityPolicy::AFFINITY_NONE) {
#ifdef MACE_ENABLE_QUANTIZE
if (gemm_context) {
static_cast<gemmlowp::GemmContext*>(gemm_context)->set_max_num_threads(
std::max(0, omp_num_threads_hint));
num_threads_hint);
}
#else
MACE_UNUSED(gemm_context);
#endif // MACE_ENABLE_QUANTIZE
#ifdef MACE_ENABLE_OPENMP
if (omp_num_threads_hint > 0) {
omp_set_num_threads(std::min(omp_num_threads_hint, omp_get_num_procs()));
}
omp_set_num_threads(num_threads_hint);
#else
LOG(WARNING) << "Set OpenMP threads number failed: OpenMP not enabled.";
#endif
return MaceStatus::MACE_SUCCESS;
}
std::vector<int> big_core_ids;
std::vector<int> little_core_ids;
MaceStatus res = GetCPUBigLittleCoreIDs(&big_core_ids, &little_core_ids);
if (res != MaceStatus::MACE_SUCCESS) {
return res;
}
std::vector<int> use_cpu_ids;
if (policy == CPUAffinityPolicy::AFFINITY_BIG_ONLY) {
use_cpu_ids = std::move(big_core_ids);
// decide num of cores to use
int cores_to_use = 0;
if (policy == CPUAffinityPolicy::AFFINITY_BIG_ONLY
|| policy == CPUAffinityPolicy::AFFINITY_LITTLE_ONLY) {
for (size_t i = 0; i < cpu_max_freqs.size(); ++i) {
if (cpu_freq[i].freq != cpu_freq[0].freq) {
break;
}
++cores_to_use;
}
num_threads_hint = cores_to_use;
} else {
use_cpu_ids = std::move(little_core_ids);
cores_to_use = num_threads_hint;
}
if (omp_num_threads_hint <= 0 ||
omp_num_threads_hint > static_cast<int>(use_cpu_ids.size())) {
omp_num_threads_hint = use_cpu_ids.size();
VLOG(2) << "Use " << num_threads_hint << " threads";
std::vector<size_t> cpu_ids(cores_to_use);
for (int i = 0; i < cores_to_use; ++i) {
VLOG(2) << "Bind thread to core: " << cpu_freq[i].core_id << " with freq "
<< cpu_freq[i].freq;
cpu_ids[i] = cpu_freq[i].core_id;
}
#ifdef MACE_ENABLE_QUANTIZE
if (gemm_context) {
static_cast<gemmlowp::GemmContext*>(gemm_context)->set_max_num_threads(
omp_num_threads_hint);
num_threads_hint);
}
#endif // MACE_ENABLE_QUANTIZE
return SetOpenMPThreadsAndAffinityCPUs(omp_num_threads_hint, use_cpu_ids);
return SetOpenMPThreadsAndAffinityCPUs(num_threads_hint, cpu_ids);
}
} // namespace mace
......
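The heart of this commit is the pairing above: SetOpenMPThreadsAndAffinityCPUs now calls omp_set_schedule(omp_sched_guided, 1), and the kernel loops below switch to schedule(runtime), which picks up whatever schedule the runtime currently holds, so a single call moves every kernel from the default (typically static) to guided chunking. A minimal standalone sketch of that interaction, not part of the commit (loop body and sizes are made up for illustration; build with -fopenmp):

#include <omp.h>
#include <cstdio>
#include <vector>

int main() {
  // Mirror the commit: ask the OpenMP runtime for guided scheduling, chunk size 1.
  omp_set_schedule(omp_sched_guided, 1);
  omp_set_num_threads(4);

  std::vector<float> data(1 << 20, 1.0f);
  float sum = 0.0f;

  // schedule(runtime) defers to the schedule installed above (or to the
  // OMP_SCHEDULE environment variable if omp_set_schedule was never called).
  #pragma omp parallel for schedule(runtime) reduction(+ : sum)
  for (long i = 0; i < static_cast<long>(data.size()); ++i) {
    sum += data[i];
  }

  std::printf("sum = %.1f\n", sum);
  return 0;
}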
......@@ -66,26 +66,26 @@ void DoActivation(const T *input_ptr,
case NOOP:
break;
case RELU:
#pragma omp parallel for
#pragma omp parallel for schedule(runtime)
for (index_t i = 0; i < size; ++i) {
output_ptr[i] = std::max(input_ptr[i], static_cast<T>(0));
}
break;
case RELUX:
#pragma omp parallel for
#pragma omp parallel for schedule(runtime)
for (index_t i = 0; i < size; ++i) {
output_ptr[i] = std::min(std::max(input_ptr[i], static_cast<T>(0)),
static_cast<T>(relux_max_limit));
}
break;
case TANH:
#pragma omp parallel for
#pragma omp parallel for schedule(runtime)
for (index_t i = 0; i < size; ++i) {
output_ptr[i] = std::tanh(input_ptr[i]);
}
break;
case SIGMOID:
#pragma omp parallel for
#pragma omp parallel for schedule(runtime)
for (index_t i = 0; i < size; ++i) {
output_ptr[i] = 1 / (1 + std::exp(-input_ptr[i]));
}
......@@ -111,13 +111,13 @@ inline void DoActivation(const float *input_ptr,
ReluxNeon(input_ptr, relux_max_limit, size, output_ptr);
break;
case TANH:
#pragma omp parallel for
#pragma omp parallel for schedule(runtime)
for (index_t i = 0; i < size; ++i) {
output_ptr[i] = std::tanh(input_ptr[i]);
}
break;
case SIGMOID:
#pragma omp parallel for
#pragma omp parallel for schedule(runtime)
for (index_t i = 0; i < size; ++i) {
output_ptr[i] = 1 / (1 + std::exp(-input_ptr[i]));
}
......@@ -134,7 +134,7 @@ void PReLUActivation(const T *input_ptr,
const index_t inner_size,
const T *alpha_ptr,
T *output_ptr) {
#pragma omp parallel for collapse(3)
#pragma omp parallel for collapse(3) schedule(runtime)
for (index_t i = 0; i < outer_size; ++i) {
for (index_t chan_idx = 0; chan_idx < input_chan; ++chan_idx) {
for (index_t j = 0; j < inner_size; ++j) {
......
......@@ -59,7 +59,7 @@ class ArgMaxOp : public Operation {
index_t outer_size = output->size();
index_t inner_size = input->dim(axis_value);
#pragma omp parallel for
#pragma omp parallel for schedule(runtime)
for (index_t i = 0; i < outer_size; ++i) {
int idx = 0;
T max_value = std::numeric_limits<T>::lowest();
......
......@@ -25,7 +25,7 @@ namespace ops {
void ReluNeon(const float *input, const index_t size, float *output) {
#if defined(MACE_ENABLE_NEON)
float32x4_t vzero = vdupq_n_f32(0.f);
#pragma omp parallel for
#pragma omp parallel for schedule(runtime)
for (index_t i = 0; i <= size - 4; i += 4) {
float32x4_t v = vld1q_f32(input + i);
v = vmaxq_f32(v, vzero);
......@@ -36,7 +36,7 @@ void ReluNeon(const float *input, const index_t size, float *output) {
output[i] = std::max(input[i], 0.f);
}
#else
#pragma omp parallel for
#pragma omp parallel for schedule(runtime)
for (index_t i = 0; i < size; ++i) {
output[i] = std::max(input[i], 0.f);
}
......@@ -48,7 +48,7 @@ void ReluxNeon(const float *input, const float limit,
#if defined(MACE_ENABLE_NEON)
float32x4_t vzero = vdupq_n_f32(0.f);
float32x4_t vlimit = vdupq_n_f32(limit);
#pragma omp parallel for
#pragma omp parallel for schedule(runtime)
for (index_t i = 0; i <= size - 4; i += 4) {
float32x4_t v = vld1q_f32(input + i);
v = vmaxq_f32(v, vzero);
......@@ -60,7 +60,7 @@ void ReluxNeon(const float *input, const float limit,
output[i] = std::min(std::max(input[i], 0.f), limit);
}
#else
#pragma omp parallel for
#pragma omp parallel for schedule(runtime)
for (index_t i = 0; i < size; ++i) {
output[i] = std::min(std::max(input[i], 0.f), limit);
}
......
......@@ -60,7 +60,7 @@ void Conv2dNeonK15x1S1(const float *input,
const index_t tile_width =
out_shape[1] < 4 ? RoundUpDiv4(out_shape[3]) : out_shape[3];
#pragma omp parallel for collapse(3)
#pragma omp parallel for collapse(3) schedule(runtime)
for (index_t b = 0; b < out_shape[0]; ++b) {
for (index_t m = 0; m < out_shape[1]; ++m) {
for (index_t w = 0; w < out_shape[3]; w += tile_width) {
......
......@@ -61,7 +61,7 @@ void Conv2dNeonK1x15S1(const float *input,
const index_t tile_height =
out_shape[1] < 4 ? RoundUpDiv4(out_shape[2]) : out_shape[2];
#pragma omp parallel for collapse(3)
#pragma omp parallel for collapse(3) schedule(runtime)
for (index_t b = 0; b < out_shape[0]; ++b) {
for (index_t m = 0; m < out_shape[1]; ++m) {
for (index_t h = 0; h < out_shape[2]; h += tile_height) {
......
......@@ -32,7 +32,7 @@ void Conv2dNeonK1x7S1(const float *input,
const index_t in_batch_size = in_shape[1] * in_image_size;
const index_t out_batch_size = out_shape[1] * out_image_size;
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2) schedule(runtime)
for (index_t b = 0; b < out_shape[0]; ++b) {
for (index_t m = 0; m < out_shape[1]; m += 4) {
const index_t out_channels = out_shape[1];
......
......@@ -33,7 +33,7 @@ void Conv2dNeonK3x3S1(const float *input,
const index_t in_batch_size = in_shape[1] * in_image_size;
const index_t out_batch_size = out_shape[1] * out_image_size;
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2) schedule(runtime)
for (index_t b = 0; b < out_shape[0]; ++b) {
for (index_t m = 0; m < out_shape[1]; m += 2) {
const index_t out_channels = out_shape[1];
......@@ -515,7 +515,7 @@ void Conv2dNeonK3x3S2(const float *input,
const index_t in_batch_size = in_shape[1] * in_image_size;
const index_t out_batch_size = out_shape[1] * out_image_size;
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2) schedule(runtime)
for (index_t b = 0; b < out_shape[0]; ++b) {
for (index_t m = 0; m < out_shape[1]; ++m) {
for (index_t c = 0; c < in_shape[1]; ++c) {
......
......@@ -87,7 +87,7 @@ void Conv2dNeonK5x5S1(const float *input,
const index_t in_batch_size = in_shape[1] * in_image_size;
const index_t out_batch_size = out_shape[1] * out_image_size;
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2) schedule(runtime)
for (index_t b = 0; b < out_shape[0]; ++b) {
for (index_t m = 0; m < out_shape[1]; m += 4) {
const index_t out_channels = out_shape[1];
......
......@@ -32,7 +32,7 @@ void Conv2dNeonK7x1S1(const float *input,
const index_t in_batch_size = in_shape[1] * in_image_size;
const index_t out_batch_size = out_shape[1] * out_image_size;
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2) schedule(runtime)
for (index_t b = 0; b < out_shape[0]; ++b) {
for (index_t m = 0; m < out_shape[1]; m += 4) {
const index_t out_channels = out_shape[1];
......
......@@ -164,7 +164,7 @@ void Conv2dNeonK7x7S1(const float *input,
const index_t in_batch_size = in_shape[1] * in_image_size;
const index_t out_batch_size = out_shape[1] * out_image_size;
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2) schedule(runtime)
for (index_t b = 0; b < out_shape[0]; ++b) {
for (index_t m = 0; m < out_shape[1]; m += 4) {
const index_t out_channels = out_shape[1];
......@@ -319,7 +319,7 @@ void Conv2dNeonK7x7S2(const float *input,
const index_t in_batch_size = in_shape[1] * in_image_size;
const index_t out_batch_size = out_shape[1] * out_image_size;
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2) schedule(runtime)
for (index_t b = 0; b < out_shape[0]; ++b) {
for (index_t m = 0; m < out_shape[1]; m += 4) {
const index_t out_channels = out_shape[1];
......@@ -484,7 +484,7 @@ void Conv2dNeonK7x7S3(const float *input,
const index_t in_batch_size = in_shape[1] * in_image_size;
const index_t out_batch_size = out_shape[1] * out_image_size;
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2) schedule(runtime)
for (index_t b = 0; b < out_shape[0]; ++b) {
for (index_t m = 0; m < out_shape[1]; m += 4) {
const index_t out_channels = out_shape[1];
......
......@@ -34,7 +34,7 @@ void TransformInput4x4(const float *input,
const index_t input_batch_size = in_height_width * in_channels;
const index_t output_batch_size = 16 * in_channels * tile_count;
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2) schedule(runtime)
for (index_t n = 0; n < batch; ++n) {
for (index_t c = 0; c < in_channels; ++c) {
index_t tile_index = 0;
......@@ -155,7 +155,7 @@ void TransformInput8x8(const float *input,
const index_t input_batch_size = in_height_width * in_channels;
const index_t output_batch_size = 64 * in_channels * tile_count;
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2) schedule(runtime)
for (index_t n = 0; n < batch; ++n) {
for (index_t c = 0; c < in_channels; ++c) {
index_t tile_index = 0;
......@@ -292,7 +292,7 @@ void TransformOutput4x4(const float *input,
const index_t out_image_size = out_height * out_width;
const index_t output_batch_size = out_channels * out_image_size;
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2) schedule(runtime)
for (index_t n = 0; n < batch; ++n) {
for (index_t m = 0; m < out_channels; ++m) {
index_t tile_offset = 0;
......@@ -388,7 +388,7 @@ void TransformOutput8x8(const float *input,
const index_t out_image_size = out_height * out_width;
const index_t output_batch_size = out_channels * out_image_size;
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2) schedule(runtime)
for (index_t n = 0; n < batch; ++n) {
for (index_t m = 0; m < out_channels; ++m) {
index_t tile_offset = 0;
......@@ -471,7 +471,7 @@ void TransformFilter4x4(const float *filter,
float *output) {
const index_t stride = out_channels * in_channels;
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2) schedule(runtime)
for (index_t m = 0; m < out_channels; ++m) {
for (index_t c = 0; c < in_channels; ++c) {
float g0, g1, g2, g3, g4, g5, g6, g7, g8;
......@@ -573,7 +573,7 @@ void TransformFilter8x8(const float *filter,
{1.0f / 45, -1.0f / 90, 1.0f / 180},
{0.0f, 0.0f, 1.0f}};
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2) schedule(runtime)
for (index_t m = 0; m < out_channels; ++m) {
for (index_t c = 0; c < in_channels; ++c) {
// load filter
......@@ -720,7 +720,7 @@ void ConvRef3x3s1(const float *input,
index_t out_height = in_height - 2;
index_t out_width = in_width - 2;
#pragma omp parallel for collapse(4)
#pragma omp parallel for collapse(4) schedule(runtime)
for (index_t b = 0; b < batch; ++b) {
for (index_t m = 0; m < out_channels; ++m) {
for (index_t h = 0; h < out_height; ++h) {
......
......@@ -33,7 +33,7 @@ void Deconv2dNeonK2x2S1(const float *input,
const index_t out_img_size = outh * outw;
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2) schedule(runtime)
for (index_t b = 0; b < out_shape[0]; ++b) {
for (index_t oc = 0; oc < outch; oc += 2) {
if (oc + 1 < outch) {
......@@ -199,7 +199,7 @@ void Deconv2dNeonK2x2S2(const float *input,
const index_t outw = out_shape[3];
const index_t out_img_size = outh * outw;
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2) schedule(runtime)
for (index_t b = 0; b < out_shape[0]; ++b) {
for (index_t oc = 0; oc < outch; ++oc) {
float *out_base = output + (b * outch + oc) * out_img_size;
......
......@@ -33,7 +33,7 @@ void Deconv2dNeonK3x3S1(const float *input,
const index_t out_img_size = outh * outw;
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2) schedule(runtime)
for (index_t b = 0; b < out_shape[0]; ++b) {
for (index_t oc = 0; oc < outch; oc += 2) {
if (oc + 1 < outch) {
......@@ -293,7 +293,7 @@ void Deconv2dNeonK3x3S2(const float *input,
const index_t outw = out_shape[3];
const index_t out_img_size = outh * outw;
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2) schedule(runtime)
for (index_t b = 0; b < out_shape[0]; ++b) {
for (index_t oc = 0; oc < outch; ++oc) {
float *out_base = output + (b * outch + oc) * out_img_size;
......
......@@ -31,7 +31,7 @@ void Deconv2dNeonK4x4S1(const float *input,
const index_t outw = out_shape[3];
const index_t outch = out_shape[1];
const index_t out_img_size = outh * outw;
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2) schedule(runtime)
for (index_t b = 0; b < out_shape[0]; ++b) {
for (index_t oc = 0; oc < outch; oc += 2) {
if (oc + 1 < outch) {
......@@ -386,7 +386,7 @@ void Deconv2dNeonK4x4S2(const float *input,
const index_t outch = out_shape[1];
const index_t out_img_size = outh * outw;
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2) schedule(runtime)
for (index_t b = 0; b < out_shape[0]; ++b) {
for (index_t p = 0; p < outch; p++) {
float *out_base = output + (b * outch + p) * out_img_size;
......
......@@ -70,7 +70,7 @@ void DepthwiseConv2dNeonK3x3S1(const float *input,
const index_t in_batch_size = in_shape[1] * in_image_size;
const index_t out_batch_size = out_shape[1] * out_image_size;
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2) schedule(runtime)
for (index_t b = 0; b < in_shape[0]; ++b) {
for (index_t m = 0; m < out_shape[1]; ++m) {
index_t c = m / multiplier;
......@@ -250,7 +250,7 @@ void DepthwiseConv2dNeonK3x3S2(const float *input,
const index_t in_batch_size = in_shape[1] * in_image_size;
const index_t out_batch_size = out_shape[1] * out_image_size;
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2) schedule(runtime)
for (index_t b = 0; b < in_shape[0]; ++b) {
for (index_t m = 0; m < out_shape[1]; ++m) {
index_t c = m / multiplier;
......
......@@ -32,7 +32,7 @@ void DepthwiseDeconv2dNeonK3x3S1(const float *input,
const index_t outw = out_shape[3];
const index_t out_img_size = outh * outw;
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2) schedule(runtime)
for (index_t b = 0; b < out_shape[0]; ++b) {
for (index_t c = 0; c < channels; ++c) {
const index_t offset = b * channels + c;
......@@ -137,7 +137,7 @@ void DepthwiseDeconv2dNeonK3x3S2(const float *input,
const index_t outw = out_shape[3];
const index_t out_img_size = outh * outw;
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2) schedule(runtime)
for (index_t b = 0; b < out_shape[0]; ++b) {
for (index_t c = 0; c < channels; ++c) {
const index_t offset = b * channels + c;
......@@ -251,7 +251,7 @@ void GroupDeconv2dNeonK3x3S1(const float *input,
const index_t inch_g = inch / group;
const index_t outch_g = outch / group;
#pragma omp parallel for collapse(3)
#pragma omp parallel for collapse(3) schedule(runtime)
for (index_t b = 0; b < out_shape[0]; ++b) {
for (int g = 0; g < group; ++g) {
for (index_t oc = 0; oc < outch_g; oc += 2) {
......@@ -525,7 +525,7 @@ void GroupDeconv2dNeonK3x3S2(const float *input,
const index_t inch_g = inch / group;
const index_t outch_g = outch / group;
#pragma omp parallel for collapse(3)
#pragma omp parallel for collapse(3) schedule(runtime)
for (index_t b = 0; b < out_shape[0]; ++b) {
for (int g = 0; g < group; ++g) {
for (index_t oc = 0; oc < outch_g; ++oc) {
......
......@@ -33,7 +33,7 @@ void DepthwiseDeconv2dNeonK4x4S1(const float *input,
const index_t outw = out_shape[3];
const index_t out_img_size = outh * outw;
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2) schedule(runtime)
for (index_t b = 0; b < batch; ++b) {
for (index_t c = 0; c < channels; ++c) {
const index_t offset = b * channels + c;
......@@ -169,7 +169,7 @@ void DepthwiseDeconv2dNeonK4x4S2(const float *input,
const index_t outw = out_shape[3];
const index_t out_img_size = outh * outw;
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2) schedule(runtime)
for (index_t b = 0; b < out_shape[0]; ++b) {
for (index_t c = 0; c < channels; ++c) {
const index_t offset = b * channels + c;
......@@ -304,7 +304,7 @@ void GroupDeconv2dNeonK4x4S1(const float *input,
const index_t inch_g = inch / group;
const index_t outch_g = outch / group;
#pragma omp parallel for collapse(3)
#pragma omp parallel for collapse(3) schedule(runtime)
for (index_t b = 0; b < out_shape[0]; ++b) {
for (int g = 0; g < group; ++g) {
for (index_t oc = 0; oc < outch_g; oc += 2) {
......@@ -679,7 +679,7 @@ void GroupDeconv2dNeonK4x4S2(const float *input,
const index_t inch_g = inch / group;
const index_t outch_g = outch / group;
#pragma omp parallel for collapse(3)
#pragma omp parallel for collapse(3) schedule(runtime)
for (index_t b = 0; b < out_shape[0]; ++b) {
for (int g = 0; g < group; ++g) {
for (index_t oc = 0; oc < outch_g; oc++) {
......
......@@ -124,7 +124,7 @@ class BatchToSpaceNDOp<DeviceType::CPU, float> : public BatchToSpaceOpBase {
std::max(static_cast<index_t>(1), 8 * 1024 / block_shape_w / out_width);
// make channel outter loop so we can make best use of cache
#pragma omp parallel for collapse(3)
#pragma omp parallel for collapse(3) schedule(runtime)
for (index_t c = 0; c < channels; ++c) {
for (index_t block_h = 0; block_h < in_height;
block_h += block_h_size) {
......@@ -213,7 +213,7 @@ class BatchToSpaceNDOp<DeviceType::CPU, uint8_t> : public BatchToSpaceOpBase {
index_t out_width = space_tensor->dim(2);
index_t channels = space_tensor->dim(3);
#pragma omp parallel for
#pragma omp parallel for schedule(runtime)
for (index_t in_b = 0; in_b < in_batches; ++in_b) {
const index_t b = in_b % out_batches;
const index_t tile_index = in_b / out_batches;
......
......@@ -55,7 +55,7 @@ class ChannelShuffleOp<DeviceType::CPU, T> : public Operation {
index_t batch_size = channels * image_size;
index_t channels_per_group = channels / groups_;
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2) schedule(runtime)
for (index_t b = 0; b < batch; ++b) {
for (index_t c = 0; c < channels; ++c) {
const T *input_base = input_ptr + b * batch_size;
......
......@@ -475,7 +475,7 @@ class Conv2dOp<DeviceType::CPU, float> : public ConvPool2dOpBase {
// unpack output
if (extra_output_height != height || extra_output_width != width) {
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2) schedule(runtime)
for (index_t b = 0; b < batch; ++b) {
for (index_t c = 0; c < channels; ++c) {
for (index_t h = 0; h < height; ++h) {
......@@ -494,7 +494,7 @@ class Conv2dOp<DeviceType::CPU, float> : public ConvPool2dOpBase {
if (bias_data != nullptr) {
const index_t image_size = height * width;
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2) schedule(runtime)
for (index_t b = 0; b < batch; ++b) {
for (index_t c = 0; c < channels; ++c) {
float *output_ptr = output_data + (b * channels + c) * image_size;
......@@ -539,7 +539,7 @@ class Conv2dOp<DeviceType::CPU, float> : public ConvPool2dOpBase {
const index_t out_batch_size = filter_shape[0] * out_image_size;
const index_t filter_size = filter_shape[2] * filter_shape[3];
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2) schedule(runtime)
for (index_t b = 0; b < in_shape[0]; b++) {
for (index_t m = 0; m < filter_shape[0]; m += 4) {
const index_t in_width = in_shape[3];
......@@ -867,7 +867,7 @@ class Conv2dOp<DeviceType::CPU, uint8_t> : public ConvPool2dOpBase {
const index_t input_row_size = in_shape[2] * in_shape[3];
const index_t patch_row_size = filter_w * in_shape[3];
#pragma omp parallel for collapse(3)
#pragma omp parallel for collapse(3) schedule(runtime)
for (index_t b = 0; b < out_shape[0]; ++b) {
for (index_t h = 0; h < out_shape[1]; ++h) {
for (index_t w = 0; w < out_shape[2]; ++w) {
......
......@@ -395,7 +395,7 @@ MaceStatus ConstructNCHWInputWithSpecificPadding(const Tensor *input_tensor,
const index_t in_batch_size = channels * in_image_size;
const index_t out_batch_size = channels * out_image_size;
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2) schedule(runtime)
for (int i = 0; i < batch; ++i) {
for (int j = 0; j < channels; ++j) {
for (int k = 0; k < height; ++k) {
......@@ -443,7 +443,7 @@ MaceStatus ConstructNHWCInputWithPadding(const Tensor *input_tensor,
if (padding_same_value) {
LOG(FATAL) << "Not implemented";
} else {
#pragma omp parallel for collapse(3)
#pragma omp parallel for collapse(3) schedule(runtime)
for (int n = 0; n < batch; ++n) {
for (int h = 0; h < height; ++h) {
for (int w = 0; w < width; ++w) {
......
......@@ -276,7 +276,7 @@ class Deconv2dOp<DeviceType::CPU, float> : public Deconv2dOpBase {
const index_t batch = out_shape[0];
const index_t channels = out_shape[1];
const index_t img_size = out_shape[2] * out_shape[3];
#pragma omp parallel for collapse(3)
#pragma omp parallel for collapse(3) schedule(runtime)
for (index_t b = 0; b < batch; ++b) {
for (index_t c = 0; c < channels; ++c) {
for (index_t i = 0; i < img_size; ++i) {
......@@ -324,7 +324,7 @@ class Deconv2dOp<DeviceType::CPU, float> : public Deconv2dOpBase {
const index_t out_channels = out_shape[1];
const index_t in_channels = in_shape[1];
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2) schedule(runtime)
for (int b = 0; b < batch; ++b) {
for (int oc = 0; oc < out_channels; ++oc) {
float *out_base =
......
......@@ -57,7 +57,7 @@ class DepthToSpaceOp : public Operation {
const T *input_ptr = input->data<T>();
T *output_ptr = output->mutable_data<T>();
#pragma omp parallel for
#pragma omp parallel for schedule(runtime)
for (index_t b = 0; b < batch_size; ++b) {
for (index_t d = 0; d < output_depth; ++d) {
for (index_t h = 0; h < output_height; ++h) {
......
......@@ -201,7 +201,7 @@ inline void TensorBroadcastEltwise(const EltwiseType type,
switch (type) {
case SUM:
if (coeff.empty()) {
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2) schedule(runtime)
for (index_t d = 0; d < diff_size; ++d) {
for (index_t i = 0; i < common_size; ++i) {
output[i + d * common_size] =
......@@ -213,7 +213,7 @@ inline void TensorBroadcastEltwise(const EltwiseType type,
if (swapped) {
std::swap(coeff_copy[0], coeff_copy[1]);
}
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2) schedule(runtime)
for (index_t d = 0; d < diff_size; ++d) {
for (index_t i = 0; i < common_size; ++i) {
output[i + d * common_size] =
......@@ -225,7 +225,7 @@ inline void TensorBroadcastEltwise(const EltwiseType type,
break;
case SUB:
if (!swapped) {
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2) schedule(runtime)
for (index_t d = 0; d < diff_size; ++d) {
for (index_t i = 0; i < common_size; ++i) {
output[i + d * common_size] =
......@@ -233,7 +233,7 @@ inline void TensorBroadcastEltwise(const EltwiseType type,
}
}
} else {
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2) schedule(runtime)
for (index_t d = 0; d < diff_size; ++d) {
for (index_t i = 0; i < common_size; ++i) {
output[i + d * common_size] =
......@@ -243,7 +243,7 @@ inline void TensorBroadcastEltwise(const EltwiseType type,
}
break;
case PROD:
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2) schedule(runtime)
for (index_t d = 0; d < diff_size; ++d) {
for (index_t i = 0; i < common_size; ++i) {
output[i + d * common_size] = input0[i + d * common_size] * input1[i];
......@@ -252,7 +252,7 @@ inline void TensorBroadcastEltwise(const EltwiseType type,
break;
case DIV:
if (!swapped) {
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2) schedule(runtime)
for (index_t d = 0; d < diff_size; ++d) {
for (index_t i = 0; i < common_size; ++i) {
output[i + d * common_size] =
......@@ -260,7 +260,7 @@ inline void TensorBroadcastEltwise(const EltwiseType type,
}
}
} else {
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2) schedule(runtime)
for (index_t d = 0; d < diff_size; ++d) {
for (index_t i = 0; i < common_size; ++i) {
output[i + d * common_size] =
......@@ -270,7 +270,7 @@ inline void TensorBroadcastEltwise(const EltwiseType type,
}
break;
case MIN:
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2) schedule(runtime)
for (index_t d = 0; d < diff_size; ++d) {
for (index_t i = 0; i < common_size; ++i) {
output[i + d * common_size] =
......@@ -279,7 +279,7 @@ inline void TensorBroadcastEltwise(const EltwiseType type,
}
break;
case MAX:
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2) schedule(runtime)
for (index_t d = 0; d < diff_size; ++d) {
for (index_t i = 0; i < common_size; ++i) {
output[i + d * common_size] =
......@@ -288,7 +288,7 @@ inline void TensorBroadcastEltwise(const EltwiseType type,
}
break;
case SQR_DIFF:
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2) schedule(runtime)
for (index_t d = 0; d < diff_size; ++d) {
for (index_t i = 0; i < common_size; ++i) {
output[i + d * common_size] =
......@@ -298,7 +298,7 @@ inline void TensorBroadcastEltwise(const EltwiseType type,
break;
case POW:
if (!swapped) {
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2) schedule(runtime)
for (index_t d = 0; d < diff_size; ++d) {
for (index_t i = 0; i < common_size; ++i) {
output[i + d * common_size] =
......@@ -306,7 +306,7 @@ inline void TensorBroadcastEltwise(const EltwiseType type,
}
}
} else {
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2) schedule(runtime)
for (index_t d = 0; d < diff_size; ++d) {
for (index_t i = 0; i < common_size; ++i) {
output[i + d * common_size] =
......@@ -316,19 +316,19 @@ inline void TensorBroadcastEltwise(const EltwiseType type,
}
break;
case NEG:
#pragma omp parallel for
#pragma omp parallel for schedule(runtime)
for (index_t i = 0; i < diff_size * common_size; ++i) {
output[i] = -input0[i];
}
break;
case ABS:
#pragma omp parallel for
#pragma omp parallel for schedule(runtime)
for (index_t i = 0; i < diff_size * common_size; ++i) {
output[i] = std::fabs(input0[i]);
}
break;
case EQUAL:
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2) schedule(runtime)
for (index_t d = 0; d < diff_size; ++d) {
for (index_t i = 0; i < common_size; ++i) {
output[i + d * common_size] =
......@@ -353,7 +353,7 @@ inline void TensorEltwise(const EltwiseType type,
switch (type) {
case SUM:
if (coeff.empty()) {
#pragma omp parallel for
#pragma omp parallel for schedule(runtime)
for (index_t i = 0; i < size; ++i) {
output[i] = input0[i] + input1[i];
}
......@@ -363,7 +363,7 @@ inline void TensorEltwise(const EltwiseType type,
if (swapped) {
std::swap(coeff_copy[0], coeff_copy[1]);
}
#pragma omp parallel for
#pragma omp parallel for schedule(runtime)
for (index_t i = 0; i < size; ++i) {
output[i] = input0[i] * coeff_copy[0] + input1[i] * coeff_copy[1];
}
......@@ -371,20 +371,20 @@ inline void TensorEltwise(const EltwiseType type,
break;
case SUB:
if (!swapped) {
#pragma omp parallel for
#pragma omp parallel for schedule(runtime)
for (index_t i = 0; i < size; ++i) {
output[i] = input0[i] - input1[i];
}
} else {
#pragma omp parallel for
#pragma omp parallel for schedule(runtime)
for (index_t i = 0; i < size; ++i) {
output[i] = input1[i] - input0[i];
}
}
break;
case PROD:
#pragma omp parallel for
#pragma omp parallel for schedule(runtime)
for (index_t i = 0; i < size; ++i) {
output[i] = input0[i] * input1[i];
}
......@@ -392,34 +392,34 @@ inline void TensorEltwise(const EltwiseType type,
break;
case DIV:
if (!swapped) {
#pragma omp parallel for
#pragma omp parallel for schedule(runtime)
for (index_t i = 0; i < size; ++i) {
output[i] = input0[i] / input1[i];
}
} else {
#pragma omp parallel for
#pragma omp parallel for schedule(runtime)
for (index_t i = 0; i < size; ++i) {
output[i] = input1[i] / input0[i];
}
}
break;
case MIN:
#pragma omp parallel for
#pragma omp parallel for schedule(runtime)
for (index_t i = 0; i < size; ++i) {
output[i] = std::min(input0[i], input1[i]);
}
break;
case MAX:
#pragma omp parallel for
#pragma omp parallel for schedule(runtime)
for (index_t i = 0; i < size; ++i) {
output[i] = std::max(input0[i], input1[i]);
}
break;
case SQR_DIFF:
#pragma omp parallel for
#pragma omp parallel for schedule(runtime)
for (index_t i = 0; i < size; ++i) {
output[i] = std::pow(input0[i] - input1[i], 2.f);
}
......@@ -427,7 +427,7 @@ inline void TensorEltwise(const EltwiseType type,
break;
case POW:
if (!swapped) {
#pragma omp parallel for
#pragma omp parallel for schedule(runtime)
for (index_t i = 0; i < size; ++i) {
output[i] = std::pow(input0[i], input1[i]);
}
......@@ -438,19 +438,19 @@ inline void TensorEltwise(const EltwiseType type,
}
break;
case NEG:
#pragma omp parallel for
#pragma omp parallel for schedule(runtime)
for (index_t i = 0; i < size; ++i) {
output[i] = -input0[i];
}
break;
case ABS:
#pragma omp parallel for
#pragma omp parallel for schedule(runtime)
for (index_t i = 0; i < size; ++i) {
output[i] = std::fabs(input0[i]);
}
break;
case EQUAL:
#pragma omp parallel for
#pragma omp parallel for schedule(runtime)
for (index_t i = 0; i < size; ++i) {
output[i] = input0[i] == input1[i];
}
......@@ -472,7 +472,7 @@ inline void TensorScalarEltwise(const EltwiseType type,
switch (type) {
case SUM:
if (coeff.empty()) {
#pragma omp parallel for
#pragma omp parallel for schedule(runtime)
for (index_t i = 0; i < size; ++i) {
output[i] = input0[i] + input1;
}
......@@ -482,7 +482,7 @@ inline void TensorScalarEltwise(const EltwiseType type,
if (swapped) {
std::swap(coeff_copy[0], coeff_copy[1]);
}
#pragma omp parallel for
#pragma omp parallel for schedule(runtime)
for (index_t i = 0; i < size; ++i) {
output[i] = input0[i] * coeff_copy[0] + input1 * coeff_copy[1];
}
......@@ -490,20 +490,20 @@ inline void TensorScalarEltwise(const EltwiseType type,
break;
case SUB:
if (!swapped) {
#pragma omp parallel for
#pragma omp parallel for schedule(runtime)
for (index_t i = 0; i < size; ++i) {
output[i] = input0[i] - input1;
}
} else {
#pragma omp parallel for
#pragma omp parallel for schedule(runtime)
for (index_t i = 0; i < size; ++i) {
output[i] = input1 - input0[i];
}
}
break;
case PROD:
#pragma omp parallel for
#pragma omp parallel for schedule(runtime)
for (index_t i = 0; i < size; ++i) {
output[i] = input0[i] * input1;
}
......@@ -511,34 +511,34 @@ inline void TensorScalarEltwise(const EltwiseType type,
break;
case DIV:
if (!swapped) {
#pragma omp parallel for
#pragma omp parallel for schedule(runtime)
for (index_t i = 0; i < size; ++i) {
output[i] = input0[i] / input1;
}
} else {
#pragma omp parallel for
#pragma omp parallel for schedule(runtime)
for (index_t i = 0; i < size; ++i) {
output[i] = input1 / input0[i];
}
}
break;
case MIN:
#pragma omp parallel for
#pragma omp parallel for schedule(runtime)
for (index_t i = 0; i < size; ++i) {
output[i] = std::min(input0[i], input1);
}
break;
case MAX:
#pragma omp parallel for
#pragma omp parallel for schedule(runtime)
for (index_t i = 0; i < size; ++i) {
output[i] = std::max(input0[i], input1);
}
break;
case SQR_DIFF:
#pragma omp parallel for
#pragma omp parallel for schedule(runtime)
for (index_t i = 0; i < size; ++i) {
output[i] = std::pow(input0[i] - input1, 2.f);
}
......@@ -546,7 +546,7 @@ inline void TensorScalarEltwise(const EltwiseType type,
break;
case POW:
if (!swapped) {
#pragma omp parallel for
#pragma omp parallel for schedule(runtime)
for (index_t i = 0; i < size; ++i) {
output[i] = std::pow(input0[i], input1);
}
......@@ -557,19 +557,19 @@ inline void TensorScalarEltwise(const EltwiseType type,
}
break;
case NEG:
#pragma omp parallel for
#pragma omp parallel for schedule(runtime)
for (index_t i = 0; i < size; ++i) {
output[i] = -input0[i];
}
break;
case ABS:
#pragma omp parallel for
#pragma omp parallel for schedule(runtime)
for (index_t i = 0; i < size; ++i) {
output[i] = std::fabs(input0[i]);
}
break;
case EQUAL:
#pragma omp parallel for
#pragma omp parallel for schedule(runtime)
for (index_t i = 0; i < size; ++i) {
output[i] = input0[i] == input1;
}
......@@ -594,7 +594,7 @@ inline void TensorEltwisePerChannel(const EltwiseType type,
switch (type) {
case SUM:
if (coeff.empty()) {
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2) schedule(runtime)
for (index_t b = 0; b < batch0; ++b) {
for (index_t c = 0; c < channel; ++c) {
const T *in0_ptr = input0 + ((b * channel) + c) * image_size;
......@@ -610,7 +610,7 @@ inline void TensorEltwisePerChannel(const EltwiseType type,
if (swapped) {
std::swap(coeff_copy[0], coeff_copy[1]);
}
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2) schedule(runtime)
for (index_t b = 0; b < batch0; ++b) {
for (index_t c = 0; c < channel; ++c) {
const T *in0_ptr = input0 + ((b * channel) + c) * image_size;
......@@ -626,7 +626,7 @@ inline void TensorEltwisePerChannel(const EltwiseType type,
break;
case SUB:
if (!swapped) {
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2) schedule(runtime)
for (index_t b = 0; b < batch0; ++b) {
for (index_t c = 0; c < channel; ++c) {
const T *in0_ptr = input0 + ((b * channel) + c) * image_size;
......@@ -638,7 +638,7 @@ inline void TensorEltwisePerChannel(const EltwiseType type,
}
}
} else {
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2) schedule(runtime)
for (index_t b = 0; b < batch0; ++b) {
for (index_t c = 0; c < channel; ++c) {
const T *in0_ptr = input0 + ((b * channel) + c) * image_size;
......@@ -652,7 +652,7 @@ inline void TensorEltwisePerChannel(const EltwiseType type,
}
break;
case PROD:
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2) schedule(runtime)
for (index_t b = 0; b < batch0; ++b) {
for (index_t c = 0; c < channel; ++c) {
const T *in0_ptr = input0 + ((b * channel) + c) * image_size;
......@@ -666,7 +666,7 @@ inline void TensorEltwisePerChannel(const EltwiseType type,
break;
case DIV:
if (!swapped) {
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2) schedule(runtime)
for (index_t b = 0; b < batch0; ++b) {
for (index_t c = 0; c < channel; ++c) {
const T *in0_ptr = input0 + ((b * channel) + c) * image_size;
......@@ -678,7 +678,7 @@ inline void TensorEltwisePerChannel(const EltwiseType type,
}
}
} else {
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2) schedule(runtime)
for (index_t b = 0; b < batch0; ++b) {
for (index_t c = 0; c < channel; ++c) {
const T *in0_ptr = input0 + ((b * channel) + c) * image_size;
......@@ -692,7 +692,7 @@ inline void TensorEltwisePerChannel(const EltwiseType type,
}
break;
case MIN:
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2) schedule(runtime)
for (index_t b = 0; b < batch0; ++b) {
for (index_t c = 0; c < channel; ++c) {
const T *in0_ptr = input0 + ((b * channel) + c) * image_size;
......@@ -705,7 +705,7 @@ inline void TensorEltwisePerChannel(const EltwiseType type,
}
break;
case MAX:
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2) schedule(runtime)
for (index_t b = 0; b < batch0; ++b) {
for (index_t c = 0; c < channel; ++c) {
const T *in0_ptr = input0 + ((b * channel) + c) * image_size;
......@@ -718,7 +718,7 @@ inline void TensorEltwisePerChannel(const EltwiseType type,
}
break;
case SQR_DIFF:
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2) schedule(runtime)
for (index_t b = 0; b < batch0; ++b) {
for (index_t c = 0; c < channel; ++c) {
const T *in0_ptr = input0 + ((b * channel) + c) * image_size;
......@@ -732,7 +732,7 @@ inline void TensorEltwisePerChannel(const EltwiseType type,
break;
case POW:
if (!swapped) {
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2) schedule(runtime)
for (index_t b = 0; b < batch0; ++b) {
for (index_t c = 0; c < channel; ++c) {
const T *in0_ptr = input0 + ((b * channel) + c) * image_size;
......@@ -744,7 +744,7 @@ inline void TensorEltwisePerChannel(const EltwiseType type,
}
}
} else {
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2) schedule(runtime)
for (index_t b = 0; b < batch0; ++b) {
for (index_t c = 0; c < channel; ++c) {
const T *in0_ptr = input0 + ((b * channel) + c) * image_size;
......@@ -758,19 +758,19 @@ inline void TensorEltwisePerChannel(const EltwiseType type,
}
break;
case NEG:
#pragma omp parallel for
#pragma omp parallel for schedule(runtime)
for (index_t i = 0; i < batch0 * channel * image_size; ++i) {
output[i] = -input0[i];
}
break;
case ABS:
#pragma omp parallel for
#pragma omp parallel for schedule(runtime)
for (index_t i = 0; i < batch0 * channel * image_size; ++i) {
output[i] = std::fabs(input0[i]);
}
break;
case EQUAL:
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2) schedule(runtime)
for (index_t b = 0; b < batch0; ++b) {
for (index_t c = 0; c < channel; ++c) {
const T *in0_ptr = input0 + ((b * channel) + c) * image_size;
......@@ -991,7 +991,7 @@ class EltwiseOp<DeviceType::CPU, uint8_t> : public Operation {
index_t handled_output_size = 0;
#ifdef MACE_ENABLE_NEON
#pragma omp parallel for
#pragma omp parallel for schedule(runtime)
for (index_t i = handled_output_size; i <= output->size() - 8; i += 8) {
const auto input0_val = vld1_u8(input0_ptr + i);
const auto input1_val = vld1_u8(input1_ptr + i);
......@@ -1037,7 +1037,7 @@ class EltwiseOp<DeviceType::CPU, uint8_t> : public Operation {
}
handled_output_size = output->size() - output->size() % 8;
#endif // NEON
#pragma omp parallel for
#pragma omp parallel for schedule(runtime)
for (index_t i = handled_output_size; i < output->size(); ++i) {
const int32_t offset_input0 = input0_ptr[i] - input0->zero_point();
const int32_t offset_input1 = input1_ptr[i] - input1->zero_point();
......
......@@ -62,7 +62,7 @@ class GatherOp : public Operation {
params->shape().end(), 1, std::multiplies<index_t>());
index_t index_size = indices->size();
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2) schedule(runtime)
for (index_t l = 0; l < lhs_size; ++l) {
for (index_t idx = 0; idx < index_size; ++idx) {
MACE_ASSERT(indices_data[idx] < axis_dim_size, "idx out of bound: ",
......
......@@ -53,7 +53,7 @@ class LocalResponseNormOp<DeviceType::CPU, float> : public Operation {
index_t image_size = height * width;
index_t batch_size = channels * image_size;
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2) schedule(runtime)
for (index_t b = 0; b < batch; ++b) {
for (index_t c = 0; c < channels; ++c) {
const int begin_input_c = std::max(static_cast<index_t>(0),
......
......@@ -133,7 +133,7 @@ class PoolingOp<DeviceType::CPU, float> : public PoolingOpBase {
const index_t in_batch_size = in_shape[1] * in_image_size;
const index_t out_batch_size = out_shape[1] * out_image_size;
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2) schedule(runtime)
for (index_t b = 0; b < out_shape[0]; ++b) {
for (index_t c = 0; c < out_shape[1]; ++c) {
const index_t out_base = b * out_batch_size + c * out_image_size;
......@@ -179,7 +179,7 @@ class PoolingOp<DeviceType::CPU, float> : public PoolingOpBase {
const index_t in_batch_size = in_shape[1] * in_image_size;
const index_t out_batch_size = out_shape[1] * out_image_size;
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2) schedule(runtime)
for (index_t b = 0; b < out_shape[0]; ++b) {
for (index_t c = 0; c < out_shape[1]; ++c) {
const index_t out_base = b * out_batch_size + c * out_image_size;
......@@ -301,7 +301,7 @@ class PoolingOp<DeviceType::CPU, uint8_t> : public PoolingOpBase {
const int *stride_hw,
const int *pad_hw,
uint8_t *output) {
#pragma omp parallel for collapse(3)
#pragma omp parallel for collapse(3) schedule(runtime)
for (index_t b = 0; b < out_shape[0]; ++b) {
for (index_t h = 0; h < out_shape[1]; ++h) {
for (index_t w = 0; w < out_shape[2]; ++w) {
......@@ -358,7 +358,7 @@ class PoolingOp<DeviceType::CPU, uint8_t> : public PoolingOpBase {
const int *stride_hw,
const int *pad_hw,
uint8_t *output) {
#pragma omp parallel for collapse(3)
#pragma omp parallel for collapse(3) schedule(runtime)
for (index_t b = 0; b < out_shape[0]; ++b) {
for (index_t h = 0; h < out_shape[1]; ++h) {
for (index_t w = 0; w < out_shape[2]; ++w) {
......
......@@ -134,7 +134,7 @@ class ReduceMeanOp<DeviceType::CPU, T> : public ReduceMeanOpBase {
}
output_ptr[0] = sum / data_reshape_[0];
} else {
#pragma omp parallel for
#pragma omp parallel for schedule(runtime)
for (int i = 0; i < data_reshape_[0]; ++i) {
output_ptr[i] = input_ptr[i];
}
......@@ -142,7 +142,7 @@ class ReduceMeanOp<DeviceType::CPU, T> : public ReduceMeanOpBase {
break;
case 2:
if (reduce_first_axis_) {
#pragma omp parallel for
#pragma omp parallel for schedule(runtime)
for (int i = 0; i < data_reshape_[1]; ++i) {
for (int j = 0; j < data_reshape_[0]; ++j) {
output_ptr[i] += input_ptr[j * data_reshape_[1] + i];
......@@ -150,7 +150,7 @@ class ReduceMeanOp<DeviceType::CPU, T> : public ReduceMeanOpBase {
output_ptr[i] /= data_reshape_[0];
}
} else {
#pragma omp parallel for
#pragma omp parallel for schedule(runtime)
for (int i = 0; i < data_reshape_[0]; ++i) {
for (int j = 0; j < data_reshape_[1]; ++j) {
output_ptr[i] += input_ptr[i * data_reshape_[1] + j];
......@@ -161,7 +161,7 @@ class ReduceMeanOp<DeviceType::CPU, T> : public ReduceMeanOpBase {
break;
case 3:
if (reduce_first_axis_) {
#pragma omp parallel for
#pragma omp parallel for schedule(runtime)
for (int i = 0; i < data_reshape_[1]; ++i) {
for (int j = 0; j < data_reshape_[2]; ++j) {
for (int k = 0; k < data_reshape_[0]; ++k) {
......@@ -173,7 +173,7 @@ class ReduceMeanOp<DeviceType::CPU, T> : public ReduceMeanOpBase {
output_ptr[i] /= (data_reshape_[0] * data_reshape_[2]);
}
} else {
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2) schedule(runtime)
for (int i = 0; i < data_reshape_[0]; ++i) {
for (int j = 0; j < data_reshape_[2]; ++j) {
for (int k = 0; k < data_reshape_[1]; ++k) {
......@@ -188,7 +188,7 @@ class ReduceMeanOp<DeviceType::CPU, T> : public ReduceMeanOpBase {
break;
case 4:
if (reduce_first_axis_) {
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2) schedule(runtime)
for (int i = 0; i < data_reshape_[1]; ++i) {
for (int j = 0; j < data_reshape_[3]; ++j) {
for (int k = 0; k < data_reshape_[2]; ++k) {
......@@ -203,7 +203,7 @@ class ReduceMeanOp<DeviceType::CPU, T> : public ReduceMeanOpBase {
}
}
} else {
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2) schedule(runtime)
for (int i = 0; i < data_reshape_[0]; ++i) {
for (int j = 0; j < data_reshape_[2]; ++j) {
for (int k = 0; k < data_reshape_[1]; ++k) {
......
......@@ -85,7 +85,7 @@ inline void ResizeImage(const float *images,
const float height_scale,
const float width_scale,
float *output) {
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2) schedule(runtime)
for (index_t b = 0; b < batch_size; ++b) {
for (index_t y = 0; y < out_height; ++y) {
std::vector<float> y_weights;
......
......@@ -95,7 +95,7 @@ inline void ResizeImageNCHW(const T *images,
T *output) {
const CachedInterpolation *xs = xs_vec.data();
#pragma omp parallel for collapse(2)
#pragma omp parallel for collapse(2) schedule(runtime)
for (index_t b = 0; b < batch_size; ++b) {
for (index_t c = 0; c < channels; ++c) {
const T
......@@ -141,7 +141,7 @@ inline void ResizeImageNHWC(const T *images,
for (index_t b = 0; b < batch_size; ++b) {
const T *input_base = images + b * channels * in_height * in_width;
T *output_base = output + b * channels * out_height * out_width;
#pragma omp parallel for
#pragma omp parallel for schedule(runtime)
for (index_t y = 0; y < out_height; ++y) {
const T
*y_lower_input_ptr = input_base + ys[y].lower * in_width * channels;
......
......@@ -252,7 +252,7 @@ void SGemm::RunInternal(const PackedBlock &lhs,
}
if (batch >= MaceOpenMPThreadCount) {
#pragma omp parallel for
#pragma omp parallel for schedule(runtime)
MACE_SGEMM_RUN_PER_BATCH
} else {
MACE_SGEMM_RUN_PER_BATCH
......@@ -279,7 +279,7 @@ void SGemm::RunPerBatch(const float *lhs_data,
// as possible to cache, by tiling lhs by height and rhs by width.
// w: 4
#pragma omp parallel for
#pragma omp parallel for schedule(runtime)
for (index_t bw = 0; bw < block_w; ++bw) {
index_t remain_h = height;
index_t block_h = 0;
......@@ -702,7 +702,7 @@ void SGemm::RunPerBatch(const float *lhs_data,
rhs_data += (width - remain_w) * depth;
// w: 1
#pragma omp parallel for
#pragma omp parallel for schedule(runtime)
for (index_t bw = 0; bw < remain_w; ++bw) {
index_t remain_h = height;
......@@ -923,7 +923,7 @@ void SGemm::Pack(const MatrixMap<const float> &src,
PackPerBatch(src, order, b, packed_data + b * height * width); \
}
if (src.batch() >= MaceOpenMPThreadCount) {
#pragma omp parallel for
#pragma omp parallel for schedule(runtime)
MACE_SGEMM_PACK_PER_BATCH
} else {
MACE_SGEMM_PACK_PER_BATCH
......@@ -945,7 +945,7 @@ void SGemm::UnPack(const PackedBlock &packed_result,
}
if (matrix_map->batch() >= MaceOpenMPThreadCount) {
#pragma omp parallel for
#pragma omp parallel for schedule(runtime)
MACE_SGEMM_UNPACK_PER_BATCH
} else {
MACE_SGEMM_UNPACK_PER_BATCH
......@@ -968,7 +968,7 @@ void SGemm::PackPerBatch(const MatrixMap<const float> &src,
index_t h = 0;
#if defined(MACE_ENABLE_NEON)
#if defined(__aarch64__)
#pragma omp parallel for
#pragma omp parallel for schedule(runtime)
for (index_t ih = h; ih <= height - 8; ih += 8) {
const float *src_data_ptr = src_data + ih * width;
float *packed_data_ptr = packed_data + ih * width;
......@@ -989,7 +989,7 @@ void SGemm::PackPerBatch(const MatrixMap<const float> &src,
}
h += (height - h) / 8 * 8;
#endif
#pragma omp parallel for
#pragma omp parallel for schedule(runtime)
for (index_t ih = h; ih <= height - 4; ih += 4) {
const float *src_data_ptr = src_data + ih * width;
float *packed_data_ptr = packed_data + ih * width;
......@@ -1005,7 +1005,7 @@ void SGemm::PackPerBatch(const MatrixMap<const float> &src,
}
h += (height - h) / 4 * 4;
#endif
#pragma omp parallel for
#pragma omp parallel for schedule(runtime)
for (index_t ih = h; ih < height; ++ih) {
std::copy_n(src_data + ih * width, width, packed_data + ih * width);
}
......@@ -1015,7 +1015,7 @@ void SGemm::PackPerBatch(const MatrixMap<const float> &src,
index_t h = 0;
#if defined(MACE_ENABLE_NEON)
#if defined(__aarch64__)
#pragma omp parallel for
#pragma omp parallel for schedule(runtime)
for (index_t ih = h; ih <= height - 8; ih += 8) {
const float *src_data_ptr = src_data + ih;
float *packed_data_ptr = packed_data + ih * width;
......@@ -1030,7 +1030,7 @@ void SGemm::PackPerBatch(const MatrixMap<const float> &src,
}
h += (height - h) / 8 * 8;
#endif
#pragma omp parallel for
#pragma omp parallel for schedule(runtime)
for (index_t ih = h; ih <= height - 4; ih += 4) {
const float *src_data_ptr = src_data + ih;
float *packed_data_ptr = packed_data + ih * width;
......@@ -1043,7 +1043,7 @@ void SGemm::PackPerBatch(const MatrixMap<const float> &src,
}
h += (height - h) / 4 * 4;
#endif
#pragma omp parallel for
#pragma omp parallel for schedule(runtime)
for (index_t ih = h; ih < height; ++ih) {
const float *src_data_ptr = src_data + ih;
float *packed_data_ptr = packed_data + ih * width;
......@@ -1056,7 +1056,7 @@ void SGemm::PackPerBatch(const MatrixMap<const float> &src,
// This is for packing no-transpose rhs.
index_t w = 0;
#if defined(MACE_ENABLE_NEON)
#pragma omp parallel for
#pragma omp parallel for schedule(runtime)
for (index_t iw = w; iw <= width - 4; iw += 4) {
const float *src_data_ptr = src_data + iw;
float *packed_data_ptr = packed_data + iw * height;
......@@ -1069,7 +1069,7 @@ void SGemm::PackPerBatch(const MatrixMap<const float> &src,
}
w += (width - w) / 4 * 4;
#endif
#pragma omp parallel for
#pragma omp parallel for schedule(runtime)
for (index_t iw = w; iw < width; ++iw) {
const float *src_data_ptr = src_data + iw;
float *packed_data_ptr = packed_data + iw * height;
......@@ -1082,7 +1082,7 @@ void SGemm::PackPerBatch(const MatrixMap<const float> &src,
// This is for packing transpose-needed rhs.
index_t w = 0;
#if defined(MACE_ENABLE_NEON)
#pragma omp parallel for
#pragma omp parallel for schedule(runtime)
for (index_t iw = w; iw <= width - 4; iw += 4) {
const float *src_data_ptr = src_data + iw * height;
float *packed_data_ptr = packed_data + iw * height;
......@@ -1098,7 +1098,7 @@ void SGemm::PackPerBatch(const MatrixMap<const float> &src,
}
w += (width - w) / 4 * 4;
#endif
#pragma omp parallel for
#pragma omp parallel for schedule(runtime)
for (index_t iw = w; iw < width; ++iw) {
std::copy_n(src_data + iw * height, height, packed_data + iw * height);
}
......@@ -1118,7 +1118,7 @@ void SGemm::UnPackPerBatch(const float *packed_data,
// This is for non-transposed result
index_t w = 0;
#if defined(MACE_ENABLE_NEON)
#pragma omp parallel for
#pragma omp parallel for schedule(runtime)
for (index_t iw = w; iw <= width - 4; iw += 4) {
const float *packed_data_ptr = packed_data + iw * height;
float *unpacked_data_ptr = unpacked_data + iw;
......@@ -1131,7 +1131,7 @@ void SGemm::UnPackPerBatch(const float *packed_data,
}
w += (width - w) / 4 * 4;
#endif
#pragma omp parallel for
#pragma omp parallel for schedule(runtime)
for (index_t iw = w; iw < width; ++iw) {
const float *packed_data_ptr = packed_data + iw * height;
float *unpacked_data_ptr = unpacked_data + iw;
......@@ -1143,7 +1143,7 @@ void SGemm::UnPackPerBatch(const float *packed_data,
// This is for transposed result
index_t w = 0;
#if defined(MACE_ENABLE_NEON)
#pragma omp parallel for
#pragma omp parallel for schedule(runtime)
for (index_t iw = w; iw <= width - 4; iw += 4) {
const float *packed_data_ptr = packed_data + iw * height;
float *unpacked_data_ptr = unpacked_data + iw * height;
......@@ -1159,7 +1159,7 @@ void SGemm::UnPackPerBatch(const float *packed_data,
}
w += (width - w) / 4 * 4;
#endif
#pragma omp parallel for
#pragma omp parallel for schedule(runtime)
for (index_t iw = w; iw < width; ++iw) {
std::copy_n(
packed_data + iw * height, height, unpacked_data + iw * height);
......
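The SGemm comments in this file describe keeping operands cache-resident by tiling the lhs by height and the rhs by width (strips of 4 plus a scalar tail). A minimal cache-blocking sketch of that idea, not MACE's actual kernel (function name, layouts, and block size here are illustrative assumptions):

#include <algorithm>
#include <cstddef>
#include <vector>

// C (height x width) += A (height x depth) * B (depth x width), row-major.
// B is walked in width strips of 4 so one narrow strip stays in cache while
// every row of A streams past it, echoing the "rhs by width" tiling above.
void BlockedGemm(const std::vector<float> &A, const std::vector<float> &B,
                 std::vector<float> *C, std::size_t height, std::size_t depth,
                 std::size_t width) {
  const std::size_t kWidthBlock = 4;
  for (std::size_t bw = 0; bw < width; bw += kWidthBlock) {
    const std::size_t w_end = std::min(width, bw + kWidthBlock);
    for (std::size_t h = 0; h < height; ++h) {
      for (std::size_t k = 0; k < depth; ++k) {
        const float a = A[h * depth + k];
        for (std::size_t w = bw; w < w_end; ++w) {
          (*C)[h * width + w] += a * B[k * width + w];
        }
      }
    }
  }
}

Walking B in narrow width strips means the strip loaded for one row of A is reused by every following row before it is evicted.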
......@@ -59,7 +59,7 @@ class SoftmaxOp<DeviceType::CPU, float> : public Operation {
const index_t batch_size = class_count * class_size;
for (index_t b = 0; b < batch; ++b) {
#pragma omp parallel for
#pragma omp parallel for schedule(runtime)
for (index_t k = 0; k < class_size; ++k) {
const float *input_ptr = input_data + b * batch_size + k;
float *output_ptr = output_data + b * batch_size + k;
......@@ -94,7 +94,7 @@ class SoftmaxOp<DeviceType::CPU, float> : public Operation {
} else if (input->dim_size() == 2) { // normal 2d softmax
const index_t class_size = input->dim(0);
const index_t class_count = input->dim(1);
#pragma omp parallel for
#pragma omp parallel for schedule(runtime)
for (index_t k = 0; k < class_size; ++k) {
const float *input_ptr = input_data + k * class_count;
float *output_ptr = output_data + k * class_count;
......@@ -172,7 +172,7 @@ class SoftmaxOp<DeviceType::CPU, uint8_t> : public Operation {
// If depth is short, do it using float32. Float computation should not
// be here, but as long as it is on CPU, it is fine.
if (depth < 32) {
#pragma omp parallel for
#pragma omp parallel for schedule(runtime)
for (index_t b = 0; b < batch; ++b) {
const uint8_t *input_ptr = input_data + b * depth;
uint8_t *output_ptr = output_data + b * depth;
......@@ -201,7 +201,7 @@ class SoftmaxOp<DeviceType::CPU, uint8_t> : public Operation {
(1ll << 31) - 1.0));
int32_t input_delta_limit = -((1ll << 31) - 1) / scale_q;
#pragma omp parallel for
#pragma omp parallel for schedule(runtime)
for (index_t b = 0; b < batch; ++b) {
const uint8_t *input_ptr = input_data + b * depth;
uint8_t *output_ptr = output_data + b * depth;
......
......@@ -129,7 +129,7 @@ class SpaceToBatchNDOp<DeviceType::CPU, float> : public SpaceToBatchOpBase {
std::max(static_cast<index_t>(1), 8 * 1024 / block_shape_w / in_width);
// make channel outter loop so we can make best use of cache
#pragma omp parallel for collapse(3)
#pragma omp parallel for collapse(3) schedule(runtime)
for (index_t c = 0; c < channels; ++c) {
for (index_t block_h = 0; block_h < out_height;
block_h += block_h_size) {
......@@ -238,7 +238,7 @@ class SpaceToBatchNDOp<DeviceType::CPU, uint8_t> : public SpaceToBatchOpBase {
index_t out_width = batch_tensor->dim(2);
index_t channels = batch_tensor->dim(3);
#pragma omp parallel for
#pragma omp parallel for schedule(runtime)
for (index_t b = 0; b < out_batches; ++b) {
const index_t in_b = b % in_batches;
const index_t tile_index = b / in_batches;
......
......@@ -64,7 +64,7 @@ class SqrDiffMeanOp : public Operation {
const index_t img_size = input0->dim(2) * input0->dim(3);
const index_t bc = input0->dim(0) * input0->dim(1);
#pragma omp parallel for
#pragma omp parallel for schedule(runtime)
for (int i = 0; i < bc; ++i) {
for (int j = 0; j < img_size; ++j) {
T diff = input_ptr0[i * img_size + j] - input_ptr1[i];
......
......@@ -48,10 +48,28 @@ enum GPUPriorityHint {
PRIORITY_HIGH = 3
};
// AFFINITY_NONE: initiate 'num_threads_hint' threads with no affinity
// scheduled.
// If 'num_threads_hint' is -1 or greater than number of available cores,
// 'num_threads_hint' will be reset to number of available cores.
// AFFINITY_BIG_ONLY: all available big cores are used, and number of threads
// is equal to numbers of available big cores.
// AFFINITY_LITTLE_ONLY: all available little cores are used, and number of
// threads is equal to numbers of available little cores.
// AFFINITY_HIGH_PERFORMANCE: initiate 'num_threads_hint' threads on different
// cores with top-num_threads_hint frequencies.
// If 'num_threads_hint' is -1 or greater than number of available cores,
// 'num_threads_hint' will be reset to number of available cores.
// AFFINITY_POWER_SAVE: initiate 'num_threads_hint' threads on different
// cores with bottom-num_threads_hint frequencies.
// If 'num_threads_hint' is -1 or greater than number of available cores,
// 'num_threads_hint' will be reset to number of available cores.
enum CPUAffinityPolicy {
AFFINITY_NONE = 0,
AFFINITY_BIG_ONLY = 1,
AFFINITY_LITTLE_ONLY = 2,
AFFINITY_HIGH_PERFORMANCE = 3,
AFFINITY_POWER_SAVE = 4,
};
struct CallStats {
......
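These policy comments correspond to the new selection logic in cpu_runtime.cc: cores are sorted by max frequency, HIGH_PERFORMANCE / POWER_SAVE take the top or bottom num_threads_hint cores, and BIG_ONLY / LITTLE_ONLY take every core in the fastest or slowest cluster. A small standalone sketch of the cluster-counting step (illustrative only, not the library code):

#include <algorithm>
#include <cstddef>
#include <vector>

struct CoreFreq {
  std::size_t core_id;
  float max_freq;
};

// Return the ids of every core in the fastest (want_big) or slowest cluster,
// i.e. all cores whose max frequency equals the extreme one after sorting.
std::vector<std::size_t> SelectCluster(std::vector<CoreFreq> cores,
                                       bool want_big) {
  std::sort(cores.begin(), cores.end(),
            [want_big](const CoreFreq &lhs, const CoreFreq &rhs) {
              return want_big ? lhs.max_freq > rhs.max_freq
                              : lhs.max_freq < rhs.max_freq;
            });
  std::vector<std::size_t> ids;
  for (const CoreFreq &c : cores) {
    if (c.max_freq != cores[0].max_freq) break;  // left the first cluster
    ids.push_back(c.core_id);
  }
  return ids;
}

For example, cores {(0, 1800), (1, 1800), (4, 2400), (5, 2400)} with want_big = true yield ids {4, 5}.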
......@@ -99,7 +99,7 @@ inline void QuantizeWithScaleAndZeropoint(const float *input,
int32_t zero_point,
T *output) {
float recip_scale = 1 / scale;
#pragma omp parallel for
#pragma omp parallel for schedule(runtime)
for (int i = 0; i < size; ++i) {
output[i] = Saturate<T>(roundf(zero_point + recip_scale * input[i]));
}
......@@ -128,7 +128,7 @@ inline void Dequantize(const T *input,
const float scale,
const int32_t zero_point,
float *output) {
#pragma omp parallel for
#pragma omp parallel for schedule(runtime)
for (int i = 0; i < size; ++i) {
output[i] = scale * (input[i] - zero_point);
}
......
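The helpers above implement the usual affine quantization scheme: q = Saturate(round(zero_point + input / scale)) on the way in, and input ≈ scale * (q - zero_point) on the way out. A tiny worked sketch with made-up uint8 parameters (scale and zero point are illustrative, not taken from the diff):

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>

int main() {
  const float scale = 0.05f;        // illustrative values
  const int32_t zero_point = 128;

  const float real = 1.0f;
  // Quantize: roundf(zero_point + (1 / scale) * input), then saturate to uint8.
  int32_t q = static_cast<int32_t>(std::roundf(zero_point + real / scale));
  q = std::min<int32_t>(255, std::max<int32_t>(0, q));
  // Dequantize: scale * (q - zero_point).
  const float back = scale * (q - zero_point);

  std::printf("real=%.3f -> q=%d -> dequantized=%.3f\n", real, q, back);
  return 0;
}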