Commit 69583cd6 authored by 李寅

Using guided OpenMP scheduler

Parent 0102ad55
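The change has two halves. In the runtime, omp_set_schedule(omp_sched_guided, 1) installs guided scheduling (with a minimum chunk size of 1) as the process-wide loop schedule; in the kernels, every worksharing pragma gains schedule(runtime), which tells OpenMP to use whatever schedule the runtime installed instead of the implementation default (typically static). Under guided scheduling, idle threads grab successively smaller chunks of the remaining iterations, so faster cores naturally absorb more work; on big.LITTLE phones this tolerates the speed gap between cores better than a fixed static split. A minimal standalone sketch of the mechanism (sizes and loop body are illustrative, not from MACE):

#include <omp.h>
#include <cstdio>
#include <vector>

int main() {
  // Install guided scheduling, minimum chunk size 1, for the whole process.
  omp_set_schedule(omp_sched_guided, 1);
  omp_set_num_threads(4);

  const int n = 1 << 20;
  std::vector<float> data(n, 1.0f);
  // schedule(runtime) defers to whatever omp_set_schedule installed, so
  // the policy can change without recompiling the kernels.
#pragma omp parallel for schedule(runtime)
  for (int i = 0; i < n; ++i) {
    data[i] = data[i] * 2.0f + 1.0f;  // stand-in for real kernel work
  }
  printf("data[0] = %.1f\n", data[0]);
  return 0;
}

Compile with -fopenmp; swapping omp_sched_guided for omp_sched_static here changes the distribution policy without touching the loop.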
@@ -36,45 +36,98 @@ namespace mace {
 int MaceOpenMPThreadCount = 1;
 
+struct CPUFreq {
+  size_t core_id;
+  float freq;
+};
+
 namespace {
 
+#if defined(__ANDROID__)
 int GetCPUCount() {
-  char path[64];
   int cpu_count = 0;
-  int result = 0;
-
-  while (true) {
-    snprintf(path, sizeof(path), "/sys/devices/system/cpu/cpu%d", cpu_count);
-    result = access(path, F_OK);
-    if (result != 0) {
-      if (errno != ENOENT) {
-        LOG(ERROR) << "Access " << path << " failed: " << strerror(errno);
-      }
-      return cpu_count;
-    }
-    cpu_count++;
+  std::string cpu_sys_conf = "/proc/cpuinfo";
+  std::ifstream f(cpu_sys_conf);
+  if (!f.is_open()) {
+    LOG(ERROR) << "failed to open " << cpu_sys_conf;
+    return -1;
+  }
+  std::string line;
+  const std::string processor_key = "processor";
+  while (std::getline(f, line)) {
+    if (line.size() >= processor_key.size()
+        && line.compare(0, processor_key.size(), processor_key) == 0) {
+      ++cpu_count;
+    }
   }
+  if (f.bad()) {
+    LOG(ERROR) << "failed to read " << cpu_sys_conf;
+  }
+  if (!f.eof()) {
+    LOG(ERROR) << "failed to read end of " << cpu_sys_conf;
+  }
+  f.close();
+  VLOG(2) << "CPU cores: " << cpu_count;
+  return cpu_count;
 }
+#endif
 
-int GetCPUMaxFreq(int cpu_id) {
-  char path[64];
-  snprintf(path, sizeof(path),
-           "/sys/devices/system/cpu/cpu%d/cpufreq/cpuinfo_max_freq",
-           cpu_id);
-
-  FILE *fp = fopen(path, "rb");
-  if (!fp) {
-    LOG(WARNING) << "File: " << path << " not exists.";
-    return 0;
-  }
-
-  int freq = 0;
-  int items_read = fscanf(fp, "%d", &freq);
-  if (items_read != 1) {
-    LOG(WARNING) << "Read file: " << path << " failed.";
-  }
-  fclose(fp);
-  return freq;
+int GetCPUMaxFreq(std::vector<float> *max_freqs) {
+#if defined(__ANDROID__)
+  int cpu_count = GetCPUCount();
+  for (int cpu_id = 0; cpu_id < cpu_count; ++cpu_id) {
+    std::string cpuinfo_max_freq_sys_conf = MakeString(
+        "/sys/devices/system/cpu/cpu",
+        cpu_id,
+        "/cpufreq/cpuinfo_max_freq");
+    std::ifstream f(cpuinfo_max_freq_sys_conf);
+    if (!f.is_open()) {
+      LOG(ERROR) << "failed to open " << cpuinfo_max_freq_sys_conf;
+      return -1;
+    }
+    std::string line;
+    if (std::getline(f, line)) {
+      float freq = atof(line.c_str());
+      max_freqs->push_back(freq);
+    }
+    if (f.bad()) {
+      LOG(ERROR) << "failed to read " << cpuinfo_max_freq_sys_conf;
+    }
+    f.close();
+  }
+#else
+  std::string cpu_sys_conf = "/proc/cpuinfo";
+  std::ifstream f(cpu_sys_conf);
+  if (!f.is_open()) {
+    LOG(ERROR) << "failed to open " << cpu_sys_conf;
+    return -1;
+  }
+  std::string line;
+  const std::string freq_key = "cpu MHz";
+  while (std::getline(f, line)) {
+    if (line.size() >= freq_key.size()
+        && line.compare(0, freq_key.size(), freq_key) == 0) {
+      size_t pos = line.find(":");
+      if (pos != std::string::npos) {
+        std::string freq_str = line.substr(pos + 1);
+        float freq = atof(freq_str.c_str());
+        max_freqs->push_back(freq);
+      }
+    }
+  }
+  if (f.bad()) {
+    LOG(ERROR) << "failed to read " << cpu_sys_conf;
+  }
+  if (!f.eof()) {
+    LOG(ERROR) << "failed to read end of " << cpu_sys_conf;
+  }
+  f.close();
+#endif
+
+  for (float freq : *max_freqs) {
+    VLOG(2) << "CPU freq: " << freq;
+  }
+
+  return 0;
 }
 
 MaceStatus SetThreadAffinity(cpu_set_t mask) {
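The rewritten probes drop the access()/fscanf sysfs walk in favor of parsing /proc/cpuinfo for the core count (every "processor" line is one logical CPU) and, on Android, reading each core's cpuinfo_max_freq from sysfs; off Android they fall back to the "cpu MHz" fields of /proc/cpuinfo. A self-contained approximation of the same probing, for experimenting outside MACE (a sketch, not the library code: MakeString, LOG and VLOG are MACE utilities and are replaced here with standard C++):

#include <fstream>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>

int main() {
  // Count logical cores: one "processor" entry per core in /proc/cpuinfo.
  std::ifstream cpuinfo("/proc/cpuinfo");
  int cpu_count = 0;
  std::string line;
  while (std::getline(cpuinfo, line)) {
    if (line.rfind("processor", 0) == 0) ++cpu_count;
  }

  // Read each core's maximum frequency (in kHz) from sysfs, as the
  // Android path of the patch does.
  std::vector<float> max_freqs;
  for (int id = 0; id < cpu_count; ++id) {
    std::ostringstream path;
    path << "/sys/devices/system/cpu/cpu" << id << "/cpufreq/cpuinfo_max_freq";
    std::ifstream f(path.str());
    float freq = 0.f;
    if (f >> freq) max_freqs.push_back(freq);
  }

  for (size_t i = 0; i < max_freqs.size(); ++i) {
    std::cout << "cpu" << i << " max freq: " << max_freqs[i] << " kHz\n";
  }
  return 0;
}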
@@ -93,51 +146,14 @@ MaceStatus SetThreadAffinity(cpu_set_t mask) {
   }
 }
 
-MaceStatus GetCPUBigLittleCoreIDs(std::vector<int> *big_core_ids,
-                                  std::vector<int> *little_core_ids) {
-  MACE_CHECK_NOTNULL(big_core_ids);
-  MACE_CHECK_NOTNULL(little_core_ids);
-  int cpu_count = GetCPUCount();
-  std::vector<int> cpu_max_freq(cpu_count);
-
-  // set cpu max frequency
-  for (int i = 0; i < cpu_count; ++i) {
-    cpu_max_freq[i] = GetCPUMaxFreq(i);
-    if (cpu_max_freq[i] == 0) {
-      LOG(WARNING) << "Cannot get CPU" << i
-                   << "'s max frequency info, maybe it is offline.";
-      return MaceStatus(MaceStatus::MACE_INVALID_ARGS,
-                        "Cannot get CPU's max frequency info,"
-                        " maybe it is offline.");
-    }
-  }
-
-  int big_core_freq =
-      *(std::max_element(cpu_max_freq.begin(), cpu_max_freq.end()));
-  int little_core_freq =
-      *(std::min_element(cpu_max_freq.begin(), cpu_max_freq.end()));
-
-  big_core_ids->reserve(cpu_count);
-  little_core_ids->reserve(cpu_count);
-  for (int i = 0; i < cpu_count; ++i) {
-    if (cpu_max_freq[i] == little_core_freq) {
-      little_core_ids->push_back(i);
-    }
-    if (cpu_max_freq[i] == big_core_freq) {
-      big_core_ids->push_back(i);
-    }
-  }
-
-  return MaceStatus::MACE_SUCCESS;
-}
-
 MaceStatus SetOpenMPThreadsAndAffinityCPUs(int omp_num_threads,
-                                           const std::vector<int> &cpu_ids) {
+                                           const std::vector<size_t> &cpu_ids) {
   MaceOpenMPThreadCount = omp_num_threads;
 
 #ifdef MACE_ENABLE_OPENMP
   VLOG(1) << "Set OpenMP threads number: " << omp_num_threads
           << ", CPU core IDs: " << MakeString(cpu_ids);
+  omp_set_schedule(omp_sched_guided, 1);
   omp_set_num_threads(omp_num_threads);
 #else
   MACE_UNUSED(omp_num_threads);
@@ -174,55 +190,90 @@ MaceStatus SetOpenMPThreadsAndAffinityCPUs(int omp_num_threads,
 }  // namespace
 
 MaceStatus CPURuntime::SetOpenMPThreadsAndAffinityPolicy(
-    int omp_num_threads_hint,
+    int num_threads_hint,
     CPUAffinityPolicy policy,
     void *gemm_context) {
+  // get cpu frequency info
+  std::vector<float> cpu_max_freqs;
+  if (GetCPUMaxFreq(&cpu_max_freqs) == -1 || cpu_max_freqs.size() == 0) {
+    return MaceStatus::MACE_INVALID_ARGS;
+  }
+
+  std::vector<CPUFreq> cpu_freq(cpu_max_freqs.size());
+  for (size_t i = 0; i < cpu_max_freqs.size(); ++i) {
+    cpu_freq[i].core_id = i;
+    cpu_freq[i].freq = cpu_max_freqs[i];
+  }
+  if (policy == CPUAffinityPolicy::AFFINITY_POWER_SAVE ||
+      policy == CPUAffinityPolicy::AFFINITY_LITTLE_ONLY) {
+    std::sort(cpu_freq.begin(),
+              cpu_freq.end(),
+              [=](const CPUFreq &lhs, const CPUFreq &rhs) {
+                return lhs.freq < rhs.freq;
+              });
+  } else if (policy == CPUAffinityPolicy::AFFINITY_HIGH_PERFORMANCE ||
+             policy == CPUAffinityPolicy::AFFINITY_BIG_ONLY) {
+    std::sort(cpu_freq.begin(),
+              cpu_freq.end(),
+              [](const CPUFreq &lhs, const CPUFreq &rhs) {
+                return lhs.freq > rhs.freq;
+              });
+  }
+
+  int cpu_count = static_cast<int>(cpu_freq.size());
+  if (num_threads_hint <= 0 || num_threads_hint > cpu_count) {
+    num_threads_hint = cpu_count;
+  }
+
   if (policy == CPUAffinityPolicy::AFFINITY_NONE) {
 #ifdef MACE_ENABLE_QUANTIZE
     if (gemm_context) {
       static_cast<gemmlowp::GemmContext*>(gemm_context)->set_max_num_threads(
-          std::max(0, omp_num_threads_hint));
+          num_threads_hint);
     }
 #else
     MACE_UNUSED(gemm_context);
 #endif  // MACE_ENABLE_QUANTIZE
 #ifdef MACE_ENABLE_OPENMP
-    if (omp_num_threads_hint > 0) {
-      omp_set_num_threads(std::min(omp_num_threads_hint, omp_get_num_procs()));
-    }
+    omp_set_num_threads(num_threads_hint);
 #else
     LOG(WARNING) << "Set OpenMP threads number failed: OpenMP not enabled.";
 #endif
     return MaceStatus::MACE_SUCCESS;
   }
 
-  std::vector<int> big_core_ids;
-  std::vector<int> little_core_ids;
-  MaceStatus res = GetCPUBigLittleCoreIDs(&big_core_ids, &little_core_ids);
-  if (res != MaceStatus::MACE_SUCCESS) {
-    return res;
-  }
-
-  std::vector<int> use_cpu_ids;
-  if (policy == CPUAffinityPolicy::AFFINITY_BIG_ONLY) {
-    use_cpu_ids = std::move(big_core_ids);
+  // decide num of cores to use
+  int cores_to_use = 0;
+  if (policy == CPUAffinityPolicy::AFFINITY_BIG_ONLY
+      || policy == CPUAffinityPolicy::AFFINITY_LITTLE_ONLY) {
+    for (size_t i = 0; i < cpu_max_freqs.size(); ++i) {
+      if (cpu_freq[i].freq != cpu_freq[0].freq) {
+        break;
+      }
+      ++cores_to_use;
+    }
+    num_threads_hint = cores_to_use;
   } else {
-    use_cpu_ids = std::move(little_core_ids);
+    cores_to_use = num_threads_hint;
   }
 
-  if (omp_num_threads_hint <= 0 ||
-      omp_num_threads_hint > static_cast<int>(use_cpu_ids.size())) {
-    omp_num_threads_hint = use_cpu_ids.size();
+  VLOG(2) << "Use " << num_threads_hint << " threads";
+  std::vector<size_t> cpu_ids(cores_to_use);
+  for (int i = 0; i < cores_to_use; ++i) {
+    VLOG(2) << "Bind thread to core: " << cpu_freq[i].core_id << " with freq "
+            << cpu_freq[i].freq;
+    cpu_ids[i] = cpu_freq[i].core_id;
   }
 
 #ifdef MACE_ENABLE_QUANTIZE
   if (gemm_context) {
     static_cast<gemmlowp::GemmContext*>(gemm_context)->set_max_num_threads(
-        omp_num_threads_hint);
+        num_threads_hint);
   }
 #endif  // MACE_ENABLE_QUANTIZE
 
-  return SetOpenMPThreadsAndAffinityCPUs(omp_num_threads_hint, use_cpu_ids);
+  return SetOpenMPThreadsAndAffinityCPUs(num_threads_hint, cpu_ids);
 }
 
 }  // namespace mace
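The net effect: instead of classifying cores into exactly two big/little buckets, the runtime now sorts cores by maximum frequency in the direction the policy asks for and binds threads to the front of the sorted list. To make the selection concrete, here is the same logic run on a hypothetical 4-core big.LITTLE layout (frequencies are made-up values; the struct mirrors the one added in this commit):

#include <algorithm>
#include <cstdio>
#include <vector>

struct CPUFreq {
  size_t core_id;
  float freq;
};

int main() {
  // Hypothetical layout: cores 0-1 little (1.8), cores 2-3 big (2.4).
  std::vector<CPUFreq> cpu_freq = {
      {0, 1.8f}, {1, 1.8f}, {2, 2.4f}, {3, 2.4f}};

  // AFFINITY_BIG_ONLY / AFFINITY_HIGH_PERFORMANCE: fastest cores first.
  std::sort(cpu_freq.begin(), cpu_freq.end(),
            [](const CPUFreq &lhs, const CPUFreq &rhs) {
              return lhs.freq > rhs.freq;
            });

  // Use exactly the cores tied with the fastest one, as the patch does.
  int cores_to_use = 0;
  for (size_t i = 0; i < cpu_freq.size(); ++i) {
    if (cpu_freq[i].freq != cpu_freq[0].freq) break;
    ++cores_to_use;
  }

  for (int i = 0; i < cores_to_use; ++i) {
    printf("bind thread %d to core %zu\n", i, cpu_freq[i].core_id);
  }
  // Prints cores 2 and 3: the two 2.4 GHz big cores.
  return 0;
}

For AFFINITY_BIG_ONLY and AFFINITY_LITTLE_ONLY the thread count is forced to the size of that leading equal-frequency run; for the other policies num_threads_hint is kept and merely clamped to the core count.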
......
@@ -66,26 +66,26 @@ void DoActivation(const T *input_ptr,
     case NOOP:
       break;
     case RELU:
-#pragma omp parallel for
+#pragma omp parallel for schedule(runtime)
       for (index_t i = 0; i < size; ++i) {
        output_ptr[i] = std::max(input_ptr[i], static_cast<T>(0));
      }
      break;
    case RELUX:
-#pragma omp parallel for
+#pragma omp parallel for schedule(runtime)
      for (index_t i = 0; i < size; ++i) {
        output_ptr[i] = std::min(std::max(input_ptr[i], static_cast<T>(0)),
                                 static_cast<T>(relux_max_limit));
      }
      break;
    case TANH:
-#pragma omp parallel for
+#pragma omp parallel for schedule(runtime)
      for (index_t i = 0; i < size; ++i) {
        output_ptr[i] = std::tanh(input_ptr[i]);
      }
      break;
    case SIGMOID:
-#pragma omp parallel for
+#pragma omp parallel for schedule(runtime)
      for (index_t i = 0; i < size; ++i) {
        output_ptr[i] = 1 / (1 + std::exp(-input_ptr[i]));
      }
@@ -111,13 +111,13 @@ inline void DoActivation(const float *input_ptr,
      ReluxNeon(input_ptr, relux_max_limit, size, output_ptr);
      break;
    case TANH:
-#pragma omp parallel for
+#pragma omp parallel for schedule(runtime)
      for (index_t i = 0; i < size; ++i) {
        output_ptr[i] = std::tanh(input_ptr[i]);
      }
      break;
    case SIGMOID:
-#pragma omp parallel for
+#pragma omp parallel for schedule(runtime)
      for (index_t i = 0; i < size; ++i) {
        output_ptr[i] = 1 / (1 + std::exp(-input_ptr[i]));
      }
@@ -134,7 +134,7 @@ void PReLUActivation(const T *input_ptr,
                     const index_t inner_size,
                     const T *alpha_ptr,
                     T *output_ptr) {
-#pragma omp parallel for collapse(3)
+#pragma omp parallel for collapse(3) schedule(runtime)
  for (index_t i = 0; i < outer_size; ++i) {
    for (index_t chan_idx = 0; chan_idx < input_chan; ++chan_idx) {
      for (index_t j = 0; j < inner_size; ++j) {
......
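All remaining hunks make the same one-line change to each worksharing loop in the CPU kernels. Where a kernel iterates over several tensor dimensions, collapse(n) first fuses the n perfectly nested loops into a single iteration space, and schedule(runtime) then hands that space to the guided scheduler; a minimal sketch of the combination (shapes are invented):

#include <omp.h>
#include <vector>

int main() {
  omp_set_schedule(omp_sched_guided, 1);

  const int batch = 2, channels = 32, image_size = 56 * 56;
  std::vector<float> out(batch * channels * image_size, 0.f);

  // collapse(2) fuses the (b, c) loops into one 64-iteration space;
  // schedule(runtime) then distributes it in shrinking guided chunks.
#pragma omp parallel for collapse(2) schedule(runtime)
  for (int b = 0; b < batch; ++b) {
    for (int c = 0; c < channels; ++c) {
      float *base = out.data() + (b * channels + c) * image_size;
      for (int i = 0; i < image_size; ++i) {
        base[i] += 1.f;  // stand-in for per-channel kernel work
      }
    }
  }
  return 0;
}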
@@ -59,7 +59,7 @@ class ArgMaxOp : public Operation {
     index_t outer_size = output->size();
     index_t inner_size = input->dim(axis_value);
 
-#pragma omp parallel for
+#pragma omp parallel for schedule(runtime)
     for (index_t i = 0; i < outer_size; ++i) {
       int idx = 0;
       T max_value = std::numeric_limits<T>::lowest();
......
@@ -25,7 +25,7 @@ namespace ops {
 void ReluNeon(const float *input, const index_t size, float *output) {
 #if defined(MACE_ENABLE_NEON)
   float32x4_t vzero = vdupq_n_f32(0.f);
-#pragma omp parallel for
+#pragma omp parallel for schedule(runtime)
   for (index_t i = 0; i <= size - 4; i += 4) {
     float32x4_t v = vld1q_f32(input + i);
     v = vmaxq_f32(v, vzero);
@@ -36,7 +36,7 @@ void ReluNeon(const float *input, const index_t size, float *output) {
     output[i] = std::max(input[i], 0.f);
   }
 #else
-#pragma omp parallel for
+#pragma omp parallel for schedule(runtime)
   for (index_t i = 0; i < size; ++i) {
     output[i] = std::max(input[i], 0.f);
   }
@@ -48,7 +48,7 @@ void ReluxNeon(const float *input, const float limit,
 #if defined(MACE_ENABLE_NEON)
   float32x4_t vzero = vdupq_n_f32(0.f);
   float32x4_t vlimit = vdupq_n_f32(limit);
-#pragma omp parallel for
+#pragma omp parallel for schedule(runtime)
   for (index_t i = 0; i <= size - 4; i += 4) {
     float32x4_t v = vld1q_f32(input + i);
     v = vmaxq_f32(v, vzero);
@@ -60,7 +60,7 @@ void ReluxNeon(const float *input, const float limit,
     output[i] = std::min(std::max(input[i], 0.f), limit);
   }
 #else
-#pragma omp parallel for
+#pragma omp parallel for schedule(runtime)
   for (index_t i = 0; i < size; ++i) {
     output[i] = std::min(std::max(input[i], 0.f), limit);
   }
......
@@ -60,7 +60,7 @@ void Conv2dNeonK15x1S1(const float *input,
   const index_t tile_width =
       out_shape[1] < 4 ? RoundUpDiv4(out_shape[3]) : out_shape[3];
 
-#pragma omp parallel for collapse(3)
+#pragma omp parallel for collapse(3) schedule(runtime)
   for (index_t b = 0; b < out_shape[0]; ++b) {
     for (index_t m = 0; m < out_shape[1]; ++m) {
       for (index_t w = 0; w < out_shape[3]; w += tile_width) {
......
@@ -61,7 +61,7 @@ void Conv2dNeonK1x15S1(const float *input,
   const index_t tile_height =
       out_shape[1] < 4 ? RoundUpDiv4(out_shape[2]) : out_shape[2];
 
-#pragma omp parallel for collapse(3)
+#pragma omp parallel for collapse(3) schedule(runtime)
   for (index_t b = 0; b < out_shape[0]; ++b) {
     for (index_t m = 0; m < out_shape[1]; ++m) {
       for (index_t h = 0; h < out_shape[2]; h += tile_height) {
......
@@ -32,7 +32,7 @@ void Conv2dNeonK1x7S1(const float *input,
   const index_t in_batch_size = in_shape[1] * in_image_size;
   const index_t out_batch_size = out_shape[1] * out_image_size;
 
-#pragma omp parallel for collapse(2)
+#pragma omp parallel for collapse(2) schedule(runtime)
   for (index_t b = 0; b < out_shape[0]; ++b) {
     for (index_t m = 0; m < out_shape[1]; m += 4) {
       const index_t out_channels = out_shape[1];
......
@@ -33,7 +33,7 @@ void Conv2dNeonK3x3S1(const float *input,
   const index_t in_batch_size = in_shape[1] * in_image_size;
   const index_t out_batch_size = out_shape[1] * out_image_size;
 
-#pragma omp parallel for collapse(2)
+#pragma omp parallel for collapse(2) schedule(runtime)
   for (index_t b = 0; b < out_shape[0]; ++b) {
     for (index_t m = 0; m < out_shape[1]; m += 2) {
       const index_t out_channels = out_shape[1];
@@ -515,7 +515,7 @@ void Conv2dNeonK3x3S2(const float *input,
   const index_t in_batch_size = in_shape[1] * in_image_size;
   const index_t out_batch_size = out_shape[1] * out_image_size;
 
-#pragma omp parallel for collapse(2)
+#pragma omp parallel for collapse(2) schedule(runtime)
   for (index_t b = 0; b < out_shape[0]; ++b) {
     for (index_t m = 0; m < out_shape[1]; ++m) {
       for (index_t c = 0; c < in_shape[1]; ++c) {
......
@@ -87,7 +87,7 @@ void Conv2dNeonK5x5S1(const float *input,
   const index_t in_batch_size = in_shape[1] * in_image_size;
   const index_t out_batch_size = out_shape[1] * out_image_size;
 
-#pragma omp parallel for collapse(2)
+#pragma omp parallel for collapse(2) schedule(runtime)
   for (index_t b = 0; b < out_shape[0]; ++b) {
     for (index_t m = 0; m < out_shape[1]; m += 4) {
       const index_t out_channels = out_shape[1];
......
@@ -32,7 +32,7 @@ void Conv2dNeonK7x1S1(const float *input,
   const index_t in_batch_size = in_shape[1] * in_image_size;
   const index_t out_batch_size = out_shape[1] * out_image_size;
 
-#pragma omp parallel for collapse(2)
+#pragma omp parallel for collapse(2) schedule(runtime)
   for (index_t b = 0; b < out_shape[0]; ++b) {
     for (index_t m = 0; m < out_shape[1]; m += 4) {
       const index_t out_channels = out_shape[1];
......
@@ -164,7 +164,7 @@ void Conv2dNeonK7x7S1(const float *input,
   const index_t in_batch_size = in_shape[1] * in_image_size;
   const index_t out_batch_size = out_shape[1] * out_image_size;
 
-#pragma omp parallel for collapse(2)
+#pragma omp parallel for collapse(2) schedule(runtime)
   for (index_t b = 0; b < out_shape[0]; ++b) {
     for (index_t m = 0; m < out_shape[1]; m += 4) {
       const index_t out_channels = out_shape[1];
@@ -319,7 +319,7 @@ void Conv2dNeonK7x7S2(const float *input,
   const index_t in_batch_size = in_shape[1] * in_image_size;
   const index_t out_batch_size = out_shape[1] * out_image_size;
 
-#pragma omp parallel for collapse(2)
+#pragma omp parallel for collapse(2) schedule(runtime)
   for (index_t b = 0; b < out_shape[0]; ++b) {
     for (index_t m = 0; m < out_shape[1]; m += 4) {
       const index_t out_channels = out_shape[1];
@@ -484,7 +484,7 @@ void Conv2dNeonK7x7S3(const float *input,
   const index_t in_batch_size = in_shape[1] * in_image_size;
   const index_t out_batch_size = out_shape[1] * out_image_size;
 
-#pragma omp parallel for collapse(2)
+#pragma omp parallel for collapse(2) schedule(runtime)
   for (index_t b = 0; b < out_shape[0]; ++b) {
     for (index_t m = 0; m < out_shape[1]; m += 4) {
       const index_t out_channels = out_shape[1];
......
@@ -34,7 +34,7 @@ void TransformInput4x4(const float *input,
   const index_t input_batch_size = in_height_width * in_channels;
   const index_t output_batch_size = 16 * in_channels * tile_count;
 
-#pragma omp parallel for collapse(2)
+#pragma omp parallel for collapse(2) schedule(runtime)
   for (index_t n = 0; n < batch; ++n) {
     for (index_t c = 0; c < in_channels; ++c) {
       index_t tile_index = 0;
@@ -155,7 +155,7 @@ void TransformInput8x8(const float *input,
   const index_t input_batch_size = in_height_width * in_channels;
   const index_t output_batch_size = 64 * in_channels * tile_count;
 
-#pragma omp parallel for collapse(2)
+#pragma omp parallel for collapse(2) schedule(runtime)
   for (index_t n = 0; n < batch; ++n) {
     for (index_t c = 0; c < in_channels; ++c) {
      index_t tile_index = 0;
@@ -292,7 +292,7 @@ void TransformOutput4x4(const float *input,
   const index_t out_image_size = out_height * out_width;
   const index_t output_batch_size = out_channels * out_image_size;
 
-#pragma omp parallel for collapse(2)
+#pragma omp parallel for collapse(2) schedule(runtime)
   for (index_t n = 0; n < batch; ++n) {
     for (index_t m = 0; m < out_channels; ++m) {
       index_t tile_offset = 0;
@@ -388,7 +388,7 @@ void TransformOutput8x8(const float *input,
   const index_t out_image_size = out_height * out_width;
   const index_t output_batch_size = out_channels * out_image_size;
 
-#pragma omp parallel for collapse(2)
+#pragma omp parallel for collapse(2) schedule(runtime)
   for (index_t n = 0; n < batch; ++n) {
     for (index_t m = 0; m < out_channels; ++m) {
       index_t tile_offset = 0;
@@ -471,7 +471,7 @@ void TransformFilter4x4(const float *filter,
                         float *output) {
   const index_t stride = out_channels * in_channels;
 
-#pragma omp parallel for collapse(2)
+#pragma omp parallel for collapse(2) schedule(runtime)
   for (index_t m = 0; m < out_channels; ++m) {
     for (index_t c = 0; c < in_channels; ++c) {
       float g0, g1, g2, g3, g4, g5, g6, g7, g8;
@@ -573,7 +573,7 @@ void TransformFilter8x8(const float *filter,
       {1.0f / 45, -1.0f / 90, 1.0f / 180},
       {0.0f, 0.0f, 1.0f}};
 
-#pragma omp parallel for collapse(2)
+#pragma omp parallel for collapse(2) schedule(runtime)
   for (index_t m = 0; m < out_channels; ++m) {
     for (index_t c = 0; c < in_channels; ++c) {
       // load filter
@@ -720,7 +720,7 @@ void ConvRef3x3s1(const float *input,
   index_t out_height = in_height - 2;
   index_t out_width = in_width - 2;
 
-#pragma omp parallel for collapse(4)
+#pragma omp parallel for collapse(4) schedule(runtime)
   for (index_t b = 0; b < batch; ++b) {
     for (index_t m = 0; m < out_channels; ++m) {
       for (index_t h = 0; h < out_height; ++h) {
......
@@ -33,7 +33,7 @@ void Deconv2dNeonK2x2S1(const float *input,
   const index_t out_img_size = outh * outw;
 
-#pragma omp parallel for collapse(2)
+#pragma omp parallel for collapse(2) schedule(runtime)
   for (index_t b = 0; b < out_shape[0]; ++b) {
     for (index_t oc = 0; oc < outch; oc += 2) {
       if (oc + 1 < outch) {
@@ -199,7 +199,7 @@ void Deconv2dNeonK2x2S2(const float *input,
   const index_t outw = out_shape[3];
   const index_t out_img_size = outh * outw;
 
-#pragma omp parallel for collapse(2)
+#pragma omp parallel for collapse(2) schedule(runtime)
   for (index_t b = 0; b < out_shape[0]; ++b) {
     for (index_t oc = 0; oc < outch; ++oc) {
       float *out_base = output + (b * outch + oc) * out_img_size;
......
@@ -33,7 +33,7 @@ void Deconv2dNeonK3x3S1(const float *input,
   const index_t out_img_size = outh * outw;
 
-#pragma omp parallel for collapse(2)
+#pragma omp parallel for collapse(2) schedule(runtime)
   for (index_t b = 0; b < out_shape[0]; ++b) {
     for (index_t oc = 0; oc < outch; oc += 2) {
       if (oc + 1 < outch) {
@@ -293,7 +293,7 @@ void Deconv2dNeonK3x3S2(const float *input,
   const index_t outw = out_shape[3];
   const index_t out_img_size = outh * outw;
 
-#pragma omp parallel for collapse(2)
+#pragma omp parallel for collapse(2) schedule(runtime)
   for (index_t b = 0; b < out_shape[0]; ++b) {
     for (index_t oc = 0; oc < outch; ++oc) {
       float *out_base = output + (b * outch + oc) * out_img_size;
......
@@ -31,7 +31,7 @@ void Deconv2dNeonK4x4S1(const float *input,
   const index_t outw = out_shape[3];
   const index_t outch = out_shape[1];
   const index_t out_img_size = outh * outw;
-#pragma omp parallel for collapse(2)
+#pragma omp parallel for collapse(2) schedule(runtime)
   for (index_t b = 0; b < out_shape[0]; ++b) {
     for (index_t oc = 0; oc < outch; oc += 2) {
       if (oc + 1 < outch) {
@@ -386,7 +386,7 @@ void Deconv2dNeonK4x4S2(const float *input,
   const index_t outch = out_shape[1];
   const index_t out_img_size = outh * outw;
 
-#pragma omp parallel for collapse(2)
+#pragma omp parallel for collapse(2) schedule(runtime)
   for (index_t b = 0; b < out_shape[0]; ++b) {
     for (index_t p = 0; p < outch; p++) {
       float *out_base = output + (b * outch + p) * out_img_size;
......
@@ -70,7 +70,7 @@ void DepthwiseConv2dNeonK3x3S1(const float *input,
   const index_t in_batch_size = in_shape[1] * in_image_size;
   const index_t out_batch_size = out_shape[1] * out_image_size;
 
-#pragma omp parallel for collapse(2)
+#pragma omp parallel for collapse(2) schedule(runtime)
   for (index_t b = 0; b < in_shape[0]; ++b) {
     for (index_t m = 0; m < out_shape[1]; ++m) {
       index_t c = m / multiplier;
@@ -250,7 +250,7 @@ void DepthwiseConv2dNeonK3x3S2(const float *input,
   const index_t in_batch_size = in_shape[1] * in_image_size;
   const index_t out_batch_size = out_shape[1] * out_image_size;
 
-#pragma omp parallel for collapse(2)
+#pragma omp parallel for collapse(2) schedule(runtime)
   for (index_t b = 0; b < in_shape[0]; ++b) {
     for (index_t m = 0; m < out_shape[1]; ++m) {
       index_t c = m / multiplier;
......
@@ -32,7 +32,7 @@ void DepthwiseDeconv2dNeonK3x3S1(const float *input,
   const index_t outw = out_shape[3];
   const index_t out_img_size = outh * outw;
 
-#pragma omp parallel for collapse(2)
+#pragma omp parallel for collapse(2) schedule(runtime)
   for (index_t b = 0; b < out_shape[0]; ++b) {
     for (index_t c = 0; c < channels; ++c) {
       const index_t offset = b * channels + c;
@@ -137,7 +137,7 @@ void DepthwiseDeconv2dNeonK3x3S2(const float *input,
   const index_t outw = out_shape[3];
   const index_t out_img_size = outh * outw;
 
-#pragma omp parallel for collapse(2)
+#pragma omp parallel for collapse(2) schedule(runtime)
   for (index_t b = 0; b < out_shape[0]; ++b) {
     for (index_t c = 0; c < channels; ++c) {
       const index_t offset = b * channels + c;
@@ -251,7 +251,7 @@ void GroupDeconv2dNeonK3x3S1(const float *input,
   const index_t inch_g = inch / group;
   const index_t outch_g = outch / group;
 
-#pragma omp parallel for collapse(3)
+#pragma omp parallel for collapse(3) schedule(runtime)
   for (index_t b = 0; b < out_shape[0]; ++b) {
     for (int g = 0; g < group; ++g) {
       for (index_t oc = 0; oc < outch_g; oc += 2) {
@@ -525,7 +525,7 @@ void GroupDeconv2dNeonK3x3S2(const float *input,
   const index_t inch_g = inch / group;
   const index_t outch_g = outch / group;
 
-#pragma omp parallel for collapse(3)
+#pragma omp parallel for collapse(3) schedule(runtime)
   for (index_t b = 0; b < out_shape[0]; ++b) {
     for (int g = 0; g < group; ++g) {
       for (index_t oc = 0; oc < outch_g; ++oc) {
......
@@ -33,7 +33,7 @@ void DepthwiseDeconv2dNeonK4x4S1(const float *input,
   const index_t outw = out_shape[3];
   const index_t out_img_size = outh * outw;
 
-#pragma omp parallel for collapse(2)
+#pragma omp parallel for collapse(2) schedule(runtime)
   for (index_t b = 0; b < batch; ++b) {
     for (index_t c = 0; c < channels; ++c) {
       const index_t offset = b * channels + c;
@@ -169,7 +169,7 @@ void DepthwiseDeconv2dNeonK4x4S2(const float *input,
   const index_t outw = out_shape[3];
   const index_t out_img_size = outh * outw;
 
-#pragma omp parallel for collapse(2)
+#pragma omp parallel for collapse(2) schedule(runtime)
   for (index_t b = 0; b < out_shape[0]; ++b) {
     for (index_t c = 0; c < channels; ++c) {
       const index_t offset = b * channels + c;
@@ -304,7 +304,7 @@ void GroupDeconv2dNeonK4x4S1(const float *input,
   const index_t inch_g = inch / group;
   const index_t outch_g = outch / group;
 
-#pragma omp parallel for collapse(3)
+#pragma omp parallel for collapse(3) schedule(runtime)
   for (index_t b = 0; b < out_shape[0]; ++b) {
     for (int g = 0; g < group; ++g) {
      for (index_t oc = 0; oc < outch_g; oc += 2) {
@@ -679,7 +679,7 @@ void GroupDeconv2dNeonK4x4S2(const float *input,
   const index_t inch_g = inch / group;
   const index_t outch_g = outch / group;
 
-#pragma omp parallel for collapse(3)
+#pragma omp parallel for collapse(3) schedule(runtime)
   for (index_t b = 0; b < out_shape[0]; ++b) {
     for (int g = 0; g < group; ++g) {
       for (index_t oc = 0; oc < outch_g; oc++) {
......
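Loops like the deconvolution kernels above do unequal work per (batch, channel) iteration (paired-channel fast paths, boundary tiles), which is the case guided scheduling targets: static hands each thread a fixed slice up front, while guided lets threads that finish early pick up the remaining chunks. A toy comparison, independent of MACE (timings will vary by machine):

#include <omp.h>
#include <cstdio>

// Per-iteration cost grows with the index: a stand-in for the uneven
// per-channel work in the kernels above.
static double work(int i) {
  double acc = 0.0;
  for (int k = 0; k < i * 1000; ++k) acc += k * 1e-9;
  return acc;
}

static void timed_run(const char *name, omp_sched_t sched) {
  omp_set_schedule(sched, 1);
  double sum = 0.0;
  double t0 = omp_get_wtime();
#pragma omp parallel for schedule(runtime) reduction(+ : sum)
  for (int i = 0; i < 2000; ++i) sum += work(i);
  printf("%s: %.3fs (checksum %g)\n", name, omp_get_wtime() - t0, sum);
}

int main() {
  timed_run("static", omp_sched_static);
  timed_run("guided", omp_sched_guided);
  return 0;
}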
@@ -124,7 +124,7 @@ class BatchToSpaceNDOp<DeviceType::CPU, float> : public BatchToSpaceOpBase {
         std::max(static_cast<index_t>(1), 8 * 1024 / block_shape_w / out_width);
 
     // make channel outter loop so we can make best use of cache
-#pragma omp parallel for collapse(3)
+#pragma omp parallel for collapse(3) schedule(runtime)
     for (index_t c = 0; c < channels; ++c) {
       for (index_t block_h = 0; block_h < in_height;
            block_h += block_h_size) {
@@ -213,7 +213,7 @@ class BatchToSpaceNDOp<DeviceType::CPU, uint8_t> : public BatchToSpaceOpBase {
     index_t out_width = space_tensor->dim(2);
     index_t channels = space_tensor->dim(3);
 
-#pragma omp parallel for
+#pragma omp parallel for schedule(runtime)
     for (index_t in_b = 0; in_b < in_batches; ++in_b) {
       const index_t b = in_b % out_batches;
       const index_t tile_index = in_b / out_batches;
......
@@ -55,7 +55,7 @@ class ChannelShuffleOp<DeviceType::CPU, T> : public Operation {
     index_t batch_size = channels * image_size;
     index_t channels_per_group = channels / groups_;
 
-#pragma omp parallel for collapse(2)
+#pragma omp parallel for collapse(2) schedule(runtime)
     for (index_t b = 0; b < batch; ++b) {
       for (index_t c = 0; c < channels; ++c) {
         const T *input_base = input_ptr + b * batch_size;
......
@@ -475,7 +475,7 @@ class Conv2dOp<DeviceType::CPU, float> : public ConvPool2dOpBase {
 
       // unpack output
       if (extra_output_height != height || extra_output_width != width) {
-#pragma omp parallel for collapse(2)
+#pragma omp parallel for collapse(2) schedule(runtime)
        for (index_t b = 0; b < batch; ++b) {
          for (index_t c = 0; c < channels; ++c) {
            for (index_t h = 0; h < height; ++h) {
@@ -494,7 +494,7 @@ class Conv2dOp<DeviceType::CPU, float> : public ConvPool2dOpBase {
 
     if (bias_data != nullptr) {
       const index_t image_size = height * width;
-#pragma omp parallel for collapse(2)
+#pragma omp parallel for collapse(2) schedule(runtime)
       for (index_t b = 0; b < batch; ++b) {
         for (index_t c = 0; c < channels; ++c) {
           float *output_ptr = output_data + (b * channels + c) * image_size;
@@ -539,7 +539,7 @@ class Conv2dOp<DeviceType::CPU, float> : public ConvPool2dOpBase {
     const index_t out_batch_size = filter_shape[0] * out_image_size;
     const index_t filter_size = filter_shape[2] * filter_shape[3];
 
-#pragma omp parallel for collapse(2)
+#pragma omp parallel for collapse(2) schedule(runtime)
     for (index_t b = 0; b < in_shape[0]; b++) {
       for (index_t m = 0; m < filter_shape[0]; m += 4) {
         const index_t in_width = in_shape[3];
@@ -867,7 +867,7 @@ class Conv2dOp<DeviceType::CPU, uint8_t> : public ConvPool2dOpBase {
     const index_t input_row_size = in_shape[2] * in_shape[3];
     const index_t patch_row_size = filter_w * in_shape[3];
 
-#pragma omp parallel for collapse(3)
+#pragma omp parallel for collapse(3) schedule(runtime)
     for (index_t b = 0; b < out_shape[0]; ++b) {
       for (index_t h = 0; h < out_shape[1]; ++h) {
         for (index_t w = 0; w < out_shape[2]; ++w) {
......
@@ -395,7 +395,7 @@ MaceStatus ConstructNCHWInputWithSpecificPadding(const Tensor *input_tensor,
   const index_t in_batch_size = channels * in_image_size;
   const index_t out_batch_size = channels * out_image_size;
 
-#pragma omp parallel for collapse(2)
+#pragma omp parallel for collapse(2) schedule(runtime)
   for (int i = 0; i < batch; ++i) {
     for (int j = 0; j < channels; ++j) {
      for (int k = 0; k < height; ++k) {
@@ -443,7 +443,7 @@ MaceStatus ConstructNHWCInputWithPadding(const Tensor *input_tensor,
   if (padding_same_value) {
     LOG(FATAL) << "Not implemented";
   } else {
-#pragma omp parallel for collapse(3)
+#pragma omp parallel for collapse(3) schedule(runtime)
     for (int n = 0; n < batch; ++n) {
       for (int h = 0; h < height; ++h) {
         for (int w = 0; w < width; ++w) {
......
@@ -276,7 +276,7 @@ class Deconv2dOp<DeviceType::CPU, float> : public Deconv2dOpBase {
       const index_t batch = out_shape[0];
       const index_t channels = out_shape[1];
       const index_t img_size = out_shape[2] * out_shape[3];
-#pragma omp parallel for collapse(3)
+#pragma omp parallel for collapse(3) schedule(runtime)
       for (index_t b = 0; b < batch; ++b) {
         for (index_t c = 0; c < channels; ++c) {
           for (index_t i = 0; i < img_size; ++i) {
@@ -324,7 +324,7 @@ class Deconv2dOp<DeviceType::CPU, float> : public Deconv2dOpBase {
     const index_t out_channels = out_shape[1];
     const index_t in_channels = in_shape[1];
 
-#pragma omp parallel for collapse(2)
+#pragma omp parallel for collapse(2) schedule(runtime)
     for (int b = 0; b < batch; ++b) {
      for (int oc = 0; oc < out_channels; ++oc) {
        float *out_base =
......
@@ -57,7 +57,7 @@ class DepthToSpaceOp : public Operation {
     const T *input_ptr = input->data<T>();
     T *output_ptr = output->mutable_data<T>();
 
-#pragma omp parallel for
+#pragma omp parallel for schedule(runtime)
     for (index_t b = 0; b < batch_size; ++b) {
       for (index_t d = 0; d < output_depth; ++d) {
         for (index_t h = 0; h < output_height; ++h) {
......
@@ -201,7 +201,7 @@ inline void TensorBroadcastEltwise(const EltwiseType type,
   switch (type) {
     case SUM:
       if (coeff.empty()) {
-#pragma omp parallel for collapse(2)
+#pragma omp parallel for collapse(2) schedule(runtime)
         for (index_t d = 0; d < diff_size; ++d) {
           for (index_t i = 0; i < common_size; ++i) {
             output[i + d * common_size] =
@@ -213,7 +213,7 @@ inline void TensorBroadcastEltwise(const EltwiseType type,
         if (swapped) {
           std::swap(coeff_copy[0], coeff_copy[1]);
         }
-#pragma omp parallel for collapse(2)
+#pragma omp parallel for collapse(2) schedule(runtime)
         for (index_t d = 0; d < diff_size; ++d) {
           for (index_t i = 0; i < common_size; ++i) {
             output[i + d * common_size] =
@@ -225,7 +225,7 @@ inline void TensorBroadcastEltwise(const EltwiseType type,
       break;
     case SUB:
       if (!swapped) {
-#pragma omp parallel for collapse(2)
+#pragma omp parallel for collapse(2) schedule(runtime)
        for (index_t d = 0; d < diff_size; ++d) {
          for (index_t i = 0; i < common_size; ++i) {
            output[i + d * common_size] =
@@ -233,7 +233,7 @@ inline void TensorBroadcastEltwise(const EltwiseType type,
          }
        }
      } else {
-#pragma omp parallel for collapse(2)
+#pragma omp parallel for collapse(2) schedule(runtime)
        for (index_t d = 0; d < diff_size; ++d) {
          for (index_t i = 0; i < common_size; ++i) {
            output[i + d * common_size] =
@@ -243,7 +243,7 @@ inline void TensorBroadcastEltwise(const EltwiseType type,
      }
      break;
    case PROD:
-#pragma omp parallel for collapse(2)
+#pragma omp parallel for collapse(2) schedule(runtime)
      for (index_t d = 0; d < diff_size; ++d) {
        for (index_t i = 0; i < common_size; ++i) {
          output[i + d * common_size] = input0[i + d * common_size] * input1[i];
@@ -252,7 +252,7 @@ inline void TensorBroadcastEltwise(const EltwiseType type,
      break;
    case DIV:
      if (!swapped) {
-#pragma omp parallel for collapse(2)
+#pragma omp parallel for collapse(2) schedule(runtime)
        for (index_t d = 0; d < diff_size; ++d) {
          for (index_t i = 0; i < common_size; ++i) {
            output[i + d * common_size] =
@@ -260,7 +260,7 @@ inline void TensorBroadcastEltwise(const EltwiseType type,
          }
        }
      } else {
-#pragma omp parallel for collapse(2)
+#pragma omp parallel for collapse(2) schedule(runtime)
        for (index_t d = 0; d < diff_size; ++d) {
          for (index_t i = 0; i < common_size; ++i) {
            output[i + d * common_size] =
@@ -270,7 +270,7 @@ inline void TensorBroadcastEltwise(const EltwiseType type,
      }
      break;
    case MIN:
-#pragma omp parallel for collapse(2)
+#pragma omp parallel for collapse(2) schedule(runtime)
      for (index_t d = 0; d < diff_size; ++d) {
        for (index_t i = 0; i < common_size; ++i) {
          output[i + d * common_size] =
@@ -279,7 +279,7 @@ inline void TensorBroadcastEltwise(const EltwiseType type,
      }
      break;
    case MAX:
-#pragma omp parallel for collapse(2)
+#pragma omp parallel for collapse(2) schedule(runtime)
      for (index_t d = 0; d < diff_size; ++d) {
        for (index_t i = 0; i < common_size; ++i) {
          output[i + d * common_size] =
@@ -288,7 +288,7 @@ inline void TensorBroadcastEltwise(const EltwiseType type,
      }
      break;
    case SQR_DIFF:
-#pragma omp parallel for collapse(2)
+#pragma omp parallel for collapse(2) schedule(runtime)
      for (index_t d = 0; d < diff_size; ++d) {
        for (index_t i = 0; i < common_size; ++i) {
          output[i + d * common_size] =
@@ -298,7 +298,7 @@ inline void TensorBroadcastEltwise(const EltwiseType type,
      break;
    case POW:
      if (!swapped) {
-#pragma omp parallel for collapse(2)
+#pragma omp parallel for collapse(2) schedule(runtime)
        for (index_t d = 0; d < diff_size; ++d) {
          for (index_t i = 0; i < common_size; ++i) {
            output[i + d * common_size] =
@@ -306,7 +306,7 @@ inline void TensorBroadcastEltwise(const EltwiseType type,
          }
        }
      } else {
-#pragma omp parallel for collapse(2)
+#pragma omp parallel for collapse(2) schedule(runtime)
        for (index_t d = 0; d < diff_size; ++d) {
          for (index_t i = 0; i < common_size; ++i) {
            output[i + d * common_size] =
@@ -316,19 +316,19 @@ inline void TensorBroadcastEltwise(const EltwiseType type,
      }
      break;
    case NEG:
-#pragma omp parallel for
+#pragma omp parallel for schedule(runtime)
      for (index_t i = 0; i < diff_size * common_size; ++i) {
        output[i] = -input0[i];
      }
      break;
    case ABS:
-#pragma omp parallel for
+#pragma omp parallel for schedule(runtime)
      for (index_t i = 0; i < diff_size * common_size; ++i) {
        output[i] = std::fabs(input0[i]);
      }
      break;
    case EQUAL:
-#pragma omp parallel for collapse(2)
+#pragma omp parallel for collapse(2) schedule(runtime)
      for (index_t d = 0; d < diff_size; ++d) {
        for (index_t i = 0; i < common_size; ++i) {
          output[i + d * common_size] =
@@ -353,7 +353,7 @@ inline void TensorEltwise(const EltwiseType type,
   switch (type) {
     case SUM:
       if (coeff.empty()) {
-#pragma omp parallel for
+#pragma omp parallel for schedule(runtime)
         for (index_t i = 0; i < size; ++i) {
           output[i] = input0[i] + input1[i];
         }
@@ -363,7 +363,7 @@ inline void TensorEltwise(const EltwiseType type,
        if (swapped) {
          std::swap(coeff_copy[0], coeff_copy[1]);
        }
-#pragma omp parallel for
+#pragma omp parallel for schedule(runtime)
        for (index_t i = 0; i < size; ++i) {
          output[i] = input0[i] * coeff_copy[0] + input1[i] * coeff_copy[1];
        }
@@ -371,20 +371,20 @@ inline void TensorEltwise(const EltwiseType type,
      }
      break;
    case SUB:
      if (!swapped) {
-#pragma omp parallel for
+#pragma omp parallel for schedule(runtime)
        for (index_t i = 0; i < size; ++i) {
          output[i] = input0[i] - input1[i];
        }
      } else {
-#pragma omp parallel for
+#pragma omp parallel for schedule(runtime)
        for (index_t i = 0; i < size; ++i) {
          output[i] = input1[i] - input0[i];
        }
      }
      break;
    case PROD:
-#pragma omp parallel for
+#pragma omp parallel for schedule(runtime)
      for (index_t i = 0; i < size; ++i) {
        output[i] = input0[i] * input1[i];
      }
@@ -392,34 +392,34 @@ inline void TensorEltwise(const EltwiseType type,
      break;
    case DIV:
      if (!swapped) {
-#pragma omp parallel for
+#pragma omp parallel for schedule(runtime)
        for (index_t i = 0; i < size; ++i) {
          output[i] = input0[i] / input1[i];
        }
      } else {
-#pragma omp parallel for
+#pragma omp parallel for schedule(runtime)
        for (index_t i = 0; i < size; ++i) {
          output[i] = input1[i] / input0[i];
        }
      }
      break;
    case MIN:
-#pragma omp parallel for
+#pragma omp parallel for schedule(runtime)
      for (index_t i = 0; i < size; ++i) {
        output[i] = std::min(input0[i], input1[i]);
      }
      break;
    case MAX:
-#pragma omp parallel for
+#pragma omp parallel for schedule(runtime)
      for (index_t i = 0; i < size; ++i) {
        output[i] = std::max(input0[i], input1[i]);
      }
      break;
    case SQR_DIFF:
-#pragma omp parallel for
+#pragma omp parallel for schedule(runtime)
      for (index_t i = 0; i < size; ++i) {
        output[i] = std::pow(input0[i] - input1[i], 2.f);
      }
@@ -427,7 +427,7 @@ inline void TensorEltwise(const EltwiseType type,
      break;
    case POW:
      if (!swapped) {
-#pragma omp parallel for
+#pragma omp parallel for schedule(runtime)
        for (index_t i = 0; i < size; ++i) {
          output[i] = std::pow(input0[i], input1[i]);
        }
@@ -438,19 +438,19 @@ inline void TensorEltwise(const EltwiseType type,
      }
      break;
    case NEG:
-#pragma omp parallel for
+#pragma omp parallel for schedule(runtime)
      for (index_t i = 0; i < size; ++i) {
        output[i] = -input0[i];
      }
      break;
    case ABS:
-#pragma omp parallel for
+#pragma omp parallel for schedule(runtime)
      for (index_t i = 0; i < size; ++i) {
        output[i] = std::fabs(input0[i]);
      }
      break;
    case EQUAL:
-#pragma omp parallel for
+#pragma omp parallel for schedule(runtime)
      for (index_t i = 0; i < size; ++i) {
        output[i] = input0[i] == input1[i];
      }
@@ -472,7 +472,7 @@ inline void TensorScalarEltwise(const EltwiseType type,
   switch (type) {
     case SUM:
       if (coeff.empty()) {
-#pragma omp parallel for
+#pragma omp parallel for schedule(runtime)
        for (index_t i = 0; i < size; ++i) {
          output[i] = input0[i] + input1;
        }
@@ -482,7 +482,7 @@ inline void TensorScalarEltwise(const EltwiseType type,
        if (swapped) {
          std::swap(coeff_copy[0], coeff_copy[1]);
        }
-#pragma omp parallel for
+#pragma omp parallel for schedule(runtime)
        for (index_t i = 0; i < size; ++i) {
          output[i] = input0[i] * coeff_copy[0] + input1 * coeff_copy[1];
        }
@@ -490,20 +490,20 @@ inline void TensorScalarEltwise(const EltwiseType type,
      }
      break;
    case SUB:
      if (!swapped) {
-#pragma omp parallel for
+#pragma omp parallel for schedule(runtime)
        for (index_t i = 0; i < size; ++i) {
          output[i] = input0[i] - input1;
        }
      } else {
-#pragma omp parallel for
+#pragma omp parallel for schedule(runtime)
        for (index_t i = 0; i < size; ++i) {
          output[i] = input1 - input0[i];
        }
      }
      break;
    case PROD:
-#pragma omp parallel for
+#pragma omp parallel for schedule(runtime)
      for (index_t i = 0; i < size; ++i) {
        output[i] = input0[i] * input1;
      }
@@ -511,34 +511,34 @@ inline void TensorScalarEltwise(const EltwiseType type,
      break;
    case DIV:
      if (!swapped) {
-#pragma omp parallel for
+#pragma omp parallel for schedule(runtime)
        for (index_t i = 0; i < size; ++i) {
          output[i] = input0[i] / input1;
        }
      } else {
-#pragma omp parallel for
+#pragma omp parallel for schedule(runtime)
for (index_t i = 0; i < size; ++i) { for (index_t i = 0; i < size; ++i) {
output[i] = input1 / input0[i]; output[i] = input1 / input0[i];
} }
} }
break; break;
case MIN: case MIN:
#pragma omp parallel for #pragma omp parallel for schedule(runtime)
for (index_t i = 0; i < size; ++i) { for (index_t i = 0; i < size; ++i) {
output[i] = std::min(input0[i], input1); output[i] = std::min(input0[i], input1);
} }
break; break;
case MAX: case MAX:
#pragma omp parallel for #pragma omp parallel for schedule(runtime)
for (index_t i = 0; i < size; ++i) { for (index_t i = 0; i < size; ++i) {
output[i] = std::max(input0[i], input1); output[i] = std::max(input0[i], input1);
} }
break; break;
case SQR_DIFF: case SQR_DIFF:
#pragma omp parallel for #pragma omp parallel for schedule(runtime)
for (index_t i = 0; i < size; ++i) { for (index_t i = 0; i < size; ++i) {
output[i] = std::pow(input0[i] - input1, 2.f); output[i] = std::pow(input0[i] - input1, 2.f);
} }
...@@ -546,7 +546,7 @@ inline void TensorScalarEltwise(const EltwiseType type, ...@@ -546,7 +546,7 @@ inline void TensorScalarEltwise(const EltwiseType type,
break; break;
case POW: case POW:
if (!swapped) { if (!swapped) {
#pragma omp parallel for #pragma omp parallel for schedule(runtime)
for (index_t i = 0; i < size; ++i) { for (index_t i = 0; i < size; ++i) {
output[i] = std::pow(input0[i], input1); output[i] = std::pow(input0[i], input1);
} }
...@@ -557,19 +557,19 @@ inline void TensorScalarEltwise(const EltwiseType type, ...@@ -557,19 +557,19 @@ inline void TensorScalarEltwise(const EltwiseType type,
} }
break; break;
case NEG: case NEG:
#pragma omp parallel for #pragma omp parallel for schedule(runtime)
for (index_t i = 0; i < size; ++i) { for (index_t i = 0; i < size; ++i) {
output[i] = -input0[i]; output[i] = -input0[i];
} }
break; break;
case ABS: case ABS:
#pragma omp parallel for #pragma omp parallel for schedule(runtime)
for (index_t i = 0; i < size; ++i) { for (index_t i = 0; i < size; ++i) {
output[i] = std::fabs(input0[i]); output[i] = std::fabs(input0[i]);
} }
break; break;
case EQUAL: case EQUAL:
#pragma omp parallel for #pragma omp parallel for schedule(runtime)
for (index_t i = 0; i < size; ++i) { for (index_t i = 0; i < size; ++i) {
output[i] = input0[i] == input1; output[i] = input0[i] == input1;
} }
...@@ -594,7 +594,7 @@ inline void TensorEltwisePerChannel(const EltwiseType type, ...@@ -594,7 +594,7 @@ inline void TensorEltwisePerChannel(const EltwiseType type,
switch (type) { switch (type) {
case SUM: case SUM:
if (coeff.empty()) { if (coeff.empty()) {
#pragma omp parallel for collapse(2) #pragma omp parallel for collapse(2) schedule(runtime)
for (index_t b = 0; b < batch0; ++b) { for (index_t b = 0; b < batch0; ++b) {
for (index_t c = 0; c < channel; ++c) { for (index_t c = 0; c < channel; ++c) {
const T *in0_ptr = input0 + ((b * channel) + c) * image_size; const T *in0_ptr = input0 + ((b * channel) + c) * image_size;
...@@ -610,7 +610,7 @@ inline void TensorEltwisePerChannel(const EltwiseType type, ...@@ -610,7 +610,7 @@ inline void TensorEltwisePerChannel(const EltwiseType type,
if (swapped) { if (swapped) {
std::swap(coeff_copy[0], coeff_copy[1]); std::swap(coeff_copy[0], coeff_copy[1]);
} }
#pragma omp parallel for collapse(2) #pragma omp parallel for collapse(2) schedule(runtime)
for (index_t b = 0; b < batch0; ++b) { for (index_t b = 0; b < batch0; ++b) {
for (index_t c = 0; c < channel; ++c) { for (index_t c = 0; c < channel; ++c) {
const T *in0_ptr = input0 + ((b * channel) + c) * image_size; const T *in0_ptr = input0 + ((b * channel) + c) * image_size;
...@@ -626,7 +626,7 @@ inline void TensorEltwisePerChannel(const EltwiseType type, ...@@ -626,7 +626,7 @@ inline void TensorEltwisePerChannel(const EltwiseType type,
break; break;
case SUB: case SUB:
if (!swapped) { if (!swapped) {
#pragma omp parallel for collapse(2) #pragma omp parallel for collapse(2) schedule(runtime)
for (index_t b = 0; b < batch0; ++b) { for (index_t b = 0; b < batch0; ++b) {
for (index_t c = 0; c < channel; ++c) { for (index_t c = 0; c < channel; ++c) {
const T *in0_ptr = input0 + ((b * channel) + c) * image_size; const T *in0_ptr = input0 + ((b * channel) + c) * image_size;
...@@ -638,7 +638,7 @@ inline void TensorEltwisePerChannel(const EltwiseType type, ...@@ -638,7 +638,7 @@ inline void TensorEltwisePerChannel(const EltwiseType type,
} }
} }
} else { } else {
#pragma omp parallel for collapse(2) #pragma omp parallel for collapse(2) schedule(runtime)
for (index_t b = 0; b < batch0; ++b) { for (index_t b = 0; b < batch0; ++b) {
for (index_t c = 0; c < channel; ++c) { for (index_t c = 0; c < channel; ++c) {
const T *in0_ptr = input0 + ((b * channel) + c) * image_size; const T *in0_ptr = input0 + ((b * channel) + c) * image_size;
...@@ -652,7 +652,7 @@ inline void TensorEltwisePerChannel(const EltwiseType type, ...@@ -652,7 +652,7 @@ inline void TensorEltwisePerChannel(const EltwiseType type,
} }
break; break;
case PROD: case PROD:
#pragma omp parallel for collapse(2) #pragma omp parallel for collapse(2) schedule(runtime)
for (index_t b = 0; b < batch0; ++b) { for (index_t b = 0; b < batch0; ++b) {
for (index_t c = 0; c < channel; ++c) { for (index_t c = 0; c < channel; ++c) {
const T *in0_ptr = input0 + ((b * channel) + c) * image_size; const T *in0_ptr = input0 + ((b * channel) + c) * image_size;
...@@ -666,7 +666,7 @@ inline void TensorEltwisePerChannel(const EltwiseType type, ...@@ -666,7 +666,7 @@ inline void TensorEltwisePerChannel(const EltwiseType type,
break; break;
case DIV: case DIV:
if (!swapped) { if (!swapped) {
#pragma omp parallel for collapse(2) #pragma omp parallel for collapse(2) schedule(runtime)
for (index_t b = 0; b < batch0; ++b) { for (index_t b = 0; b < batch0; ++b) {
for (index_t c = 0; c < channel; ++c) { for (index_t c = 0; c < channel; ++c) {
const T *in0_ptr = input0 + ((b * channel) + c) * image_size; const T *in0_ptr = input0 + ((b * channel) + c) * image_size;
...@@ -678,7 +678,7 @@ inline void TensorEltwisePerChannel(const EltwiseType type, ...@@ -678,7 +678,7 @@ inline void TensorEltwisePerChannel(const EltwiseType type,
} }
} }
} else { } else {
#pragma omp parallel for collapse(2) #pragma omp parallel for collapse(2) schedule(runtime)
for (index_t b = 0; b < batch0; ++b) { for (index_t b = 0; b < batch0; ++b) {
for (index_t c = 0; c < channel; ++c) { for (index_t c = 0; c < channel; ++c) {
const T *in0_ptr = input0 + ((b * channel) + c) * image_size; const T *in0_ptr = input0 + ((b * channel) + c) * image_size;
...@@ -692,7 +692,7 @@ inline void TensorEltwisePerChannel(const EltwiseType type, ...@@ -692,7 +692,7 @@ inline void TensorEltwisePerChannel(const EltwiseType type,
} }
break; break;
case MIN: case MIN:
#pragma omp parallel for collapse(2) #pragma omp parallel for collapse(2) schedule(runtime)
for (index_t b = 0; b < batch0; ++b) { for (index_t b = 0; b < batch0; ++b) {
for (index_t c = 0; c < channel; ++c) { for (index_t c = 0; c < channel; ++c) {
const T *in0_ptr = input0 + ((b * channel) + c) * image_size; const T *in0_ptr = input0 + ((b * channel) + c) * image_size;
...@@ -705,7 +705,7 @@ inline void TensorEltwisePerChannel(const EltwiseType type, ...@@ -705,7 +705,7 @@ inline void TensorEltwisePerChannel(const EltwiseType type,
} }
break; break;
case MAX: case MAX:
#pragma omp parallel for collapse(2) #pragma omp parallel for collapse(2) schedule(runtime)
for (index_t b = 0; b < batch0; ++b) { for (index_t b = 0; b < batch0; ++b) {
for (index_t c = 0; c < channel; ++c) { for (index_t c = 0; c < channel; ++c) {
const T *in0_ptr = input0 + ((b * channel) + c) * image_size; const T *in0_ptr = input0 + ((b * channel) + c) * image_size;
...@@ -718,7 +718,7 @@ inline void TensorEltwisePerChannel(const EltwiseType type, ...@@ -718,7 +718,7 @@ inline void TensorEltwisePerChannel(const EltwiseType type,
} }
break; break;
case SQR_DIFF: case SQR_DIFF:
#pragma omp parallel for collapse(2) #pragma omp parallel for collapse(2) schedule(runtime)
for (index_t b = 0; b < batch0; ++b) { for (index_t b = 0; b < batch0; ++b) {
for (index_t c = 0; c < channel; ++c) { for (index_t c = 0; c < channel; ++c) {
const T *in0_ptr = input0 + ((b * channel) + c) * image_size; const T *in0_ptr = input0 + ((b * channel) + c) * image_size;
...@@ -732,7 +732,7 @@ inline void TensorEltwisePerChannel(const EltwiseType type, ...@@ -732,7 +732,7 @@ inline void TensorEltwisePerChannel(const EltwiseType type,
break; break;
case POW: case POW:
if (!swapped) { if (!swapped) {
#pragma omp parallel for collapse(2) #pragma omp parallel for collapse(2) schedule(runtime)
for (index_t b = 0; b < batch0; ++b) { for (index_t b = 0; b < batch0; ++b) {
for (index_t c = 0; c < channel; ++c) { for (index_t c = 0; c < channel; ++c) {
const T *in0_ptr = input0 + ((b * channel) + c) * image_size; const T *in0_ptr = input0 + ((b * channel) + c) * image_size;
...@@ -744,7 +744,7 @@ inline void TensorEltwisePerChannel(const EltwiseType type, ...@@ -744,7 +744,7 @@ inline void TensorEltwisePerChannel(const EltwiseType type,
} }
} }
} else { } else {
#pragma omp parallel for collapse(2) #pragma omp parallel for collapse(2) schedule(runtime)
for (index_t b = 0; b < batch0; ++b) { for (index_t b = 0; b < batch0; ++b) {
for (index_t c = 0; c < channel; ++c) { for (index_t c = 0; c < channel; ++c) {
const T *in0_ptr = input0 + ((b * channel) + c) * image_size; const T *in0_ptr = input0 + ((b * channel) + c) * image_size;
...@@ -758,19 +758,19 @@ inline void TensorEltwisePerChannel(const EltwiseType type, ...@@ -758,19 +758,19 @@ inline void TensorEltwisePerChannel(const EltwiseType type,
} }
break; break;
case NEG: case NEG:
#pragma omp parallel for #pragma omp parallel for schedule(runtime)
for (index_t i = 0; i < batch0 * channel * image_size; ++i) { for (index_t i = 0; i < batch0 * channel * image_size; ++i) {
output[i] = -input0[i]; output[i] = -input0[i];
} }
break; break;
case ABS: case ABS:
#pragma omp parallel for #pragma omp parallel for schedule(runtime)
for (index_t i = 0; i < batch0 * channel * image_size; ++i) { for (index_t i = 0; i < batch0 * channel * image_size; ++i) {
output[i] = std::fabs(input0[i]); output[i] = std::fabs(input0[i]);
} }
break; break;
case EQUAL: case EQUAL:
#pragma omp parallel for collapse(2) #pragma omp parallel for collapse(2) schedule(runtime)
for (index_t b = 0; b < batch0; ++b) { for (index_t b = 0; b < batch0; ++b) {
for (index_t c = 0; c < channel; ++c) { for (index_t c = 0; c < channel; ++c) {
const T *in0_ptr = input0 + ((b * channel) + c) * image_size; const T *in0_ptr = input0 + ((b * channel) + c) * image_size;
...@@ -991,7 +991,7 @@ class EltwiseOp<DeviceType::CPU, uint8_t> : public Operation { ...@@ -991,7 +991,7 @@ class EltwiseOp<DeviceType::CPU, uint8_t> : public Operation {
index_t handled_output_size = 0; index_t handled_output_size = 0;
#ifdef MACE_ENABLE_NEON #ifdef MACE_ENABLE_NEON
#pragma omp parallel for #pragma omp parallel for schedule(runtime)
for (index_t i = handled_output_size; i <= output->size() - 8; i += 8) { for (index_t i = handled_output_size; i <= output->size() - 8; i += 8) {
const auto input0_val = vld1_u8(input0_ptr + i); const auto input0_val = vld1_u8(input0_ptr + i);
const auto input1_val = vld1_u8(input1_ptr + i); const auto input1_val = vld1_u8(input1_ptr + i);
...@@ -1037,7 +1037,7 @@ class EltwiseOp<DeviceType::CPU, uint8_t> : public Operation { ...@@ -1037,7 +1037,7 @@ class EltwiseOp<DeviceType::CPU, uint8_t> : public Operation {
} }
handled_output_size = output->size() - output->size() % 8; handled_output_size = output->size() - output->size() % 8;
#endif // NEON #endif // NEON
#pragma omp parallel for #pragma omp parallel for schedule(runtime)
for (index_t i = handled_output_size; i < output->size(); ++i) { for (index_t i = handled_output_size; i < output->size(); ++i) {
const int32_t offset_input0 = input0_ptr[i] - input0->zero_point(); const int32_t offset_input0 = input0_ptr[i] - input0->zero_point();
const int32_t offset_input1 = input1_ptr[i] - input1->zero_point(); const int32_t offset_input1 = input1_ptr[i] - input1->zero_point();
......
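Every hunk in this commit makes the same mechanical change: each `#pragma omp parallel for` gains a `schedule(runtime)` clause. With `schedule(runtime)`, the loop schedule is no longer hard-coded at compile time; it is taken from the `OMP_SCHEDULE` environment variable or a prior `omp_set_schedule()` call, which is how the guided scheduler named in the commit title can be enabled for all of these loops at once. A minimal standalone sketch of the mechanism (illustrative, not part of this diff):

```cpp
#include <omp.h>
#include <cstdio>

int main() {
  // Equivalent to launching the process with OMP_SCHEDULE="guided,64".
  omp_set_schedule(omp_sched_guided, 64);  // schedule kind, chunk size

#pragma omp parallel for schedule(runtime)
  for (int i = 0; i < 1000000; ++i) {
    // Iterations are handed out in progressively smaller chunks under the
    // guided policy, balancing load when iteration costs are uneven.
  }

  omp_sched_t kind;
  int chunk;
  omp_get_schedule(&kind, &chunk);
  std::printf("kind=%d chunk=%d\n", static_cast<int>(kind), chunk);
  return 0;
}
```

Guided scheduling starts with large chunks and shrinks them as the loop drains, which keeps threads busy on uneven work while avoiding the per-iteration overhead of a fully `dynamic` schedule.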
@@ -62,7 +62,7 @@ class GatherOp : public Operation {
 params->shape().end(), 1, std::multiplies<index_t>());
 index_t index_size = indices->size();
-#pragma omp parallel for collapse(2)
+#pragma omp parallel for collapse(2) schedule(runtime)
 for (index_t l = 0; l < lhs_size; ++l) {
 for (index_t idx = 0; idx < index_size; ++idx) {
 MACE_ASSERT(indices_data[idx] < axis_dim_size, "idx out of bound: ",
...
@@ -53,7 +53,7 @@ class LocalResponseNormOp<DeviceType::CPU, float> : public Operation {
 index_t image_size = height * width;
 index_t batch_size = channels * image_size;
-#pragma omp parallel for collapse(2)
+#pragma omp parallel for collapse(2) schedule(runtime)
 for (index_t b = 0; b < batch; ++b) {
 for (index_t c = 0; c < channels; ++c) {
 const int begin_input_c = std::max(static_cast<index_t>(0),
...
@@ -133,7 +133,7 @@ class PoolingOp<DeviceType::CPU, float> : public PoolingOpBase {
 const index_t in_batch_size = in_shape[1] * in_image_size;
 const index_t out_batch_size = out_shape[1] * out_image_size;
-#pragma omp parallel for collapse(2)
+#pragma omp parallel for collapse(2) schedule(runtime)
 for (index_t b = 0; b < out_shape[0]; ++b) {
 for (index_t c = 0; c < out_shape[1]; ++c) {
 const index_t out_base = b * out_batch_size + c * out_image_size;
@@ -179,7 +179,7 @@ class PoolingOp<DeviceType::CPU, float> : public PoolingOpBase {
 const index_t in_batch_size = in_shape[1] * in_image_size;
 const index_t out_batch_size = out_shape[1] * out_image_size;
-#pragma omp parallel for collapse(2)
+#pragma omp parallel for collapse(2) schedule(runtime)
 for (index_t b = 0; b < out_shape[0]; ++b) {
 for (index_t c = 0; c < out_shape[1]; ++c) {
 const index_t out_base = b * out_batch_size + c * out_image_size;
@@ -301,7 +301,7 @@ class PoolingOp<DeviceType::CPU, uint8_t> : public PoolingOpBase {
 const int *stride_hw,
 const int *pad_hw,
 uint8_t *output) {
-#pragma omp parallel for collapse(3)
+#pragma omp parallel for collapse(3) schedule(runtime)
 for (index_t b = 0; b < out_shape[0]; ++b) {
 for (index_t h = 0; h < out_shape[1]; ++h) {
 for (index_t w = 0; w < out_shape[2]; ++w) {
@@ -358,7 +358,7 @@ class PoolingOp<DeviceType::CPU, uint8_t> : public PoolingOpBase {
 const int *stride_hw,
 const int *pad_hw,
 uint8_t *output) {
-#pragma omp parallel for collapse(3)
+#pragma omp parallel for collapse(3) schedule(runtime)
 for (index_t b = 0; b < out_shape[0]; ++b) {
 for (index_t h = 0; h < out_shape[1]; ++h) {
 for (index_t w = 0; w < out_shape[2]; ++w) {
...
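Note the `collapse(2)` and `collapse(3)` variants above: `collapse` fuses the perfectly nested loops into a single iteration space before the schedule is applied, so the runtime scheduler can balance work across batch and channel (or batch, height, and width) together rather than across the outermost loop alone. A runnable sketch, with placeholder sizes standing in for the tensor shapes used in this diff:

```cpp
#include <cstdio>

int main() {
  // Placeholder shapes; in the diff these come from the op's tensors.
  const int batch = 2, channels = 64, image_size = 1024;
  static float output[2 * 64 * 1024];

  // collapse(2) gives the scheduler batch * channels = 128 work items;
  // without it, only the 2 batch iterations could be distributed.
#pragma omp parallel for collapse(2) schedule(runtime)
  for (int b = 0; b < batch; ++b) {
    for (int c = 0; c < channels; ++c) {
      float *ptr = output + (b * channels + c) * image_size;
      for (int i = 0; i < image_size; ++i) {
        ptr[i] = 0.f;  // stands in for the real per-channel kernel
      }
    }
  }
  std::printf("filled %d channels\n", batch * channels);
  return 0;
}
```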
@@ -134,7 +134,7 @@ class ReduceMeanOp<DeviceType::CPU, T> : public ReduceMeanOpBase {
 }
 output_ptr[0] = sum / data_reshape_[0];
 } else {
-#pragma omp parallel for
+#pragma omp parallel for schedule(runtime)
 for (int i = 0; i < data_reshape_[0]; ++i) {
 output_ptr[i] = input_ptr[i];
 }
@@ -142,7 +142,7 @@ class ReduceMeanOp<DeviceType::CPU, T> : public ReduceMeanOpBase {
 break;
 case 2:
 if (reduce_first_axis_) {
-#pragma omp parallel for
+#pragma omp parallel for schedule(runtime)
 for (int i = 0; i < data_reshape_[1]; ++i) {
 for (int j = 0; j < data_reshape_[0]; ++j) {
 output_ptr[i] += input_ptr[j * data_reshape_[1] + i];
@@ -150,7 +150,7 @@ class ReduceMeanOp<DeviceType::CPU, T> : public ReduceMeanOpBase {
 output_ptr[i] /= data_reshape_[0];
 }
 } else {
-#pragma omp parallel for
+#pragma omp parallel for schedule(runtime)
 for (int i = 0; i < data_reshape_[0]; ++i) {
 for (int j = 0; j < data_reshape_[1]; ++j) {
 output_ptr[i] += input_ptr[i * data_reshape_[1] + j];
@@ -161,7 +161,7 @@ class ReduceMeanOp<DeviceType::CPU, T> : public ReduceMeanOpBase {
 break;
 case 3:
 if (reduce_first_axis_) {
-#pragma omp parallel for
+#pragma omp parallel for schedule(runtime)
 for (int i = 0; i < data_reshape_[1]; ++i) {
 for (int j = 0; j < data_reshape_[2]; ++j) {
 for (int k = 0; k < data_reshape_[0]; ++k) {
@@ -173,7 +173,7 @@ class ReduceMeanOp<DeviceType::CPU, T> : public ReduceMeanOpBase {
 output_ptr[i] /= (data_reshape_[0] * data_reshape_[2]);
 }
 } else {
-#pragma omp parallel for collapse(2)
+#pragma omp parallel for collapse(2) schedule(runtime)
 for (int i = 0; i < data_reshape_[0]; ++i) {
 for (int j = 0; j < data_reshape_[2]; ++j) {
 for (int k = 0; k < data_reshape_[1]; ++k) {
@@ -188,7 +188,7 @@ class ReduceMeanOp<DeviceType::CPU, T> : public ReduceMeanOpBase {
 break;
 case 4:
 if (reduce_first_axis_) {
-#pragma omp parallel for collapse(2)
+#pragma omp parallel for collapse(2) schedule(runtime)
 for (int i = 0; i < data_reshape_[1]; ++i) {
 for (int j = 0; j < data_reshape_[3]; ++j) {
 for (int k = 0; k < data_reshape_[2]; ++k) {
@@ -203,7 +203,7 @@ class ReduceMeanOp<DeviceType::CPU, T> : public ReduceMeanOpBase {
 }
 }
 } else {
-#pragma omp parallel for collapse(2)
+#pragma omp parallel for collapse(2) schedule(runtime)
 for (int i = 0; i < data_reshape_[0]; ++i) {
 for (int j = 0; j < data_reshape_[2]; ++j) {
 for (int k = 0; k < data_reshape_[1]; ++k) {
...
@@ -85,7 +85,7 @@ inline void ResizeImage(const float *images,
 const float height_scale,
 const float width_scale,
 float *output) {
-#pragma omp parallel for collapse(2)
+#pragma omp parallel for collapse(2) schedule(runtime)
 for (index_t b = 0; b < batch_size; ++b) {
 for (index_t y = 0; y < out_height; ++y) {
 std::vector<float> y_weights;
...
@@ -95,7 +95,7 @@ inline void ResizeImageNCHW(const T *images,
 T *output) {
 const CachedInterpolation *xs = xs_vec.data();
-#pragma omp parallel for collapse(2)
+#pragma omp parallel for collapse(2) schedule(runtime)
 for (index_t b = 0; b < batch_size; ++b) {
 for (index_t c = 0; c < channels; ++c) {
 const T
@@ -141,7 +141,7 @@ inline void ResizeImageNHWC(const T *images,
 for (index_t b = 0; b < batch_size; ++b) {
 const T *input_base = images + b * channels * in_height * in_width;
 T *output_base = output + b * channels * out_height * out_width;
-#pragma omp parallel for
+#pragma omp parallel for schedule(runtime)
 for (index_t y = 0; y < out_height; ++y) {
 const T
 *y_lower_input_ptr = input_base + ys[y].lower * in_width * channels;
...
@@ -252,7 +252,7 @@ void SGemm::RunInternal(const PackedBlock &lhs,
 }
 if (batch >= MaceOpenMPThreadCount) {
-#pragma omp parallel for
+#pragma omp parallel for schedule(runtime)
 MACE_SGEMM_RUN_PER_BATCH
 } else {
 MACE_SGEMM_RUN_PER_BATCH
@@ -279,7 +279,7 @@ void SGemm::RunPerBatch(const float *lhs_data,
 // as possible to cache, by tiling lhs by height and rhs by width.
 // w: 4
-#pragma omp parallel for
+#pragma omp parallel for schedule(runtime)
 for (index_t bw = 0; bw < block_w; ++bw) {
 index_t remain_h = height;
 index_t block_h = 0;
@@ -702,7 +702,7 @@ void SGemm::RunPerBatch(const float *lhs_data,
 rhs_data += (width - remain_w) * depth;
 // w: 1
-#pragma omp parallel for
+#pragma omp parallel for schedule(runtime)
 for (index_t bw = 0; bw < remain_w; ++bw) {
 index_t remain_h = height;
@@ -923,7 +923,7 @@ void SGemm::Pack(const MatrixMap<const float> &src,
 PackPerBatch(src, order, b, packed_data + b * height * width); \
 }
 if (src.batch() >= MaceOpenMPThreadCount) {
-#pragma omp parallel for
+#pragma omp parallel for schedule(runtime)
 MACE_SGEMM_PACK_PER_BATCH
 } else {
 MACE_SGEMM_PACK_PER_BATCH
@@ -945,7 +945,7 @@ void SGemm::UnPack(const PackedBlock &packed_result,
 }
 if (matrix_map->batch() >= MaceOpenMPThreadCount) {
-#pragma omp parallel for
+#pragma omp parallel for schedule(runtime)
 MACE_SGEMM_UNPACK_PER_BATCH
 } else {
 MACE_SGEMM_UNPACK_PER_BATCH
@@ -968,7 +968,7 @@ void SGemm::PackPerBatch(const MatrixMap<const float> &src,
 index_t h = 0;
 #if defined(MACE_ENABLE_NEON)
 #if defined(__aarch64__)
-#pragma omp parallel for
+#pragma omp parallel for schedule(runtime)
 for (index_t ih = h; ih <= height - 8; ih += 8) {
 const float *src_data_ptr = src_data + ih * width;
 float *packed_data_ptr = packed_data + ih * width;
@@ -989,7 +989,7 @@ void SGemm::PackPerBatch(const MatrixMap<const float> &src,
 }
 h += (height - h) / 8 * 8;
 #endif
-#pragma omp parallel for
+#pragma omp parallel for schedule(runtime)
 for (index_t ih = h; ih <= height - 4; ih += 4) {
 const float *src_data_ptr = src_data + ih * width;
 float *packed_data_ptr = packed_data + ih * width;
@@ -1005,7 +1005,7 @@ void SGemm::PackPerBatch(const MatrixMap<const float> &src,
 }
 h += (height - h) / 4 * 4;
 #endif
-#pragma omp parallel for
+#pragma omp parallel for schedule(runtime)
 for (index_t ih = h; ih < height; ++ih) {
 std::copy_n(src_data + ih * width, width, packed_data + ih * width);
 }
@@ -1015,7 +1015,7 @@ void SGemm::PackPerBatch(const MatrixMap<const float> &src,
 index_t h = 0;
 #if defined(MACE_ENABLE_NEON)
 #if defined(__aarch64__)
-#pragma omp parallel for
+#pragma omp parallel for schedule(runtime)
 for (index_t ih = h; ih <= height - 8; ih += 8) {
 const float *src_data_ptr = src_data + ih;
 float *packed_data_ptr = packed_data + ih * width;
@@ -1030,7 +1030,7 @@ void SGemm::PackPerBatch(const MatrixMap<const float> &src,
 }
 h += (height - h) / 8 * 8;
 #endif
-#pragma omp parallel for
+#pragma omp parallel for schedule(runtime)
 for (index_t ih = h; ih <= height - 4; ih += 4) {
 const float *src_data_ptr = src_data + ih;
 float *packed_data_ptr = packed_data + ih * width;
@@ -1043,7 +1043,7 @@ void SGemm::PackPerBatch(const MatrixMap<const float> &src,
 }
 h += (height - h) / 4 * 4;
 #endif
-#pragma omp parallel for
+#pragma omp parallel for schedule(runtime)
 for (index_t ih = h; ih < height; ++ih) {
 const float *src_data_ptr = src_data + ih;
 float *packed_data_ptr = packed_data + ih * width;
@@ -1056,7 +1056,7 @@ void SGemm::PackPerBatch(const MatrixMap<const float> &src,
 // This is for packing no-transpose rhs.
 index_t w = 0;
 #if defined(MACE_ENABLE_NEON)
-#pragma omp parallel for
+#pragma omp parallel for schedule(runtime)
 for (index_t iw = w; iw <= width - 4; iw += 4) {
 const float *src_data_ptr = src_data + iw;
 float *packed_data_ptr = packed_data + iw * height;
@@ -1069,7 +1069,7 @@ void SGemm::PackPerBatch(const MatrixMap<const float> &src,
 }
 w += (width - w) / 4 * 4;
 #endif
-#pragma omp parallel for
+#pragma omp parallel for schedule(runtime)
 for (index_t iw = w; iw < width; ++iw) {
 const float *src_data_ptr = src_data + iw;
 float *packed_data_ptr = packed_data + iw * height;
@@ -1082,7 +1082,7 @@ void SGemm::PackPerBatch(const MatrixMap<const float> &src,
 // This is for packing transpose-needed rhs.
 index_t w = 0;
 #if defined(MACE_ENABLE_NEON)
-#pragma omp parallel for
+#pragma omp parallel for schedule(runtime)
 for (index_t iw = w; iw <= width - 4; iw += 4) {
 const float *src_data_ptr = src_data + iw * height;
 float *packed_data_ptr = packed_data + iw * height;
@@ -1098,7 +1098,7 @@ void SGemm::PackPerBatch(const MatrixMap<const float> &src,
 }
 w += (width - w) / 4 * 4;
 #endif
-#pragma omp parallel for
+#pragma omp parallel for schedule(runtime)
 for (index_t iw = w; iw < width; ++iw) {
 std::copy_n(src_data + iw * height, height, packed_data + iw * height);
 }
@@ -1118,7 +1118,7 @@ void SGemm::UnPackPerBatch(const float *packed_data,
 // This is for non-transposed result
 index_t w = 0;
 #if defined(MACE_ENABLE_NEON)
-#pragma omp parallel for
+#pragma omp parallel for schedule(runtime)
 for (index_t iw = w; iw <= width - 4; iw += 4) {
 const float *packed_data_ptr = packed_data + iw * height;
 float *unpacked_data_ptr = unpacked_data + iw;
@@ -1131,7 +1131,7 @@ void SGemm::UnPackPerBatch(const float *packed_data,
 }
 w += (width - w) / 4 * 4;
 #endif
-#pragma omp parallel for
+#pragma omp parallel for schedule(runtime)
 for (index_t iw = w; iw < width; ++iw) {
 const float *packed_data_ptr = packed_data + iw * height;
 float *unpacked_data_ptr = unpacked_data + iw;
@@ -1143,7 +1143,7 @@ void SGemm::UnPackPerBatch(const float *packed_data,
 // This is for transposed result
 index_t w = 0;
 #if defined(MACE_ENABLE_NEON)
-#pragma omp parallel for
+#pragma omp parallel for schedule(runtime)
 for (index_t iw = w; iw <= width - 4; iw += 4) {
 const float *packed_data_ptr = packed_data + iw * height;
 float *unpacked_data_ptr = unpacked_data + iw * height;
@@ -1159,7 +1159,7 @@ void SGemm::UnPackPerBatch(const float *packed_data,
 }
 w += (width - w) / 4 * 4;
 #endif
-#pragma omp parallel for
+#pragma omp parallel for schedule(runtime)
 for (index_t iw = w; iw < width; ++iw) {
 std::copy_n(
 packed_data + iw * height, height, unpacked_data + iw * height);
...
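The SGemm hunks also show the guard `batch >= MaceOpenMPThreadCount` (the global thread count seen at the top of this commit): the batch loop is only worth parallelizing when there are at least as many batches as OpenMP threads; otherwise the batches run serially and the per-batch tiling loops use the thread pool instead, avoiding nested parallel regions. A simplified sketch of that dispatch, with `RunPerBatch` and the argument list as stand-ins rather than MACE's real signatures:

```cpp
#include <vector>

// Stand-in for the per-batch kernel; in MACE its inner loops carry their
// own '#pragma omp parallel for' directives.
static void RunPerBatch(std::vector<float> *out, int b) {
  (*out)[b] = static_cast<float>(b);
}

void RunInternal(std::vector<float> *out, int batch, int thread_count) {
  if (batch >= thread_count) {
    // Enough batches to keep every thread busy on whole batches.
#pragma omp parallel for schedule(runtime)
    for (int b = 0; b < batch; ++b) {
      RunPerBatch(out, b);
    }
  } else {
    // Too few batches; let the inner loops parallelize instead.
    for (int b = 0; b < batch; ++b) {
      RunPerBatch(out, b);
    }
  }
}
```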
@@ -59,7 +59,7 @@ class SoftmaxOp<DeviceType::CPU, float> : public Operation {
 const index_t batch_size = class_count * class_size;
 for (index_t b = 0; b < batch; ++b) {
-#pragma omp parallel for
+#pragma omp parallel for schedule(runtime)
 for (index_t k = 0; k < class_size; ++k) {
 const float *input_ptr = input_data + b * batch_size + k;
 float *output_ptr = output_data + b * batch_size + k;
@@ -94,7 +94,7 @@ class SoftmaxOp<DeviceType::CPU, float> : public Operation {
 } else if (input->dim_size() == 2) {  // normal 2d softmax
 const index_t class_size = input->dim(0);
 const index_t class_count = input->dim(1);
-#pragma omp parallel for
+#pragma omp parallel for schedule(runtime)
 for (index_t k = 0; k < class_size; ++k) {
 const float *input_ptr = input_data + k * class_count;
 float *output_ptr = output_data + k * class_count;
@@ -172,7 +172,7 @@ class SoftmaxOp<DeviceType::CPU, uint8_t> : public Operation {
 // If depth is short, do it using float32. Float computation should not
 // be here, but as long as it is on CPU, it is fine.
 if (depth < 32) {
-#pragma omp parallel for
+#pragma omp parallel for schedule(runtime)
 for (index_t b = 0; b < batch; ++b) {
 const uint8_t *input_ptr = input_data + b * depth;
 uint8_t *output_ptr = output_data + b * depth;
@@ -201,7 +201,7 @@ class SoftmaxOp<DeviceType::CPU, uint8_t> : public Operation {
 (1ll << 31) - 1.0));
 int32_t input_delta_limit = -((1ll << 31) - 1) / scale_q;
-#pragma omp parallel for
+#pragma omp parallel for schedule(runtime)
 for (index_t b = 0; b < batch; ++b) {
 const uint8_t *input_ptr = input_data + b * depth;
 uint8_t *output_ptr = output_data + b * depth;
...
@@ -129,7 +129,7 @@ class SpaceToBatchNDOp<DeviceType::CPU, float> : public SpaceToBatchOpBase {
 std::max(static_cast<index_t>(1), 8 * 1024 / block_shape_w / in_width);
 // make channel outter loop so we can make best use of cache
-#pragma omp parallel for collapse(3)
+#pragma omp parallel for collapse(3) schedule(runtime)
 for (index_t c = 0; c < channels; ++c) {
 for (index_t block_h = 0; block_h < out_height;
 block_h += block_h_size) {
@@ -238,7 +238,7 @@ class SpaceToBatchNDOp<DeviceType::CPU, uint8_t> : public SpaceToBatchOpBase {
 index_t out_width = batch_tensor->dim(2);
 index_t channels = batch_tensor->dim(3);
-#pragma omp parallel for
+#pragma omp parallel for schedule(runtime)
 for (index_t b = 0; b < out_batches; ++b) {
 const index_t in_b = b % in_batches;
 const index_t tile_index = b / in_batches;
...
@@ -64,7 +64,7 @@ class SqrDiffMeanOp : public Operation {
 const index_t img_size = input0->dim(2) * input0->dim(3);
 const index_t bc = input0->dim(0) * input0->dim(1);
-#pragma omp parallel for
+#pragma omp parallel for schedule(runtime)
 for (int i = 0; i < bc; ++i) {
 for (int j = 0; j < img_size; ++j) {
 T diff = input_ptr0[i * img_size + j] - input_ptr1[i];
...
@@ -48,10 +48,28 @@ enum GPUPriorityHint {
 PRIORITY_HIGH = 3
 };
 
+// AFFINITY_NONE: initiate 'num_threads_hint' threads with no affinity
+// scheduled.
+// If 'num_threads_hint' is -1 or greater than the number of available
+// cores, it will be reset to the number of available cores.
+// AFFINITY_BIG_ONLY: all available big cores are used, and the number
+// of threads equals the number of available big cores.
+// AFFINITY_LITTLE_ONLY: all available little cores are used, and the
+// number of threads equals the number of available little cores.
+// AFFINITY_HIGH_PERFORMANCE: initiate 'num_threads_hint' threads on the
+// cores with the top num_threads_hint frequencies.
+// If 'num_threads_hint' is -1 or greater than the number of available
+// cores, it will be reset to the number of available cores.
+// AFFINITY_POWER_SAVE: initiate 'num_threads_hint' threads on the cores
+// with the bottom num_threads_hint frequencies.
+// If 'num_threads_hint' is -1 or greater than the number of available
+// cores, it will be reset to the number of available cores.
 enum CPUAffinityPolicy {
 AFFINITY_NONE = 0,
 AFFINITY_BIG_ONLY = 1,
 AFFINITY_LITTLE_ONLY = 2,
+AFFINITY_HIGH_PERFORMANCE = 3,
+AFFINITY_POWER_SAVE = 4,
 };
 
 struct CallStats {
...
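The two new policies extend the existing big/little split: instead of taking every big or every little core, the runtime sorts cores by maximum frequency and pins `num_threads_hint` threads to the fastest (AFFINITY_HIGH_PERFORMANCE) or slowest (AFFINITY_POWER_SAVE) ones. A hypothetical caller-side sketch; `SetOpenMPThreadPolicy` is an assumed entry point here, so check the mace_runtime.h of your MACE version for the exact name and signature:

```cpp
#include "mace/public/mace.h"
#include "mace/public/mace_runtime.h"  // assumed header for the setter

void ConfigureThreads() {
  // Ask for up to 4 threads pinned to the highest-frequency cores; a hint
  // of -1 (or one larger than the core count) falls back to all cores.
  mace::MaceStatus status = mace::SetOpenMPThreadPolicy(
      4, mace::CPUAffinityPolicy::AFFINITY_HIGH_PERFORMANCE);
  if (status != mace::MaceStatus::MACE_SUCCESS) {
    // Pinning can fail (e.g. sched_setaffinity denied); run unpinned.
    mace::SetOpenMPThreadPolicy(-1, mace::CPUAffinityPolicy::AFFINITY_NONE);
  }
}
```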
@@ -99,7 +99,7 @@ inline void QuantizeWithScaleAndZeropoint(const float *input,
 int32_t zero_point,
 T *output) {
 float recip_scale = 1 / scale;
-#pragma omp parallel for
+#pragma omp parallel for schedule(runtime)
 for (int i = 0; i < size; ++i) {
 output[i] = Saturate<T>(roundf(zero_point + recip_scale * input[i]));
 }
@@ -128,7 +128,7 @@ inline void Dequantize(const T *input,
 const float scale,
 const int32_t zero_point,
 float *output) {
-#pragma omp parallel for
+#pragma omp parallel for schedule(runtime)
 for (int i = 0; i < size; ++i) {
 output[i] = scale * (input[i] - zero_point);
 }
...
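These last two hunks touch the affine quantization helpers: quantization computes `output[i] = Saturate<T>(roundf(zero_point + input[i] / scale))` (with the division precomputed as `recip_scale`), and dequantization recovers `scale * (input[i] - zero_point)`. As a worked example, with scale = 0.1 and zero_point = 128, an input of 1.0 quantizes to round(128 + 10 * 1.0) = 138 and dequantizes back to 0.1 * (138 - 128) = 1.0 exactly; an input of -0.27 quantizes to round(125.3) = 125 and comes back as -0.3, a rounding error of 0.03, bounded by scale / 2 for values inside the representable range.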