提交 1e91554e 编写于 作者: 叶剑武

Merge branch 'opt_code' into 'master'

Remove OpenMP, fix the scratch bug in micro, and optimize some code

See merge request deep-computing/mace!1270
......@@ -364,12 +364,12 @@ Tuning for specific SoC's GPU
.. note::
You should plug in device(s) with the specific SoC(s).
You must specify the ``target_socs`` in your YAML file and plug in device(s) with the specific SoC(s).
.. code-block:: sh
python tools/converter.py run --config=/path/to/model_deployment_file.yml --validate
python tools/converter.py run --config=/path/to/model_deployment_file.yml
The command will generate two files in `build/${library_name}/opencl`, as shown in the following directory tree.
......@@ -487,7 +487,7 @@ the detailed information is in :doc:`benchmark`.
- default
- commands
- explanation
* - --omp_num_threads
* - --num_threads
- int
- -1
- ``run``
......
......@@ -180,4 +180,4 @@ Please refer to \ ``mace/tools/mace_run.cc``\ for full usage. The following list
// 5. Run the model
MaceStatus status = engine.Run(inputs, &outputs);
More details are in :doc:`advanced_usage`.
More details are in :doc:`advanced_usage_cmake`.
......@@ -54,7 +54,7 @@ cp -rf include/mace $INCLUDE_DIR
cp -rf build/mobilenet/include/mace/public/*.h $INCLUDE_DIR/mace/public/
cp -rf build/mobilenet/model $LIBRARY_DIR
bazel build --config android --config optimization $BAZEL_LIBMACE_TARGET --define neon=true --define openmp=true --define opencl=true --define quantize=true --cpu=$TARGET_ABI
bazel build --config android --config optimization $BAZEL_LIBMACE_TARGET --define neon=true --define opencl=true --define quantize=true --cpu=$TARGET_ABI
cp -rf $BAZEL_GEN_LIBMACE_PATH $LIBMACE_DIR
if [ $MACE_LINK_TYPE == "dynamic" ]; then
......
......@@ -16,7 +16,7 @@ android {
externalNativeBuild {
cmake {
cppFlags "-std=c++11 -fopenmp"
cppFlags "-std=c++11"
abiFilters "arm64-v8a"
}
}
......
......@@ -95,7 +95,7 @@ Java_com_xiaomi_mace_JniMaceUtils_maceMobilenetCreateGPUContext(
JNIEXPORT jint JNICALL
Java_com_xiaomi_mace_JniMaceUtils_maceMobilenetCreateEngine(
JNIEnv *env, jclass thisObj, jint omp_num_threads, jint cpu_affinity_policy,
JNIEnv *env, jclass thisObj, jint num_threads, jint cpu_affinity_policy,
jint gpu_perf_hint, jint gpu_priority_hint,
jstring model_name_str, jstring device) {
MaceContext &mace_context = GetMaceContext();
......@@ -110,14 +110,13 @@ Java_com_xiaomi_mace_JniMaceUtils_maceMobilenetCreateEngine(
mace::MaceStatus status;
mace::MaceEngineConfig config(mace_context.device_type);
status = config.SetCPUThreadPolicy(
omp_num_threads,
num_threads,
static_cast<mace::CPUAffinityPolicy>(cpu_affinity_policy));
if (status != mace::MaceStatus::MACE_SUCCESS) {
__android_log_print(ANDROID_LOG_ERROR,
"image_classify attrs",
"openmp result: %s, threads: %d, cpu: %d",
status.information().c_str(), omp_num_threads,
cpu_affinity_policy);
"threads: %d, cpu: %d",
num_threads, cpu_affinity_policy);
}
if (mace_context.device_type == mace::DeviceType::GPU) {
config.SetGPUContext(mace_context.gpu_context);
......
......@@ -316,8 +316,6 @@ class MACE_API MaceEngineConfig {
/// (AFFINITY_NONE) cores according to the policy. The number of threads will
/// also be truncated to the number of corresponding cores when
/// num_threads_hint is larger than that.
/// The OpenMP threads will be bind to (via sched_setaffinity) big cores
/// (AFFINITY_BIG_ONLY) and little cores (AFFINITY_LITTLE_ONLY).
///
/// \param num_threads_hint it is only a hint.
/// \param policy one of CPUAffinityPolicy
......
......@@ -108,14 +108,6 @@ config_setting(
visibility = ["//visibility:public"],
)
config_setting(
name = "openmp_enabled",
define_values = {
"openmp": "true",
},
visibility = ["//visibility:public"],
)
config_setting(
name = "opencl_enabled",
define_values = {
......
......@@ -17,7 +17,6 @@ load(
"if_not_apu_enabled",
"if_not_hexagon_enabled",
"if_opencl_enabled",
"if_openmp_enabled",
"if_quantize_enabled",
"if_rpcmem_enabled",
)
......@@ -81,10 +80,7 @@ cc_library(
"-Werror",
"-Wextra",
"-Wno-missing-field-initializers",
] + if_openmp_enabled([
"-fopenmp",
"-DMACE_ENABLE_OPENMP",
]) + if_opencl_enabled([
] + if_opencl_enabled([
"-DMACE_ENABLE_OPENCL",
]) + if_quantize_enabled([
"-DMACE_ENABLE_QUANTIZE",
......
......@@ -14,10 +14,6 @@
#include "mace/core/runtime/cpu/cpu_runtime.h"
#ifdef MACE_ENABLE_OPENMP
#include <omp.h>
#endif
#include <algorithm>
#include <cerrno>
#include <cmath>
......@@ -35,62 +31,7 @@
namespace mace {
int MaceOpenMPThreadCount = 1;
enum SchedulePolicy {
SCHED_STATIC,
SCHED_GUIDED,
};
namespace {
MaceStatus SetOpenMPThreadsAndAffinityCPUs(int omp_num_threads,
const std::vector<size_t> &cpu_ids,
SchedulePolicy schedule_policy) {
MaceOpenMPThreadCount = omp_num_threads;
SchedSetAffinity(cpu_ids);
#ifdef MACE_ENABLE_OPENMP
VLOG(1) << "Set OpenMP threads number: " << omp_num_threads
<< ", CPU core IDs: " << MakeString(cpu_ids);
if (schedule_policy == SCHED_GUIDED) {
omp_set_schedule(omp_sched_guided, 1);
} else if (schedule_policy == SCHED_STATIC) {
omp_set_schedule(omp_sched_static, 0);
} else {
LOG(WARNING) << "Unknown schedule policy: " << schedule_policy;
}
omp_set_num_threads(omp_num_threads);
#else
MACE_UNUSED(omp_num_threads);
MACE_UNUSED(schedule_policy);
VLOG(2) << "Set OpenMP threads number failed: OpenMP not enabled.";
#endif
#ifdef MACE_ENABLE_OPENMP
std::vector<MaceStatus> status(omp_num_threads,
MaceStatus::MACE_INVALID_ARGS);
#pragma omp parallel for
for (int i = 0; i < omp_num_threads; ++i) {
VLOG(1) << "Set affinity for OpenMP thread " << omp_get_thread_num()
<< "/" << omp_get_num_threads();
status[i] = SchedSetAffinity(cpu_ids);
}
for (int i = 0; i < omp_num_threads; ++i) {
if (status[i] != MaceStatus::MACE_SUCCESS)
return MaceStatus::MACE_INVALID_ARGS;
}
return MaceStatus::MACE_SUCCESS;
#else
MaceStatus status = SchedSetAffinity(cpu_ids);
VLOG(1) << "Set affinity without OpenMP: " << MakeString(cpu_ids);
return status;
#endif
}
} // namespace
MaceStatus CPURuntime::SetOpenMPThreadsAndAffinityPolicy(
MaceStatus CPURuntime::SetThreadsHintAndAffinityPolicy(
int num_threads_hint,
CPUAffinityPolicy policy,
void *gemm_context) {
......@@ -115,19 +56,8 @@ MaceStatus CPURuntime::SetOpenMPThreadsAndAffinityPolicy(
#else
MACE_UNUSED(gemm_context);
#endif // MACE_ENABLE_QUANTIZE
#ifdef MACE_ENABLE_OPENMP
omp_set_num_threads(num_threads_hint);
#else
VLOG(2) << "Set OpenMP threads number failed: OpenMP not enabled.";
#endif
return MaceStatus::MACE_SUCCESS;
}
SchedulePolicy sched_policy = SCHED_GUIDED;
float first_freq = cpu_max_freqs[cores_to_use[0]];
float last_freq = cpu_max_freqs[cores_to_use[cores_to_use.size() - 1]];
if (std::abs(first_freq - last_freq) < 1e-6) {
sched_policy = SCHED_STATIC;
return MaceStatus::MACE_SUCCESS;
}
#ifdef MACE_ENABLE_QUANTIZE
......@@ -137,9 +67,10 @@ MaceStatus CPURuntime::SetOpenMPThreadsAndAffinityPolicy(
}
#endif // MACE_ENABLE_QUANTIZE
return SetOpenMPThreadsAndAffinityCPUs(num_threads_hint,
cores_to_use,
sched_policy);
MaceStatus status = SchedSetAffinity(cores_to_use);
VLOG(1) << "Set affinity : " << MakeString(cores_to_use);
return status;
}
} // namespace mace
......
......@@ -29,8 +29,6 @@
namespace mace {
extern int MaceOpenMPThreadCount;
class CPURuntime {
public:
CPURuntime(const int num_threads,
......@@ -43,9 +41,9 @@ class CPURuntime {
#ifdef MACE_ENABLE_QUANTIZE
MACE_CHECK_NOTNULL(GetGemmlowpContext());
#endif // MACE_ENABLE_QUANTIZE
SetOpenMPThreadsAndAffinityPolicy(num_threads_,
policy_,
gemm_context_);
SetThreadsHintAndAffinityPolicy(num_threads_,
policy_,
gemm_context_);
}
#ifdef MACE_ENABLE_QUANTIZE
......@@ -78,8 +76,8 @@ class CPURuntime {
}
private:
MaceStatus SetOpenMPThreadsAndAffinityPolicy(
int omp_num_threads_hint,
MaceStatus SetThreadsHintAndAffinityPolicy(
int num_threads_hint,
CPUAffinityPolicy policy,
void *gemm_context);
......
......@@ -20,7 +20,6 @@ load(
"if_linux_base",
"if_neon_enabled",
"if_opencl_enabled",
"if_openmp_enabled",
"if_quantize_enabled",
"if_rpcmem_enabled",
)
......@@ -33,7 +32,7 @@ cc_library(
copts = [
"-Werror",
"-Wextra",
] + if_openmp_enabled(["-fopenmp"]) + if_neon_enabled([
] + if_neon_enabled([
"-DMACE_ENABLE_NEON",
]) + if_android_armv7([
"-mfpu=neon",
......@@ -70,9 +69,7 @@ cc_binary(
"-Wl,--version-script",
"$(location //mace/libmace:mace_version_script.lds)",
],
) + if_openmp_enabled([
"-fopenmp",
]),
),
linkshared = 1,
linkstatic = 0,
deps = [
......
# -*- Python -*-
def if_linux_base(a, default_value = []):
return select({
"//mace:linux_base": a,
"//conditions:default": default_value,
})
return select({
"//mace:linux_base": a,
"//conditions:default": default_value,
})
def if_android(a, default_value = []):
return select({
"//mace:android": a,
"//conditions:default": default_value,
})
return select({
"//mace:android": a,
"//conditions:default": default_value,
})
def if_linux(a, default_value = []):
return select({
"//mace:linux": a,
"//conditions:default": default_value,
})
return select({
"//mace:linux": a,
"//conditions:default": default_value,
})
def if_darwin(a, default_value = []):
return select({
"//mace:darwin": a,
"//conditions:default": default_value,
})
return select({
"//mace:darwin": a,
"//conditions:default": default_value,
})
def if_android_armv7(a):
return select({
"//mace:android_armv7": a,
"//conditions:default": [],
})
return select({
"//mace:android_armv7": a,
"//conditions:default": [],
})
def if_android_arm64(a):
return select({
"//mace:android_arm64": a,
"//conditions:default": [],
})
return select({
"//mace:android_arm64": a,
"//conditions:default": [],
})
def if_arm_linux_aarch64(a):
return select({
"//mace:arm_linux_aarch64": a,
"//conditions:default": [],
})
return select({
"//mace:arm_linux_aarch64": a,
"//conditions:default": [],
})
def if_arm_linux_armhf(a):
return select({
"//mace:arm_linux_armhf": a,
"//conditions:default": []
})
return select({
"//mace:arm_linux_armhf": a,
"//conditions:default": [],
})
def if_neon_enabled(a, default_value = []):
return select({
"//mace:neon_enabled": a,
"//conditions:default": default_value,
})
return select({
"//mace:neon_enabled": a,
"//conditions:default": default_value,
})
def if_hexagon_enabled(a):
return select({
"//mace:hexagon_enabled": a,
"//conditions:default": [],
})
return select({
"//mace:hexagon_enabled": a,
"//conditions:default": [],
})
def if_not_hexagon_enabled(a):
return select({
"//mace:hexagon_enabled": [],
"//conditions:default": a,
})
return select({
"//mace:hexagon_enabled": [],
"//conditions:default": a,
})
def if_hta_enabled(a):
return select({
"//mace:hta_enabled": a,
"//conditions:default": [],
})
return select({
"//mace:hta_enabled": a,
"//conditions:default": [],
})
def if_hexagon_or_hta_enabled(a):
return select({
"//mace:hexagon_enabled": a,
"//mace:hta_enabled": a,
"//conditions:default": [],
})
return select({
"//mace:hexagon_enabled": a,
"//mace:hta_enabled": a,
"//conditions:default": [],
})
def if_apu_enabled(a):
return select({
"//mace:apu_enabled": a,
"//conditions:default": [],
})
return select({
"//mace:apu_enabled": a,
"//conditions:default": [],
})
def if_not_apu_enabled(a):
return select({
"//mace:apu_enabled": [],
"//conditions:default": a,
})
def if_openmp_enabled(a):
return select({
"//mace:openmp_enabled": a,
"//conditions:default": [],
})
return select({
"//mace:apu_enabled": [],
"//conditions:default": a,
})
def if_opencl_enabled(a, default_value = []):
return select({
"//mace:opencl_enabled": a,
"//conditions:default": default_value,
})
return select({
"//mace:opencl_enabled": a,
"//conditions:default": default_value,
})
def if_quantize_enabled(a):
return select({
"//mace:quantize_enabled": a,
"//conditions:default": [],
})
return select({
"//mace:quantize_enabled": a,
"//conditions:default": [],
})
def if_bfloat16_enabled(a):
return select({
"//mace:bfloat16_enabled": a,
"//conditions:default": [],
})
return select({
"//mace:bfloat16_enabled": a,
"//conditions:default": [],
})
def if_rpcmem_enabled(a):
return select({
"//mace:rpcmem_enabled": a,
"//conditions:default": [],
})
return select({
"//mace:rpcmem_enabled": a,
"//conditions:default": [],
})
def mace_version_genrule():
native.genrule(
name = "mace_version_gen",
srcs = [str(Label("@local_version_config//:gen/version"))],
outs = ["version/version.cc"],
cmd = "cat $(SRCS) > $@;"
)
native.genrule(
name = "mace_version_gen",
srcs = [str(Label("@local_version_config//:gen/version"))],
outs = ["version/version.cc"],
cmd = "cat $(SRCS) > $@;",
)
def encrypt_opencl_kernel_genrule():
srcs = [
......
......@@ -14,7 +14,6 @@ load(
"if_hexagon_enabled",
"if_neon_enabled",
"if_opencl_enabled",
"if_openmp_enabled",
"if_quantize_enabled",
)
......@@ -36,9 +35,7 @@ cc_library(
"-Werror",
"-Wextra",
"-Wno-missing-field-initializers",
] + if_openmp_enabled([
"-fopenmp",
]) + if_neon_enabled([
] + if_neon_enabled([
"-DMACE_ENABLE_NEON",
]) + if_android_armv7([
"-mfpu=neon-fp16",
......@@ -77,9 +74,7 @@ cc_library(
"-Werror",
"-Wextra",
"-Wno-missing-field-initializers",
] + if_openmp_enabled([
"-fopenmp",
]) + if_neon_enabled([
] + if_neon_enabled([
"-DMACE_ENABLE_NEON",
]) + if_android_armv7([
"-mfpu=neon-fp16",
......@@ -134,9 +129,7 @@ cc_library(
"-Werror",
"-Wextra",
"-Wno-missing-field-initializers",
] + if_openmp_enabled([
"-fopenmp",
]) + if_neon_enabled([
] + if_neon_enabled([
"-DMACE_ENABLE_NEON",
]) + if_android_armv7([
"-mfpu=neon-fp16",
......@@ -176,9 +169,7 @@ cc_library(
"-Werror",
"-Wextra",
"-Wno-missing-field-initializers",
] + if_openmp_enabled([
"-fopenmp",
]) + if_neon_enabled([
] + if_neon_enabled([
"-DMACE_ENABLE_NEON",
]) + if_android_armv7([
"-mfpu=neon-fp16",
......@@ -221,9 +212,7 @@ cc_library(
"-Werror",
"-Wextra",
"-Wno-missing-field-initializers",
] + if_openmp_enabled([
"-fopenmp",
]) + if_neon_enabled([
] + if_neon_enabled([
"-DMACE_ENABLE_NEON",
]) + if_android_armv7([
"-mfpu=neon-fp16",
......@@ -263,9 +252,7 @@ cc_library(
"-Werror",
"-Wextra",
"-Wno-missing-field-initializers",
] + if_openmp_enabled([
"-fopenmp",
]) + if_neon_enabled([
] + if_neon_enabled([
"-DMACE_ENABLE_NEON",
]) + if_android_armv7([
"-mfpu=neon-fp16",
......
......@@ -21,8 +21,9 @@
#define MACE_ENABLE_FP16_NEON
#endif
#include "mace/core/ops/op_context.h"
#include "mace/core/types.h"
#include "mace/utils/thread_pool.h"
#if defined(MACE_ENABLE_NEON) && defined(__ANDROID__)
#include <arm_neon.h>
......@@ -36,9 +37,10 @@ namespace mace {
namespace ops {
template<typename INPUT_TYPE_LEFT,
typename INPUT_TYPE_RIGHT,
typename OUTPUT_TYPE>
void FP16Gemv(const INPUT_TYPE_LEFT *m_ptr,
typename INPUT_TYPE_RIGHT,
typename OUTPUT_TYPE>
void FP16Gemv(OpContext *context,
const INPUT_TYPE_LEFT *m_ptr,
const INPUT_TYPE_RIGHT *v_ptr,
const index_t height,
const index_t width,
......@@ -46,78 +48,83 @@ void FP16Gemv(const INPUT_TYPE_LEFT *m_ptr,
#if defined(MACE_ENABLE_FP16_NEON) && defined(__ANDROID__)
template<>
void FP16Gemv<float16_t, float, float>(const float16_t *m_ptr,
void FP16Gemv<float16_t, float, float>(OpContext *context,
const float16_t *m_ptr,
const float *v_ptr,
const index_t height,
const index_t width,
float *out_ptr) {
#pragma omp parallel for
for (index_t h = 0; h < height; ++h) {
const float16_t *m_ptr0 = m_ptr + h * width;
const float *v_ptr0 = v_ptr;
float *out_ptr0 = out_ptr + h;
float sum0 = 0;
float32x4_t vm0, vm1, vm2, vm3;
float32x4_t vv0, vv1, vv2, vv3;
float32x4_t vsum0 = vdupq_n_f32(0.f);
float32x4_t vsum1 = vdupq_n_f32(0.f);
float32x4_t vsum2 = vdupq_n_f32(0.f);
float32x4_t vsum3 = vdupq_n_f32(0.f);
index_t w;
for (w = 0; w + 15 < width; w += 16) {
vm0 = vcvt_f32_f16(vld1_f16(m_ptr0));
vv0 = vld1q_f32(v_ptr0);
vm1 = vcvt_f32_f16(vld1_f16(m_ptr0 + 4));
vv1 = vld1q_f32(v_ptr0 + 4);
vm2 = vcvt_f32_f16(vld1_f16(m_ptr0 + 8));
vv2 = vld1q_f32(v_ptr0 + 8);
vm3 = vcvt_f32_f16(vld1_f16(m_ptr0 + 12));
vv3 = vld1q_f32(v_ptr0 + 12);
vsum0 = vmlaq_f32(vsum0, vm0, vv0);
vsum1 = vmlaq_f32(vsum1, vm1, vv1);
vsum2 = vmlaq_f32(vsum2, vm2, vv2);
vsum3 = vmlaq_f32(vsum3, vm3, vv3);
m_ptr0 += 16;
v_ptr0 += 16;
}
for (; w + 7 < width; w += 8) {
vm0 = vcvt_f32_f16(vld1_f16(m_ptr0));
vv0 = vld1q_f32(v_ptr0);
vm1 = vcvt_f32_f16(vld1_f16(m_ptr0 + 4));
vv1 = vld1q_f32(v_ptr0 + 4);
vsum0 = vmlaq_f32(vsum0, vm0, vv0);
vsum1 = vmlaq_f32(vsum1, vm1, vv1);
m_ptr0 += 8;
v_ptr0 += 8;
}
for (; w + 3 < width; w += 4) {
vm0 = vcvt_f32_f16(vld1_f16(m_ptr0));
vv0 = vld1q_f32(v_ptr0);
vsum0 = vmlaq_f32(vsum0, vm0, vv0);
m_ptr0 += 4;
v_ptr0 += 4;
}
vsum0 += vsum1;
vsum2 += vsum3;
vsum0 += vsum2;
sum0 = vaddvq_f32(vsum0);
for (; w < width; ++w) {
sum0 += m_ptr0[0] * v_ptr0[0];
m_ptr0++;
v_ptr0++;
}
*out_ptr0++ = sum0;
}
utils::ThreadPool &thread_pool =
context->device()->cpu_runtime()->thread_pool();
thread_pool.Compute1D([=](index_t start0, index_t end0, index_t step0) {
for (index_t h = start0; h < end0; h += step0) {
const float16_t *m_ptr0 = m_ptr + h * width;
const float *v_ptr0 = v_ptr;
float *out_ptr0 = out_ptr + h;
float sum0 = 0;
float32x4_t vm0, vm1, vm2, vm3;
float32x4_t vv0, vv1, vv2, vv3;
float32x4_t vsum0 = vdupq_n_f32(0.f);
float32x4_t vsum1 = vdupq_n_f32(0.f);
float32x4_t vsum2 = vdupq_n_f32(0.f);
float32x4_t vsum3 = vdupq_n_f32(0.f);
index_t w;
for (w = 0; w + 15 < width; w += 16) {
vm0 = vcvt_f32_f16(vld1_f16(m_ptr0));
vv0 = vld1q_f32(v_ptr0);
vm1 = vcvt_f32_f16(vld1_f16(m_ptr0 + 4));
vv1 = vld1q_f32(v_ptr0 + 4);
vm2 = vcvt_f32_f16(vld1_f16(m_ptr0 + 8));
vv2 = vld1q_f32(v_ptr0 + 8);
vm3 = vcvt_f32_f16(vld1_f16(m_ptr0 + 12));
vv3 = vld1q_f32(v_ptr0 + 12);
vsum0 = vmlaq_f32(vsum0, vm0, vv0);
vsum1 = vmlaq_f32(vsum1, vm1, vv1);
vsum2 = vmlaq_f32(vsum2, vm2, vv2);
vsum3 = vmlaq_f32(vsum3, vm3, vv3);
m_ptr0 += 16;
v_ptr0 += 16;
}
for (; w + 7 < width; w += 8) {
vm0 = vcvt_f32_f16(vld1_f16(m_ptr0));
vv0 = vld1q_f32(v_ptr0);
vm1 = vcvt_f32_f16(vld1_f16(m_ptr0 + 4));
vv1 = vld1q_f32(v_ptr0 + 4);
vsum0 = vmlaq_f32(vsum0, vm0, vv0);
vsum1 = vmlaq_f32(vsum1, vm1, vv1);
m_ptr0 += 8;
v_ptr0 += 8;
}
for (; w + 3 < width; w += 4) {
vm0 = vcvt_f32_f16(vld1_f16(m_ptr0));
vv0 = vld1q_f32(v_ptr0);
vsum0 = vmlaq_f32(vsum0, vm0, vv0);
m_ptr0 += 4;
v_ptr0 += 4;
}
vsum0 += vsum1;
vsum2 += vsum3;
vsum0 += vsum2;
sum0 = vaddvq_f32(vsum0);
for (; w < width; ++w) {
sum0 += m_ptr0[0] * v_ptr0[0];
m_ptr0++;
v_ptr0++;
}
*out_ptr0++ = sum0;
}
}, 0, height, 1);
}
#endif // MACE_ENABLE_FP16_NEON && __ANDROID__
......
......@@ -20,7 +20,7 @@
#ifdef MACE_ENABLE_QUANTIZE
#include "mace/ops/arm/q8/quantization_util.h"
// We reuse TensorFlow Lite's optimized depthwiseconv_uint8 and parallelize it
// using OpenMP for MACE's quantized depthwise_conv2d.
// using thread pool for MACE's quantized depthwise_conv2d.
#include "tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_uint8.h"
#endif // MACE_ENABLE_QUANTIZE
......
......@@ -567,21 +567,15 @@ class MatMulOp<CPU, float16_t> : public MatMulOpBase {
B->dtype() == DT_FLOAT) {
auto *a_ptr_base = A->data<float16_t>();
auto *b_ptr_base = B->data<float>();
FP16Gemv(a_ptr_base,
b_ptr_base,
height,
K,
c_ptr_base);
FP16Gemv(context, a_ptr_base, b_ptr_base,
height, K, c_ptr_base);
return MaceStatus::MACE_SUCCESS;
} else if (height == 1 && transpose_b_ && A->dtype() == DT_FLOAT &&
B->dtype() == DT_FLOAT16) {
auto *b_ptr_base = B->data<float16_t>();
auto *a_ptr_base = A->data<float>();
FP16Gemv(b_ptr_base,
a_ptr_base,
width,
K,
c_ptr_base);
FP16Gemv(context, b_ptr_base, a_ptr_base,
width, K, c_ptr_base);
return MaceStatus::MACE_SUCCESS;
} else {
LOG(INFO) << "Matmul fp16 gemv args: " << height << " " << width << " "
......
......@@ -5,7 +5,6 @@ load(
"if_android",
"if_hexagon_enabled",
"if_opencl_enabled",
"if_openmp_enabled",
)
licenses(["notice"]) # Apache 2.0
......
......@@ -150,7 +150,7 @@ DEFINE_int32(restart_round, 1, "restart round");
DEFINE_int32(malloc_check_cycle, -1, "malloc debug check cycle, -1 to disable");
DEFINE_int32(gpu_perf_hint, 3, "0:DEFAULT/1:LOW/2:NORMAL/3:HIGH");
DEFINE_int32(gpu_priority_hint, 3, "0:DEFAULT/1:LOW/2:NORMAL/3:HIGH");
DEFINE_int32(omp_num_threads, -1, "num of openmp threads");
DEFINE_int32(num_threads, -1, "num of threads");
DEFINE_int32(cpu_affinity_policy, 1,
"0:AFFINITY_NONE/1:AFFINITY_BIG_ONLY/2:AFFINITY_LITTLE_ONLY");
DEFINE_bool(benchmark, false, "enable benchmark op");
......@@ -170,10 +170,10 @@ bool RunModel(const std::string &model_name,
MaceStatus status;
MaceEngineConfig config(device_type);
status = config.SetCPUThreadPolicy(
FLAGS_omp_num_threads,
FLAGS_num_threads,
static_cast<CPUAffinityPolicy >(FLAGS_cpu_affinity_policy));
if (status != MaceStatus::MACE_SUCCESS) {
LOG(WARNING) << "Set openmp or cpu affinity failed.";
LOG(WARNING) << "Set cpu affinity failed.";
}
#if defined(MACE_ENABLE_OPENCL) || defined(MACE_ENABLE_HTA)
std::shared_ptr<GPUContext> gpu_context;
......@@ -544,7 +544,7 @@ int Main(int argc, char **argv) {
LOG(INFO) << "restart_round: " << FLAGS_restart_round;
LOG(INFO) << "gpu_perf_hint: " << FLAGS_gpu_perf_hint;
LOG(INFO) << "gpu_priority_hint: " << FLAGS_gpu_priority_hint;
LOG(INFO) << "omp_num_threads: " << FLAGS_omp_num_threads;
LOG(INFO) << "num_threads: " << FLAGS_num_threads;
LOG(INFO) << "cpu_affinity_policy: " << FLAGS_cpu_affinity_policy;
auto limit_opencl_kernel_time = getenv("MACE_LIMIT_OPENCL_KERNEL_TIME");
if (limit_opencl_kernel_time) {
......
......@@ -12,7 +12,6 @@ load(
"if_android",
"if_android_armv7",
"if_neon_enabled",
"if_openmp_enabled",
)
cc_library(
......@@ -41,9 +40,7 @@ cc_library(
"-Werror",
"-Wextra",
"-Wno-missing-field-initializers",
] + if_openmp_enabled([
"-fopenmp",
]) + if_neon_enabled([
] + if_neon_enabled([
"-DMACE_ENABLE_NEON",
]) + if_android_armv7([
"-mfpu=neon",
......
......@@ -11,19 +11,18 @@ load(
"if_hexagon_enabled",
"if_neon_enabled",
"if_opencl_enabled",
"if_openmp_enabled",
"if_quantize_enabled",
)
cc_library(
name = "benchmark_utils",
testonly = 1,
hdrs = glob([
"mace/benchmark_utils/*.h",
]),
srcs = glob([
"mace/benchmark_utils/*.cc",
]),
hdrs = glob([
"mace/benchmark_utils/*.h",
]),
copts = [
"-Werror",
"-Wextra",
......@@ -31,9 +30,9 @@ cc_library(
],
strip_include_prefix = "",
deps = [
"//external:gflags_nothreads",
"//mace/core",
"//test/ccutils",
"//external:gflags_nothreads",
],
)
......
......@@ -20,7 +20,7 @@
#include "mace/ops/ops_test_util.h"
DEFINE_string(filter, "all", "op benchmark regex filter, eg:.*CONV.*");
DEFINE_int32(omp_num_threads, -1, "num of openmp threads");
DEFINE_int32(num_threads, -1, "num of threads");
DEFINE_int32(cpu_affinity_policy, 1,
"0:AFFINITY_NONE/1:AFFINITY_BIG_ONLY/2:AFFINITY_LITTLE_ONLY");
......@@ -32,7 +32,7 @@ int main(int argc, char **argv) {
// config runtime
mace::ops::test::OpTestContext::Get(
FLAGS_omp_num_threads,
FLAGS_num_threads,
static_cast<mace::CPUAffinityPolicy>(FLAGS_cpu_affinity_policy));
mace::testing::Benchmark::Run(FLAGS_filter.c_str());
......
......@@ -12,7 +12,6 @@ load(
"if_hta_enabled",
"if_neon_enabled",
"if_opencl_enabled",
"if_openmp_enabled",
"if_quantize_enabled",
)
......@@ -49,9 +48,7 @@ cc_test(
"-Werror",
"-Wextra",
"-Wno-missing-field-initializers",
] + if_openmp_enabled([
"-fopenmp",
]) + if_neon_enabled([
] + if_neon_enabled([
"-DMACE_ENABLE_NEON",
]) + if_android_armv7([
"-mfpu=neon-fp16",
......@@ -65,9 +62,6 @@ cc_test(
]) + if_hta_enabled([
"-DMACE_ENABLE_HTA",
]),
linkopts = if_openmp_enabled([
"-fopenmp",
]),
linkstatic = 1,
deps = [
"//mace/ops",
......
......@@ -12,7 +12,6 @@ load(
"if_hta_enabled",
"if_neon_enabled",
"if_opencl_enabled",
"if_openmp_enabled",
"if_quantize_enabled",
)
......@@ -36,9 +35,7 @@ cc_test(
"-Werror",
"-Wextra",
"-Wno-missing-field-initializers",
] + if_openmp_enabled([
"-fopenmp",
]) + if_neon_enabled([
] + if_neon_enabled([
"-DMACE_ENABLE_NEON",
]) + if_android_armv7([
"-mfpu=neon",
......@@ -53,9 +50,6 @@ cc_test(
]) + if_hta_enabled([
"-DMACE_ENABLE_HTA",
]),
linkopts = if_openmp_enabled([
"-fopenmp",
]),
linkstatic = 1,
deps = [
":mace_api_test_header",
......@@ -73,9 +67,7 @@ cc_test(
"-Werror",
"-Wextra",
"-Wno-missing-field-initializers",
] + if_openmp_enabled([
"-fopenmp",
]) + if_neon_enabled([
] + if_neon_enabled([
"-DMACE_ENABLE_NEON",
]) + if_android_armv7([
"-mfpu=neon",
......@@ -90,9 +82,6 @@ cc_test(
]) + if_hta_enabled([
"-DMACE_ENABLE_HTA",
]),
linkopts = if_openmp_enabled([
"-fopenmp",
]),
linkstatic = 1,
deps = [
":mace_api_test_header",
......@@ -110,9 +99,7 @@ cc_test(
"-Werror",
"-Wextra",
"-Wno-missing-field-initializers",
] + if_openmp_enabled([
"-fopenmp",
]) + if_neon_enabled([
] + if_neon_enabled([
"-DMACE_ENABLE_NEON",
]) + if_android_armv7([
"-mfpu=neon",
......@@ -127,9 +114,6 @@ cc_test(
]) + if_hta_enabled([
"-DMACE_ENABLE_HTA",
]),
linkopts = if_openmp_enabled([
"-fopenmp",
]),
linkstatic = 1,
deps = [
"//mace/libmace",
......@@ -146,9 +130,7 @@ cc_test(
"-Werror",
"-Wextra",
"-Wno-missing-field-initializers",
] + if_openmp_enabled([
"-fopenmp",
]) + if_neon_enabled([
] + if_neon_enabled([
"-DMACE_ENABLE_NEON",
]) + if_android_armv7([
"-mfpu=neon",
......@@ -163,9 +145,6 @@ cc_test(
]) + if_hta_enabled([
"-DMACE_ENABLE_HTA",
]),
linkopts = if_openmp_enabled([
"-fopenmp",
]),
linkstatic = 1,
deps = [
"//mace/libmace",
......
......@@ -11,7 +11,6 @@ load(
"if_hexagon_enabled",
"if_neon_enabled",
"if_opencl_enabled",
"if_openmp_enabled",
"if_quantize_enabled",
)
......@@ -29,7 +28,7 @@ cc_library(
copts = [
"-Werror",
"-Wextra",
] + if_openmp_enabled(["-fopenmp"]) + if_neon_enabled([
] + if_neon_enabled([
"-DMACE_ENABLE_NEON",
]) + if_android_armv7([
"-mfpu=neon-fp16",
......
......@@ -117,7 +117,7 @@ DataFormatStrs = [
class DefaultValues(object):
mace_lib_type = MACELibType.static
omp_num_threads = -1,
num_threads = -1,
cpu_affinity_policy = 1,
gpu_perf_hint = 3,
gpu_priority_hint = 3,
......@@ -887,7 +887,7 @@ def convert_func(flags):
################################
# run
################################
def build_mace_run(configs, target_abi, toolchain, enable_openmp,
def build_mace_run(configs, target_abi, toolchain,
address_sanitizer, mace_lib_type, debug_mode):
library_name = configs[YAMLKeyword.library_name]
......@@ -913,7 +913,6 @@ def build_mace_run(configs, target_abi, toolchain, enable_openmp,
enable_hexagon=hexagon_enabled(configs),
enable_hta=hta_enabled(configs),
enable_apu=apu_enabled(configs),
enable_openmp=enable_openmp,
enable_opencl=opencl_enabled(configs),
enable_quantize=quantize_enabled(configs),
enable_bfloat16=bfloat16_enabled(configs),
......@@ -961,7 +960,6 @@ def run_mace(flags):
build_mace_run(configs,
target_abi,
toolchain,
flags.enable_openmp,
flags.address_sanitizer,
flags.mace_lib_type,
flags.debug_mode)
......@@ -1079,14 +1077,10 @@ def parse_args():
default=DefaultValues.mace_lib_type,
help="[static | dynamic], Which type MACE library to use.")
run.add_argument(
"--enable_openmp",
action="store_true",
help="Enable openmp for multiple thread.")
run.add_argument(
"--omp_num_threads",
"--num_threads",
type=int,
default=DefaultValues.omp_num_threads,
help="num of openmp threads")
default=DefaultValues.num_threads,
help="num of threads")
run.add_argument(
"--cpu_affinity_policy",
type=int,
......
......@@ -173,7 +173,7 @@ class DeviceWrapper:
opencl_binary_file,
opencl_parameter_file,
libmace_dynamic_library_path,
omp_num_threads=-1,
num_threads=-1,
cpu_affinity_policy=1,
gpu_perf_hint=3,
gpu_priority_hint=3,
......@@ -189,11 +189,11 @@ class DeviceWrapper:
benchmark=False,
):
six.print_("* Run '%s' with round=%s, restart_round=%s, tuning=%s, "
"out_of_range_check=%s, omp_num_threads=%s, "
"out_of_range_check=%s, num_threads=%s, "
"cpu_affinity_policy=%s, gpu_perf_hint=%s, "
"gpu_priority_hint=%s" %
(model_tag, running_round, restart_round, str(tuning),
str(out_of_range_check), omp_num_threads,
str(out_of_range_check), num_threads,
cpu_affinity_policy, gpu_perf_hint, gpu_priority_hint))
mace_model_path = ""
if model_graph_format == ModelFormat.file:
......@@ -236,7 +236,7 @@ class DeviceWrapper:
"--device=%s" % device_type,
"--round=%s" % running_round,
"--restart_round=%s" % restart_round,
"--omp_num_threads=%s" % omp_num_threads,
"--num_threads=%s" % num_threads,
"--cpu_affinity_policy=%s" % cpu_affinity_policy,
"--gpu_perf_hint=%s" % gpu_perf_hint,
"--gpu_priority_hint=%s" % gpu_priority_hint,
......@@ -336,7 +336,7 @@ class DeviceWrapper:
"--device=%s" % device_type,
"--round=%s" % running_round,
"--restart_round=%s" % restart_round,
"--omp_num_threads=%s" % omp_num_threads,
"--num_threads=%s" % num_threads,
"--cpu_affinity_policy=%s" % cpu_affinity_policy,
"--gpu_perf_hint=%s" % gpu_perf_hint,
"--gpu_priority_hint=%s" % gpu_priority_hint,
......@@ -541,7 +541,7 @@ class DeviceWrapper:
out_of_range_check=flags.gpu_out_of_range_check,
model_graph_format=configs[
YAMLKeyword.model_graph_format],
omp_num_threads=flags.omp_num_threads,
num_threads=flags.num_threads,
cpu_affinity_policy=flags.cpu_affinity_policy,
gpu_perf_hint=flags.gpu_perf_hint,
gpu_priority_hint=flags.gpu_priority_hint,
......
......@@ -31,7 +31,7 @@ class ScratchComputer:
MaceOp.Conv2D: self.scratch_size_no_need,
MaceOp.Squeeze: self.scratch_size_of_squeeze,
MaceOp.Softmax: self.scratch_size_no_need,
MaceOp.Eltwise: self.scratch_size_no_need,
MaceOp.Eltwise: self.scratch_size_eltwise,
MaceOp.Activation: self.scratch_size_no_need,
MaceOp.StridedSlice: self.scratch_size_no_need,
MaceOp.Reduce: self.scratch_size_no_need,
......@@ -126,4 +126,8 @@ class ScratchComputer:
def scratch_size_of_squeeze(self, op_def):
input0_dims = self.get_op_input_dims(op_def, 0)
return len(input0_dims) * self.get_data_bytes(mace_pb2.DT_FLOAT)
return len(input0_dims) * self.get_data_bytes(mace_pb2.DT_INT32) * 2
def scratch_size_eltwise(self, op_def):
input0_dims = self.get_op_input_dims(op_def, 0)
return len(input0_dims) * self.get_data_bytes(mace_pb2.DT_INT32) * 3
......@@ -19,17 +19,19 @@ import six
from py_proto import mace_pb2
from transform import base_converter
from transform.base_converter import PoolingType
from transform.base_converter import PaddingMode
from transform.base_converter import ActivationType
from transform.base_converter import ConverterUtil
from transform.base_converter import DataFormat
from transform.base_converter import EltwiseType
from transform.base_converter import ReduceType
from transform.base_converter import FrameworkType
from transform.base_converter import RoundMode
from transform.base_converter import DataFormat
from transform.base_converter import MaceOp
from transform.base_converter import MaceKeyword
from transform.base_converter import ConverterUtil
from transform.base_converter import PoolingType
from transform.base_converter import PaddingMode
from transform.base_converter import PadType
from transform.base_converter import ReduceType
from transform.base_converter import RoundMode
from utils.util import mace_check
import numpy as np
......@@ -1225,11 +1227,11 @@ class OnnxConverter(base_converter.ConverterInterface):
padding_type_arg = op.arg.add()
padding_type_arg.name = MaceKeyword.mace_padding_type_str
if mode == 'reflect':
padding_type_arg.i = PadType.REFLECT
padding_type_arg.i = PadType.REFLECT.value
elif mode == 'edge':
padding_type_arg.i = PadType.SYMMETRIC
padding_type_arg.i = PadType.SYMMETRIC.value
else:
padding_type_arg.i = PadType.CONSTANT
padding_type_arg.i = PadType.CONSTANT.value
if 'pads' in node.attrs:
paddings_arg = op.arg.add()
paddings_arg.name = MaceKeyword.mace_paddings_str
......
......@@ -266,7 +266,6 @@ def bazel_build(target,
enable_hexagon=False,
enable_hta=False,
enable_apu=False,
enable_openmp=False,
enable_neon=True,
enable_opencl=True,
enable_quantize=True,
......@@ -284,8 +283,6 @@ def bazel_build(target,
"--config",
toolchain,
"--define",
"openmp=%s" % str(enable_openmp).lower(),
"--define",
"quantize=%s" % str(enable_quantize).lower(),
"--define",
"bfloat16=%s" % str(enable_bfloat16).lower(),
......@@ -301,8 +298,6 @@ def bazel_build(target,
"--define",
"neon=%s" % str(enable_neon).lower(),
"--define",
"openmp=%s" % str(enable_openmp).lower(),
"--define",
"opencl=%s" % str(enable_opencl).lower(),
"--define",
"quantize=%s" % str(enable_quantize).lower(),
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册