提交 1e91554e 编写于 作者: 叶剑武

Merge branch 'opt_code' into 'master'

remove openmp, fix scratch bug in micro and opt some codes

See merge request deep-computing/mace!1270
...@@ -364,12 +364,12 @@ Tuning for specific SoC's GPU ...@@ -364,12 +364,12 @@ Tuning for specific SoC's GPU
.. note:: .. note::
You should plug in device(s) with the specific SoC(s). You must specify the ``target_socs`` in your YAML file and plug in device(s) with the specific SoC(s).
.. code-block:: sh .. code-block:: sh
python tools/converter.py run --config=/path/to/model_deployment_file.yml --validate python tools/converter.py run --config=/path/to/model_deployment_file.yml
The command will generate two files in `build/${library_name}/opencl`, like the following dir-tree. The command will generate two files in `build/${library_name}/opencl`, like the following dir-tree.
...@@ -487,7 +487,7 @@ the detailed information is in :doc:`benchmark`. ...@@ -487,7 +487,7 @@ the detailed information is in :doc:`benchmark`.
- default - default
- commands - commands
- explanation - explanation
* - --omp_num_threads * - --num_threads
- int - int
- -1 - -1
- ``run`` - ``run``
......
...@@ -180,4 +180,4 @@ Please refer to \ ``mace/tools/mace_run.cc``\ for full usage. The following list ...@@ -180,4 +180,4 @@ Please refer to \ ``mace/tools/mace_run.cc``\ for full usage. The following list
// 5. Run the model // 5. Run the model
MaceStatus status = engine.Run(inputs, &outputs); MaceStatus status = engine.Run(inputs, &outputs);
More details are in :doc:`advanced_usage`. More details are in :doc:`advanced_usage_cmake`.
...@@ -54,7 +54,7 @@ cp -rf include/mace $INCLUDE_DIR ...@@ -54,7 +54,7 @@ cp -rf include/mace $INCLUDE_DIR
cp -rf build/mobilenet/include/mace/public/*.h $INCLUDE_DIR/mace/public/ cp -rf build/mobilenet/include/mace/public/*.h $INCLUDE_DIR/mace/public/
cp -rf build/mobilenet/model $LIBRARY_DIR cp -rf build/mobilenet/model $LIBRARY_DIR
bazel build --config android --config optimization $BAZEL_LIBMACE_TARGET --define neon=true --define openmp=true --define opencl=true --define quantize=true --cpu=$TARGET_ABI bazel build --config android --config optimization $BAZEL_LIBMACE_TARGET --define neon=true --define opencl=true --define quantize=true --cpu=$TARGET_ABI
cp -rf $BAZEL_GEN_LIBMACE_PATH $LIBMACE_DIR cp -rf $BAZEL_GEN_LIBMACE_PATH $LIBMACE_DIR
if [ $MACE_LINK_TYPE == "dynamic" ]; then if [ $MACE_LINK_TYPE == "dynamic" ]; then
......
...@@ -16,7 +16,7 @@ android { ...@@ -16,7 +16,7 @@ android {
externalNativeBuild { externalNativeBuild {
cmake { cmake {
cppFlags "-std=c++11 -fopenmp" cppFlags "-std=c++11"
abiFilters "arm64-v8a" abiFilters "arm64-v8a"
} }
} }
......
...@@ -95,7 +95,7 @@ Java_com_xiaomi_mace_JniMaceUtils_maceMobilenetCreateGPUContext( ...@@ -95,7 +95,7 @@ Java_com_xiaomi_mace_JniMaceUtils_maceMobilenetCreateGPUContext(
JNIEXPORT jint JNICALL JNIEXPORT jint JNICALL
Java_com_xiaomi_mace_JniMaceUtils_maceMobilenetCreateEngine( Java_com_xiaomi_mace_JniMaceUtils_maceMobilenetCreateEngine(
JNIEnv *env, jclass thisObj, jint omp_num_threads, jint cpu_affinity_policy, JNIEnv *env, jclass thisObj, jint num_threads, jint cpu_affinity_policy,
jint gpu_perf_hint, jint gpu_priority_hint, jint gpu_perf_hint, jint gpu_priority_hint,
jstring model_name_str, jstring device) { jstring model_name_str, jstring device) {
MaceContext &mace_context = GetMaceContext(); MaceContext &mace_context = GetMaceContext();
...@@ -110,14 +110,13 @@ Java_com_xiaomi_mace_JniMaceUtils_maceMobilenetCreateEngine( ...@@ -110,14 +110,13 @@ Java_com_xiaomi_mace_JniMaceUtils_maceMobilenetCreateEngine(
mace::MaceStatus status; mace::MaceStatus status;
mace::MaceEngineConfig config(mace_context.device_type); mace::MaceEngineConfig config(mace_context.device_type);
status = config.SetCPUThreadPolicy( status = config.SetCPUThreadPolicy(
omp_num_threads, num_threads,
static_cast<mace::CPUAffinityPolicy>(cpu_affinity_policy)); static_cast<mace::CPUAffinityPolicy>(cpu_affinity_policy));
if (status != mace::MaceStatus::MACE_SUCCESS) { if (status != mace::MaceStatus::MACE_SUCCESS) {
__android_log_print(ANDROID_LOG_ERROR, __android_log_print(ANDROID_LOG_ERROR,
"image_classify attrs", "image_classify attrs",
"openmp result: %s, threads: %d, cpu: %d", "threads: %d, cpu: %d",
status.information().c_str(), omp_num_threads, num_threads, cpu_affinity_policy);
cpu_affinity_policy);
} }
if (mace_context.device_type == mace::DeviceType::GPU) { if (mace_context.device_type == mace::DeviceType::GPU) {
config.SetGPUContext(mace_context.gpu_context); config.SetGPUContext(mace_context.gpu_context);
......
...@@ -316,8 +316,6 @@ class MACE_API MaceEngineConfig { ...@@ -316,8 +316,6 @@ class MACE_API MaceEngineConfig {
/// (AFFINITY_NONE) cores according to the policy. The threads number will /// (AFFINITY_NONE) cores according to the policy. The threads number will
/// also be truncated to the corresponding cores number when num_threads_hint /// also be truncated to the corresponding cores number when num_threads_hint
/// is larger than it. /// is larger than it.
/// The OpenMP threads will be bind to (via sched_setaffinity) big cores
/// (AFFINITY_BIG_ONLY) and little cores (AFFINITY_LITTLE_ONLY).
/// ///
/// \param num_threads_hint it is only a hint. /// \param num_threads_hint it is only a hint.
/// \param policy one of CPUAffinityPolicy /// \param policy one of CPUAffinityPolicy
......
...@@ -108,14 +108,6 @@ config_setting( ...@@ -108,14 +108,6 @@ config_setting(
visibility = ["//visibility:public"], visibility = ["//visibility:public"],
) )
config_setting(
name = "openmp_enabled",
define_values = {
"openmp": "true",
},
visibility = ["//visibility:public"],
)
config_setting( config_setting(
name = "opencl_enabled", name = "opencl_enabled",
define_values = { define_values = {
......
...@@ -17,7 +17,6 @@ load( ...@@ -17,7 +17,6 @@ load(
"if_not_apu_enabled", "if_not_apu_enabled",
"if_not_hexagon_enabled", "if_not_hexagon_enabled",
"if_opencl_enabled", "if_opencl_enabled",
"if_openmp_enabled",
"if_quantize_enabled", "if_quantize_enabled",
"if_rpcmem_enabled", "if_rpcmem_enabled",
) )
...@@ -81,10 +80,7 @@ cc_library( ...@@ -81,10 +80,7 @@ cc_library(
"-Werror", "-Werror",
"-Wextra", "-Wextra",
"-Wno-missing-field-initializers", "-Wno-missing-field-initializers",
] + if_openmp_enabled([ ] + if_opencl_enabled([
"-fopenmp",
"-DMACE_ENABLE_OPENMP",
]) + if_opencl_enabled([
"-DMACE_ENABLE_OPENCL", "-DMACE_ENABLE_OPENCL",
]) + if_quantize_enabled([ ]) + if_quantize_enabled([
"-DMACE_ENABLE_QUANTIZE", "-DMACE_ENABLE_QUANTIZE",
......
...@@ -14,10 +14,6 @@ ...@@ -14,10 +14,6 @@
#include "mace/core/runtime/cpu/cpu_runtime.h" #include "mace/core/runtime/cpu/cpu_runtime.h"
#ifdef MACE_ENABLE_OPENMP
#include <omp.h>
#endif
#include <algorithm> #include <algorithm>
#include <cerrno> #include <cerrno>
#include <cmath> #include <cmath>
...@@ -35,62 +31,7 @@ ...@@ -35,62 +31,7 @@
namespace mace { namespace mace {
int MaceOpenMPThreadCount = 1; MaceStatus CPURuntime::SetThreadsHintAndAffinityPolicy(
enum SchedulePolicy {
SCHED_STATIC,
SCHED_GUIDED,
};
namespace {
MaceStatus SetOpenMPThreadsAndAffinityCPUs(int omp_num_threads,
const std::vector<size_t> &cpu_ids,
SchedulePolicy schedule_policy) {
MaceOpenMPThreadCount = omp_num_threads;
SchedSetAffinity(cpu_ids);
#ifdef MACE_ENABLE_OPENMP
VLOG(1) << "Set OpenMP threads number: " << omp_num_threads
<< ", CPU core IDs: " << MakeString(cpu_ids);
if (schedule_policy == SCHED_GUIDED) {
omp_set_schedule(omp_sched_guided, 1);
} else if (schedule_policy == SCHED_STATIC) {
omp_set_schedule(omp_sched_static, 0);
} else {
LOG(WARNING) << "Unknown schedule policy: " << schedule_policy;
}
omp_set_num_threads(omp_num_threads);
#else
MACE_UNUSED(omp_num_threads);
MACE_UNUSED(schedule_policy);
VLOG(2) << "Set OpenMP threads number failed: OpenMP not enabled.";
#endif
#ifdef MACE_ENABLE_OPENMP
std::vector<MaceStatus> status(omp_num_threads,
MaceStatus::MACE_INVALID_ARGS);
#pragma omp parallel for
for (int i = 0; i < omp_num_threads; ++i) {
VLOG(1) << "Set affinity for OpenMP thread " << omp_get_thread_num()
<< "/" << omp_get_num_threads();
status[i] = SchedSetAffinity(cpu_ids);
}
for (int i = 0; i < omp_num_threads; ++i) {
if (status[i] != MaceStatus::MACE_SUCCESS)
return MaceStatus::MACE_INVALID_ARGS;
}
return MaceStatus::MACE_SUCCESS;
#else
MaceStatus status = SchedSetAffinity(cpu_ids);
VLOG(1) << "Set affinity without OpenMP: " << MakeString(cpu_ids);
return status;
#endif
}
} // namespace
MaceStatus CPURuntime::SetOpenMPThreadsAndAffinityPolicy(
int num_threads_hint, int num_threads_hint,
CPUAffinityPolicy policy, CPUAffinityPolicy policy,
void *gemm_context) { void *gemm_context) {
...@@ -115,19 +56,8 @@ MaceStatus CPURuntime::SetOpenMPThreadsAndAffinityPolicy( ...@@ -115,19 +56,8 @@ MaceStatus CPURuntime::SetOpenMPThreadsAndAffinityPolicy(
#else #else
MACE_UNUSED(gemm_context); MACE_UNUSED(gemm_context);
#endif // MACE_ENABLE_QUANTIZE #endif // MACE_ENABLE_QUANTIZE
#ifdef MACE_ENABLE_OPENMP
omp_set_num_threads(num_threads_hint);
#else
VLOG(2) << "Set OpenMP threads number failed: OpenMP not enabled.";
#endif
return MaceStatus::MACE_SUCCESS;
}
SchedulePolicy sched_policy = SCHED_GUIDED; return MaceStatus::MACE_SUCCESS;
float first_freq = cpu_max_freqs[cores_to_use[0]];
float last_freq = cpu_max_freqs[cores_to_use[cores_to_use.size() - 1]];
if (std::abs(first_freq - last_freq) < 1e-6) {
sched_policy = SCHED_STATIC;
} }
#ifdef MACE_ENABLE_QUANTIZE #ifdef MACE_ENABLE_QUANTIZE
...@@ -137,9 +67,10 @@ MaceStatus CPURuntime::SetOpenMPThreadsAndAffinityPolicy( ...@@ -137,9 +67,10 @@ MaceStatus CPURuntime::SetOpenMPThreadsAndAffinityPolicy(
} }
#endif // MACE_ENABLE_QUANTIZE #endif // MACE_ENABLE_QUANTIZE
return SetOpenMPThreadsAndAffinityCPUs(num_threads_hint, MaceStatus status = SchedSetAffinity(cores_to_use);
cores_to_use, VLOG(1) << "Set affinity : " << MakeString(cores_to_use);
sched_policy);
return status;
} }
} // namespace mace } // namespace mace
......
...@@ -29,8 +29,6 @@ ...@@ -29,8 +29,6 @@
namespace mace { namespace mace {
extern int MaceOpenMPThreadCount;
class CPURuntime { class CPURuntime {
public: public:
CPURuntime(const int num_threads, CPURuntime(const int num_threads,
...@@ -43,9 +41,9 @@ class CPURuntime { ...@@ -43,9 +41,9 @@ class CPURuntime {
#ifdef MACE_ENABLE_QUANTIZE #ifdef MACE_ENABLE_QUANTIZE
MACE_CHECK_NOTNULL(GetGemmlowpContext()); MACE_CHECK_NOTNULL(GetGemmlowpContext());
#endif // MACE_ENABLE_QUANTIZE #endif // MACE_ENABLE_QUANTIZE
SetOpenMPThreadsAndAffinityPolicy(num_threads_, SetThreadsHintAndAffinityPolicy(num_threads_,
policy_, policy_,
gemm_context_); gemm_context_);
} }
#ifdef MACE_ENABLE_QUANTIZE #ifdef MACE_ENABLE_QUANTIZE
...@@ -78,8 +76,8 @@ class CPURuntime { ...@@ -78,8 +76,8 @@ class CPURuntime {
} }
private: private:
MaceStatus SetOpenMPThreadsAndAffinityPolicy( MaceStatus SetThreadsHintAndAffinityPolicy(
int omp_num_threads_hint, int num_threads_hint,
CPUAffinityPolicy policy, CPUAffinityPolicy policy,
void *gemm_context); void *gemm_context);
......
...@@ -20,7 +20,6 @@ load( ...@@ -20,7 +20,6 @@ load(
"if_linux_base", "if_linux_base",
"if_neon_enabled", "if_neon_enabled",
"if_opencl_enabled", "if_opencl_enabled",
"if_openmp_enabled",
"if_quantize_enabled", "if_quantize_enabled",
"if_rpcmem_enabled", "if_rpcmem_enabled",
) )
...@@ -33,7 +32,7 @@ cc_library( ...@@ -33,7 +32,7 @@ cc_library(
copts = [ copts = [
"-Werror", "-Werror",
"-Wextra", "-Wextra",
] + if_openmp_enabled(["-fopenmp"]) + if_neon_enabled([ ] + if_neon_enabled([
"-DMACE_ENABLE_NEON", "-DMACE_ENABLE_NEON",
]) + if_android_armv7([ ]) + if_android_armv7([
"-mfpu=neon", "-mfpu=neon",
...@@ -70,9 +69,7 @@ cc_binary( ...@@ -70,9 +69,7 @@ cc_binary(
"-Wl,--version-script", "-Wl,--version-script",
"$(location //mace/libmace:mace_version_script.lds)", "$(location //mace/libmace:mace_version_script.lds)",
], ],
) + if_openmp_enabled([ ),
"-fopenmp",
]),
linkshared = 1, linkshared = 1,
linkstatic = 0, linkstatic = 0,
deps = [ deps = [
......
# -*- Python -*- # -*- Python -*-
def if_linux_base(a, default_value = []): def if_linux_base(a, default_value = []):
return select({ return select({
"//mace:linux_base": a, "//mace:linux_base": a,
"//conditions:default": default_value, "//conditions:default": default_value,
}) })
def if_android(a, default_value = []): def if_android(a, default_value = []):
return select({ return select({
"//mace:android": a, "//mace:android": a,
"//conditions:default": default_value, "//conditions:default": default_value,
}) })
def if_linux(a, default_value = []): def if_linux(a, default_value = []):
return select({ return select({
"//mace:linux": a, "//mace:linux": a,
"//conditions:default": default_value, "//conditions:default": default_value,
}) })
def if_darwin(a, default_value = []): def if_darwin(a, default_value = []):
return select({ return select({
"//mace:darwin": a, "//mace:darwin": a,
"//conditions:default": default_value, "//conditions:default": default_value,
}) })
def if_android_armv7(a): def if_android_armv7(a):
return select({ return select({
"//mace:android_armv7": a, "//mace:android_armv7": a,
"//conditions:default": [], "//conditions:default": [],
}) })
def if_android_arm64(a): def if_android_arm64(a):
return select({ return select({
"//mace:android_arm64": a, "//mace:android_arm64": a,
"//conditions:default": [], "//conditions:default": [],
}) })
def if_arm_linux_aarch64(a): def if_arm_linux_aarch64(a):
return select({ return select({
"//mace:arm_linux_aarch64": a, "//mace:arm_linux_aarch64": a,
"//conditions:default": [], "//conditions:default": [],
}) })
def if_arm_linux_armhf(a): def if_arm_linux_armhf(a):
return select({ return select({
"//mace:arm_linux_armhf": a, "//mace:arm_linux_armhf": a,
"//conditions:default": [] "//conditions:default": [],
}) })
def if_neon_enabled(a, default_value = []): def if_neon_enabled(a, default_value = []):
return select({ return select({
"//mace:neon_enabled": a, "//mace:neon_enabled": a,
"//conditions:default": default_value, "//conditions:default": default_value,
}) })
def if_hexagon_enabled(a): def if_hexagon_enabled(a):
return select({ return select({
"//mace:hexagon_enabled": a, "//mace:hexagon_enabled": a,
"//conditions:default": [], "//conditions:default": [],
}) })
def if_not_hexagon_enabled(a): def if_not_hexagon_enabled(a):
return select({ return select({
"//mace:hexagon_enabled": [], "//mace:hexagon_enabled": [],
"//conditions:default": a, "//conditions:default": a,
}) })
def if_hta_enabled(a): def if_hta_enabled(a):
return select({ return select({
"//mace:hta_enabled": a, "//mace:hta_enabled": a,
"//conditions:default": [], "//conditions:default": [],
}) })
def if_hexagon_or_hta_enabled(a): def if_hexagon_or_hta_enabled(a):
return select({ return select({
"//mace:hexagon_enabled": a, "//mace:hexagon_enabled": a,
"//mace:hta_enabled": a, "//mace:hta_enabled": a,
"//conditions:default": [], "//conditions:default": [],
}) })
def if_apu_enabled(a): def if_apu_enabled(a):
return select({ return select({
"//mace:apu_enabled": a, "//mace:apu_enabled": a,
"//conditions:default": [], "//conditions:default": [],
}) })
def if_not_apu_enabled(a): def if_not_apu_enabled(a):
return select({ return select({
"//mace:apu_enabled": [], "//mace:apu_enabled": [],
"//conditions:default": a, "//conditions:default": a,
}) })
def if_openmp_enabled(a):
return select({
"//mace:openmp_enabled": a,
"//conditions:default": [],
})
def if_opencl_enabled(a, default_value = []): def if_opencl_enabled(a, default_value = []):
return select({ return select({
"//mace:opencl_enabled": a, "//mace:opencl_enabled": a,
"//conditions:default": default_value, "//conditions:default": default_value,
}) })
def if_quantize_enabled(a): def if_quantize_enabled(a):
return select({ return select({
"//mace:quantize_enabled": a, "//mace:quantize_enabled": a,
"//conditions:default": [], "//conditions:default": [],
}) })
def if_bfloat16_enabled(a): def if_bfloat16_enabled(a):
return select({ return select({
"//mace:bfloat16_enabled": a, "//mace:bfloat16_enabled": a,
"//conditions:default": [], "//conditions:default": [],
}) })
def if_rpcmem_enabled(a): def if_rpcmem_enabled(a):
return select({ return select({
"//mace:rpcmem_enabled": a, "//mace:rpcmem_enabled": a,
"//conditions:default": [], "//conditions:default": [],
}) })
def mace_version_genrule(): def mace_version_genrule():
native.genrule( native.genrule(
name = "mace_version_gen", name = "mace_version_gen",
srcs = [str(Label("@local_version_config//:gen/version"))], srcs = [str(Label("@local_version_config//:gen/version"))],
outs = ["version/version.cc"], outs = ["version/version.cc"],
cmd = "cat $(SRCS) > $@;" cmd = "cat $(SRCS) > $@;",
) )
def encrypt_opencl_kernel_genrule(): def encrypt_opencl_kernel_genrule():
srcs = [ srcs = [
......
...@@ -14,7 +14,6 @@ load( ...@@ -14,7 +14,6 @@ load(
"if_hexagon_enabled", "if_hexagon_enabled",
"if_neon_enabled", "if_neon_enabled",
"if_opencl_enabled", "if_opencl_enabled",
"if_openmp_enabled",
"if_quantize_enabled", "if_quantize_enabled",
) )
...@@ -36,9 +35,7 @@ cc_library( ...@@ -36,9 +35,7 @@ cc_library(
"-Werror", "-Werror",
"-Wextra", "-Wextra",
"-Wno-missing-field-initializers", "-Wno-missing-field-initializers",
] + if_openmp_enabled([ ] + if_neon_enabled([
"-fopenmp",
]) + if_neon_enabled([
"-DMACE_ENABLE_NEON", "-DMACE_ENABLE_NEON",
]) + if_android_armv7([ ]) + if_android_armv7([
"-mfpu=neon-fp16", "-mfpu=neon-fp16",
...@@ -77,9 +74,7 @@ cc_library( ...@@ -77,9 +74,7 @@ cc_library(
"-Werror", "-Werror",
"-Wextra", "-Wextra",
"-Wno-missing-field-initializers", "-Wno-missing-field-initializers",
] + if_openmp_enabled([ ] + if_neon_enabled([
"-fopenmp",
]) + if_neon_enabled([
"-DMACE_ENABLE_NEON", "-DMACE_ENABLE_NEON",
]) + if_android_armv7([ ]) + if_android_armv7([
"-mfpu=neon-fp16", "-mfpu=neon-fp16",
...@@ -134,9 +129,7 @@ cc_library( ...@@ -134,9 +129,7 @@ cc_library(
"-Werror", "-Werror",
"-Wextra", "-Wextra",
"-Wno-missing-field-initializers", "-Wno-missing-field-initializers",
] + if_openmp_enabled([ ] + if_neon_enabled([
"-fopenmp",
]) + if_neon_enabled([
"-DMACE_ENABLE_NEON", "-DMACE_ENABLE_NEON",
]) + if_android_armv7([ ]) + if_android_armv7([
"-mfpu=neon-fp16", "-mfpu=neon-fp16",
...@@ -176,9 +169,7 @@ cc_library( ...@@ -176,9 +169,7 @@ cc_library(
"-Werror", "-Werror",
"-Wextra", "-Wextra",
"-Wno-missing-field-initializers", "-Wno-missing-field-initializers",
] + if_openmp_enabled([ ] + if_neon_enabled([
"-fopenmp",
]) + if_neon_enabled([
"-DMACE_ENABLE_NEON", "-DMACE_ENABLE_NEON",
]) + if_android_armv7([ ]) + if_android_armv7([
"-mfpu=neon-fp16", "-mfpu=neon-fp16",
...@@ -221,9 +212,7 @@ cc_library( ...@@ -221,9 +212,7 @@ cc_library(
"-Werror", "-Werror",
"-Wextra", "-Wextra",
"-Wno-missing-field-initializers", "-Wno-missing-field-initializers",
] + if_openmp_enabled([ ] + if_neon_enabled([
"-fopenmp",
]) + if_neon_enabled([
"-DMACE_ENABLE_NEON", "-DMACE_ENABLE_NEON",
]) + if_android_armv7([ ]) + if_android_armv7([
"-mfpu=neon-fp16", "-mfpu=neon-fp16",
...@@ -263,9 +252,7 @@ cc_library( ...@@ -263,9 +252,7 @@ cc_library(
"-Werror", "-Werror",
"-Wextra", "-Wextra",
"-Wno-missing-field-initializers", "-Wno-missing-field-initializers",
] + if_openmp_enabled([ ] + if_neon_enabled([
"-fopenmp",
]) + if_neon_enabled([
"-DMACE_ENABLE_NEON", "-DMACE_ENABLE_NEON",
]) + if_android_armv7([ ]) + if_android_armv7([
"-mfpu=neon-fp16", "-mfpu=neon-fp16",
......
...@@ -21,8 +21,9 @@ ...@@ -21,8 +21,9 @@
#define MACE_ENABLE_FP16_NEON #define MACE_ENABLE_FP16_NEON
#endif #endif
#include "mace/core/ops/op_context.h"
#include "mace/core/types.h" #include "mace/core/types.h"
#include "mace/utils/thread_pool.h"
#if defined(MACE_ENABLE_NEON) && defined(__ANDROID__) #if defined(MACE_ENABLE_NEON) && defined(__ANDROID__)
#include <arm_neon.h> #include <arm_neon.h>
...@@ -36,9 +37,10 @@ namespace mace { ...@@ -36,9 +37,10 @@ namespace mace {
namespace ops { namespace ops {
template<typename INPUT_TYPE_LEFT, template<typename INPUT_TYPE_LEFT,
typename INPUT_TYPE_RIGHT, typename INPUT_TYPE_RIGHT,
typename OUTPUT_TYPE> typename OUTPUT_TYPE>
void FP16Gemv(const INPUT_TYPE_LEFT *m_ptr, void FP16Gemv(OpContext *context,
const INPUT_TYPE_LEFT *m_ptr,
const INPUT_TYPE_RIGHT *v_ptr, const INPUT_TYPE_RIGHT *v_ptr,
const index_t height, const index_t height,
const index_t width, const index_t width,
...@@ -46,78 +48,83 @@ void FP16Gemv(const INPUT_TYPE_LEFT *m_ptr, ...@@ -46,78 +48,83 @@ void FP16Gemv(const INPUT_TYPE_LEFT *m_ptr,
#if defined(MACE_ENABLE_FP16_NEON) && defined(__ANDROID__) #if defined(MACE_ENABLE_FP16_NEON) && defined(__ANDROID__)
template<> template<>
void FP16Gemv<float16_t, float, float>(const float16_t *m_ptr, void FP16Gemv<float16_t, float, float>(OpContext *context,
const float16_t *m_ptr,
const float *v_ptr, const float *v_ptr,
const index_t height, const index_t height,
const index_t width, const index_t width,
float *out_ptr) { float *out_ptr) {
#pragma omp parallel for utils::ThreadPool &thread_pool =
for (index_t h = 0; h < height; ++h) { context->device()->cpu_runtime()->thread_pool();
const float16_t *m_ptr0 = m_ptr + h * width;
const float *v_ptr0 = v_ptr; thread_pool.Compute1D([=](index_t start0, index_t end0, index_t step0) {
float *out_ptr0 = out_ptr + h; for (index_t h = start0; h < end0; h += step0) {
float sum0 = 0; const float16_t *m_ptr0 = m_ptr + h * width;
const float *v_ptr0 = v_ptr;
float32x4_t vm0, vm1, vm2, vm3; float *out_ptr0 = out_ptr + h;
float32x4_t vv0, vv1, vv2, vv3; float sum0 = 0;
float32x4_t vsum0 = vdupq_n_f32(0.f);
float32x4_t vsum1 = vdupq_n_f32(0.f); float32x4_t vm0, vm1, vm2, vm3;
float32x4_t vsum2 = vdupq_n_f32(0.f); float32x4_t vv0, vv1, vv2, vv3;
float32x4_t vsum3 = vdupq_n_f32(0.f); float32x4_t vsum0 = vdupq_n_f32(0.f);
float32x4_t vsum1 = vdupq_n_f32(0.f);
index_t w; float32x4_t vsum2 = vdupq_n_f32(0.f);
for (w = 0; w + 15 < width; w += 16) { float32x4_t vsum3 = vdupq_n_f32(0.f);
vm0 = vcvt_f32_f16(vld1_f16(m_ptr0));
vv0 = vld1q_f32(v_ptr0); index_t w;
vm1 = vcvt_f32_f16(vld1_f16(m_ptr0 + 4)); for (w = 0; w + 15 < width; w += 16) {
vv1 = vld1q_f32(v_ptr0 + 4); vm0 = vcvt_f32_f16(vld1_f16(m_ptr0));
vm2 = vcvt_f32_f16(vld1_f16(m_ptr0 + 8)); vv0 = vld1q_f32(v_ptr0);
vv2 = vld1q_f32(v_ptr0 + 8); vm1 = vcvt_f32_f16(vld1_f16(m_ptr0 + 4));
vm3 = vcvt_f32_f16(vld1_f16(m_ptr0 + 12)); vv1 = vld1q_f32(v_ptr0 + 4);
vv3 = vld1q_f32(v_ptr0 + 12); vm2 = vcvt_f32_f16(vld1_f16(m_ptr0 + 8));
vv2 = vld1q_f32(v_ptr0 + 8);
vsum0 = vmlaq_f32(vsum0, vm0, vv0); vm3 = vcvt_f32_f16(vld1_f16(m_ptr0 + 12));
vsum1 = vmlaq_f32(vsum1, vm1, vv1); vv3 = vld1q_f32(v_ptr0 + 12);
vsum2 = vmlaq_f32(vsum2, vm2, vv2);
vsum3 = vmlaq_f32(vsum3, vm3, vv3); vsum0 = vmlaq_f32(vsum0, vm0, vv0);
vsum1 = vmlaq_f32(vsum1, vm1, vv1);
m_ptr0 += 16; vsum2 = vmlaq_f32(vsum2, vm2, vv2);
v_ptr0 += 16; vsum3 = vmlaq_f32(vsum3, vm3, vv3);
}
m_ptr0 += 16;
for (; w + 7 < width; w += 8) { v_ptr0 += 16;
vm0 = vcvt_f32_f16(vld1_f16(m_ptr0)); }
vv0 = vld1q_f32(v_ptr0);
vm1 = vcvt_f32_f16(vld1_f16(m_ptr0 + 4)); for (; w + 7 < width; w += 8) {
vv1 = vld1q_f32(v_ptr0 + 4); vm0 = vcvt_f32_f16(vld1_f16(m_ptr0));
vv0 = vld1q_f32(v_ptr0);
vsum0 = vmlaq_f32(vsum0, vm0, vv0); vm1 = vcvt_f32_f16(vld1_f16(m_ptr0 + 4));
vsum1 = vmlaq_f32(vsum1, vm1, vv1); vv1 = vld1q_f32(v_ptr0 + 4);
m_ptr0 += 8; vsum0 = vmlaq_f32(vsum0, vm0, vv0);
v_ptr0 += 8; vsum1 = vmlaq_f32(vsum1, vm1, vv1);
}
m_ptr0 += 8;
for (; w + 3 < width; w += 4) { v_ptr0 += 8;
vm0 = vcvt_f32_f16(vld1_f16(m_ptr0)); }
vv0 = vld1q_f32(v_ptr0);
vsum0 = vmlaq_f32(vsum0, vm0, vv0); for (; w + 3 < width; w += 4) {
vm0 = vcvt_f32_f16(vld1_f16(m_ptr0));
m_ptr0 += 4; vv0 = vld1q_f32(v_ptr0);
v_ptr0 += 4; vsum0 = vmlaq_f32(vsum0, vm0, vv0);
}
vsum0 += vsum1; m_ptr0 += 4;
vsum2 += vsum3; v_ptr0 += 4;
vsum0 += vsum2; }
sum0 = vaddvq_f32(vsum0); vsum0 += vsum1;
vsum2 += vsum3;
for (; w < width; ++w) { vsum0 += vsum2;
sum0 += m_ptr0[0] * v_ptr0[0]; sum0 = vaddvq_f32(vsum0);
m_ptr0++;
v_ptr0++; for (; w < width; ++w) {
} sum0 += m_ptr0[0] * v_ptr0[0];
*out_ptr0++ = sum0; m_ptr0++;
} v_ptr0++;
}
*out_ptr0++ = sum0;
}
}, 0, height, 1);
} }
#endif // MACE_ENABLE_FP16_NEON && __ANDROID__ #endif // MACE_ENABLE_FP16_NEON && __ANDROID__
......
...@@ -20,7 +20,7 @@ ...@@ -20,7 +20,7 @@
#ifdef MACE_ENABLE_QUANTIZE #ifdef MACE_ENABLE_QUANTIZE
#include "mace/ops/arm/q8/quantization_util.h" #include "mace/ops/arm/q8/quantization_util.h"
// We reuse TensorFlow Lite's optimized depthwiseconv_uint8 and parallelized it // We reuse TensorFlow Lite's optimized depthwiseconv_uint8 and parallelized it
// using OpenMP for MACE's quantized depthwise_conv2d. // using thread pool for MACE's quantized depthwise_conv2d.
#include "tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_uint8.h" #include "tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_uint8.h"
#endif // MACE_ENABLE_QUANTIZE #endif // MACE_ENABLE_QUANTIZE
......
...@@ -567,21 +567,15 @@ class MatMulOp<CPU, float16_t> : public MatMulOpBase { ...@@ -567,21 +567,15 @@ class MatMulOp<CPU, float16_t> : public MatMulOpBase {
B->dtype() == DT_FLOAT) { B->dtype() == DT_FLOAT) {
auto *a_ptr_base = A->data<float16_t>(); auto *a_ptr_base = A->data<float16_t>();
auto *b_ptr_base = B->data<float>(); auto *b_ptr_base = B->data<float>();
FP16Gemv(a_ptr_base, FP16Gemv(context, a_ptr_base, b_ptr_base,
b_ptr_base, height, K, c_ptr_base);
height,
K,
c_ptr_base);
return MaceStatus::MACE_SUCCESS; return MaceStatus::MACE_SUCCESS;
} else if (height == 1 && transpose_b_ && A->dtype() == DT_FLOAT && } else if (height == 1 && transpose_b_ && A->dtype() == DT_FLOAT &&
B->dtype() == DT_FLOAT16) { B->dtype() == DT_FLOAT16) {
auto *b_ptr_base = B->data<float16_t>(); auto *b_ptr_base = B->data<float16_t>();
auto *a_ptr_base = A->data<float>(); auto *a_ptr_base = A->data<float>();
FP16Gemv(b_ptr_base, FP16Gemv(context, b_ptr_base, a_ptr_base,
a_ptr_base, width, K, c_ptr_base);
width,
K,
c_ptr_base);
return MaceStatus::MACE_SUCCESS; return MaceStatus::MACE_SUCCESS;
} else { } else {
LOG(INFO) << "Matmul fp16 gemv args: " << height << " " << width << " " LOG(INFO) << "Matmul fp16 gemv args: " << height << " " << width << " "
......
...@@ -5,7 +5,6 @@ load( ...@@ -5,7 +5,6 @@ load(
"if_android", "if_android",
"if_hexagon_enabled", "if_hexagon_enabled",
"if_opencl_enabled", "if_opencl_enabled",
"if_openmp_enabled",
) )
licenses(["notice"]) # Apache 2.0 licenses(["notice"]) # Apache 2.0
......
...@@ -150,7 +150,7 @@ DEFINE_int32(restart_round, 1, "restart round"); ...@@ -150,7 +150,7 @@ DEFINE_int32(restart_round, 1, "restart round");
DEFINE_int32(malloc_check_cycle, -1, "malloc debug check cycle, -1 to disable"); DEFINE_int32(malloc_check_cycle, -1, "malloc debug check cycle, -1 to disable");
DEFINE_int32(gpu_perf_hint, 3, "0:DEFAULT/1:LOW/2:NORMAL/3:HIGH"); DEFINE_int32(gpu_perf_hint, 3, "0:DEFAULT/1:LOW/2:NORMAL/3:HIGH");
DEFINE_int32(gpu_priority_hint, 3, "0:DEFAULT/1:LOW/2:NORMAL/3:HIGH"); DEFINE_int32(gpu_priority_hint, 3, "0:DEFAULT/1:LOW/2:NORMAL/3:HIGH");
DEFINE_int32(omp_num_threads, -1, "num of openmp threads"); DEFINE_int32(num_threads, -1, "num of threads");
DEFINE_int32(cpu_affinity_policy, 1, DEFINE_int32(cpu_affinity_policy, 1,
"0:AFFINITY_NONE/1:AFFINITY_BIG_ONLY/2:AFFINITY_LITTLE_ONLY"); "0:AFFINITY_NONE/1:AFFINITY_BIG_ONLY/2:AFFINITY_LITTLE_ONLY");
DEFINE_bool(benchmark, false, "enable benchmark op"); DEFINE_bool(benchmark, false, "enable benchmark op");
...@@ -170,10 +170,10 @@ bool RunModel(const std::string &model_name, ...@@ -170,10 +170,10 @@ bool RunModel(const std::string &model_name,
MaceStatus status; MaceStatus status;
MaceEngineConfig config(device_type); MaceEngineConfig config(device_type);
status = config.SetCPUThreadPolicy( status = config.SetCPUThreadPolicy(
FLAGS_omp_num_threads, FLAGS_num_threads,
static_cast<CPUAffinityPolicy >(FLAGS_cpu_affinity_policy)); static_cast<CPUAffinityPolicy >(FLAGS_cpu_affinity_policy));
if (status != MaceStatus::MACE_SUCCESS) { if (status != MaceStatus::MACE_SUCCESS) {
LOG(WARNING) << "Set openmp or cpu affinity failed."; LOG(WARNING) << "Set cpu affinity failed.";
} }
#if defined(MACE_ENABLE_OPENCL) || defined(MACE_ENABLE_HTA) #if defined(MACE_ENABLE_OPENCL) || defined(MACE_ENABLE_HTA)
std::shared_ptr<GPUContext> gpu_context; std::shared_ptr<GPUContext> gpu_context;
...@@ -544,7 +544,7 @@ int Main(int argc, char **argv) { ...@@ -544,7 +544,7 @@ int Main(int argc, char **argv) {
LOG(INFO) << "restart_round: " << FLAGS_restart_round; LOG(INFO) << "restart_round: " << FLAGS_restart_round;
LOG(INFO) << "gpu_perf_hint: " << FLAGS_gpu_perf_hint; LOG(INFO) << "gpu_perf_hint: " << FLAGS_gpu_perf_hint;
LOG(INFO) << "gpu_priority_hint: " << FLAGS_gpu_priority_hint; LOG(INFO) << "gpu_priority_hint: " << FLAGS_gpu_priority_hint;
LOG(INFO) << "omp_num_threads: " << FLAGS_omp_num_threads; LOG(INFO) << "num_threads: " << FLAGS_num_threads;
LOG(INFO) << "cpu_affinity_policy: " << FLAGS_cpu_affinity_policy; LOG(INFO) << "cpu_affinity_policy: " << FLAGS_cpu_affinity_policy;
auto limit_opencl_kernel_time = getenv("MACE_LIMIT_OPENCL_KERNEL_TIME"); auto limit_opencl_kernel_time = getenv("MACE_LIMIT_OPENCL_KERNEL_TIME");
if (limit_opencl_kernel_time) { if (limit_opencl_kernel_time) {
......
...@@ -12,7 +12,6 @@ load( ...@@ -12,7 +12,6 @@ load(
"if_android", "if_android",
"if_android_armv7", "if_android_armv7",
"if_neon_enabled", "if_neon_enabled",
"if_openmp_enabled",
) )
cc_library( cc_library(
...@@ -41,9 +40,7 @@ cc_library( ...@@ -41,9 +40,7 @@ cc_library(
"-Werror", "-Werror",
"-Wextra", "-Wextra",
"-Wno-missing-field-initializers", "-Wno-missing-field-initializers",
] + if_openmp_enabled([ ] + if_neon_enabled([
"-fopenmp",
]) + if_neon_enabled([
"-DMACE_ENABLE_NEON", "-DMACE_ENABLE_NEON",
]) + if_android_armv7([ ]) + if_android_armv7([
"-mfpu=neon", "-mfpu=neon",
......
...@@ -11,19 +11,18 @@ load( ...@@ -11,19 +11,18 @@ load(
"if_hexagon_enabled", "if_hexagon_enabled",
"if_neon_enabled", "if_neon_enabled",
"if_opencl_enabled", "if_opencl_enabled",
"if_openmp_enabled",
"if_quantize_enabled", "if_quantize_enabled",
) )
cc_library( cc_library(
name = "benchmark_utils", name = "benchmark_utils",
testonly = 1, testonly = 1,
hdrs = glob([
"mace/benchmark_utils/*.h",
]),
srcs = glob([ srcs = glob([
"mace/benchmark_utils/*.cc", "mace/benchmark_utils/*.cc",
]), ]),
hdrs = glob([
"mace/benchmark_utils/*.h",
]),
copts = [ copts = [
"-Werror", "-Werror",
"-Wextra", "-Wextra",
...@@ -31,9 +30,9 @@ cc_library( ...@@ -31,9 +30,9 @@ cc_library(
], ],
strip_include_prefix = "", strip_include_prefix = "",
deps = [ deps = [
"//external:gflags_nothreads",
"//mace/core", "//mace/core",
"//test/ccutils", "//test/ccutils",
"//external:gflags_nothreads",
], ],
) )
......
...@@ -20,7 +20,7 @@ ...@@ -20,7 +20,7 @@
#include "mace/ops/ops_test_util.h" #include "mace/ops/ops_test_util.h"
DEFINE_string(filter, "all", "op benchmark regex filter, eg:.*CONV.*"); DEFINE_string(filter, "all", "op benchmark regex filter, eg:.*CONV.*");
DEFINE_int32(omp_num_threads, -1, "num of openmp threads"); DEFINE_int32(num_threads, -1, "num of threads");
DEFINE_int32(cpu_affinity_policy, 1, DEFINE_int32(cpu_affinity_policy, 1,
"0:AFFINITY_NONE/1:AFFINITY_BIG_ONLY/2:AFFINITY_LITTLE_ONLY"); "0:AFFINITY_NONE/1:AFFINITY_BIG_ONLY/2:AFFINITY_LITTLE_ONLY");
...@@ -32,7 +32,7 @@ int main(int argc, char **argv) { ...@@ -32,7 +32,7 @@ int main(int argc, char **argv) {
// config runtime // config runtime
mace::ops::test::OpTestContext::Get( mace::ops::test::OpTestContext::Get(
FLAGS_omp_num_threads, FLAGS_num_threads,
static_cast<mace::CPUAffinityPolicy>(FLAGS_cpu_affinity_policy)); static_cast<mace::CPUAffinityPolicy>(FLAGS_cpu_affinity_policy));
mace::testing::Benchmark::Run(FLAGS_filter.c_str()); mace::testing::Benchmark::Run(FLAGS_filter.c_str());
......
...@@ -12,7 +12,6 @@ load( ...@@ -12,7 +12,6 @@ load(
"if_hta_enabled", "if_hta_enabled",
"if_neon_enabled", "if_neon_enabled",
"if_opencl_enabled", "if_opencl_enabled",
"if_openmp_enabled",
"if_quantize_enabled", "if_quantize_enabled",
) )
...@@ -49,9 +48,7 @@ cc_test( ...@@ -49,9 +48,7 @@ cc_test(
"-Werror", "-Werror",
"-Wextra", "-Wextra",
"-Wno-missing-field-initializers", "-Wno-missing-field-initializers",
] + if_openmp_enabled([ ] + if_neon_enabled([
"-fopenmp",
]) + if_neon_enabled([
"-DMACE_ENABLE_NEON", "-DMACE_ENABLE_NEON",
]) + if_android_armv7([ ]) + if_android_armv7([
"-mfpu=neon-fp16", "-mfpu=neon-fp16",
...@@ -65,9 +62,6 @@ cc_test( ...@@ -65,9 +62,6 @@ cc_test(
]) + if_hta_enabled([ ]) + if_hta_enabled([
"-DMACE_ENABLE_HTA", "-DMACE_ENABLE_HTA",
]), ]),
linkopts = if_openmp_enabled([
"-fopenmp",
]),
linkstatic = 1, linkstatic = 1,
deps = [ deps = [
"//mace/ops", "//mace/ops",
......
...@@ -12,7 +12,6 @@ load( ...@@ -12,7 +12,6 @@ load(
"if_hta_enabled", "if_hta_enabled",
"if_neon_enabled", "if_neon_enabled",
"if_opencl_enabled", "if_opencl_enabled",
"if_openmp_enabled",
"if_quantize_enabled", "if_quantize_enabled",
) )
...@@ -36,9 +35,7 @@ cc_test( ...@@ -36,9 +35,7 @@ cc_test(
"-Werror", "-Werror",
"-Wextra", "-Wextra",
"-Wno-missing-field-initializers", "-Wno-missing-field-initializers",
] + if_openmp_enabled([ ] + if_neon_enabled([
"-fopenmp",
]) + if_neon_enabled([
"-DMACE_ENABLE_NEON", "-DMACE_ENABLE_NEON",
]) + if_android_armv7([ ]) + if_android_armv7([
"-mfpu=neon", "-mfpu=neon",
...@@ -53,9 +50,6 @@ cc_test( ...@@ -53,9 +50,6 @@ cc_test(
]) + if_hta_enabled([ ]) + if_hta_enabled([
"-DMACE_ENABLE_HTA", "-DMACE_ENABLE_HTA",
]), ]),
linkopts = if_openmp_enabled([
"-fopenmp",
]),
linkstatic = 1, linkstatic = 1,
deps = [ deps = [
":mace_api_test_header", ":mace_api_test_header",
...@@ -73,9 +67,7 @@ cc_test( ...@@ -73,9 +67,7 @@ cc_test(
"-Werror", "-Werror",
"-Wextra", "-Wextra",
"-Wno-missing-field-initializers", "-Wno-missing-field-initializers",
] + if_openmp_enabled([ ] + if_neon_enabled([
"-fopenmp",
]) + if_neon_enabled([
"-DMACE_ENABLE_NEON", "-DMACE_ENABLE_NEON",
]) + if_android_armv7([ ]) + if_android_armv7([
"-mfpu=neon", "-mfpu=neon",
...@@ -90,9 +82,6 @@ cc_test( ...@@ -90,9 +82,6 @@ cc_test(
]) + if_hta_enabled([ ]) + if_hta_enabled([
"-DMACE_ENABLE_HTA", "-DMACE_ENABLE_HTA",
]), ]),
linkopts = if_openmp_enabled([
"-fopenmp",
]),
linkstatic = 1, linkstatic = 1,
deps = [ deps = [
":mace_api_test_header", ":mace_api_test_header",
...@@ -110,9 +99,7 @@ cc_test( ...@@ -110,9 +99,7 @@ cc_test(
"-Werror", "-Werror",
"-Wextra", "-Wextra",
"-Wno-missing-field-initializers", "-Wno-missing-field-initializers",
] + if_openmp_enabled([ ] + if_neon_enabled([
"-fopenmp",
]) + if_neon_enabled([
"-DMACE_ENABLE_NEON", "-DMACE_ENABLE_NEON",
]) + if_android_armv7([ ]) + if_android_armv7([
"-mfpu=neon", "-mfpu=neon",
...@@ -127,9 +114,6 @@ cc_test( ...@@ -127,9 +114,6 @@ cc_test(
]) + if_hta_enabled([ ]) + if_hta_enabled([
"-DMACE_ENABLE_HTA", "-DMACE_ENABLE_HTA",
]), ]),
linkopts = if_openmp_enabled([
"-fopenmp",
]),
linkstatic = 1, linkstatic = 1,
deps = [ deps = [
"//mace/libmace", "//mace/libmace",
...@@ -146,9 +130,7 @@ cc_test( ...@@ -146,9 +130,7 @@ cc_test(
"-Werror", "-Werror",
"-Wextra", "-Wextra",
"-Wno-missing-field-initializers", "-Wno-missing-field-initializers",
] + if_openmp_enabled([ ] + if_neon_enabled([
"-fopenmp",
]) + if_neon_enabled([
"-DMACE_ENABLE_NEON", "-DMACE_ENABLE_NEON",
]) + if_android_armv7([ ]) + if_android_armv7([
"-mfpu=neon", "-mfpu=neon",
...@@ -163,9 +145,6 @@ cc_test( ...@@ -163,9 +145,6 @@ cc_test(
]) + if_hta_enabled([ ]) + if_hta_enabled([
"-DMACE_ENABLE_HTA", "-DMACE_ENABLE_HTA",
]), ]),
linkopts = if_openmp_enabled([
"-fopenmp",
]),
linkstatic = 1, linkstatic = 1,
deps = [ deps = [
"//mace/libmace", "//mace/libmace",
......
...@@ -11,7 +11,6 @@ load( ...@@ -11,7 +11,6 @@ load(
"if_hexagon_enabled", "if_hexagon_enabled",
"if_neon_enabled", "if_neon_enabled",
"if_opencl_enabled", "if_opencl_enabled",
"if_openmp_enabled",
"if_quantize_enabled", "if_quantize_enabled",
) )
...@@ -29,7 +28,7 @@ cc_library( ...@@ -29,7 +28,7 @@ cc_library(
copts = [ copts = [
"-Werror", "-Werror",
"-Wextra", "-Wextra",
] + if_openmp_enabled(["-fopenmp"]) + if_neon_enabled([ ] + if_neon_enabled([
"-DMACE_ENABLE_NEON", "-DMACE_ENABLE_NEON",
]) + if_android_armv7([ ]) + if_android_armv7([
"-mfpu=neon-fp16", "-mfpu=neon-fp16",
......
...@@ -117,7 +117,7 @@ DataFormatStrs = [ ...@@ -117,7 +117,7 @@ DataFormatStrs = [
class DefaultValues(object): class DefaultValues(object):
mace_lib_type = MACELibType.static mace_lib_type = MACELibType.static
omp_num_threads = -1, num_threads = -1,
cpu_affinity_policy = 1, cpu_affinity_policy = 1,
gpu_perf_hint = 3, gpu_perf_hint = 3,
gpu_priority_hint = 3, gpu_priority_hint = 3,
...@@ -887,7 +887,7 @@ def convert_func(flags): ...@@ -887,7 +887,7 @@ def convert_func(flags):
################################ ################################
# run # run
################################ ################################
def build_mace_run(configs, target_abi, toolchain, enable_openmp, def build_mace_run(configs, target_abi, toolchain,
address_sanitizer, mace_lib_type, debug_mode): address_sanitizer, mace_lib_type, debug_mode):
library_name = configs[YAMLKeyword.library_name] library_name = configs[YAMLKeyword.library_name]
...@@ -913,7 +913,6 @@ def build_mace_run(configs, target_abi, toolchain, enable_openmp, ...@@ -913,7 +913,6 @@ def build_mace_run(configs, target_abi, toolchain, enable_openmp,
enable_hexagon=hexagon_enabled(configs), enable_hexagon=hexagon_enabled(configs),
enable_hta=hta_enabled(configs), enable_hta=hta_enabled(configs),
enable_apu=apu_enabled(configs), enable_apu=apu_enabled(configs),
enable_openmp=enable_openmp,
enable_opencl=opencl_enabled(configs), enable_opencl=opencl_enabled(configs),
enable_quantize=quantize_enabled(configs), enable_quantize=quantize_enabled(configs),
enable_bfloat16=bfloat16_enabled(configs), enable_bfloat16=bfloat16_enabled(configs),
...@@ -961,7 +960,6 @@ def run_mace(flags): ...@@ -961,7 +960,6 @@ def run_mace(flags):
build_mace_run(configs, build_mace_run(configs,
target_abi, target_abi,
toolchain, toolchain,
flags.enable_openmp,
flags.address_sanitizer, flags.address_sanitizer,
flags.mace_lib_type, flags.mace_lib_type,
flags.debug_mode) flags.debug_mode)
...@@ -1079,14 +1077,10 @@ def parse_args(): ...@@ -1079,14 +1077,10 @@ def parse_args():
default=DefaultValues.mace_lib_type, default=DefaultValues.mace_lib_type,
help="[static | dynamic], Which type MACE library to use.") help="[static | dynamic], Which type MACE library to use.")
run.add_argument( run.add_argument(
"--enable_openmp", "--num_threads",
action="store_true",
help="Enable openmp for multiple thread.")
run.add_argument(
"--omp_num_threads",
type=int, type=int,
default=DefaultValues.omp_num_threads, default=DefaultValues.num_threads,
help="num of openmp threads") help="num of threads")
run.add_argument( run.add_argument(
"--cpu_affinity_policy", "--cpu_affinity_policy",
type=int, type=int,
......
...@@ -173,7 +173,7 @@ class DeviceWrapper: ...@@ -173,7 +173,7 @@ class DeviceWrapper:
opencl_binary_file, opencl_binary_file,
opencl_parameter_file, opencl_parameter_file,
libmace_dynamic_library_path, libmace_dynamic_library_path,
omp_num_threads=-1, num_threads=-1,
cpu_affinity_policy=1, cpu_affinity_policy=1,
gpu_perf_hint=3, gpu_perf_hint=3,
gpu_priority_hint=3, gpu_priority_hint=3,
...@@ -189,11 +189,11 @@ class DeviceWrapper: ...@@ -189,11 +189,11 @@ class DeviceWrapper:
benchmark=False, benchmark=False,
): ):
six.print_("* Run '%s' with round=%s, restart_round=%s, tuning=%s, " six.print_("* Run '%s' with round=%s, restart_round=%s, tuning=%s, "
"out_of_range_check=%s, omp_num_threads=%s, " "out_of_range_check=%s, num_threads=%s, "
"cpu_affinity_policy=%s, gpu_perf_hint=%s, " "cpu_affinity_policy=%s, gpu_perf_hint=%s, "
"gpu_priority_hint=%s" % "gpu_priority_hint=%s" %
(model_tag, running_round, restart_round, str(tuning), (model_tag, running_round, restart_round, str(tuning),
str(out_of_range_check), omp_num_threads, str(out_of_range_check), num_threads,
cpu_affinity_policy, gpu_perf_hint, gpu_priority_hint)) cpu_affinity_policy, gpu_perf_hint, gpu_priority_hint))
mace_model_path = "" mace_model_path = ""
if model_graph_format == ModelFormat.file: if model_graph_format == ModelFormat.file:
...@@ -236,7 +236,7 @@ class DeviceWrapper: ...@@ -236,7 +236,7 @@ class DeviceWrapper:
"--device=%s" % device_type, "--device=%s" % device_type,
"--round=%s" % running_round, "--round=%s" % running_round,
"--restart_round=%s" % restart_round, "--restart_round=%s" % restart_round,
"--omp_num_threads=%s" % omp_num_threads, "--num_threads=%s" % num_threads,
"--cpu_affinity_policy=%s" % cpu_affinity_policy, "--cpu_affinity_policy=%s" % cpu_affinity_policy,
"--gpu_perf_hint=%s" % gpu_perf_hint, "--gpu_perf_hint=%s" % gpu_perf_hint,
"--gpu_priority_hint=%s" % gpu_priority_hint, "--gpu_priority_hint=%s" % gpu_priority_hint,
...@@ -336,7 +336,7 @@ class DeviceWrapper: ...@@ -336,7 +336,7 @@ class DeviceWrapper:
"--device=%s" % device_type, "--device=%s" % device_type,
"--round=%s" % running_round, "--round=%s" % running_round,
"--restart_round=%s" % restart_round, "--restart_round=%s" % restart_round,
"--omp_num_threads=%s" % omp_num_threads, "--num_threads=%s" % num_threads,
"--cpu_affinity_policy=%s" % cpu_affinity_policy, "--cpu_affinity_policy=%s" % cpu_affinity_policy,
"--gpu_perf_hint=%s" % gpu_perf_hint, "--gpu_perf_hint=%s" % gpu_perf_hint,
"--gpu_priority_hint=%s" % gpu_priority_hint, "--gpu_priority_hint=%s" % gpu_priority_hint,
...@@ -541,7 +541,7 @@ class DeviceWrapper: ...@@ -541,7 +541,7 @@ class DeviceWrapper:
out_of_range_check=flags.gpu_out_of_range_check, out_of_range_check=flags.gpu_out_of_range_check,
model_graph_format=configs[ model_graph_format=configs[
YAMLKeyword.model_graph_format], YAMLKeyword.model_graph_format],
omp_num_threads=flags.omp_num_threads, num_threads=flags.num_threads,
cpu_affinity_policy=flags.cpu_affinity_policy, cpu_affinity_policy=flags.cpu_affinity_policy,
gpu_perf_hint=flags.gpu_perf_hint, gpu_perf_hint=flags.gpu_perf_hint,
gpu_priority_hint=flags.gpu_priority_hint, gpu_priority_hint=flags.gpu_priority_hint,
......
...@@ -31,7 +31,7 @@ class ScratchComputer: ...@@ -31,7 +31,7 @@ class ScratchComputer:
MaceOp.Conv2D: self.scratch_size_no_need, MaceOp.Conv2D: self.scratch_size_no_need,
MaceOp.Squeeze: self.scratch_size_of_squeeze, MaceOp.Squeeze: self.scratch_size_of_squeeze,
MaceOp.Softmax: self.scratch_size_no_need, MaceOp.Softmax: self.scratch_size_no_need,
MaceOp.Eltwise: self.scratch_size_no_need, MaceOp.Eltwise: self.scratch_size_eltwise,
MaceOp.Activation: self.scratch_size_no_need, MaceOp.Activation: self.scratch_size_no_need,
MaceOp.StridedSlice: self.scratch_size_no_need, MaceOp.StridedSlice: self.scratch_size_no_need,
MaceOp.Reduce: self.scratch_size_no_need, MaceOp.Reduce: self.scratch_size_no_need,
...@@ -126,4 +126,8 @@ class ScratchComputer: ...@@ -126,4 +126,8 @@ class ScratchComputer:
def scratch_size_of_squeeze(self, op_def): def scratch_size_of_squeeze(self, op_def):
input0_dims = self.get_op_input_dims(op_def, 0) input0_dims = self.get_op_input_dims(op_def, 0)
return len(input0_dims) * self.get_data_bytes(mace_pb2.DT_FLOAT) return len(input0_dims) * self.get_data_bytes(mace_pb2.DT_INT32) * 2
def scratch_size_eltwise(self, op_def):
input0_dims = self.get_op_input_dims(op_def, 0)
return len(input0_dims) * self.get_data_bytes(mace_pb2.DT_INT32) * 3
...@@ -19,17 +19,19 @@ import six ...@@ -19,17 +19,19 @@ import six
from py_proto import mace_pb2 from py_proto import mace_pb2
from transform import base_converter from transform import base_converter
from transform.base_converter import PoolingType
from transform.base_converter import PaddingMode
from transform.base_converter import ActivationType from transform.base_converter import ActivationType
from transform.base_converter import ConverterUtil
from transform.base_converter import DataFormat
from transform.base_converter import EltwiseType from transform.base_converter import EltwiseType
from transform.base_converter import ReduceType
from transform.base_converter import FrameworkType from transform.base_converter import FrameworkType
from transform.base_converter import RoundMode
from transform.base_converter import DataFormat
from transform.base_converter import MaceOp from transform.base_converter import MaceOp
from transform.base_converter import MaceKeyword from transform.base_converter import MaceKeyword
from transform.base_converter import ConverterUtil from transform.base_converter import PoolingType
from transform.base_converter import PaddingMode
from transform.base_converter import PadType
from transform.base_converter import ReduceType
from transform.base_converter import RoundMode
from utils.util import mace_check from utils.util import mace_check
import numpy as np import numpy as np
...@@ -1225,11 +1227,11 @@ class OnnxConverter(base_converter.ConverterInterface): ...@@ -1225,11 +1227,11 @@ class OnnxConverter(base_converter.ConverterInterface):
padding_type_arg = op.arg.add() padding_type_arg = op.arg.add()
padding_type_arg.name = MaceKeyword.mace_padding_type_str padding_type_arg.name = MaceKeyword.mace_padding_type_str
if mode == 'reflect': if mode == 'reflect':
padding_type_arg.i = PadType.REFLECT padding_type_arg.i = PadType.REFLECT.value
elif mode == 'edge': elif mode == 'edge':
padding_type_arg.i = PadType.SYMMETRIC padding_type_arg.i = PadType.SYMMETRIC.value
else: else:
padding_type_arg.i = PadType.CONSTANT padding_type_arg.i = PadType.CONSTANT.value
if 'pads' in node.attrs: if 'pads' in node.attrs:
paddings_arg = op.arg.add() paddings_arg = op.arg.add()
paddings_arg.name = MaceKeyword.mace_paddings_str paddings_arg.name = MaceKeyword.mace_paddings_str
......
...@@ -266,7 +266,6 @@ def bazel_build(target, ...@@ -266,7 +266,6 @@ def bazel_build(target,
enable_hexagon=False, enable_hexagon=False,
enable_hta=False, enable_hta=False,
enable_apu=False, enable_apu=False,
enable_openmp=False,
enable_neon=True, enable_neon=True,
enable_opencl=True, enable_opencl=True,
enable_quantize=True, enable_quantize=True,
...@@ -284,8 +283,6 @@ def bazel_build(target, ...@@ -284,8 +283,6 @@ def bazel_build(target,
"--config", "--config",
toolchain, toolchain,
"--define", "--define",
"openmp=%s" % str(enable_openmp).lower(),
"--define",
"quantize=%s" % str(enable_quantize).lower(), "quantize=%s" % str(enable_quantize).lower(),
"--define", "--define",
"bfloat16=%s" % str(enable_bfloat16).lower(), "bfloat16=%s" % str(enable_bfloat16).lower(),
...@@ -301,8 +298,6 @@ def bazel_build(target, ...@@ -301,8 +298,6 @@ def bazel_build(target,
"--define", "--define",
"neon=%s" % str(enable_neon).lower(), "neon=%s" % str(enable_neon).lower(),
"--define", "--define",
"openmp=%s" % str(enable_openmp).lower(),
"--define",
"opencl=%s" % str(enable_opencl).lower(), "opencl=%s" % str(enable_opencl).lower(),
"--define", "--define",
"quantize=%s" % str(enable_quantize).lower(), "quantize=%s" % str(enable_quantize).lower(),
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册