diff --git a/docs/user_guide/advanced_usage.rst b/docs/user_guide/advanced_usage.rst index 665daa71645f7de3b278b1f59d2c3d95aa7aaaaa..4e34309eba18b9d5d0400f9f5ac1c3213f485961 100644 --- a/docs/user_guide/advanced_usage.rst +++ b/docs/user_guide/advanced_usage.rst @@ -364,12 +364,12 @@ Tuning for specific SoC's GPU .. note:: - You should plug in device(s) with the specific SoC(s). + You must specify the ``target_socs`` in your YAML file and plug in device(s) with the specific SoC(s). .. code-block:: sh - python tools/converter.py run --config=/path/to/model_deployment_file.yml --validate + python tools/converter.py run --config=/path/to/model_deployment_file.yml The command will generate two files in `build/${library_name}/opencl`, like the following dir-tree. @@ -487,7 +487,7 @@ the detailed information is in :doc:`benchmark`. - default - commands - explanation - * - --omp_num_threads + * - --num_threads - int - -1 - ``run`` diff --git a/docs/user_guide/basic_usage_cmake.rst b/docs/user_guide/basic_usage_cmake.rst index 79a9a12089216c980e6ae4a049806b1220523e64..131e375f52346cd1b97d6a00f47c4825bfa8622f 100644 --- a/docs/user_guide/basic_usage_cmake.rst +++ b/docs/user_guide/basic_usage_cmake.rst @@ -180,4 +180,4 @@ Please refer to \ ``mace/tools/mace_run.cc``\ for full usage. The following list // 5. Run the model MaceStatus status = engine.Run(inputs, &outputs); -More details are in :doc:`advanced_usage`. +More details are in :doc:`advanced_usage_cmake`. diff --git a/examples/android/build.sh b/examples/android/build.sh index e941ad32f32db4c9ee4c3b882ac9d7e08afef68f..3ce6f3b0d683c6e82ae36e957a2d45945c28e9d6 100755 --- a/examples/android/build.sh +++ b/examples/android/build.sh @@ -54,7 +54,7 @@ cp -rf include/mace $INCLUDE_DIR cp -rf build/mobilenet/include/mace/public/*.h $INCLUDE_DIR/mace/public/ cp -rf build/mobilenet/model $LIBRARY_DIR -bazel build --config android --config optimization $BAZEL_LIBMACE_TARGET --define neon=true --define openmp=true --define opencl=true --define quantize=true --cpu=$TARGET_ABI +bazel build --config android --config optimization $BAZEL_LIBMACE_TARGET --define neon=true --define opencl=true --define quantize=true --cpu=$TARGET_ABI cp -rf $BAZEL_GEN_LIBMACE_PATH $LIBMACE_DIR if [ $MACE_LINK_TYPE == "dynamic" ]; then diff --git a/examples/android/macelibrary/build.gradle b/examples/android/macelibrary/build.gradle index 891ad5b7584c16408011fb03758c1ed8d2dafa88..1bc37c96558918c5247e23d8345cc5295d955abe 100644 --- a/examples/android/macelibrary/build.gradle +++ b/examples/android/macelibrary/build.gradle @@ -16,7 +16,7 @@ android { externalNativeBuild { cmake { - cppFlags "-std=c++11 -fopenmp" + cppFlags "-std=c++11" abiFilters "arm64-v8a" } } diff --git a/examples/android/macelibrary/src/main/cpp/image_classify.cc b/examples/android/macelibrary/src/main/cpp/image_classify.cc index 760030709f5ab900cd12b9196afb3d3fdeaba4a3..0babf53e198bc60d5b9c01e7b2e090cb48306bda 100755 --- a/examples/android/macelibrary/src/main/cpp/image_classify.cc +++ b/examples/android/macelibrary/src/main/cpp/image_classify.cc @@ -95,7 +95,7 @@ Java_com_xiaomi_mace_JniMaceUtils_maceMobilenetCreateGPUContext( JNIEXPORT jint JNICALL Java_com_xiaomi_mace_JniMaceUtils_maceMobilenetCreateEngine( - JNIEnv *env, jclass thisObj, jint omp_num_threads, jint cpu_affinity_policy, + JNIEnv *env, jclass thisObj, jint num_threads, jint cpu_affinity_policy, jint gpu_perf_hint, jint gpu_priority_hint, jstring model_name_str, jstring device) { MaceContext &mace_context = GetMaceContext(); @@ -110,14 +110,13 @@ Java_com_xiaomi_mace_JniMaceUtils_maceMobilenetCreateEngine( mace::MaceStatus status; mace::MaceEngineConfig config(mace_context.device_type); status = config.SetCPUThreadPolicy( - omp_num_threads, + num_threads, static_cast(cpu_affinity_policy)); if (status != mace::MaceStatus::MACE_SUCCESS) { __android_log_print(ANDROID_LOG_ERROR, "image_classify attrs", - "openmp result: %s, threads: %d, cpu: %d", - status.information().c_str(), omp_num_threads, - cpu_affinity_policy); + "threads: %d, cpu: %d", + num_threads, cpu_affinity_policy); } if (mace_context.device_type == mace::DeviceType::GPU) { config.SetGPUContext(mace_context.gpu_context); diff --git a/include/mace/public/mace.h b/include/mace/public/mace.h index 7018fc53437c7139b227f5e21ed4f8b2f0875d21..ca723c41b82e40a70f4ef198129477550b9af679 100644 --- a/include/mace/public/mace.h +++ b/include/mace/public/mace.h @@ -316,8 +316,6 @@ class MACE_API MaceEngineConfig { /// (AFFINITY_NONE) cores according to the policy. The threads number will /// also be truncated to the corresponding cores number when num_threads_hint /// is larger than it. - /// The OpenMP threads will be bind to (via sched_setaffinity) big cores - /// (AFFINITY_BIG_ONLY) and little cores (AFFINITY_LITTLE_ONLY). /// /// \param num_threads_hint it is only a hint. /// \param policy one of CPUAffinityPolicy diff --git a/mace/BUILD.bazel b/mace/BUILD.bazel index 1a9d56eadfaaaf2629b0c450b305b0a05bad61ef..3dbccfa5db560e6a21d80263ac7c9938dfe29160 100644 --- a/mace/BUILD.bazel +++ b/mace/BUILD.bazel @@ -108,14 +108,6 @@ config_setting( visibility = ["//visibility:public"], ) -config_setting( - name = "openmp_enabled", - define_values = { - "openmp": "true", - }, - visibility = ["//visibility:public"], -) - config_setting( name = "opencl_enabled", define_values = { diff --git a/mace/core/BUILD.bazel b/mace/core/BUILD.bazel index 67d94f103b489257896bebac79aec570fbe314bb..f418e8143489a323747bf2a33f938bd0fe6df52d 100644 --- a/mace/core/BUILD.bazel +++ b/mace/core/BUILD.bazel @@ -17,7 +17,6 @@ load( "if_not_apu_enabled", "if_not_hexagon_enabled", "if_opencl_enabled", - "if_openmp_enabled", "if_quantize_enabled", "if_rpcmem_enabled", ) @@ -81,10 +80,7 @@ cc_library( "-Werror", "-Wextra", "-Wno-missing-field-initializers", - ] + if_openmp_enabled([ - "-fopenmp", - "-DMACE_ENABLE_OPENMP", - ]) + if_opencl_enabled([ + ] + if_opencl_enabled([ "-DMACE_ENABLE_OPENCL", ]) + if_quantize_enabled([ "-DMACE_ENABLE_QUANTIZE", diff --git a/mace/core/runtime/cpu/cpu_runtime.cc b/mace/core/runtime/cpu/cpu_runtime.cc index 6880dc47145bc761037b3f21be4f4791f4b26613..20e3be1b19acabb98599fc3afc78523218102d66 100644 --- a/mace/core/runtime/cpu/cpu_runtime.cc +++ b/mace/core/runtime/cpu/cpu_runtime.cc @@ -14,10 +14,6 @@ #include "mace/core/runtime/cpu/cpu_runtime.h" -#ifdef MACE_ENABLE_OPENMP -#include -#endif - #include #include #include @@ -35,62 +31,7 @@ namespace mace { -int MaceOpenMPThreadCount = 1; - -enum SchedulePolicy { - SCHED_STATIC, - SCHED_GUIDED, -}; - -namespace { - -MaceStatus SetOpenMPThreadsAndAffinityCPUs(int omp_num_threads, - const std::vector &cpu_ids, - SchedulePolicy schedule_policy) { - MaceOpenMPThreadCount = omp_num_threads; - SchedSetAffinity(cpu_ids); -#ifdef MACE_ENABLE_OPENMP - VLOG(1) << "Set OpenMP threads number: " << omp_num_threads - << ", CPU core IDs: " << MakeString(cpu_ids); - if (schedule_policy == SCHED_GUIDED) { - omp_set_schedule(omp_sched_guided, 1); - } else if (schedule_policy == SCHED_STATIC) { - omp_set_schedule(omp_sched_static, 0); - } else { - LOG(WARNING) << "Unknown schedule policy: " << schedule_policy; - } - - omp_set_num_threads(omp_num_threads); -#else - MACE_UNUSED(omp_num_threads); - MACE_UNUSED(schedule_policy); - VLOG(2) << "Set OpenMP threads number failed: OpenMP not enabled."; -#endif - -#ifdef MACE_ENABLE_OPENMP - std::vector status(omp_num_threads, - MaceStatus::MACE_INVALID_ARGS); -#pragma omp parallel for - for (int i = 0; i < omp_num_threads; ++i) { - VLOG(1) << "Set affinity for OpenMP thread " << omp_get_thread_num() - << "/" << omp_get_num_threads(); - status[i] = SchedSetAffinity(cpu_ids); - } - for (int i = 0; i < omp_num_threads; ++i) { - if (status[i] != MaceStatus::MACE_SUCCESS) - return MaceStatus::MACE_INVALID_ARGS; - } - return MaceStatus::MACE_SUCCESS; -#else - MaceStatus status = SchedSetAffinity(cpu_ids); - VLOG(1) << "Set affinity without OpenMP: " << MakeString(cpu_ids); - return status; -#endif -} - -} // namespace - -MaceStatus CPURuntime::SetOpenMPThreadsAndAffinityPolicy( +MaceStatus CPURuntime::SetThreadsHintAndAffinityPolicy( int num_threads_hint, CPUAffinityPolicy policy, void *gemm_context) { @@ -115,19 +56,8 @@ MaceStatus CPURuntime::SetOpenMPThreadsAndAffinityPolicy( #else MACE_UNUSED(gemm_context); #endif // MACE_ENABLE_QUANTIZE -#ifdef MACE_ENABLE_OPENMP - omp_set_num_threads(num_threads_hint); -#else - VLOG(2) << "Set OpenMP threads number failed: OpenMP not enabled."; -#endif - return MaceStatus::MACE_SUCCESS; - } - SchedulePolicy sched_policy = SCHED_GUIDED; - float first_freq = cpu_max_freqs[cores_to_use[0]]; - float last_freq = cpu_max_freqs[cores_to_use[cores_to_use.size() - 1]]; - if (std::abs(first_freq - last_freq) < 1e-6) { - sched_policy = SCHED_STATIC; + return MaceStatus::MACE_SUCCESS; } #ifdef MACE_ENABLE_QUANTIZE @@ -137,9 +67,10 @@ MaceStatus CPURuntime::SetOpenMPThreadsAndAffinityPolicy( } #endif // MACE_ENABLE_QUANTIZE - return SetOpenMPThreadsAndAffinityCPUs(num_threads_hint, - cores_to_use, - sched_policy); + MaceStatus status = SchedSetAffinity(cores_to_use); + VLOG(1) << "Set affinity : " << MakeString(cores_to_use); + + return status; } } // namespace mace diff --git a/mace/core/runtime/cpu/cpu_runtime.h b/mace/core/runtime/cpu/cpu_runtime.h index d97c90ce7cb1e06390582f3dc25e3bc94e2e0680..96a1773101de3644e058f6c8b27c79bf0a9c87d3 100644 --- a/mace/core/runtime/cpu/cpu_runtime.h +++ b/mace/core/runtime/cpu/cpu_runtime.h @@ -29,8 +29,6 @@ namespace mace { -extern int MaceOpenMPThreadCount; - class CPURuntime { public: CPURuntime(const int num_threads, @@ -43,9 +41,9 @@ class CPURuntime { #ifdef MACE_ENABLE_QUANTIZE MACE_CHECK_NOTNULL(GetGemmlowpContext()); #endif // MACE_ENABLE_QUANTIZE - SetOpenMPThreadsAndAffinityPolicy(num_threads_, - policy_, - gemm_context_); + SetThreadsHintAndAffinityPolicy(num_threads_, + policy_, + gemm_context_); } #ifdef MACE_ENABLE_QUANTIZE @@ -78,8 +76,8 @@ class CPURuntime { } private: - MaceStatus SetOpenMPThreadsAndAffinityPolicy( - int omp_num_threads_hint, + MaceStatus SetThreadsHintAndAffinityPolicy( + int num_threads_hint, CPUAffinityPolicy policy, void *gemm_context); diff --git a/mace/libmace/BUILD.bazel b/mace/libmace/BUILD.bazel index 2c02a4a790dde4d696a6638540daf621b39ebef1..051a57e7ad231e703650453d9bfead0a60c11c5b 100644 --- a/mace/libmace/BUILD.bazel +++ b/mace/libmace/BUILD.bazel @@ -20,7 +20,6 @@ load( "if_linux_base", "if_neon_enabled", "if_opencl_enabled", - "if_openmp_enabled", "if_quantize_enabled", "if_rpcmem_enabled", ) @@ -33,7 +32,7 @@ cc_library( copts = [ "-Werror", "-Wextra", - ] + if_openmp_enabled(["-fopenmp"]) + if_neon_enabled([ + ] + if_neon_enabled([ "-DMACE_ENABLE_NEON", ]) + if_android_armv7([ "-mfpu=neon", @@ -70,9 +69,7 @@ cc_binary( "-Wl,--version-script", "$(location //mace/libmace:mace_version_script.lds)", ], - ) + if_openmp_enabled([ - "-fopenmp", - ]), + ), linkshared = 1, linkstatic = 0, deps = [ diff --git a/mace/mace.bzl b/mace/mace.bzl index 330a305e7008f0e8b168af6452b20e1ec0aecbb4..895cc9ba8050aad0dedfe0282af4fea4fd7d1050 100644 --- a/mace/mace.bzl +++ b/mace/mace.bzl @@ -1,133 +1,127 @@ # -*- Python -*- def if_linux_base(a, default_value = []): - return select({ - "//mace:linux_base": a, - "//conditions:default": default_value, - }) + return select({ + "//mace:linux_base": a, + "//conditions:default": default_value, + }) def if_android(a, default_value = []): - return select({ - "//mace:android": a, - "//conditions:default": default_value, - }) + return select({ + "//mace:android": a, + "//conditions:default": default_value, + }) def if_linux(a, default_value = []): - return select({ - "//mace:linux": a, - "//conditions:default": default_value, - }) + return select({ + "//mace:linux": a, + "//conditions:default": default_value, + }) def if_darwin(a, default_value = []): - return select({ - "//mace:darwin": a, - "//conditions:default": default_value, - }) + return select({ + "//mace:darwin": a, + "//conditions:default": default_value, + }) def if_android_armv7(a): - return select({ - "//mace:android_armv7": a, - "//conditions:default": [], - }) + return select({ + "//mace:android_armv7": a, + "//conditions:default": [], + }) def if_android_arm64(a): - return select({ - "//mace:android_arm64": a, - "//conditions:default": [], - }) + return select({ + "//mace:android_arm64": a, + "//conditions:default": [], + }) def if_arm_linux_aarch64(a): - return select({ - "//mace:arm_linux_aarch64": a, - "//conditions:default": [], - }) + return select({ + "//mace:arm_linux_aarch64": a, + "//conditions:default": [], + }) def if_arm_linux_armhf(a): - return select({ - "//mace:arm_linux_armhf": a, - "//conditions:default": [] - }) + return select({ + "//mace:arm_linux_armhf": a, + "//conditions:default": [], + }) def if_neon_enabled(a, default_value = []): - return select({ - "//mace:neon_enabled": a, - "//conditions:default": default_value, - }) + return select({ + "//mace:neon_enabled": a, + "//conditions:default": default_value, + }) def if_hexagon_enabled(a): - return select({ - "//mace:hexagon_enabled": a, - "//conditions:default": [], - }) + return select({ + "//mace:hexagon_enabled": a, + "//conditions:default": [], + }) def if_not_hexagon_enabled(a): - return select({ - "//mace:hexagon_enabled": [], - "//conditions:default": a, - }) + return select({ + "//mace:hexagon_enabled": [], + "//conditions:default": a, + }) def if_hta_enabled(a): - return select({ - "//mace:hta_enabled": a, - "//conditions:default": [], - }) + return select({ + "//mace:hta_enabled": a, + "//conditions:default": [], + }) def if_hexagon_or_hta_enabled(a): - return select({ - "//mace:hexagon_enabled": a, - "//mace:hta_enabled": a, - "//conditions:default": [], - }) + return select({ + "//mace:hexagon_enabled": a, + "//mace:hta_enabled": a, + "//conditions:default": [], + }) def if_apu_enabled(a): - return select({ - "//mace:apu_enabled": a, - "//conditions:default": [], - }) + return select({ + "//mace:apu_enabled": a, + "//conditions:default": [], + }) def if_not_apu_enabled(a): - return select({ - "//mace:apu_enabled": [], - "//conditions:default": a, - }) - -def if_openmp_enabled(a): - return select({ - "//mace:openmp_enabled": a, - "//conditions:default": [], - }) + return select({ + "//mace:apu_enabled": [], + "//conditions:default": a, + }) def if_opencl_enabled(a, default_value = []): - return select({ - "//mace:opencl_enabled": a, - "//conditions:default": default_value, - }) + return select({ + "//mace:opencl_enabled": a, + "//conditions:default": default_value, + }) def if_quantize_enabled(a): - return select({ - "//mace:quantize_enabled": a, - "//conditions:default": [], - }) + return select({ + "//mace:quantize_enabled": a, + "//conditions:default": [], + }) def if_bfloat16_enabled(a): - return select({ - "//mace:bfloat16_enabled": a, - "//conditions:default": [], - }) + return select({ + "//mace:bfloat16_enabled": a, + "//conditions:default": [], + }) def if_rpcmem_enabled(a): - return select({ - "//mace:rpcmem_enabled": a, - "//conditions:default": [], - }) + return select({ + "//mace:rpcmem_enabled": a, + "//conditions:default": [], + }) def mace_version_genrule(): - native.genrule( - name = "mace_version_gen", - srcs = [str(Label("@local_version_config//:gen/version"))], - outs = ["version/version.cc"], - cmd = "cat $(SRCS) > $@;" - ) + native.genrule( + name = "mace_version_gen", + srcs = [str(Label("@local_version_config//:gen/version"))], + outs = ["version/version.cc"], + cmd = "cat $(SRCS) > $@;", + ) def encrypt_opencl_kernel_genrule(): srcs = [ diff --git a/mace/ops/BUILD.bazel b/mace/ops/BUILD.bazel index 32226af45b1d7798e9a8abc90f6c6381ad0e6b03..39d954f82ab6b3ef3ad78325b105f781eed21ac4 100644 --- a/mace/ops/BUILD.bazel +++ b/mace/ops/BUILD.bazel @@ -14,7 +14,6 @@ load( "if_hexagon_enabled", "if_neon_enabled", "if_opencl_enabled", - "if_openmp_enabled", "if_quantize_enabled", ) @@ -36,9 +35,7 @@ cc_library( "-Werror", "-Wextra", "-Wno-missing-field-initializers", - ] + if_openmp_enabled([ - "-fopenmp", - ]) + if_neon_enabled([ + ] + if_neon_enabled([ "-DMACE_ENABLE_NEON", ]) + if_android_armv7([ "-mfpu=neon-fp16", @@ -77,9 +74,7 @@ cc_library( "-Werror", "-Wextra", "-Wno-missing-field-initializers", - ] + if_openmp_enabled([ - "-fopenmp", - ]) + if_neon_enabled([ + ] + if_neon_enabled([ "-DMACE_ENABLE_NEON", ]) + if_android_armv7([ "-mfpu=neon-fp16", @@ -134,9 +129,7 @@ cc_library( "-Werror", "-Wextra", "-Wno-missing-field-initializers", - ] + if_openmp_enabled([ - "-fopenmp", - ]) + if_neon_enabled([ + ] + if_neon_enabled([ "-DMACE_ENABLE_NEON", ]) + if_android_armv7([ "-mfpu=neon-fp16", @@ -176,9 +169,7 @@ cc_library( "-Werror", "-Wextra", "-Wno-missing-field-initializers", - ] + if_openmp_enabled([ - "-fopenmp", - ]) + if_neon_enabled([ + ] + if_neon_enabled([ "-DMACE_ENABLE_NEON", ]) + if_android_armv7([ "-mfpu=neon-fp16", @@ -221,9 +212,7 @@ cc_library( "-Werror", "-Wextra", "-Wno-missing-field-initializers", - ] + if_openmp_enabled([ - "-fopenmp", - ]) + if_neon_enabled([ + ] + if_neon_enabled([ "-DMACE_ENABLE_NEON", ]) + if_android_armv7([ "-mfpu=neon-fp16", @@ -263,9 +252,7 @@ cc_library( "-Werror", "-Wextra", "-Wno-missing-field-initializers", - ] + if_openmp_enabled([ - "-fopenmp", - ]) + if_neon_enabled([ + ] + if_neon_enabled([ "-DMACE_ENABLE_NEON", ]) + if_android_armv7([ "-mfpu=neon-fp16", diff --git a/mace/ops/arm/fp16/gemv.h b/mace/ops/arm/fp16/gemv.h index aa5add8cf684a13c1a036c8784eda10298b163f0..e1bcf22c49565c493a5453c2cb2c538115a6ed92 100644 --- a/mace/ops/arm/fp16/gemv.h +++ b/mace/ops/arm/fp16/gemv.h @@ -21,8 +21,9 @@ #define MACE_ENABLE_FP16_NEON #endif - +#include "mace/core/ops/op_context.h" #include "mace/core/types.h" +#include "mace/utils/thread_pool.h" #if defined(MACE_ENABLE_NEON) && defined(__ANDROID__) #include @@ -36,9 +37,10 @@ namespace mace { namespace ops { template -void FP16Gemv(const INPUT_TYPE_LEFT *m_ptr, + typename INPUT_TYPE_RIGHT, + typename OUTPUT_TYPE> +void FP16Gemv(OpContext *context, + const INPUT_TYPE_LEFT *m_ptr, const INPUT_TYPE_RIGHT *v_ptr, const index_t height, const index_t width, @@ -46,78 +48,83 @@ void FP16Gemv(const INPUT_TYPE_LEFT *m_ptr, #if defined(MACE_ENABLE_FP16_NEON) && defined(__ANDROID__) template<> -void FP16Gemv(const float16_t *m_ptr, +void FP16Gemv(OpContext *context, + const float16_t *m_ptr, const float *v_ptr, const index_t height, const index_t width, float *out_ptr) { -#pragma omp parallel for - for (index_t h = 0; h < height; ++h) { - const float16_t *m_ptr0 = m_ptr + h * width; - const float *v_ptr0 = v_ptr; - float *out_ptr0 = out_ptr + h; - float sum0 = 0; - - float32x4_t vm0, vm1, vm2, vm3; - float32x4_t vv0, vv1, vv2, vv3; - float32x4_t vsum0 = vdupq_n_f32(0.f); - float32x4_t vsum1 = vdupq_n_f32(0.f); - float32x4_t vsum2 = vdupq_n_f32(0.f); - float32x4_t vsum3 = vdupq_n_f32(0.f); - - index_t w; - for (w = 0; w + 15 < width; w += 16) { - vm0 = vcvt_f32_f16(vld1_f16(m_ptr0)); - vv0 = vld1q_f32(v_ptr0); - vm1 = vcvt_f32_f16(vld1_f16(m_ptr0 + 4)); - vv1 = vld1q_f32(v_ptr0 + 4); - vm2 = vcvt_f32_f16(vld1_f16(m_ptr0 + 8)); - vv2 = vld1q_f32(v_ptr0 + 8); - vm3 = vcvt_f32_f16(vld1_f16(m_ptr0 + 12)); - vv3 = vld1q_f32(v_ptr0 + 12); - - vsum0 = vmlaq_f32(vsum0, vm0, vv0); - vsum1 = vmlaq_f32(vsum1, vm1, vv1); - vsum2 = vmlaq_f32(vsum2, vm2, vv2); - vsum3 = vmlaq_f32(vsum3, vm3, vv3); - - m_ptr0 += 16; - v_ptr0 += 16; - } - - for (; w + 7 < width; w += 8) { - vm0 = vcvt_f32_f16(vld1_f16(m_ptr0)); - vv0 = vld1q_f32(v_ptr0); - vm1 = vcvt_f32_f16(vld1_f16(m_ptr0 + 4)); - vv1 = vld1q_f32(v_ptr0 + 4); - - vsum0 = vmlaq_f32(vsum0, vm0, vv0); - vsum1 = vmlaq_f32(vsum1, vm1, vv1); - - m_ptr0 += 8; - v_ptr0 += 8; - } - - for (; w + 3 < width; w += 4) { - vm0 = vcvt_f32_f16(vld1_f16(m_ptr0)); - vv0 = vld1q_f32(v_ptr0); - vsum0 = vmlaq_f32(vsum0, vm0, vv0); - - m_ptr0 += 4; - v_ptr0 += 4; - } - vsum0 += vsum1; - vsum2 += vsum3; - vsum0 += vsum2; - sum0 = vaddvq_f32(vsum0); - - for (; w < width; ++w) { - sum0 += m_ptr0[0] * v_ptr0[0]; - m_ptr0++; - v_ptr0++; - } - *out_ptr0++ = sum0; - } + utils::ThreadPool &thread_pool = + context->device()->cpu_runtime()->thread_pool(); + + thread_pool.Compute1D([=](index_t start0, index_t end0, index_t step0) { + for (index_t h = start0; h < end0; h += step0) { + const float16_t *m_ptr0 = m_ptr + h * width; + const float *v_ptr0 = v_ptr; + float *out_ptr0 = out_ptr + h; + float sum0 = 0; + + float32x4_t vm0, vm1, vm2, vm3; + float32x4_t vv0, vv1, vv2, vv3; + float32x4_t vsum0 = vdupq_n_f32(0.f); + float32x4_t vsum1 = vdupq_n_f32(0.f); + float32x4_t vsum2 = vdupq_n_f32(0.f); + float32x4_t vsum3 = vdupq_n_f32(0.f); + + index_t w; + for (w = 0; w + 15 < width; w += 16) { + vm0 = vcvt_f32_f16(vld1_f16(m_ptr0)); + vv0 = vld1q_f32(v_ptr0); + vm1 = vcvt_f32_f16(vld1_f16(m_ptr0 + 4)); + vv1 = vld1q_f32(v_ptr0 + 4); + vm2 = vcvt_f32_f16(vld1_f16(m_ptr0 + 8)); + vv2 = vld1q_f32(v_ptr0 + 8); + vm3 = vcvt_f32_f16(vld1_f16(m_ptr0 + 12)); + vv3 = vld1q_f32(v_ptr0 + 12); + + vsum0 = vmlaq_f32(vsum0, vm0, vv0); + vsum1 = vmlaq_f32(vsum1, vm1, vv1); + vsum2 = vmlaq_f32(vsum2, vm2, vv2); + vsum3 = vmlaq_f32(vsum3, vm3, vv3); + + m_ptr0 += 16; + v_ptr0 += 16; + } + + for (; w + 7 < width; w += 8) { + vm0 = vcvt_f32_f16(vld1_f16(m_ptr0)); + vv0 = vld1q_f32(v_ptr0); + vm1 = vcvt_f32_f16(vld1_f16(m_ptr0 + 4)); + vv1 = vld1q_f32(v_ptr0 + 4); + + vsum0 = vmlaq_f32(vsum0, vm0, vv0); + vsum1 = vmlaq_f32(vsum1, vm1, vv1); + + m_ptr0 += 8; + v_ptr0 += 8; + } + + for (; w + 3 < width; w += 4) { + vm0 = vcvt_f32_f16(vld1_f16(m_ptr0)); + vv0 = vld1q_f32(v_ptr0); + vsum0 = vmlaq_f32(vsum0, vm0, vv0); + + m_ptr0 += 4; + v_ptr0 += 4; + } + vsum0 += vsum1; + vsum2 += vsum3; + vsum0 += vsum2; + sum0 = vaddvq_f32(vsum0); + + for (; w < width; ++w) { + sum0 += m_ptr0[0] * v_ptr0[0]; + m_ptr0++; + v_ptr0++; + } + *out_ptr0++ = sum0; + } + }, 0, height, 1); } #endif // MACE_ENABLE_FP16_NEON && __ANDROID__ diff --git a/mace/ops/depthwise_conv2d.cc b/mace/ops/depthwise_conv2d.cc index a6372f514e4d72ea10ad36c7f996585ee3104d9a..2d52af6a4060a7307b98b2a5ab1b7570ede29ee0 100644 --- a/mace/ops/depthwise_conv2d.cc +++ b/mace/ops/depthwise_conv2d.cc @@ -20,7 +20,7 @@ #ifdef MACE_ENABLE_QUANTIZE #include "mace/ops/arm/q8/quantization_util.h" // We reuse TensorFlow Lite's optimized depthwiseconv_uint8 and parallelized it -// using OpenMP for MACE's quantized depthwise_conv2d. +// using thread pool for MACE's quantized depthwise_conv2d. #include "tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_uint8.h" #endif // MACE_ENABLE_QUANTIZE diff --git a/mace/ops/matmul.cc b/mace/ops/matmul.cc index 9f989b41bdc78db440d05be2a41b1b7f5191f770..2ef13fe1b0f1d6fb65e4d252e3a4f588c486fca1 100644 --- a/mace/ops/matmul.cc +++ b/mace/ops/matmul.cc @@ -567,21 +567,15 @@ class MatMulOp : public MatMulOpBase { B->dtype() == DT_FLOAT) { auto *a_ptr_base = A->data(); auto *b_ptr_base = B->data(); - FP16Gemv(a_ptr_base, - b_ptr_base, - height, - K, - c_ptr_base); + FP16Gemv(context, a_ptr_base, b_ptr_base, + height, K, c_ptr_base); return MaceStatus::MACE_SUCCESS; } else if (height == 1 && transpose_b_ && A->dtype() == DT_FLOAT && B->dtype() == DT_FLOAT16) { auto *b_ptr_base = B->data(); auto *a_ptr_base = A->data(); - FP16Gemv(b_ptr_base, - a_ptr_base, - width, - K, - c_ptr_base); + FP16Gemv(context, b_ptr_base, a_ptr_base, + width, K, c_ptr_base); return MaceStatus::MACE_SUCCESS; } else { LOG(INFO) << "Matmul fp16 gemv args: " << height << " " << width << " " diff --git a/mace/tools/BUILD.bazel b/mace/tools/BUILD.bazel index de2b9a35884cb0019631bebc12fb698af9f2ff73..43201a290903a3597f1fad90c555ca38da68e358 100644 --- a/mace/tools/BUILD.bazel +++ b/mace/tools/BUILD.bazel @@ -5,7 +5,6 @@ load( "if_android", "if_hexagon_enabled", "if_opencl_enabled", - "if_openmp_enabled", ) licenses(["notice"]) # Apache 2.0 diff --git a/mace/tools/mace_run.cc b/mace/tools/mace_run.cc index 6d025026d728825c8c55dd44b60832840d8879c9..25b054111cdf8f88bab16bd11cdaaa56aa1e7002 100644 --- a/mace/tools/mace_run.cc +++ b/mace/tools/mace_run.cc @@ -150,7 +150,7 @@ DEFINE_int32(restart_round, 1, "restart round"); DEFINE_int32(malloc_check_cycle, -1, "malloc debug check cycle, -1 to disable"); DEFINE_int32(gpu_perf_hint, 3, "0:DEFAULT/1:LOW/2:NORMAL/3:HIGH"); DEFINE_int32(gpu_priority_hint, 3, "0:DEFAULT/1:LOW/2:NORMAL/3:HIGH"); -DEFINE_int32(omp_num_threads, -1, "num of openmp threads"); +DEFINE_int32(num_threads, -1, "num of threads"); DEFINE_int32(cpu_affinity_policy, 1, "0:AFFINITY_NONE/1:AFFINITY_BIG_ONLY/2:AFFINITY_LITTLE_ONLY"); DEFINE_bool(benchmark, false, "enable benchmark op"); @@ -170,10 +170,10 @@ bool RunModel(const std::string &model_name, MaceStatus status; MaceEngineConfig config(device_type); status = config.SetCPUThreadPolicy( - FLAGS_omp_num_threads, + FLAGS_num_threads, static_cast(FLAGS_cpu_affinity_policy)); if (status != MaceStatus::MACE_SUCCESS) { - LOG(WARNING) << "Set openmp or cpu affinity failed."; + LOG(WARNING) << "Set cpu affinity failed."; } #if defined(MACE_ENABLE_OPENCL) || defined(MACE_ENABLE_HTA) std::shared_ptr gpu_context; @@ -544,7 +544,7 @@ int Main(int argc, char **argv) { LOG(INFO) << "restart_round: " << FLAGS_restart_round; LOG(INFO) << "gpu_perf_hint: " << FLAGS_gpu_perf_hint; LOG(INFO) << "gpu_priority_hint: " << FLAGS_gpu_priority_hint; - LOG(INFO) << "omp_num_threads: " << FLAGS_omp_num_threads; + LOG(INFO) << "num_threads: " << FLAGS_num_threads; LOG(INFO) << "cpu_affinity_policy: " << FLAGS_cpu_affinity_policy; auto limit_opencl_kernel_time = getenv("MACE_LIMIT_OPENCL_KERNEL_TIME"); if (limit_opencl_kernel_time) { diff --git a/mace/utils/BUILD.bazel b/mace/utils/BUILD.bazel index e28fb04d6ab93da8d8f789c9dc39cbfbdf02d929..31396426a0dcf2696874c2d39781a0d8657c25a2 100644 --- a/mace/utils/BUILD.bazel +++ b/mace/utils/BUILD.bazel @@ -12,7 +12,6 @@ load( "if_android", "if_android_armv7", "if_neon_enabled", - "if_openmp_enabled", ) cc_library( @@ -41,9 +40,7 @@ cc_library( "-Werror", "-Wextra", "-Wno-missing-field-initializers", - ] + if_openmp_enabled([ - "-fopenmp", - ]) + if_neon_enabled([ + ] + if_neon_enabled([ "-DMACE_ENABLE_NEON", ]) + if_android_armv7([ "-mfpu=neon", diff --git a/test/ccbenchmark/BUILD.bazel b/test/ccbenchmark/BUILD.bazel index 4b8eefb6eeea5b333c382fba798a86c62b128fb9..c4581a8b0cbf88b13c0523499bd912ee3bbe865f 100644 --- a/test/ccbenchmark/BUILD.bazel +++ b/test/ccbenchmark/BUILD.bazel @@ -11,19 +11,18 @@ load( "if_hexagon_enabled", "if_neon_enabled", "if_opencl_enabled", - "if_openmp_enabled", "if_quantize_enabled", ) cc_library( name = "benchmark_utils", testonly = 1, - hdrs = glob([ - "mace/benchmark_utils/*.h", - ]), srcs = glob([ "mace/benchmark_utils/*.cc", ]), + hdrs = glob([ + "mace/benchmark_utils/*.h", + ]), copts = [ "-Werror", "-Wextra", @@ -31,9 +30,9 @@ cc_library( ], strip_include_prefix = "", deps = [ + "//external:gflags_nothreads", "//mace/core", "//test/ccutils", - "//external:gflags_nothreads", ], ) diff --git a/test/ccbenchmark/mace/benchmark_utils/test_benchmark_main.cc b/test/ccbenchmark/mace/benchmark_utils/test_benchmark_main.cc index 43059577d6ad363aba7432b5df600bcd81adcde3..55e484b1a27f5dfa9f58400e4ebb64bb6834e52d 100644 --- a/test/ccbenchmark/mace/benchmark_utils/test_benchmark_main.cc +++ b/test/ccbenchmark/mace/benchmark_utils/test_benchmark_main.cc @@ -20,7 +20,7 @@ #include "mace/ops/ops_test_util.h" DEFINE_string(filter, "all", "op benchmark regex filter, eg:.*CONV.*"); -DEFINE_int32(omp_num_threads, -1, "num of openmp threads"); +DEFINE_int32(num_threads, -1, "num of threads"); DEFINE_int32(cpu_affinity_policy, 1, "0:AFFINITY_NONE/1:AFFINITY_BIG_ONLY/2:AFFINITY_LITTLE_ONLY"); @@ -32,7 +32,7 @@ int main(int argc, char **argv) { // config runtime mace::ops::test::OpTestContext::Get( - FLAGS_omp_num_threads, + FLAGS_num_threads, static_cast(FLAGS_cpu_affinity_policy)); mace::testing::Benchmark::Run(FLAGS_filter.c_str()); diff --git a/test/ccunit/BUILD.bazel b/test/ccunit/BUILD.bazel index ce4d268cd3ba86b2badb605a4b21d1d10422afbc..75a6ce7853972fdc2f94ebb40f4c95041b9bf3ad 100644 --- a/test/ccunit/BUILD.bazel +++ b/test/ccunit/BUILD.bazel @@ -12,7 +12,6 @@ load( "if_hta_enabled", "if_neon_enabled", "if_opencl_enabled", - "if_openmp_enabled", "if_quantize_enabled", ) @@ -49,9 +48,7 @@ cc_test( "-Werror", "-Wextra", "-Wno-missing-field-initializers", - ] + if_openmp_enabled([ - "-fopenmp", - ]) + if_neon_enabled([ + ] + if_neon_enabled([ "-DMACE_ENABLE_NEON", ]) + if_android_armv7([ "-mfpu=neon-fp16", @@ -65,9 +62,6 @@ cc_test( ]) + if_hta_enabled([ "-DMACE_ENABLE_HTA", ]), - linkopts = if_openmp_enabled([ - "-fopenmp", - ]), linkstatic = 1, deps = [ "//mace/ops", diff --git a/test/ccunit/mace/libmace/BUILD.bazel b/test/ccunit/mace/libmace/BUILD.bazel index 09f3b6d8886d967d278e4f9a073be11ca68868e8..1897bca164b9334e0aa8961edeedeb541f3fc3b0 100644 --- a/test/ccunit/mace/libmace/BUILD.bazel +++ b/test/ccunit/mace/libmace/BUILD.bazel @@ -12,7 +12,6 @@ load( "if_hta_enabled", "if_neon_enabled", "if_opencl_enabled", - "if_openmp_enabled", "if_quantize_enabled", ) @@ -36,9 +35,7 @@ cc_test( "-Werror", "-Wextra", "-Wno-missing-field-initializers", - ] + if_openmp_enabled([ - "-fopenmp", - ]) + if_neon_enabled([ + ] + if_neon_enabled([ "-DMACE_ENABLE_NEON", ]) + if_android_armv7([ "-mfpu=neon", @@ -53,9 +50,6 @@ cc_test( ]) + if_hta_enabled([ "-DMACE_ENABLE_HTA", ]), - linkopts = if_openmp_enabled([ - "-fopenmp", - ]), linkstatic = 1, deps = [ ":mace_api_test_header", @@ -73,9 +67,7 @@ cc_test( "-Werror", "-Wextra", "-Wno-missing-field-initializers", - ] + if_openmp_enabled([ - "-fopenmp", - ]) + if_neon_enabled([ + ] + if_neon_enabled([ "-DMACE_ENABLE_NEON", ]) + if_android_armv7([ "-mfpu=neon", @@ -90,9 +82,6 @@ cc_test( ]) + if_hta_enabled([ "-DMACE_ENABLE_HTA", ]), - linkopts = if_openmp_enabled([ - "-fopenmp", - ]), linkstatic = 1, deps = [ ":mace_api_test_header", @@ -110,9 +99,7 @@ cc_test( "-Werror", "-Wextra", "-Wno-missing-field-initializers", - ] + if_openmp_enabled([ - "-fopenmp", - ]) + if_neon_enabled([ + ] + if_neon_enabled([ "-DMACE_ENABLE_NEON", ]) + if_android_armv7([ "-mfpu=neon", @@ -127,9 +114,6 @@ cc_test( ]) + if_hta_enabled([ "-DMACE_ENABLE_HTA", ]), - linkopts = if_openmp_enabled([ - "-fopenmp", - ]), linkstatic = 1, deps = [ "//mace/libmace", @@ -146,9 +130,7 @@ cc_test( "-Werror", "-Wextra", "-Wno-missing-field-initializers", - ] + if_openmp_enabled([ - "-fopenmp", - ]) + if_neon_enabled([ + ] + if_neon_enabled([ "-DMACE_ENABLE_NEON", ]) + if_android_armv7([ "-mfpu=neon", @@ -163,9 +145,6 @@ cc_test( ]) + if_hta_enabled([ "-DMACE_ENABLE_HTA", ]), - linkopts = if_openmp_enabled([ - "-fopenmp", - ]), linkstatic = 1, deps = [ "//mace/libmace", diff --git a/test/ccutils/BUILD.bazel b/test/ccutils/BUILD.bazel index bb31fff9ade347d3c248f22227cd99b53a3aa92f..5f8eec89f181b0e20bd1b9d484bb081266de5f5e 100644 --- a/test/ccutils/BUILD.bazel +++ b/test/ccutils/BUILD.bazel @@ -11,7 +11,6 @@ load( "if_hexagon_enabled", "if_neon_enabled", "if_opencl_enabled", - "if_openmp_enabled", "if_quantize_enabled", ) @@ -29,7 +28,7 @@ cc_library( copts = [ "-Werror", "-Wextra", - ] + if_openmp_enabled(["-fopenmp"]) + if_neon_enabled([ + ] + if_neon_enabled([ "-DMACE_ENABLE_NEON", ]) + if_android_armv7([ "-mfpu=neon-fp16", diff --git a/tools/converter.py b/tools/converter.py index 6f33885b9945135268db7ed7450b2a0e60ae39d1..dbf1f669c0dc2eebb83b0ebb6576461618ede8d8 100644 --- a/tools/converter.py +++ b/tools/converter.py @@ -117,7 +117,7 @@ DataFormatStrs = [ class DefaultValues(object): mace_lib_type = MACELibType.static - omp_num_threads = -1, + num_threads = -1, cpu_affinity_policy = 1, gpu_perf_hint = 3, gpu_priority_hint = 3, @@ -887,7 +887,7 @@ def convert_func(flags): ################################ # run ################################ -def build_mace_run(configs, target_abi, toolchain, enable_openmp, +def build_mace_run(configs, target_abi, toolchain, address_sanitizer, mace_lib_type, debug_mode): library_name = configs[YAMLKeyword.library_name] @@ -913,7 +913,6 @@ def build_mace_run(configs, target_abi, toolchain, enable_openmp, enable_hexagon=hexagon_enabled(configs), enable_hta=hta_enabled(configs), enable_apu=apu_enabled(configs), - enable_openmp=enable_openmp, enable_opencl=opencl_enabled(configs), enable_quantize=quantize_enabled(configs), enable_bfloat16=bfloat16_enabled(configs), @@ -961,7 +960,6 @@ def run_mace(flags): build_mace_run(configs, target_abi, toolchain, - flags.enable_openmp, flags.address_sanitizer, flags.mace_lib_type, flags.debug_mode) @@ -1079,14 +1077,10 @@ def parse_args(): default=DefaultValues.mace_lib_type, help="[static | dynamic], Which type MACE library to use.") run.add_argument( - "--enable_openmp", - action="store_true", - help="Enable openmp for multiple thread.") - run.add_argument( - "--omp_num_threads", + "--num_threads", type=int, - default=DefaultValues.omp_num_threads, - help="num of openmp threads") + default=DefaultValues.num_threads, + help="num of threads") run.add_argument( "--cpu_affinity_policy", type=int, diff --git a/tools/device.py b/tools/device.py index 9ec9edf4e91e6bcdb6a5a5168a7b572be1795f26..e8763e1d90842894e530fa9c4c3a03b635f4250e 100644 --- a/tools/device.py +++ b/tools/device.py @@ -173,7 +173,7 @@ class DeviceWrapper: opencl_binary_file, opencl_parameter_file, libmace_dynamic_library_path, - omp_num_threads=-1, + num_threads=-1, cpu_affinity_policy=1, gpu_perf_hint=3, gpu_priority_hint=3, @@ -189,11 +189,11 @@ class DeviceWrapper: benchmark=False, ): six.print_("* Run '%s' with round=%s, restart_round=%s, tuning=%s, " - "out_of_range_check=%s, omp_num_threads=%s, " + "out_of_range_check=%s, num_threads=%s, " "cpu_affinity_policy=%s, gpu_perf_hint=%s, " "gpu_priority_hint=%s" % (model_tag, running_round, restart_round, str(tuning), - str(out_of_range_check), omp_num_threads, + str(out_of_range_check), num_threads, cpu_affinity_policy, gpu_perf_hint, gpu_priority_hint)) mace_model_path = "" if model_graph_format == ModelFormat.file: @@ -236,7 +236,7 @@ class DeviceWrapper: "--device=%s" % device_type, "--round=%s" % running_round, "--restart_round=%s" % restart_round, - "--omp_num_threads=%s" % omp_num_threads, + "--num_threads=%s" % num_threads, "--cpu_affinity_policy=%s" % cpu_affinity_policy, "--gpu_perf_hint=%s" % gpu_perf_hint, "--gpu_priority_hint=%s" % gpu_priority_hint, @@ -336,7 +336,7 @@ class DeviceWrapper: "--device=%s" % device_type, "--round=%s" % running_round, "--restart_round=%s" % restart_round, - "--omp_num_threads=%s" % omp_num_threads, + "--num_threads=%s" % num_threads, "--cpu_affinity_policy=%s" % cpu_affinity_policy, "--gpu_perf_hint=%s" % gpu_perf_hint, "--gpu_priority_hint=%s" % gpu_priority_hint, @@ -541,7 +541,7 @@ class DeviceWrapper: out_of_range_check=flags.gpu_out_of_range_check, model_graph_format=configs[ YAMLKeyword.model_graph_format], - omp_num_threads=flags.omp_num_threads, + num_threads=flags.num_threads, cpu_affinity_policy=flags.cpu_affinity_policy, gpu_perf_hint=flags.gpu_perf_hint, gpu_priority_hint=flags.gpu_priority_hint, diff --git a/tools/python/micro/scratch_computer.py b/tools/python/micro/scratch_computer.py index 986527189df7b95b14f1225f2b47eb6d43582889..347eaba4ef6137f210e31786ff6ade54f8b22534 100644 --- a/tools/python/micro/scratch_computer.py +++ b/tools/python/micro/scratch_computer.py @@ -31,7 +31,7 @@ class ScratchComputer: MaceOp.Conv2D: self.scratch_size_no_need, MaceOp.Squeeze: self.scratch_size_of_squeeze, MaceOp.Softmax: self.scratch_size_no_need, - MaceOp.Eltwise: self.scratch_size_no_need, + MaceOp.Eltwise: self.scratch_size_eltwise, MaceOp.Activation: self.scratch_size_no_need, MaceOp.StridedSlice: self.scratch_size_no_need, MaceOp.Reduce: self.scratch_size_no_need, @@ -126,4 +126,8 @@ class ScratchComputer: def scratch_size_of_squeeze(self, op_def): input0_dims = self.get_op_input_dims(op_def, 0) - return len(input0_dims) * self.get_data_bytes(mace_pb2.DT_FLOAT) + return len(input0_dims) * self.get_data_bytes(mace_pb2.DT_INT32) * 2 + + def scratch_size_eltwise(self, op_def): + input0_dims = self.get_op_input_dims(op_def, 0) + return len(input0_dims) * self.get_data_bytes(mace_pb2.DT_INT32) * 3 diff --git a/tools/python/transform/onnx_converter.py b/tools/python/transform/onnx_converter.py index ef4bba0ec871e6edc377ea0cd6be914bf5cf3f20..00e0aca83be082a510667cb60159762d2b04dafe 100644 --- a/tools/python/transform/onnx_converter.py +++ b/tools/python/transform/onnx_converter.py @@ -19,17 +19,19 @@ import six from py_proto import mace_pb2 from transform import base_converter -from transform.base_converter import PoolingType -from transform.base_converter import PaddingMode from transform.base_converter import ActivationType +from transform.base_converter import ConverterUtil +from transform.base_converter import DataFormat from transform.base_converter import EltwiseType -from transform.base_converter import ReduceType from transform.base_converter import FrameworkType -from transform.base_converter import RoundMode -from transform.base_converter import DataFormat from transform.base_converter import MaceOp from transform.base_converter import MaceKeyword -from transform.base_converter import ConverterUtil +from transform.base_converter import PoolingType +from transform.base_converter import PaddingMode +from transform.base_converter import PadType +from transform.base_converter import ReduceType +from transform.base_converter import RoundMode + from utils.util import mace_check import numpy as np @@ -1225,11 +1227,11 @@ class OnnxConverter(base_converter.ConverterInterface): padding_type_arg = op.arg.add() padding_type_arg.name = MaceKeyword.mace_padding_type_str if mode == 'reflect': - padding_type_arg.i = PadType.REFLECT + padding_type_arg.i = PadType.REFLECT.value elif mode == 'edge': - padding_type_arg.i = PadType.SYMMETRIC + padding_type_arg.i = PadType.SYMMETRIC.value else: - padding_type_arg.i = PadType.CONSTANT + padding_type_arg.i = PadType.CONSTANT.value if 'pads' in node.attrs: paddings_arg = op.arg.add() paddings_arg.name = MaceKeyword.mace_paddings_str diff --git a/tools/sh_commands.py b/tools/sh_commands.py index 7642a881b5f0b751a7a1a66dd53c3fa44e50b3ad..f7d8c857c33e91a7ce3bb7844f53e67759197413 100644 --- a/tools/sh_commands.py +++ b/tools/sh_commands.py @@ -266,7 +266,6 @@ def bazel_build(target, enable_hexagon=False, enable_hta=False, enable_apu=False, - enable_openmp=False, enable_neon=True, enable_opencl=True, enable_quantize=True, @@ -284,8 +283,6 @@ def bazel_build(target, "--config", toolchain, "--define", - "openmp=%s" % str(enable_openmp).lower(), - "--define", "quantize=%s" % str(enable_quantize).lower(), "--define", "bfloat16=%s" % str(enable_bfloat16).lower(), @@ -301,8 +298,6 @@ def bazel_build(target, "--define", "neon=%s" % str(enable_neon).lower(), "--define", - "openmp=%s" % str(enable_openmp).lower(), - "--define", "opencl=%s" % str(enable_opencl).lower(), "--define", "quantize=%s" % str(enable_quantize).lower(),