diff --git a/dnn/src/arm_common/elemwise/unary/algo.cpp b/dnn/src/arm_common/elemwise/unary/algo.cpp index 4b59b7e65239b5125bcb7ac6696fbeb4b2fdfb53..2c947eb0ebaf90faac54d60fb3388d1abc632d2e 100644 --- a/dnn/src/arm_common/elemwise/unary/algo.cpp +++ b/dnn/src/arm_common/elemwise/unary/algo.cpp @@ -71,12 +71,20 @@ void ElemwiseImpl::AlgoUnary::exec(const KernParam& kern_param) const { thin_function \ run = OpCallerUnary<_op<_type, _type>, \ BcastType::VEC>::run; \ - MEGDNN_DISPATCH_CPU_KERN( \ + auto kernel = [nr_elems, nr_elems_per_thread, src0, dst_tensor, \ + run](size_t task_id, size_t) { \ + size_t offset = task_id * nr_elems_per_thread; \ + if (offset >= nr_elems) return; /* surplus task: no elements left, avoid size_t wrap-around below */ \ + size_t nr_elems_thread = \ + std::min(nr_elems - offset, nr_elems_per_thread); \ + run(static_cast(src0.raw_ptr) + offset, \ + static_cast<_type*>(dst_tensor.raw_ptr) + offset, \ + src0.layout.dtype, dst_tensor.layout.dtype, \ + nr_elems_thread); \ + }; \ + MEGDNN_DISPATCH_MULTI_THREAD_CPU_KERN( \ static_cast(kern_param.handle), \ - run(static_cast(src0.raw_ptr), \ - static_cast<_type*>(dst_tensor.raw_ptr), \ - src0.layout.dtype, dst_tensor.layout.dtype, \ - nr_elems)); \ + nr_threads, kernel); \ } \ MIDOUT_END(); \ return @@ -86,7 +94,12 @@ void ElemwiseImpl::AlgoUnary::exec(const KernParam& kern_param) const { auto& src0 = elparam[0]; auto& dst_tensor = *(kern_param.m_dst); + size_t nr_threads = static_cast(kern_param.handle) + ->megcore_dispatcher() + ->nr_threads(); + size_t nr_elems = src0.layout.total_nr_elems(); + size_t nr_elems_per_thread = (nr_elems + nr_threads - 1) / nr_threads; #define DISPATCH_MODE_FLOAT(_case, _type, _type_midout_id) \ switch (kern_param.mode) { \ diff --git a/dnn/test/arm_common/elemwise.cpp b/dnn/test/arm_common/elemwise.cpp index efedb76f581ff5730391e05bd4ab5874a7ef6326..6f31d389f92951d18c1f9105901fd780acb76ad7 100644 --- a/dnn/test/arm_common/elemwise.cpp +++ b/dnn/test/arm_common/elemwise.cpp @@ -26,6 +26,13 @@ TYPED_TEST(ARM_ELEMWISE, run) { elemwise::run_test(this->handle()); }
+template +class ARM_ELEMWISE_MULTI_THREADS : public ARM_COMMON_MULTI_THREADS {}; +TYPED_TEST_CASE(ARM_ELEMWISE_MULTI_THREADS, elemwise::test_types); +TYPED_TEST(ARM_ELEMWISE_MULTI_THREADS, run) { + elemwise::run_test(this->handle()); +} + TEST_F(ARM_COMMON, ELEMWISE_FORWARD_TERNARY) { using Mode = ElemwiseForward::Param::Mode; Checker checker(handle()); diff --git a/scripts/cmake-build/cross_build_android_arm_inference.sh b/scripts/cmake-build/cross_build_android_arm_inference.sh index 1e360046b239360fad6f5441692200696f80f428..1f5236e91e25b5604a396bae1bd2e4d30470ef51 100755 --- a/scripts/cmake-build/cross_build_android_arm_inference.sh +++ b/scripts/cmake-build/cross_build_android_arm_inference.sh @@ -2,7 +2,7 @@ set -e ARCHS=("arm64-v8a" "armeabi-v7a") -BUILD_TYPE=RelWithDebInfo +BUILD_TYPE=Release MGE_ARMV8_2_FEATURE_FP16=OFF MGE_DISABLE_FLOAT16=OFF ARCH=arm64-v8a