From 538d3de9d2edaad55252611c3be3a10a536a3069 Mon Sep 17 00:00:00 2001 From: Megvii Engine Team Date: Mon, 30 Mar 2020 22:13:40 +0800 Subject: [PATCH] feat(dnn/aarch64): add arm64 nchw44 mk matmul GitOrigin-RevId: 698a11c3fd578c0424c3623e278fcf023166839d --- dnn/src/common/hash_ct.h | 147 +++++++++++++++++ dnn/src/common/utils.h | 5 + dnn/src/fallback/matrix_mul/gemm_common.h | 191 ++++++++++++---------- dnn/src/fallback/matrix_mul/gemm_impl.h | 4 +- 4 files changed, 257 insertions(+), 90 deletions(-) create mode 100644 dnn/src/common/hash_ct.h diff --git a/dnn/src/common/hash_ct.h b/dnn/src/common/hash_ct.h new file mode 100644 index 000000000..14f5909de --- /dev/null +++ b/dnn/src/common/hash_ct.h @@ -0,0 +1,147 @@ +/** + * Copyright (c) 2015 Daniel Kirchner + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ * + * --------------------------------------------------------------------------- + * \file dnn/src/common/hash_ct.h + * + * \brief compile time hash for strings + * + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + * + * This file has been modified by Megvii ("Megvii Modifications"). + * All Megvii Modifications are Copyright (C) 2014-2019 Megvii Inc. All rights + * reserved. + * + * --------------------------------------------------------------------------- + * + */ + +#pragma once +#include +namespace megdnn { +/*! + * \brief compile-time XX64 hash implementation + * + * see https://github.com/ekpyron/xxhashct/blob/master/xxh64.hpp + */ +class XXHash64CT { +public: + static constexpr uint64_t hash(const char* p, uint64_t len, uint64_t seed) { + return finalize( + (len >= 32 ? 
h32bytes(p, len, seed) : seed + PRIME5) + len, + p + (len & ~0x1F), len & 0x1F); + } + +private: + static constexpr uint64_t PRIME1 = 11400714785074694791ULL; + static constexpr uint64_t PRIME2 = 14029467366897019727ULL; + static constexpr uint64_t PRIME3 = 1609587929392839161ULL; + static constexpr uint64_t PRIME4 = 9650029242287828579ULL; + static constexpr uint64_t PRIME5 = 2870177450012600261ULL; + + static constexpr uint64_t rotl(uint64_t x, int r) { + return ((x << r) | (x >> (64 - r))); + } + static constexpr uint64_t mix1(const uint64_t h, const uint64_t prime, + int rshift) { + return (h ^ (h >> rshift)) * prime; + } + static constexpr uint64_t mix2(const uint64_t p, const uint64_t v = 0) { + return rotl(v + p * PRIME2, 31) * PRIME1; + } + static constexpr uint64_t mix3(const uint64_t h, const uint64_t v) { + return (h ^ mix2(v)) * PRIME1 + PRIME4; + } +#ifdef XXH64_BIG_ENDIAN + static constexpr uint32_t endian32(const char* v) { + return uint32_t(uint8_t(v[3])) | (uint32_t(uint8_t(v[2])) << 8) | + (uint32_t(uint8_t(v[1])) << 16) | + (uint32_t(uint8_t(v[0])) << 24); + } + static constexpr uint64_t endian64(const char* v) { + return uint64_t(uint8_t(v[7])) | (uint64_t(uint8_t(v[6])) << 8) | + (uint64_t(uint8_t(v[5])) << 16) | + (uint64_t(uint8_t(v[4])) << 24) | + (uint64_t(uint8_t(v[3])) << 32) | + (uint64_t(uint8_t(v[2])) << 40) | + (uint64_t(uint8_t(v[1])) << 48) | + (uint64_t(uint8_t(v[0])) << 56); + } +#else + static constexpr uint32_t endian32(const char* v) { + return uint32_t(uint8_t(v[0])) | (uint32_t(uint8_t(v[1])) << 8) | + (uint32_t(uint8_t(v[2])) << 16) | + (uint32_t(uint8_t(v[3])) << 24); + } + static constexpr uint64_t endian64(const char* v) { + return uint64_t(uint8_t(v[0])) | (uint64_t(uint8_t(v[1])) << 8) | + (uint64_t(uint8_t(v[2])) << 16) | + (uint64_t(uint8_t(v[3])) << 24) | + (uint64_t(uint8_t(v[4])) << 32) | + (uint64_t(uint8_t(v[5])) << 40) | + (uint64_t(uint8_t(v[6])) << 48) | + (uint64_t(uint8_t(v[7])) << 56); + } +#endif + static 
constexpr uint64_t fetch64(const char* p, const uint64_t v = 0) { + return mix2(endian64(p), v); + } + static constexpr uint64_t fetch32(const char* p) { + return uint64_t(endian32(p)) * PRIME1; + } + static constexpr uint64_t fetch8(const char* p) { + return uint8_t(*p) * PRIME5; + } + // clang-format off + static constexpr uint64_t finalize (const uint64_t h, const char *p, + uint64_t len) { + return (len >= 8) ? (finalize (rotl (h ^ fetch64 (p), 27) + * PRIME1 + PRIME4, p + 8, len - 8)) : + ((len >= 4) ? (finalize (rotl (h ^ fetch32 (p), 23) + * PRIME2 + PRIME3, p + 4, len - 4)) : + ((len > 0) ? (finalize (rotl (h ^ fetch8 (p), 11) + * PRIME1, p + 1, len - 1)) : + (mix1 (mix1 (mix1 (h, PRIME2, 33), PRIME3, 29), 1, 32)))); + } + static constexpr uint64_t h32bytes (const char *p, uint64_t len, + const uint64_t v1,const uint64_t v2, + const uint64_t v3, const uint64_t v4) { + return (len >= 32) ? h32bytes (p + 32, len - 32, fetch64 (p, v1), + fetch64 (p + 8, v2), fetch64 (p + 16, v3), + fetch64 (p + 24, v4)) : + mix3 (mix3 (mix3 (mix3 (rotl (v1, 1) + rotl (v2, 7) + rotl (v3, 12) + + rotl (v4, 18), v1), v2), v3), v4); + } + static constexpr uint64_t h32bytes (const char *p, uint64_t len, const uint64_t seed) { + return h32bytes (p, len, seed + PRIME1 + PRIME2, seed + PRIME2, seed, seed - PRIME1); + } + // clang-format on +}; +} // namespace megdnn + +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/dnn/src/common/utils.h b/dnn/src/common/utils.h index 500fc98ac..d688166c2 100644 --- a/dnn/src/common/utils.h +++ b/dnn/src/common/utils.h @@ -17,6 +17,7 @@ #include "megdnn/handle.h" #include "megdnn/thin/small_vector.h" +#include "src/common/hash_ct.h" #include "src/common/utils.cuh" #include @@ -228,6 +229,10 @@ MEGDNN_CONSTEXPR std::size_t operator"" _z(unsigned long long n) { return n; } +constexpr uint32_t operator"" _hash(char const* str, size_t count) { + return XXHash64CT::hash(str, count, 20160701); +} + template std::string 
vec2str(Vec&& vec) { std::string res; diff --git a/dnn/src/fallback/matrix_mul/gemm_common.h b/dnn/src/fallback/matrix_mul/gemm_common.h index dc9ea680f..678aae007 100644 --- a/dnn/src/fallback/matrix_mul/gemm_common.h +++ b/dnn/src/fallback/matrix_mul/gemm_common.h @@ -362,96 +362,111 @@ void gemm_kern(const Tin* packA, const Tin* packB, size_t M, size_t N, size_t K, InnerBlockSize get_inner_block_size() const override; \ size_t get_packA_type_size() const override; -#define MEGDNN_REG_GEMM_FUNC_FOR_IM2COL_IMPL_PACKA( \ - _algo_name, _midout_name, _mid_index, _strategy, _i_type, _c_type, \ - _packa_type) \ - \ - MatrixMulImpl::kern_naked_t MatrixMulImpl::_algo_name::get_kern_naked( \ - const KernSizeParam&) const { \ - auto kern = [](const MatrixMulImpl::KernParam& kern_param, \ - const void* packed_a, const void* packed_b) { \ - MIDOUT_BEGIN(_midout_name, midout_iv(_mid_index)) { \ - auto M = kern_param.M, N = kern_param.N, K = kern_param.K; \ - auto trA = kern_param.trA, trB = kern_param.trB; \ - auto LDC = kern_param.LDC; \ - auto A_type = kern_param.A_type, B_type = kern_param.B_type, \ - C_type = kern_param.C_type; \ - auto Cptr = kern_param.C<_c_type>(); \ - \ - _strategy strategy(M, N, K, A_type, B_type, C_type); \ - megdnn::matmul::GemmInterleaved<_strategy>(M, N, K, trA, trB, \ - strategy) \ - .execute_naked(Cptr, LDC, packed_a, packed_b); \ - } \ - MIDOUT_END(); \ - }; \ - return kern; \ - } \ - \ - void MatrixMulImpl::_algo_name::pack_A(const KernParam& kern_param, \ - void* out, size_t index, \ - size_t stride) const { \ - auto M = kern_param.M, N = kern_param.N, K = kern_param.K; \ - auto A_type = kern_param.A_type, B_type = kern_param.B_type, \ - C_type = kern_param.C_type; \ - \ - auto trA = kern_param.trA, trB = kern_param.trB; \ - auto LDA = kern_param.LDA; \ - const auto Aptr = kern_param.A<_i_type>(); \ - _strategy strategy(M, N, K, A_type, B_type, C_type); \ - size_t start_index = index * stride; \ - size_t end_index = start_index + stride; \ - 
end_index = std::min(end_index, M); \ - megdnn::matmul::GemmInterleaved<_strategy>(M, N, K, trA, trB, \ - strategy) \ - .pack_A(reinterpret_cast<_packa_type*>(out), Aptr, LDA, \ - start_index, end_index); \ - } \ - \ - void MatrixMulImpl::_algo_name::pack_B(const KernParam& kern_param, \ - void* out, const size_t x0, \ - size_t xmax) const { \ - auto M = kern_param.M, N = kern_param.N, K = kern_param.K; \ - auto A_type = kern_param.A_type, B_type = kern_param.B_type, \ - C_type = kern_param.C_type; \ - \ - auto trA = kern_param.trA, trB = kern_param.trB; \ - auto LDB = kern_param.LDB; \ - const auto Bptr = kern_param.B<_i_type>(); \ - _strategy strategy(M, N, K, A_type, B_type, C_type); \ - megdnn::matmul::GemmInterleaved<_strategy>(M, N, K, trA, trB, \ - strategy) \ - .pack_B(reinterpret_cast<_i_type*>(out), Bptr, LDB, x0, xmax); \ - } \ - \ - WorkspaceBundle MatrixMulImpl::_algo_name::get_bundle( \ - const KernSizeParam& kern_size_param) const { \ - auto M = kern_size_param.M, N = kern_size_param.N, \ - K = kern_size_param.K; \ - auto trA = kern_size_param.trA, trB = kern_size_param.trB; \ - auto A_type = kern_size_param.A_type, B_type = kern_size_param.B_type, \ - C_type = kern_size_param.C_type; \ - _strategy strategy(M, N, K, A_type, B_type, C_type); \ - return megdnn::matmul::GemmInterleaved<_strategy>(M, N, K, trA, trB, \ - strategy) \ - .get_bundle(); \ - } \ - \ - MatrixMulImpl::_algo_name::InnerBlockSize \ - MatrixMulImpl::_algo_name::get_inner_block_size() const { \ - return {_strategy::KERNEL_H, _strategy::KERNEL_W, \ - _strategy::UNROLL_K}; \ - } \ - \ - size_t MatrixMulImpl::_algo_name::get_packA_type_size() const { \ - return sizeof(_packa_type); \ +#define MEGDNN_REG_GEMM_FUNC_FOR_IM2COL_IMPL_DETAIL( \ + _algo_name, _midout_name, _mid_index, _strategy, _i_type, _c_type, \ + _packa_type) \ + \ + MatrixMulImpl::kern_naked_t MatrixMulImpl::_algo_name::get_kern_naked( \ + const KernSizeParam&) const { \ + auto kern = [](const MatrixMulImpl::KernParam& 
kern_param, \ + const void* packed_a, const void* packed_b) { \ + MIDOUT_BEGIN(_midout_name, midout_iv(_mid_index), \ + midout_iv("get_kern_naked"_hash)) { \ + auto M = kern_param.M, N = kern_param.N, K = kern_param.K; \ + auto trA = kern_param.trA, trB = kern_param.trB; \ + auto LDC = kern_param.LDC; \ + auto A_type = kern_param.A_type, B_type = kern_param.B_type, \ + C_type = kern_param.C_type; \ + auto Cptr = kern_param.C<_c_type>(); \ + \ + _strategy strategy(M, N, K, A_type, B_type, C_type); \ + megdnn::matmul::GemmInterleaved<_strategy>(M, N, K, trA, trB, \ + strategy) \ + .execute_naked(Cptr, LDC, packed_a, packed_b); \ + } \ + MIDOUT_END(); \ + }; \ + return kern; \ + } \ + \ + void MatrixMulImpl::_algo_name::pack_A(const KernParam& kern_param, \ + void* out, size_t index, \ + size_t stride) const { \ + MIDOUT_BEGIN(_midout_name, midout_iv(_mid_index), \ + midout_iv("pack_A"_hash)) { \ + auto M = kern_param.M, N = kern_param.N, K = kern_param.K; \ + auto A_type = kern_param.A_type, B_type = kern_param.B_type, \ + C_type = kern_param.C_type; \ + \ + auto trA = kern_param.trA, trB = kern_param.trB; \ + auto LDA = kern_param.LDA; \ + const auto Aptr = kern_param.A<_i_type>(); \ + _strategy strategy(M, N, K, A_type, B_type, C_type); \ + size_t start_index = index * stride; \ + size_t end_index = start_index + stride; \ + end_index = std::min(end_index, M); \ + megdnn::matmul::GemmInterleaved<_strategy>(M, N, K, trA, trB, \ + strategy) \ + .pack_A(reinterpret_cast<_packa_type*>(out), Aptr, LDA, \ + start_index, end_index); \ + } \ + MIDOUT_END(); \ + } \ + \ + void MatrixMulImpl::_algo_name::pack_B(const KernParam& kern_param, \ + void* out, const size_t x0, \ + size_t xmax) const { \ + MIDOUT_BEGIN(_midout_name, midout_iv(_mid_index), \ + midout_iv("pack_B"_hash)) { \ + auto M = kern_param.M, N = kern_param.N, K = kern_param.K; \ + auto A_type = kern_param.A_type, B_type = kern_param.B_type, \ + C_type = kern_param.C_type; \ + \ + auto trA = kern_param.trA, trB 
= kern_param.trB; \ + auto LDB = kern_param.LDB; \ + const auto Bptr = kern_param.B<_i_type>(); \ + _strategy strategy(M, N, K, A_type, B_type, C_type); \ + megdnn::matmul::GemmInterleaved<_strategy>(M, N, K, trA, trB, \ + strategy) \ + .pack_B(reinterpret_cast<_i_type*>(out), Bptr, LDB, x0, \ + xmax); \ + } \ + MIDOUT_END(); \ + } \ + \ + WorkspaceBundle MatrixMulImpl::_algo_name::get_bundle( \ + const KernSizeParam& kern_size_param) const { \ + MIDOUT_BEGIN(_midout_name, midout_iv(_mid_index), \ + midout_iv("get_bundle"_hash)) { \ + auto M = kern_size_param.M, N = kern_size_param.N, \ + K = kern_size_param.K; \ + auto trA = kern_size_param.trA, trB = kern_size_param.trB; \ + auto A_type = kern_size_param.A_type, \ + B_type = kern_size_param.B_type, \ + C_type = kern_size_param.C_type; \ + _strategy strategy(M, N, K, A_type, B_type, C_type); \ + return megdnn::matmul::GemmInterleaved<_strategy>(M, N, K, trA, \ + trB, strategy) \ + .get_bundle(); \ + } \ + MIDOUT_END(); \ + } \ + \ + MatrixMulImpl::_algo_name::InnerBlockSize \ + MatrixMulImpl::_algo_name::get_inner_block_size() const { \ + return {_strategy::KERNEL_H, _strategy::KERNEL_W, \ + _strategy::UNROLL_K}; \ + } \ + \ + size_t MatrixMulImpl::_algo_name::get_packA_type_size() const { \ + return sizeof(_packa_type); \ } -#define MEGDNN_REG_GEMM_FUNC_FOR_IM2COL_IMPL( \ - _algo_name, _midout_name, _mid_index, _strategy, _i_type, _c_type) \ - MEGDNN_REG_GEMM_FUNC_FOR_IM2COL_IMPL_PACKA(_algo_name, _midout_name, \ - _mid_index, _strategy, _i_type, \ - _c_type, _i_type) +#define MEGDNN_REG_GEMM_FUNC_FOR_IM2COL_IMPL( \ + _algo_name, _midout_name, _mid_index, _strategy, _i_type, _c_type) \ + MEGDNN_REG_GEMM_FUNC_FOR_IM2COL_IMPL_DETAIL(_algo_name, _midout_name, \ + _mid_index, _strategy, \ + _i_type, _c_type, _i_type) } // namespace matmul } // namespace megdnn diff --git a/dnn/src/fallback/matrix_mul/gemm_impl.h b/dnn/src/fallback/matrix_mul/gemm_impl.h index 1ab5b1eaa..b25621e0c 100644 --- 
a/dnn/src/fallback/matrix_mul/gemm_impl.h +++ b/dnn/src/fallback/matrix_mul/gemm_impl.h @@ -70,9 +70,9 @@ class GemmInterleaved { public: size_t get_workspace_size() const { - return get_a_workspace_size() + get_b_workspace_size() + - get_c_workspace_size(); + return get_bundle().total_size_in_bytes(); } + WorkspaceBundle get_bundle() const { return {nullptr, {get_a_workspace_size(), get_b_workspace_size(), -- GitLab