提交 538d3de9 编写于 作者: M Megvii Engine Team

feat(dnn/aarch64): add arm64 nchw44 mk matmul

GitOrigin-RevId: 698a11c3fd578c0424c3623e278fcf023166839d
上级 51786484
/**
* Copyright (c) 2015 Daniel Kirchner
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*
* ---------------------------------------------------------------------------
* \file dnn/src/common/hash_ct.h
*
* \brief compile time hash for strings
*
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*
* This file has been modified by Megvii ("Megvii Modifications").
* All Megvii Modifications are Copyright (C) 2014-2019 Megvii Inc. All rights
* reserved.
*
* ---------------------------------------------------------------------------
*
*/
#pragma once
#include <cstdint>
namespace megdnn {
/*!
 * \brief compile-time XXH64 hash implementation
 *
 * Every member function consists of a single return expression, and all
 * iteration over the input is written as recursion, so hash() can be used
 * inside constant expressions.
 *
 * see https://github.com/ekpyron/xxhashct/blob/master/xxh64.hpp
 */
class XXHash64CT {
public:
    /*!
     * \brief hash \p len bytes starting at \p p
     * \param p pointer to the first input byte
     * \param len number of input bytes
     * \param seed value mixed into the initial state
     * \return the 64-bit hash value
     */
    static constexpr uint64_t hash(const char* p, uint64_t len, uint64_t seed) {
        // 32-byte stripes are consumed by bulk(); the remaining
        // (len mod 32) bytes are consumed by finalize()
        return finalize(bulk(p, len, seed) + len, p + (len & ~uint64_t(0x1F)),
                        len & 0x1F);
    }

private:
    static constexpr uint64_t PRIME1 = 11400714785074694791ULL;
    static constexpr uint64_t PRIME2 = 14029467366897019727ULL;
    static constexpr uint64_t PRIME3 = 1609587929392839161ULL;
    static constexpr uint64_t PRIME4 = 9650029242287828579ULL;
    static constexpr uint64_t PRIME5 = 2870177450012600261ULL;

    //! rotate \p x left by \p r bits; every caller passes 0 < r < 64
    static constexpr uint64_t rotl(uint64_t x, int r) {
        return (x << r) | (x >> (64 - r));
    }

    //! xor \p h with itself shifted right, then multiply by \p prime
    static constexpr uint64_t xorshift_mul(const uint64_t h,
                                           const uint64_t prime, int rshift) {
        return (h ^ (h >> rshift)) * prime;
    }

    //! fold one 64-bit \p input into accumulator \p acc
    static constexpr uint64_t round(const uint64_t input,
                                    const uint64_t acc = 0) {
        return rotl(acc + input * PRIME2, 31) * PRIME1;
    }

    //! merge one stripe accumulator \p v into the running hash \p h
    static constexpr uint64_t merge(const uint64_t h, const uint64_t v) {
        return (h ^ round(v)) * PRIME1 + PRIME4;
    }

    //! i-th byte of \p v as an unsigned 32-bit value
    static constexpr uint32_t byte32(const char* v, int i) {
        return uint32_t(uint8_t(v[i]));
    }

#ifdef XXH64_BIG_ENDIAN
    //! assemble 4 bytes with v[0] as the most significant byte
    static constexpr uint32_t endian32(const char* v) {
        return (byte32(v, 0) << 24) | (byte32(v, 1) << 16) |
               (byte32(v, 2) << 8) | byte32(v, 3);
    }
    //! assemble 8 bytes with v[0] as the most significant byte
    static constexpr uint64_t endian64(const char* v) {
        return (uint64_t(endian32(v)) << 32) | uint64_t(endian32(v + 4));
    }
#else
    //! assemble 4 bytes with v[0] as the least significant byte
    static constexpr uint32_t endian32(const char* v) {
        return byte32(v, 0) | (byte32(v, 1) << 8) | (byte32(v, 2) << 16) |
               (byte32(v, 3) << 24);
    }
    //! assemble 8 bytes with v[0] as the least significant byte
    static constexpr uint64_t endian64(const char* v) {
        return uint64_t(endian32(v)) | (uint64_t(endian32(v + 4)) << 32);
    }
#endif

    //! read 8 bytes at \p p and fold them into accumulator \p v
    static constexpr uint64_t fetch64(const char* p, const uint64_t v = 0) {
        return round(endian64(p), v);
    }
    //! read 4 bytes at \p p, scaled for the 4-byte tail step
    static constexpr uint64_t fetch32(const char* p) {
        return uint64_t(endian32(p)) * PRIME1;
    }
    //! read 1 byte at \p p, scaled for the 1-byte tail step
    static constexpr uint64_t fetch8(const char* p) {
        return uint8_t(*p) * PRIME5;
    }

    //! tail step consuming 8 bytes
    static constexpr uint64_t tail8(const uint64_t h, const char* p) {
        return rotl(h ^ fetch64(p), 27) * PRIME1 + PRIME4;
    }
    //! tail step consuming 4 bytes
    static constexpr uint64_t tail4(const uint64_t h, const char* p) {
        return rotl(h ^ fetch32(p), 23) * PRIME2 + PRIME3;
    }
    //! tail step consuming 1 byte
    static constexpr uint64_t tail1(const uint64_t h, const char* p) {
        return rotl(h ^ fetch8(p), 11) * PRIME1;
    }
    //! final bit mixing applied once all input bytes are consumed
    static constexpr uint64_t avalanche(const uint64_t h) {
        return xorshift_mul(
                xorshift_mul(xorshift_mul(h, PRIME2, 33), PRIME3, 29), 1, 32);
    }

    //! consume the \p len trailing bytes at \p p, then apply avalanche()
    static constexpr uint64_t finalize(const uint64_t h, const char* p,
                                       uint64_t len) {
        return len >= 8 ? finalize(tail8(h, p), p + 8, len - 8)
             : len >= 4 ? finalize(tail4(h, p), p + 4, len - 4)
             : len > 0  ? finalize(tail1(h, p), p + 1, len - 1)
                        : avalanche(h);
    }

    //! consume whole 32-byte stripes, then merge the four accumulators
    static constexpr uint64_t stripes(const char* p, uint64_t len,
                                      const uint64_t v1, const uint64_t v2,
                                      const uint64_t v3, const uint64_t v4) {
        return len < 32
                       ? merge(merge(merge(merge(rotl(v1, 1) + rotl(v2, 7) +
                                                         rotl(v3, 12) +
                                                         rotl(v4, 18),
                                                 v1),
                                           v2),
                                     v3),
                               v4)
                       : stripes(p + 32, len - 32, fetch64(p, v1),
                                 fetch64(p + 8, v2), fetch64(p + 16, v3),
                                 fetch64(p + 24, v4));
    }

    //! state before the tail: stripe result if len >= 32, else seed + PRIME5
    static constexpr uint64_t bulk(const char* p, uint64_t len,
                                   const uint64_t seed) {
        return len < 32 ? seed + PRIME5
                        : stripes(p, len, seed + PRIME1 + PRIME2,
                                  seed + PRIME2, seed, seed - PRIME1);
    }
};
} // namespace megdnn
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
......@@ -17,6 +17,7 @@
#include "megdnn/handle.h"
#include "megdnn/thin/small_vector.h"
#include "src/common/hash_ct.h"
#include "src/common/utils.cuh"
#include <cmath>
......@@ -228,6 +229,10 @@ MEGDNN_CONSTEXPR std::size_t operator"" _z(unsigned long long n) {
return n;
}
/*!
 * \brief compile-time hash of a string literal
 *
 * Evaluates XXHash64CT::hash over the literal's \p count characters with the
 * fixed seed 20160701 and keeps the low 32 bits of the 64-bit result.
 */
constexpr uint32_t operator"" _hash(char const* str, size_t count) {
    return static_cast<uint32_t>(XXHash64CT::hash(str, count, 20160701));
}
template <typename Vec>
std::string vec2str(Vec&& vec) {
std::string res;
......
......@@ -362,96 +362,111 @@ void gemm_kern(const Tin* packA, const Tin* packB, size_t M, size_t N, size_t K,
InnerBlockSize get_inner_block_size() const override; \
size_t get_packA_type_size() const override;
#define MEGDNN_REG_GEMM_FUNC_FOR_IM2COL_IMPL_PACKA( \
_algo_name, _midout_name, _mid_index, _strategy, _i_type, _c_type, \
_packa_type) \
\
MatrixMulImpl::kern_naked_t MatrixMulImpl::_algo_name::get_kern_naked( \
const KernSizeParam&) const { \
auto kern = [](const MatrixMulImpl::KernParam& kern_param, \
const void* packed_a, const void* packed_b) { \
MIDOUT_BEGIN(_midout_name, midout_iv(_mid_index)) { \
auto M = kern_param.M, N = kern_param.N, K = kern_param.K; \
auto trA = kern_param.trA, trB = kern_param.trB; \
auto LDC = kern_param.LDC; \
auto A_type = kern_param.A_type, B_type = kern_param.B_type, \
C_type = kern_param.C_type; \
auto Cptr = kern_param.C<_c_type>(); \
\
_strategy strategy(M, N, K, A_type, B_type, C_type); \
megdnn::matmul::GemmInterleaved<_strategy>(M, N, K, trA, trB, \
strategy) \
.execute_naked(Cptr, LDC, packed_a, packed_b); \
} \
MIDOUT_END(); \
}; \
return kern; \
} \
\
void MatrixMulImpl::_algo_name::pack_A(const KernParam& kern_param, \
void* out, size_t index, \
size_t stride) const { \
auto M = kern_param.M, N = kern_param.N, K = kern_param.K; \
auto A_type = kern_param.A_type, B_type = kern_param.B_type, \
C_type = kern_param.C_type; \
\
auto trA = kern_param.trA, trB = kern_param.trB; \
auto LDA = kern_param.LDA; \
const auto Aptr = kern_param.A<_i_type>(); \
_strategy strategy(M, N, K, A_type, B_type, C_type); \
size_t start_index = index * stride; \
size_t end_index = start_index + stride; \
end_index = std::min(end_index, M); \
megdnn::matmul::GemmInterleaved<_strategy>(M, N, K, trA, trB, \
strategy) \
.pack_A(reinterpret_cast<_packa_type*>(out), Aptr, LDA, \
start_index, end_index); \
} \
\
void MatrixMulImpl::_algo_name::pack_B(const KernParam& kern_param, \
void* out, const size_t x0, \
size_t xmax) const { \
auto M = kern_param.M, N = kern_param.N, K = kern_param.K; \
auto A_type = kern_param.A_type, B_type = kern_param.B_type, \
C_type = kern_param.C_type; \
\
auto trA = kern_param.trA, trB = kern_param.trB; \
auto LDB = kern_param.LDB; \
const auto Bptr = kern_param.B<_i_type>(); \
_strategy strategy(M, N, K, A_type, B_type, C_type); \
megdnn::matmul::GemmInterleaved<_strategy>(M, N, K, trA, trB, \
strategy) \
.pack_B(reinterpret_cast<_i_type*>(out), Bptr, LDB, x0, xmax); \
} \
\
WorkspaceBundle MatrixMulImpl::_algo_name::get_bundle( \
const KernSizeParam& kern_size_param) const { \
auto M = kern_size_param.M, N = kern_size_param.N, \
K = kern_size_param.K; \
auto trA = kern_size_param.trA, trB = kern_size_param.trB; \
auto A_type = kern_size_param.A_type, B_type = kern_size_param.B_type, \
C_type = kern_size_param.C_type; \
_strategy strategy(M, N, K, A_type, B_type, C_type); \
return megdnn::matmul::GemmInterleaved<_strategy>(M, N, K, trA, trB, \
strategy) \
.get_bundle(); \
} \
\
MatrixMulImpl::_algo_name::InnerBlockSize \
MatrixMulImpl::_algo_name::get_inner_block_size() const { \
return {_strategy::KERNEL_H, _strategy::KERNEL_W, \
_strategy::UNROLL_K}; \
} \
\
size_t MatrixMulImpl::_algo_name::get_packA_type_size() const { \
return sizeof(_packa_type); \
/*!
 * \brief define the im2col-related members of MatrixMulImpl::_algo_name
 *
 * Expands to the definitions of get_kern_naked(), pack_A(), pack_B(),
 * get_bundle(), get_inner_block_size() and get_packA_type_size(). The first
 * four build a _strategy from (M, N, K, A_type, B_type, C_type) and forward to
 * megdnn::matmul::GemmInterleaved<_strategy>; each of them wraps its body in
 * its own MIDOUT region keyed by (_midout_name, _mid_index, "<member>"_hash).
 *
 * \param _algo_name    nested class of MatrixMulImpl whose members are defined
 * \param _midout_name  tag passed as first argument to MIDOUT_BEGIN
 * \param _mid_index    value passed through midout_iv() to MIDOUT_BEGIN
 * \param _strategy     strategy type; also supplies KERNEL_H, KERNEL_W and
 *                      UNROLL_K for get_inner_block_size()
 * \param _i_type       type used to read A and B, and the pointer type the
 *                      pack_B() output buffer is cast to
 * \param _c_type       type used to access C in the naked kernel
 * \param _packa_type   pointer type the pack_A() output buffer is cast to;
 *                      get_packA_type_size() returns its sizeof
 *
 * pack_A() forwards start_index = index * stride and
 * end_index = min(start_index + stride, M).
 *
 * NOTE(review): get_bundle() returns only from inside the MIDOUT region and
 * has no return statement after MIDOUT_END(); if the MIDOUT macros can skip
 * the guarded block, control would reach the end of a non-void function --
 * confirm against the midout expansion.
 */
#define MEGDNN_REG_GEMM_FUNC_FOR_IM2COL_IMPL_DETAIL( \
_algo_name, _midout_name, _mid_index, _strategy, _i_type, _c_type, \
_packa_type) \
\
MatrixMulImpl::kern_naked_t MatrixMulImpl::_algo_name::get_kern_naked( \
const KernSizeParam&) const { \
auto kern = [](const MatrixMulImpl::KernParam& kern_param, \
const void* packed_a, const void* packed_b) { \
MIDOUT_BEGIN(_midout_name, midout_iv(_mid_index), \
midout_iv("get_kern_naked"_hash)) { \
auto M = kern_param.M, N = kern_param.N, K = kern_param.K; \
auto trA = kern_param.trA, trB = kern_param.trB; \
auto LDC = kern_param.LDC; \
auto A_type = kern_param.A_type, B_type = kern_param.B_type, \
C_type = kern_param.C_type; \
auto Cptr = kern_param.C<_c_type>(); \
\
_strategy strategy(M, N, K, A_type, B_type, C_type); \
megdnn::matmul::GemmInterleaved<_strategy>(M, N, K, trA, trB, \
strategy) \
.execute_naked(Cptr, LDC, packed_a, packed_b); \
} \
MIDOUT_END(); \
}; \
return kern; \
} \
\
void MatrixMulImpl::_algo_name::pack_A(const KernParam& kern_param, \
void* out, size_t index, \
size_t stride) const { \
MIDOUT_BEGIN(_midout_name, midout_iv(_mid_index), \
midout_iv("pack_A"_hash)) { \
auto M = kern_param.M, N = kern_param.N, K = kern_param.K; \
auto A_type = kern_param.A_type, B_type = kern_param.B_type, \
C_type = kern_param.C_type; \
\
auto trA = kern_param.trA, trB = kern_param.trB; \
auto LDA = kern_param.LDA; \
const auto Aptr = kern_param.A<_i_type>(); \
_strategy strategy(M, N, K, A_type, B_type, C_type); \
size_t start_index = index * stride; \
size_t end_index = start_index + stride; \
end_index = std::min(end_index, M); \
megdnn::matmul::GemmInterleaved<_strategy>(M, N, K, trA, trB, \
strategy) \
.pack_A(reinterpret_cast<_packa_type*>(out), Aptr, LDA, \
start_index, end_index); \
} \
MIDOUT_END(); \
} \
\
void MatrixMulImpl::_algo_name::pack_B(const KernParam& kern_param, \
void* out, const size_t x0, \
size_t xmax) const { \
MIDOUT_BEGIN(_midout_name, midout_iv(_mid_index), \
midout_iv("pack_B"_hash)) { \
auto M = kern_param.M, N = kern_param.N, K = kern_param.K; \
auto A_type = kern_param.A_type, B_type = kern_param.B_type, \
C_type = kern_param.C_type; \
\
auto trA = kern_param.trA, trB = kern_param.trB; \
auto LDB = kern_param.LDB; \
const auto Bptr = kern_param.B<_i_type>(); \
_strategy strategy(M, N, K, A_type, B_type, C_type); \
megdnn::matmul::GemmInterleaved<_strategy>(M, N, K, trA, trB, \
strategy) \
.pack_B(reinterpret_cast<_i_type*>(out), Bptr, LDB, x0, \
xmax); \
} \
MIDOUT_END(); \
} \
\
WorkspaceBundle MatrixMulImpl::_algo_name::get_bundle( \
const KernSizeParam& kern_size_param) const { \
MIDOUT_BEGIN(_midout_name, midout_iv(_mid_index), \
midout_iv("get_bundle"_hash)) { \
auto M = kern_size_param.M, N = kern_size_param.N, \
K = kern_size_param.K; \
auto trA = kern_size_param.trA, trB = kern_size_param.trB; \
auto A_type = kern_size_param.A_type, \
B_type = kern_size_param.B_type, \
C_type = kern_size_param.C_type; \
_strategy strategy(M, N, K, A_type, B_type, C_type); \
return megdnn::matmul::GemmInterleaved<_strategy>(M, N, K, trA, \
trB, strategy) \
.get_bundle(); \
} \
MIDOUT_END(); \
} \
\
MatrixMulImpl::_algo_name::InnerBlockSize \
MatrixMulImpl::_algo_name::get_inner_block_size() const { \
return {_strategy::KERNEL_H, _strategy::KERNEL_W, \
_strategy::UNROLL_K}; \
} \
\
size_t MatrixMulImpl::_algo_name::get_packA_type_size() const { \
return sizeof(_packa_type); \
}
/*!
 * \brief shorthand for MEGDNN_REG_GEMM_FUNC_FOR_IM2COL_IMPL_PACKA that passes
 *      _i_type as the _packa_type argument
 *
 * NOTE(review): the same macro name is defined again directly below, there
 * forwarding to MEGDNN_REG_GEMM_FUNC_FOR_IM2COL_IMPL_DETAIL; this definition
 * looks like the pre-change side of a diff -- confirm which one is live.
 */
#define MEGDNN_REG_GEMM_FUNC_FOR_IM2COL_IMPL( \
_algo_name, _midout_name, _mid_index, _strategy, _i_type, _c_type) \
MEGDNN_REG_GEMM_FUNC_FOR_IM2COL_IMPL_PACKA(_algo_name, _midout_name, \
_mid_index, _strategy, _i_type, \
_c_type, _i_type)
/*!
 * \brief shorthand for MEGDNN_REG_GEMM_FUNC_FOR_IM2COL_IMPL_DETAIL that passes
 *      _i_type as the _packa_type argument, i.e. packed A uses the same type
 *      as the inputs
 */
#define MEGDNN_REG_GEMM_FUNC_FOR_IM2COL_IMPL( \
_algo_name, _midout_name, _mid_index, _strategy, _i_type, _c_type) \
MEGDNN_REG_GEMM_FUNC_FOR_IM2COL_IMPL_DETAIL(_algo_name, _midout_name, \
_mid_index, _strategy, \
_i_type, _c_type, _i_type)
} // namespace matmul
} // namespace megdnn
......
......@@ -70,9 +70,9 @@ class GemmInterleaved<Strategy, true> {
public:
/*!
 * \brief total workspace size in bytes
 *
 * Taken from the bundle returned by get_bundle() rather than recomputed as a
 * hand-written sum, so the reported size cannot drift from the layout that
 * get_bundle() describes.
 */
size_t get_workspace_size() const {
    return get_bundle().total_size_in_bytes();
}
WorkspaceBundle get_bundle() const {
return {nullptr,
{get_a_workspace_size(), get_b_workspace_size(),
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册