Commit c985204b authored by Megvii Engine Team, committed by Xinran Xu

feat(dnn): add conv1x1 algo and tests

GitOrigin-RevId: 374a62cf12efb74fd92ee5d0ec8df7cfd40addba
Parent d8d3f405
/**
* \file dnn/src/fallback/conv_bias/conv1x1/algos.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*/
#include "src/fallback/conv_bias/conv1x1/algos.h"
#include "src/common/opr_delegate.h"
#include "src/fallback/conv_bias/common.h"
#include "src/fallback/conv_bias/conv1x1/conv1x1_dispatcher.h"
#include "src/fallback/conv_bias/conv1x1/conv1x1_strategy.h"
#include "src/fallback/conv_bias/opr_impl.h"
#include "megdnn/opr_param_defs.h"
#include "src/naive/convolution/helper.h"
#if MEGDNN_X86
#include "src/x86/conv_bias/postprocess_helper.h"
#endif
#include "midout.h"
MIDOUT_DECL(megdnn_fallback_conv1x1)
using namespace megdnn;
using namespace fallback;
#if MEGDNN_X86
using namespace x86;
#endif
using namespace conv1x1;
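//! choose the OC tile size: for large feature maps or many output channels
//! use the configured block size, otherwise split OC evenly across the
//! threads so every thread gets some work.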
size_t ConvBiasImpl::AlgoConv1x1::get_oc_tile_size_heuristic(
const NCBKernSizeParam& param) const {
size_t OH = param.osz[0];
size_t OW = param.osz[1];
size_t OC = param.filter_meta.ocpg;
if (OH * OW >= 56 * 56 || OC >= 64)
return m_oc_block_size;
return div_ceil(OC, param.nr_threads);
}
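//! the workspace is laid out as [packed A | packed B | per-thread buffers];
//! which slots are non-empty depends on the matmul algo's pack mode.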
size_t ConvBiasImpl::AlgoConv1x1::get_workspace(
ConvBiasImpl*, const NCBKernSizeParam& param) const {
size_t OH = param.osz[0];
size_t OW = param.osz[1];
size_t compt_oc_block_size = get_oc_tile_size_heuristic(param);
auto matmul_param =
get_matmul_kern_param(param, OH * OW, compt_oc_block_size);
auto pack_mode = m_matmul_algo->packmode();
if (pack_mode == MatrixMulImpl::AlgoBase::PackMode::DEFAULT) {
MIDOUT_BEGIN(megdnn_fallback_conv1x1, 0, 0, 0) {
Conv1x1Kerns<MatrixMulImpl::AlgoBase::PackMode::DEFAULT> dispatcher;
return dispatcher
.get_bundle(param, matmul_param, m_matmul_algo,
compt_oc_block_size)
.total_size_in_bytes();
}
MIDOUT_END();
} else if (pack_mode == MatrixMulImpl::AlgoBase::PackMode::ONLY_PACKA) {
MIDOUT_BEGIN(megdnn_fallback_conv1x1, 0, 0, 1) {
Conv1x1Kerns<MatrixMulImpl::AlgoBase::PackMode::ONLY_PACKA> dispatcher;
return dispatcher
.get_bundle(param, matmul_param, m_matmul_algo,
compt_oc_block_size)
.total_size_in_bytes();
}
MIDOUT_END();
} else {
MIDOUT_BEGIN(megdnn_fallback_conv1x1, 0, 0, 2) {
Conv1x1Kerns<MatrixMulImpl::AlgoBase::PackMode::NO_PACK> dispatcher;
return dispatcher
.get_bundle(param, matmul_param, m_matmul_algo,
compt_oc_block_size)
.total_size_in_bytes();
}
MIDOUT_END();
}
return 0;
}
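//! dispatch_kerns picks the workspace layout for the matmul pack mode,
//! fetches (or creates) the per-dtype strategy, then emits the packA, packB
//! and compute kernels.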
SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoConv1x1::dispatch_kerns(
ConvBiasImpl* opr, const NCBKernSizeParam& param) const {
SmallVector<ConvBiasImpl::NCBKern> ret_kern;
size_t OH = param.osz[0];
size_t OW = param.osz[1];
size_t OC = param.filter_meta.ocpg;
size_t compt_oc_block_size = get_oc_tile_size_heuristic(param);
size_t GROUP = param.filter_meta.group;
size_t BATCH = param.n;
size_t oc_blocks_per_group = div_ceil(OC, compt_oc_block_size);
auto matmul_param =
get_matmul_kern_param(param, OH * OW, compt_oc_block_size);
WorkspaceBundle whole_bundle = {nullptr, {}};
WorkspaceBundle thread_bundle = {nullptr, {}};
WorkspaceBundle matmul_bundle = {nullptr, {}};
auto pack_mode = m_matmul_algo->packmode();
if (pack_mode == MatrixMulImpl::AlgoBase::PackMode::DEFAULT) {
MIDOUT_BEGIN(megdnn_fallback_conv1x1, 0, 1, 0) {
Conv1x1Kerns<MatrixMulImpl::AlgoBase::PackMode::DEFAULT> dispatcher;
whole_bundle = dispatcher.get_bundle(
param, matmul_param, m_matmul_algo, compt_oc_block_size);
matmul_bundle = m_matmul_algo->get_bundle(matmul_param);
}
MIDOUT_END();
} else if (pack_mode == MatrixMulImpl::AlgoBase::PackMode::ONLY_PACKA) {
MIDOUT_BEGIN(megdnn_fallback_conv1x1, 0, 1, 1) {
Conv1x1Kerns<MatrixMulImpl::AlgoBase::PackMode::ONLY_PACKA> dispatcher;
whole_bundle = dispatcher.get_bundle(
param, matmul_param, m_matmul_algo, compt_oc_block_size);
matmul_bundle = m_matmul_algo->get_bundle(matmul_param);
}
MIDOUT_END();
} else {
MIDOUT_BEGIN(megdnn_fallback_conv1x1, 0, 1, 2) {
Conv1x1Kerns<MatrixMulImpl::AlgoBase::PackMode::NO_PACK> dispatcher;
whole_bundle = dispatcher.get_bundle(
param, matmul_param, m_matmul_algo, compt_oc_block_size);
matmul_bundle = {
nullptr,
{0, 0, m_matmul_algo->get_workspace(matmul_param)}};
}
MIDOUT_END();
}
//! get thread bundle
thread_bundle = get_thread_bundle(param, matmul_bundle.get_size(2),
compt_oc_block_size);
Conv1x1StrategyBase* conv1x1_strategy =
Conv1x1Factory::make_conv1x1_strategy(param, pack_mode,
opr->param().format);
auto kern_packA = [this, whole_bundle, matmul_bundle, param,
compt_oc_block_size, conv1x1_strategy](
const NCBKernParam& ncb_param,
const NCBKernIndex& ncb_index) mutable {
conv1x1_strategy->packA(whole_bundle, matmul_bundle,
compt_oc_block_size, this->m_matmul_algo, param,
ncb_param, std::move(ncb_index));
};
auto kern_packB = [this, whole_bundle, matmul_bundle, param,
conv1x1_strategy](
const NCBKernParam& ncb_param,
const NCBKernIndex& ncb_index) mutable {
conv1x1_strategy->packB(whole_bundle, matmul_bundle,
this->m_matmul_algo, param, ncb_param,
std::move(ncb_index));
};
auto kern_compt = [this, whole_bundle, matmul_bundle, thread_bundle, param,
compt_oc_block_size, conv1x1_strategy](
const NCBKernParam& ncb_param,
const NCBKernIndex& ncb_index) mutable {
conv1x1_strategy->exec(whole_bundle, matmul_bundle, thread_bundle,
compt_oc_block_size, this->m_matmul_algo, param,
ncb_param, std::move(ncb_index));
};
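//! packA runs once per (group, oc tile); packB (DEFAULT pack mode only) runs
//! once and packs the src of every batch and group; the compute kernel runs
//! once per (batch, group, oc tile).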
if (pack_mode == MatrixMulImpl::AlgoBase::PackMode::DEFAULT ||
pack_mode == MatrixMulImpl::AlgoBase::PackMode::ONLY_PACKA) {
ret_kern.push_back({kern_packA, {GROUP, oc_blocks_per_group}});
if (pack_mode == MatrixMulImpl::AlgoBase::PackMode::DEFAULT) {
ret_kern.push_back({kern_packB, {1}});
}
}
ret_kern.push_back({kern_compt, {BATCH, GROUP, oc_blocks_per_group}});
return ret_kern;
}
bool ConvBiasImpl::AlgoConv1x1::usable(ConvBiasImpl* opr,
const NCBKernSizeParam& param,
AlgoSelectionStrategy) const {
MIDOUT_BEGIN(megdnn_fallback_conv1x1, 0, 2) {
//! only the NCHW format is supported
if (opr->param().format != param::ConvBias::Format::NCHW)
return false;
size_t FH = param.filter_meta.spatial[0],
FW = param.filter_meta.spatial[1];
size_t PH = param.filter_meta.padding[0],
PW = param.filter_meta.padding[1];
size_t SH = param.filter_meta.stride[0],
SW = param.filter_meta.stride[1];
if (FH != 1 || FW != 1 || PH || PW || SH != 1 || SW != 1)
return false;
//! for 8x8x16 and 8x8x32 the bias mode must be NO_BIAS and the
//! nonlinear mode must be IDENTITY; otherwise return false, because
//! 8x8x16 and 8x8x32 do not support PostProcess
if (param.src_type.enumv() == param.filter_type.enumv() &&
(param.src_type.enumv() == DTypeEnum::Int8 &&
(param.dst_type.enumv() == DTypeEnum::Int16 ||
param.dst_type.enumv() == DTypeEnum::Int32)) &&
param.bias_mode != megdnn::BiasMode::NO_BIAS &&
param.nonlineMode != megdnn::NonlineMode::IDENTITY)
return false;
if (param.src_type.enumv() == param.filter_type.enumv() &&
((param.src_type.enumv() == DTypeEnum::QuantizedS8 ||
param.src_type.enumv() == DTypeEnum::Quantized8Asymm) &&
param.dst_type.enumv() == DTypeEnum::QuantizedS32) &&
param.bias_mode != megdnn::BiasMode::NO_BIAS &&
param.nonlineMode != megdnn::NonlineMode::IDENTITY)
return false;
size_t OH = param.osz[0];
size_t OW = param.osz[1];
MatrixMulImpl::KernSizeParam matmul_param =
get_matmul_kern_param(param, OH * OW, get_oc_tile_size_heuristic(param));
bool matmulusable = m_matmul_algo->usable(matmul_param);
return matmulusable &&
(param.filter_meta.dilation[0] ==
param.filter_meta.dilation[1] &&
param.filter_meta.dilation[0] == 1) &&
param.compute_mode == param::ConvBias::ComputeMode::DEFAULT;
}
MIDOUT_END();
return false;
}
/**
* \file dnn/src/fallback/conv_bias/conv1x1/algos.h
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*/
#pragma once
#include "megdnn/thin/small_vector.h"
#include "src/common/utils.h"
#include "src/fallback/conv_bias/opr_impl.h"
#include "src/fallback/matrix_mul/opr_impl.h"
namespace megdnn {
namespace fallback {
class ConvBiasImpl::AlgoConv1x1 final : public AlgoBase {
public:
AlgoConv1x1(MatrixMulImpl::AlgoBase* matmul_algo, size_t oc_block_size)
: m_matmul_algo(matmul_algo), m_oc_block_size(oc_block_size) {}
bool is_reproducible() const override { return true; }
const char* name() const override {
if (m_name.empty()) {
m_name = ssprintf("CONV1x1:%s:%zu", m_matmul_algo->name(),
m_oc_block_size);
}
return m_name.c_str();
}
bool usable(ConvBiasImpl* opr, const NCBKernSizeParam& param,
AlgoSelectionStrategy algo_selection_strategy) const override;
size_t get_workspace(ConvBiasImpl*,
const NCBKernSizeParam& param) const override;
SmallVector<NCBKern> dispatch_kerns(
ConvBiasImpl* opr, const NCBKernSizeParam& param) const override;
protected:
size_t get_oc_tile_size_heuristic(const NCBKernSizeParam& param) const;
private:
MatrixMulImpl::AlgoBase* m_matmul_algo;
mutable std::string m_name;
mutable size_t m_oc_block_size = 0;
};
} // namespace fallback
} // namespace megdnn
// vim: syntax=cpp.doxygen
/**
* \file dnn/src/fallback/conv_bias/conv1x1/conv1x1_dispatcher.h
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*/
#pragma once
#include "src/fallback/conv_bias/conv1x1/conv1x1_strategy.h"
namespace megdnn {
namespace fallback {
namespace conv1x1 {
namespace {
//! get_thread_bundle
WorkspaceBundle get_thread_bundle(const ConvBiasImpl::NCBKernSizeParam& param,
size_t matmul_c_size, size_t oc_tile_size) {
//! for 8-bit dst the matmul result needs a temporary 32-bit buffer per thread
size_t OH = param.osz[0];
size_t OW = param.osz[1];
bool is_dst_8bit = (param.src_type.enumv() == DTypeEnum::QuantizedS8 &&
param.dst_type.enumv() == DTypeEnum::QuantizedS8) ||
(param.src_type.enumv() == DTypeEnum::Quantized8Asymm &&
param.dst_type.enumv() == DTypeEnum::Quantized8Asymm);
size_t matmul_dst_bytes_per_thread =
is_dst_8bit ? oc_tile_size * OH * OW * sizeof(param.bias_type) : 0;
return WorkspaceBundle{nullptr,
{matmul_c_size, matmul_dst_bytes_per_thread}};
}
} // anonymous namespace
template <MatrixMulImpl::AlgoBase::PackMode pack_mode>
class Conv1x1Kerns {
public:
//! get_bundle
WorkspaceBundle get_bundle(const ConvBiasImpl::NCBKernSizeParam& param,
const MatrixMulImpl::KernSizeParam& matmul_param,
const MatrixMulImpl::AlgoBase* matmul_algo,
size_t oc_tile_size) {
size_t GROUP = param.filter_meta.group;
size_t OC = param.filter_meta.ocpg;
size_t BATCH = param.n;
//! bundle per thread
//! matmul_param describes a matmul with M = oc_tile_size, K = IC,
//! N = OH * OW; the packed-B size does not depend on the oc tile
auto matmul_bundle = matmul_algo->get_bundle(matmul_param);
auto thread_bundle = get_thread_bundle(param, matmul_bundle.get_size(2),
oc_tile_size);
//! size per thread
size_t all_threads_bytes =
thread_bundle.total_size_in_bytes() * param.nr_threads;
//! packa size = GROUP * packa_size_each_group
size_t packa_bytes_per_oc_tile = matmul_bundle.get_size(0);
size_t oc_tiles_per_group = div_ceil(OC, oc_tile_size);
size_t all_packa_bytes =
packa_bytes_per_oc_tile * oc_tiles_per_group * GROUP;
if (pack_mode == MatrixMulImpl::AlgoBase::PackMode::ONLY_PACKA)
return WorkspaceBundle{nullptr,
{all_packa_bytes, 0, all_threads_bytes}};
//! packb size = N * GROUP * packb_size_per_group
size_t packb_bytes_per_group = matmul_bundle.get_size(1);
size_t all_packb_bytes = packb_bytes_per_group * GROUP * BATCH;
return WorkspaceBundle{
nullptr, {all_packa_bytes, all_packb_bytes, all_threads_bytes}};
}
};
template<>
class Conv1x1Kerns<MatrixMulImpl::AlgoBase::PackMode::NO_PACK> {
public:
//! get_bundle
WorkspaceBundle get_bundle(const ConvBiasImpl::NCBKernSizeParam& param,
const MatrixMulImpl::KernSizeParam& matmul_param,
const MatrixMulImpl::AlgoBase* matmul_algo,
size_t oc_tile_size) {
size_t matmul_size = matmul_algo->get_workspace(matmul_param);
auto thread_bundle = get_thread_bundle(param, matmul_size, oc_tile_size);
//! size per thread
size_t all_threads_bytes =
thread_bundle.total_size_in_bytes() * param.nr_threads;
return WorkspaceBundle{nullptr, {0, 0, all_threads_bytes}};
}
};
} // namespace conv1x1
} // namespace fallback
} // namespace megdnn
/**
* \file dnn/src/fallback/conv_bias/conv1x1/conv1x1_strategy.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*/
#include <unordered_map>
#include "src/fallback/conv_bias/conv1x1/conv1x1_strategy.h"
#include "midout.h"
MIDOUT_DECL(megdnn_fallback_conv1x1_factory_strategy)
namespace megdnn {
namespace fallback {
namespace conv1x1 {
namespace {
struct StrategyHashParam {
ConvBiasImpl::NCBKernSizeParam param;
param::ConvBias::Format format;
MatrixMulImpl::AlgoBase::PackMode packmode;
};
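//! hash the fields that select a strategy: each dtype enum, the format and
//! the pack mode are shifted into distinct bit ranges and combined with XOR.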
struct StrategyHashParamHash {
std::size_t operator()(const StrategyHashParam& sparam) const {
constexpr size_t base = 1; //! avoid a zero hash key
std::size_t result =
static_cast<std::size_t>(sparam.param.src_type.enumv()) + base;
result = result ^
((static_cast<std::size_t>(sparam.param.dst_type.enumv()) +
base)
<< 3);
result = result ^
((static_cast<std::size_t>(sparam.param.filter_type.enumv()) +
base)
<< 6);
result = result ^
((static_cast<std::size_t>(sparam.param.bias_type.enumv()) +
base)
<< 9);
result = result ^
((static_cast<std::size_t>(sparam.format) + base) << 12);
result = result ^
((static_cast<std::size_t>(sparam.packmode) + base) << 15);
return result;
};
};
struct StrategyHashParamEqual {
bool operator()(const StrategyHashParam& param1,
const StrategyHashParam& param2) const {
bool flags = true;
flags = param1.param.src_type == param2.param.src_type && flags;
flags = param1.param.filter_type == param2.param.filter_type && flags;
flags = param1.param.bias_type == param2.param.bias_type && flags;
flags = param1.param.dst_type == param2.param.dst_type && flags;
flags = param1.format == param2.format && flags;
flags = param1.packmode == param2.packmode && flags;
return flags;
};
};
std::unique_ptr<Conv1x1StrategyBase> create_conv1x1_strategy(
const ConvBiasImpl::NCBKernSizeParam& param,
MatrixMulImpl::AlgoBase::PackMode pack_mode,
param::ConvBias::Format format) {
MEGDNN_MARK_USED_VAR(format);
#define cb1(_packmode, _dt, _post_ctype, _postprocess_mode, _midout_tag) \
MIDOUT_BEGIN(megdnn_fallback_conv1x1_factory_strategy, \
midout_iv(_midout_tag)) { \
if (param.filter_type.enumv() == DTypeTrait<_dt>::enumv) { \
return std::make_unique< \
Conv1x1Strategy<_dt, _dt, _dt, _post_ctype, _post_ctype, \
_postprocess_mode, _packmode>>(); \
} \
} \
MIDOUT_END()
#define cb2(_packmode, _i_src_type, _i_bias_type, _i_dst_type, _src_ctype, \
_bias_ctype, _dst_ctype, _postprocess_mode, _midout_tag) \
MIDOUT_BEGIN(megdnn_fallback_conv1x1_factory_strategy, \
midout_iv(_midout_tag)) { \
if (param.filter_type.enumv() == param.src_type.enumv() && \
param.src_type.enumv() == DTypeTrait<_i_src_type>::enumv && \
param.dst_type.enumv() == DTypeTrait<_i_dst_type>::enumv) { \
return std::make_unique< \
Conv1x1Strategy<_src_ctype, _bias_ctype, _dst_ctype, \
DTypeTrait<_i_bias_type>::ctype, \
DTypeTrait<_i_dst_type>::ctype, \
_postprocess_mode, _packmode>>(); \
} \
} \
MIDOUT_END()
switch (pack_mode) {
case MatrixMulImpl::AlgoBase::PackMode::DEFAULT:
cb1(MatrixMulImpl::AlgoBase::PackMode::DEFAULT, dt_float32,
dt_float32, PostprocessMode::FLOAT, "Default::FLOAT"_hash);
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
cb1(MatrixMulImpl::AlgoBase::PackMode::DEFAULT, dt_float16, __fp16,
PostprocessMode::FLOAT, "Default::FLOAT16_FP16"_hash);
#else
#if !MEGDNN_DISABLE_FLOAT16
cb1(MatrixMulImpl::AlgoBase::PackMode::DEFAULT, dt_float16,
dt_float16, PostprocessMode::NO_PROCESS,
"Default::FLOAT16_FLOAT16"_hash);
#endif
#endif
cb2(MatrixMulImpl::AlgoBase::PackMode::DEFAULT, dt_int8, dt_int32,
dt_int32, dt_int8, dt_int32, dt_int32,
PostprocessMode::NO_PROCESS, "Default::INT8x8x32_INT32"_hash);
cb2(MatrixMulImpl::AlgoBase::PackMode::DEFAULT, dt_int8, dt_int16,
dt_int16, dt_int8, dt_int16, dt_int16,
PostprocessMode::NO_PROCESS, "Default::INT8x8x16_INT16"_hash);
#if MEGDNN_AARCH64 || MEGDNN_ARMV7
cb2(MatrixMulImpl::AlgoBase::PackMode::DEFAULT,
dtype::Quantized8Asymm, dtype::QuantizedS32,
dtype::QuantizedS32, dt_uint8, dt_int32, dt_int32,
PostprocessMode::NO_PROCESS,
"Default::QUINT8x8x32_QINT32"_hash);
cb2(MatrixMulImpl::AlgoBase::PackMode::DEFAULT,
dtype::Quantized8Asymm, dtype::QuantizedS32,
dtype::Quantized8Asymm, dt_uint8, dt_int32, dt_uint8,
PostprocessMode::QUANTIZED, "Default::QUINT8x8x32_QUINT8"_hash);
#endif
cb2(MatrixMulImpl::AlgoBase::PackMode::DEFAULT, dtype::QuantizedS8,
dtype::QuantizedS32, dtype::QuantizedS32, dt_int8, dt_int32,
dt_int32, PostprocessMode::NO_PROCESS,
"Default::QINT8x8x32_QINT32"_hash);
cb2(MatrixMulImpl::AlgoBase::PackMode::DEFAULT, dtype::QuantizedS8,
dtype::QuantizedS32, dtype::QuantizedS8, dt_int8, dt_int32,
dt_int8, PostprocessMode::QUANTIZED,
"Default::QINT8x8x32_QINT8"_hash);
break;
case MatrixMulImpl::AlgoBase::PackMode::ONLY_PACKA:
cb1(MatrixMulImpl::AlgoBase::PackMode::ONLY_PACKA, dt_float32,
dt_float32, PostprocessMode::FLOAT, "OnlyPackA::FLOAT"_hash);
break;
case MatrixMulImpl::AlgoBase::PackMode::NO_PACK:
cb1(MatrixMulImpl::AlgoBase::PackMode::NO_PACK, dt_float32,
dt_float32, PostprocessMode::FLOAT, "NoPack::FLOAT"_hash);
cb2(MatrixMulImpl::AlgoBase::PackMode::NO_PACK, dt_int8, dt_int16,
dt_int16, dt_int8, dt_int16, dt_int16,
PostprocessMode::NO_PROCESS, "NoPack::INT8x8x16_INT16"_hash);
cb2(MatrixMulImpl::AlgoBase::PackMode::NO_PACK, dt_int8, dt_int32,
dt_int32, dt_int8, dt_int32, dt_int32,
PostprocessMode::NO_PROCESS, "NoPack::INT8x8x32_INT32"_hash);
cb2(MatrixMulImpl::AlgoBase::PackMode::NO_PACK,
dtype::QuantizedS8, dtype::QuantizedS32,
dtype::QuantizedS32, dt_int8, dt_int32, dt_int32,
PostprocessMode::NO_PROCESS,
"NoPack::QINT8x8x32_QINT32"_hash);
break;
default:
megdnn_throw("Invalid Pack Mode");
break;
}
#undef cb1
#undef cb2
megdnn_throw("Invalid Data Type");
return nullptr;
}
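//! caches one strategy instance per (dtypes, format, pack mode) combination;
//! the map is guarded by a mutex so lookups are thread safe.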
class StrategyDelegationStorage {
public:
Conv1x1StrategyBase* get(const ConvBiasImpl::NCBKernSizeParam& param,
MatrixMulImpl::AlgoBase::PackMode pack_mode,
param::ConvBias::Format format) {
MEGDNN_LOCK_GUARD(m_mtx);
StrategyHashParam sparam;
sparam.param = param;
sparam.format = format;
sparam.packmode = pack_mode;
if (m_map_strategies.find(sparam) == m_map_strategies.end()) {
auto strategy = create_conv1x1_strategy(param, pack_mode, format);
m_map_strategies[sparam] = std::move(strategy);
}
return m_map_strategies[sparam].get();
}
private:
std::mutex m_mtx;
std::unordered_map<StrategyHashParam, std::unique_ptr<Conv1x1StrategyBase>,
StrategyHashParamHash, StrategyHashParamEqual>
m_map_strategies;
};
} // anonymous namespace
Conv1x1StrategyBase* Conv1x1Factory::make_conv1x1_strategy(
const ConvBiasImpl::NCBKernSizeParam& param,
MatrixMulImpl::AlgoBase::PackMode pack_mode,
param::ConvBias::Format format) {
static StrategyDelegationStorage storage;
return storage.get(param, pack_mode, format);
}
} // namespace conv1x1
} // namespace fallback
} // namespace megdnn
/**
* \file dnn/src/fallback/conv_bias/conv1x1/conv1x1_strategy.h
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#pragma once
#include "megdnn/opr_param_defs.h"
#include "src/fallback/conv_bias/opr_impl.h"
#if MEGDNN_X86
#include "src/x86/conv_bias/postprocess_helper.h"
#endif
namespace megdnn {
namespace fallback {
namespace conv1x1 {
#if MEGDNN_X86
using namespace x86;
#endif
namespace {
//! get_matmul_kern_param
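//! a 1x1 convolution with stride 1 and no padding is a plain GEMM:
//! dst(OC, OH * OW) = filter(OC, IC) * src(IC, OH * OW), so M is the oc tile,
//! N = OH * OW and K = IC; for 8-bit dst the matmul outputs in bias_type
//! (32-bit) and is requantized later in PostProcess.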
MatrixMulImpl::KernSizeParam get_matmul_kern_param(
const ConvBiasImpl::NCBKernSizeParam& param, size_t n, size_t m) {
size_t M = m;
size_t N = n;
size_t K = param.filter_meta.icpg; //! K = IC
size_t LDA = K, LDB = N, LDC = N;
bool is_dst_8bit = (param.src_type.enumv() == DTypeEnum::QuantizedS8 &&
param.dst_type.enumv() == DTypeEnum::QuantizedS8) ||
(param.src_type.enumv() == DTypeEnum::Quantized8Asymm &&
param.dst_type.enumv() == DTypeEnum::Quantized8Asymm);
return {param.filter_type,
param.src_type,
is_dst_8bit ? param.bias_type : param.dst_type,
M,
N,
K,
LDA,
LDB,
LDC,
false,
false,
param::MatrixMul::ComputeMode::DEFAULT,
param::MatrixMul::Format::DEFAULT};
}
} // namespace
class Conv1x1StrategyBase {
public:
virtual void packA(WorkspaceBundle& whole_bundle,
WorkspaceBundle& matmul_bundle,
size_t oc_tile_size,
const MatrixMulImpl::AlgoBase* matmul_algo,
const ConvBiasImpl::NCBKernSizeParam& param,
const ConvBiasImpl::NCBKernParam& ncb_param,
const ConvBiasImpl::NCBKernIndex& ncb_index) = 0;
virtual void packB(WorkspaceBundle& whole_bundle,
WorkspaceBundle& matmul_bundle,
const MatrixMulImpl::AlgoBase* matmul_algo,
const ConvBiasImpl::NCBKernSizeParam& param,
const ConvBiasImpl::NCBKernParam& ncb_param,
const ConvBiasImpl::NCBKernIndex& ncb_index) = 0;
virtual void exec(WorkspaceBundle& whole_bundle,
WorkspaceBundle& matmul_bundle,
WorkspaceBundle& thread_bundle,
size_t oc_tile_size,
const MatrixMulImpl::AlgoBase* matmul_algo,
const ConvBiasImpl::NCBKernSizeParam& param,
const ConvBiasImpl::NCBKernParam& ncb_param,
const ConvBiasImpl::NCBKernIndex& ncb_index) = 0;
virtual ~Conv1x1StrategyBase() = default;
};
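//! src_ctype/bias_ctype/dst_ctype are the storage types, op_ctype/op_dtype
//! parameterize PostProcess, and pack_mode decides which of packA/packB/exec
//! do real work for the chosen matmul algo.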
template <typename src_ctype, typename bias_ctype, typename dst_ctype,
typename op_ctype, typename op_dtype,
megdnn::PostprocessMode postprocess_mode, MatrixMulImpl::AlgoBase::PackMode pack_mode>
class Conv1x1Strategy : public Conv1x1StrategyBase {
public:
void packA(WorkspaceBundle& whole_bundle,
WorkspaceBundle& matmul_bundle,
size_t oc_tile_size,
const MatrixMulImpl::AlgoBase* matmul_algo,
const ConvBiasImpl::NCBKernSizeParam& param,
const ConvBiasImpl::NCBKernParam& ncb_param,
const ConvBiasImpl::NCBKernIndex& ncb_index) override {
if (pack_mode == MatrixMulImpl::AlgoBase::PackMode::NO_PACK) {
megdnn_log_error("NoPack mode has no packA kernel");
return;
}
whole_bundle.set(ncb_param.workspace_ptr);
//! packa size per group
size_t OC = param.filter_meta.ocpg;
size_t oc_tiles_per_group = div_ceil(OC, oc_tile_size);
size_t packa_bytes_per_oc_tile = matmul_bundle.get_size(0);
size_t packa_bytes_per_group =
oc_tiles_per_group * packa_bytes_per_oc_tile;
size_t group_id = ncb_index.ndrange_id[0];
size_t oc_tile_id_in_group = ncb_index.ndrange_id[1];
size_t oc_start = oc_tile_id_in_group * oc_tile_size;
size_t oc_end = oc_start + oc_tile_size;
oc_end = (oc_end <= OC ? oc_end : OC);
size_t OH = param.osz[0];
size_t OW = param.osz[1];
size_t IC = param.filter_meta.icpg;
MatrixMulImpl::KernParam matmul_kern_param;
static_cast<MatrixMulImpl::KernSizeParam&>(matmul_kern_param) =
get_matmul_kern_param(param, OH * OW, oc_end - oc_start);
size_t bytes_offset_of_a_panel =
group_id * packa_bytes_per_group +
oc_tile_id_in_group * packa_bytes_per_oc_tile;
size_t numbers_offset_of_filter =
oc_tile_size * IC * oc_tile_id_in_group;
src_ctype* a_panel = reinterpret_cast<src_ctype*>(
reinterpret_cast<int8_t*>(whole_bundle.get(0)) +
bytes_offset_of_a_panel);
matmul_kern_param.A_ptr = const_cast<src_ctype*>(
ncb_param.filter<src_ctype>(group_id) +
numbers_offset_of_filter);
matmul_algo->pack_A(matmul_kern_param, a_panel, 0,
oc_end - oc_start);
}
void packB(WorkspaceBundle& whole_bundle,
WorkspaceBundle& matmul_bundle,
const MatrixMulImpl::AlgoBase* matmul_algo,
const ConvBiasImpl::NCBKernSizeParam& param,
const ConvBiasImpl::NCBKernParam& ncb_param,
const ConvBiasImpl::NCBKernIndex& ncb_index) override {
if (pack_mode == MatrixMulImpl::AlgoBase::PackMode::DEFAULT) {
whole_bundle.set(ncb_param.workspace_ptr);
//! packb size per group
size_t packb_bytes_per_group = matmul_bundle.get_size(1);
size_t GROUP = param.filter_meta.group;
size_t BATCH = param.n;
size_t SH = param.filter_meta.stride[0];
size_t SW = param.filter_meta.stride[1];
size_t OH = param.osz[0];
size_t OW = param.osz[1];
size_t OC = param.filter_meta.ocpg;
MatrixMulImpl::KernParam matmul_kern_param;
static_cast<MatrixMulImpl::KernSizeParam&>(matmul_kern_param) =
get_matmul_kern_param(param, OH * OW, OC);
rep(batch, BATCH) {
rep(g, GROUP) {
if (SH == 2 && SW == 2)
megdnn_throw("no support for stride = 2");
size_t bytes_offset_of_b_panel =
batch * packb_bytes_per_group * GROUP +
g * packb_bytes_per_group;
src_ctype* b_panel = reinterpret_cast<src_ctype*>(
reinterpret_cast<int8_t*>(whole_bundle.get(1)) +
bytes_offset_of_b_panel);
matmul_kern_param.B_ptr = const_cast<src_ctype*>(
ncb_param.src<src_ctype>(batch, g));
matmul_algo->pack_B(matmul_kern_param, b_panel, 0, OH * OW);
}
}
} else {
megdnn_log_error("OnlyPackA mode and NoPack mode has no packB kernel");
}
}
void exec(WorkspaceBundle& whole_bundle,
WorkspaceBundle& matmul_bundle,
WorkspaceBundle& thread_bundle,
size_t oc_tile_size,
const MatrixMulImpl::AlgoBase* matmul_algo,
const ConvBiasImpl::NCBKernSizeParam& param,
const ConvBiasImpl::NCBKernParam& ncb_param,
const ConvBiasImpl::NCBKernIndex& ncb_index) override {
whole_bundle.set(ncb_param.workspace_ptr);
size_t OC = param.filter_meta.ocpg;
size_t IC = param.filter_meta.icpg;
//! packa bytes per group
size_t oc_tiles_per_group = div_ceil(OC, oc_tile_size);
size_t packa_bytes_per_oc_tile = matmul_bundle.get_size(0);
size_t packa_bytes_per_group =
packa_bytes_per_oc_tile * oc_tiles_per_group;
//! packb bytes per group
size_t packb_bytes_per_group = matmul_bundle.get_size(1);
//! matmul bytes per thread
size_t matmul_bytes_per_thread = thread_bundle.get_size(0);
size_t batch_id = ncb_index.ndrange_id[0];
size_t group_id = ncb_index.ndrange_id[1];
size_t oc_tile_id_in_group = ncb_index.ndrange_id[2];
size_t thread_id = ncb_index.thread_id;
size_t GROUP = param.filter_meta.group;
size_t OH = param.osz[0];
size_t OW = param.osz[1];
size_t oc_start = oc_tile_size * oc_tile_id_in_group;
size_t oc_end = oc_start + oc_tile_size;
oc_end = (oc_end <= OC ? oc_end : OC);
MatrixMulImpl::KernParam matmul_kern_param;
static_cast<MatrixMulImpl::KernSizeParam&>(matmul_kern_param) =
get_matmul_kern_param(param, OH * OW, oc_end - oc_start);
size_t bytes_offset_of_a_panel =
group_id * packa_bytes_per_group +
oc_tile_id_in_group * packa_bytes_per_oc_tile;
int8_t* a_panel = reinterpret_cast<int8_t*>(whole_bundle.get(0)) +
bytes_offset_of_a_panel;
size_t bytes_offset_of_b_panel =
batch_id * packb_bytes_per_group * GROUP +
group_id * packb_bytes_per_group;
int8_t* b_panel = reinterpret_cast<int8_t*>(whole_bundle.get(1)) +
bytes_offset_of_b_panel;
size_t thread_offset = thread_bundle.total_size_in_bytes() * thread_id;
size_t bytes_offset_of_matmul_dst_this_thread =
thread_offset + thread_bundle.get_size(0);
int8_t* matmul_temp_dst =
reinterpret_cast<int8_t*>(whole_bundle.get(2)) +
bytes_offset_of_matmul_dst_this_thread;
size_t numbers_of_ncb_dst_offset =
oc_tile_size * OH * OW * oc_tile_id_in_group;
void* conv_bias_dst = static_cast<void*>(
ncb_param.dst<dst_ctype>(batch_id, group_id) +
numbers_of_ncb_dst_offset);
size_t numbers_of_ncb_filter_offset =
oc_tile_size * IC * oc_tile_id_in_group;
matmul_kern_param.A_ptr = const_cast<src_ctype*>(
ncb_param.filter<src_ctype>(group_id) +
numbers_of_ncb_filter_offset);
matmul_kern_param.B_ptr = const_cast<src_ctype*>(
ncb_param.src<src_ctype>(batch_id, group_id));
matmul_kern_param.workspace_ptr =
reinterpret_cast<int8_t*>(whole_bundle.get(2)) + thread_offset;
matmul_kern_param.workspace_size = matmul_bytes_per_thread;
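//! when the dst is 8-bit the matmul writes into the per-thread 32-bit
//! buffer and PostProcess requantizes it into conv_bias_dst; otherwise the
//! matmul writes to the dst tensor directly.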
bool is_dst_8bit =
(param.src_type.enumv() == DTypeEnum::QuantizedS8 &&
param.dst_type.enumv() == DTypeEnum::QuantizedS8) ||
(param.src_type.enumv() == DTypeEnum::Quantized8Asymm &&
param.dst_type.enumv() == DTypeEnum::Quantized8Asymm);
void* matmul_dst = is_dst_8bit ? matmul_temp_dst : conv_bias_dst;
matmul_kern_param.C_ptr = matmul_dst;
if (pack_mode == MatrixMulImpl::AlgoBase::PackMode::NO_PACK) {
auto matmul_kern = matmul_algo->get_kern(matmul_kern_param);
matmul_kern(matmul_kern_param);
} else {
auto matmul_kern_naked =
matmul_algo->get_kern_naked(matmul_kern_param);
matmul_kern_naked(matmul_kern_param, a_panel, b_panel);
}
//! do postprocess
void* bias_ptr = nullptr;
if (param.bias_mode == megdnn::BiasMode::BIAS)
bias_ptr = static_cast<void*>(const_cast<bias_ctype*>(
ncb_param.bias<bias_ctype>(batch_id, group_id) +
numbers_of_ncb_dst_offset));
else
bias_ptr = static_cast<void*>(const_cast<bias_ctype*>(
ncb_param.bias<bias_ctype>(batch_id, group_id) + oc_start));
PostProcess<op_ctype, op_dtype, postprocess_mode>::run(
matmul_dst, bias_ptr, conv_bias_dst, param.bias_mode,
param.nonlineMode, param.bias_type, param.dst_type, 1_z,
oc_end - oc_start, OH, OW);
}
};
class Conv1x1Factory {
public:
static Conv1x1StrategyBase* make_conv1x1_strategy(
const ConvBiasImpl::NCBKernSizeParam& param,
MatrixMulImpl::AlgoBase::PackMode pack_mode,
param::ConvBias::Format format);
};
} // namespace conv1x1
} // namespace fallback
} // namespace megdnn
@@ -15,6 +15,7 @@
#include "src/common/opr_delegate.h"
#include "src/common/utils.h"
#include "src/fallback/conv_bias/algos.h"
#include "src/fallback/conv_bias/conv1x1/algos.h"
#include "src/fallback/conv_bias/im2col/algos.h"
#include "src/fallback/conv_bias/opr_impl.h"
#include "src/naive/convolution/algorithms.h"
@@ -54,7 +55,13 @@ public:
ohw_tile_size));
all_algos.emplace_back(refhold.back().get());
}
#if 1
for (size_t oc_tile_size : {24, 48}) {
refhold.emplace_back(new AlgoConv1x1(
static_cast<MatrixMulImpl::AlgoBase*>(algo),
oc_tile_size));
all_algos.emplace_back(refhold.back().get());
}
#if 0
//! As these algos may be very slow, they would slow down the fastrun
//! search, so we disable them here and keep them only for the
//! strategyhelper tests.
//! FIXME: find a better way to handle this.
......
@@ -248,6 +248,7 @@ protected:
private:
class AlgoNaive;
class AlgoIm2col;
class AlgoConv1x1;
class AlgoWinogradF32;
class AlgoWinogradF32_4x4;
class AlgoWinogradQS8;
......
@@ -438,7 +438,6 @@ size_t MatrixMulImpl::AlgoInt8x8x32SSEM4N8K2::get_workspace(
m, n, k, trans_a, trans_b, strategy, cacheline)
.get_workspace_size();
}
MEGDNN_REG_GEMM_FUNC_FOR_IM2COL_IMPL_DETAIL(
AlgoInt8x8x32SSEM4N8K2, megdnn_x86_matmul_kern, 9,
x86::matmul::gemm_sse_s8s8s32_4x8x2, dt_int8, dt_int32, dt_int16);
......
@@ -875,6 +875,82 @@ std::vector<conv_bias::TestArg> get_conv_bias_args(
return args;
}
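//! generate test cases for 1x1 filters: dense and group convolutions, with
//! and without bias (broadcast or full shape), several nonlinearities and
//! both convolution modes.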
std::vector<megdnn::test::conv_bias::TestArg> get_conv_bias_1x1_args(
bool no_bias, bool no_nonlinemode, bool quantized_nlmod,
bool only_broadcast_bias) {
using namespace conv_bias;
using Param = param::ConvBias;
using NLMode = param::ConvBias::NonlineMode;
using CONVMode = param::ConvBias::Mode;
std::vector<TestArg> args;
auto pack = [&](size_t n, size_t oc, size_t ic, size_t w, size_t h,
size_t stride, NLMode nlmode, CONVMode convmode) {
Param param;
param.stride_h = stride;
param.stride_w = stride;
param.pad_h = 0;
param.pad_w = 0;
param.mode = convmode;
param.nonlineMode = nlmode;
args.emplace_back(param, TensorShape{n, ic, h, w},
TensorShape{oc, ic, 1, 1}, TensorShape{});
if (!no_bias) {
args.emplace_back(param, TensorShape{n, ic, h, w},
TensorShape{oc, ic, 1, 1},
TensorShape{1, oc, 1, 1});
if (!only_broadcast_bias) {
args.emplace_back(param, TensorShape{n, ic, h, w},
TensorShape{oc, ic, 1, 1},
TensorShape{n, oc, (h - 1) / stride + 1,
(w - 1) / stride + 1});
}
}
param.sparse = param::ConvBias::Sparse::GROUP;
args.emplace_back(param, TensorShape{n, 2 * ic, h, w},
TensorShape{2, oc, ic, 1, 1}, TensorShape{});
if (!no_bias) {
args.emplace_back(param, TensorShape{n, 2 * ic, h, w},
TensorShape{2, oc, ic, 1, 1},
TensorShape{1, 2 * oc, 1, 1});
if (!only_broadcast_bias) {
args.emplace_back(param, TensorShape{n, 2 * ic, h, w},
TensorShape{2, oc, ic, 1, 1},
TensorShape{n, 2 * oc, (h - 1) / stride + 1,
(w - 1) / stride + 1});
}
}
};
std::vector<NLMode> nonlinemode = {NLMode::IDENTITY};
if (!no_nonlinemode) {
nonlinemode.emplace_back(NLMode::RELU);
nonlinemode.emplace_back(NLMode::H_SWISH);
if (!quantized_nlmod) {
nonlinemode.emplace_back(NLMode::SIGMOID);
}
}
std::vector<CONVMode> convmodes{param::ConvBias::Mode::CONVOLUTION,
param::ConvBias::Mode::CROSS_CORRELATION};
for (size_t n : {1, 2})
for (size_t oc : {1, 9, 33})
for (size_t ic : {1, 16, 64})
for (size_t size : {7, 14, 28})
for (auto nlmode : nonlinemode)
for (auto convmode : convmodes) {
pack(n, oc, ic, size, size, 1, nlmode, convmode);
}
return args;
}
void check_conv_bias(std::vector<conv_bias::TestArg> args, Handle* handle,
const char* algo_name) {
using namespace conv_bias;
......
@@ -76,6 +76,10 @@ std::vector<megdnn::test::conv_bias::TestArg> get_conv_bias_args(
bool no_nonlinemode, bool quantized_nlmod = false,
bool only_broadcast_bias = false);
std::vector<megdnn::test::conv_bias::TestArg> get_conv_bias_1x1_args(
bool no_bias, bool no_nonlinemode, bool quantized_nlmod = false,
bool only_broadcast_bias = false);
void check_conv_bias(std::vector<megdnn::test::conv_bias::TestArg> args,
megdnn::Handle* handle, const char* algo_name);
......
@@ -919,6 +919,79 @@ TEST_F(X86_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_FP32_PACKA) {
#undef cb
}
/**************************** Conv1x1 PackA *************************/
namespace {
void checker_conv_bias(std::vector<conv_bias::TestArg> args, Handle* handle,
RNG* rng, float epsilon, DType type0, DType type1,
DType type2, DType type3, const char* algo_name) {
using namespace conv_bias;
Checker<ConvBias> checker(handle);
checker.set_before_exec_callback(
conv_bias::ConvBiasAlgoChecker<ConvBias>(algo_name));
checker.set_dtype(0, type0);
checker.set_dtype(1, type1);
checker.set_dtype(2, type2);
checker.set_dtype(4, type3);
checker.set_epsilon(epsilon);
if (NULL != rng) {
checker.set_rng(0, rng).set_rng(1, rng).set_rng(2, rng).set_rng(3, rng);
}
for (auto&& arg : args) {
checker.set_param(arg.param).execs(
{arg.src, arg.filter, arg.bias, {}, {}});
}
}
} // namespace
#if MEGDNN_X86_WITH_MKL
TEST_F(X86_MULTI_THREADS, CONV_BIAS_CONV1X1_S1_FP32_PACKA) {
using namespace conv_bias;
std::vector<conv_bias::TestArg> args = get_conv_bias_1x1_args(false, false);
check_conv_bias(args, handle(), "CONV1x1:X86_F32_MKL_PACKA:24");
}
TEST_F(X86_MULTI_THREADS, CONV_BIAS_CONV1X1_S1_FP32_BLAS) {
using namespace conv_bias;
std::vector<conv_bias::TestArg> args = get_conv_bias_1x1_args(false, false);
check_conv_bias(args, handle(), "CONV1x1:X86_F32_BLAS:48");
}
#endif
TEST_F(X86_MULTI_THREADS, CONV_BIAS_CONV1X1_S1_INT8X8X32) {
using namespace conv_bias;
UniformIntRNG rng{-50, 50};
float epsilon = 0.001;
std::vector<conv_bias::TestArg> args = get_conv_bias_1x1_args(true, true);
#if MEGDNN_X86_WITH_MKL_DNN
if (x86::is_supported(x86::SIMDType::VNNI)) {
checker_conv_bias(args, handle(), &rng, epsilon, dtype::Int8{},
dtype::Int8{}, dtype::Int32{}, dtype::Int32{},
"CONV1x1:X86_INT8X8X32_MKLDNN:24");
}
#endif
#if MEGDNN_X86_WITH_VNNI
if (x86::is_supported(x86::SIMDType::VNNI)) {
checker_conv_bias(args, handle(), &rng, epsilon, dtype::Int8{},
dtype::Int8{}, dtype::Int32{}, dtype::Int32{},
"CONV1x1:X86_INT8X8X32_VNNI:24");
}
#endif
if (x86::is_supported(x86::SIMDType::AVX2)) {
checker_conv_bias(args, handle(), &rng, epsilon, dtype::Int8{},
dtype::Int8{}, dtype::Int32{}, dtype::Int32{},
"CONV1x1:X86_INT8X8X32_AVX2_4X16X2:24");
checker_conv_bias(args, handle(), &rng, epsilon, dtype::Int8{},
dtype::Int8{}, dtype::Int32{}, dtype::Int32{},
"CONV1x1:X86_INT8X8X32_AVX2_2X4X16:24");
}
checker_conv_bias(args, handle(), &rng, epsilon, dtype::Int8{},
dtype::Int8{}, dtype::Int32{}, dtype::Int32{},
"CONV1x1:X86_INT8X8X32_SSE_4X8X2:48");
}
/************************* End Conv1x1 PackA ************************/
#endif
TEST_F(X86_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_QINT8) {
......