提交 90ca8554 编写于 作者: M Megvii Engine Team

feat(dnn/x86): add avx2 int8 stride1 chanwise multithread conv

GitOrigin-RevId: 8f310c3d139dfc27a4083f354597363681b73ba5
上级 0bdb64c5
...@@ -40,6 +40,15 @@ ...@@ -40,6 +40,15 @@
UNROLL_RAW16(cb, v0, ##a) \ UNROLL_RAW16(cb, v0, ##a) \
cb(16, ##a) cb(17, ##a) cb(18, ##a) cb(19, ##a) cb(20, ##a) cb(21, ##a) \ cb(16, ##a) cb(17, ##a) cb(18, ##a) cb(19, ##a) cb(20, ##a) cb(21, ##a) \
cb(22, ##a) cb(23, ##a) cb(22, ##a) cb(23, ##a)
// Unroll helper: appends cb(24, ...) to whatever UNROLL_RAW24 expands to.
// The optional trailing arguments `a` are forwarded to every cb invocation;
// `v0` is only passed through to UNROLL_RAW24.
// Selected by token pasting as UNROLL_RAW##step when step == 25.
#define UNROLL_RAW25(cb, v0, a...) \
UNROLL_RAW24(cb, v0, ##a) \
cb(24, ##a)
// Unroll helper: extends UNROLL_RAW25 with cb(25, ...) through cb(48, ...).
// The optional trailing arguments `a` are forwarded to every cb invocation;
// `v0` is only passed through to UNROLL_RAW25.
// Selected by token pasting as UNROLL_RAW##step when step == 49
// (presumably sized for a 7x7 filter, i.e. 49 taps -- TODO confirm).
#define UNROLL_RAW49(cb, v0, a...) \
UNROLL_RAW25(cb, v0, ##a) \
cb(25, ##a) cb(26, ##a) cb(27, ##a) cb(28, ##a) cb(29, ##a) cb(30, ##a) \
cb(31, ##a) cb(32, ##a) cb(33, ##a) cb(34, ##a) cb(35, ##a) cb(36, ##a) \
cb(37, ##a) cb(38, ##a) cb(39, ##a) cb(40, ##a) cb(41, ##a) cb(42, ##a) \
cb(43, ##a) cb(44, ##a) cb(45, ##a) cb(46, ##a) cb(47, ##a) cb(48, ##a)
#define UNROLL_CALL0(step, cb, v...) UNROLL_RAW##step(cb, 0, ##v) #define UNROLL_CALL0(step, cb, v...) UNROLL_RAW##step(cb, 0, ##v)
#define UNROLL_CALL1(step, cb, v...) UNROLL_CALL0(step, cb, ##v) #define UNROLL_CALL1(step, cb, v...) UNROLL_CALL0(step, cb, ##v)
......
...@@ -15,6 +15,7 @@ ...@@ -15,6 +15,7 @@
#include "src/fallback/convolution/img2col_helper.h" #include "src/fallback/convolution/img2col_helper.h"
#include "src/x86/conv_bias/int8/avx2_direct_conv_stride1.h" #include "src/x86/conv_bias/int8/avx2_direct_conv_stride1.h"
#include "src/x86/conv_bias/int8/avx2_direct_conv_stride2.h" #include "src/x86/conv_bias/int8/avx2_direct_conv_stride2.h"
#include "src/x86/conv_bias/int8/avx2_chanwise_stride1.h"
#include "src/x86/conv_bias/opr_impl.h" #include "src/x86/conv_bias/opr_impl.h"
#include "src/x86/conv_bias/postprocess_helper.h" #include "src/x86/conv_bias/postprocess_helper.h"
#include "src/x86/handle.h" #include "src/x86/handle.h"
...@@ -31,6 +32,65 @@ using namespace dnnl; ...@@ -31,6 +32,65 @@ using namespace dnnl;
using namespace megdnn; using namespace megdnn;
using namespace x86; using namespace x86;
/**
 * \brief Decide whether the AVX2 stride-1 channel-wise int8 kernels can run
 * the convolution described by \p param.
 *
 * Accepted configurations:
 *  - dtypes: QuantizedS8 x QuantizedS8 -> QuantizedS8 or QuantizedS32, or
 *    Int8 x Int8 -> Int32;
 *  - NCHW format, 2D, dilation 1 and stride 1 in both directions;
 *  - channel-wise groups (icpg == 1 and ocpg == 1);
 *  - square 2x2, 3x3, 5x5 or 7x7 filters (only square kernels are
 *    instantiated, so the filter width must equal the filter height);
 *  - a bias mode and nonlinearity that the kernel dispatch in
 *    avx2_chanwise_stride1::get_kimpls() handles; anything else would hit
 *    megdnn_assert(0) there;
 *  - a CPU supporting AVX2.
 *
 * \param param convolution description to test
 * \return true if this algorithm may be selected for \p param
 */
bool ConvBiasImpl::AlgoChanWiseAvx2Stride1Qint8::usable(
        FallbackConvBiasImpl* /*opr*/, const NCBKernSizeParam& param,
        AlgoSelectionStrategy /*algo_selection_strategy*/) const {
    auto&& fm = param.filter_meta;
    const auto FH = fm.spatial[0];
    const auto FW = fm.spatial[1];

    const auto src_enum = param.src_type.enumv();
    const auto filter_enum = param.filter_type.enumv();
    const auto dst_enum = param.dst_type.enumv();

    const bool quantized_input = src_enum == DTypeEnum::QuantizedS8 &&
                                 filter_enum == DTypeEnum::QuantizedS8;
    const bool plain_int8_input =
            src_enum == DTypeEnum::Int8 && filter_enum == DTypeEnum::Int8;
    const bool dtype_ok =
            (quantized_input && (dst_enum == DTypeEnum::QuantizedS8 ||
                                 dst_enum == DTypeEnum::QuantizedS32)) ||
            (plain_int8_input && dst_enum == DTypeEnum::Int32);

    // The kernel dispatch only handles these bias modes.
    const bool bias_ok =
            param.bias_mode == BiasMode::NO_BIAS ||
            param.bias_mode == BiasMode::BROADCAST_CHANNEL_BIAS;

    // The kernel dispatch only handles these nonlinearities.
    const bool nonline_ok =
            param.nonlineMode == param::ConvBias::NonlineMode::IDENTITY ||
            param.nonlineMode == param::ConvBias::NonlineMode::RELU ||
            param.nonlineMode == param::ConvBias::NonlineMode::H_SWISH;

    // Kernels exist only for square filters of these sizes.
    const bool filter_ok =
            FW == FH && (FH == 2 || FH == 3 || FH == 5 || FH == 7);

    const bool layout_ok = fm.format == Param::Format::NCHW &&
                           fm.spatial_ndim == 2 && fm.dilation[0] == 1 &&
                           fm.dilation[1] == 1 && fm.stride[0] == 1 &&
                           fm.stride[1] == 1 && fm.icpg == 1 && fm.ocpg == 1;

    // The CPU feature query stays last so it only runs for otherwise
    // matching configurations.
    return dtype_ok && bias_ok && nonline_ok && layout_ok && filter_ok &&
           is_supported(SIMDType::AVX2);
}
/**
 * \brief Describe the per-thread scratch buffers this algorithm needs.
 *
 * The bundle has up to three entries, each sized for param.nr_threads
 * independent slots:
 *  0. padded source plane (IH2 x IW2 int8), or 0 bytes if no source copy;
 *  1. width-padded destination plane (OH2 x OW2 dst elements), or 0 bytes
 *     if no destination copy;
 *  2. int32 accumulation plane (OH2 x OW2), present only when the
 *     destination dtype is QuantizedS8.
 *
 * \param param convolution description
 * \return a WorkspaceBundle with a null base pointer and the sizes above
 */
WorkspaceBundle ConvBiasImpl::AlgoChanWiseAvx2Stride1Qint8::get_bundle(
        const NCBKernSizeParam& param) {
    size_t IH2, IW2, OH2, OW2;
    avx2_chanwise_stride1::get_rectified_size(param, IH2, IW2, OH2, OW2);

    const size_t thread_count = param.nr_threads;

    const size_t src_bytes =
            avx2_chanwise_stride1::need_src_copy(param)
                    ? IH2 * IW2 * sizeof(int8_t) * thread_count
                    : 0;
    const size_t dst_bytes =
            avx2_chanwise_stride1::need_dst_copy(param)
                    ? OH2 * OW2 * param.dst_type.size() * thread_count
                    : 0;

    // Only the QuantizedS8 output path needs an int32 staging plane.
    if (param.dst_type.enumv() != DTypeEnum::QuantizedS8) {
        return WorkspaceBundle(nullptr, {src_bytes, dst_bytes});
    }
    const size_t acc_bytes = OH2 * OW2 * sizeof(int32_t) * thread_count;
    return WorkspaceBundle(nullptr, {src_bytes, dst_bytes, acc_bytes});
}
// Total workspace in bytes: the sum of all entries of the bundle built by
// get_bundle() for this parameter set. The operator argument is unused.
size_t ConvBiasImpl::AlgoChanWiseAvx2Stride1Qint8::get_workspace(
FallbackConvBiasImpl*, const NCBKernSizeParam& param) const {
return get_bundle(param).total_size_in_bytes();
}
// Build the kernel list for this parameter set: compute the workspace layout
// with get_bundle() and hand both to avx2_chanwise_stride1::get_kimpls(),
// which selects the concrete kernel.
SmallVector<fallback::ConvBiasImpl::NCBKern>
ConvBiasImpl::AlgoChanWiseAvx2Stride1Qint8::get_kimpls(
const NCBKernSizeParam& param) const {
auto bundle = get_bundle(param);
return avx2_chanwise_stride1::get_kimpls(param, bundle);
}
bool ConvBiasImpl::AlgoDirectAvx2Stride1Int8::usable( bool ConvBiasImpl::AlgoDirectAvx2Stride1Int8::usable(
FallbackConvBiasImpl* /*opr*/, const NCBKernSizeParam& param, FallbackConvBiasImpl* /*opr*/, const NCBKernSizeParam& param,
AlgoSelectionStrategy /*algo_selection_strategy*/) const { AlgoSelectionStrategy /*algo_selection_strategy*/) const {
......
...@@ -13,6 +13,29 @@ ...@@ -13,6 +13,29 @@
namespace megdnn { namespace megdnn {
namespace x86 { namespace x86 {
/* ===================== avx2 stride1 chanwise algo ===================== */
// Channel-wise (depthwise) int8 convolution algorithm with stride 1,
// implemented with AVX2 kernels. See usable() for the accepted
// configurations.
class ConvBiasImpl::AlgoChanWiseAvx2Stride1Qint8 final : public AlgoBase {
// Builds the kernels executed for `param`.
SmallVector<NCBKern> get_kimpls(const NCBKernSizeParam& param) const;
// Computes the workspace layout (scratch buffer sizes) for `param`.
static WorkspaceBundle get_bundle(const NCBKernSizeParam& param);
public:
bool is_reproducible() const override { return true; }
// Name under which this algorithm is registered; the tests select it by
// this exact string.
const char* name() const override {
return "X86_CONV_BIAS_CHANWISE_AVX2_INT8_STRIDE1";
}
// Returns true if this algorithm can handle `param`.
bool usable(FallbackConvBiasImpl* opr, const NCBKernSizeParam& param,
AlgoSelectionStrategy algo_selection_strategy) const override;
// Returns the total workspace size in bytes required for `param`.
size_t get_workspace(FallbackConvBiasImpl* opr,
const NCBKernSizeParam& param) const override;
// Returns the kernels to dispatch; simply forwards to get_kimpls().
virtual SmallVector<NCBKern> dispatch_kerns(
fallback::ConvBiasImpl*,
const NCBKernSizeParam& param) const override {
return get_kimpls(param);
}
// Algorithm type tag; defined out of line.
void* type() const override;
};
/* ===================== avx2 stride1 direct algo ===================== */ /* ===================== avx2 stride1 direct algo ===================== */
class ConvBiasImpl::AlgoDirectAvx2Stride1Int8 final : public AlgoBase { class ConvBiasImpl::AlgoDirectAvx2Stride1Int8 final : public AlgoBase {
SmallVector<NCBKern> get_kimpls(const NCBKernSizeParam& param) const; SmallVector<NCBKern> get_kimpls(const NCBKernSizeParam& param) const;
......
此差异已折叠。
/**
* \file src/x86/conv_bias/int8/avx2_chanwise_kern.h
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#pragma once
#include "src/x86/conv_bias/opr_impl.h"
namespace megdnn {
namespace x86 {
namespace avx2_chanwise_stride1 {
// Declares the AVX2 channel-wise direct convolution kernel
// avx2_chanwise_direct_<stride>_<i>x<i>_int8 for an i x i filter.
//
// Template parameters: bias_mode selects bias handling, is_quantized selects
// whether the result is post-processed to int8, Op is the post-process
// functor passed as `op`.
//
// Arguments, as used by the caller in avx2_chanwise_stride1.cpp:
//   src / filter / bias  input plane, filter taps and bias values;
//   temp                 int32 buffer: the final output in the non-quantized
//                        case, scratch in the quantized case;
//   dst                  int8 output in the quantized case, nullptr otherwise;
//   IH, IW, OH, OW       the rectified (padded) plane sizes.
// The kernel definitions are not part of this header.
#define KERN(stride, i) \
template <BiasMode bias_mode, bool is_quantized, typename Op> \
MEGDNN_ATTRIBUTE_TARGET("avx2") \
void avx2_chanwise_direct_##stride##_##i##x##i##_int8( \
const int8_t* src, const int8_t* filter, const int32_t* bias, \
int32_t* temp, int8_t* dst, const size_t IH, const size_t IW, \
const size_t OH, const size_t OW, const Op& op);
// One declaration per supported square filter size.
KERN(stride1, 2)
KERN(stride1, 3)
KERN(stride1, 5)
KERN(stride1, 7)
#undef KERN
} // namespace avx2_chanwise_stride1
} // namespace x86
} // namespace megdnn
// vim: syntax=cpp.doxygen
/**
* \file src/x86/conv_bias/int8/avx2_chanwise_stride1.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#include "src/x86/conv_bias/int8/avx2_chanwise_stride1.h"
#include "src/x86/conv_bias/int8/avx2_chanwise_kern.h"
#include "src/x86/elemwise_op.h"
namespace megdnn {
namespace x86 {
namespace avx2_chanwise_stride1 {
//! Whether the output must be staged in a width-padded scratch buffer:
//! true when the output width is not a multiple of 16.
bool need_dst_copy(const NCBKernSizeParam& param) {
    const auto out_width = param.osz[1];
    return (out_width % 16) != 0;
}
//! Whether the input must be copied into a padded scratch buffer: true when
//! the convolution has any spatial padding, or when the output itself needs
//! a padded staging buffer (see need_dst_copy()).
bool need_src_copy(const NCBKernSizeParam& param) {
    const auto& meta = param.filter_meta;
    const bool has_padding = meta.padding[0] != 0 || meta.padding[1] != 0;
    if (has_padding) {
        return true;
    }
    return need_dst_copy(param);
}
/**
 * \brief Compute the plane sizes the kernels actually operate on.
 *
 * The output width is rounded up to a multiple of 16 and the input size is
 * derived from it so that every output element has a full filter window.
 *
 * \param param convolution description
 * \param[out] IH2 input height covered by the kernel
 * \param[out] IW2 input width covered by the kernel
 * \param[out] OH2 output height (unchanged from param.osz[0])
 * \param[out] OW2 output width rounded up to a multiple of 16
 */
void get_rectified_size(const NCBKernSizeParam& param, size_t& IH2, size_t& IW2,
                        size_t& OH2, size_t& OW2) {
    auto&& fm = param.filter_meta;
    auto SH = fm.stride[0];
    auto SW = fm.stride[1];
    auto OH = param.osz[0];
    auto OW = param.osz[1];
    auto FH = fm.spatial[0];
    auto FW = fm.spatial[1];

    OH2 = OH;
    OW2 = (OW + 15) & ~15;
    // Input extent needed for N outputs: stride * (N - 1) + filter.
    // The height uses the vertical stride; this matches the previous result
    // whenever both strides are equal.
    IH2 = SH * OH + FH - SH;
    IW2 = SW * OW2 + FW - SW;
}
/**
 * \brief Copy one source plane into the calling thread's zero-padded scratch
 * slot, if a source copy is required.
 *
 * The slot is entry 0 of \p bundle, indexed by ncb_index.thread_id and sized
 * IH2 x IW2 (see get_rectified_size()). It is cleared first, then each input
 * row is placed at row offset PH and column offset PW.
 *
 * \param bundle workspace layout; its base pointer is set from kern_param
 * \param kern_param runtime convolution parameters
 * \param ncb_index indices of the current work item and executing thread
 */
void copy_padding_kern(WorkspaceBundle bundle,
                       const ConvBiasImpl::NCBKernParam& kern_param,
                       const ConvBiasImpl::NCBKernIndex& ncb_index) {
    size_t IH2, IW2, OH2, OW2;
    get_rectified_size(kern_param, IH2, IW2, OH2, OW2);
    const bool copy_required = need_src_copy(kern_param);

    bundle.set(kern_param.workspace_ptr);

    const size_t group_id = ncb_index.ndrange_id[0];
    const size_t batch_id = ncb_index.ndrange_id[1];
    const size_t channel_id = ncb_index.ndrange_id[2];
    const int8_t* src_plane =
            kern_param.src<int8_t>(batch_id, group_id, channel_id);

    if (!copy_required) {
        return;
    }

    const size_t src_h = kern_param.isz[0];
    const size_t src_w = kern_param.isz[1];
    const size_t pad_h = kern_param.filter_meta.padding[0];
    const size_t pad_w = kern_param.filter_meta.padding[1];
    const size_t slot_elems = IH2 * IW2;

    // Each thread owns one slot so concurrent work items do not overlap.
    int8_t* padded = static_cast<int8_t*>(bundle.get(0)) +
                     ncb_index.thread_id * slot_elems;
    std::memset(padded, 0, sizeof(int8_t) * IH2 * IW2);
    for (size_t row = 0; row < src_h; ++row) {
        std::memcpy(padded + (row + pad_h) * IW2 + pad_w,
                    src_plane + row * src_w, sizeof(int8_t) * src_w);
    }
}
/**
 * \brief Run the channel-wise stride-1 kernel for one (group, batch) plane.
 *
 * Picks the source (original tensor or the thread's padded scratch slot) and
 * the destination (output tensor or the thread's width-padded scratch slot),
 * invokes the filter-size specific AVX2 kernel, and finally copies the valid
 * OW columns of every row back to the output tensor when a scratch
 * destination was used.
 *
 * \tparam filter filter size (2, 3, 5 or 7)
 * \tparam bias_mode bias handling forwarded to the kernel
 * \tparam is_quantized unused in this body; the kernel variant is chosen at
 *         runtime from the destination dtype
 * \tparam Op post-process functor type
 * \param bundle workspace layout; its base pointer is set from kern_param
 * \param kern_param runtime convolution parameters
 * \param ncb_index indices of the current work item and executing thread
 */
template <size_t filter, BiasMode bias_mode, bool is_quantized, typename Op>
void conv_kimpl(WorkspaceBundle bundle, const NCBKernParam& kern_param,
                const NCBKernIndex& ncb_index) {
    size_t OH = kern_param.osz[0];
    size_t OW = kern_param.osz[1];
    size_t IH2, IW2, OH2, OW2;
    get_rectified_size(kern_param, IH2, IW2, OH2, OW2);
    bool need_src_copy_var = need_src_copy(kern_param);
    bool need_dst_copy_var = need_dst_copy(kern_param);
    bool need_post_process =
            kern_param.dst_type.enumv() == DTypeEnum::QuantizedS8;

    // Placeholder scales; replaced below when the output is QuantizedS8.
    Op op = Op(1.0f, 4.0f);
    if (need_post_process) {
        float scale_bias =
                kern_param.bias_type.param<dtype::QuantizedS32>().scale;
        float scale_dst = kern_param.dst_type.param<dtype::QuantizedS8>().scale;
        op = Op(scale_bias, scale_dst);
    }

    const size_t padding_group_size = IH2 * IW2;
    const size_t out_plane_elems = OH2 * OW2;
    const size_t dst_elem_bytes = kern_param.dst_type.size();
    const size_t workspace_group_id = ncb_index.thread_id;

    bundle.set(kern_param.workspace_ptr);

    size_t group_id = ncb_index.ndrange_id[0],
           batch_id = ncb_index.ndrange_id[1];

    const int8_t* sptr = kern_param.src<dt_int8>(batch_id, group_id);
    const int8_t* fptr = kern_param.filter<dt_int8>(group_id);
    void* dst = kern_param.dst<void>(batch_id, group_id);
    const int32_t* bptr = kern_param.bias<dt_int32>(batch_id, group_id);

    if (need_src_copy_var) {
        // Read from this thread's padded scratch slot instead of the tensor.
        sptr = static_cast<int8_t*>(bundle.get(0)) +
               workspace_group_id * padding_group_size;
    }

    void* dptr = nullptr;
    int32_t* tptr = nullptr;
    if (need_dst_copy_var) {
        // Byte-wise offset: the element size depends on the output dtype.
        dptr = static_cast<int8_t*>(bundle.get(1)) +
               workspace_group_id * out_plane_elems * dst_elem_bytes;
    } else {
        dptr = dst;
    }

#define KERN_NEED_POST_PROCESS(filter)                                        \
    avx2_chanwise_direct_stride1_##filter##x##filter##_int8<bias_mode, true,  \
                                                            Op>(              \
            sptr, fptr, bptr, tptr, static_cast<int8_t*>(dptr), IH2, IW2,     \
            OH2, OW2, op)

#define KERN_NO_POST_PROCESS(filter)                                          \
    avx2_chanwise_direct_stride1_##filter##x##filter##_int8<bias_mode, false, \
                                                            Op>(              \
            sptr, fptr, bptr, static_cast<int32_t*>(dptr), nullptr, IH2, IW2, \
            OH2, OW2, op)

    if (need_post_process) {
        // tptr is an int32_t*, so the offset is counted in int32 elements;
        // one slot of OH2 * OW2 elements per thread, as sized in the
        // workspace bundle.
        tptr = static_cast<int32_t*>(bundle.get(2)) +
               workspace_group_id * out_plane_elems;
        DISPATCH_FILTER(filter, KERN_NEED_POST_PROCESS)
    } else {
        DISPATCH_FILTER(filter, KERN_NO_POST_PROCESS)
    }
#undef KERN_NEED_POST_PROCESS
#undef KERN_NO_POST_PROCESS

    if (need_dst_copy_var) {
        // Strip the width padding: copy OW valid elements of each OW2 row.
        int8_t* out_bytes = static_cast<int8_t*>(dst);
        const int8_t* staged_bytes = static_cast<const int8_t*>(dptr);
        for (size_t oh = 0; oh < OH; ++oh) {
            std::memcpy(out_bytes + oh * OW * dst_elem_bytes,
                        staged_bytes + oh * OW2 * dst_elem_bytes,
                        dst_elem_bytes * OW);
        }
    }
}
/**
 * \brief Select the kernel matching \p kern_param and wrap it for dispatch.
 *
 * The conv_kimpl instantiation is chosen from, in order: filter size
 * (2/3/5/7), destination dtype (QuantizedS8 selects the quantized variant,
 * QuantizedS32 and Int32 the non-quantized one), bias mode (NO_BIAS or
 * BROADCAST_CHANNEL_BIAS) and nonlinearity (IDENTITY, RELU or H_SWISH). Any
 * other value triggers megdnn_assert(0).
 *
 * \param kern_param convolution description
 * \param bundle workspace layout captured by the returned kernel
 * \return a single kernel with ndrange {group, n, 1}; each invocation first
 *         pads the source plane and then runs the convolution
 */
SmallVector<NCBKern> get_kimpls(const NCBKernSizeParam& kern_param,
                                WorkspaceBundle bundle) {
    const size_t group = kern_param.filter_meta.group;
    const size_t n = kern_param.n;
    SmallVector<NCBKern> ncb_kerns;
    conv_fun do_conv_fun = nullptr;

#define DO_CONV_KERN_FUN(filter, bias_mode, is_quantized, op) \
    do_conv_fun = conv_kimpl<filter, bias_mode, is_quantized, op>;

#define GET_OP_PARAM(i, bias_mode, is_quantized)                            \
    switch (kern_param.nonlineMode) {                                       \
        case param::ConvBias::NonlineMode::IDENTITY:                        \
            DO_CONV_KERN_FUN(i, bias_mode, is_quantized,                    \
                             TypeCvtOp<SIMDType::AVX2 MEGDNN_COMMA dt_qint32 \
                                               MEGDNN_COMMA dt_qint8>)      \
            break;                                                          \
        case param::ConvBias::NonlineMode::RELU:                            \
            DO_CONV_KERN_FUN(i, bias_mode, is_quantized,                    \
                             ReluOp<SIMDType::AVX2 MEGDNN_COMMA dt_qint32   \
                                            MEGDNN_COMMA dt_qint8>)         \
            break;                                                          \
        case param::ConvBias::NonlineMode::H_SWISH:                         \
            DO_CONV_KERN_FUN(i, bias_mode, is_quantized,                    \
                             HSwishOp<SIMDType::AVX2 MEGDNN_COMMA dt_qint32 \
                                              MEGDNN_COMMA dt_qint8>)       \
            break;                                                          \
        default:                                                            \
            megdnn_assert(0);                                               \
            break;                                                          \
    }

#define GET_BIAS_MODE_PARAM(i, is_quantized)                                \
    switch (kern_param.bias_mode) {                                         \
        case BiasMode::NO_BIAS:                                             \
            GET_OP_PARAM(i, BiasMode::NO_BIAS, is_quantized)                \
            break;                                                          \
        case BiasMode::BROADCAST_CHANNEL_BIAS:                              \
            GET_OP_PARAM(i, BiasMode::BROADCAST_CHANNEL_BIAS, is_quantized) \
            break;                                                          \
        default:                                                            \
            megdnn_assert(0);                                               \
            break;                                                          \
    }

#define GET_QUANTIZED(i)                       \
    switch (kern_param.dst_type.enumv()) {     \
        case DTypeEnum::QuantizedS8:           \
            GET_BIAS_MODE_PARAM(i, true)       \
            break;                             \
        case DTypeEnum::QuantizedS32:          \
            GET_BIAS_MODE_PARAM(i, false)      \
            break;                             \
        case DTypeEnum::Int32:                 \
            GET_BIAS_MODE_PARAM(i, false)      \
            break;                             \
        default:                               \
            megdnn_assert(0);                  \
            break;                             \
    }

#define DISPATCH_CONV_KERN()                         \
    switch (kern_param.filter_meta.spatial[0]) {     \
        case 2:                                      \
            GET_QUANTIZED(2)                         \
            break;                                   \
        case 3:                                      \
            GET_QUANTIZED(3)                         \
            break;                                   \
        case 5:                                      \
            GET_QUANTIZED(5)                         \
            break;                                   \
        case 7:                                      \
            GET_QUANTIZED(7)                         \
            break;                                   \
        default:                                     \
            megdnn_assert(0);                        \
            break;                                   \
    }

    DISPATCH_CONV_KERN();

// The dispatch macros are local helpers; do not let them leak into the rest
// of the translation unit.
#undef DISPATCH_CONV_KERN
#undef GET_QUANTIZED
#undef GET_BIAS_MODE_PARAM
#undef GET_OP_PARAM
#undef DO_CONV_KERN_FUN

    // Padding and convolution run back to back in the same invocation, so
    // both use the scratch slot of the same executing thread.
    auto exec_one_group = [bundle, do_conv_fun](const NCBKernParam& kern_param,
                                                const NCBKernIndex& ncb_index) {
        copy_padding_kern(bundle, kern_param, ncb_index);
        do_conv_fun(bundle, kern_param, ncb_index);
    };
    ncb_kerns.push_back({exec_one_group, {group, n, 1_z}});
    return ncb_kerns;
}
} // namespace avx2_chanwise_stride1
} // namespace x86
} // namespace megdnn
// vim: syntax=cpp.doxygen
/**
* \file src/x86/conv_bias/int8/avx2_chanwise_stride1.h
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#pragma once
#include "src/x86/conv_bias/opr_impl.h"
namespace megdnn {
namespace x86 {
namespace avx2_chanwise_stride1 {
// Short aliases for the fallback ConvBias kernel types used by this algorithm.
using NCBKern = fallback::ConvBiasImpl::NCBKern;
using NCBKernSizeParam = fallback::ConvBiasImpl::NCBKernSizeParam;
using NCBKernParam = fallback::ConvBiasImpl::NCBKernParam;
using NCBKernIndex = fallback::ConvBiasImpl::NCBKernIndex;
// Signature of the type-erased convolution routine selected at dispatch time.
using conv_fun = std::function<void(WorkspaceBundle bundle,
const NCBKernParam& kern_param,
const NCBKernIndex& ncb_index)>;
// True when the output must be staged in a width-padded scratch buffer.
bool need_dst_copy(const NCBKernSizeParam& param);
// True when the input must be copied into a padded scratch buffer.
bool need_src_copy(const NCBKernSizeParam& param);
// Computes the plane sizes the kernels operate on: IH2/IW2 for the input,
// OH2/OW2 for the output.
void get_rectified_size(const NCBKernSizeParam& param, size_t& IH2, size_t& IW2,
size_t& OH2, size_t& OW2);
// Returns the kernels to run for `param`, using `bundle` as workspace layout.
SmallVector<NCBKern> get_kimpls(const NCBKernSizeParam& param,
WorkspaceBundle bundle);
} // namespace avx2_chanwise_stride1
} // namespace x86
} // namespace megdnn
// vim: syntax=cpp.doxygen
...@@ -10,7 +10,6 @@ ...@@ -10,7 +10,6 @@
*/ */
#include "src/x86/conv_bias/int8/avx2_direct_conv_stride1.h" #include "src/x86/conv_bias/int8/avx2_direct_conv_stride1.h"
#include "src/common/unroll_macro.h"
#include "src/x86/conv_bias/int8/common_helper.h" #include "src/x86/conv_bias/int8/common_helper.h"
#include "src/x86/conv_bias/postprocess_helper.h" #include "src/x86/conv_bias/postprocess_helper.h"
......
...@@ -10,7 +10,6 @@ ...@@ -10,7 +10,6 @@
*/ */
#include "src/x86/conv_bias/int8/avx2_direct_conv_stride2.h" #include "src/x86/conv_bias/int8/avx2_direct_conv_stride2.h"
#include "src/common/unroll_macro.h"
#include "src/x86/conv_bias/int8/common_helper.h" #include "src/x86/conv_bias/int8/common_helper.h"
#include "src/x86/conv_bias/postprocess_helper.h" #include "src/x86/conv_bias/postprocess_helper.h"
......
...@@ -11,6 +11,7 @@ ...@@ -11,6 +11,7 @@
#pragma once #pragma once
#include <immintrin.h> #include <immintrin.h>
#include "src/common/unroll_macro.h"
#include "megdnn/arch.h" #include "megdnn/arch.h"
#ifdef WIN32CMAKE #ifdef WIN32CMAKE
#include <smmintrin.h> #include <smmintrin.h>
......
...@@ -65,6 +65,10 @@ void* ConvBiasImpl::AlgoAVX2DirectConvStride2::type() const { ...@@ -65,6 +65,10 @@ void* ConvBiasImpl::AlgoAVX2DirectConvStride2::type() const {
return x86_algo_type; return x86_algo_type;
} }
// Returns the same x86_algo_type tag as the other x86 ConvBias algorithms.
void* ConvBiasImpl::AlgoChanWiseAvx2Stride1Qint8::type() const {
return x86_algo_type;
}
class ConvBiasImpl::AlgoPack : NonCopyableObj { class ConvBiasImpl::AlgoPack : NonCopyableObj {
AlgoDirect stride1_direct_large_group{true}; AlgoDirect stride1_direct_large_group{true};
AlgoDirect stride1_direct_small_group{false}; AlgoDirect stride1_direct_small_group{false};
...@@ -72,6 +76,7 @@ class ConvBiasImpl::AlgoPack : NonCopyableObj { ...@@ -72,6 +76,7 @@ class ConvBiasImpl::AlgoPack : NonCopyableObj {
AlgoDirectStride2 stride2_direct_small_group{false}; AlgoDirectStride2 stride2_direct_small_group{false};
AlgoDirectAvx2Stride1Int8 avx2_stride1_direct_int8; AlgoDirectAvx2Stride1Int8 avx2_stride1_direct_int8;
AlgoAVX2DirectConvStride2 avx2_stride2_direct; AlgoAVX2DirectConvStride2 avx2_stride2_direct;
AlgoChanWiseAvx2Stride1Qint8 avx2_stride1_chanwsie_qint8;
AlgoMatrixMul matmul; AlgoMatrixMul matmul;
#if defined(MEGDNN_X86_WITH_MKL_DNN) #if defined(MEGDNN_X86_WITH_MKL_DNN)
AlgoMkldnnMatmulQint8 mkldnn_matmul_qint8; AlgoMkldnnMatmulQint8 mkldnn_matmul_qint8;
...@@ -94,6 +99,7 @@ public: ...@@ -94,6 +99,7 @@ public:
all_algos.emplace_back(&stride2_direct_small_group); all_algos.emplace_back(&stride2_direct_small_group);
all_algos.emplace_back(&avx2_stride1_direct_int8); all_algos.emplace_back(&avx2_stride1_direct_int8);
all_algos.emplace_back(&avx2_stride2_direct); all_algos.emplace_back(&avx2_stride2_direct);
all_algos.emplace_back(&avx2_stride1_chanwsie_qint8);
all_algos.emplace_back(&matmul); all_algos.emplace_back(&matmul);
static CpuOprDelegationStorage<> storage; static CpuOprDelegationStorage<> storage;
......
...@@ -31,6 +31,7 @@ public: ...@@ -31,6 +31,7 @@ public:
class AlgoMatrixMul; class AlgoMatrixMul;
class AlgoDirectAvx2Stride1Int8; class AlgoDirectAvx2Stride1Int8;
class AlgoAVX2DirectConvStride2; class AlgoAVX2DirectConvStride2;
class AlgoChanWiseAvx2Stride1Qint8;
#if defined(MEGDNN_X86_WITH_MKL_DNN) #if defined(MEGDNN_X86_WITH_MKL_DNN)
class AlgoMkldnnConv; class AlgoMkldnnConv;
class AlgoMkldnnQint8; class AlgoMkldnnQint8;
......
...@@ -257,6 +257,32 @@ struct TypeCvtOp<SIMDType::SSE4_2, dt_qint32, dt_qint8> ...@@ -257,6 +257,32 @@ struct TypeCvtOp<SIMDType::SSE4_2, dt_qint32, dt_qint8>
} }
}; };
/**
 * AVX2 type conversion from dt_qint32 to dt_qint8.
 *
 * Each int32 value is converted to float, multiplied by the scale held by
 * the base class and converted to int8. The vector overloads consume two
 * __m256i registers (16 int32 values) and produce 16 int8 values.
 */
template <>
struct TypeCvtOp<SIMDType::AVX2, dt_qint32, dt_qint8>
        : UnaryOpBase<SIMDType::AVX2, dt_qint32, dt_qint8> {
    using UnaryOpBase::UnaryOpBase;
    constexpr static size_t SIMD_WIDTH = 8;

    //! Convert 16 values and store them to \p dst.
    MEGDNN_ATTRIBUTE_TARGET("avx2")
    void operator()(const __m256ix2& vsrc, dt_qint8* dst) const {
        // dst is an arbitrary element pointer with no 16-byte alignment
        // guarantee, so the unaligned store is required here.
        _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), operator()(vsrc));
    }

    //! Convert 16 values and return them packed in one 128-bit register.
    MEGDNN_ATTRIBUTE_TARGET("avx2")
    __m128i operator()(const __m256ix2& vsrc) const {
        const auto vscale = _mm256_set1_ps(this->scale);
        auto vitem0 = _mm256_mul_ps(_mm256_cvtepi32_ps(vsrc.val[0]), vscale);
        auto vitem1 = _mm256_mul_ps(_mm256_cvtepi32_ps(vsrc.val[1]), vscale);
        return QConverter::convert<__m128i, __m256x2>({{vitem0, vitem1}});
    }

    //! Scalar conversion: round(src * scale) saturated to [-128, 127].
    void operator()(src_ctype src, dst_ctype* dst) {
        *reinterpret_cast<int8_t*>(dst) = saturate<int8_t, float>(
                std::round(src.as_int32() * scale), -128, 127);
    }
};
template <> template <>
struct TypeCvtOp<SIMDType::SSE4_2, dt_float32, dt_qint8> struct TypeCvtOp<SIMDType::SSE4_2, dt_float32, dt_qint8>
: UnaryOpBase<SIMDType::SSE4_2, dt_float32, dt_qint8> { : UnaryOpBase<SIMDType::SSE4_2, dt_float32, dt_qint8> {
......
...@@ -40,6 +40,165 @@ TEST_F(X86, CONV_BIAS_FORWARD) { ...@@ -40,6 +40,165 @@ TEST_F(X86, CONV_BIAS_FORWARD) {
.execs({arg.src, arg.filter, arg.bias, {}, {}}); .execs({arg.src, arg.filter, arg.bias, {}, {}});
} }
} }
// Multi-threaded check of the AVX2 channel-wise stride-1 algorithm with
// Int8 src/filter and Int32 bias/dst.
TEST_F(X86_MULTI_THREADS, AVX2_CHANWISE_DIRECT_STRIDE1_INT8x8x32) {
using namespace conv_bias;
std::vector<TestArg> args;
// Adds two cases (without bias and with per-channel bias) for a grouped
// convolution with one input and one output channel per group.
auto run = [&](size_t ic, size_t w, size_t h, size_t kernel, size_t p,
NonlineMode nonline_mode) {
// Skip shapes whose padded input is smaller than the filter.
if (w + 2 * p < kernel || h + 2 * p < kernel)
return;
param::ConvBias param;
param.stride_h = 1;
param.stride_w = 1;
param.pad_h = p;
param.pad_w = p;
param.nonlineMode = nonline_mode;
param.sparse = param::ConvBias::Sparse::GROUP;
//! no bias
args.emplace_back(param, TensorShape{2, ic, h, w},
TensorShape{ic, 1, 1, kernel, kernel}, TensorShape{});
//! bias channel
args.emplace_back(param, TensorShape{2, ic, h, w},
TensorShape{ic, 1, 1, kernel, kernel},
TensorShape{1, ic, 1, 1});
};
// Sweep every supported filter size, with and without padding, over widths
// that are and are not multiples of 16.
for (size_t kernel : {2, 3, 5, 7})
for (size_t pad : {0, 1})
for (size_t ic : {1, 5, 17, 20})
for (size_t h : {7, 16, 38, 40})
for (size_t w : {16, 25, 40, 55})
for (NonlineMode nonline_mode : {NonlineMode::IDENTITY})
run(ic, w, h, kernel, pad, nonline_mode);
Checker<ConvBias> checker(handle());
UniformIntRNG rng{-50, 50};
checker.set_dtype(0, dtype::Int8())
.set_dtype(1, dtype::Int8())
.set_dtype(2, dtype::Int32())
.set_dtype(4, dtype::Int32())
.set_rng(0, &rng)
.set_rng(1, &rng)
.set_rng(2, &rng)
.set_epsilon(1e-3);
// Presumably makes the checker verify that the named algorithm is the one
// being executed -- TODO confirm against ConvBiasAlgoChecker.
checker.set_before_exec_callback(
conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
"X86_CONV_BIAS_CHANWISE_AVX2_INT8_STRIDE1"));
for (auto&& arg : args) {
checker.set_param(arg.param).exec(
{arg.src, arg.filter, arg.bias, {}, {}});
}
}
// Multi-threaded check of the AVX2 channel-wise stride-1 algorithm with
// QuantizedS8 src/filter and QuantizedS32 bias; the dst dtype is left
// default-constructed.
TEST_F(X86_MULTI_THREADS, AVX2_CHANWISE_DIRECT_STRIDE1_QuantizedS32) {
using namespace conv_bias;
std::vector<TestArg> args;
// Adds two cases (without bias and with per-channel bias) for a grouped
// convolution with one input and one output channel per group.
auto run = [&](size_t ic, size_t w, size_t h, size_t kernel, size_t p,
NonlineMode nonline_mode) {
// Skip shapes whose padded input is smaller than the filter.
if (w + 2 * p < kernel || h + 2 * p < kernel)
return;
param::ConvBias param;
param.stride_h = 1;
param.stride_w = 1;
param.pad_h = p;
param.pad_w = p;
param.nonlineMode = nonline_mode;
param.sparse = param::ConvBias::Sparse::GROUP;
//! no bias
args.emplace_back(param, TensorShape{2, ic, h, w},
TensorShape{ic, 1, 1, kernel, kernel}, TensorShape{});
//! bias channel
args.emplace_back(param, TensorShape{2, ic, h, w},
TensorShape{ic, 1, 1, kernel, kernel},
TensorShape{1, ic, 1, 1});
};
// Sweep every supported filter size, with and without padding.
for (size_t kernel : {2, 3, 5, 7})
for (size_t pad : {0, 1})
for (size_t ic : {1, 3, 5, 7, 17})
for (size_t h : {10, 17, 25, 30})
for (size_t w : {19, 28, 58, 168})
for (NonlineMode nonline_mode : {NonlineMode::IDENTITY})
run(ic, w, h, kernel, pad, nonline_mode);
Checker<ConvBias> checker(handle());
UniformIntRNG rng{-50, 50};
// The bias scale 6.25 equals the product of the src and filter scales.
checker.set_dtype(0, dtype::QuantizedS8(2.5f))
.set_dtype(1, dtype::QuantizedS8(2.5f))
.set_dtype(2, dtype::QuantizedS32(6.25f))
.set_dtype(4, {})
.set_rng(0, &rng)
.set_rng(1, &rng)
.set_rng(2, &rng)
.set_epsilon(1e-3);
// Presumably makes the checker verify that the named algorithm is the one
// being executed -- TODO confirm against ConvBiasAlgoChecker.
checker.set_before_exec_callback(
conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
"X86_CONV_BIAS_CHANWISE_AVX2_INT8_STRIDE1"));
for (auto&& arg : args) {
checker.set_param(arg.param).exec(
{arg.src, arg.filter, arg.bias, {}, {}});
}
}
// Multi-threaded check of the AVX2 channel-wise stride-1 algorithm with
// QuantizedS8 src/filter/dst and QuantizedS32 bias, covering the IDENTITY,
// H_SWISH and RELU nonlinearities.
TEST_F(X86_MULTI_THREADS, AVX2_CHANWISE_DIRECT_STRIDE1_QuantizedS8x8x8) {
using namespace conv_bias;
std::vector<TestArg> args;
// Adds two cases (without bias and with per-channel bias) for a grouped
// convolution with one input and one output channel per group.
auto run = [&](size_t ic, size_t w, size_t h, size_t kernel, size_t p,
NonlineMode nonline_mode) {
// Skip shapes whose padded input is smaller than the filter.
if (w + 2 * p < kernel || h + 2 * p < kernel)
return;
param::ConvBias param;
param.stride_h = 1;
param.stride_w = 1;
param.pad_h = p;
param.pad_w = p;
param.nonlineMode = nonline_mode;
param.sparse = param::ConvBias::Sparse::GROUP;
//! no bias
args.emplace_back(param, TensorShape{2, ic, h, w},
TensorShape{ic, 1, 1, kernel, kernel}, TensorShape{});
//! bias channel
args.emplace_back(param, TensorShape{2, ic, h, w},
TensorShape{ic, 1, 1, kernel, kernel},
TensorShape{1, ic, 1, 1});
};
// Sweep every supported filter size, with and without padding, for each
// nonlinearity.
for (size_t kernel : {2, 3, 5, 7})
for (size_t pad : {0, 1})
for (size_t ic : {1, 3, 5, 7, 17})
for (size_t h : {10, 15, 17, 30})
for (size_t w : {19, 28, 58, 168})
for (NonlineMode nonline_mode :
{NonlineMode::IDENTITY, NonlineMode::H_SWISH,
NonlineMode::RELU})
run(ic, w, h, kernel, pad, nonline_mode);
Checker<ConvBias> checker(handle());
UniformIntRNG rng{-50, 50};
// The bias scale 6.25 equals the product of the src and filter scales.
checker.set_dtype(0, dtype::QuantizedS8(2.5f))
.set_dtype(1, dtype::QuantizedS8(2.5f))
.set_dtype(2, dtype::QuantizedS32(6.25f))
.set_dtype(4, dtype::QuantizedS8(60.25f))
.set_rng(0, &rng)
.set_rng(1, &rng)
.set_rng(2, &rng)
.set_epsilon(1e-3);
// Presumably makes the checker verify that the named algorithm is the one
// being executed -- TODO confirm against ConvBiasAlgoChecker.
checker.set_before_exec_callback(
conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
"X86_CONV_BIAS_CHANWISE_AVX2_INT8_STRIDE1"));
for (auto&& arg : args) {
checker.set_param(arg.param).exec(
{arg.src, arg.filter, arg.bias, {}, {}});
}
}
TEST_F(X86_MULTI_THREADS, AVX2_CONV_BIAS_DIRECT_STRIDE1_INT8x8x32) { TEST_F(X86_MULTI_THREADS, AVX2_CONV_BIAS_DIRECT_STRIDE1_INT8x8x32) {
using namespace conv_bias; using namespace conv_bias;
std::vector<TestArg> args; std::vector<TestArg> args;
...@@ -1556,6 +1715,67 @@ void benchmark_impl_comp(const param::ConvBias param, ...@@ -1556,6 +1715,67 @@ void benchmark_impl_comp(const param::ConvBias param,
} }
} // namespace } // namespace
// Benchmarks the AVX2 channel-wise stride-1 int8 algorithm on a fixed list of
// shapes for filter sizes 7, 5, 3 and 2, using Int8 inputs and Int32
// bias/output.
TEST_F(X86_BENCHMARK_MULTI_THREADS, BENCHMARK_CONVBIAS_CHANWISE_AVX2_INT8) {
constexpr size_t RUNS = 50;
param::ConvBias param;
param.stride_h = 1;
param.stride_w = 1;
param.sparse = param::ConvBias::Sparse::GROUP;
std::vector<DType> data_type = {dtype::Int8(), dtype::Int8(),
dtype::Int32(), dtype::Int32()};
std::vector<std::pair<SmallVector<TensorShape>, float>>
shapes_and_computation;
// Records one shape set together with its operation count in millions
// (two operations per filter tap per output element).
// Note: this also overwrites param.pad_h / param.pad_w, so the shared
// `param` ends up holding the padding of the last registered case.
auto bench_case = [&](size_t N, size_t IC, size_t H, size_t W, size_t FS) {
param.pad_h = FS / 2;
param.pad_w = FS / 2;
SmallVector<TensorShape> shapes{
{N, IC, H, W}, {IC, 1, 1, FS, FS}, {}, {}, {}};
TensorShape dst{N, IC, (H + 2 * param.pad_h - FS) + 1,
(W + 2 * param.pad_w - FS) + 1};
float computations = (FS * FS * dst.total_nr_elems() * 2) * 1e-6;
shapes_and_computation.push_back(std::make_pair(shapes, computations));
};
// 7x7 filters
bench_case(1, 32, 112, 112, 7);
bench_case(1, 144, 56, 56, 7);
bench_case(1, 192, 28, 28, 7);
bench_case(1, 384, 28, 28, 7);
bench_case(1, 576, 14, 14, 7);
bench_case(1, 960, 7, 7, 7);
// 5x5 filters
bench_case(1, 32, 112, 112, 5);
bench_case(1, 144, 56, 56, 5);
bench_case(1, 192, 28, 28, 5);
bench_case(1, 384, 28, 28, 5);
bench_case(1, 576, 14, 14, 5);
bench_case(1, 960, 7, 7, 5);
// 3x3 filters
bench_case(1, 32, 112, 112, 3);
bench_case(1, 144, 56, 56, 3);
bench_case(1, 192, 28, 28, 3);
bench_case(1, 384, 28, 28, 3);
bench_case(1, 576, 14, 14, 3);
bench_case(1, 960, 7, 7, 3);
// 2x2 filters
bench_case(1, 32, 112, 112, 2);
bench_case(1, 144, 56, 56, 2);
bench_case(1, 192, 28, 28, 2);
bench_case(1, 384, 28, 28, 2);
bench_case(1, 576, 14, 14, 2);
bench_case(1, 960, 7, 7, 2);
std::string algo_name = "X86_CONV_BIAS_CHANWISE_AVX2_INT8_STRIDE1";
printf("Benchmark X86_CONV_BIAS_CHANWISE_AVX2_INT8_STRIDE1\n");
// Two runs with different thread configurations; the brace arguments
// presumably are {thread count, core ids} for the multi-thread and the
// single-thread baseline -- TODO confirm against benchmark_impl.
benchmark_impl(param, shapes_and_computation, algo_name, RUNS,
{4, {4, 5, 6, 7}}, {1, {4}}, data_type);
benchmark_impl(param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}},
{1, {4}}, data_type);
shapes_and_computation.clear();
}
TEST_F(X86_BENCHMARK_MULTI_THREADS, BENCHMARK_CONVBIAS_DIRECT_AVX2_INT8) { TEST_F(X86_BENCHMARK_MULTI_THREADS, BENCHMARK_CONVBIAS_DIRECT_AVX2_INT8) {
constexpr size_t RUNS = 50; constexpr size_t RUNS = 50;
param::ConvBias param; param::ConvBias param;
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册