Commit 9f997ac5 authored by Megvii Engine Team

fix(dnn/x86): enable i8i8i16 gemv used in conv

GitOrigin-RevId: d946e222439c183247ea84983f75eab862482eca
Parent 36e3bb6e
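For orientation, below is a minimal scalar sketch of the computation this commit enables in the conv path: an i8i8i16 GEMV, i.e. an int8 matrix times an int8 vector producing an int16 result. The function name and the int32 accumulation are illustrative assumptions; the actual MegDNN x86 kernel is vectorized and its accumulation/overflow behavior may differ.

```cpp
#include <cstddef>
#include <cstdint>

// Illustrative i8i8i16 GEMV: y = A * x with int8 inputs and an int16 output.
// Hypothetical sketch only; not the MegDNN kernel.
void gemv_i8i8i16(const int8_t* A, const int8_t* x, int16_t* y, size_t M,
                  size_t K) {
    for (size_t m = 0; m < M; ++m) {
        int32_t acc = 0;  // widened accumulator for the sketch
        for (size_t k = 0; k < K; ++k) {
            acc += static_cast<int32_t>(A[m * K + k]) *
                   static_cast<int32_t>(x[k]);
        }
        y[m] = static_cast<int16_t>(acc);  // narrowed to the int16 output dtype
    }
}
```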
......@@ -6,7 +6,8 @@
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#include "src/fallback/conv_bias/conv1x1/algos.h"
......@@ -67,7 +68,8 @@ size_t ConvBiasImpl::AlgoConv1x1::get_workspace(
MIDOUT_END();
} else if (pack_mode == MatrixMulImpl::AlgoBase::PackMode::ONLY_PACKA) {
MIDOUT_BEGIN(megdnn_fallback_conv1x1, 0, 0, 1) {
Conv1x1Kerns<MatrixMulImpl::AlgoBase::PackMode::ONLY_PACKA> dispatcher;
Conv1x1Kerns<MatrixMulImpl::AlgoBase::PackMode::ONLY_PACKA>
dispatcher;
return dispatcher
.get_bundle(param, matmul_param, m_matmul_algo,
compt_oc_block_size)
......@@ -116,7 +118,8 @@ SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoConv1x1::dispatch_kerns(
MIDOUT_END();
} else if (pack_mode == MatrixMulImpl::AlgoBase::PackMode::ONLY_PACKA) {
MIDOUT_BEGIN(megdnn_fallback_conv1x1, 0, 1, 1) {
Conv1x1Kerns<MatrixMulImpl::AlgoBase::PackMode::ONLY_PACKA> dispatcher;
Conv1x1Kerns<MatrixMulImpl::AlgoBase::PackMode::ONLY_PACKA>
dispatcher;
whole_bundle = dispatcher.get_bundle(
param, matmul_param, m_matmul_algo, compt_oc_block_size);
matmul_bundle = m_matmul_algo->get_bundle(matmul_param);
......@@ -140,7 +143,7 @@ SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoConv1x1::dispatch_kerns(
Conv1x1StrategyBase* conv1x1_strategy =
Conv1x1Factory::make_conv1x1_strategy(param, pack_mode,
opr->param().format);
opr->param().format);
auto kern_packA = [this, whole_bundle, matmul_bundle, param,
compt_oc_block_size, conv1x1_strategy](
......@@ -171,8 +174,8 @@ SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoConv1x1::dispatch_kerns(
pack_mode == MatrixMulImpl::AlgoBase::PackMode::ONLY_PACKA) {
ret_kern.push_back({kern_packA, {GROUP, oc_blocks_per_group}});
if (pack_mode == MatrixMulImpl::AlgoBase::PackMode::DEFAULT) {
ret_kern.push_back({kern_packB, {1}});
}
ret_kern.push_back({kern_packB, {1}});
}
}
ret_kern.push_back({kern_compt, {BATCH, GROUP, oc_blocks_per_group}});
......@@ -230,7 +233,11 @@ bool ConvBiasImpl::AlgoConv1x1::usable(ConvBiasImpl* opr,
param, OH * OW, get_oc_tile_size_heuristic(param));
bool matmul_usable = m_matmul_algo->usable(matmul_param);
return matmul_usable &&
auto pack_mode = m_matmul_algo->packmode();
bool strategy_usable = Conv1x1Factory::can_make_conv1x1_strategy(
param, pack_mode, opr->param().format);
return matmul_usable && strategy_usable &&
(param.filter_meta.dilation[0] ==
param.filter_meta.dilation[1] &&
param.filter_meta.dilation[0] == 1) &&
......
......@@ -6,11 +6,12 @@
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#include <unordered_map>
#include "src/fallback/conv_bias/conv1x1/conv1x1_strategy.h"
#include <unordered_map>
#include "midout.h"
......@@ -157,10 +158,9 @@ std::unique_ptr<Conv1x1StrategyBase> create_conv1x1_strategy(
dt_int32, dt_int8, dt_int32, dt_int32,
PostprocessMode::NO_PROCESS, "NoPack::INT8x8x32_INT32"_hash);
cb2(MatrixMulImpl::AlgoBase::PackMode::NO_PACK,
dtype::QuantizedS8, dtype::QuantizedS32,
dtype::QuantizedS32, dt_int8, dt_int32, dt_int32,
PostprocessMode::NO_PROCESS,
cb2(MatrixMulImpl::AlgoBase::PackMode::NO_PACK, dtype::QuantizedS8,
dtype::QuantizedS32, dtype::QuantizedS32, dt_int8, dt_int32,
dt_int32, PostprocessMode::NO_PROCESS,
"NoPack::QINT8x8x32_QINT32"_hash);
break;
......@@ -208,6 +208,19 @@ Conv1x1StrategyBase* Conv1x1Factory::make_conv1x1_strategy(
return storage.get(param, pack_mode, format);
}
bool Conv1x1Factory::can_make_conv1x1_strategy(
const ConvBiasImpl::NCBKernSizeParam& param,
MatrixMulImpl::AlgoBase::PackMode pack_mode, param::ConvBias::Format) {
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC || !MEGDNN_DISABLE_FLOAT16
if ((pack_mode == MatrixMulImpl::AlgoBase::PackMode::NO_PACK ||
pack_mode == MatrixMulImpl::AlgoBase::PackMode::ONLY_PACKA) &&
param.src_type.enumv() == DTypeTrait<dt_float16>::enumv) {
return false;
}
#endif
return true;
}
} // namespace conv1x1
} // namespace fallback
} // namespace megdnn
......@@ -320,6 +320,11 @@ public:
const ConvBiasImpl::NCBKernSizeParam& param,
MatrixMulImpl::AlgoBase::PackMode pack_mode,
param::ConvBias::Format format);
static bool can_make_conv1x1_strategy(
const ConvBiasImpl::NCBKernSizeParam& param,
MatrixMulImpl::AlgoBase::PackMode pack_mode,
param::ConvBias::Format format);
};
} // namespace conv1x1
......
......@@ -27,7 +27,7 @@ using namespace megdnn;
using namespace fallback;
size_t megdnn::fallback::get_format_pack_size(param::ConvBias::Format format) {
switch(format){
switch (format) {
case param::ConvBias::Format::NCHW44:
case param::ConvBias::Format::NCHW4:
return 4_z;
......@@ -57,10 +57,18 @@ public:
auto&& matmul_algos =
static_cast<fallback::MatrixMulImpl*>(matmul_opr)->algo_pack();
for (auto&& algo : matmul_algos) {
#if MEGDNN_X86
//! We have no direct conv for int8x8x16 yet, so disabling gemv here would
//! fall back to the naive implementation and badly hurt performance; keep
//! gemv-backed im2col enabled on the x86 backend.
//! FIXME: remove this once direct conv support for int8x8x16 is added
#else
if (algo->algoset() ==
MatrixMulImpl::AlgoBase::AlgoSet::ALGO_TYPE_GEMV) {
continue;
}
#endif
for (size_t ohw_tile_size : {192, 384, 96, 48, 24}) {
refhold.emplace_back(new AlgoIm2col(
static_cast<MatrixMulImpl::AlgoBase*>(algo),
......
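As a self-contained restatement of the candidate-filtering logic in the hunk above, with hypothetical stand-in types (`Algo`, `AlgoSet`, `Im2colCandidate`): GEMV-backed matmul algorithms are skipped as im2col candidates everywhere except x86, where no direct int8x8x16 conv exists yet. `MEGDNN_X86` is the build flag used in the source tree; everything else here is illustrative.

```cpp
#include <cstddef>
#include <vector>

enum class AlgoSet { ALGO_TYPE_GEMM, ALGO_TYPE_GEMV };

struct Algo {
    AlgoSet set;
};

struct Im2colCandidate {
    const Algo* matmul;
    size_t ohw_tile_size;
};

std::vector<Im2colCandidate> make_im2col_candidates(
        const std::vector<Algo>& algos) {
    std::vector<Im2colCandidate> out;
    for (const auto& algo : algos) {
#if !MEGDNN_X86
        // Outside x86, GEMV matmul algos are skipped: direct conv kernels cover
        // those shapes. On x86 there is no direct int8x8x16 conv yet, so GEMV
        // stays in to avoid the naive fallback.
        if (algo.set == AlgoSet::ALGO_TYPE_GEMV)
            continue;
#endif
        for (size_t ohw_tile_size : {192, 384, 96, 48, 24})
            out.push_back({&algo, ohw_tile_size});
    }
    return out;
}
```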
......@@ -6,7 +6,8 @@
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#include "test/fallback/fixture.h"
......@@ -73,24 +74,115 @@ TEST_F(FALLBACK, BENCHMARK_CONVOLUTION_MATRIX_MUL) {
profile(3, 3, 112, 112, 3, 1);
}
TEST_F(FALLBACK, BENCHMARK_CONVOLUTION_MATRIX_MUL_8832) {
using Param = Convolution::Param;
auto run = [&](const TensorShapeArray& shapes, Param param) {
Benchmarker<Convolution> benchmarker_float(handle());
size_t RUN = 50;
auto tfloat = benchmarker_float.set_display(false)
.set_dtype(0, dtype::Int8{})
.set_dtype(1, dtype::Int8{})
.set_dtype(2, dtype::Int32{})
.set_times(RUN)
.set_param(param)
.exec(shapes);
size_t IC = shapes[1][1];
size_t FH = shapes[1][2];
size_t FW = shapes[1][3];
TensorLayout dst_layout;
auto opr = handle()->create_operator<Convolution>();
opr->param() = param;
opr->deduce_layout({shapes[0], dtype::Float32()},
{shapes[1], dtype::Float32()}, dst_layout);
printf("fp32 flops: %.3f mflops\n",
(IC * dst_layout.total_nr_elems() * FH * FW * 2) /
(tfloat / RUN * 1000));
};
auto profile = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel,
size_t stride) {
Param param;
param.stride_h = stride;
param.stride_w = stride;
param.pad_h = kernel / 2;
param.pad_w = kernel / 2;
param.pad_h = 0;
param.pad_w = 0;
printf("oc: %zd ic: %zd w: %zd h: %zd stride: %zd kernel_size: %zd\n",
oc, ic, w, h, stride, kernel);
run({{1, ic, h, w}, {oc, ic, kernel, kernel}, {}}, param);
};
profile(48, 128, 56, 88, 1, 1);
profile(56, 128, 64, 80, 3, 1);
profile(24, 3, 256, 320, 3, 2);
}
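For reference, the throughput these benchmarks print counts two flops (a multiply and an add) per filter tap per output element. The helper below is a hedged restatement of the printf arithmetic with an illustrative name; it is not part of the test code.

```cpp
#include <cstddef>

// flops per run = 2 * IC * FH * FW * (number of output elements);
// time per run  = total_ms / runs; flops / (ms * 1000) yields Mflop/s.
double conv_mflops(double total_ms, size_t runs, size_t ic, size_t fh,
                   size_t fw, size_t dst_elems) {
    double flops_per_run = 2.0 * ic * fh * fw * static_cast<double>(dst_elems);
    double ms_per_run = total_ms / runs;
    return flops_per_run / (ms_per_run * 1000.0);
}
```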
TEST_F(FALLBACK, BENCHMARK_CONVOLUTION_MATRIX_MUL_8816) {
using Param = Convolution::Param;
auto run = [&](const TensorShapeArray& shapes, Param param) {
Benchmarker<Convolution> benchmarker_float(handle());
size_t RUN = 50;
auto tfloat = benchmarker_float.set_display(false)
.set_dtype(0, dtype::Int8{})
.set_dtype(1, dtype::Int8{})
.set_dtype(2, dtype::Int16{})
.set_times(RUN)
.set_param(param)
.exec(shapes);
size_t IC = shapes[1][1];
size_t FH = shapes[1][2];
size_t FW = shapes[1][3];
TensorLayout dst_layout;
auto opr = handle()->create_operator<Convolution>();
opr->param() = param;
opr->deduce_layout({shapes[0], dtype::Float32()},
{shapes[1], dtype::Float32()}, dst_layout);
printf("fp32 flops: %.3f mflops\n",
(IC * dst_layout.total_nr_elems() * FH * FW * 2) /
(tfloat / RUN * 1000));
};
auto profile = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel,
size_t stride) {
Param param;
param.stride_h = stride;
param.stride_w = stride;
param.pad_h = kernel / 2;
param.pad_w = kernel / 2;
param.pad_h = 0;
param.pad_w = 0;
printf("oc: %zd ic: %zd w: %zd h: %zd stride: %zd kernel_size: %zd\n",
oc, ic, w, h, stride, kernel);
run({{1, ic, h, w}, {oc, ic, kernel, kernel}, {}}, param);
};
profile(48, 128, 56, 88, 1, 1);
profile(48, 128, 56, 88, 1, 2);
profile(56, 128, 64, 80, 3, 1);
profile(24, 3, 256, 320, 3, 2);
}
TEST_F(FALLBACK, BENCHMARK_CONVOLUTION_BACKWARD_DATA) {
using Param = ConvolutionBackwardData::Param;
auto run = [&](const TensorLayoutArray& tensors, Param param) {
Benchmarker<ConvolutionBackwardData> benchmarker_fallback(handle());
size_t RUN = 500;
benchmarker_fallback.set_display(false)
.set_dtype(0, dtype::Float32{})
.set_dtype(1, dtype::Float32{})
.set_times(RUN)
.set_param(param);
auto tmatmul = benchmarker_fallback.set_before_exec_callback(
AlgoChecker<ConvolutionBackwardData>(
"DeconvMatmul"))
.exec(tensors);
auto tdirect = benchmarker_fallback.set_before_exec_callback(
AlgoChecker<ConvolutionBackwardData>(
"DeconvDirect"))
.exec(tensors);
.set_dtype(0, dtype::Float32{})
.set_dtype(1, dtype::Float32{})
.set_times(RUN)
.set_param(param);
auto tmatmul = benchmarker_fallback
.set_before_exec_callback(
AlgoChecker<ConvolutionBackwardData>(
"DeconvMatmul"))
.exec(tensors);
auto tdirect = benchmarker_fallback
.set_before_exec_callback(
AlgoChecker<ConvolutionBackwardData>(
"DeconvDirect"))
.exec(tensors);
size_t IC = tensors[0][1];
size_t FH = tensors[0][2];
size_t FW = tensors[0][3];
......@@ -98,8 +190,8 @@ TEST_F(FALLBACK, BENCHMARK_CONVOLUTION_BACKWARD_DATA) {
printf("Direct_time: %.3f ms Direct_flops: %.3f mflops\n", tdirect,
total_flops / (tdirect / RUN * 1000));
printf("Matmul_time: %.3f ms Matmul_flops: %.3f mflops\n", tmatmul,
total_flops / (tmatmul/ RUN * 1000));
printf("speedup: %.3f\n", tdirect/tmatmul);
total_flops / (tmatmul / RUN * 1000));
printf("speedup: %.3f\n", tdirect / tmatmul);
};
auto profile = [&](size_t n, size_t ic, size_t oh, size_t ow, size_t oc,
......@@ -154,6 +246,51 @@ TEST_F(FALLBACK, CONVOLUTION_MATRIX_MUL) {
run(1, 3, 3, 112, 112, 3, 1);
run(1, 1, 1, 1, 1, 3, 3);
}
#if MEGDNN_X86
TEST_F(FALLBACK_MULTI_THREADS, CONVOLUTION_8816) {
Checker<Convolution> checker(handle());
using Param = Convolution::Param;
checker.set_before_exec_callback(AlgoChecker<Convolution>(".+FB_GEMV.+"));
auto run = [&](size_t n, size_t ic, size_t ih, size_t iw, size_t oc,
size_t fh, size_t fw, size_t pad, size_t stride,
size_t group) {
Param param;
param.sparse = group > 1 ? param::Convolution::Sparse::GROUP
: param::Convolution::Sparse::DENSE;
param.pad_h = param.pad_w = pad;
param.stride_h = param.stride_w = stride;
checker.set_param(param);
if (group > 1) {
checker.execl(
{{{n, ic, ih, iw}, dtype::Int8()},
{{group, oc / group, ic / group, fh, fw}, dtype::Int8()},
{{}, dtype::Int16()}});
} else {
checker.execl({{{n, ic, ih, iw}, dtype::Int8()},
{{oc, ic, fh, fw}, dtype::Int8()},
{{}, dtype::Int16()}});
}
};
for (auto n : {1, 2})
for (auto ic : {3, 4, 8, 12, 16})
for (auto oc : {4, 8, 16, 32})
for (auto ih : {7, 14, 15, 22})
for (auto iw : {7, 13, 11, 32})
for (auto filter : {1, 2, 3, 5, 7})
for (auto stride : {1, 2})
for (auto pad : {0, filter / 2}) {
run(n, ic, ih, iw, oc, filter, filter, pad,
stride, 1);
if (ic == oc) {
run(n, ic, ih, iw, oc, filter, filter,
pad, stride, ic);
}
}
}
#endif
TEST_F(FALLBACK, CONVOLUTION_NAIVE_ALGO_FP16) {
Checker<Convolution> checker(handle());
using Param = Convolution::Param;
......@@ -222,7 +359,7 @@ TEST_F(FALLBACK_MULTI_THREADS, CONVOLUTION_NAIVE_ALGO) {
TensorShape src{n, ic, ih, iw},
filter{group, oc / group, ic / group, fh, fw};
checker.set_param(param).set_dtype(2, {});
//!float32
//! float32
checker.set_dtype(0, dtype::Float32()).set_dtype(1, dtype::Float32());
checker.execs({src, filter, {}});
//! float16
......@@ -257,10 +394,10 @@ TEST_F(FALLBACK, CONVOLUTION_MATRIX_MUL_SINT8) {
param.pad_h = param.pad_w = 1;
param.stride_h = param.stride_w = 1;
checker.set_param(param)
.set_dtype(0, dtype::QuantizedS8(0.2f))
.set_dtype(1, dtype::QuantizedS8(0.2f))
.set_dtype(0, dtype::QuantizedS8(0.2f))
.set_dtype(1, dtype::QuantizedS8(0.2f))
// Use inferred output dtype.
.set_dtype(2, {});
.set_dtype(2, {});
checker.execs({{n, ic, ih, iw}, {oc, ic, fh, fw}, {}});
};
......