提交 03808112 编写于 作者: M Megvii Engine Team

feat(dnn/arm_common): add nchw44 8x8x16 stride1 stride2

                    2x2 3x3 5x5 7x7 directconv

GitOrigin-RevId: 3710182af1974775c0960a4ebac3c7cc7e3d93d5
上级 2dbe8194
...@@ -38,6 +38,18 @@ public: ...@@ -38,6 +38,18 @@ public:
const NCBKernSizeParam& param) const override; const NCBKernSizeParam& param) const override;
}; };
//! Direct convolution for int8 x int8 -> int16 on the NCHW44 layout.
//! Supports stride 1/2 with 2x2/3x3/5x5/7x7 filters (see dispatch_kerns).
class ConvBiasImpl::AlgoS8x8x16DirectNCHW44 final : public AlgoBase {
public:
    AlgoS8x8x16DirectNCHW44() = default;
    //! result is bit-exact across runs (integer arithmetic only)
    bool is_reproducible() const override { return true; }
    const char* name() const override { return "S8x8x16_NCHW44_DIRECT"; }
    //! \returns whether this algo can handle \p param under the given
    //! selection strategy (layout/dtype/stride checks live in the impl)
    bool usable(const NCBKernSizeParam& param,
                AlgoSelectionStrategy algo_selection_strategy) const override;
    //! \returns workspace bytes needed for the packed-src buffer
    size_t get_workspace(const NCBKernSizeParam& param) const override;
    //! \returns the kernel list to be dispatched for \p param
    //! NOTE: `virtual` dropped — redundant with `override`
    SmallVector<NCBKern> dispatch_kerns(
            const NCBKernSizeParam& param) const override;
};
class ConvBiasImpl::AlgoI8x8x16Stride2 final : public AlgoBase { class ConvBiasImpl::AlgoI8x8x16Stride2 final : public AlgoBase {
SmallVector<NCBKern> get_kimpls(const NCBKernSizeParam& param) const; SmallVector<NCBKern> get_kimpls(const NCBKernSizeParam& param) const;
WorkspaceBundle get_bundle(const NCBKernSizeParam& param) const; WorkspaceBundle get_bundle(const NCBKernSizeParam& param) const;
......
/**
* \file dnn/src/arm_common/conv_bias/int8x8x16/direct_nchw44_kern.h
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#pragma once
#include "src/arm_common/simd_macro/marm_neon.h"
#include "src/common/utils.h"
#include "src/fallback/conv_bias/common.h"
namespace megdnn {
namespace arm_common {
namespace int8x8x16_direct_nchw44 {
/**
origin src shape <n, ic/4, h, w, 4>
packed src shape <n, ic/4, h, w, 16>
example: (format like <ic>)
origin
<0> <1> <2> <3>
packed
low 64 bit <0> <0> <0> <0> | <1> <1> <1> <1>
---------------------------------------------------------------------
high 64 bit <2> <2> <2> <2> | <3> <3> <3> <3>
**/
static inline void nchw44_pack_src(const int8_t* src, int8_t* dst, int length) {
static const uint8_t src_idx_buffer[16] = {0, 0, 0, 0, 1, 1, 1, 1,
2, 2, 2, 2, 3, 3, 3, 3};
constexpr int pack_ic = 4;
constexpr int simd_len = 16;
uint8x16_t src_idx = vld1q_u8(src_idx_buffer);
for (int i = 0; i < length; i++) {
int8x16_t result = vld_dup_tbl_s32(src + i * pack_ic, src_idx);
vst1q_s8(dst + i * simd_len, result);
}
}
//! Dispatch point for the NCHW44 int8x8x16 direct-conv kernel, specialized
//! per bias mode / filter size / stride (definitions live in the .cpp files).
template <BiasMode bias_mode, int filter_size, int stride>
struct ConvDirectInt8Nchw44Choose {
    //! \param src    packed source (see nchw44_pack_src layout above)
    //! \param filter NCHW44 weights, int8
    //! \param bias   int16 bias, layout per \p bias_mode
    //! \param dst    int16 output
    //! \param oc/ic  output/input channel counts
    //! \param ih/iw  input spatial size; \param oh/ow output spatial size
    static void impl(const int8_t* src, const int8_t* filter,
                     const int16_t* bias, int16_t* dst, const size_t oc,
                     const size_t ic, const size_t ih, const size_t iw,
                     const size_t oh, const size_t ow);
};
}  // namespace int8x8x16_direct_nchw44
} // namespace arm_common
} // namespace megdnn
// vim: syntax=cpp.doxygen
...@@ -44,6 +44,7 @@ class ConvBiasImpl::AlgoPack : NonCopyableObj { ...@@ -44,6 +44,7 @@ class ConvBiasImpl::AlgoPack : NonCopyableObj {
AlgoQU8DirectStride1 qu8_direct_stride1; AlgoQU8DirectStride1 qu8_direct_stride1;
AlgoS8DirectStride2 s8_direct_stride2; AlgoS8DirectStride2 s8_direct_stride2;
AlgoS8DirectNCHW44 s8_direct_nchw44; AlgoS8DirectNCHW44 s8_direct_nchw44;
AlgoS8x8x16DirectNCHW44 s8x8x16_direct_nchw44;
AlgoS8DirectNCHWNCHW44 s8_direct_nchw_nchw44; AlgoS8DirectNCHWNCHW44 s8_direct_nchw_nchw44;
AlgoS8DirectStride1 s8_direct_stride1; AlgoS8DirectStride1 s8_direct_stride1;
AlgoS8ChanWiseStride1NCHW44 s8_channel_wise_stride1_nchw44; AlgoS8ChanWiseStride1NCHW44 s8_channel_wise_stride1_nchw44;
...@@ -94,6 +95,7 @@ public: ...@@ -94,6 +95,7 @@ public:
direct_algos.emplace_back(&qu8_direct_stride1); direct_algos.emplace_back(&qu8_direct_stride1);
direct_algos.emplace_back(&s8_direct_stride2); direct_algos.emplace_back(&s8_direct_stride2);
direct_algos.emplace_back(&s8_direct_nchw44); direct_algos.emplace_back(&s8_direct_nchw44);
direct_algos.emplace_back(&s8x8x16_direct_nchw44);
direct_algos.emplace_back(&s8_direct_nchw_nchw44); direct_algos.emplace_back(&s8_direct_nchw_nchw44);
direct_algos.emplace_back(&s8_direct_stride1); direct_algos.emplace_back(&s8_direct_stride1);
......
...@@ -39,6 +39,7 @@ private: ...@@ -39,6 +39,7 @@ private:
class AlgoS8DirectStride1; class AlgoS8DirectStride1;
class AlgoS8DirectStride2; class AlgoS8DirectStride2;
class AlgoS8DirectNCHW44; class AlgoS8DirectNCHW44;
class AlgoS8x8x16DirectNCHW44;
class AlgoS8DirectNCHWNCHW44; class AlgoS8DirectNCHWNCHW44;
class AlgoQU8DirectStride1; class AlgoQU8DirectStride1;
class AlgoQU8DirectStride2; class AlgoQU8DirectStride2;
......
...@@ -518,6 +518,116 @@ void benchmark_im2col_single_algo(const char* im2col_name, Handle* handle, ...@@ -518,6 +518,116 @@ void benchmark_im2col_single_algo(const char* im2col_name, Handle* handle,
} }
} }
//! Benchmark the int8x8x16 NCHW44 direct conv (algo named by \p im2col_name)
//! against the quantized int8x8x32 "S8_NCHW44_DIRECT" algo on the same shapes,
//! printing per-case timings, GFlops and the 8832/8816 speedup ratio.
//! \param kernel     square filter size (2/3/5/7)
//! \param stride     conv stride, must be 1 or 2
//! \param pack_size  oc/ic must be divisible by this (4 for NCHW44)
void benchmark_nchw44_8x8x16_vs_8x8x32(const char* im2col_name, Handle* handle,
                                       size_t kernel, size_t stride,
                                       size_t pack_size = 1) {
    megdnn_assert(stride == 1 || stride == 2, "only support stride 1 or 2");
    std::vector<conv_bias::TestArg> args;
    //! build one dense NCHW44 test case; silently skip shapes that are not
    //! pack-aligned or where padding cannot cover the filter
    auto pack = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel,
                    size_t p) {
        if (ic % pack_size != 0 || oc % pack_size != 0)
            return;
        if (w + 2 * p < kernel || h + 2 * p < kernel)
            return;
        param::ConvBias param;
        param.format = param::ConvBias::Format::NCHW44;
        param.stride_h = stride;
        param.stride_w = stride;
        param.pad_h = p;
        param.pad_w = p;
        param.sparse = param::ConvBias::Sparse::DENSE;
        //! src <n, ic/4, h, w, 4>, filter <oc/4, ic/4, kh, kw, 4, 4>,
        //! bias broadcast over spatial dims
        args.push_back(conv_bias::TestArg{
                param,
                TensorShape{1, ic / 4, h, w, 4},
                TensorShape{oc / 4, ic / 4, kernel, kernel, 4, 4},
                {1, oc / 4, 1, 1, 4}});
    };
    pack(1, 64, 56, 56, kernel, 0);
    pack(8, 64, 56, 56, kernel, 0);
    pack(16, 64, 56, 56, kernel, 1);
    pack(32, 64, 56, 56, kernel, 1);
    pack(1, 64, 100, 100, kernel, 1);
    pack(8, 64, 100, 100, kernel, 1);
    pack(1, 64, 100, 100, kernel, 0);
    pack(8, 64, 100, 100, kernel, 0);
    pack(16, 64, 100, 100, kernel, 1);
    pack(32, 64, 100, 100, kernel, 1);
    pack(64, 64, 100, 100, kernel, 1);
    pack(128, 64, 100, 100, kernel, 1);
    pack(256, 64, 100, 100, kernel, 1);
    pack(512, 64, 100, 100, kernel, 1);
    pack(1024, 64, 100, 100, kernel, 1);
    pack(1, 32, 200, 200, kernel, 1);
    pack(8, 64, 200, 200, kernel, 1);
    pack(1, 32, 200, 200, kernel, 0);
    pack(8, 64, 200, 200, kernel, 0);
    pack(16, 96, 200, 200, kernel, 1);
    pack(32, 32, 200, 200, kernel, 1);
    pack(64, 64, 200, 200, kernel, 1);
    pack(128, 96, 200, 200, kernel, 1);
    pack(1, 64, 10, 10, kernel, 1);
    pack(8, 64, 10, 10, kernel, 1);
    pack(16, 64, 10, 10, kernel, 1);
    pack(32, 64, 10, 10, kernel, 1);
    pack(64, 64, 10, 10, kernel, 1);
    pack(128, 64, 10, 10, kernel, 1);
    pack(256, 64, 10, 10, kernel, 1);
    pack(512, 64, 10, 10, kernel, 1);
    pack(1024, 64, 10, 10, kernel, 1);
    using namespace conv_bias;
    constexpr size_t RUN = 20;
    Benchmarker<ConvBias> benchmark_im2col(handle);
    benchmark_im2col.set_display(false);
    benchmark_im2col.set_times(RUN);
    Benchmarker<ConvBias> benchmark_8832(handle);
    benchmark_8832.set_display(false);
    benchmark_8832.set_times(RUN);
    for (auto&& arg : args) {
        TensorLayout dst_layout;
        auto opr = handle->create_operator<ConvBias>();
        opr->param() = arg.param;
        //! dtype here only drives layout deduction, not the timed runs
        opr->deduce_layout({arg.src, dtype::Float32()},
                           {arg.filter, dtype::Float32()},
                           {arg.bias, dtype::Float32()}, {}, dst_layout);
        //! dst.nr_elems * IC * FH * FW * 2
        //! (filter[1] is ic/4, the trailing *4 restores full IC;
        //! scaled to GFlop and then to per-millisecond units)
        float computations = dst_layout.total_nr_elems() * arg.filter[1] *
                             arg.filter[2] * arg.filter[3] * 2.0 * 4 /
                             (1024 * 1024 * 1024) * 1e3;
        //! 8816 run: plain Int8 src/filter, Int16 bias and dst
        benchmark_im2col.set_param(arg.param);
        benchmark_im2col.set_dtype(0, dtype::Int8());
        benchmark_im2col.set_dtype(1, dtype::Int8());
        benchmark_im2col.set_dtype(2, dtype::Int16());
        benchmark_im2col.set_dtype(4, dtype::Int16());
        auto used_8816 =
                algo_benchmark<ConvBias>(benchmark_im2col,
                                         {arg.src, arg.filter, {}, {}, {}},
                                         im2col_name) /
                RUN;
        //! 8832 baseline: quantized S8 in, S32 accumulate, no output dtype
        benchmark_8832.set_param(arg.param);
        benchmark_8832.set_dtype(0, dtype::QuantizedS8(2.5));
        benchmark_8832.set_dtype(1, dtype::QuantizedS8(2.5));
        benchmark_8832.set_dtype(2, dtype::QuantizedS32(6.25));
        benchmark_8832.set_dtype(4, {});
        auto used_8832 =
                algo_benchmark<ConvBias>(benchmark_8832,
                                         {arg.src, arg.filter, {}, {}, {}},
                                         "S8_NCHW44_DIRECT") /
                RUN;
        printf("%s %s: 8816: %f ms %f GFlops ", arg.src.to_string().c_str(),
               arg.filter.to_string().c_str(), used_8816,
               computations / used_8816);
        printf("%s %s: 8832: %f ms %f GFlops ", arg.src.to_string().c_str(),
               arg.filter.to_string().c_str(), used_8832,
               computations / used_8832);
        //! >1 means the 8816 kernel is faster than the 8832 baseline
        printf("speedup %f \n", used_8832 / used_8816);
    }
}
void BENCHMARK_IM2COL_NCHW44_VS_NCHW(const char* algo_name, void BENCHMARK_IM2COL_NCHW44_VS_NCHW(const char* algo_name,
const char* im2col_name, Handle* handle, const char* im2col_name, Handle* handle,
size_t kernel, DType src_type, size_t kernel, DType src_type,
...@@ -872,6 +982,28 @@ TEST_F(ARM_COMMON, BENCHMARK_CONVBIAS_MATMUL) { ...@@ -872,6 +982,28 @@ TEST_F(ARM_COMMON, BENCHMARK_CONVBIAS_MATMUL) {
#endif #endif
#if MEGDNN_WITH_BENCHMARK #if MEGDNN_WITH_BENCHMARK
TEST_F(ARM_COMMON, BENCHMARK_CONVBIAS_8X8X16_DIRECT_STRIDE1) {
    //! benchmark every supported filter size of the stride-1 direct conv
    for (size_t kernel : {2, 3, 5, 7}) {
        benchmark_nchw44_8x8x16_vs_8x8x32("S8x8x16_NCHW44_DIRECT", handle(),
                                          kernel, 1, 4);
    }
}
TEST_F(ARM_COMMON, BENCHMARK_CONVBIAS_8X8X16_DIRECT_STRIDE2) {
    //! benchmark every supported filter size of the stride-2 direct conv
    for (size_t kernel : {2, 3, 5, 7}) {
        benchmark_nchw44_8x8x16_vs_8x8x32("S8x8x16_NCHW44_DIRECT", handle(),
                                          kernel, 2, 4);
    }
}
TEST_F(ARM_COMMON, BENCHMARK_CONVBIAS_WINOGRAD_F23) { TEST_F(ARM_COMMON, BENCHMARK_CONVBIAS_WINOGRAD_F23) {
#if MEGDNN_AARCH64 #if MEGDNN_AARCH64
benchmark_winograd("WINOGRAD:AARCH64_F32:1:2", handle(), 3); benchmark_winograd("WINOGRAD:AARCH64_F32:1:2", handle(), 3);
......
...@@ -534,11 +534,25 @@ TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_INT8_STRIDE1_NCHW44) { ...@@ -534,11 +534,25 @@ TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_INT8_STRIDE1_NCHW44) {
get_nchw44_conv_bias_args({2, 3, 5, 7}, 1, false, false, false), get_nchw44_conv_bias_args({2, 3, 5, 7}, 1, false, false, false),
handle(), "S8_NCHW44_DIRECT"); handle(), "S8_NCHW44_DIRECT");
} }
TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_INT8_STRIDE1_NCHW44_8816) {
    //! correctness of the int8x8x16 NCHW44 direct conv, stride 1,
    //! kernels 2/3/5/7
    auto&& args = get_nchw44_conv_bias_args({2, 3, 5, 7}, 1, false, false, true);
    checker_conv_bias_int8x8x16(args, handle(), "S8x8x16_NCHW44_DIRECT");
}
TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_INT8_STRIDE2_NCHW44_8816) {
    //! correctness of the int8x8x16 NCHW44 direct conv, stride 2,
    //! kernels 2/3/5/7
    auto&& args = get_nchw44_conv_bias_args({2, 3, 5, 7}, 2, false, false, true);
    checker_conv_bias_int8x8x16(args, handle(), "S8x8x16_NCHW44_DIRECT");
}
TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_INT8_STRIDE1_NCHW44_8832) { TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_INT8_STRIDE1_NCHW44_8832) {
checker_conv_bias_qint8x8x32( checker_conv_bias_qint8x8x32(
get_nchw44_conv_bias_args({2, 3, 5, 7}, 1, false, false, true), get_nchw44_conv_bias_args({2, 3, 5, 7}, 1, false, false, true),
handle(), "S8_NCHW44_DIRECT"); handle(), "S8_NCHW44_DIRECT");
} }
TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_INT8_STRIDE2_NCHW44_8832) { TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_INT8_STRIDE2_NCHW44_8832) {
checker_conv_bias_qint8x8x32( checker_conv_bias_qint8x8x32(
get_nchw44_conv_bias_args({2, 3, 5, 7}, 2, false, false, true), get_nchw44_conv_bias_args({2, 3, 5, 7}, 2, false, false, true),
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册