提交 4e4497b9 编写于 作者: M Megvii Engine Team 提交者: huangxinda

refactor(mgb/dnn): x86 pooling rebase algochooser

GitOrigin-RevId: 96cdc57180ae3352eca4c5160797bb7c4732ae2e
上级 a33c3b73
/**
* \file dnn/src/x86/pooling/algos.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#include "src/x86/pooling/algo.h"
#include "megdnn/opr_param_defs.h"
#include "src/common/opr_delegate.h"
#include "src/common/utils.h"
#include "src/fallback/pooling/opr_impl.h"
#include "src/naive/handle.h"
#include "src/x86/handle.h"
#include "src/x86/pooling/do_max_pooling_3x3_s2x2_float_sse.h"
#include "src/x86/pooling/pooling_special_cases.h"
#include "src/x86/utils.h"
using namespace megdnn;
using namespace x86;
namespace {
#if MEGDNN_X86_WITH_MKL_DNN
//! Wrap a megdnn tensor as a oneDNN (mkl-dnn) memory object.
//!
//! \tparam format_tag memory layout; only nChw8c, nchw and nhwc are
//!         supported (enforced at compile time).
//! \tparam use_mkl_mem if true, let oneDNN allocate its own buffer
//!         (scratch memory, e.g. the nhwc side of a reorder); otherwise
//!         alias the tensor's raw pointer without copying.
template <dnnl::memory::format_tag format_tag, bool use_mkl_mem>
dnnl::memory tensor_to_mkl_memory(_megdnn_tensor_in src,
                                  const dnnl::engine& mkldnn_eng,
                                  dnnl::memory::data_type mkldnn_datatype) {
    // format_tag is a template parameter, so reject bad tags at compile
    // time instead of at runtime.
    static_assert(format_tag == dnnl::memory::format_tag::nChw8c ||
                          format_tag == dnnl::memory::format_tag::nchw ||
                          format_tag == dnnl::memory::format_tag::nhwc,
                  "not support format");
    // Cast to dnnl::memory::dim (int64_t) rather than `long`, which is
    // only 32 bits on LLP64 platforms (e.g. Windows) and could truncate
    // large dimensions.
    dnnl::memory::dims src_shape = {
            static_cast<dnnl::memory::dim>(src.layout[0]),
            static_cast<dnnl::memory::dim>(src.layout[1]),
            static_cast<dnnl::memory::dim>(src.layout[2]),
            static_cast<dnnl::memory::dim>(src.layout[3])};
    if (format_tag == dnnl::memory::format_tag::nChw8c) {
        // NCHW88 layouts store layout[1] blocks of 8 channels; oneDNN
        // expects the full (unblocked) channel count in the dims.
        src_shape[1] = static_cast<dnnl::memory::dim>(src.layout[1] * 8);
    }
    auto megdnn_src_md =
            dnnl::memory::desc({src_shape}, mkldnn_datatype, format_tag);
    if (use_mkl_mem) {
        // oneDNN owns the buffer.
        return dnnl::memory(megdnn_src_md, mkldnn_eng);
    } else {
        // Zero-copy: alias the megdnn tensor's storage.
        return dnnl::memory(megdnn_src_md, mkldnn_eng,
                            const_cast<void*>(src.raw_ptr));
    }
}
#endif
}  // namespace
PoolingImpl::AlgoPack::AlgoPack() {
    // Registration order doubles as heuristic priority: specialized SIMD
    // kernels first, mkl-dnn backed algorithms next, fallback last.
    AlgoBase* candidates[] = {
            &algo_mean_w2s2_avx,
            &algo_mean_w2s2_sse3,
            &algo_max_w2s2_sse,
            &algo_max_w3s3_sse,
#if MEGDNN_X86_WITH_MKL_DNN
            &algo_mkldnn_nchw,
            &algo_mkldnn_nchw88,
#endif
            &algo_fallback,
    };
    for (AlgoBase* algo : candidates) {
        all_algos.push_back(algo);
        // Also index each algorithm by its descriptor for desc lookup.
        m_all_algos_map.emplace(algo->info().desc, algo);
    }
}
PoolingImpl::AlgoPack PoolingImpl::sm_algo_pack;
MEGDNN_DEF_GET_ALGO_FROM_DESC(PoolingImpl)
//! Bundle the size-dependent inputs of an algorithm decision: the x86
//! handle, the owning operator (source of param()) and the src/dst layouts.
PoolingImpl::AlgoBase::SizeArgs::SizeArgs(PoolingImpl* o,
                                          const TensorLayout& src,
                                          const TensorLayout& dst)
        : handle{static_cast<x86::HandleImpl*>(o->handle())},
          opr{o},
          layout_src{src},
          layout_dst{dst} {}
//! SizeArgs plus the concrete tensors and workspace needed to run a kernel.
//! NOTE: holds the addresses of the src/dst arguments, so the args object
//! must not outlive the tensors it was built from.
PoolingImpl::AlgoBase::ExecArgs::ExecArgs(PoolingImpl* opr,
                                          _megdnn_tensor_in src,
                                          _megdnn_tensor_out dst,
                                          _megdnn_workspace workspace)
        : SizeArgs(opr, src.layout, dst.layout),
          src_tensor{&src},
          dst_tensor{&dst},
          workspace{workspace} {}
//! Human-readable summary of the pooling problem, for logs and error text.
std::string PoolingImpl::AlgoBase::SizeArgs::to_string() const {
    auto src_desc = layout_src.to_string();
    auto dst_desc = layout_dst.to_string();
    return ssprintf("src=%s, dst=%s", src_desc.c_str(), dst_desc.c_str());
}
//! AVX mean-pooling kernel: f32, NCHW, 2x2 window with 2x2 stride only.
bool PoolingImpl::AlgoMeanW2S2AVX::is_available(const SizeArgs& args) const {
    auto&& param = args.opr->param();
    const bool shape_ok = param.window_h == 2 && param.window_w == 2 &&
                          param.stride_h == 2 && param.stride_w == 2;
    return is_supported(SIMDType::AVX) && param.mode == Mode::AVERAGE &&
           param.format == Param::Format::NCHW &&
           args.layout_src.dtype == dtype::Float32() && shape_ok;
}
//! Dispatch the AVX 2x2/s2 mean-pooling kernel once per (batch, channel)
//! plane; is_available() already guaranteed f32 NCHW with these params.
void PoolingImpl::AlgoMeanW2S2AVX::exec(const ExecArgs& args) const {
    auto N = args.layout_src.shape[0];
    auto C = args.layout_src.shape[1];
    auto IH = args.layout_src.shape[2];
    auto IW = args.layout_src.shape[3];
    auto OH = args.layout_dst.shape[2];
    auto OW = args.layout_dst.shape[3];
    auto PH = args.opr->param().pad_h;
    auto PW = args.opr->param().pad_w;
    auto sptr = reinterpret_cast<dt_float32*>(args.src_tensor->raw_ptr);
    auto dptr = reinterpret_cast<dt_float32*>(args.dst_tensor->raw_ptr);
    // presumably consumed by MEGDNN_DISPATCH_CPU_KERN_OPR; otherwise unused
    auto handle = [=]() { return args.handle; };
    MEGDNN_DISPATCH_CPU_KERN_OPR(rep(n, N) rep(c, C) {
        const size_t in_off = (n * C + c) * IH * IW;
        const size_t out_off = (n * C + c) * OH * OW;
        mean_pooling_w2x2_s2x2_avx(sptr + in_off, IH, IW, dptr + out_off, OH,
                                   OW, PH, PW, true);
    });
}
//! SSE3 mean-pooling kernel: f32, NCHW, 2x2 window with 2x2 stride only.
bool PoolingImpl::AlgoMeanW2S2SSE3::is_available(const SizeArgs& args) const {
    auto&& param = args.opr->param();
    const bool shape_ok = param.window_h == 2 && param.window_w == 2 &&
                          param.stride_h == 2 && param.stride_w == 2;
    return is_supported(SIMDType::SSE3) && param.mode == Mode::AVERAGE &&
           args.layout_src.dtype == dtype::Float32() &&
           param.format == Param::Format::NCHW && shape_ok;
}
//! Dispatch the SSE3 2x2/s2 mean-pooling kernel once per (batch, channel)
//! plane; is_available() already guaranteed f32 NCHW with these params.
void PoolingImpl::AlgoMeanW2S2SSE3::exec(const ExecArgs& args) const {
    auto N = args.layout_src.shape[0];
    auto C = args.layout_src.shape[1];
    auto IH = args.layout_src.shape[2];
    auto IW = args.layout_src.shape[3];
    auto OH = args.layout_dst.shape[2];
    auto OW = args.layout_dst.shape[3];
    auto PH = args.opr->param().pad_h;
    auto PW = args.opr->param().pad_w;
    auto sptr = reinterpret_cast<dt_float32*>(args.src_tensor->raw_ptr);
    auto dptr = reinterpret_cast<dt_float32*>(args.dst_tensor->raw_ptr);
    // presumably consumed by MEGDNN_DISPATCH_CPU_KERN_OPR; otherwise unused
    auto handle = [=]() { return args.handle; };
    MEGDNN_DISPATCH_CPU_KERN_OPR(rep(n, N) rep(c, C) {
        const size_t in_off = (n * C + c) * IH * IW;
        const size_t out_off = (n * C + c) * OH * OW;
        mean_pooling_w2x2_s2x2_sse3(sptr + in_off, IH, IW, dptr + out_off,
                                    OH, OW, PH, PW, true);
    });
}
//! SSE max-pooling kernel: f32, NCHW, 2x2 window with 2x2 stride only.
bool PoolingImpl::AlgoMaxW2S2SSE::is_available(const SizeArgs& args) const {
    auto&& param = args.opr->param();
    const bool shape_ok = param.window_h == 2 && param.window_w == 2 &&
                          param.stride_h == 2 && param.stride_w == 2;
    return is_supported(SIMDType::SSE) &&
           args.layout_src.dtype == dtype::Float32() &&
           param.mode == Mode::MAX &&
           param.format == Param::Format::NCHW && shape_ok;
}
//! Dispatch the SSE 2x2/s2 max-pooling kernel once per (batch, channel)
//! plane; is_available() already guaranteed f32 NCHW with these params.
void PoolingImpl::AlgoMaxW2S2SSE::exec(const ExecArgs& args) const {
    auto N = args.layout_src.shape[0];
    auto C = args.layout_src.shape[1];
    auto IH = args.layout_src.shape[2];
    auto IW = args.layout_src.shape[3];
    auto OH = args.layout_dst.shape[2];
    auto OW = args.layout_dst.shape[3];
    auto PH = args.opr->param().pad_h;
    auto PW = args.opr->param().pad_w;
    auto sptr = reinterpret_cast<dt_float32*>(args.src_tensor->raw_ptr);
    auto dptr = reinterpret_cast<dt_float32*>(args.dst_tensor->raw_ptr);
    // presumably consumed by MEGDNN_DISPATCH_CPU_KERN_OPR; otherwise unused
    auto handle = [=]() { return args.handle; };
    MEGDNN_DISPATCH_CPU_KERN_OPR(rep(n, N) rep(c, C) {
        const size_t in_off = (n * C + c) * IH * IW;
        const size_t out_off = (n * C + c) * OH * OW;
        max_pooling_w2x2_s2x2_sse(sptr + in_off, IH, IW, dptr + out_off, OH,
                                  OW, PH, PW);
    });
}
//! SSE max-pooling kernel specialized for 3x3 window with 2x2 stride.
bool PoolingImpl::AlgoMaxW3S3SSE::is_available(const SizeArgs& args) const {
    auto&& param = args.opr->param();
    const bool shape_ok = param.window_h == 3 && param.window_w == 3 &&
                          param.stride_h == 2 && param.stride_w == 2;
    return is_supported(SIMDType::SSE) &&
           args.layout_src.dtype == dtype::Float32() &&
           param.mode == Mode::MAX &&
           param.format == Param::Format::NCHW && shape_ok;
}
//! 3x3-window / 2x2-stride max pooling. Unlike the 2x2 kernels this one
//! needs a scratch workspace (sized by get_bundle()) which is bound to the
//! caller-provided workspace inside the dispatched kernel.
void PoolingImpl::AlgoMaxW3S3SSE::exec(const ExecArgs& args) const {
    auto N = args.layout_src.shape[0];
    auto C = args.layout_src.shape[1];
    auto IH = args.layout_src.shape[2];
    auto IW = args.layout_src.shape[3];
    auto OH = args.layout_dst.shape[2];
    auto OW = args.layout_dst.shape[3];
    auto PH = args.opr->param().pad_h;
    auto PW = args.opr->param().pad_w;
    auto sptr = reinterpret_cast<dt_float32*>(args.src_tensor->raw_ptr);
    auto dptr = reinterpret_cast<dt_float32*>(args.dst_tensor->raw_ptr);
    // presumably consumed by MEGDNN_DISPATCH_CPU_KERN_OPR; otherwise unused
    auto handle = [=]() { return args.handle; };
    MEGDNN_DISPATCH_CPU_KERN_OPR(
            WorkspaceBundle ws = get_bundle(args.layout_src, args.layout_dst,
                                            args.opr->param());
            ws.set(args.workspace.raw_ptr); rep(n, N) rep(c, C) {
                do_max_pooling_3x3_s2x2_float_SSE(
                        sptr + n * C * IH * IW + c * IH * IW,
                        dptr + n * C * OH * OW + c * OH * OW, IH, IW, OH, OW,
                        PH, PW, ws);
            });
}
#if MEGDNN_X86_WITH_MKL_DNN
//! mkl-dnn backed int8/qint8 max pooling on NCHW input (exec() reorders to
//! nhwc internally, where the optimized mkl-dnn kernel lives).
bool PoolingImpl::AlgoMKLDNNNCHW::is_available(const SizeArgs& args) const {
    auto&& param = args.opr->param();
    const auto dtype_enum = args.layout_src.dtype.enumv();
    const bool is_int8_like = dtype_enum == DTypeEnum::QuantizedS8 ||
                              dtype_enum == DTypeEnum::Int8;
    return is_int8_like && param.mode == Mode::MAX &&
           param.format == Param::Format::NCHW;
}
//! s8 NCHW max pooling via mkl-dnn. mkl-dnn cannot change layouts on its
//! own, so the pipeline is: reorder(nchw -> nhwc scratch), pool in nhwc,
//! reorder the result back to the caller's nchw buffer.
void PoolingImpl::AlgoMKLDNNNCHW::exec(const ExecArgs& args) const {
    auto PH = args.opr->param().pad_h;
    auto PW = args.opr->param().pad_w;
    auto FH = args.opr->param().window_h;
    auto FW = args.opr->param().window_w;
    auto SH = args.opr->param().stride_h;
    auto SW = args.opr->param().stride_w;
    // presumably consumed by MEGDNN_DISPATCH_CPU_KERN_OPR; otherwise unused
    auto handle = [=]() { return args.handle; };
    auto x86_handle = static_cast<HandleImpl*>(inplace_cpu_handle().get());
    auto mkldnn_eng = x86_handle->mkldnn_engine();
    auto mkldnn_stream = x86_handle->mkldnn_stream();
    auto mkldnn_pooling_mode = dnnl::algorithm::pooling_max;
    dnnl::memory::dims pool_strides = {SH, SW};
    dnnl::memory::dims pool_padding = {PH, PW};
    dnnl::memory::dims pool_kernel = {FH, FW};
    // *_ori alias the megdnn nchw buffers; the non-ori ones are nhwc
    // scratch memories allocated by mkl-dnn (use_mkl_mem == true).
    dnnl::memory&& megdnn_src_memory_ori =
            tensor_to_mkl_memory<dnnl::memory::format_tag::nchw, false>(
                    *args.src_tensor, mkldnn_eng, dnnl::memory::data_type::s8);
    dnnl::memory&& megdnn_dst_memory_ori =
            tensor_to_mkl_memory<dnnl::memory::format_tag::nchw, false>(
                    *args.dst_tensor, mkldnn_eng, dnnl::memory::data_type::s8);
    dnnl::memory&& megdnn_src_memory =
            tensor_to_mkl_memory<dnnl::memory::format_tag::nhwc, true>(
                    *args.src_tensor, mkldnn_eng, dnnl::memory::data_type::s8);
    dnnl::memory&& megdnn_dst_memory =
            tensor_to_mkl_memory<dnnl::memory::format_tag::nhwc, true>(
                    *args.dst_tensor, mkldnn_eng, dnnl::memory::data_type::s8);
    auto reorder_src = dnnl::reorder(megdnn_src_memory_ori, megdnn_src_memory);
    auto reorder_dst = dnnl::reorder(megdnn_dst_memory, megdnn_dst_memory_ori);
    auto pool1_desc = dnnl::pooling_forward::desc(
            dnnl::prop_kind::forward_inference, mkldnn_pooling_mode,
            megdnn_src_memory.get_desc(), megdnn_dst_memory.get_desc(),
            pool_strides, pool_kernel, pool_padding, pool_padding);
    auto pool_pd =
            dnnl::pooling_forward::primitive_desc(pool1_desc, mkldnn_eng);
    auto pool = dnnl::pooling_forward(pool_pd);
    // Capture primitives and memories by value so they stay alive for the
    // (possibly deferred) kernel invocation.
    auto run = [mkldnn_stream, mkldnn_eng, reorder_src, pool, reorder_dst,
                megdnn_src_memory_ori, megdnn_src_memory, megdnn_dst_memory,
                megdnn_dst_memory_ori](void) {
        MEGDNN_MARK_USED_VAR(mkldnn_eng);
        auto mkl_stream = mkldnn_stream;
        reorder_src.execute(mkl_stream, {{DNNL_ARG_FROM, megdnn_src_memory_ori},
                                         {DNNL_ARG_TO, megdnn_src_memory}});
        pool.execute(mkl_stream, {{DNNL_ARG_SRC, megdnn_src_memory},
                                  {DNNL_ARG_DST, megdnn_dst_memory}});
        reorder_dst.execute(mkl_stream, {{DNNL_ARG_FROM, megdnn_dst_memory},
                                         {DNNL_ARG_TO, megdnn_dst_memory_ori}});
        // Block until the stream drains so results are visible to callers.
        mkl_stream.wait();
    };
    MEGDNN_DISPATCH_CPU_KERN_OPR(run());
}
#endif
#if MEGDNN_X86_WITH_MKL_DNN
//! f32 max pooling on NCHW88, which maps directly onto mkl-dnn's blocked
//! nChw8c format -- no reorder needed.
bool PoolingImpl::AlgoMKLDNNNCHW88::is_available(const SizeArgs& args) const {
    auto&& param = args.opr->param();
    if (args.layout_src.dtype != dtype::Float32()) {
        return false;
    }
    return param.mode == Mode::MAX && param.format == Param::Format::NCHW88;
}
//! f32 max pooling executed directly in mkl-dnn's nChw8c (NCHW88) format;
//! the megdnn buffers are aliased in place, no layout reorder is required.
void PoolingImpl::AlgoMKLDNNNCHW88::exec(const ExecArgs& args) const {
    auto PH = args.opr->param().pad_h;
    auto PW = args.opr->param().pad_w;
    auto FH = args.opr->param().window_h;
    auto FW = args.opr->param().window_w;
    auto SH = args.opr->param().stride_h;
    auto SW = args.opr->param().stride_w;
    // presumably consumed by MEGDNN_DISPATCH_CPU_KERN_OPR; otherwise unused
    auto handle = [=]() { return args.handle; };
    auto x86_handle = static_cast<HandleImpl*>(inplace_cpu_handle().get());
    auto mkldnn_eng = x86_handle->mkldnn_engine();
    auto mkldnn_stream = x86_handle->mkldnn_stream();
    auto mkldnn_pooling_mode = dnnl::algorithm::pooling_max;
    // NOTE(review): is_available() currently accepts Mode::MAX only, so the
    // AVERAGE branches below look dead; presumably kept for future modes.
    switch (args.opr->param().mode) {
        case Mode::MAX:
            mkldnn_pooling_mode = dnnl::algorithm::pooling_max;
            break;
        case Mode::AVERAGE:
            mkldnn_pooling_mode = dnnl::algorithm::pooling_avg_include_padding;
            break;
        case Mode::AVERAGE_COUNT_EXCLUDE_PADDING:
            mkldnn_pooling_mode = dnnl::algorithm::pooling_avg_exclude_padding;
            break;
        default:
            megdnn_throw("not supported pooling mode\n");
    };
    dnnl::memory::dims pool_strides = {SH, SW};
    dnnl::memory::dims pool_padding = {PH, PW};
    dnnl::memory::dims pool_kernel = {FH, FW};
    // Alias the megdnn src/dst storage as nChw8c mkl-dnn memories.
    dnnl::memory&& megdnn_src_memory_ori =
            tensor_to_mkl_memory<dnnl::memory::format_tag::nChw8c, false>(
                    *args.src_tensor, mkldnn_eng, dnnl::memory::data_type::f32);
    dnnl::memory&& megdnn_dst_memory_ori =
            tensor_to_mkl_memory<dnnl::memory::format_tag::nChw8c, false>(
                    *args.dst_tensor, mkldnn_eng, dnnl::memory::data_type::f32);
    auto pool_desc = dnnl::pooling_forward::desc(
            dnnl::prop_kind::forward_inference, mkldnn_pooling_mode,
            megdnn_src_memory_ori.get_desc(), megdnn_dst_memory_ori.get_desc(),
            pool_strides, pool_kernel, pool_padding, pool_padding);
    auto pool_pd = dnnl::pooling_forward::primitive_desc(pool_desc, mkldnn_eng);
    auto pool = dnnl::pooling_forward(pool_pd);
    // Capture by value so the primitive and memories outlive this scope for
    // the (possibly deferred) kernel invocation.
    auto run = [mkldnn_stream, pool, mkldnn_eng, megdnn_src_memory_ori,
                megdnn_dst_memory_ori](void) {
        MEGDNN_MARK_USED_VAR(mkldnn_eng);
        auto mkl_stream = mkldnn_stream;
        pool.execute(mkl_stream, {{DNNL_ARG_SRC, megdnn_src_memory_ori},
                                  {DNNL_ARG_DST, megdnn_dst_memory_ori}});
        // Block until the stream drains so results are visible to callers.
        mkl_stream.wait();
    };
    MEGDNN_DISPATCH_CPU_KERN_OPR(run());
}
#endif
\ No newline at end of file
/**
* \file dnn/src/x86/pooling/algo.h
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#pragma once
#include <unordered_map>
#include "src/common/algo_base.h"
#include "src/common/metahelper.h"
#include "src/x86/pooling/opr_impl.h"
#include "src/x86/handle.h"
namespace megdnn {
namespace x86 {
using AlgoBase = PoolingImpl::AlgoBase;
//! Common interface of all x86 pooling algorithms: an availability
//! predicate plus an execution entry point, consumed by the generic
//! algo-chooser machinery.
class PoolingImpl::AlgoBase : public Algorithm {
public:
    //! Stable per-algorithm tag, consumed by MEGDNN_DECL_ALGO_TYPE.
    enum class AlgoType : uint32_t {
        X86_MeanW2S2AVX,
        X86_MeanW2S2SSE3,
        X86_MaxW2S2SSE,
        X86_MaxW3S3SSE,
#if MEGDNN_X86_WITH_MKL_DNN
        X86_MKLDNNNCHW,
        X86_MKLDNNNCHW88,
#endif
        X86_Fallback
    };
    using Mapper = std::unordered_map<AlgorithmDesc, AlgoBase*>;
    AlgoBase() : Algorithm() { m_handle_type = Handle::HandleType::X86; }
    virtual ~AlgoBase() = default;
    //! Everything needed for an availability decision: the x86 handle, the
    //! operator (for its param()) and the src/dst layouts.
    struct SizeArgs {
        HandleImpl* handle;
        PoolingImpl* opr;
        const TensorLayout layout_src, layout_dst;
        std::string to_string() const;
        SizeArgs(PoolingImpl* opr, const TensorLayout& src,
                 const TensorLayout& dst);
    };
    //! SizeArgs plus the concrete tensors and workspace for execution.
    struct ExecArgs : public SizeArgs {
        const TensorND *src_tensor, *dst_tensor;
        Workspace workspace;
        ExecArgs(PoolingImpl* opr, _megdnn_tensor_in src,
                 _megdnn_tensor_out dst, _megdnn_workspace workspace);
    };
    //! whether this algorithm can handle the given problem at all
    virtual bool is_available(const SizeArgs& args) const = 0;
    virtual void exec(const ExecArgs& args) const = 0;
    uint32_t type() const override { return INVALID_ALGO_TYPE; };
    //! is_available() combined with attribute filtering; used by the
    //! heuristic algorithm picker.
    bool is_available_attribute(
            const SizeArgs& args,
            const AlgoAttribute& positive_attr = AlgoAttribute::REPRODUCIBLE,
            const AlgoAttribute& negative_attr = AlgoAttribute::DEFAULT) {
        return contain_attribute_all(positive_attr) &&
               !contain_attribute_any(negative_attr) && is_available(args);
    }
};
//! Stamps out one concrete pooling algorithm class per kernel. Each class
//! is final, unconditionally REPRODUCIBLE and named "<_name>_POOLING"; its
//! is_available()/exec() are defined out-of-line in algos.cpp.
#define ALGO_IMPL(_name)                                                      \
    class PoolingImpl::Algo##_name final : public AlgoBase {                  \
        std::string m_algo_name;                                              \
                                                                              \
    public:                                                                   \
        Algo##_name() : m_algo_name(std::string(#_name).append("_POOLING")) {} \
        AlgoAttribute attribute() const override {                            \
            return AlgoAttribute::REPRODUCIBLE;                               \
        };                                                                    \
        const char* name() const override { return m_algo_name.c_str(); }     \
        bool is_available(const SizeArgs& args) const override;               \
        void exec(const ExecArgs& args) const override;                       \
        MEGDNN_DECL_ALGO_TYPE(X86_##_name)                                    \
    };
ALGO_IMPL(MeanW2S2AVX)
ALGO_IMPL(MeanW2S2SSE3)
ALGO_IMPL(MaxW2S2SSE)
ALGO_IMPL(MaxW3S3SSE)
#if MEGDNN_X86_WITH_MKL_DNN
ALGO_IMPL(MKLDNNNCHW)
ALGO_IMPL(MKLDNNNCHW88)
#endif
#undef ALGO_IMPL
//! Always-available catch-all algorithm. Its name "FALLBACK_POOLING" is
//! matched by PoolingImpl::is_fallback_algo(), which routes exec/workspace
//! queries to fallback::PoolingImpl instead of calling the no-op exec()
//! here.
class PoolingImpl::AlgoFallback final : public AlgoBase {
    std::string m_algo_name;

public:
    AlgoFallback() : m_algo_name("FALLBACK_POOLING") {}
    AlgoAttribute attribute() const override {
        return AlgoAttribute::REPRODUCIBLE;
    };
    const char* name() const override { return m_algo_name.c_str(); }
    bool is_available(const SizeArgs&) const override { return true; }
    //! intentionally empty: the real work happens in the fallback impl
    void exec(const ExecArgs&) const override {}
    MEGDNN_DECL_ALGO_TYPE(X86_Fallback)
};
//! Owns a single instance of every x86 pooling algorithm, exposed both as
//! an ordered list (registration order = heuristic priority) and as a
//! descriptor -> algorithm map.
class PoolingImpl::AlgoPack : NonCopyableObj {
private:
    AlgoBase::Mapper m_all_algos_map;
    AlgoMeanW2S2AVX algo_mean_w2s2_avx;
    AlgoMeanW2S2SSE3 algo_mean_w2s2_sse3;
    AlgoMaxW2S2SSE algo_max_w2s2_sse;
    AlgoMaxW3S3SSE algo_max_w3s3_sse;
#if MEGDNN_X86_WITH_MKL_DNN
    AlgoMKLDNNNCHW algo_mkldnn_nchw;
    AlgoMKLDNNNCHW88 algo_mkldnn_nchw88;
#endif
    AlgoFallback algo_fallback;

public:
    AlgoPack();
    //! all algorithms, specialized kernels first, fallback last
    std::vector<AlgoBase*> all_algos;
    const AlgoBase::Mapper& all_algos_map() const { return m_all_algos_map; }
};
} // namespace x86
} // namespace megdnn
......@@ -13,9 +13,9 @@
#include "src/common/utils.h"
#include "src/naive/handle.h"
#include "src/x86/handle.h"
#include "src/x86/pooling/do_max_pooling_3x3_s2x2_float_sse.h"
#include "src/x86/pooling/pooling_special_cases.h"
#include "src/x86/utils.h"
#include "src/x86/pooling/algo.h"
#include "src/common/algo_chooser.h"
#if MEGDNN_X86_WITH_MKL_DNN
#include "mkldnn.hpp"
......@@ -24,10 +24,9 @@
using namespace megdnn;
using namespace x86;
namespace {
WorkspaceBundle get_bundle(const TensorLayout& src, const TensorLayout& dst,
const param::Pooling& param) {
WorkspaceBundle megdnn::x86::get_bundle(const TensorLayout& src,
const TensorLayout& dst,
const param::Pooling& param) {
megdnn_assert(
is_supported(SIMDType::SSE) && src.dtype == dtype::Float32() &&
param.format == param::Pooling::Format::NCHW &&
......@@ -45,242 +44,63 @@ WorkspaceBundle get_bundle(const TensorLayout& src, const TensorLayout& dst,
return ws;
}
#if MEGDNN_X86_WITH_MKL_DNN
template <dnnl::memory::format_tag format_tag, bool use_mkl_mem>
dnnl::memory tensor_to_mkl_memory(_megdnn_tensor_in src,
const dnnl::engine& mkldnn_eng,
dnnl::memory::data_type mkldnn_datatype) {
megdnn_assert(format_tag == dnnl::memory::format_tag::nChw8c ||
format_tag == dnnl::memory::format_tag::nchw ||
format_tag == dnnl::memory::format_tag::nhwc,
"not support format");
dnnl::memory::dims src_shape = {
static_cast<long>(src.layout[0]), static_cast<long>(src.layout[1]),
static_cast<long>(src.layout[2]), static_cast<long>(src.layout[3])};
if (format_tag == dnnl::memory::format_tag::nChw8c) {
src_shape = {static_cast<long>(src.layout[0]),
static_cast<long>(src.layout[1] * 8),
static_cast<long>(src.layout[2]),
static_cast<long>(src.layout[3])};
}
auto megdnn_src_md =
dnnl::memory::desc({src_shape}, mkldnn_datatype, format_tag);
if (use_mkl_mem) {
auto megdnn_src_memory = dnnl::memory(megdnn_src_md, mkldnn_eng);
return megdnn_src_memory;
size_t PoolingImpl::get_workspace_in_bytes(const TensorLayout& src,
const TensorLayout& dst) {
auto algo = get_algorithm(this, src, dst);
if (!is_fallback_algo(algo)) {
if (is_supported(SIMDType::SSE) && src.dtype == dtype::Float32() &&
param().mode == Mode::MAX &&
param().format == Param::Format::NCHW && param().window_h == 3 &&
param().window_w == 3 && param().stride_h == 2 &&
param().stride_w == 2) {
WorkspaceBundle ws = get_bundle(src, dst, param());
return ws.total_size_in_bytes();
} else {
return 0;
}
} else {
auto megdnn_src_memory = dnnl::memory(megdnn_src_md, mkldnn_eng,
const_cast<void*>(src.raw_ptr));
return megdnn_src_memory;
auto fallback_worksapce =
fallback::PoolingImpl::get_workspace_in_bytes(src, dst);
return fallback_worksapce;
}
}
#endif
} // namespace
size_t PoolingImpl::get_workspace_in_bytes(const TensorLayout& src,
const TensorLayout& dst) {
if (is_supported(SIMDType::SSE) && src.dtype == dtype::Float32() &&
param().mode == Mode::MAX && param().format == Param::Format::NCHW &&
param().window_h == 3 && param().window_w == 3 &&
param().stride_h == 2 && param().stride_w == 2) {
WorkspaceBundle ws = get_bundle(src, dst, param());
std::vector<Algorithm*> PoolingImpl::get_all_algorithms(
const TensorLayout& src, const TensorLayout& dst) {
return megdnn::get_all_algorithms<PoolingImpl>({this, src, dst});
}
return ws.total_size_in_bytes();
} else {
return 0;
Algorithm* PoolingImpl::get_algorithm_heuristic(
const TensorLayout& src, const TensorLayout& dst,
size_t workspace_limit_in_bytes, const AlgoAttribute& positive_attr,
const AlgoAttribute& negative_attr) {
MEGDNN_MARK_USED_VAR(workspace_limit_in_bytes);
AlgoBase::SizeArgs args(this, src, dst);
for (auto iter : algo_pack().all_algos) {
if (iter->is_available_attribute(args, positive_attr, negative_attr)) {
return iter;
}
}
megdnn_throw(
ssprintf("require algorithm with attribute(%s) and without "
"attribute(%s), but can't get suitable algo.\n",
Algorithm::attribute_str(positive_attr).c_str(),
Algorithm::attribute_str(negative_attr).c_str()));
return nullptr;
}
void PoolingImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_out dst,
_megdnn_workspace workspace) {
check_exec(src.layout, dst.layout, workspace.size);
size_t N = src.layout.shape[0], C = src.layout.shape[1],
IH = src.layout.shape[2], IW = src.layout.shape[3];
size_t OH = dst.layout.shape[2], OW = dst.layout.shape[3];
auto mode = param().mode;
auto FH = param().window_h, FW = param().window_w;
auto SH = param().stride_h, SW = param().stride_w;
auto PH = param().pad_h, PW = param().pad_w;
bool is_average = (mode == Mode::AVERAGE);
bool is_include = true;
if (is_supported(SIMDType::AVX) && is_average &&
param().format == Param::Format::NCHW &&
src.layout.dtype == dtype::Float32() && FH == 2 && FW == 2 && SH == 2 &&
SW == 2) {
auto sptr = src.ptr<dt_float32>();
auto dptr = dst.ptr<dt_float32>();
MEGDNN_DISPATCH_CPU_KERN_OPR(rep(n, N) rep(c, C) {
mean_pooling_w2x2_s2x2_avx(sptr + n * C * IH * IW + c * IH * IW, IH,
IW, dptr + n * C * OH * OW + c * OH * OW,
OH, OW, PH, PW, is_include);
});
return;
}
if (is_supported(SIMDType::SSE3) && is_average &&
src.layout.dtype == dtype::Float32() &&
param().format == Param::Format::NCHW && FH == 2 && FW == 2 &&
SH == 2 && SW == 2) {
auto sptr = src.ptr<dt_float32>();
auto dptr = dst.ptr<dt_float32>();
MEGDNN_DISPATCH_CPU_KERN_OPR(rep(n, N) rep(c, C) {
mean_pooling_w2x2_s2x2_sse3(sptr + n * C * IH * IW + c * IH * IW,
IH, IW,
dptr + n * C * OH * OW + c * OH * OW,
OH, OW, PH, PW, is_include);
});
return;
}
if (is_supported(SIMDType::SSE) && src.layout.dtype == dtype::Float32() &&
mode == Mode::MAX && param().format == Param::Format::NCHW && FH == 2 &&
FW == 2 && SH == 2 && SW == 2) {
auto sptr = src.ptr<dt_float32>();
auto dptr = dst.ptr<dt_float32>();
MEGDNN_DISPATCH_CPU_KERN_OPR(rep(n, N) rep(c, C) {
max_pooling_w2x2_s2x2_sse(sptr + n * C * IH * IW + c * IH * IW, IH,
IW, dptr + n * C * OH * OW + c * OH * OW,
OH, OW, PH, PW);
});
return;
}
if (is_supported(SIMDType::SSE) && src.layout.dtype == dtype::Float32() &&
mode == Mode::MAX && param().format == Param::Format::NCHW && FH == 3 &&
FW == 3 && SH == 2 && SW == 2) {
auto sptr = src.ptr<dt_float32>();
auto dptr = dst.ptr<dt_float32>();
MEGDNN_DISPATCH_CPU_KERN_OPR(
WorkspaceBundle ws =
get_bundle(src.layout, dst.layout, param());
ws.set(workspace.raw_ptr); rep(n, N) rep(c, C) {
do_max_pooling_3x3_s2x2_float_SSE(
sptr + n * C * IH * IW + c * IH * IW,
dptr + n * C * OH * OW + c * OH * OW, IH, IW, OH,
OW, PH, PW, ws);
});
return;
}
#if MEGDNN_X86_WITH_MKL_DNN
// Mkldnn provide optimized code for nhwc int8 pooling now.
// Mkldnn can not change the layout automatic.
// Reorder nchw input to nhwc, do pooling, reorder nhwc result to nchw
if ((src.layout.dtype.enumv() == DTypeEnum::QuantizedS8 ||
src.layout.dtype.enumv() == DTypeEnum::Int8) &&
mode == Mode::MAX && param().format == Param::Format::NCHW) {
auto x86_handle = static_cast<HandleImpl*>(inplace_cpu_handle().get());
auto mkldnn_eng = x86_handle->mkldnn_engine();
auto mkldnn_stream = x86_handle->mkldnn_stream();
auto mkldnn_pooling_mode = dnnl::algorithm::pooling_max;
dnnl::memory::dims pool_strides = {SH, SW};
dnnl::memory::dims pool_padding = {PH, PW};
dnnl::memory::dims pool_kernel = {FH, FW};
dnnl::memory&& megdnn_src_memory_ori =
tensor_to_mkl_memory<dnnl::memory::format_tag::nchw, false>(
src, mkldnn_eng, dnnl::memory::data_type::s8);
dnnl::memory&& megdnn_dst_memory_ori =
tensor_to_mkl_memory<dnnl::memory::format_tag::nchw, false>(
dst, mkldnn_eng, dnnl::memory::data_type::s8);
dnnl::memory&& megdnn_src_memory =
tensor_to_mkl_memory<dnnl::memory::format_tag::nhwc, true>(
src, mkldnn_eng, dnnl::memory::data_type::s8);
dnnl::memory&& megdnn_dst_memory =
tensor_to_mkl_memory<dnnl::memory::format_tag::nhwc, true>(
dst, mkldnn_eng, dnnl::memory::data_type::s8);
auto reorder_src =
dnnl::reorder(megdnn_src_memory_ori, megdnn_src_memory);
auto reorder_dst =
dnnl::reorder(megdnn_dst_memory, megdnn_dst_memory_ori);
auto pool1_desc = dnnl::pooling_forward::desc(
dnnl::prop_kind::forward_inference, mkldnn_pooling_mode,
megdnn_src_memory.get_desc(), megdnn_dst_memory.get_desc(),
pool_strides, pool_kernel, pool_padding, pool_padding);
auto pool_pd =
dnnl::pooling_forward::primitive_desc(pool1_desc, mkldnn_eng);
auto pool = dnnl::pooling_forward(pool_pd);
auto run = [mkldnn_stream, mkldnn_eng, reorder_src, pool, reorder_dst,
megdnn_src_memory_ori, megdnn_src_memory, megdnn_dst_memory,
megdnn_dst_memory_ori](void) {
MEGDNN_MARK_USED_VAR(mkldnn_eng);
auto mkl_stream = mkldnn_stream;
reorder_src.execute(mkl_stream,
{{DNNL_ARG_FROM, megdnn_src_memory_ori},
{DNNL_ARG_TO, megdnn_src_memory}});
pool.execute(mkl_stream, {{DNNL_ARG_SRC, megdnn_src_memory},
{DNNL_ARG_DST, megdnn_dst_memory}});
reorder_dst.execute(mkl_stream,
{{DNNL_ARG_FROM, megdnn_dst_memory},
{DNNL_ARG_TO, megdnn_dst_memory_ori}});
mkl_stream.wait();
};
MEGDNN_DISPATCH_CPU_KERN_OPR(run());
return;
}
if (src.layout.dtype == dtype::Float32() && mode == Mode::MAX &&
param().format == Param::Format::NCHW88) {
auto x86_handle = static_cast<HandleImpl*>(inplace_cpu_handle().get());
auto mkldnn_eng = x86_handle->mkldnn_engine();
auto mkldnn_stream = x86_handle->mkldnn_stream();
auto mkldnn_pooling_mode = dnnl::algorithm::pooling_max;
switch (mode) {
case Mode::MAX:
mkldnn_pooling_mode = dnnl::algorithm::pooling_max;
break;
case Mode::AVERAGE:
mkldnn_pooling_mode =
dnnl::algorithm::pooling_avg_include_padding;
break;
case Mode::AVERAGE_COUNT_EXCLUDE_PADDING:
mkldnn_pooling_mode =
dnnl::algorithm::pooling_avg_exclude_padding;
break;
default:
megdnn_assert(0, "not supported pooling mode\n");
};
dnnl::memory::dims pool_strides = {SH, SW};
dnnl::memory::dims pool_padding = {PH, PW};
dnnl::memory::dims pool_kernel = {FH, FW};
dnnl::memory&& megdnn_src_memory_ori =
tensor_to_mkl_memory<dnnl::memory::format_tag::nChw8c, false>(
src, mkldnn_eng, dnnl::memory::data_type::f32);
dnnl::memory&& megdnn_dst_memory_ori =
tensor_to_mkl_memory<dnnl::memory::format_tag::nChw8c, false>(
dst, mkldnn_eng, dnnl::memory::data_type::f32);
auto pool_desc = dnnl::pooling_forward::desc(
dnnl::prop_kind::forward_inference, mkldnn_pooling_mode,
megdnn_src_memory_ori.get_desc(),
megdnn_dst_memory_ori.get_desc(), pool_strides, pool_kernel,
pool_padding, pool_padding);
auto pool_pd =
dnnl::pooling_forward::primitive_desc(pool_desc, mkldnn_eng);
auto pool = dnnl::pooling_forward(pool_pd);
auto run = [mkldnn_stream, pool, mkldnn_eng, megdnn_src_memory_ori,
megdnn_dst_memory_ori](void) {
MEGDNN_MARK_USED_VAR(mkldnn_eng);
auto mkl_stream = mkldnn_stream;
pool.execute(mkl_stream, {{DNNL_ARG_SRC, megdnn_src_memory_ori},
{DNNL_ARG_DST, megdnn_dst_memory_ori}});
mkl_stream.wait();
};
MEGDNN_DISPATCH_CPU_KERN_OPR(run());
return;
AlgoBase::ExecArgs args(this, src, dst, workspace);
auto algo = get_algorithm(this, src.layout, dst.layout);
if (!is_fallback_algo(algo)) {
algo->exec(args);
} else {
fallback::PoolingImpl::exec(src, dst, Workspace());
}
#endif
fallback::PoolingImpl::exec(src, dst, Workspace());
}
// vim: syntax=cpp.doxygen
......@@ -6,7 +6,8 @@
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#pragma once
#include "src/fallback/pooling/opr_impl.h"
......@@ -14,17 +15,62 @@
namespace megdnn {
namespace x86 {
class PoolingImpl: public fallback::PoolingImpl {
public:
using fallback::PoolingImpl::PoolingImpl;
void exec(_megdnn_tensor_in src, _megdnn_tensor_out dst,
_megdnn_workspace) override;
size_t get_workspace_in_bytes(const TensorLayout &,
const TensorLayout &) override;
};
class PoolingImpl : public fallback::PoolingImpl {
private:
class AlgoMeanW2S2AVX;
class AlgoMeanW2S2SSE3;
class AlgoMaxW2S2SSE;
class AlgoMaxW3S3SSE;
#if MEGDNN_X86_WITH_MKL_DNN
class AlgoMKLDNNNCHW;
class AlgoMKLDNNNCHW88;
#endif
class AlgoFallback;
class AlgoPack;
static AlgoPack sm_algo_pack;
} // namespace x86
} // namespace megdnn
// vim: syntax=cpp.doxygen
public:
using fallback::PoolingImpl::PoolingImpl;
class AlgoBase;
void exec(_megdnn_tensor_in src, _megdnn_tensor_out dst,
_megdnn_workspace) override;
size_t get_workspace_in_bytes(const TensorLayout&,
const TensorLayout&) override;
static size_t constexpr MAX_SPATIAL_DIM = 2;
const char* get_algorithm_set_name() const override {
return "X86_POOLING_FORWARD";
}
Algorithm* get_algorithm_from_desc(const AlgorithmDesc& desc) override;
AlgorithmInfo get_algorithm_info_heuristic(
const TensorLayout& src, const TensorLayout& dst,
size_t workspace_limit_in_bytes, const AlgoAttribute& positive_attr,
const AlgoAttribute& negative_attr) {
return get_algorithm_heuristic(src, dst, workspace_limit_in_bytes,
positive_attr, negative_attr)
->info();
}
static const AlgoPack& algo_pack() { return sm_algo_pack; }
bool is_fallback_algo(Algorithm* algo) {
return strcmp(algo->name(), "FALLBACK_POOLING") == 0;
}
protected:
std::vector<Algorithm*> get_all_algorithms(
const TensorLayout& src, const TensorLayout& dst) override;
Algorithm* get_algorithm_heuristic(
const TensorLayout& src, const TensorLayout& dst,
size_t workspace_limit_in_bytes, const AlgoAttribute& positive_attr,
const AlgoAttribute& negative_attr) override;
};
WorkspaceBundle get_bundle(const TensorLayout& src, const TensorLayout& dst,
const param::Pooling& param);
} // namespace x86
} // namespace megdnn
// vim: syntax=cpp.doxygen
......@@ -159,6 +159,42 @@ TEST(TestOprDNN, PoolingExePolicy) {
"cudnnReproducible") != std::string::npos);
}
TEST(TestOprDNN, PoolingForwardFastrun) {
using Param = opr::Pooling::Param;
Param param;
using Policy = opr::Pooling::ExecutionPolicy;
using S = Policy::Strategy;
auto cn = CompNode::load("xpux");
cn.activate();
auto orig_impl = PersistentCache::set_impl(
std::make_shared<InMemoryPersistentCache>());
HostTensorND host_y;
S strategy = S::PROFILE | S::REPRODUCIBLE;
auto graph = ComputingGraph::make();
HostTensorGenerator<> gen;
TensorShape shape = {1, 20, 24, 24};
auto input = opr::Host2DeviceCopy::make(*graph, gen(shape, cn));
param.mode = Param::Mode::MAX;
param.window_h = param.window_w = 2;
param.stride_h = param.stride_w = 2;
param.pad_h = param.pad_w = 0;
param.format = Param::Format::NCHW;
Policy policy;
policy.strategy = strategy;
auto pooling = opr::PoolingForward::make(input, param, {}, policy);
auto func = graph->compile({make_callback_copy(pooling, host_y)});
func->execute().wait();
}
} // anonymous namespace
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册