From c2435d1561079aa3a0ea4e4295d63d09d0f549a1 Mon Sep 17 00:00:00 2001
From: Megvii Engine Team
Date: Tue, 22 Mar 2022 19:10:17 +0800
Subject: [PATCH] perf(imperative): specialize adaptive pooling

GitOrigin-RevId: 01e14184580fc00e6725d2a7bf90ca374b47eccc
---
 dnn/src/common/basic_types.cpp               |   2 +-
 imperative/python/megengine/functional/nn.py |  28 ++--
 imperative/python/src/tensor.cpp             |   8 +-
 imperative/python/src/tensor_utils.cpp       |  54 ++++++++
 imperative/python/src/tensor_utils.h         |   2 +
 imperative/src/impl/ops/adaptive_pooling.cpp | 129 +++++++++++++++++++
 imperative/src/impl/ops/specializations.cpp  |  14 --
 src/core/include/megbrain/ir/ops.td          |   6 +-
 8 files changed, 207 insertions(+), 36 deletions(-)
 create mode 100644 imperative/src/impl/ops/adaptive_pooling.cpp

diff --git a/dnn/src/common/basic_types.cpp b/dnn/src/common/basic_types.cpp
index 370090bc4..35375ed03 100644
--- a/dnn/src/common/basic_types.cpp
+++ b/dnn/src/common/basic_types.cpp
@@ -191,7 +191,7 @@ bool TensorShape::is_empty() const {
             return true;
         }
     }
-    return false;
+    return ndim == 0;
 }
 
 /* ===================== TensorLayout ===================== */
diff --git a/imperative/python/megengine/functional/nn.py b/imperative/python/megengine/functional/nn.py
index 4a0961dea..0e6ea5345 100644
--- a/imperative/python/megengine/functional/nn.py
+++ b/imperative/python/megengine/functional/nn.py
@@ -11,7 +11,12 @@
 from functools import lru_cache
 from typing import NamedTuple, Optional, Sequence, Tuple, Union
 from ..core import _config
-from ..core._imperative_rt.core2 import Const, apply, dtype_promotion
+from ..core._imperative_rt.core2 import (
+    Const,
+    adaptive_pool2d_cpp,
+    apply,
+    dtype_promotion,
+)
 from ..core._imperative_rt.ops import SubgraphBuilder as _SubgraphBuilder
 from ..core._imperative_rt.ops import get_global_rng_seed as _get_global_rng_seed
 from ..core.ops import builtin
@@ -691,19 +696,12 @@ def adaptive_max_pool2d(
 
     Args:
         inp: input tensor.
-        oshp: OH, OW)` size of the output shape.
+        oshp: `(OH, OW)` size of the output shape.
 
     Returns:
         output tensor.
     """
-    if isinstance(oshp, int):
-        oshp = (oshp, oshp)
-    conv_format = _config._get_actual_op_param("NCHW", _config.__conv_format)
-
-    op = builtin.AdaptivePooling(mode="max", format=conv_format,)
-    oshp = astensor1d(oshp, inp, dtype="int32", device=inp.device)
-    (output,) = apply(op, inp, oshp)
-    return output
+    return adaptive_pool2d_cpp(inp, oshp, "MAX")
 
 
 def adaptive_avg_pool2d(
@@ -715,18 +713,12 @@ def adaptive_avg_pool2d(
 
     Args:
         inp: input tensor.
-        oshp: OH, OW)` size of the output shape.
+        oshp: `(OH, OW)` size of the output shape.
 
     Returns:
         output tensor.
     """
-    if isinstance(oshp, int):
-        oshp = (oshp, oshp)
-
-    op = builtin.AdaptivePooling(mode="average", format="NCHW",)
-    oshp = astensor1d(oshp, inp, dtype="int32", device=inp.device)
-    (output,) = apply(op, inp, oshp)
-    return output
+    return adaptive_pool2d_cpp(inp, oshp, "AVERAGE")
 
 
 def deformable_psroi_pooling(
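Both Python wrappers keep their public signature; only the body changes, to a
single call into the compiled fast path, so existing user code is unaffected.
A minimal usage sketch of the rewritten functions (the input values and the
7x7 target below are illustrative, not part of the patch):

    import numpy as np
    import megengine
    import megengine.functional as F

    x = megengine.tensor(np.random.randn(2, 3, 32, 32).astype("float32"))
    # An int oshp is promoted to a square (oshp, oshp) target inside
    # adaptive_pool2d_cpp, mirroring the removed isinstance(oshp, int) branch.
    y_avg = F.adaptive_avg_pool2d(x, 7)
    y_max = F.adaptive_max_pool2d(x, (7, 7))
    assert y_avg.numpy().shape == (2, 3, 7, 7)
    assert y_max.numpy().shape == (2, 3, 7, 7)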
""" - if isinstance(oshp, int): - oshp = (oshp, oshp) - - op = builtin.AdaptivePooling(mode="average", format="NCHW",) - oshp = astensor1d(oshp, inp, dtype="int32", device=inp.device) - (output,) = apply(op, inp, oshp) - return output + return adaptive_pool2d_cpp(inp, oshp, "AVERAGE") def deformable_psroi_pooling( diff --git a/imperative/python/src/tensor.cpp b/imperative/python/src/tensor.cpp index 73512a0eb..e61772c4d 100644 --- a/imperative/python/src/tensor.cpp +++ b/imperative/python/src/tensor.cpp @@ -430,6 +430,7 @@ WRAP_FUNC_PY35(squeeze_cpp); WRAP_FUNC_PY35(transpose_cpp); WRAP_FUNC_PY35(broadcast_cpp); WRAP_FUNC_PY35(reshape_cpp); +WRAP_FUNC_PY35(adaptive_pool2d_cpp); WRAP_FUNC_PY35(Const); WRAP_FUNC_PY35(astype_cpp); WRAP_FUNC_PY35(convert_single_value_cpp); @@ -584,6 +585,7 @@ void init_tensor(py::module m) { MGE_PY_INTERFACE(transpose_cpp, transpose_cpp), MGE_PY_INTERFACE(broadcast_cpp, broadcast_cpp), MGE_PY_INTERFACE(reshape_cpp, reshape_cpp), + MGE_PY_INTERFACE(adaptive_pool2d_cpp, adaptive_pool2d_cpp), MGE_PY_INTERFACE(Const, Const), MGE_PY_INTERFACE(astype_cpp, astype_cpp), MGE_PY_INTERFACE(convert_single_value_cpp, convert_single_value_cpp), @@ -991,8 +993,10 @@ void init_tensor(py::module m) { m.def("is_tracing_module", [=] { return get_module_trace()->enabled(); }); - m.def("set_module_trace_hook", - [](py::function function) { module_trace_hook = function; }); + m.def("set_module_trace_hook", [](py::function function) { + module_trace_hook = function; + module_trace_hook.inc_ref(); + }); m.def("begin_record_values", [] { Value::begin_record_values(); }); diff --git a/imperative/python/src/tensor_utils.cpp b/imperative/python/src/tensor_utils.cpp index 0b90cc042..05c0d4a9d 100644 --- a/imperative/python/src/tensor_utils.cpp +++ b/imperative/python/src/tensor_utils.cpp @@ -948,6 +948,7 @@ std::tuple, bool> tuple2vector(py::object shape) { py::tuple tup = py::reinterpret_borrow(shape); for (size_t i = 0; i < tup.size(); ++i) { if (!PyLong_Check(tup[i].ptr())) { + shp.clear(); return {shp, false}; } else { shp.push_back(tup[i].cast()); @@ -1108,6 +1109,52 @@ py::object _reshape_cpp(py::handle inp_hdl, py::handle args) { return ret[0]; } +py::object _adaptive_pool2d_cpp( + py::handle inp_hdl, py::handle shape_val_hdl, py::handle pool_mode_hdl) { + py::object shape_hdl = py::reinterpret_borrow(shape_val_hdl); + py::list shps(0); + if (!PyTuple_Check(shape_val_hdl.ptr())) { + shps.append(PyLong_AsLong(shape_val_hdl.ptr())); + shps.append(PyLong_AsLong(shape_val_hdl.ptr())); + + shape_hdl = py::reinterpret_borrow(shps); + } + py::object shape_tuple; + try { + shape_tuple = _make_shape_tuple(shape_hdl); + } catch (py::error_already_set& err) { + shape_tuple = py::reinterpret_borrow(shape_hdl); + } + auto mode_string = pool_mode_hdl.cast(); + ::megdnn::param::AdaptivePooling::Mode pool_mode = + ::megdnn::param::AdaptivePooling::Mode::MAX; + if (mode_string.compare(std::string("AVERAGE")) == 0) { + pool_mode = ::megdnn::param::AdaptivePooling::Mode::AVERAGE; + } + auto [shape, fastpath] = tuple2vector(shape_tuple); + fastpath &= enable_fastpath(inp_hdl); + std::shared_ptr op; + std::vector p; + py::object shape_tensor; + op = AdaptivePooling::make( + pool_mode, ::megdnn::param::AdaptivePooling::Format::NCHW, shape); + if (fastpath) { + p.resize(2); + } else { + p.resize(3); + shape_tensor = _astensor1d_cpp( + shape_hdl, py::cast((mgb::DType)dtype::Int32()), + getattr(inp_hdl, "device"), inp_hdl); + p[2] = shape_tensor.ptr(); + } + py::object Op = py::cast(op); + p[0] = Op.ptr(); 
diff --git a/imperative/python/src/tensor_utils.h b/imperative/python/src/tensor_utils.h
index 451541ea0..ab832f669 100644
--- a/imperative/python/src/tensor_utils.h
+++ b/imperative/python/src/tensor_utils.h
@@ -24,6 +24,8 @@ PyObject* broadcast_cpp(PyObject* self, PyObject* const* args, size_t nargs);
 
 PyObject* reshape_cpp(PyObject* self, PyObject* const* args, size_t nargs);
 
+PyObject* adaptive_pool2d_cpp(PyObject* self, PyObject* const* args, size_t nargs);
+
 PyObject* Const(PyObject* self, PyObject* const* args, size_t nargs);
 
 PyObject* astype_cpp(PyObject* self, PyObject* const* args, size_t nargs);
diff --git a/imperative/src/impl/ops/adaptive_pooling.cpp b/imperative/src/impl/ops/adaptive_pooling.cpp
new file mode 100644
index 000000000..c8e4fcae1
--- /dev/null
+++ b/imperative/src/impl/ops/adaptive_pooling.cpp
@@ -0,0 +1,129 @@
+#include "megbrain/opr/dnn/adaptive_pooling.h"
+#include "../algo_chooser.h"
+#include "../blob_manager_impl.h"
+#include "../dnn_op_helper.h"
+#include "../op_trait.h"
+#include "megbrain/imperative/ops/autogen.h"
+#include "megbrain/opr/io.h"
+
+namespace mgb::imperative {
+
+namespace {
+namespace adaptive_pooling {
+auto apply_on_var_node(const OpDef& def, const VarNodeArray& inputs) {
+    auto&& pool = static_cast<const AdaptivePooling&>(def);
+    OperatorNodeConfig config{pool.make_name()};
+    size_t nr_inp = inputs.size();
+    if (nr_inp > 1) {
+        return opr::AdaptivePooling::make(inputs[0], inputs[1], pool.param(), config);
+    }
+
+    HostTensorND hv = HostTensorND(inputs[0]->comp_node(), {2}, dtype::Int32());
+    auto* ptr = hv.ptr<dt_int32>();
+    ptr[0] = pool.shape[0];
+    ptr[1] = pool.shape[1];
+    auto graph = inputs[0]->owner_graph();
+    auto target_shape = opr::ImmutableTensor::make(*graph, hv, config);
+    return opr::AdaptivePooling::make(inputs[0], target_shape, pool.param(), config);
+}
+
+std::tuple<SmallVector<LogicalTensorDesc>, bool> infer_output_attrs_fallible(
+        const OpDef& def, const SmallVector<LogicalTensorDesc>& inputs) {
+    auto&& pool = static_cast<const AdaptivePooling&>(def);
+    size_t nr_inp = inputs.size();
+    auto&& src = inputs[0];
+    TensorLayout dst_layout(src.layout.dtype);
+    if (src.layout.is_empty()) {
+        return {{{TensorLayout(src.layout.dtype), src.comp_node}}, false};
+    }
+
+    dst_layout.ndim = 4u;
+    if (nr_inp == 1) {
+        dst_layout[0] = src.layout[0];
+        dst_layout[1] = src.layout[1];
+        dst_layout[2] = pool.shape[0];
+        dst_layout[3] = pool.shape[1];
+    } else {
+        auto&& tshp = inputs[1];
+        if (tshp.value.empty()) {
+            return {{{TensorLayout(src.layout.dtype), src.comp_node}}, false};
+        }
+        mgb_assert(
+                tshp.layout.ndim == 1,
+                "target shape of AdaptivePooling expects ndim=1; got ndim=%lu actually",
+                tshp.layout.ndim);
+        dst_layout[0] = src.layout[0];
+        dst_layout[1] = src.layout[1];
+        auto* ptr = tshp.value.ptr<dt_int32>();
+        dst_layout[2] = ptr[0];
+        dst_layout[3] = ptr[1];
+    }
+
+    dst_layout.init_contiguous_stride();
+    return {{{dst_layout, src.comp_node}}, true};
+}
+
+SmallVector<TensorPtr> apply_on_physical_tensor(
+        const OpDef& def, const SmallVector<TensorPtr>& inputs,
+        SmallVector<LogicalTensorDesc>& output_descs, const bool& validated) {
+    auto&& pool = static_cast<const AdaptivePooling&>(def);
+    auto&& cn = inputs[0]->comp_node();
+
+    using TensorND = megdnn::TensorND;
+    auto&& src_layout = inputs[0]->layout();
+    TensorLayout dst_layout = output_descs[0].layout;
+    if (!validated) {
+        TensorShape tshp;
+        dst_layout[0] = src_layout[0];
+        dst_layout[1] = src_layout[1];
+        if (inputs.size() == 2) {
+            auto&& tshp_nd = inputs[1];
+            cg::copy_tensor_value_to_shape(
+                    tshp, tshp_nd->get_value().proxy_to_default_cpu());
+            dst_layout[2] = tshp[0];
+            dst_layout[3] = tshp[1];
+        } else {
+            dst_layout[2] = pool.shape[0];
+            dst_layout[3] = pool.shape[1];
+        }
+        dst_layout.init_contiguous_stride();
+    }
+
+    size_t IH = src_layout[2], IW = src_layout[3], OH = dst_layout[2],
+           OW = dst_layout[3];
+    DnnOprCaller<megdnn::Pooling> dnn_opr(cn);
+    auto&& param = dnn_opr.op->param();
+    param.mode = pool.mode;
+    param.format = pool.format;
+    param.pad_h = param.pad_w = 0;
+    param.stride_h = floor(IH / OH);
+    param.stride_w = floor(IW / OW);
+    param.window_h = IH - (OH - 1) * param.stride_h;
+    param.window_w = IW - (OW - 1) * param.stride_w;
+
+    TensorND src = inputs[0]->dnn_tensor();
+    DeviceTensorND dst =
+            BlobManager::inst()->alloc_workspace_with_defrag(cn, dst_layout);
+
+    size_t sz = setup_algo<megdnn::Pooling>(
+            {src_layout, dst_layout}, dnn_opr.op.get(), 0, false, false, cn,
+            ::megdnn::param::ExecutionPolicy{}, false);
+
+    megdnn::Workspace dnn_wk;
+    if (sz) {
+        TensorLayout w_layout({sz}, dtype::Byte());
+        dnn_wk = dnn_opr.create_workspace(w_layout);
+    }
+    dnn_opr.op->exec(src, dst.as_megdnn(), dnn_wk);
+
+    return {Tensor::make(dst)};
+}
+
+OP_TRAIT_REG(AdaptivePooling, AdaptivePooling)
+        .apply_on_var_node(apply_on_var_node)
+        .infer_output_attrs_fallible(infer_output_attrs_fallible)
+        .apply_on_physical_tensor(apply_on_physical_tensor)
+        .fallback();
+} // namespace adaptive_pooling
+} // namespace
+
+} // namespace mgb::imperative
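The new apply_on_physical_tensor lowers AdaptivePooling to an ordinary megdnn
Pooling by deriving stride and window from the input and output extents:
stride = floor(I / O) and window = I - (O - 1) * stride, with zero padding.
A quick arithmetic check of that derivation (Python; the 32 -> 7 sizes are
illustrative):

    IH, OH = 32, 7
    stride_h = IH // OH                  # floor(32 / 7) == 4
    window_h = IH - (OH - 1) * stride_h  # 32 - 6 * 4 == 8
    # Window starts: 0, 4, 8, 12, 16, 20, 24. Each spans 8 rows, and the
    # last one ends at 24 + 8 == 32, so the windows tile the input exactly.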
diff --git a/imperative/src/impl/ops/specializations.cpp b/imperative/src/impl/ops/specializations.cpp
index cb36e43a9..0afcaef8d 100644
--- a/imperative/src/impl/ops/specializations.cpp
+++ b/imperative/src/impl/ops/specializations.cpp
@@ -293,20 +293,6 @@ OP_TRAIT_REG(TopK, TopK).apply_on_var_node(apply_on_var_node).fallback();
 } // namespace top_k
 } // namespace
 
-namespace {
-namespace adaptive_pooling {
-auto apply_on_var_node(const OpDef& def, const VarNodeArray& inputs) {
-    auto&& pool = static_cast<const AdaptivePooling&>(def);
-    OperatorNodeConfig config{pool.make_name()};
-    return opr::AdaptivePooling::make(inputs[0], inputs[1], pool.param(), config);
-}
-
-OP_TRAIT_REG(AdaptivePooling, AdaptivePooling)
-        .apply_on_var_node(apply_on_var_node)
-        .fallback();
-} // namespace adaptive_pooling
-} // namespace
-
 namespace {
 namespace batch_conv_bias {
 auto apply_on_var_node(const OpDef& def, const VarNodeArray& inputs) {
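The specialization deleted here only handled the two-input form
AdaptivePooling(src, target_shape); the trait in adaptive_pooling.cpp above
replaces it and additionally accepts a single-input form whose static target
shape rides on the op, via the shape attribute added to ops.td below. An
op-level sketch of that single-input form (this assumes the generated builtin
exposes shape as a constructor argument, which is what the extraArguments
declaration is meant to produce):

    import numpy as np
    from megengine import tensor
    from megengine.core.ops import builtin
    from megengine.core._imperative_rt.core2 import apply

    x = tensor(np.ones((2, 3, 32, 32), dtype="float32"))
    op = builtin.AdaptivePooling(mode="average", format="NCHW", shape=(7, 7))
    (y,) = apply(op, x)  # no shape tensor input needed on the fast path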
diff --git a/src/core/include/megbrain/ir/ops.td b/src/core/include/megbrain/ir/ops.td
index 23311367d..9f1dd3a38 100644
--- a/src/core/include/megbrain/ir/ops.td
+++ b/src/core/include/megbrain/ir/ops.td
@@ -69,7 +69,11 @@
 def GroupLocal: MgbHashableOp<"GroupLocal", [ConvolutionParam]>;
 
 def Pooling: MgbHashableOp<"Pooling", [PoolingParam, ExecutionPolicyParamBase<"policy">]>;
 
-def AdaptivePooling : MgbHashableOp<"AdaptivePooling", [AdaptivePoolingParam]>;
+def AdaptivePooling : MgbHashableOp<"AdaptivePooling", [AdaptivePoolingParam]> {
+  let extraArguments = (ins
+    MgbArrayAttr<MgbI32Attr>:$shape
+  );
+}
 
 def ROIPooling: MgbHashableOp<"ROIPooling", [ROIPoolingParam]>;
-- 
GitLab