Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
MegEngine 天元
MegEngine
提交
c2435d15
MegEngine
项目概览
MegEngine 天元
/
MegEngine
大约 1 年 前同步成功
通知
399
Star
4705
Fork
582
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
MegEngine
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
提交
c2435d15
编写于
3月 22, 2022
作者:
M
Megvii Engine Team
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
perf(imperative): specialize adaptive pooling
GitOrigin-RevId: 01e14184580fc00e6725d2a7bf90ca374b47eccc
上级
8fcbe825
变更
8
显示空白变更内容
内联
并排
Showing
8 changed files
with
207 additions
and
36 deletions
+207
-36
dnn/src/common/basic_types.cpp
dnn/src/common/basic_types.cpp
+1
-1
imperative/python/megengine/functional/nn.py
imperative/python/megengine/functional/nn.py
+10
-18
imperative/python/src/tensor.cpp
imperative/python/src/tensor.cpp
+6
-2
imperative/python/src/tensor_utils.cpp
imperative/python/src/tensor_utils.cpp
+54
-0
imperative/python/src/tensor_utils.h
imperative/python/src/tensor_utils.h
+2
-0
imperative/src/impl/ops/adaptive_pooling.cpp
imperative/src/impl/ops/adaptive_pooling.cpp
+129
-0
imperative/src/impl/ops/specializations.cpp
imperative/src/impl/ops/specializations.cpp
+0
-14
src/core/include/megbrain/ir/ops.td
src/core/include/megbrain/ir/ops.td
+5
-1
未找到文件。
dnn/src/common/basic_types.cpp
浏览文件 @
c2435d15
...
...
@@ -191,7 +191,7 @@ bool TensorShape::is_empty() const {
return
true
;
}
}
return
false
;
return
ndim
==
0
;
}
/* ===================== TensorLayout ===================== */
...
...
imperative/python/megengine/functional/nn.py
浏览文件 @
c2435d15
...
...
@@ -11,7 +11,12 @@ from functools import lru_cache
from
typing
import
NamedTuple
,
Optional
,
Sequence
,
Tuple
,
Union
from
..core
import
_config
from
..core._imperative_rt.core2
import
Const
,
apply
,
dtype_promotion
from
..core._imperative_rt.core2
import
(
Const
,
adaptive_pool2d_cpp
,
apply
,
dtype_promotion
,
)
from
..core._imperative_rt.ops
import
SubgraphBuilder
as
_SubgraphBuilder
from
..core._imperative_rt.ops
import
get_global_rng_seed
as
_get_global_rng_seed
from
..core.ops
import
builtin
...
...
@@ -691,19 +696,12 @@ def adaptive_max_pool2d(
Args:
inp: input tensor.
oshp: OH, OW)` size of the output shape.
oshp:
`(
OH, OW)` size of the output shape.
Returns:
output tensor.
"""
if
isinstance
(
oshp
,
int
):
oshp
=
(
oshp
,
oshp
)
conv_format
=
_config
.
_get_actual_op_param
(
"NCHW"
,
_config
.
__conv_format
)
op
=
builtin
.
AdaptivePooling
(
mode
=
"max"
,
format
=
conv_format
,)
oshp
=
astensor1d
(
oshp
,
inp
,
dtype
=
"int32"
,
device
=
inp
.
device
)
(
output
,)
=
apply
(
op
,
inp
,
oshp
)
return
output
return
adaptive_pool2d_cpp
(
inp
,
oshp
,
"MAX"
)
def
adaptive_avg_pool2d
(
...
...
@@ -715,18 +713,12 @@ def adaptive_avg_pool2d(
Args:
inp: input tensor.
oshp: OH, OW)` size of the output shape.
oshp:
`(
OH, OW)` size of the output shape.
Returns:
output tensor.
"""
if
isinstance
(
oshp
,
int
):
oshp
=
(
oshp
,
oshp
)
op
=
builtin
.
AdaptivePooling
(
mode
=
"average"
,
format
=
"NCHW"
,)
oshp
=
astensor1d
(
oshp
,
inp
,
dtype
=
"int32"
,
device
=
inp
.
device
)
(
output
,)
=
apply
(
op
,
inp
,
oshp
)
return
output
return
adaptive_pool2d_cpp
(
inp
,
oshp
,
"AVERAGE"
)
def
deformable_psroi_pooling
(
...
...
imperative/python/src/tensor.cpp
浏览文件 @
c2435d15
...
...
@@ -430,6 +430,7 @@ WRAP_FUNC_PY35(squeeze_cpp);
WRAP_FUNC_PY35
(
transpose_cpp
);
WRAP_FUNC_PY35
(
broadcast_cpp
);
WRAP_FUNC_PY35
(
reshape_cpp
);
WRAP_FUNC_PY35
(
adaptive_pool2d_cpp
);
WRAP_FUNC_PY35
(
Const
);
WRAP_FUNC_PY35
(
astype_cpp
);
WRAP_FUNC_PY35
(
convert_single_value_cpp
);
...
...
@@ -584,6 +585,7 @@ void init_tensor(py::module m) {
MGE_PY_INTERFACE
(
transpose_cpp
,
transpose_cpp
),
MGE_PY_INTERFACE
(
broadcast_cpp
,
broadcast_cpp
),
MGE_PY_INTERFACE
(
reshape_cpp
,
reshape_cpp
),
MGE_PY_INTERFACE
(
adaptive_pool2d_cpp
,
adaptive_pool2d_cpp
),
MGE_PY_INTERFACE
(
Const
,
Const
),
MGE_PY_INTERFACE
(
astype_cpp
,
astype_cpp
),
MGE_PY_INTERFACE
(
convert_single_value_cpp
,
convert_single_value_cpp
),
...
...
@@ -991,8 +993,10 @@ void init_tensor(py::module m) {
m
.
def
(
"is_tracing_module"
,
[
=
]
{
return
get_module_trace
()
->
enabled
();
});
m
.
def
(
"set_module_trace_hook"
,
[](
py
::
function
function
)
{
module_trace_hook
=
function
;
});
m
.
def
(
"set_module_trace_hook"
,
[](
py
::
function
function
)
{
module_trace_hook
=
function
;
module_trace_hook
.
inc_ref
();
});
m
.
def
(
"begin_record_values"
,
[]
{
Value
::
begin_record_values
();
});
...
...
imperative/python/src/tensor_utils.cpp
浏览文件 @
c2435d15
...
...
@@ -948,6 +948,7 @@ std::tuple<std::vector<int32_t>, bool> tuple2vector(py::object shape) {
py
::
tuple
tup
=
py
::
reinterpret_borrow
<
py
::
tuple
>
(
shape
);
for
(
size_t
i
=
0
;
i
<
tup
.
size
();
++
i
)
{
if
(
!
PyLong_Check
(
tup
[
i
].
ptr
()))
{
shp
.
clear
();
return
{
shp
,
false
};
}
else
{
shp
.
push_back
(
tup
[
i
].
cast
<
int32_t
>
());
...
...
@@ -1108,6 +1109,52 @@ py::object _reshape_cpp(py::handle inp_hdl, py::handle args) {
return
ret
[
0
];
}
// Fast-path C++ implementation backing Python's adaptive_{max,avg}_pool2d.
// Builds an AdaptivePooling OpDef from (input, target shape, mode string) and
// applies it, avoiding Python-side tensor construction when the target shape
// is a plain tuple of ints.
//
// inp_hdl:       input tensor (Python object).
// shape_val_hdl: target output shape — an int (broadcast to (n, n)) or a tuple.
// pool_mode_hdl: Python str, "MAX" or "AVERAGE"; anything else falls back to MAX.
py::object _adaptive_pool2d_cpp(
        py::handle inp_hdl, py::handle shape_val_hdl, py::handle pool_mode_hdl) {
    py::object shape_hdl = py::reinterpret_borrow<py::object>(shape_val_hdl);
    py::list shps(0);
    if (!PyTuple_Check(shape_val_hdl.ptr())) {
        // Scalar target size: duplicate it into (OH, OW).
        // NOTE(review): PyLong_AsLong error (-1 with an exception set) is not
        // checked here — a non-int, non-tuple argument slips through; verify
        // callers only pass int or tuple.
        shps.append(PyLong_AsLong(shape_val_hdl.ptr()));
        shps.append(PyLong_AsLong(shape_val_hdl.ptr()));
        shape_hdl = py::reinterpret_borrow<py::object>(shps);
    }

    // Normalize to a shape tuple; if that fails (e.g. symbolic shape), keep
    // the original object and let the slow path handle it.
    py::object shape_tuple;
    try {
        shape_tuple = _make_shape_tuple(shape_hdl);
    } catch (py::error_already_set& err) {
        shape_tuple = py::reinterpret_borrow<py::object>(shape_hdl);
    }

    // Map the mode string onto the megdnn enum; default is MAX.
    auto mode_string = pool_mode_hdl.cast<std::string>();
    ::megdnn::param::AdaptivePooling::Mode pool_mode =
            ::megdnn::param::AdaptivePooling::Mode::MAX;
    if (mode_string.compare(std::string("AVERAGE")) == 0) {
        pool_mode = ::megdnn::param::AdaptivePooling::Mode::AVERAGE;
    }

    // Fast path only when the shape is a static vector of ints AND the input
    // qualifies (enable_fastpath); otherwise the shape is passed as a tensor.
    auto [shape, fastpath] = tuple2vector(shape_tuple);
    fastpath &= enable_fastpath(inp_hdl);
    std::shared_ptr<OpDef> op;
    std::vector<PyObject*> p;
    py::object shape_tensor;
    op = AdaptivePooling::make(
            pool_mode, ::megdnn::param::AdaptivePooling::Format::NCHW, shape);
    if (fastpath) {
        // apply(op, input) — static shape is baked into the OpDef.
        p.resize(2);
    } else {
        // apply(op, input, shape_tensor) — shape materialized as int32 tensor
        // on the input's device.
        p.resize(3);
        shape_tensor = _astensor1d_cpp(
                shape_hdl, py::cast((mgb::DType)dtype::Int32()),
                getattr(inp_hdl, "device"), inp_hdl);
        p[2] = shape_tensor.ptr();
    }
    py::object Op = py::cast(op);
    p[0] = Op.ptr();
    p[1] = inp_hdl.ptr();
    // py_apply returns a tuple of outputs; adaptive pooling yields one tensor.
    py::tuple ret = py::reinterpret_steal<py::object>(
            py_apply(NULL, p.data(), p.size()));
    return ret[0];
}
py
::
object
_getitem_cpp
(
py
::
handle
inp_hdl
,
py
::
handle
idx_hdl
)
{
py
::
tuple
try_res
=
_try_cond_take
(
inp_hdl
,
idx_hdl
);
if
(
try_res
.
size
()
==
2
)
{
...
...
@@ -1506,6 +1553,13 @@ PyObject* reshape_cpp(PyObject* self, PyObject* const* args, size_t nargs) {
PYEXT17_TRANSLATE_EXC_RET
(
nullptr
)
}
// METH_FASTCALL entry point exposed to Python as adaptive_pool2d_cpp.
// Forwards (input, oshp, mode) to _adaptive_pool2d_cpp and translates C++
// exceptions to Python via PYEXT17_TRANSLATE_EXC_RET (returns nullptr on error).
// NOTE(review): nargs is not validated here — assumes the Python caller always
// passes exactly 3 arguments.
PyObject* adaptive_pool2d_cpp(PyObject* self, PyObject* const* args, size_t nargs) {
    try {
        return _adaptive_pool2d_cpp(args[0], args[1], args[2]).release().ptr();
    }
    PYEXT17_TRANSLATE_EXC_RET(nullptr)
}
PyObject
*
Const
(
PyObject
*
self
,
PyObject
*
const
*
args
,
size_t
nargs
)
{
try
{
return
_Const
(
args
[
0
],
args
[
1
],
args
[
2
],
args
[
3
]).
release
().
ptr
();
...
...
imperative/python/src/tensor_utils.h
浏览文件 @
c2435d15
...
...
@@ -24,6 +24,8 @@ PyObject* broadcast_cpp(PyObject* self, PyObject* const* args, size_t nargs);
PyObject
*
reshape_cpp
(
PyObject
*
self
,
PyObject
*
const
*
args
,
size_t
nargs
);
PyObject
*
adaptive_pool2d_cpp
(
PyObject
*
self
,
PyObject
*
const
*
args
,
size_t
nargs
);
PyObject
*
Const
(
PyObject
*
self
,
PyObject
*
const
*
args
,
size_t
nargs
);
PyObject
*
astype_cpp
(
PyObject
*
self
,
PyObject
*
const
*
args
,
size_t
nargs
);
...
...
imperative/src/impl/ops/adaptive_pooling.cpp
0 → 100644
浏览文件 @
c2435d15
#include "megbrain/opr/dnn/adaptive_pooling.h"
#include "../algo_chooser.h"
#include "../blob_manager_impl.h"
#include "../dnn_op_helper.h"
#include "../op_trait.h"
#include "megbrain/imperative/ops/autogen.h"
#include "megbrain/opr/io.h"
namespace
mgb
::
imperative
{
namespace
{
namespace
adaptive_pooling
{
auto
apply_on_var_node
(
const
OpDef
&
def
,
const
VarNodeArray
&
inputs
)
{
auto
&&
pool
=
static_cast
<
const
AdaptivePooling
&>
(
def
);
OperatorNodeConfig
config
{
pool
.
make_name
()};
size_t
nr_inp
=
inputs
.
size
();
if
(
nr_inp
>
1
)
{
return
opr
::
AdaptivePooling
::
make
(
inputs
[
0
],
inputs
[
1
],
pool
.
param
(),
config
);
}
HostTensorND
hv
=
HostTensorND
(
inputs
[
0
]
->
comp_node
(),
{
2
},
dtype
::
Int32
());
auto
*
ptr
=
hv
.
ptr
<
dt_int32
>
();
ptr
[
0
]
=
pool
.
shape
[
0
];
ptr
[
1
]
=
pool
.
shape
[
1
];
auto
graph
=
inputs
[
0
]
->
owner_graph
();
auto
target_shape
=
opr
::
ImmutableTensor
::
make
(
*
graph
,
hv
,
config
);
return
opr
::
AdaptivePooling
::
make
(
inputs
[
0
],
target_shape
,
pool
.
param
(),
config
);
}
// Infer the output layout of AdaptivePooling without executing it.
// Returns ({desc}, validated): validated is false when the layout cannot be
// fully determined yet (empty input layout, or shape tensor value not known).
// Output is always NCHW rank-4: (N, C) copied from the source, (OH, OW) taken
// either from the op's static shape (1 input) or from the value of the shape
// tensor (2 inputs).
std::tuple<SmallVector<LogicalTensorDesc>, bool> infer_output_attrs_fallible(
        const OpDef& def, const SmallVector<LogicalTensorDesc>& inputs) {
    auto&& pool = static_cast<const AdaptivePooling&>(def);
    size_t nr_inp = inputs.size();
    auto&& src = inputs[0];
    TensorLayout dst_layout(src.layout.dtype);
    if (src.layout.is_empty()) {
        // Source layout unknown: report dtype-only layout, not validated.
        return {{{TensorLayout(src.layout.dtype), src.comp_node}}, false};
    }
    dst_layout.ndim = 4u;
    if (nr_inp == 1) {
        // Static target shape stored on the op itself.
        dst_layout[0] = src.layout[0];
        dst_layout[1] = src.layout[1];
        dst_layout[2] = pool.shape[0];
        dst_layout[3] = pool.shape[1];
    } else {
        auto&& tshp = inputs[1];
        if (tshp.value.empty()) {
            // Shape tensor exists but its value is not yet computed.
            return {{{TensorLayout(src.layout.dtype), src.comp_node}}, false};
        }
        mgb_assert(
                tshp.layout.ndim == 1,
                "target shape of AdaptivePooling expects ndim=1; got ndim=%lu actually",
                tshp.layout.ndim);
        dst_layout[0] = src.layout[0];
        dst_layout[1] = src.layout[1];
        // Read (OH, OW) out of the shape tensor's int32 value.
        auto* ptr = tshp.value.ptr<dt_int32>();
        dst_layout[2] = ptr[0];
        dst_layout[3] = ptr[1];
    }
    dst_layout.init_contiguous_stride();
    return {{{dst_layout, src.comp_node}}, true};
}
// Eagerly execute AdaptivePooling by specializing it into a plain megdnn
// Pooling with stride/window derived from the input and output sizes.
// If the output layout was not validated upstream, recompute it here from
// the shape tensor (2 inputs) or the op's static shape (1 input).
SmallVector<TensorPtr> apply_on_physical_tensor(
        const OpDef& def, const SmallVector<TensorPtr>& inputs,
        SmallVector<LogicalTensorDesc>& output_descs, const bool& validated) {
    auto&& pool = static_cast<const AdaptivePooling&>(def);
    auto&& cn = inputs[0]->comp_node();
    using TensorND = megdnn::TensorND;
    auto&& src_layout = inputs[0]->layout();
    TensorLayout dst_layout = output_descs[0].layout;
    if (!validated) {
        // Re-derive the NCHW output layout: N, C from the source, H, W from
        // either the shape tensor or the op attribute.
        TensorShape tshp;
        dst_layout[0] = src_layout[0];
        dst_layout[1] = src_layout[1];
        if (inputs.size() == 2) {
            auto&& tshp_nd = inputs[1];
            cg::copy_tensor_value_to_shape(
                    tshp, tshp_nd->get_value().proxy_to_default_cpu());
            dst_layout[2] = tshp[0];
            dst_layout[3] = tshp[1];
        } else {
            dst_layout[2] = pool.shape[0];
            dst_layout[3] = pool.shape[1];
        }
        dst_layout.init_contiguous_stride();
    }
    size_t IH = src_layout[2], IW = src_layout[3], OH = dst_layout[2],
           OW = dst_layout[3];
    DnnOprCaller<megdnn::Pooling> dnn_opr(cn);
    auto&& param = dnn_opr.op->param();
    param.mode = pool.mode;
    param.format = pool.format;
    param.pad_h = param.pad_w = 0;
    // Adaptive pooling specialization: stride = floor(in/out) via integer
    // division (no std::floor needed on size_t operands), window sized so the
    // last window ends exactly at the input edge.
    // NOTE(review): assumes OH <= IH and OW <= IW — TODO confirm upstream
    // checks reject upsampling target shapes.
    param.stride_h = IH / OH;
    param.stride_w = IW / OW;
    param.window_h = IH - (OH - 1) * param.stride_h;
    param.window_w = IW - (OW - 1) * param.stride_w;
    TensorND src = inputs[0]->dnn_tensor();
    DeviceTensorND dst =
            BlobManager::inst()->alloc_workspace_with_defrag(cn, dst_layout);
    // Pick an execution algorithm and (if required) allocate its workspace.
    size_t sz = setup_algo<megdnn::Pooling>(
            {src_layout, dst_layout}, dnn_opr.op.get(), 0, false, false, cn,
            ::megdnn::param::ExecutionPolicy{}, false);
    megdnn::Workspace dnn_wk;
    if (sz) {
        TensorLayout w_layout({sz}, dtype::Byte());
        dnn_wk = dnn_opr.create_workspace(w_layout);
    }
    dnn_opr.op->exec(src, dst.as_megdnn(), dnn_wk);
    return {Tensor::make(dst)};
}
// Register the imperative trait implementations for AdaptivePooling.
// fallback() fills any remaining trait entries with the default behavior.
OP_TRAIT_REG(AdaptivePooling, AdaptivePooling)
        .apply_on_var_node(apply_on_var_node)
        .infer_output_attrs_fallible(infer_output_attrs_fallible)
        .apply_on_physical_tensor(apply_on_physical_tensor)
        .fallback();
}  // namespace adaptive_pooling
}  // namespace
}  // namespace mgb::imperative
imperative/src/impl/ops/specializations.cpp
浏览文件 @
c2435d15
...
...
@@ -293,20 +293,6 @@ OP_TRAIT_REG(TopK, TopK).apply_on_var_node(apply_on_var_node).fallback();
}
// namespace top_k
}
// namespace
namespace
{
namespace
adaptive_pooling
{
auto
apply_on_var_node
(
const
OpDef
&
def
,
const
VarNodeArray
&
inputs
)
{
auto
&&
pool
=
static_cast
<
const
AdaptivePooling
&>
(
def
);
OperatorNodeConfig
config
{
pool
.
make_name
()};
return
opr
::
AdaptivePooling
::
make
(
inputs
[
0
],
inputs
[
1
],
pool
.
param
(),
config
);
}
OP_TRAIT_REG
(
AdaptivePooling
,
AdaptivePooling
)
.
apply_on_var_node
(
apply_on_var_node
)
.
fallback
();
}
// namespace adaptive_pooling
}
// namespace
namespace
{
namespace
batch_conv_bias
{
auto
apply_on_var_node
(
const
OpDef
&
def
,
const
VarNodeArray
&
inputs
)
{
...
...
src/core/include/megbrain/ir/ops.td
浏览文件 @
c2435d15
...
...
@@ -69,7 +69,11 @@ def GroupLocal: MgbHashableOp<"GroupLocal", [ConvolutionParam]>;
def Pooling: MgbHashableOp<"Pooling", [PoolingParam, ExecutionPolicyParamBase<"policy">]>;
def AdaptivePooling : MgbHashableOp<"AdaptivePooling", [AdaptivePoolingParam]>;
def AdaptivePooling : MgbHashableOp<"AdaptivePooling", [AdaptivePoolingParam]> {
let extraArguments = (ins
MgbArrayAttr<MgbI32Attr>:$shape
);
}
def ROIPooling: MgbHashableOp<"ROIPooling", [ROIPoolingParam]>;
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录