提交 e1c7b22f 编写于 作者: M Megvii Engine Team

perf(ops): enable memory forward for reduce in special cases

GitOrigin-RevId: dd6e1664c50aabadc40bab6d01fb7fb31720bd8b
上级 cd60d268
......@@ -11,6 +11,7 @@
#include "megbrain/imperative/ops/autogen.h"
#include "megbrain/opr/basic_arith.h"
#include "megbrain/imperative/proxy_graph_detail.h"
#include "../op_trait.h"
#include "../dnn_op_helper.h"
......@@ -35,9 +36,48 @@ std::shared_ptr<OpDef> make_from_op_node(cg::OperatorNodeBase* node_) {
return Reduce::make(node->param());
}
bool memory_forward_success(
const OpDef& def,
SmallVector<TensorPtr> inputs) {
auto&& reduce = static_cast<const Reduce&>(def);
if (reduce.mode != Reduce::Mode::SUM_SQR && inputs.size() == 2) {
auto shape_tensor = inputs[1]->get_value();
TensorShape shape;
cg::copy_tensor_value_to_shape(shape, shape_tensor.proxy_to_default_cpu());
if (shape.eq_shape(inputs[0]->shape())) {
return true;
}
}
return false;
}
// Plans output memory for Reduce.
// Fast path: when the reduce is an identity (see memory_forward_success),
// the single output is described as an alias of the first input's storage,
// so no new allocation is made and execute() becomes a no-op.
// Otherwise the generic proxy-graph inference decides layouts and storage.
std::tuple<SmallVector<MemoryDesc>, SmallVector<MemoryDesc>> infer_output_mem_desc(
        const OpDef& def,
        const SmallVector<TensorPtr>& inputs_tensors,
        const SmallVector<MemoryDesc>& inputs_mems) {
    if (!memory_forward_success(def, inputs_tensors)) {
        // General case: defer to the proxy graph.
        return proxy_graph_detail::infer_output_mem_desc(
                def, inputs_tensors, inputs_mems);
    }
    // Forwarding case: reuse the input's layout, comp node and storage.
    const auto& forwarded = inputs_mems[0];
    return {{{forwarded.layout, 0, forwarded.cn,
              StorageIdentifier::make(&forwarded)}},
            {}};
}
// Runs the Reduce kernel through the proxy graph — unless the output was
// planned as a memory forward of the input (see infer_output_mem_desc),
// in which case the output already holds the right data and nothing runs.
void execute(const OpDef& def,
        SmallVector<TensorPtr> inputs,
        SmallVector<TensorPtr> outputs,
        SmallVector<TensorPtr> workspace) {
    const bool forwarded = memory_forward_success(def, inputs);
    if (!forwarded) {
        proxy_graph_detail::execute(def, inputs, outputs, workspace);
    }
}
// Register the imperative op trait for Reduce:
//  - make_from_op_node: rebuild the OpDef from a computing-graph node;
//  - apply_on_var_node: graph-mode application (defined elsewhere in file);
//  - infer_output_mem_desc / execute: the memory-forward-aware planning and
//    execution paths defined above;
//  - fallback(): default implementations for the remaining trait entries.
OP_TRAIT_REG(Reduce, Reduce, opr::Reduce)
        .make_from_op_node(make_from_op_node)
        .apply_on_var_node(apply_on_var_node)
        .infer_output_mem_desc(infer_output_mem_desc)
        .execute(execute)
        .fallback();
} // namespace reduce
} // namespace
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册