From 69a92b7b7f270408ba6163848a8d4c041cf1bc00 Mon Sep 17 00:00:00 2001
From: zyfncg
Date: Thu, 28 Apr 2022 10:14:59 +0800
Subject: [PATCH] [cherry-pick] Optimize performance of dygraph (#42231, #42253) (#42309)

* Optimize the performance of sum api (#42231)

* optimize the performance of sum api

* optimize IsDenseTensorInput

* remove debug log

* Add move construct for KernelSignature (#42253)

* add move construct for KernelSignature

* add noexcept

* fix cherry-pick problem
---
 paddle/fluid/framework/infershape_utils.cc    | 12 ++++++----
 .../new_executor/new_executor_defs.cc         |  5 ++++
 .../new_executor/new_executor_defs.h          |  2 ++
 paddle/fluid/framework/op_desc.cc             |  4 ++++
 paddle/fluid/framework/operator.cc            |  4 ++++
 paddle/fluid/framework/operator.h             | 11 +++++----
 paddle/fluid/framework/shape_inference.h      |  2 ++
 paddle/fluid/imperative/infer_shape_context.h |  9 +++++++
 .../operators/reduce_ops/reduce_sum_op.cc     |  6 +++++
 paddle/fluid/pybind/eager_utils.cc            |  6 +----
 .../dialect/phi/pass/proto_arg_map_context.cc |  6 +++++
 .../dialect/phi/pass/proto_arg_map_context.h  |  1 +
 paddle/phi/core/compat/arg_map_context.h      | 21 ++++++++++++++++
 paddle/phi/infermeta/unary.cc                 |  3 +--
 paddle/phi/kernels/cpu/reduce_sum_kernel.cc   |  3 +++
 paddle/phi/kernels/gpu/reduce_sum_kernel.cu   |  3 +++
 paddle/phi/ops/compat/sum_sig.cc              |  2 +-
 paddle/phi/tests/ops/test_op_signature.h      |  4 ++++
 python/paddle/tensor/math.py                  | 24 +++++++------------
 19 files changed, 95 insertions(+), 33 deletions(-)

diff --git a/paddle/fluid/framework/infershape_utils.cc b/paddle/fluid/framework/infershape_utils.cc
index 91dea654ee6..f5a3265af4f 100644
--- a/paddle/fluid/framework/infershape_utils.cc
+++ b/paddle/fluid/framework/infershape_utils.cc
@@ -70,6 +70,11 @@ class InferShapeArgumentMappingContext : public phi::ArgumentMappingContext {
   }
 
   bool IsDenseTensorInput(const std::string& name) const override {
+    auto var_type = ctx_.GetInputVarType(name);
+    return var_type == proto::VarType::LOD_TENSOR;
+  }
+
+  bool IsDenseTensorInputs(const std::string& name) const override {
     auto var_types = ctx_.GetInputsVarType(name);
     return std::all_of(var_types.begin(), var_types.end(),
                        [](const proto::VarType::Type& type) {
@@ -78,11 +83,8 @@ class InferShapeArgumentMappingContext : public phi::ArgumentMappingContext {
   }
 
   bool IsSelectedRowsInput(const std::string& name) const override {
-    auto var_types = ctx_.GetInputsVarType(name);
-    return std::all_of(var_types.begin(), var_types.end(),
-                       [](const proto::VarType::Type& type) {
-                         return type == proto::VarType::SELECTED_ROWS;
-                       });
+    auto var_type = ctx_.GetInputVarType(name);
+    return var_type == proto::VarType::SELECTED_ROWS;
   }
 
   bool IsDenseTensorVectorInput(const std::string& name) const override {
diff --git a/paddle/fluid/framework/new_executor/new_executor_defs.cc b/paddle/fluid/framework/new_executor/new_executor_defs.cc
index 0164c453076..535b7e5baa1 100644
--- a/paddle/fluid/framework/new_executor/new_executor_defs.cc
+++ b/paddle/fluid/framework/new_executor/new_executor_defs.cc
@@ -365,6 +365,11 @@ std::vector<DDim> InterpretercoreInferShapeContext::GetInputsDim(
   return GetDims(vars);
 }
 
+proto::VarType::Type InterpretercoreInferShapeContext::GetInputVarType(
+    const std::string& name) const {
+  return GetVarType(InputVars(name).at(0));
+}
+
 std::vector<proto::VarType::Type>
 InterpretercoreInferShapeContext::GetInputsVarType(
     const std::string& name) const {
diff --git a/paddle/fluid/framework/new_executor/new_executor_defs.h b/paddle/fluid/framework/new_executor/new_executor_defs.h
index 83eaf9514a1..b7b7d5eef41 100644
--- a/paddle/fluid/framework/new_executor/new_executor_defs.h
+++ b/paddle/fluid/framework/new_executor/new_executor_defs.h
@@ -100,6 +100,8 @@ class InterpretercoreInferShapeContext : public InferShapeContext {
 
   std::vector<DDim> GetInputsDim(const std::string& name) const override;
 
+  proto::VarType::Type GetInputVarType(const std::string& name) const override;
+
   std::vector<proto::VarType::Type> GetInputsVarType(
       const std::string& name) const override;
 
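
The files above split what used to be a single all-inputs scan into two queries: IsDenseTensorInput now asks for the type of exactly one variable through the new GetInputVarType hook, while the new IsDenseTensorInputs keeps the std::all_of scan for genuinely multi-input arguments such as sum's "X". A minimal standalone sketch of the two predicates follows; Variable and VarType here are illustrative stand-ins, not Paddle's real types:

    #include <algorithm>
    #include <vector>

    enum class VarType { kDenseTensor, kSelectedRows };

    struct Variable {  // illustrative stand-in for framework::Variable
      VarType type;
    };

    // Single-input fast path: one lookup, no scan.
    bool IsDenseTensorInput(const std::vector<const Variable*>& vars) {
      return vars.front()->type == VarType::kDenseTensor;
    }

    // Multi-input check: every element must be a dense tensor.
    bool IsDenseTensorInputs(const std::vector<const Variable*>& vars) {
      return std::all_of(vars.begin(), vars.end(), [](const Variable* v) {
        return v->type == VarType::kDenseTensor;
      });
    }

    int main() {
      Variable a{VarType::kDenseTensor}, b{VarType::kSelectedRows};
      std::vector<const Variable*> single{&a}, mixed{&a, &b};
      return (IsDenseTensorInput(single) && !IsDenseTensorInputs(mixed)) ? 0 : 1;
    }

For single-input ops the hot dygraph path now performs one type lookup instead of materializing a vector of types and scanning it.
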
diff --git a/paddle/fluid/framework/op_desc.cc b/paddle/fluid/framework/op_desc.cc
index 4ef1d3a83a2..acd45462489 100644
--- a/paddle/fluid/framework/op_desc.cc
+++ b/paddle/fluid/framework/op_desc.cc
@@ -245,6 +245,10 @@ class CompileTimeInferShapeContext : public InferShapeContext {
 
   bool IsRunMKLDNNKernel() const override;
 
+  proto::VarType::Type GetInputVarType(const std::string &name) const override {
+    return GetVarType(Inputs(name).at(0));
+  }
+
   std::vector<proto::VarType::Type> GetInputsVarType(
       const std::string &name) const override {
     return GetVarTypes(Inputs(name));
diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index 7468aaedece..2960b024ce1 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -981,6 +981,10 @@ class RuntimeInferShapeContext : public InferShapeContext {
     return GetDims(vars);
   }
 
+  proto::VarType::Type GetInputVarType(const std::string& name) const override {
+    return GetVarType(InputVars(name).at(0));
+  }
+
   std::vector<proto::VarType::Type> GetInputsVarType(
       const std::string& name) const override {
     return GetVarTypes(InputVars(name));
diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h
index 70e9f5c1b14..d8a4ac87292 100644
--- a/paddle/fluid/framework/operator.h
+++ b/paddle/fluid/framework/operator.h
@@ -479,6 +479,11 @@ class ExecutionArgumentMappingContext : public phi::ArgumentMappingContext {
   }
 
   bool IsDenseTensorInput(const std::string& name) const override {
+    const auto* var = ctx_.InputVar(name);
+    return var->IsType<framework::LoDTensor>();
+  }
+
+  bool IsDenseTensorInputs(const std::string& name) const override {
     auto vars = ctx_.MultiInputVar(name);
     return std::all_of(vars.begin(), vars.end(), [](const Variable* var) {
       return var->IsType<framework::LoDTensor>();
@@ -486,10 +491,8 @@ class ExecutionArgumentMappingContext : public phi::ArgumentMappingContext {
   }
 
   bool IsSelectedRowsInput(const std::string& name) const override {
-    auto vars = ctx_.MultiInputVar(name);
-    return std::all_of(vars.begin(), vars.end(), [](const Variable* var) {
-      return var->IsType<phi::SelectedRows>();
-    });
+    const auto* var = ctx_.InputVar(name);
+    return var->IsType<phi::SelectedRows>();
   }
 
   bool IsDenseTensorVectorInput(const std::string& name) const override {
diff --git a/paddle/fluid/framework/shape_inference.h b/paddle/fluid/framework/shape_inference.h
index 4600213596e..850a1093317 100644
--- a/paddle/fluid/framework/shape_inference.h
+++ b/paddle/fluid/framework/shape_inference.h
@@ -65,6 +65,8 @@ class InferShapeContext {
   virtual bool HasOutput(const std::string &name) const = 0;
   virtual bool HasAttr(const std::string &name) const = 0;
 
+  virtual proto::VarType::Type GetInputVarType(
+      const std::string &name) const = 0;
   virtual std::vector<proto::VarType::Type> GetInputsVarType(
      const std::string &name) const = 0;
   virtual std::vector<proto::VarType::Type> GetOutputsVarType(
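
Every InferShapeContext implementation touched here (compile-time, runtime, interpreter-core, dygraph) answers the new GetInputVarType the same way: return the type of the first variable bound to the argument name. A hypothetical context showing that contract next to the older vector-returning query; the names are illustrative, not Paddle's:

    #include <cassert>
    #include <map>
    #include <string>
    #include <utility>
    #include <vector>

    enum class VarType { kLoDTensor, kSelectedRows };

    // Hypothetical context: maps an argument name to the types of all
    // variables bound to it, mirroring the InferShapeContext additions.
    class ToyInferShapeContext {
     public:
      explicit ToyInferShapeContext(
          std::map<std::string, std::vector<VarType>> vars)
          : vars_(std::move(vars)) {}

      // New query: only the first bound variable is consulted.
      VarType GetInputVarType(const std::string& name) const {
        return vars_.at(name).at(0);
      }

      // Old query: materializes a full vector even for a single input.
      std::vector<VarType> GetInputsVarType(const std::string& name) const {
        return vars_.at(name);
      }

     private:
      std::map<std::string, std::vector<VarType>> vars_;
    };

    int main() {
      ToyInferShapeContext ctx({{"X", {VarType::kLoDTensor}}});
      assert(ctx.GetInputVarType("X") == VarType::kLoDTensor);
      assert(ctx.GetInputsVarType("X").size() == 1);  // extra allocation
      return 0;
    }
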
diff --git a/paddle/fluid/imperative/infer_shape_context.h b/paddle/fluid/imperative/infer_shape_context.h
index 8a5d942e059..a1486638c13 100644
--- a/paddle/fluid/imperative/infer_shape_context.h
+++ b/paddle/fluid/imperative/infer_shape_context.h
@@ -300,6 +300,15 @@ class DygraphInferShapeContext : public framework::InferShapeContext {
     return vec_res;
   }
 
+  framework::proto::VarType::Type GetInputVarType(
+      const std::string& name) const override {
+    auto it = var_map_in_->find(name);
+    PADDLE_ENFORCE_NE(
+        it, var_map_in_->end(),
+        platform::errors::NotFound("can not find [%s] in input", name));
+    return framework::ToVarType(it->second[0]->Var().Type());
+  }
+
   std::vector<framework::proto::VarType::Type> GetInputsVarType(
       const std::string& name) const override {
     std::vector<framework::proto::VarType::Type> vec_res;
diff --git a/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc b/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc
index 2a78774f370..6b8e6b8f805 100644
--- a/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc
+++ b/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc
@@ -89,6 +89,12 @@ class ReduceSumVarTypeInference : public paddle::framework::VarTypeInference {
         BOOST_GET_CONST(int, ctx->GetAttr("out_dtype")));
     if (data_type >= 0) {
       ctx->SetOutputDataType("Out", data_type);
+    } else {
+      auto x_type = ctx->GetInputDataType("X");
+      if (x_type == framework::proto::VarType::BOOL ||
+          x_type == framework::proto::VarType::INT32) {
+        ctx->SetOutputDataType("Out", framework::proto::VarType::INT64);
+      }
     }
   }
 };
diff --git a/paddle/fluid/pybind/eager_utils.cc b/paddle/fluid/pybind/eager_utils.cc
index 452aa0ce2d5..124e5883324 100644
--- a/paddle/fluid/pybind/eager_utils.cc
+++ b/paddle/fluid/pybind/eager_utils.cc
@@ -1204,11 +1204,7 @@ paddle::experimental::DataType CastPyArg2DataType(PyObject* obj,
                                                   const std::string& op_type,
                                                   ssize_t arg_pos) {
   if (obj == Py_None) {
-    PADDLE_THROW(platform::errors::InvalidArgument(
-        "%s(): argument (position %d) must be "
-        "data_type, but got %s",
-        op_type, arg_pos + 1,
-        ((PyTypeObject*)obj->ob_type)->tp_name));  // NOLINT
+    return paddle::experimental::DataType::UNDEFINED;
   }
 
   framework::proto::VarType::Type type = CastPyArg2ProtoType(obj, arg_pos);
diff --git a/paddle/infrt/dialect/phi/pass/proto_arg_map_context.cc b/paddle/infrt/dialect/phi/pass/proto_arg_map_context.cc
index 070867853ad..49fe069217e 100644
--- a/paddle/infrt/dialect/phi/pass/proto_arg_map_context.cc
+++ b/paddle/infrt/dialect/phi/pass/proto_arg_map_context.cc
@@ -63,6 +63,12 @@ bool ProtoArgumentMappingContext::IsDenseTensorInput(
     const std::string& name) const {
   return true;
 }
+
+bool ProtoArgumentMappingContext::IsDenseTensorInputs(
+    const std::string& name) const {
+  return true;
+}
+
 bool ProtoArgumentMappingContext::IsSelectedRowsInput(
     const std::string& name) const {
   return false;
diff --git a/paddle/infrt/dialect/phi/pass/proto_arg_map_context.h b/paddle/infrt/dialect/phi/pass/proto_arg_map_context.h
index 5cf2ef97907..7cb2651ccf6 100644
--- a/paddle/infrt/dialect/phi/pass/proto_arg_map_context.h
+++ b/paddle/infrt/dialect/phi/pass/proto_arg_map_context.h
@@ -41,6 +41,7 @@ class ProtoArgumentMappingContext : public ::phi::ArgumentMappingContext {
   size_t OutputSize(const std::string& name) const override;
 
   bool IsDenseTensorInput(const std::string& name) const override;
+  bool IsDenseTensorInputs(const std::string& name) const override;
   bool IsSelectedRowsInput(const std::string& name) const override;
   bool IsDenseTensorVectorInput(const std::string& name) const override;
 
diff --git a/paddle/phi/core/compat/arg_map_context.h b/paddle/phi/core/compat/arg_map_context.h
index 102dca48b99..cd7eb419f13 100644
--- a/paddle/phi/core/compat/arg_map_context.h
+++ b/paddle/phi/core/compat/arg_map_context.h
@@ -58,6 +58,18 @@ struct KernelSignature {
 
   // TODO(chenweihang): add assign constructor to solve windows compile
   // problem, remove it later
+  KernelSignature(const KernelSignature& other)
+      : name(other.name),
+        input_names(other.input_names),
+        attr_names(other.attr_names),
+        output_names(other.output_names) {}
+
+  KernelSignature(KernelSignature&& other) noexcept
+      : name(other.name),
+        input_names(std::move(other.input_names)),
+        attr_names(std::move(other.attr_names)),
+        output_names(std::move(other.output_names)) {}
+
   KernelSignature& operator=(const KernelSignature& other) {
     name = other.name;
     input_names = other.input_names;
@@ -65,6 +77,14 @@ struct KernelSignature {
     output_names = other.output_names;
     return *this;
   }
+
+  KernelSignature& operator=(KernelSignature&& other) noexcept {
+    name = other.name;
+    input_names.swap(other.input_names);
+    attr_names.swap(other.attr_names);
+    output_names.swap(other.output_names);
+    return *this;
+  }
 };
 
 std::ostream& operator<<(std::ostream& os, KernelSignature signature);
@@ -86,6 +106,7 @@ class ArgumentMappingContext {
   virtual size_t OutputSize(const std::string& name) const = 0;
 
   virtual bool IsDenseTensorInput(const std::string& name) const = 0;
+  virtual bool IsDenseTensorInputs(const std::string& name) const = 0;
   virtual bool IsSelectedRowsInput(const std::string& name) const = 0;
   // For compatibility with LoDTensorArray
   virtual bool IsDenseTensorVectorInput(const std::string& name) const = 0;
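
The KernelSignature changes add a move constructor and a noexcept move assignment so signatures returned by argument-mapping functions can transfer their name vectors instead of copying them on every dygraph op launch. A self-contained sketch of the same pattern, using std::vector where Paddle uses a small-vector type:

    #include <string>
    #include <utility>
    #include <vector>

    // Simplified stand-in for phi::KernelSignature.
    struct ToySignature {
      const char* name{nullptr};
      std::vector<std::string> input_names;
      std::vector<std::string> attr_names;
      std::vector<std::string> output_names;

      ToySignature() = default;
      ToySignature(const char* n, std::vector<std::string> in,
                   std::vector<std::string> attrs, std::vector<std::string> out)
          : name(n),
            input_names(std::move(in)),
            attr_names(std::move(attrs)),
            output_names(std::move(out)) {}

      // Move construct: steal the vectors' buffers; `name` is a raw
      // pointer, so copying it is already cheap.
      ToySignature(ToySignature&& other) noexcept
          : name(other.name),
            input_names(std::move(other.input_names)),
            attr_names(std::move(other.attr_names)),
            output_names(std::move(other.output_names)) {}

      // Move assign via swap: the moved-from object keeps the old
      // buffers, which is fine for a value about to be destroyed.
      ToySignature& operator=(ToySignature&& other) noexcept {
        name = other.name;
        input_names.swap(other.input_names);
        attr_names.swap(other.attr_names);
        output_names.swap(other.output_names);
        return *this;
      }
    };

    int main() {
      ToySignature a("add_n", {"X"}, {}, {"Out"});
      ToySignature b(std::move(a));  // no string copies on this path
      return b.input_names.empty() ? 1 : 0;
    }

Marking the move operations noexcept also lets standard containers move KernelSignature objects during reallocation rather than falling back to copies.
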
diff --git a/paddle/phi/infermeta/unary.cc b/paddle/phi/infermeta/unary.cc
index 7514f19ef48..144da3cc82f 100644
--- a/paddle/phi/infermeta/unary.cc
+++ b/paddle/phi/infermeta/unary.cc
@@ -2254,8 +2254,7 @@ void SumRawInferMeta(const MetaTensor& x,
   if (dtype != DataType::UNDEFINED) {
     out_dtype = dtype;
   } else {
-    if (x.dtype() == DataType::BOOL || x.dtype() == DataType::INT32 ||
-        x.dtype() == DataType::INT64) {
+    if (x.dtype() == DataType::BOOL || x.dtype() == DataType::INT32) {
       out_dtype = DataType::INT64;
     } else {
       out_dtype = x.dtype();
diff --git a/paddle/phi/kernels/cpu/reduce_sum_kernel.cc b/paddle/phi/kernels/cpu/reduce_sum_kernel.cc
index 32b12ea6845..0b4c4b9f470 100644
--- a/paddle/phi/kernels/cpu/reduce_sum_kernel.cc
+++ b/paddle/phi/kernels/cpu/reduce_sum_kernel.cc
@@ -29,6 +29,9 @@ void SumRawKernel(const Context& dev_ctx,
                   bool reduce_all,
                   DataType out_dtype,
                   DenseTensor* out) {
+  if (out_dtype == DataType::UNDEFINED && out->dtype() != x.dtype()) {
+    out_dtype = out->dtype();
+  }
   phi::Reduce<CPUContext, T, phi::funcs::SumFunctor>(
       dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out);
 }
diff --git a/paddle/phi/kernels/gpu/reduce_sum_kernel.cu b/paddle/phi/kernels/gpu/reduce_sum_kernel.cu
index 28bdbd009bd..918d9b0b65e 100644
--- a/paddle/phi/kernels/gpu/reduce_sum_kernel.cu
+++ b/paddle/phi/kernels/gpu/reduce_sum_kernel.cu
@@ -27,6 +27,9 @@ void SumRawKernel(const Context& dev_ctx,
                   bool reduce_all,
                   DataType out_dtype,
                   DenseTensor* out) {
+  if (out_dtype == DataType::UNDEFINED && out->dtype() != x.dtype()) {
+    out_dtype = out->dtype();
+  }
   phi::Reduce<T, kps::AddFunctor, kps::IdentityFunctor>(
       dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out);
 }
diff --git a/paddle/phi/ops/compat/sum_sig.cc b/paddle/phi/ops/compat/sum_sig.cc
index 4364047b0e6..d71111408f8 100644
--- a/paddle/phi/ops/compat/sum_sig.cc
+++ b/paddle/phi/ops/compat/sum_sig.cc
@@ -18,7 +18,7 @@ namespace phi {
 
 KernelSignature SumOpArgumentMapping(const ArgumentMappingContext& ctx) {
-  if (ctx.IsDenseTensorInput("X")) {
+  if (ctx.IsDenseTensorInputs("X")) {
     return KernelSignature("add_n", {"X"}, {}, {"Out"});
   }
   return KernelSignature("unregistered", {}, {}, {});
 }
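
Taken together, SumRawInferMeta, ReduceSumVarTypeInference, and the two kernels implement one promotion rule: an explicit dtype wins, otherwise bool and int32 sums promote to int64. The explicit int64 branch was dropped from SumRawInferMeta because int64 maps to itself either way. A compact restatement of the rule, with DataType as a stand-in enum:

    #include <cassert>

    enum class DataType { UNDEFINED, BOOL, INT32, INT64, FLOAT32 };

    // Output dtype rule for sum: an explicit dtype wins; otherwise bool
    // and int32 inputs promote to int64 to avoid overflow, and everything
    // else keeps the input dtype.
    DataType SumOutDtype(DataType x, DataType requested) {
      if (requested != DataType::UNDEFINED) return requested;
      if (x == DataType::BOOL || x == DataType::INT32) return DataType::INT64;
      return x;
    }

    int main() {
      assert(SumOutDtype(DataType::INT32, DataType::UNDEFINED) ==
             DataType::INT64);
      assert(SumOutDtype(DataType::INT64, DataType::UNDEFINED) ==
             DataType::INT64);  // identity, no branch needed
      assert(SumOutDtype(DataType::FLOAT32, DataType::UNDEFINED) ==
             DataType::FLOAT32);
      assert(SumOutDtype(DataType::INT32, DataType::FLOAT32) ==
             DataType::FLOAT32);
      return 0;
    }
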
diff --git a/paddle/phi/tests/ops/test_op_signature.h b/paddle/phi/tests/ops/test_op_signature.h
index 4a84793527e..1535f40b700 100644
--- a/paddle/phi/tests/ops/test_op_signature.h
+++ b/paddle/phi/tests/ops/test_op_signature.h
@@ -68,6 +68,10 @@ class TestArgumentMappingContext : public phi::ArgumentMappingContext {
     return dense_tensor_inputs.count(name) > 0;
   }
 
+  bool IsDenseTensorInputs(const std::string& name) const override {
+    return dense_tensor_inputs.count(name) > 0;
+  }
+
   bool IsSelectedRowsInput(const std::string& name) const override {
     return selected_rows_inputs.count(name) > 0;
   }
diff --git a/python/paddle/tensor/math.py b/python/paddle/tensor/math.py
index 217b8258a7e..ede3bcad2f3 100644
--- a/python/paddle/tensor/math.py
+++ b/python/paddle/tensor/math.py
@@ -899,15 +899,10 @@ def sum(x, axis=None, dtype=None, keepdim=False, name=None):
     else:
         reduce_all_flag = False
 
-    def get_dtype(x, dtype):
-        if dtype is not None:
-            return (True, dtype)
-        src_type = convert_dtype(x.dtype)
-        if src_type in ['bool','int32', 'int64']:
-            return (True, 'int64')
-        return (False, src_type)
-
-    dtype_flag, dtype = get_dtype(x, dtype)
+    dtype_flag = False
+    if dtype is not None:
+        dtype_flag = True
+        dtype = convert_np_dtype_to_dtype_(dtype)
 
     if in_dygraph_mode():
         if reduce_all_flag:
@@ -915,17 +910,14 @@ def sum(x, axis=None, dtype=None, keepdim=False, name=None):
         else:
             axis = axis if axis != None and axis != [] else [0]
 
-        out_dtype = convert_np_dtype_to_dtype_(dtype)
-        out = _C_ops.final_state_sum(x, axis, out_dtype, keepdim)
-        return out
+        return _C_ops.final_state_sum(x, axis, dtype, keepdim)
 
     if _in_legacy_dygraph():
         axis = axis if axis != None and axis != [] else [0]
         if dtype_flag:
             return _C_ops.reduce_sum(x, 'dim', axis, 'keep_dim', keepdim,
                                      'reduce_all', reduce_all_flag, 'in_dtype',
-                                     x.dtype, 'out_dtype',
-                                     convert_np_dtype_to_dtype_(dtype))
+                                     x.dtype, 'out_dtype', dtype)
         else:
             return _C_ops.reduce_sum(x, 'dim', axis, 'keep_dim', keepdim,
                                      'reduce_all', reduce_all_flag)
@@ -939,7 +931,7 @@ def sum(x, axis=None, dtype=None, keepdim=False, name=None):
     if dtype_flag:
         attrs.update({
             'in_dtype': x.dtype,
-            'out_dtype': convert_np_dtype_to_dtype_(dtype)
+            'out_dtype': dtype
         })
 
     check_variable_and_dtype(
@@ -953,7 +945,7 @@ def sum(x, axis=None, dtype=None, keepdim=False, name=None):
     helper = LayerHelper('sum', **locals())
     if dtype_flag:
         out = helper.create_variable_for_type_inference(
-            dtype=convert_np_dtype_to_dtype_(dtype))
+            dtype=dtype)
     else:
         out = helper.create_variable_for_type_inference(dtype=x.dtype)
     helper.append_op(
--
GitLab
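
With the get_dtype helper removed, python/paddle/tensor/math.py no longer computes the promoted dtype in Python: when the caller passes no dtype, CastPyArg2DataType (patched above) turns Py_None into DataType::UNDEFINED, SumRawInferMeta sets the promoted output dtype, and the guard added to SumRawKernel picks that dtype up. A sketch of the kernel-side fallback under those assumptions, with toy types standing in for DenseTensor:

    #include <cassert>

    enum class DataType { UNDEFINED, INT32, INT64 };

    struct ToyTensor {  // stand-in for phi::DenseTensor
      DataType dtype;
    };

    // Mirrors the guard added to SumRawKernel: when no explicit out_dtype
    // reaches the kernel but infermeta already promoted the output (e.g.
    // int32 -> int64), reduce in the promoted type instead of x's type.
    DataType ResolveReduceDtype(const ToyTensor& x, const ToyTensor& out,
                                DataType out_dtype) {
      if (out_dtype == DataType::UNDEFINED && out.dtype != x.dtype) {
        out_dtype = out.dtype;
      }
      return out_dtype;
    }

    int main() {
      ToyTensor x{DataType::INT32}, out{DataType::INT64};
      assert(ResolveReduceDtype(x, out, DataType::UNDEFINED) ==
             DataType::INT64);
      return 0;
    }
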