Unverified commit b850acb2, authored by: H hong, committed by: GitHub

[NewIR]support new ir load combine (#56101)

* support new ir load combine

* update

* polish code

* remove print

* polish code

* fix bug

* polish code

* fix compile bug
Parent a2fe1e24
......@@ -21,7 +21,6 @@
#include "paddle/fluid/framework/new_executor/new_executor_defs.h"
#include "paddle/fluid/platform/event.h"
#include "paddle/ir/core/value.h"
namespace ir {
class Value;
......
......@@ -12,13 +12,13 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/new_executor/instruction/instruction_util.h"
#include <map>
#include <string>
#include <unordered_map>
#include <vector>
#include "paddle/fluid/framework/new_executor/instruction/instruction_util.h"
#include "paddle/fluid/framework/new_executor/new_executor_defs.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/event.h"
......@@ -42,7 +42,7 @@ std::vector<int> GetValueIds(
const std::unordered_map<const paddle::framework::Variable*, std::string>&
variable_2_var_name) {
std::vector<int> ids;
std::string var_name = value_2_var_name.at(value);
auto& var_name = value_2_var_name.at(value);
ids.push_back(var_name_2_id.at(var_name));
// NOTE(zhangbo): Value maybe a VariableRefArray
auto var = inner_scope->FindVar(var_name);
......@@ -61,7 +61,7 @@ platform::DeviceContext* ParseDeviceContext(
const platform::Place& place,
const std::string& execution_stream,
const int stream_priority) {
auto op_attributes = op->attributes();
auto& op_attributes = op->attributes();
auto op_name =
op_attributes.at("op_name").dyn_cast<::ir::StrAttribute>().AsString();
interpreter::ContextManager& ctx_manager =
......@@ -149,7 +149,7 @@ OpFuncType AnalyseOpFuncType(::ir::Operation* op,
// computing. They execute serially in device thread and block CUDA kernel
// launching in other GPU OPs. To improve performance, set them as kGpuSync
// and so that they would be dispatched to host thread.
auto op_attributes = op->attributes();
auto& op_attributes = op->attributes();
auto op_name =
op_attributes.at("op_name").dyn_cast<::ir::StrAttribute>().AsString();
if (op_name == kCoalesceTensor &&
......
......@@ -43,7 +43,7 @@ LegacyKernelInstruction::LegacyKernelInstruction(
const std::unordered_map<const paddle::framework::Variable*, std::string>&
variable_2_var_name)
: InstructionBase(id, place) {
auto op_attributes = op->attributes();
auto& op_attributes = op->attributes();
auto op_name =
op_attributes.at("op_name").dyn_cast<::ir::StrAttribute>().AsString();
ir::OpInfo op_info = ir::IrContext::Instance()->GetRegisteredOpInfo(op_name);
......@@ -97,18 +97,20 @@ LegacyKernelInstruction::LegacyKernelInstruction(
yaml_interface->get_op_info_());
VLOG(6) << "finish process yaml_info_parser";
::ir::BuildPhiContext<
phi::InferMetaContext,
phi::MetaTensor,
phi::MetaTensor,
paddle::small_vector<phi::MetaTensor, phi::kInputSmallVectorSize>,
paddle::small_vector<phi::MetaTensor, phi::kInputSmallVectorSize>,
false>(op,
value_2_var_name,
scope,
local_scope,
yaml_info_parser,
&infer_meta_context_);
if (infer_meta_interface_) {
::ir::BuildPhiContext<
phi::InferMetaContext,
phi::MetaTensor,
phi::MetaTensor,
paddle::small_vector<phi::MetaTensor, phi::kInputSmallVectorSize>,
paddle::small_vector<phi::MetaTensor, phi::kInputSmallVectorSize>,
false>(op,
value_2_var_name,
scope,
local_scope,
yaml_info_parser,
&infer_meta_context_);
}
VLOG(6) << "finish process infer meta context";
auto kernel_name =
......@@ -123,8 +125,10 @@ LegacyKernelInstruction::LegacyKernelInstruction(
phi_kernel_->IsValid(), true, "not found kernel for [%s]", kernel_name);
VLOG(6) << "finish process select kernel";
operator_base_ =
ir::BuildOperatorBase(op, value_2_var_name, yaml_info_parser);
Scope* inner_scope = local_scope == nullptr ? scope : local_scope;
operator_base_ = ir::BuildOperatorBase(
op, value_2_var_name, yaml_info_parser, variable_2_var_name, inner_scope);
paddle::framework::VariableValueMap in_map;
paddle::framework::VariableValueMap out_map;
auto dev_ctx = phi::DeviceContextPool::Instance().Get(
......@@ -151,7 +155,6 @@ LegacyKernelInstruction::LegacyKernelInstruction(
GetStreamPriority()));
VLOG(6) << "finish process device context";
Scope* inner_scope = local_scope == nullptr ? scope : local_scope;
InitInputsOutputsIds(
op, inner_scope, value_2_var_name, var_name_2_id, variable_2_var_name);
VLOG(6) << "finish process inputs outputs index";
......@@ -169,10 +172,16 @@ LegacyKernelInstruction::~LegacyKernelInstruction() {
if (kernel_context_ != nullptr) {
delete kernel_context_;
}
if (phi_kernel_ != nullptr) {
delete phi_kernel_;
}
}
void LegacyKernelInstruction::Run() {
infer_meta_interface_->infer_meta_(&(infer_meta_context_));
if (infer_meta_interface_) {
infer_meta_interface_->infer_meta_(&(infer_meta_context_));
}
VLOG(6) << "Run op " << legacy_op_name_ << " infer meta.";
(*(phi_kernel_))((kernel_context_));
VLOG(6) << "Run op " << legacy_op_name_ << " kernel.";
......
......@@ -1076,36 +1076,17 @@ void BuildOpFuncList(
"not found kernel for [%s]",
kernel_name);
if (kernel_name == "fused_softmax_mask_upper_triangle" ||
kernel_name == "fused_softmax_mask_upper_triangle_grad") {
// builder operator
op_func_node.operator_base_ =
ir::BuildOperatorBase(op, value_2_name_map, op_yaml_info_parser);
paddle::framework::VariableValueMap in_map;
paddle::framework::VariableValueMap out_map;
op_func_node.runtime_ctx_ =
std::make_shared<paddle::framework::RuntimeContext>(
paddle::framework::RuntimeContext(in_map, out_map));
ir::BuildRuntimeContext(op,
value_2_name_map,
scope,
local_scope,
op_yaml_info_parser,
op_func_node.runtime_ctx_.get());
op_func_node.fluid_op = true;
} else {
::ir::BuildPhiContext<phi::KernelContext,
const phi::TensorBase*,
phi::TensorBase*,
paddle::small_vector<const phi::TensorBase*>,
paddle::small_vector<phi::TensorBase*>,
true>(op,
value_2_name_map,
scope,
local_scope,
op_yaml_info_parser,
&(op_func_node.kernel_context_));
}
::ir::BuildPhiContext<phi::KernelContext,
const phi::TensorBase*,
phi::TensorBase*,
paddle::small_vector<const phi::TensorBase*>,
paddle::small_vector<phi::TensorBase*>,
true>(op,
value_2_name_map,
scope,
local_scope,
op_yaml_info_parser,
&(op_func_node.kernel_context_));
VLOG(6) << "finish process kernel context";
op_func_node.kernel_context_.SetDeviceContext(
......
......@@ -38,6 +38,7 @@
#include "paddle/fluid/framework/new_executor/instruction/legacy_kernel_instruction.h"
#include "paddle/fluid/framework/new_executor/instruction/phi_kernel_instruction.h"
#include "paddle/fluid/ir/dialect/utils.h"
#include "paddle/fluid/ir/phi_kernel_adaptor/phi_kernel_util.h"
#include "paddle/ir/core/builtin_attribute.h"
......@@ -428,8 +429,7 @@ void NewIRInterpreter::BuildInstruction() {
}
VLOG(6) << "process " << op_name;
if (op_name == "pd.fused_softmax_mask_upper_triangle" ||
op_name == "pd.fused_softmax_mask_upper_triangle_grad") {
if (dialect::IsLegacyOp(op_name)) {
vec_instruction_base_.emplace_back(
std::make_unique<LegacyKernelInstruction>(op_idx++,
place_,
......
......@@ -103,7 +103,7 @@
- {typename: bool, name: load_as_fp16}
- {typename: bool, name: model_from_memory}
outputs:
- {typename: 'Tensor[]', name: out, optional: true, intermediate: false}
- {typename: 'Tensor[]', name: Out, optional: true, intermediate: false}
no_need_buffer: null
data_transform: null
kernel:
......
......@@ -17,6 +17,11 @@
namespace paddle {
namespace dialect {
const std::unordered_set<std::string> LegacyOpList = {
"pd.fused_softmax_mask_upper_triangle",
"pd.fused_softmax_mask_upper_triangle_grad",
"pd.load_combine"};
enum class AttrType {
UNDEFINED = 0,
BOOL,
......@@ -167,5 +172,7 @@ VariantType GetAttributeData(const ir::Attribute& attr) {
return kAttrCastMap[attr_type](attr);
}
bool IsLegacyOp(const std::string& name) { return LegacyOpList.count(name); }
} // namespace dialect
} // namespace paddle
......@@ -147,5 +147,7 @@ static inline ir::Attribute TransToIrAttribute(phi::Scalar scalar,
VariantType GetAttributeData(const ir::Attribute& attr);
bool IsLegacyOp(const std::string& name);
} // namespace dialect
} // namespace paddle
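For orientation (not part of the diff itself): the two hunks above add a small predicate in paddle::dialect that the other hunks in this commit branch on. A minimal consolidated sketch, assuming only the names visible in the diff:
#include <string>
#include <unordered_set>
namespace paddle {
namespace dialect {
// Ops that still go through the legacy fluid OperatorBase/RuntimeContext path;
// this commit adds "pd.load_combine" to the list.
const std::unordered_set<std::string> LegacyOpList = {
    "pd.fused_softmax_mask_upper_triangle",
    "pd.fused_softmax_mask_upper_triangle_grad",
    "pd.load_combine"};
// Replaces the hard-coded op-name comparisons used before this commit.
bool IsLegacyOp(const std::string& name) { return LegacyOpList.count(name); }
}  // namespace dialect
}  // namespace paddle
// Callers in this commit (see the interpreter and lowering-pass hunks below)
// test the predicate instead of comparing op names directly, e.g.
//   if (dialect::IsLegacyOp(op_name)) { /* build a LegacyKernelInstruction */ }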
......@@ -598,17 +598,39 @@ void BuildRuntimeContext(
PADDLE_ENFORCE_NOT_NULL(inner_scope->FindVar(in_var_name),
phi::errors::PreconditionNotMet(
"can not find var[%s] in scope", in_var_name));
auto var = inner_scope->FindVar(in_var_name);
std::vector<paddle::framework::Variable*> vec_tmp = {var};
auto legacy_attr_name = op_normalizer.GetLegacyArgName(fluid_op_name, name);
runtime_ctx->outputs[legacy_attr_name] = vec_tmp;
auto type = ptr.type();
auto legacy_arg_name = op_normalizer.GetLegacyArgName(fluid_op_name, name);
if (type.isa<paddle::dialect::AllocatedDenseTensorType>() ||
type.isa<paddle::dialect::AllocatedSelectedRowsType>()) {
std::vector<paddle::framework::Variable*> vec_tmp = {var};
runtime_ctx->outputs[legacy_arg_name] = vec_tmp;
} else if (type.isa<ir::VectorType>()) {
auto var_ref = var->Get<paddle::framework::VariableRefArray>();
std::vector<paddle::framework::Variable*> vec_tmp;
vec_tmp.reserve(var_ref.size());
for (size_t k = 0; k < var_ref.size(); ++k) {
vec_tmp.push_back(const_cast<paddle::framework::Variable*>(var_ref[k]));
}
runtime_ctx->outputs[legacy_arg_name] = vec_tmp;
} else {
PADDLE_THROW(phi::errors::Unimplemented(
"only support AllocatedDenseTensor, AllocatedSelectedRowsType and "
"ir::vector type"));
}
}
}
std::shared_ptr<paddle::framework::OperatorBase> BuildOperatorBase(
ir::Operation* op,
const std::unordered_map<ir::Value, std::string>& name_map,
const paddle::dialect::OpYamlInfoParser& op_yaml_info) {
const paddle::dialect::OpYamlInfoParser& op_yaml_info,
const std::unordered_map<const paddle::framework::Variable*, std::string>&
variable_2_var_name,
const paddle::framework::Scope* scope) {
paddle::framework::VariableNameMap in_name_map;
paddle::framework::VariableNameMap out_name_map;
paddle::framework::AttributeMap attr_map;
......@@ -637,6 +659,30 @@ std::shared_ptr<paddle::framework::OperatorBase> BuildOperatorBase(
}
// build attribute
auto& op_attr_map = op->attributes();
auto attr_name_list = op_yaml_info.AttrParams(true);
for (auto& name : attr_name_list) {
auto& val = op_attr_map.at(name);
if (val.isa<ir::StrAttribute>()) {
attr_map[name] = val.dyn_cast<ir::StrAttribute>().AsString();
} else if (val.isa<ir::Int32Attribute>()) {
attr_map[name] = val.dyn_cast<ir::Int32Attribute>().data();
} else if (val.isa<ir::BoolAttribute>()) {
attr_map[name] = val.dyn_cast<ir::BoolAttribute>().data();
} else if (val.isa<ir::FloatAttribute>()) {
attr_map[name] = val.dyn_cast<ir::FloatAttribute>().data();
} else if (val.isa<ir::DoubleAttribute>()) {
attr_map[name] = val.dyn_cast<ir::DoubleAttribute>().data();
} else if (val.isa<ir::Int64Attribute>()) {
attr_map[name] = val.dyn_cast<ir::Int64Attribute>().data();
} else {
std::stringstream ss;
val.Print(ss);
VLOG(1) << "type not support " << ss.str() << std::endl;
PADDLE_THROW("Type[%s] in attribute map not support yet", ss.str());
}
}
auto& output_name_list = op_yaml_info.OutputNames();
for (size_t i = 0; i < output_name_list.size(); ++i) {
......@@ -644,8 +690,26 @@ std::shared_ptr<paddle::framework::OperatorBase> BuildOperatorBase(
ir::Value ptr = op->result(i);
auto out_var_name = name_map.at(ptr);
auto legacy_attr_name = op_normalizer.GetLegacyArgName(fluid_op_name, name);
out_name_map[legacy_attr_name].push_back(out_var_name);
auto type = ptr.type();
auto legacy_arg_name = op_normalizer.GetLegacyArgName(fluid_op_name, name);
if (type.isa<paddle::dialect::AllocatedDenseTensorType>() ||
type.isa<paddle::dialect::AllocatedSelectedRowsType>()) {
out_name_map[legacy_arg_name].push_back(out_var_name);
} else if (type.isa<ir::VectorType>()) {
auto var = scope->FindVar(out_var_name);
auto var_ref = var->Get<paddle::framework::VariableRefArray>();
for (size_t k = 0; k < var_ref.size(); ++k) {
PADDLE_ENFORCE(variable_2_var_name.count(var_ref[k]),
"Variable MUST in variable_2_var_name map");
out_name_map[legacy_arg_name].push_back(
variable_2_var_name.at(var_ref[k]));
}
} else {
PADDLE_THROW(phi::errors::Unimplemented(
"only support AllocatedDenseTensor, AllocatedSelectedRowsType and "
"ir::vector type"));
}
}
auto& op_info = paddle::framework::OpInfoMap::Instance().Get(fluid_op_name);
......
......@@ -62,7 +62,10 @@ void BuildRuntimeContext(
std::shared_ptr<paddle::framework::OperatorBase> BuildOperatorBase(
ir::Operation* op,
const std::unordered_map<ir::Value, std::string>& name_map,
const paddle::dialect::OpYamlInfoParser& op_yaml_info);
const paddle::dialect::OpYamlInfoParser& op_yaml_info,
const std::unordered_map<const paddle::framework::Variable*, std::string>&
variable_2_var_name,
const paddle::framework::Scope* scope);
template <typename Context,
typename InType,
......
......@@ -59,10 +59,6 @@ const std::unordered_set<std::string> UnchangeOutputOps = {
"builtin.get_parameter",
"pd.shadow_output"};
const std::unordered_set<std::string> LegacyOpList = {
"pd.fused_softmax_mask_upper_triangle",
"pd.fused_softmax_mask_upper_triangle_grad"};
bool NeedFallBackCpu(const ir::Operation* op,
const std::string& kernel_fn_name,
const phi::KernelKey& kernel_key) {
......@@ -553,6 +549,9 @@ std::unique_ptr<ir::Program> PdOpLowerToKernelPass(ir::Program* prog,
GetKernelKey(op_item, place, map_value_pair, op_info_parser.get());
VLOG(6) << "kernel type " << kernel_key;
if (op_item->name() == "pd.load_combine") {
kernel_key.set_dtype(phi::DataType::FLOAT32);
}
if (NeedFallBackCpu((op_item), kernel_fn_str, kernel_key)) {
kernel_key.set_backend(phi::Backend::CPU);
}
......@@ -571,7 +570,7 @@ std::unique_ptr<ir::Program> PdOpLowerToKernelPass(ir::Program* prog,
auto args_def = phi_kernel.args_def();
auto output_defs = args_def.output_defs();
if (!UnchangeOutputOps.count(op_item->name()) &&
!LegacyOpList.count(op_item->name())) {
!IsLegacyOp(op_item->name())) {
PADDLE_ENFORCE_EQ(
op_item->num_results(),
output_defs.size(),
......@@ -583,7 +582,7 @@ std::unique_ptr<ir::Program> PdOpLowerToKernelPass(ir::Program* prog,
for (size_t i = 0; i < op_item->num_results(); ++i) {
phi::Place out_place;
if ((!UnchangeOutputOps.count(op_item->name())) &&
(!LegacyOpList.count(op_item->name())) && phi_kernel.IsValid()) {
(!IsLegacyOp(op_item->name())) && phi_kernel.IsValid()) {
out_place = phi::TransToPhiPlace(output_defs[i].backend);
} else {
out_place = phi::TransToPhiPlace(kernel_key.backend());
......
......@@ -12,7 +12,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import tempfile
import unittest
import numpy as np
......@@ -287,11 +288,18 @@ class TestNewIrPrint(unittest.TestCase):
class TestJitSaveOp(unittest.TestCase):
def setUp(self):
self.temp_dir = tempfile.TemporaryDirectory()
self.model_path = os.path.join(self.temp_dir.name, "new_ir_save_load")
def tearDown(self):
self.temp_dir.cleanup()
def test_with_new_ir(self):
paddle.disable_static()
linear = paddle.nn.Linear(10, 10)
path = "example_model/linear"
path = os.path.join(self.model_path, "linear")
paddle.jit.save(
linear,
......@@ -299,6 +307,26 @@ class TestJitSaveOp(unittest.TestCase):
input_spec=[paddle.static.InputSpec([10, 10], 'float32', 'x')],
)
paddle.enable_static()
place = (
paddle.CUDAPlace(0)
if paddle.is_compiled_with_cuda()
else paddle.CPUPlace()
)
exe = paddle.static.Executor(place)
[
inference_program,
feed_target_names,
fetch_targets,
] = paddle.static.io.load_inference_model(
self.model_path,
executor=exe,
model_filename="linear.pdmodel",
params_filename="linear.pdiparams",
)
if __name__ == "__main__":
paddle.enable_static()
......