[NewIR]Change feed list to variable list && support GPU (#55401)

* add feed with place op * remove useless unitest * udpate mkldnn * update * new ir support builtin slice op * fix phi kernel adaptor bug * add enable_static * remove useless test case * change feed list to single variable * support gpu * fix bug * remove template * add more data type * fix cimpile bug

[NewIR]Change feed list to variable list && support GPU (#55401)
* add feed with place op * remove useless unitest * udpate mkldnn * update * new ir support builtin slice op * fix phi kernel adaptor bug * add enable_static * remove useless test case * change feed list to single variable * support gpu * fix bug * remove template * add more data type * fix cimpile bug
75517841 · hong · GitHub · 7eeff7b1 · 75517841 · 75517841
14 changed file
--- a/paddle/fluid/framework/feed_fetch_method.cc
+++ b/paddle/fluid/framework/feed_fetch_method.cc
@@ -18,6 +18,8 @@ limitations under the License. */
 #include "glog/logging.h"
+PHI_DECLARE_bool(enable_new_ir_in_executor);
 namespace phi {
 class DenseTensor;
 }  // namespace phi
@@ -34,6 +36,19 @@ void SetFeedVariable(Scope* scope,
  // If var_name Variable is not found in GlobalScope, a new variable will
  // be created.
  VLOG(3) << "SetFeedVariable name=" << var_name << " index=" << index;
+  if (FLAGS_enable_new_ir_in_executor) {
+    // shared data with input tensor
+    auto inner_var_name = var_name + "_" + std::to_string(index);
+    auto feed_ele = scope->Var(inner_var_name);
+    if (!feed_ele->IsType<phi::DenseTensor>()) {
+      VLOG(3) << "Reset " << inner_var_name << " to phi::DenseTensor";
+      feed_ele->Clear();
+    }
+    auto val = feed_ele->GetMutable<phi::DenseTensor>();
+    val->ShareDataWith(input);
+    // set lod
+    val->set_lod(input.lod());
+  } else {
    Variable* g_feed_value = scope->Var(var_name);
    auto& feed_inputs = *(g_feed_value->GetMutable<FeedList>());
    if (index >= feed_inputs.size()) {
@@ -44,6 +59,7 @@ void SetFeedVariable(Scope* scope,
    val.ShareDataWith(input);
    // set lod
    val.set_lod(input.lod());
+  }
 }
 void SetFeedVariable(Scope* scope,

--- a/paddle/fluid/framework/new_executor/standalone_executor.cc
+++ b/paddle/fluid/framework/new_executor/standalone_executor.cc
@@ -62,11 +62,11 @@ StandaloneExecutor::StandaloneExecutor(const platform::Place& place,
    execution_config.skip_gc_vars = job->SkipGcVars();
    // TODO(phlrain) we only support cpu for now
-    if (FLAGS_enable_new_ir_in_executor && platform::is_cpu_place(place)) {
+    if (FLAGS_enable_new_ir_in_executor) {
      VLOG(6) << "begin to translate" << std::endl;
      auto base_program = paddle::TranslateLegacyProgramToProgram(*program);
      auto kernel_program =
-          paddle::dialect::PdOpLowerToKernelPass(base_program.get());
+          paddle::dialect::PdOpLowerToKernelPass(base_program.get(), place);
      interpretercores_.emplace_back(std::make_shared<InterpreterCore>(
          place_, std::move(kernel_program), scope_, execution_config));
    } else {

--- a/paddle/fluid/ir/dialect/pd_op.yaml
+++ b/paddle/fluid/ir/dialect/pd_op.yaml
@@ -227,3 +227,30 @@
  inplace: null
  view: null
  backward: null
+- name: shaddow_feed
+  inputs:
+  - typename: Tensor
+    name: x
+    optional: false
+    no_need_buffer: false
+    data_transform: {}
+  attrs: []
+  outputs:
+    - {typename: Tensor, name: out, optional: false, intermediate: false}
+  no_need_buffer: null
+  data_transform: null
+  infer_meta:
+    func: UnchangedInferMeta
+    param: [x]
+  kernel:
+    func: [shaddow_feed]
+    param: [x]
+    backend: null
+    layout: null
+    data_type: null
+    dispatch: {fetch: null}
+    force_backend: null
+  inplace: null
+  backward: null
--- a/paddle/fluid/ir/phi_kernel_adaptor/phi_kernel_util.cc
+++ b/paddle/fluid/ir/phi_kernel_adaptor/phi_kernel_util.cc
@@ -193,26 +193,13 @@ void HandleForSpecialOp(
  if (op_name == "pd.feed") {
    auto value = op->result(0);
-    auto var = CreateVar(value,
+    VLOG(6) << "link feed output to feed in variable" << inner_scope;
-                         inner_scope,
-                         var_name_prefix,
-                         false,
-                         value_2_var_name,
-                         variable_2_var_name,
-                         var_name_2_id,
-                         variable_list);
-    // TODO(phlrain): need to update here, support StringTensor
-    auto out_tensor = var->GetMutable<phi::DenseTensor>();
-    auto feed_var =
-        const_cast<paddle::framework::Scope*>(inner_scope->root())->Var("feed");
-    VLOG(6) << "Create var: feed in scope " << inner_scope->root();
    int index =
        op->attributes().at("col").dyn_cast<ir::Int32Attribute>().data();
-    auto feed_list = feed_var->Get<paddle::framework::FeedList>();
-    auto& in_tensor = (PADDLE_GET(phi::DenseTensor, feed_list.at(index)));
+    auto feed_var_name = "feed_" + std::to_string(index);
-    out_tensor->ShareDataWith(in_tensor);
+    value_2_var_name->emplace(value, feed_var_name);
-    out_tensor->set_lod(in_tensor.lod());
  }
  if (op_name == "builtin.combine") {

--- a/paddle/fluid/ir/transforms/pd_op_to_kernel_pass.cc
+++ b/paddle/fluid/ir/transforms/pd_op_to_kernel_pass.cc
@@ -53,7 +53,7 @@ phi::KernelKey GetKernelKey(
    ir::Operation* op,
    const phi::Place& place,
    const std::unordered_map<ir::Value, ir::OpResult>& map_value_pair,
-    const dialect::OpYamlInfoParser* op_info_parser = nullptr) {
+    std::unique_ptr<dialect::OpYamlInfoParser> op_info_parser = nullptr) {
  if (op->name() == "pd.feed") {
    // NOTE, for now feed op don't need a kernel, so the data type from Op
    // Result the next op use base program datatype
@@ -223,11 +223,11 @@ phi::KernelKey GetKernelKey(
  return res;
 }
-std::unique_ptr<ir::Program> PdOpLowerToKernelPass(ir::Program* prog) {
+std::unique_ptr<ir::Program> PdOpLowerToKernelPass(ir::Program* prog,
+                                                   phi::Place place) {
  auto program = std::make_unique<ir::Program>(ir::IrContext::Instance());
  auto block = prog->block();
-  phi::Place cpu_place(phi::AllocationType::CPU);
  ir::IrContext* ctx = ir::IrContext::Instance();
  ctx->GetOrRegisterDialect<paddle::dialect::PaddleDialect>();
@@ -244,14 +244,19 @@ std::unique_ptr<ir::Program> PdOpLowerToKernelPass(ir::Program* prog) {
    VLOG(6) << "op name " << (*it)->name();
    paddle::dialect::OpYamlInfoInterface op_info_interface =
        (*it)->dyn_cast<paddle::dialect::OpYamlInfoInterface>();
-    OpYamlInfoParser* op_info_parser = nullptr;
+    std::unique_ptr<OpYamlInfoParser> op_info_parser;
    if (op_info_interface) {
-      op_info_parser = new OpYamlInfoParser(op_info_interface.GetOpInfo());
+      op_info_parser.reset(new OpYamlInfoParser(op_info_interface.GetOpInfo()));
    }
+    std::string kernel_fn_str;
+    if (op_info_parser != nullptr) {
+      kernel_fn_str = op_info_parser->OpRuntimeInfo().kernel_func[0];
+    }
    auto kernel_key =
-        GetKernelKey(*it, cpu_place, map_value_pair, op_info_parser);
+        GetKernelKey(*it, place, map_value_pair, std::move(op_info_parser));
    VLOG(6) << "kernel type " << kernel_key;
-    // create new Op
    // only for single output
    // need update new kernel key layout and data tyep
@@ -305,11 +310,6 @@ std::unique_ptr<ir::Program> PdOpLowerToKernelPass(ir::Program* prog) {
    // constuct input
    std::vector<ir::OpResult> vec_inputs;
-    std::string kernel_fn_str;
-    if (op_info_parser != nullptr) {
-      kernel_fn_str = op_info_parser->OpRuntimeInfo().kernel_func[0];
-    }
    if ((*it)->num_operands() > 0) {
      for (size_t i = 0; i < (*it)->num_operands(); ++i) {
        auto cur_in = (*it)->operand(i);
@@ -404,6 +404,35 @@ std::unique_ptr<ir::Program> PdOpLowerToKernelPass(ir::Program* prog) {
    }
    program->block()->push_back(op);
+    if ((*it)->name() == "pd.feed" && platform::is_gpu_place(place)) {
+      // add shaddow feed op
+      phi::KernelKey shaddow_key{
+          phi::Backend::GPU,
+          phi::DataLayout::ANY,
+          TransToPhiDataType(
+              (*it)->result(0).type().dyn_cast<DenseTensorType>().dtype())};
+      std::unordered_map<std::string, ir::Attribute> attr_map{
+          {"op_name", ir::StrAttribute::get(ctx, "pd.shaddow_feed")},
+          {"kernel_name", ir::StrAttribute::get(ctx, "shaddow_feed")},
+          {"kernel_key", dialect::KernelAttribute::get(ctx, shaddow_key)}};
+      auto out_type = paddle::dialect::AllocatedDenseTensorType::get(
+          ctx,
+          phi::TransToPhiPlace(shaddow_key.backend()),
+          (*it)->result(0).type().dyn_cast<dialect::DenseTensorType>());
+      ir::Operation* shaddow_op =
+          ir::Operation::Create({op->result(0)}, attr_map, {out_type}, op_info);
+      map_op_pair[*it] = shaddow_op;
+      program->block()->push_back(shaddow_op);
+      if ((*it)->num_results() > 0) {
+        for (size_t i = 0; i < shaddow_op->num_results(); ++i) {
+          map_value_pair[(*it)->result(i)] = shaddow_op->result(i);
+        }
+      }
+    }
  }
  return program;

--- a/paddle/fluid/ir/transforms/pd_op_to_kernel_pass.h
+++ b/paddle/fluid/ir/transforms/pd_op_to_kernel_pass.h
@@ -14,11 +14,13 @@
 #pragma once
 #include "paddle/ir/core/program.h"
+#include "paddle/phi/common/place.h"
 namespace paddle {
 namespace dialect {
-std::unique_ptr<ir::Program> PdOpLowerToKernelPass(ir::Program* prog);
+std::unique_ptr<ir::Program> PdOpLowerToKernelPass(
+    ir::Program* prog, phi::Place place = phi::CPUPlace());
 }  // namespace dialect
 }  // namespace paddle
--- a/paddle/phi/kernels/cpu/feed_with_place_kernel.cc
+++ b/paddle/phi/kernels/cpu/feed_with_place_kernel.cc
@@ -16,6 +16,7 @@
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/feed_with_place_impl.h"
 namespace phi {
@@ -26,11 +27,20 @@ void FeedWithPlaceKernel(const Context& ctx,
                         DenseTensor* out) {}
 }  // namespace phi
-PD_REGISTER_KERNEL(feed_with_place,
+PD_REGISTER_KERNEL(
+    feed_with_place, CPU, ALL_LAYOUT, phi::FeedWithPlaceKernel, float) {}
+PD_REGISTER_KERNEL(shaddow_feed,
                   CPU,
                   ALL_LAYOUT,
-                   phi::FeedWithPlaceKernel,
+                   phi::ShaddowFeedKernel,
+                   bool,
                   float,
                   int32_t,
                   int64_t,
-                   double) {}
+                   double,
+                   phi::float16,
+                   phi::bfloat16,
+                   phi::complex64,
+                   phi::complex128) {}
--- a/paddle/phi/kernels/cpu/fetch_kernel.cc
+++ b/paddle/phi/kernels/cpu/fetch_kernel.cc
@@ -16,17 +16,8 @@
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/fetch_impl.h"
-namespace phi {
-template <typename T, typename Context>
-void FetchKernel(const Context& dev_ctx,
-                 const DenseTensor& x,
-                 DenseTensor* out) {
-  phi::Copy(dev_ctx, x, phi::CPUPlace(), true, out);
-  out->set_lod(x.lod());
-}
-}  // namespace phi
 PD_REGISTER_KERNEL(fetch,
                   CPU,
                   ALL_LAYOUT,

--- a/paddle/phi/kernels/feed_with_place_kernel.h
+++ b/paddle/phi/kernels/feed_with_place_kernel.h
@@ -24,4 +24,9 @@ void FeedWithPlaceKernel(const Context& ctx,
                         phi::DataType data_type,
                         DenseTensor* out);
+template <typename T, typename Context>
+void ShaddowFeedKernel(const Context& ctx,
+                       const DenseTensor& x,
+                       DenseTensor* out);
 }  // namespace phi
--- a/paddle/phi/kernels/gpu/feed_with_place_kernel.cu
+++ b/paddle/phi/kernels/gpu/feed_with_place_kernel.cu
+// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/phi/kernels/feed_with_place_kernel.h"
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/feed_with_place_impl.h"
+PD_REGISTER_KERNEL(shaddow_feed,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::ShaddowFeedKernel,
+                   bool,
+                   float,
+                   int32_t,
+                   int64_t,
+                   double,
+                   phi::float16,
+                   phi::bfloat16,
+                   phi::complex64,
+                   phi::complex128) {}
--- a/paddle/phi/kernels/gpu/fetch_kernel.cu
+++ b/paddle/phi/kernels/gpu/fetch_kernel.cu
+// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/phi/kernels/fetch_kernel.h"
+#include "paddle/phi/kernels/impl/fetch_impl.h"
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+PD_REGISTER_KERNEL(fetch,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::FetchKernel,
+                   float,
+                   double,
+                   int,
+                   int64_t,
+                   uint8_t,
+                   int8_t,
+                   int16_t,
+                   phi::float16,
+                   phi::bfloat16,
+                   phi::dtype::complex<float>,
+                   phi::dtype::complex<double>,
+                   bool) {}
--- a/paddle/phi/kernels/impl/feed_with_place_impl.h
+++ b/paddle/phi/kernels/impl/feed_with_place_impl.h
+// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include "paddle/phi/core/dense_tensor.h"
+#include "paddle/phi/core/tensor_utils.h"
+namespace phi {
+template <typename T, typename Context>
+void ShaddowFeedKernel(const Context& ctx,
+                       const DenseTensor& x,
+                       DenseTensor* out) {
+  ctx.template Alloc<T>(out);
+  if (x.place() == out->place()) {
+    out->ShareDataWith(x);
+    out->set_lod(x.lod());
+  } else {
+    phi::Copy<Context>(ctx, x, ctx.GetPlace(), true, out);
+  }
+}
+}  // namespace phi
--- a/paddle/phi/kernels/impl/fetch_impl.h
+++ b/paddle/phi/kernels/impl/fetch_impl.h
+// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include "paddle/phi/core/dense_tensor.h"
+#include "paddle/phi/core/tensor_utils.h"
+namespace phi {
+template <typename T, typename Context>
+void FetchKernel(const Context& ctx, const DenseTensor& x, DenseTensor* out) {
+  phi::Copy(ctx, x, phi::CPUPlace(), true, out);
+}
+}  // namespace phi
--- a/test/ir/new_ir/test_standalone_new_ir.py
+++ b/test/ir/new_ir/test_standalone_new_ir.py
@@ -24,7 +24,11 @@ paddle.enable_static()
 class TestNewIr(unittest.TestCase):
    def test_with_new_ir(self):
-        place = paddle.CPUPlace()
+        place = (
+            paddle.CUDAPlace(0)
+            if paddle.is_compiled_with_cuda()
+            else paddle.CPUPlace()
+        )
        exe = paddle.static.Executor(place)
        main_program = paddle.static.Program()
@@ -44,7 +48,11 @@ class TestNewIr(unittest.TestCase):
 class TestCombineOp(unittest.TestCase):
    def test_with_new_ir(self):
-        place = paddle.CPUPlace()
+        place = (
+            paddle.CUDAPlace(0)
+            if paddle.is_compiled_with_cuda()
+            else paddle.CPUPlace()
+        )
        exe = paddle.static.Executor(place)
        main_program = paddle.static.Program()
@@ -64,7 +72,11 @@ class TestCombineOp(unittest.TestCase):
 class TestFeedOp(unittest.TestCase):
    def test_with_new_ir(self):
-        place = paddle.CPUPlace()
+        place = (
+            paddle.CUDAPlace(0)
+            if paddle.is_compiled_with_cuda()
+            else paddle.CPUPlace()
+        )
        exe = paddle.static.Executor(place)
        main_program = paddle.static.Program()
@@ -91,6 +103,8 @@ class TestFeedOp(unittest.TestCase):
 class TestSelectedRows(unittest.TestCase):
    def test_with_new_ir(self):
+        # TODO(phlrain): support selected rows in GPU
+        # place = paddle.CUDAPlace(0) if paddle.is_compiled_with_cuda() else paddle.CPUPlace()
        place = paddle.CPUPlace()
        exe = paddle.static.Executor(place)
@@ -113,7 +127,11 @@ class TestSelectedRows(unittest.TestCase):
 class TestAddGradOp(unittest.TestCase):
    def test_with_new_ir(self):
-        place = paddle.CPUPlace()
+        place = (
+            paddle.CUDAPlace(0)
+            if paddle.is_compiled_with_cuda()
+            else paddle.CPUPlace()
+        )
        exe = paddle.static.Executor(place)
        main_program = paddle.static.Program()
@@ -143,7 +161,11 @@ class TestAddGradOp(unittest.TestCase):
 class TestSplitOp(unittest.TestCase):
    def test_with_new_ir(self):
-        place = paddle.CPUPlace()
+        place = (
+            paddle.CUDAPlace(0)
+            if paddle.is_compiled_with_cuda()
+            else paddle.CPUPlace()
+        )
        exe = paddle.static.Executor(place)
        main_program = paddle.static.Program()