Unverified commit 5c0b60ae authored by Ruibiao Chen, committed by GitHub

Remove DryRun in standalone executor (#54222)

Parent: ea8f1998
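This commit removes the DryRun/dry_run profiling entry points from InterpreterCore, StandaloneExecutor, and the pybind layer, and drops the CreateInterpreterCore helper in favor of constructing InterpreterCore directly. As a hedged illustration of the migration pattern, mirroring the test updates in this diff (place, prog, and scope stand in for caller-provided values):

// Old call sites went through the removed helper, which copied the program
// and appended fetch ops for fetch_names:
//   auto core = CreateInterpreterCore(place, prog, &scope, fetch_names);
// After this commit, callers construct the core directly on block 0:
std::shared_ptr<InterpreterCore> core = std::make_shared<InterpreterCore>(
    place, prog.Block(0), &scope, interpreter::ExecutionConfig());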
...
@@ -158,38 +158,6 @@ InterpreterCore::~InterpreterCore() {
 #endif
 }
-interpreter::CostInfo InterpreterCore::DryRun(
-    const std::vector<std::string>& feed_names,
-    const std::vector<phi::DenseTensor>& feed_tensors) {
-  SetDeviceId(place_);
-  CheckCUDAGraphBeforeRun(feed_names);
-
-  Prepare(feed_names, feed_tensors, true);
-
-  interpreter::CostInfo cost_info;
-  {
-    interpreter::ProfilerGuard(place_, &cost_info);
-
-    // For a program that only runs once, there is no need to create a
-    // work_queue, so the async_work_queue_ is not created until the
-    // second run.
-    async_work_queue_ = GetWorkQueue();
-
-    // Lazy initialization of gc; do not create gc if the program only runs
-    // once.
-    if (!gc_) {
-      gc_ = CreateInterpreterCoreGarbageCollector(place_, vec_instruction_);
-    }
-
-    ExecuteInstructionList(vec_instruction_);
-    platform::DeviceContextPool::Instance().Get(place_)->Wait();
-  }
-
-  if (HasLocalScope()) {
-    ClearLoDTensorArrayInLocalScope();
-  }
-
-  return cost_info;
-}
-
 void InterpreterCore::RunImpl() {
   // Lazy initialization of gc; do not create gc if the program only runs once.
   if (!gc_) {
...
@@ -1540,24 +1508,5 @@ void InterpreterCore::AnalyseExecuteOrderForTrace() {
   trace_execute_order_ = trace_order;
 }
-std::shared_ptr<InterpreterCore> CreateInterpreterCore(
-    const platform::Place& place,
-    const ProgramDesc& prog,
-    Scope* scope,
-    const std::vector<std::string>& fetch_names,
-    const interpreter::ExecutionConfig& execution_config) {
-  std::shared_ptr<InterpreterCore> core = nullptr;
-  // NOTE(Aurelius84): `AddFetch` will modify BlockDesc, so we should make a
-  // copy of the program.
-  auto new_prog = std::make_shared<framework::ProgramDesc>(prog);
-  auto* block = new_prog->MutableBlock(0);
-  interpreter::AddFetch(fetch_names, block);
-
-  core =
-      std::make_shared<InterpreterCore>(place, *block, scope, execution_config);
-  core->SetCopyProgram(new_prog);
-  return core;
-}
-
 }  // namespace framework
 }  // namespace paddle
...
@@ -54,10 +54,6 @@ class InterpreterCore {
   ~InterpreterCore();
-  interpreter::CostInfo DryRun(
-      const std::vector<std::string>& feed_names,
-      const std::vector<phi::DenseTensor>& feed_tensors);
-
   paddle::framework::FetchList Run(
       const std::vector<std::string>& feed_names,
       const std::vector<phi::DenseTensor>& feed_tensors);
...
@@ -190,13 +186,5 @@ class InterpreterCore {
   InstructionSchedulingPriorityLess instruction_scheduling_priority_less;
 };
-std::shared_ptr<InterpreterCore> CreateInterpreterCore(
-    const platform::Place& place,
-    const ProgramDesc& prog,
-    Scope* scope,
-    const std::vector<std::string>& fetch_names = {},
-    const interpreter::ExecutionConfig& execution_config =
-        interpreter::ExecutionConfig());
-
 }  // namespace framework
 }  // namespace paddle
...
@@ -28,27 +28,17 @@ paddle::framework::FetchList StandaloneExecutor::Run(
     const std::vector<std::string>& fetch_names) {
   platform::RecordEvent record_event(
       "StandaloneExecutor::run", platform::TracerEventType::UserDefined, 1);

-  auto core = GetInterpreterCore(scope, prog_, feed_names, fetch_names, false);
+  auto core = GetInterpreterCore(scope, prog_, feed_names, fetch_names);
   VLOG(4) << "StandaloneExecutor: " << this << ", InterpreterCore: " << core;
   return core->Run(feed_names);
 }
-framework::interpreter::CostInfo StandaloneExecutor::DryRun(
-    Scope* scope,
-    const std::vector<std::string>& feed_names,
-    const std::vector<phi::DenseTensor>& feed_tensors) {
-  auto core = GetInterpreterCore(scope, prog_, feed_names, {}, true);
-
-  return core->DryRun(feed_names, feed_tensors);
-}
-
 std::shared_ptr<InterpreterCore> StandaloneExecutor::GetInterpreterCore(
     Scope* scope,
     const ProgramDesc& prog,
     const std::vector<std::string>& feed_names,
-    const std::vector<std::string>& fetch_names,
-    bool add_fetch_op) {
+    const std::vector<std::string>& fetch_names) {
   std::ostringstream oss;
   oss << "feed:";
   for (auto& feedname : feed_names) {
...
@@ -65,14 +55,8 @@ std::shared_ptr<InterpreterCore> StandaloneExecutor::GetInterpreterCore(
   if (iter == interpretercores_.end()) {
     VLOG(3) << "create interpreter_core for " << oss.str() << " on place "
             << place_;
-    VLOG(3) << "add fetch op: " << add_fetch_op;
-    std::shared_ptr<InterpreterCore> core = nullptr;
-    if (add_fetch_op) {
-      core = CreateInterpreterCore(place_, prog, scope, fetch_names);
-    } else {
-      core = std::make_shared<InterpreterCore>(place_, prog.Block(0), scope);
-    }
+    std::shared_ptr<InterpreterCore> core =
+        std::make_shared<InterpreterCore>(place_, prog.Block(0), scope);
     interpretercores_.emplace(oss.str(), core);
     return core;
   } else {
...
...
@@ -42,18 +42,12 @@ class StandaloneExecutor {
       const std::vector<std::string>& feed_names,
       const std::vector<std::string>& fetch_names);
-  framework::interpreter::CostInfo DryRun(
-      Scope* scope,
-      const std::vector<std::string>& feed_names,
-      const std::vector<phi::DenseTensor>& feed_tensors);
-
  private:
   std::shared_ptr<InterpreterCore> GetInterpreterCore(
       Scope* scope,
       const ProgramDesc& prog,
       const std::vector<std::string>& feed_names,
-      const std::vector<std::string>& fetch_names,
-      bool add_fetch_op);
+      const std::vector<std::string>& fetch_names);

   platform::Place place_;
   const ProgramDesc& prog_;
...
...
@@ -1852,28 +1852,6 @@ All parameter, weight, gradient are variables in Paddle.
               ret = self.Run(scope, feed_names, fetch_names);
             }
             return py::cast(std::move(ret));
-           })
-      .def("dry_run",
-           [](StandaloneExecutor &self,
-              Scope *scope,
-              const std::unordered_map<std::string, py::array> &input_dict) {
-             std::vector<phi::DenseTensor> feed_tensors;
-             std::vector<std::string> feed_names;
-
-             for (auto &item : input_dict) {
-               phi::DenseTensor t;
-               SetTensorFromPyArray<platform::CPUPlace>(
-                   &t, item.second, platform::CPUPlace(), false);
-               feed_names.push_back(item.first);
-               feed_tensors.push_back(t);
-             }
-
-             framework::interpreter::CostInfo cost_info;
-             {
-               pybind11::gil_scoped_release release;
-               cost_info = self.DryRun(scope, feed_names, feed_tensors);
-             }
-             return cost_info;
           });

   m.def("init_gflags", framework::InitGflags);
...
...
@@ -174,7 +174,9 @@ TEST(InterpreterCore, skip_gc_vars) {
   Scope scope;

   std::shared_ptr<InterpreterCore> startup_core =
-      CreateInterpreterCore(place, startup_prog, &scope);
+      std::make_shared<InterpreterCore>(
+          place, startup_prog.Block(0), &scope, interpreter::ExecutionConfig());

   startup_core->Run({}, {});

   std::set<std::string> skip_gc_vars = {"uniform_0.tmp_0",
...
@@ -191,8 +193,9 @@ TEST(InterpreterCore, skip_gc_vars) {
   interpreter::ExecutionConfig execution_config;
   execution_config.skip_gc_vars = skip_gc_vars;

-  std::shared_ptr<InterpreterCore> main_core = CreateInterpreterCore(
-      place, main_prog, &scope, /*fetch_names=*/{}, execution_config);
+  std::shared_ptr<InterpreterCore> main_core =
+      std::make_shared<InterpreterCore>(
+          place, main_prog.Block(0), &scope, execution_config);

   auto check_gc_result =
       [](Scope& scope, std::set<std::string>& vars, bool is_skip_gc) {
...
@@ -225,10 +228,10 @@ void TestShareWorkQueue(const ProgramDesc& prog,
   const platform::CPUPlace place = platform::CPUPlace();
   Scope scope;

-  std::shared_ptr<InterpreterCore> core1 =
-      CreateInterpreterCore(place, prog, &scope, fetch_names);
-  std::shared_ptr<InterpreterCore> core2 =
-      CreateInterpreterCore(place, prog, &scope, fetch_names);
+  std::shared_ptr<InterpreterCore> core1 = std::make_shared<InterpreterCore>(
+      place, prog.Block(0), &scope, interpreter::ExecutionConfig());
+  std::shared_ptr<InterpreterCore> core2 = std::make_shared<InterpreterCore>(
+      place, prog.Block(0), &scope, interpreter::ExecutionConfig());
   core2->ShareWorkQueueFrom(core1);

   auto run_and_check = [&feed_names, &feed_tensors, &fetch_results](
...
...
@@ -17,64 +17,17 @@ import os
 os.environ['FLAGS_use_stream_safe_cuda_allocator'] = "true"
 import json
 import shutil
-import sys
 import unittest

 import numpy as np

 import paddle
 from paddle.fluid import core
-from paddle.fluid.core import StandaloneExecutor
 from paddle.profiler import profiler

 paddle.enable_static()
-
-class TestDryRun(unittest.TestCase):
-    def setUp(self):
-        place = (
-            paddle.CUDAPlace(0)
-            if core.is_compiled_with_cuda()
-            else paddle.CPUPlace()
-        )
-        self.place = core.Place()
-        self.place.set_place(place)
-
-    def build_program(self):
-        startup_program = paddle.static.Program()
-        main_program = paddle.static.Program()
-        with paddle.static.program_guard(main_program, startup_program):
-            a = paddle.static.data(name="a", shape=[2, 2], dtype='float32')
-            b = paddle.ones([2, 2]) * 2
-            t = paddle.static.nn.fc(a, 2)
-            c = t + b
-        return startup_program, main_program, c
-
-    def test_dry_run(self):
-        scope = core.Scope()
-        startup_program, main_program, c = self.build_program()
-        exe = paddle.static.Executor(self.place)
-        exe.run(startup_program, scope=scope)
-
-        standaloneexecutor = StandaloneExecutor(self.place, main_program.desc)
-        # test for cost_info
-        cost_info = standaloneexecutor.dry_run(
-            scope, {"a": np.ones([2, 2], dtype="float32")}
-        )
-        self.check_cost_info(cost_info)
-
-    def check_cost_info(self, cost_info):
-        IS_WINDOWS = sys.platform.startswith('win')
-        if core.is_compiled_with_cuda():
-            # w, bias, b, and out; a memory block is at least 256 bytes on Linux
-            gt = 16 * 4 if IS_WINDOWS else 256 * 4
-            self.assertGreater(cost_info.device_memory_bytes(), gt)
-        else:
-            self.assertEqual(cost_info.device_memory_bytes(), 0)
-
-
 def build_program():
     main_program = paddle.static.Program()
     startup_program = paddle.static.Program()
...