From 5c0b60ae11f95483c1ac569eca7fcc3b2d1db6cf Mon Sep 17 00:00:00 2001
From: Ruibiao Chen <chenruibiao@baidu.com>
Date: Wed, 31 May 2023 22:06:23 +0800
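Subject: [PATCH] Remove DryRun in standalone executor (#54222)

Remove the DryRun entry points from InterpreterCore and
StandaloneExecutor, together with the `dry_run` pybind binding and the
Python TestDryRun case that exercised it.

The CreateInterpreterCore factory is removed as well. It copied the
program and called interpreter::AddFetch before building the core;
StandaloneExecutor::GetInterpreterCore (which loses its add_fetch_op
parameter) and the C++ tests now construct InterpreterCore directly
from Block(0) of the program instead.
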
---
 .../framework/new_executor/interpretercore.cc | 51 -------------------
 .../framework/new_executor/interpretercore.h  | 12 -----
 .../new_executor/standalone_executor.cc       | 24 ++-------
 .../new_executor/standalone_executor.h        |  8 +--
 paddle/fluid/pybind/pybind.cc                 | 22 --------
 .../new_executor/standalone_executor_test.cc  | 17 ++++---
 .../test_standalone_executor.py               | 47 -----------------
 7 files changed, 15 insertions(+), 166 deletions(-)
diff --git a/paddle/fluid/framework/new_executor/interpretercore.cc b/paddle/fluid/framework/new_executor/interpretercore.cc
index 01bf1d1e2c9..2e47699499f 100644
--- a/paddle/fluid/framework/new_executor/interpretercore.cc
+++ b/paddle/fluid/framework/new_executor/interpretercore.cc
@@ -158,38 +158,6 @@ InterpreterCore::~InterpreterCore() {
 #endif
 }
 
-interpreter::CostInfo InterpreterCore::DryRun(
-    const std::vector<std::string>& feed_names,
-    const std::vector<phi::DenseTensor>& feed_tensors) {
-  SetDeviceId(place_);
-  CheckCUDAGraphBeforeRun(feed_names);
-
-  Prepare(feed_names, feed_tensors, true);
-  interpreter::CostInfo cost_info;
-  {
-    interpreter::ProfilerGuard(place_, &cost_info);
-
-    // For the program that only run once, it is no need to
-    // create work_queue, so the async_work_queue_ is created
-    // until the second step run.
-    async_work_queue_ = GetWorkQueue();
-
-    // lazy initialization of gc, do not create gc is the program only run once
-    if (!gc_) {
-      gc_ = CreateInterpreterCoreGarbageCollector(place_, vec_instruction_);
-    }
-
-    ExecuteInstructionList(vec_instruction_);
-    platform::DeviceContextPool::Instance().Get(place_)->Wait();
-  }
-
-  if (HasLocalScope()) {
-    ClearLoDTensorArrayInLocalScope();
-  }
-
-  return cost_info;
-}
-
 void InterpreterCore::RunImpl() {
   // lazy initialization of gc, do not create gc is the program only run once
   if (!gc_) {
@@ -1540,24 +1508,5 @@ void InterpreterCore::AnalyseExecuteOrderForTrace() {
   trace_execute_order_ = trace_order;
 }
 
-std::shared_ptr<InterpreterCore> CreateInterpreterCore(
-    const platform::Place& place,
-    const ProgramDesc& prog,
-    Scope* scope,
-    const std::vector<std::string>& fetch_names,
-    const interpreter::ExecutionConfig& execution_config) {
-  std::shared_ptr<InterpreterCore> core = nullptr;
-  // NOTE(Aurelius84): `AddFetch` will modify BlockDesc, so we should copy
-  // a new program.
-  auto new_prog = std::make_shared<framework::ProgramDesc>(prog);
-  auto* block = new_prog->MutableBlock(0);
-  interpreter::AddFetch(fetch_names, block);
-
-  core =
-      std::make_shared<InterpreterCore>(place, *block, scope, execution_config);
-  core->SetCopyProgram(new_prog);
-  return core;
-}
-
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/new_executor/interpretercore.h b/paddle/fluid/framework/new_executor/interpretercore.h
index 63e9227b345..cf335f2b0bd 100644
--- a/paddle/fluid/framework/new_executor/interpretercore.h
+++ b/paddle/fluid/framework/new_executor/interpretercore.h
@@ -54,10 +54,6 @@ class InterpreterCore {
 
   ~InterpreterCore();
 
-  interpreter::CostInfo DryRun(
-      const std::vector<std::string>& feed_names,
-      const std::vector<phi::DenseTensor>& feed_tensors);
-
   paddle::framework::FetchList Run(
       const std::vector<std::string>& feed_names,
       const std::vector<phi::DenseTensor>& feed_tensors);
@@ -190,13 +186,5 @@ class InterpreterCore {
   InstructionSchedulingPriorityLess instruction_scheduling_priority_less;
 };
 
-std::shared_ptr<InterpreterCore> CreateInterpreterCore(
-    const platform::Place& place,
-    const ProgramDesc& prog,
-    Scope* scope,
-    const std::vector<std::string>& fetch_names = {},
-    const interpreter::ExecutionConfig& execution_config =
-        interpreter::ExecutionConfig());
-
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/new_executor/standalone_executor.cc b/paddle/fluid/framework/new_executor/standalone_executor.cc
index e34ef2bfd07..7cd57447f28 100644
--- a/paddle/fluid/framework/new_executor/standalone_executor.cc
+++ b/paddle/fluid/framework/new_executor/standalone_executor.cc
@@ -28,27 +28,17 @@ paddle::framework::FetchList StandaloneExecutor::Run(
     const std::vector<std::string>& fetch_names) {
   platform::RecordEvent record_event(
       "StandaloneExecutor::run", platform::TracerEventType::UserDefined, 1);
-  auto core = GetInterpreterCore(scope, prog_, feed_names, fetch_names, false);
+  auto core = GetInterpreterCore(scope, prog_, feed_names, fetch_names);
 
   VLOG(4) << "StandaloneExecutor: " << this << ", InterpreterCore: " << core;
   return core->Run(feed_names);
 }
 
-framework::interpreter::CostInfo StandaloneExecutor::DryRun(
-    Scope* scope,
-    const std::vector<std::string>& feed_names,
-    const std::vector<phi::DenseTensor>& feed_tensors) {
-  auto core = GetInterpreterCore(scope, prog_, feed_names, {}, true);
-
-  return core->DryRun(feed_names, feed_tensors);
-}
-
 std::shared_ptr<InterpreterCore> StandaloneExecutor::GetInterpreterCore(
     Scope* scope,
     const ProgramDesc& prog,
     const std::vector<std::string>& feed_names,
-    const std::vector<std::string>& fetch_names,
-    bool add_fetch_op) {
+    const std::vector<std::string>& fetch_names) {
   std::ostringstream oss;
   oss << "feed:";
   for (auto& feedname : feed_names) {
@@ -65,14 +55,8 @@ std::shared_ptr<InterpreterCore> StandaloneExecutor::GetInterpreterCore(
   if (iter == interpretercores_.end()) {
     VLOG(3) << "create interpreter_core for " << oss.str() << " on place "
             << place_;
-    VLOG(3) << "add fetch op: " << add_fetch_op;
-    std::shared_ptr<InterpreterCore> core = nullptr;
-
-    if (add_fetch_op) {
-      core = CreateInterpreterCore(place_, prog, scope, fetch_names);
-    } else {
-      core = std::make_shared<InterpreterCore>(place_, prog.Block(0), scope);
-    }
+    std::shared_ptr<InterpreterCore> core =
+        std::make_shared<InterpreterCore>(place_, prog.Block(0), scope);
     interpretercores_.emplace(oss.str(), core);
     return core;
   } else {
diff --git a/paddle/fluid/framework/new_executor/standalone_executor.h b/paddle/fluid/framework/new_executor/standalone_executor.h
index 297e7a3ab72..dc89cb07f55 100644
--- a/paddle/fluid/framework/new_executor/standalone_executor.h
+++ b/paddle/fluid/framework/new_executor/standalone_executor.h
@@ -42,18 +42,12 @@ class StandaloneExecutor {
                                    const std::vector<std::string>& feed_names,
                                    const std::vector<std::string>& fetch_names);
 
-  framework::interpreter::CostInfo DryRun(
-      Scope* scope,
-      const std::vector<std::string>& feed_names,
-      const std::vector<phi::DenseTensor>& feed_tensors);
-
  private:
   std::shared_ptr<InterpreterCore> GetInterpreterCore(
       Scope* scope,
       const ProgramDesc& prog,
       const std::vector<std::string>& feed_names,
-      const std::vector<std::string>& fetch_names,
-      bool add_fetch_op);
+      const std::vector<std::string>& fetch_names);
 
   platform::Place place_;
   const ProgramDesc& prog_;
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index 7db02616fa8..036f7da3942 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -1852,28 +1852,6 @@ All parameter, weight, gradient are variables in Paddle.
                ret = self.Run(scope, feed_names, fetch_names);
              }
              return py::cast(std::move(ret));
-           })
-      .def("dry_run",
-           [](StandaloneExecutor &self,
-              Scope *scope,
-              const std::unordered_map<std::string, py::array> &input_dict) {
-             std::vector<phi::DenseTensor> feed_tensors;
-             std::vector<std::string> feed_names;
-
-             for (auto &item : input_dict) {
-               phi::DenseTensor t;
-               SetTensorFromPyArray<platform::CPUPlace>(
-                   &t, item.second, platform::CPUPlace(), false);
-               feed_names.push_back(item.first);
-               feed_tensors.push_back(t);
-             }
-
-             framework::interpreter::CostInfo cost_info;
-             {
-               pybind11::gil_scoped_release release;
-               cost_info = self.DryRun(scope, feed_names, feed_tensors);
-             }
-             return cost_info;
            });
 
   m.def("init_gflags", framework::InitGflags);
diff --git a/test/cpp/new_executor/standalone_executor_test.cc b/test/cpp/new_executor/standalone_executor_test.cc
index d56d09cabe2..771f1d0ad59 100644
--- a/test/cpp/new_executor/standalone_executor_test.cc
+++ b/test/cpp/new_executor/standalone_executor_test.cc
@@ -174,7 +174,9 @@ TEST(InterpreterCore, skip_gc_vars) {
   Scope scope;
 
   std::shared_ptr<InterpreterCore> startup_core =
-      CreateInterpreterCore(place, startup_prog, &scope);
+      std::make_shared<InterpreterCore>(
+          place, startup_prog.Block(0), &scope, interpreter::ExecutionConfig());
+
   startup_core->Run({}, {});
 
   std::set<std::string> skip_gc_vars = {"uniform_0.tmp_0",
@@ -191,8 +193,9 @@ TEST(InterpreterCore, skip_gc_vars) {
   interpreter::ExecutionConfig execution_config;
   execution_config.skip_gc_vars = skip_gc_vars;
 
-  std::shared_ptr<InterpreterCore> main_core = CreateInterpreterCore(
-      place, main_prog, &scope, /*fetch_names=*/{}, execution_config);
+  std::shared_ptr<InterpreterCore> main_core =
+      std::make_shared<InterpreterCore>(
+          place, main_prog.Block(0), &scope, execution_config);
 
   auto check_gc_result =
       [](Scope& scope, std::set<std::string>& vars, bool is_skip_gc) {
@@ -225,10 +228,10 @@ void TestShareWorkQueue(const ProgramDesc& prog,
   const platform::CPUPlace place = platform::CPUPlace();
 
   Scope scope;
-  std::shared_ptr<InterpreterCore> core1 =
-      CreateInterpreterCore(place, prog, &scope, fetch_names);
-  std::shared_ptr<InterpreterCore> core2 =
-      CreateInterpreterCore(place, prog, &scope, fetch_names);
+  std::shared_ptr<InterpreterCore> core1 = std::make_shared<InterpreterCore>(
+      place, prog.Block(0), &scope, interpreter::ExecutionConfig());
+  std::shared_ptr<InterpreterCore> core2 = std::make_shared<InterpreterCore>(
+      place, prog.Block(0), &scope, interpreter::ExecutionConfig());
   core2->ShareWorkQueueFrom(core1);
 
   auto run_and_check = [&feed_names, &feed_tensors, &fetch_results](
diff --git a/test/standalone_executor/test_standalone_executor.py b/test/standalone_executor/test_standalone_executor.py
index 08b150c84c3..5724ff42667 100644
--- a/test/standalone_executor/test_standalone_executor.py
+++ b/test/standalone_executor/test_standalone_executor.py
@@ -17,64 +17,17 @@ import os
 os.environ['FLAGS_use_stream_safe_cuda_allocator'] = "true"
 import json
 import shutil
-import sys
 import unittest
 
 import numpy as np
 
 import paddle
 from paddle.fluid import core
-from paddle.fluid.core import StandaloneExecutor
 from paddle.profiler import profiler
 
 paddle.enable_static()
 
 
-class TestDryRun(unittest.TestCase):
-    def setUp(self):
-        place = (
-            paddle.CUDAPlace(0)
-            if core.is_compiled_with_cuda()
-            else paddle.CPUPlace()
-        )
-        self.place = core.Place()
-        self.place.set_place(place)
-
-    def build_program(self):
-        startup_program = paddle.static.Program()
-        main_program = paddle.static.Program()
-        with paddle.static.program_guard(main_program, startup_program):
-            a = paddle.static.data(name="a", shape=[2, 2], dtype='float32')
-            b = paddle.ones([2, 2]) * 2
-            t = paddle.static.nn.fc(a, 2)
-            c = t + b
-
-        return startup_program, main_program, c
-
-    def test_dry_run(self):
-        scope = core.Scope()
-        startup_program, main_program, c = self.build_program()
-        exe = paddle.static.Executor(self.place)
-        exe.run(startup_program, scope=scope)
-
-        standaloneexecutor = StandaloneExecutor(self.place, main_program.desc)
-        # test for cost_info
-        cost_info = standaloneexecutor.dry_run(
-            scope, {"a": np.ones([2, 2], dtype="float32")}
-        )
-        self.check_cost_info(cost_info)
-
-    def check_cost_info(self, cost_info):
-        IS_WINDOWS = sys.platform.startswith('win')
-
-        if core.is_compiled_with_cuda():
-            # # w,bias,b, out, memory block is at least 256 bytes on Linux
-            gt = 16 * 4 if IS_WINDOWS else 256 * 4
-            self.assertGreater(cost_info.device_memory_bytes(), gt)
-        else:
-            self.assertEqual(cost_info.device_memory_bytes(), 0)
-
-
 def build_program():
     main_program = paddle.static.Program()
     startup_program = paddle.static.Program()
-- 
GitLab