From 5c0b60ae11f95483c1ac569eca7fcc3b2d1db6cf Mon Sep 17 00:00:00 2001 From: Ruibiao Chen Date: Wed, 31 May 2023 22:06:23 +0800 Subject: [PATCH] Remove DryRun in standalone executor (#54222) --- .../framework/new_executor/interpretercore.cc | 51 ------------------- .../framework/new_executor/interpretercore.h | 12 ----- .../new_executor/standalone_executor.cc | 24 ++------- .../new_executor/standalone_executor.h | 8 +-- paddle/fluid/pybind/pybind.cc | 22 -------- .../new_executor/standalone_executor_test.cc | 17 ++++--- .../test_standalone_executor.py | 47 ----------------- 7 files changed, 15 insertions(+), 166 deletions(-) diff --git a/paddle/fluid/framework/new_executor/interpretercore.cc b/paddle/fluid/framework/new_executor/interpretercore.cc index 01bf1d1e2c9..2e47699499f 100644 --- a/paddle/fluid/framework/new_executor/interpretercore.cc +++ b/paddle/fluid/framework/new_executor/interpretercore.cc @@ -158,38 +158,6 @@ InterpreterCore::~InterpreterCore() { #endif } -interpreter::CostInfo InterpreterCore::DryRun( - const std::vector& feed_names, - const std::vector& feed_tensors) { - SetDeviceId(place_); - CheckCUDAGraphBeforeRun(feed_names); - - Prepare(feed_names, feed_tensors, true); - interpreter::CostInfo cost_info; - { - interpreter::ProfilerGuard(place_, &cost_info); - - // For the program that only run once, it is no need to - // create work_queue, so the async_work_queue_ is created - // until the second step run. - async_work_queue_ = GetWorkQueue(); - - // lazy initialization of gc, do not create gc is the program only run once - if (!gc_) { - gc_ = CreateInterpreterCoreGarbageCollector(place_, vec_instruction_); - } - - ExecuteInstructionList(vec_instruction_); - platform::DeviceContextPool::Instance().Get(place_)->Wait(); - } - - if (HasLocalScope()) { - ClearLoDTensorArrayInLocalScope(); - } - - return cost_info; -} - void InterpreterCore::RunImpl() { // lazy initialization of gc, do not create gc is the program only run once if (!gc_) { @@ -1540,24 +1508,5 @@ void InterpreterCore::AnalyseExecuteOrderForTrace() { trace_execute_order_ = trace_order; } -std::shared_ptr CreateInterpreterCore( - const platform::Place& place, - const ProgramDesc& prog, - Scope* scope, - const std::vector& fetch_names, - const interpreter::ExecutionConfig& execution_config) { - std::shared_ptr core = nullptr; - // NOTE(Aurelius84): `AddFetch` will modify BlockDesc, so we should copy - // a new program. - auto new_prog = std::make_shared(prog); - auto* block = new_prog->MutableBlock(0); - interpreter::AddFetch(fetch_names, block); - - core = - std::make_shared(place, *block, scope, execution_config); - core->SetCopyProgram(new_prog); - return core; -} - } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/new_executor/interpretercore.h b/paddle/fluid/framework/new_executor/interpretercore.h index 63e9227b345..cf335f2b0bd 100644 --- a/paddle/fluid/framework/new_executor/interpretercore.h +++ b/paddle/fluid/framework/new_executor/interpretercore.h @@ -54,10 +54,6 @@ class InterpreterCore { ~InterpreterCore(); - interpreter::CostInfo DryRun( - const std::vector& feed_names, - const std::vector& feed_tensors); - paddle::framework::FetchList Run( const std::vector& feed_names, const std::vector& feed_tensors); @@ -190,13 +186,5 @@ class InterpreterCore { InstructionSchedulingPriorityLess instruction_scheduling_priority_less; }; -std::shared_ptr CreateInterpreterCore( - const platform::Place& place, - const ProgramDesc& prog, - Scope* scope, - const std::vector& fetch_names = {}, - const interpreter::ExecutionConfig& execution_config = - interpreter::ExecutionConfig()); - } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/new_executor/standalone_executor.cc b/paddle/fluid/framework/new_executor/standalone_executor.cc index e34ef2bfd07..7cd57447f28 100644 --- a/paddle/fluid/framework/new_executor/standalone_executor.cc +++ b/paddle/fluid/framework/new_executor/standalone_executor.cc @@ -28,27 +28,17 @@ paddle::framework::FetchList StandaloneExecutor::Run( const std::vector& fetch_names) { platform::RecordEvent record_event( "StandaloneExecutor::run", platform::TracerEventType::UserDefined, 1); - auto core = GetInterpreterCore(scope, prog_, feed_names, fetch_names, false); + auto core = GetInterpreterCore(scope, prog_, feed_names, fetch_names); VLOG(4) << "StandaloneExecutor: " << this << ", InterpreterCore: " << core; return core->Run(feed_names); } -framework::interpreter::CostInfo StandaloneExecutor::DryRun( - Scope* scope, - const std::vector& feed_names, - const std::vector& feed_tensors) { - auto core = GetInterpreterCore(scope, prog_, feed_names, {}, true); - - return core->DryRun(feed_names, feed_tensors); -} - std::shared_ptr StandaloneExecutor::GetInterpreterCore( Scope* scope, const ProgramDesc& prog, const std::vector& feed_names, - const std::vector& fetch_names, - bool add_fetch_op) { + const std::vector& fetch_names) { std::ostringstream oss; oss << "feed:"; for (auto& feedname : feed_names) { @@ -65,14 +55,8 @@ std::shared_ptr StandaloneExecutor::GetInterpreterCore( if (iter == interpretercores_.end()) { VLOG(3) << "create interpreter_core for " << oss.str() << " on place " << place_; - VLOG(3) << "add fetch op: " << add_fetch_op; - std::shared_ptr core = nullptr; - - if (add_fetch_op) { - core = CreateInterpreterCore(place_, prog, scope, fetch_names); - } else { - core = std::make_shared(place_, prog.Block(0), scope); - } + std::shared_ptr core = + std::make_shared(place_, prog.Block(0), scope); interpretercores_.emplace(oss.str(), core); return core; } else { diff --git a/paddle/fluid/framework/new_executor/standalone_executor.h b/paddle/fluid/framework/new_executor/standalone_executor.h index 297e7a3ab72..dc89cb07f55 100644 --- a/paddle/fluid/framework/new_executor/standalone_executor.h +++ b/paddle/fluid/framework/new_executor/standalone_executor.h @@ -42,18 +42,12 @@ class StandaloneExecutor { const std::vector& feed_names, const std::vector& fetch_names); - framework::interpreter::CostInfo DryRun( - Scope* scope, - const std::vector& feed_names, - const std::vector& feed_tensors); - private: std::shared_ptr GetInterpreterCore( Scope* scope, const ProgramDesc& prog, const std::vector& feed_names, - const std::vector& fetch_names, - bool add_fetch_op); + const std::vector& fetch_names); platform::Place place_; const ProgramDesc& prog_; diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 7db02616fa8..036f7da3942 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -1852,28 +1852,6 @@ All parameter, weight, gradient are variables in Paddle. ret = self.Run(scope, feed_names, fetch_names); } return py::cast(std::move(ret)); - }) - .def("dry_run", - [](StandaloneExecutor &self, - Scope *scope, - const std::unordered_map &input_dict) { - std::vector feed_tensors; - std::vector feed_names; - - for (auto &item : input_dict) { - phi::DenseTensor t; - SetTensorFromPyArray( - &t, item.second, platform::CPUPlace(), false); - feed_names.push_back(item.first); - feed_tensors.push_back(t); - } - - framework::interpreter::CostInfo cost_info; - { - pybind11::gil_scoped_release release; - cost_info = self.DryRun(scope, feed_names, feed_tensors); - } - return cost_info; }); m.def("init_gflags", framework::InitGflags); diff --git a/test/cpp/new_executor/standalone_executor_test.cc b/test/cpp/new_executor/standalone_executor_test.cc index d56d09cabe2..771f1d0ad59 100644 --- a/test/cpp/new_executor/standalone_executor_test.cc +++ b/test/cpp/new_executor/standalone_executor_test.cc @@ -174,7 +174,9 @@ TEST(InterpreterCore, skip_gc_vars) { Scope scope; std::shared_ptr startup_core = - CreateInterpreterCore(place, startup_prog, &scope); + std::make_shared( + place, startup_prog.Block(0), &scope, interpreter::ExecutionConfig()); + startup_core->Run({}, {}); std::set skip_gc_vars = {"uniform_0.tmp_0", @@ -191,8 +193,9 @@ TEST(InterpreterCore, skip_gc_vars) { interpreter::ExecutionConfig execution_config; execution_config.skip_gc_vars = skip_gc_vars; - std::shared_ptr main_core = CreateInterpreterCore( - place, main_prog, &scope, /*fetch_names=*/{}, execution_config); + std::shared_ptr main_core = + std::make_shared( + place, main_prog.Block(0), &scope, execution_config); auto check_gc_result = [](Scope& scope, std::set& vars, bool is_skip_gc) { @@ -225,10 +228,10 @@ void TestShareWorkQueue(const ProgramDesc& prog, const platform::CPUPlace place = platform::CPUPlace(); Scope scope; - std::shared_ptr core1 = - CreateInterpreterCore(place, prog, &scope, fetch_names); - std::shared_ptr core2 = - CreateInterpreterCore(place, prog, &scope, fetch_names); + std::shared_ptr core1 = std::make_shared( + place, prog.Block(0), &scope, interpreter::ExecutionConfig()); + std::shared_ptr core2 = std::make_shared( + place, prog.Block(0), &scope, interpreter::ExecutionConfig()); core2->ShareWorkQueueFrom(core1); auto run_and_check = [&feed_names, &feed_tensors, &fetch_results]( diff --git a/test/standalone_executor/test_standalone_executor.py b/test/standalone_executor/test_standalone_executor.py index 08b150c84c3..5724ff42667 100644 --- a/test/standalone_executor/test_standalone_executor.py +++ b/test/standalone_executor/test_standalone_executor.py @@ -17,64 +17,17 @@ import os os.environ['FLAGS_use_stream_safe_cuda_allocator'] = "true" import json import shutil -import sys import unittest import numpy as np import paddle from paddle.fluid import core -from paddle.fluid.core import StandaloneExecutor from paddle.profiler import profiler paddle.enable_static() -class TestDryRun(unittest.TestCase): - def setUp(self): - place = ( - paddle.CUDAPlace(0) - if core.is_compiled_with_cuda() - else paddle.CPUPlace() - ) - self.place = core.Place() - self.place.set_place(place) - - def build_program(self): - startup_program = paddle.static.Program() - main_program = paddle.static.Program() - with paddle.static.program_guard(main_program, startup_program): - a = paddle.static.data(name="a", shape=[2, 2], dtype='float32') - b = paddle.ones([2, 2]) * 2 - t = paddle.static.nn.fc(a, 2) - c = t + b - - return startup_program, main_program, c - - def test_dry_run(self): - scope = core.Scope() - startup_program, main_program, c = self.build_program() - exe = paddle.static.Executor(self.place) - exe.run(startup_program, scope=scope) - - standaloneexecutor = StandaloneExecutor(self.place, main_program.desc) - # test for cost_info - cost_info = standaloneexecutor.dry_run( - scope, {"a": np.ones([2, 2], dtype="float32")} - ) - self.check_cost_info(cost_info) - - def check_cost_info(self, cost_info): - IS_WINDOWS = sys.platform.startswith('win') - - if core.is_compiled_with_cuda(): - # # w,bias,b, out, memory block is at least 256 bytes on Linux - gt = 16 * 4 if IS_WINDOWS else 256 * 4 - self.assertGreater(cost_info.device_memory_bytes(), gt) - else: - self.assertEqual(cost_info.device_memory_bytes(), 0) - - def build_program(): main_program = paddle.static.Program() startup_program = paddle.static.Program() -- GitLab