From 572bad8a66ea9650d753911af8383785ed671af9 Mon Sep 17 00:00:00 2001 From: Aurelius84 Date: Tue, 31 Aug 2021 19:06:28 +0800 Subject: [PATCH] Support CostInfo and MemProfiler in InterpreterCore (#34981) * polish code * fix unittest on windows * refine pybind interface * support statistic MemSize of AllocatorPool * Replace mutex into atomic --- .../framework/new_executor/interpretercore.cc | 52 +++++++- .../framework/new_executor/interpretercore.h | 9 +- .../fluid/framework/new_executor/profiler.h | 121 ++++++++++++++++++ .../new_executor/standalone_executor.cc | 9 ++ .../new_executor/standalone_executor.h | 3 + paddle/fluid/platform/gpu_info.cc | 20 +-- paddle/fluid/pybind/pybind.cc | 32 ++++- .../interpreter/test_standalone_executor.py | 18 +++ 8 files changed, 247 insertions(+), 17 deletions(-) create mode 100644 paddle/fluid/framework/new_executor/profiler.h diff --git a/paddle/fluid/framework/new_executor/interpretercore.cc b/paddle/fluid/framework/new_executor/interpretercore.cc index c86a16cef0..d31627efed 100644 --- a/paddle/fluid/framework/new_executor/interpretercore.cc +++ b/paddle/fluid/framework/new_executor/interpretercore.cc @@ -349,6 +349,9 @@ void InterpreterCore::BuildInstructionCtx(Instruction* instr_node, } void InterpreterCore::RunInstruction(const Instruction& instr_node) { + VLOG(3) << "RunInstruction: " + << instr_node.kernel_func_.operator_base_->Type(); + static_cast( instr_node.kernel_func_.operator_base_) ->InferShape(instr_node.infershape_ctx_.get()); @@ -364,7 +367,7 @@ void InterpreterCore::RunInstruction(const Instruction& instr_node) { void InterpreterCore::ExecuteInstructionList( const std::vector& vec_instr, const VariableScope& var_scope, - const platform::Place& place) { + const platform::Place& place, bool is_dry_run) { std::queue working_queue; auto working_dependecy_count = dependecy_count_; for (size_t i = 0; i < dependecy_count_.size(); ++i) { @@ -385,6 +388,11 @@ void InterpreterCore::ExecuteInstructionList( // step2: run instruction RunInstruction(instr_node); ++run_op_number; + + if (is_dry_run) { + profiler_.ParseMemoryInfo(var_scope.var_list); + } + // step3: insert event for out_vars if needed RecordEventInstruction(instr_node, vec_func_list_[instr_id]); @@ -827,6 +835,48 @@ void InterpreterCore::BuildOpFuncList(const platform::Place& place, VLOG(3) << "run " << op_base->Type() << " done."; } } +void InterpreterCore::Prepare( + const std::vector& feed_tensors) { + auto FeedInput = [&] { + for (size_t i = 0; i < feed_names_.size(); ++i) { + auto it = global_scope_->name2id.find(feed_names_[i]); + assert(it != global_scope_->name2id.end()); + + auto feed_tensor = global_scope_->var_list[it->second] + ->GetMutable(); + feed_tensor->ShareDataWith(feed_tensors[i]); + } + }; + + if (is_build_ == false) { + BuildVariableScope(main_program_, global_scope_); + FeedInput(); + BuildOpFuncList(place_, main_program_, &op_list_, &vec_func_list_, + global_scope_); + is_build_ = true; + // convert vec func_list to graph + Convert(); + } + // NOTE: Because feed_tensor will be GC after BuildOpFuncList, so we should + // call + // FeedInput again. + FeedInput(); +} + +const CostInfo& InterpreterCore::DryRun( + const std::vector& feed_tensors) { + Prepare(feed_tensors); + // DryRun may be called many times. 
+ profiler_.Reset(); + profiler_.Start(); + ExecuteInstructionList(vec_instruction_, *global_scope_, place_, + /*is_dry_run=*/true); + platform::DeviceContextPool::Instance().Get(place_)->Wait(); + + profiler_.Pause(); + profiler_.TotalCUDAAllocatedMemorySize(place_); + return profiler_.GetCostInfo(); +} platform::DeviceContext* InterpreterCore::ParseDeviceContextForInstruction( const OpFuncNode& op_func_node, const OperatorBase& op_base) { diff --git a/paddle/fluid/framework/new_executor/interpretercore.h b/paddle/fluid/framework/new_executor/interpretercore.h index 652e4fff7c..ebb81dc9a0 100644 --- a/paddle/fluid/framework/new_executor/interpretercore.h +++ b/paddle/fluid/framework/new_executor/interpretercore.h @@ -21,6 +21,7 @@ #include "paddle/fluid/framework/new_executor/interpretercore_util.h" #include "paddle/fluid/framework/new_executor/new_executor_defs.h" +#include "paddle/fluid/framework/new_executor/profiler.h" #include "paddle/fluid/framework/new_executor/workqueue.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/tensor.h" @@ -41,6 +42,8 @@ class InterpreterCore { paddle::framework::FetchList Run( const std::vector& feed_tensors); + const CostInfo& DryRun(const std::vector& feed_tensors); + static void BuildOpFuncList(const platform::Place& place, const framework::ProgramDesc& pdesc, std::vector* op_list, @@ -58,7 +61,8 @@ class InterpreterCore { void ExecuteInstructionList(const std::vector& vec_instr, const VariableScope& var_scope, - const platform::Place& place); + const platform::Place& place, + bool is_dry_run = false); std::vector MergeVector(const std::vector& first, const std::vector& second); @@ -66,6 +70,8 @@ class InterpreterCore { void BuildVariableScope(const framework::ProgramDesc& pdesc, VariableScope* var_scope); + void Prepare(const std::vector& feed_tensors); + void CheckGC(size_t instr_id, const std::vector& gc_check_list, const VariableScope& var_scope, const platform::Place& place, std::vector& working_var_ref); // NOLINT @@ -100,6 +106,7 @@ class InterpreterCore { bool is_build_; std::vector feed_names_; + InterpreterProfiler profiler_; std::map> var_id2event_; std::vector gc_event_; diff --git a/paddle/fluid/framework/new_executor/profiler.h b/paddle/fluid/framework/new_executor/profiler.h new file mode 100644 index 0000000000..5d8ec05b7f --- /dev/null +++ b/paddle/fluid/framework/new_executor/profiler.h @@ -0,0 +1,121 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/framework/variable.h" +#include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/timer.h" + +namespace paddle { +namespace framework { + +static void GetTensors(Variable* var, std::unordered_set* tensor_set) { + if (var->IsType() && var->Get().IsInitialized()) { + tensor_set->insert(var->GetMutable()); + } else if (var->IsType() && + var->Get().value().IsInitialized()) { + tensor_set->insert(var->GetMutable()->mutable_value()); + } else if (var->IsType()) { + auto* tensor_arr = var->GetMutable(); + for (auto& t : *tensor_arr) { + if (t.IsInitialized()) { + tensor_set->insert(&t); + } + } + } +} + +static std::pair GetTensorMemorySize( + const std::vector& var_list) { + std::unordered_set tensor_set; + for (auto* var : var_list) { + GetTensors(var, &tensor_set); + } + size_t host_memory_bytes = 0; + size_t device_memory_bytes = 0; + std::unordered_set allocation_set; + for (auto* tensor : tensor_set) { + auto allocation = tensor->Holder().get(); + if (!allocation_set.count(allocation)) { + allocation_set.insert(allocation); + if (platform::is_cuda_pinned_place(tensor->place()) || + platform::is_cpu_place(tensor->place())) { + VLOG(3) << "found host memory : " << allocation->size(); + host_memory_bytes += allocation->size(); + } else { + VLOG(3) << "found device memory : " << allocation->size(); + device_memory_bytes += allocation->size(); + } + } + } + return {host_memory_bytes, device_memory_bytes}; +} + +struct CostInfo { + double total_time{0.}; // ms + size_t host_memory_bytes{0}; // bytes + size_t device_memory_bytes{0}; // bytes + size_t device_total_memory_bytes{0}; // total allocated memory size +}; + +class InterpreterProfiler { + public: + void Start() { timer_.Start(); } + + void Pause() { + timer_.Pause(); + cost_info_.total_time += timer_.ElapsedMS(); + } + + void Reset() { + timer_.Reset(); + cost_info_.total_time = 0.; + cost_info_.host_memory_bytes = 0; + cost_info_.device_memory_bytes = 0; + cost_info_.device_total_memory_bytes = 0; + } + + void ParseMemoryInfo(const std::vector& vars) { + timer_.Start(); + auto memory_info = GetTensorMemorySize(vars); + VLOG(3) << "host memory size: " << memory_info.first; + cost_info_.host_memory_bytes = + std::max(cost_info_.host_memory_bytes, memory_info.first); + + VLOG(3) << "device memory size: " << memory_info.second; + cost_info_.device_memory_bytes = + std::max(cost_info_.device_memory_bytes, memory_info.second); + timer_.Pause(); + cost_info_.total_time -= timer_.ElapsedMS(); + } + + void TotalCUDAAllocatedMemorySize(const platform::Place& place) { + if (platform::is_gpu_place(place)) { +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + auto cuda_place = BOOST_GET_CONST(platform::CUDAPlace, place); + cost_info_.device_total_memory_bytes = + platform::RecordedCudaMallocSize(cuda_place.device); +#endif + } + } + + const CostInfo& GetCostInfo() const { return cost_info_; } + + private: + platform::Timer timer_; + CostInfo cost_info_; +}; +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/new_executor/standalone_executor.cc b/paddle/fluid/framework/new_executor/standalone_executor.cc index 665a99d7d7..2e76a6720a 100644 --- a/paddle/fluid/framework/new_executor/standalone_executor.cc +++ b/paddle/fluid/framework/new_executor/standalone_executor.cc @@ -61,6 +61,15 @@ paddle::framework::FetchList StandaloneExecutor::Run( return core->Run(feed_tensors); } +const CostInfo& 
StandaloneExecutor::DryRun( + const std::vector& feed_names, + const std::vector& feed_tensors) { + auto core = GetInterpreterCore(feed_names, {}); + + auto& cost_info = core->DryRun(feed_tensors); + return cost_info; +} + void StandaloneExecutor::BuildVariableOuterScope( const framework::ProgramDesc& pdesc, VariableScope* var_scope, Scope* outer_scope) { diff --git a/paddle/fluid/framework/new_executor/standalone_executor.h b/paddle/fluid/framework/new_executor/standalone_executor.h index c8943e2937..600c90e3a1 100644 --- a/paddle/fluid/framework/new_executor/standalone_executor.h +++ b/paddle/fluid/framework/new_executor/standalone_executor.h @@ -45,6 +45,9 @@ class StandaloneExecutor : public ExecutorBase { const std::vector& feed_tensors, const std::vector& fetch_names); + const CostInfo& DryRun(const std::vector& feed_names, + const std::vector& feed_tensors); + private: void BuildVariableOuterScope(const framework::ProgramDesc& pdesc, VariableScope* var_scope, Scope* outer_scope); diff --git a/paddle/fluid/platform/gpu_info.cc b/paddle/fluid/platform/gpu_info.cc index 2e66e3e36d..fda233b3a0 100644 --- a/paddle/fluid/platform/gpu_info.cc +++ b/paddle/fluid/platform/gpu_info.cc @@ -499,7 +499,7 @@ class RecordedCudaMallocHelper { */ gpuError_t Malloc(void **ptr, size_t size) { LockGuardPtr lock(mtx_); - if (UNLIKELY(NeedRecord() && cur_size_ + size > limit_size_)) { + if (UNLIKELY(NeedRecord() && cur_size_.load() + size > limit_size_)) { #ifdef PADDLE_WITH_HIP return hipErrorOutOfMemory; #else @@ -514,9 +514,7 @@ class RecordedCudaMallocHelper { auto result = cudaMalloc(ptr, size); #endif if (result == gpuSuccess) { - if (NeedRecord()) { - cur_size_ += size; - } + cur_size_.fetch_add(size); STAT_INT_ADD("STAT_gpu" + std::to_string(dev_id_) + "_mem_size", size); return gpuSuccess; } else { @@ -551,10 +549,7 @@ class RecordedCudaMallocHelper { if (err != cudaErrorCudartUnloading) { #endif PADDLE_ENFORCE_CUDA_SUCCESS(err); - if (NeedRecord()) { - std::lock_guard guard(*mtx_); - cur_size_ -= size; - } + cur_size_.fetch_sub(size); STAT_INT_SUB("STAT_gpu" + std::to_string(dev_id_) + "_mem_size", size); } else { #ifdef PADDLE_WITH_HIP @@ -582,7 +577,7 @@ class RecordedCudaMallocHelper { if (NeedRecord()) { std::lock_guard guard(*mtx_); - *avail = std::min(*actual_avail, limit_size_ - cur_size_); + *avail = std::min(*actual_avail, limit_size_ - cur_size_.load()); *total = std::min(*actual_total, limit_size_); return *total < *actual_total; } else { @@ -594,17 +589,14 @@ class RecordedCudaMallocHelper { inline bool NeedRecord() const { return limit_size_ != 0; } - uint64_t RecordedSize() const { - LockGuardPtr lock(mtx_); - return NeedRecord() ? cur_size_ : 0; - } + uint64_t RecordedSize() const { return cur_size_.load(); } uint64_t LimitSize() const { return limit_size_; } private: const int dev_id_; const uint64_t limit_size_; - uint64_t cur_size_{0}; + std::atomic cur_size_{0}; mutable std::unique_ptr mtx_; diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 677da35b41..ee4edbbaa0 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -1951,6 +1951,16 @@ All parameter, weight, gradient are variables in Paddle. 
fetch_vars); }); + py::class_(m, "CostInfo") + .def(py::init<>()) + .def("total_time", [](CostInfo &self) { return self.total_time; }) + .def("host_memory_bytes", + [](CostInfo &self) { return self.host_memory_bytes; }) + .def("device_memory_bytes", + [](CostInfo &self) { return self.device_memory_bytes; }) + .def("device_total_memory_bytes", + [](CostInfo &self) { return self.device_total_memory_bytes; }); + py::class_(m, "StandaloneExecutor") .def(py::init()) @@ -1974,8 +1984,28 @@ All parameter, weight, gradient are variables in Paddle. pybind11::gil_scoped_release release; ret = self.Run(feed_names, feed_tensors, fetch_names); } - return py::cast(std::move(ret)); + }) + .def("dry_run", + [](StandaloneExecutor &self, + const std::unordered_map &input_dict) { + std::vector feed_tensors; + std::vector feed_names; + + for (auto &item : input_dict) { + framework::LoDTensor t; + SetTensorFromPyArray( + &t, item.second, platform::CPUPlace(), false); + feed_names.push_back(item.first); + feed_tensors.push_back(t); + } + + CostInfo cost_info; + { + pybind11::gil_scoped_release release; + cost_info = self.DryRun(feed_names, feed_tensors); + } + return cost_info; }); m.def("init_gflags", framework::InitGflags); diff --git a/python/paddle/fluid/tests/unittests/interpreter/test_standalone_executor.py b/python/paddle/fluid/tests/unittests/interpreter/test_standalone_executor.py index 2286765680..b60c3f77e0 100644 --- a/python/paddle/fluid/tests/unittests/interpreter/test_standalone_executor.py +++ b/python/paddle/fluid/tests/unittests/interpreter/test_standalone_executor.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import sys import unittest import paddle from paddle.fluid import core @@ -56,6 +57,23 @@ class LinearTestCase(unittest.TestCase): [2, 2], dtype="float32") * i }, [a.name, c.name]) + # test for cost_info + cost_info = standaloneexecutor.dry_run({ + "a": np.ones( + [2, 2], dtype="float32") * i + }) + self.check_cost_info(cost_info) + + def check_cost_info(self, cost_info): + if core.is_compiled_with_cuda(): + self.assertEqual(cost_info.host_memory_bytes(), 16) + self.assertGreater(cost_info.device_memory_bytes(), 0) + self.assertGreaterEqual(cost_info.device_total_memory_bytes(), + cost_info.device_memory_bytes()) + else: + self.assertGreater(cost_info.host_memory_bytes(), 0) + self.assertEqual(cost_info.device_memory_bytes(), 0) + class MultiStreamModelTestCase(unittest.TestCase): def setUp(self): -- GitLab
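
Usage note (not part of the diff above): a minimal Python sketch of how the new dry_run / CostInfo interface added by this patch is meant to be called, modeled on the unit test in test_standalone_executor.py. The static-graph setup and the StandaloneExecutor constructor arguments (place, startup/main program descs, scope) are not shown in this diff and are assumptions that may differ from the actual test fixture.

# Illustrative sketch only; mirrors the dry_run call added in the unit test above.
import numpy as np
import paddle
from paddle.fluid import core
from paddle.fluid.core import StandaloneExecutor

paddle.enable_static()

# Build a small static program: one fc layer over a [2, 2] input.
a = paddle.static.data(name="a", shape=[2, 2], dtype="float32")
c = paddle.static.nn.fc(a, 2)

main_program = paddle.static.default_main_program()
startup_program = paddle.static.default_startup_program()

place = core.Place()
place.set_place(paddle.CPUPlace())

# Assumed constructor signature (place, startup desc, main desc, scope);
# it is not visible in this diff and may differ.
executor = StandaloneExecutor(place, startup_program.desc, main_program.desc,
                              core.Scope())

# dry_run executes the program once and returns CostInfo instead of fetch results.
cost_info = executor.dry_run({"a": np.ones([2, 2], dtype="float32")})

# CostInfo accessors bound in pybind.cc: time in ms, memory sizes in bytes.
print(cost_info.total_time())
print(cost_info.host_memory_bytes())
print(cost_info.device_memory_bytes())
print(cost_info.device_total_memory_bytes())

Per the InterpreterCore::DryRun change above, the device context is waited on before the timer is paused, so total_time covers device-side execution, and ParseMemoryInfo subtracts its own elapsed time from the measurement so memory accounting does not inflate the reported cost.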