Unverified commit 572bad8a, authored by Aurelius84, committed by GitHub

Support CostInfo and MemProfiler in InterpreterCore (#34981)

* Polish code

* Fix unittest on Windows

* Refine pybind interface

* Support collecting MemSize statistics of the AllocatorPool

* Replace mutex with atomic
Parent e2991555
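
For orientation before the diff: the user-facing piece of this change is a new `dry_run` method on `StandaloneExecutor` that executes the program once under the interpreter profiler and returns a `CostInfo` object (see the pybind and unit-test hunks below). A minimal usage sketch follows; the program, place, and fc-layer setup are illustrative, modeled on the existing LinearTestCase rather than introduced by this commit, while the constructor arguments, `dry_run`, and the CostInfo accessors come from the bindings in this diff.

import numpy as np
import paddle
from paddle.fluid import core

paddle.enable_static()

# Illustrative program: one feed variable "a" followed by an fc layer,
# mirroring the LinearTestCase exercised in this commit's unit test.
startup_prog, main_prog = paddle.static.Program(), paddle.static.Program()
with paddle.static.program_guard(main_prog, startup_prog):
    a = paddle.static.data(name="a", shape=[2, 2], dtype="float32")
    c = paddle.static.nn.fc(a, 2)

place = core.Place()
place.set_place(paddle.CUDAPlace(0) if core.is_compiled_with_cuda()
                else paddle.CPUPlace())

# Constructor arguments follow the pybind binding below:
# (Place, startup ProgramDesc, main ProgramDesc, Scope*).
executor = core.StandaloneExecutor(place, startup_prog.desc,
                                   main_prog.desc, core.Scope())

# dry_run feeds numpy arrays, runs the program once while profiling,
# and returns a CostInfo.
cost_info = executor.dry_run({"a": np.ones([2, 2], dtype="float32")})
print(cost_info.total_time())                 # elapsed time of the dry run, in ms
print(cost_info.host_memory_bytes())          # peak host (CPU/pinned) tensor memory, in bytes
print(cost_info.device_memory_bytes())        # peak device tensor memory, in bytes
print(cost_info.device_total_memory_bytes())  # bytes currently allocated via cudaMalloc
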
@@ -349,6 +349,9 @@ void InterpreterCore::BuildInstructionCtx(Instruction* instr_node,
}
void InterpreterCore::RunInstruction(const Instruction& instr_node) {
VLOG(3) << "RunInstruction: "
<< instr_node.kernel_func_.operator_base_->Type();
static_cast<const framework::OperatorWithKernel*>(
instr_node.kernel_func_.operator_base_)
->InferShape(instr_node.infershape_ctx_.get());
@@ -364,7 +367,7 @@ void InterpreterCore::RunInstruction(const Instruction& instr_node) {
void InterpreterCore::ExecuteInstructionList(
const std::vector<Instruction>& vec_instr, const VariableScope& var_scope,
const platform::Place& place) {
const platform::Place& place, bool is_dry_run) {
std::queue<size_t> working_queue;
auto working_dependecy_count = dependecy_count_;
for (size_t i = 0; i < dependecy_count_.size(); ++i) {
@@ -385,6 +388,11 @@ void InterpreterCore::ExecuteInstructionList(
// step2: run instruction
RunInstruction(instr_node);
++run_op_number;
if (is_dry_run) {
profiler_.ParseMemoryInfo(var_scope.var_list);
}
// step3: insert event for out_vars if needed
RecordEventInstruction(instr_node, vec_func_list_[instr_id]);
@@ -827,6 +835,48 @@ void InterpreterCore::BuildOpFuncList(const platform::Place& place,
VLOG(3) << "run " << op_base->Type() << " done.";
}
}
void InterpreterCore::Prepare(
const std::vector<framework::Tensor>& feed_tensors) {
auto FeedInput = [&] {
for (size_t i = 0; i < feed_names_.size(); ++i) {
auto it = global_scope_->name2id.find(feed_names_[i]);
assert(it != global_scope_->name2id.end());
auto feed_tensor = global_scope_->var_list[it->second]
->GetMutable<framework::LoDTensor>();
feed_tensor->ShareDataWith(feed_tensors[i]);
}
};
if (is_build_ == false) {
BuildVariableScope(main_program_, global_scope_);
FeedInput();
BuildOpFuncList(place_, main_program_, &op_list_, &vec_func_list_,
global_scope_);
is_build_ = true;
// convert vec func_list to graph
Convert();
}
// NOTE: Because feed_tensor will be GC'ed after BuildOpFuncList, we have to
// call FeedInput again.
FeedInput();
}
const CostInfo& InterpreterCore::DryRun(
const std::vector<framework::Tensor>& feed_tensors) {
Prepare(feed_tensors);
// DryRun may be called many times.
profiler_.Reset();
profiler_.Start();
ExecuteInstructionList(vec_instruction_, *global_scope_, place_,
/*is_dry_run=*/true);
platform::DeviceContextPool::Instance().Get(place_)->Wait();
profiler_.Pause();
profiler_.TotalCUDAAllocatedMemorySize(place_);
return profiler_.GetCostInfo();
}
platform::DeviceContext* InterpreterCore::ParseDeviceContextForInstruction(
const OpFuncNode& op_func_node, const OperatorBase& op_base) {
@@ -21,6 +21,7 @@
#include "paddle/fluid/framework/new_executor/interpretercore_util.h"
#include "paddle/fluid/framework/new_executor/new_executor_defs.h"
#include "paddle/fluid/framework/new_executor/profiler.h"
#include "paddle/fluid/framework/new_executor/workqueue.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/tensor.h"
@@ -41,6 +42,8 @@ class InterpreterCore {
paddle::framework::FetchList Run(
const std::vector<framework::Tensor>& feed_tensors);
const CostInfo& DryRun(const std::vector<framework::Tensor>& feed_tensors);
static void BuildOpFuncList(const platform::Place& place,
const framework::ProgramDesc& pdesc,
std::vector<OperatorBase*>* op_list,
@@ -58,7 +61,8 @@ class InterpreterCore {
void ExecuteInstructionList(const std::vector<Instruction>& vec_instr,
const VariableScope& var_scope,
const platform::Place& place);
const platform::Place& place,
bool is_dry_run = false);
std::vector<size_t> MergeVector(const std::vector<size_t>& first,
const std::vector<size_t>& second);
@@ -66,6 +70,8 @@ class InterpreterCore {
void BuildVariableScope(const framework::ProgramDesc& pdesc,
VariableScope* var_scope);
void Prepare(const std::vector<framework::Tensor>& feed_tensors);
void CheckGC(size_t instr_id, const std::vector<size_t>& gc_check_list,
const VariableScope& var_scope, const platform::Place& place,
std::vector<VariableMetaInfo>& working_var_ref); // NOLINT
@@ -100,6 +106,7 @@ class InterpreterCore {
bool is_build_;
std::vector<std::string> feed_names_;
InterpreterProfiler profiler_;
std::map<size_t, std::shared_ptr<platform::DeviceEvent>> var_id2event_;
std::vector<paddle::platform::DeviceEvent> gc_event_;
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/framework/variable.h"
#include "paddle/fluid/platform/gpu_info.h"
#include "paddle/fluid/platform/timer.h"
namespace paddle {
namespace framework {
static void GetTensors(Variable* var, std::unordered_set<Tensor*>* tensor_set) {
if (var->IsType<LoDTensor>() && var->Get<LoDTensor>().IsInitialized()) {
tensor_set->insert(var->GetMutable<LoDTensor>());
} else if (var->IsType<SelectedRows>() &&
var->Get<SelectedRows>().value().IsInitialized()) {
tensor_set->insert(var->GetMutable<SelectedRows>()->mutable_value());
} else if (var->IsType<LoDTensorArray>()) {
auto* tensor_arr = var->GetMutable<LoDTensorArray>();
for (auto& t : *tensor_arr) {
if (t.IsInitialized()) {
tensor_set->insert(&t);
}
}
}
}
static std::pair<size_t, size_t> GetTensorMemorySize(
const std::vector<Variable*>& var_list) {
std::unordered_set<Tensor*> tensor_set;
for (auto* var : var_list) {
GetTensors(var, &tensor_set);
}
size_t host_memory_bytes = 0;
size_t device_memory_bytes = 0;
std::unordered_set<memory::Allocation*> allocation_set;
for (auto* tensor : tensor_set) {
auto allocation = tensor->Holder().get();
if (!allocation_set.count(allocation)) {
allocation_set.insert(allocation);
if (platform::is_cuda_pinned_place(tensor->place()) ||
platform::is_cpu_place(tensor->place())) {
VLOG(3) << "found host memory : " << allocation->size();
host_memory_bytes += allocation->size();
} else {
VLOG(3) << "found device memory : " << allocation->size();
device_memory_bytes += allocation->size();
}
}
}
return {host_memory_bytes, device_memory_bytes};
}
struct CostInfo {
double total_time{0.}; // ms
size_t host_memory_bytes{0}; // bytes
size_t device_memory_bytes{0}; // bytes
size_t device_total_memory_bytes{0}; // total allocated memory size
};
class InterpreterProfiler {
public:
void Start() { timer_.Start(); }
void Pause() {
timer_.Pause();
cost_info_.total_time += timer_.ElapsedMS();
}
void Reset() {
timer_.Reset();
cost_info_.total_time = 0.;
cost_info_.host_memory_bytes = 0;
cost_info_.device_memory_bytes = 0;
cost_info_.device_total_memory_bytes = 0;
}
void ParseMemoryInfo(const std::vector<Variable*>& vars) {
timer_.Start();
auto memory_info = GetTensorMemorySize(vars);
VLOG(3) << "host memory size: " << memory_info.first;
cost_info_.host_memory_bytes =
std::max(cost_info_.host_memory_bytes, memory_info.first);
VLOG(3) << "device memory size: " << memory_info.second;
cost_info_.device_memory_bytes =
std::max(cost_info_.device_memory_bytes, memory_info.second);
timer_.Pause();
cost_info_.total_time -= timer_.ElapsedMS();
}
void TotalCUDAAllocatedMemorySize(const platform::Place& place) {
if (platform::is_gpu_place(place)) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
auto cuda_place = BOOST_GET_CONST(platform::CUDAPlace, place);
cost_info_.device_total_memory_bytes =
platform::RecordedCudaMallocSize(cuda_place.device);
#endif
}
}
const CostInfo& GetCostInfo() const { return cost_info_; }
private:
platform::Timer timer_;
CostInfo cost_info_;
};
} // namespace framework
} // namespace paddle
@@ -61,6 +61,15 @@ paddle::framework::FetchList StandaloneExecutor::Run(
return core->Run(feed_tensors);
}
const CostInfo& StandaloneExecutor::DryRun(
const std::vector<std::string>& feed_names,
const std::vector<framework::Tensor>& feed_tensors) {
auto core = GetInterpreterCore(feed_names, {});
auto& cost_info = core->DryRun(feed_tensors);
return cost_info;
}
void StandaloneExecutor::BuildVariableOuterScope(
const framework::ProgramDesc& pdesc, VariableScope* var_scope,
Scope* outer_scope) {
@@ -45,6 +45,9 @@ class StandaloneExecutor : public ExecutorBase {
const std::vector<framework::Tensor>& feed_tensors,
const std::vector<std::string>& fetch_names);
const CostInfo& DryRun(const std::vector<std::string>& feed_names,
const std::vector<framework::Tensor>& feed_tensors);
private:
void BuildVariableOuterScope(const framework::ProgramDesc& pdesc,
VariableScope* var_scope, Scope* outer_scope);
@@ -499,7 +499,7 @@ class RecordedCudaMallocHelper {
*/
gpuError_t Malloc(void **ptr, size_t size) {
LockGuardPtr<std::mutex> lock(mtx_);
if (UNLIKELY(NeedRecord() && cur_size_ + size > limit_size_)) {
if (UNLIKELY(NeedRecord() && cur_size_.load() + size > limit_size_)) {
#ifdef PADDLE_WITH_HIP
return hipErrorOutOfMemory;
#else
@@ -514,9 +514,7 @@ class RecordedCudaMallocHelper {
auto result = cudaMalloc(ptr, size);
#endif
if (result == gpuSuccess) {
if (NeedRecord()) {
cur_size_ += size;
}
cur_size_.fetch_add(size);
STAT_INT_ADD("STAT_gpu" + std::to_string(dev_id_) + "_mem_size", size);
return gpuSuccess;
} else {
@@ -551,10 +549,7 @@ class RecordedCudaMallocHelper {
if (err != cudaErrorCudartUnloading) {
#endif
PADDLE_ENFORCE_CUDA_SUCCESS(err);
if (NeedRecord()) {
std::lock_guard<std::mutex> guard(*mtx_);
cur_size_ -= size;
}
cur_size_.fetch_sub(size);
STAT_INT_SUB("STAT_gpu" + std::to_string(dev_id_) + "_mem_size", size);
} else {
#ifdef PADDLE_WITH_HIP
@@ -582,7 +577,7 @@ class RecordedCudaMallocHelper {
if (NeedRecord()) {
std::lock_guard<std::mutex> guard(*mtx_);
*avail = std::min(*actual_avail, limit_size_ - cur_size_);
*avail = std::min(*actual_avail, limit_size_ - cur_size_.load());
*total = std::min(*actual_total, limit_size_);
return *total < *actual_total;
} else {
@@ -594,17 +589,14 @@ class RecordedCudaMallocHelper {
inline bool NeedRecord() const { return limit_size_ != 0; }
uint64_t RecordedSize() const {
LockGuardPtr<std::mutex> lock(mtx_);
return NeedRecord() ? cur_size_ : 0;
}
uint64_t RecordedSize() const { return cur_size_.load(); }
uint64_t LimitSize() const { return limit_size_; }
private:
const int dev_id_;
const uint64_t limit_size_;
uint64_t cur_size_{0};
std::atomic<uint64_t> cur_size_{0};
mutable std::unique_ptr<std::mutex> mtx_;
@@ -1951,6 +1951,16 @@ All parameter, weight, gradient are variables in Paddle.
fetch_vars);
});
py::class_<framework::CostInfo>(m, "CostInfo")
.def(py::init<>())
.def("total_time", [](CostInfo &self) { return self.total_time; })
.def("host_memory_bytes",
[](CostInfo &self) { return self.host_memory_bytes; })
.def("device_memory_bytes",
[](CostInfo &self) { return self.device_memory_bytes; })
.def("device_total_memory_bytes",
[](CostInfo &self) { return self.device_total_memory_bytes; });
py::class_<framework::StandaloneExecutor>(m, "StandaloneExecutor")
.def(py::init<const platform::Place &, const ProgramDesc &,
const ProgramDesc &, Scope *>())
@@ -1974,8 +1984,28 @@ All parameter, weight, gradient are variables in Paddle.
pybind11::gil_scoped_release release;
ret = self.Run(feed_names, feed_tensors, fetch_names);
}
return py::cast(std::move(ret));
})
.def("dry_run",
[](StandaloneExecutor &self,
const std::unordered_map<std::string, py::array> &input_dict) {
std::vector<framework::Tensor> feed_tensors;
std::vector<std::string> feed_names;
for (auto &item : input_dict) {
framework::LoDTensor t;
SetTensorFromPyArray<platform::CPUPlace>(
&t, item.second, platform::CPUPlace(), false);
feed_names.push_back(item.first);
feed_tensors.push_back(t);
}
CostInfo cost_info;
{
pybind11::gil_scoped_release release;
cost_info = self.DryRun(feed_names, feed_tensors);
}
return cost_info;
});
m.def("init_gflags", framework::InitGflags);
@@ -12,6 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
import unittest
import paddle
from paddle.fluid import core
@@ -56,6 +57,23 @@ class LinearTestCase(unittest.TestCase):
                    [2, 2], dtype="float32") * i
            }, [a.name, c.name])

            # test for cost_info
            cost_info = standaloneexecutor.dry_run({
                "a": np.ones(
                    [2, 2], dtype="float32") * i
            })
            self.check_cost_info(cost_info)

    def check_cost_info(self, cost_info):
        if core.is_compiled_with_cuda():
            self.assertEqual(cost_info.host_memory_bytes(), 16)
            self.assertGreater(cost_info.device_memory_bytes(), 0)
            self.assertGreaterEqual(cost_info.device_total_memory_bytes(),
                                    cost_info.device_memory_bytes())
        else:
            self.assertGreater(cost_info.host_memory_bytes(), 0)
            self.assertEqual(cost_info.device_memory_bytes(), 0)


class MultiStreamModelTestCase(unittest.TestCase):
    def setUp(self):
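
A note on the CUDA-branch expectations in check_cost_info above: asserting host_memory_bytes() == 16 matches a single host-resident 2x2 float32 tensor, i.e. the feed "a" (dry_run copies the feed onto CPUPlace), while the fc parameters and outputs live on the device and are counted under device_memory_bytes(). A trivial check of that arithmetic:

import numpy as np

# 2 * 2 float32 elements at 4 bytes each = 16 bytes, the value asserted
# for host_memory_bytes() on CUDA builds.
feed = np.ones([2, 2], dtype="float32")
assert feed.size * feed.itemsize == 16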