未验证 提交 989f3dde 编写于 作者: 周周周 提交者: GitHub

[Paddle-TRT] use hook to collect shape in CollectShapeRangeInfo API. (#54841)

* commit

* commit

* commit

* commit

* final commit

* use hook to collect shape and shape value
上级 b6321350
......@@ -69,8 +69,12 @@ void NaiveExecutor::Run() {
platform::NvtxRangeColor::Green);
#endif
for (auto &func : input_hookfuncs_) {
func(op.get(), scope_);
}
if (op->Type() == "while") {
op->SetOutputHooks(hookfuncs_);
op->SetOutputHooks(output_hookfuncs_);
}
op->Run(*scope_, place_);
......@@ -104,7 +108,7 @@ void NaiveExecutor::Run() {
#ifdef PADDLE_WITH_INFERENCE_NVTX
platform::CudaNvtxRangePop();
#endif
for (auto &func : hookfuncs_) {
for (auto &func : output_hookfuncs_) {
func(op.get(), scope_);
}
}
......@@ -185,7 +189,11 @@ phi::DenseTensor *NaiveExecutor::FindTensor(const std::string &name) {
}
// Appends an output hook; every registered hook is invoked for each op
// right after OperatorBase::Run (see the loop over output_hookfuncs_ in
// NaiveExecutor::Run).
void NaiveExecutor::RegisterOutputHook(const HookFunc &hookfunc) {
  // Only the renamed member is used post-rename; the old `hookfuncs_`
  // vector no longer exists, so registering into it would not compile.
  output_hookfuncs_.push_back(hookfunc);
}
// Appends an input hook; every registered hook is invoked for each op
// just before the op executes (see NaiveExecutor::Run).
void NaiveExecutor::RegisterInputHook(const HookFunc &hookfunc) {
  input_hookfuncs_.emplace_back(hookfunc);
}
void NaiveExecutor::MakeReusePlan(
......
......@@ -76,6 +76,7 @@ class NaiveExecutor {
void CloneLiteEnigne(int num, void* stream);
void RegisterOutputHook(const HookFunc& hookfunc);
void RegisterInputHook(const HookFunc& hookfunc);
private:
void CreateOps(const ProgramDesc& desc,
......@@ -88,7 +89,8 @@ class NaiveExecutor {
std::vector<std::unique_ptr<OperatorBase>> ops_;
Scope* scope_{nullptr};
std::vector<HookFunc> hookfuncs_;
std::vector<HookFunc> output_hookfuncs_;
std::vector<HookFunc> input_hookfuncs_;
// Record information that tensor_a should ShareBufferWith tensor_b.
std::unordered_map<OperatorBase*, std::unordered_map<phi::DenseTensor*, int>>
......
......@@ -2092,6 +2092,9 @@ bool AnalysisPredictor::ZeroCopyRun() {
}
#endif
if (config_.shape_range_info_collected()) {
HookCollectShapeRangeInfo();
}
#ifdef PADDLE_WITH_XPU
InferXPUContext *infer_xpu_ctx = nullptr;
if (config_.use_xpu_ && !config_.use_lite_) {
......@@ -2126,10 +2129,6 @@ bool AnalysisPredictor::ZeroCopyRun() {
}
#endif
if (config_.shape_range_info_collected()) {
CollectShapeRangeInfo();
}
// Fix TensorArray reuse not cleaned bug.
tensor_array_batch_cleaner_.CollectTensorArrays(sub_scope_);
tensor_array_batch_cleaner_.ResetTensorArray();
......@@ -2193,65 +2192,46 @@ bool AnalysisPredictor::ExpRunWithExternalStream(const gpuStream_t stream) {
}
#endif
// Runs one inference pass with a caller-supplied runtime configuration.
// Only XPU builds honor `config`; on other builds the call is a no-op that
// returns false.
// NOTE(review): `config` is blindly reinterpret_cast to XpuRuntimeConfig —
// the caller must guarantee the pointee type; there is no runtime check.
bool AnalysisPredictor::ExpRunWithRuntimeConfig(void *config) {
#ifdef PADDLE_WITH_XPU
auto xpu_runtime_config =
reinterpret_cast<paddle_infer::experimental::XpuRuntimeConfig *>(config);
config_.xpu_config_.context = xpu_runtime_config->context;
// Switching streams: drain work queued on the previous stream before any
// later kernels are issued on the new one.
auto *stream = xpu_runtime_config->stream;
if (stream != nullptr && stream != predictor_stream_) {
paddle::platform::XPUStreamSync(
static_cast<paddle::xpuStream>(predictor_stream_));
predictor_stream_ = stream;
}
// The autotuned portion of L3 cache cannot exceed the total L3 budget.
auto l3_size = xpu_runtime_config->l3_size;
auto l3_autotune_size = xpu_runtime_config->l3_autotune_size;
PADDLE_ENFORCE_LE(
l3_autotune_size,
l3_size,
phi::errors::InvalidArgument(
"l3_autotune_size(%zu) should be less than or equal to l3_size(%zu).",
l3_autotune_size,
l3_size));
config_.xpu_config_.l3_size = l3_size;
config_.xpu_config_.l3_ptr = xpu_runtime_config->l3_ptr;
config_.xpu_config_.l3_autotune_size = l3_autotune_size;
return ZeroCopyRun();
#endif
// Reached only when XPU support is compiled out.
return false;
}
void AnalysisPredictor::CollectShapeRangeInfo() {
// if use gpu, sync first.
paddle::platform::DeviceContextPool &pool =
paddle::platform::DeviceContextPool::Instance();
if (config_.use_gpu()) {
void AnalysisPredictor::HookCollectShapeRangeInfo() {
auto hook = [&](const std::string &op_type,
const std::string &input_name,
const paddle::Tensor &var) -> void {
paddle::platform::DeviceContextPool &pool =
paddle::platform::DeviceContextPool::Instance();
if (config_.use_gpu()) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
auto *dev_ctx = pool.Get(place_);
auto stream = static_cast<phi::GPUContext *>(dev_ctx)->stream();
auto *dev_ctx = pool.Get(place_);
auto stream = static_cast<phi::GPUContext *>(dev_ctx)->stream();
#ifdef PADDLE_WITH_HIP
hipStreamSynchronize(stream);
hipStreamSynchronize(stream);
#else
cudaStreamSynchronize(stream);
cudaStreamSynchronize(stream);
#endif
#endif
}
}
std::vector<std::string> var_names = sub_scope_->LocalVarNames();
for (const auto &name : var_names) {
auto *var = sub_scope_->GetVar(name);
if (!var->IsType<phi::DenseTensor>()) {
continue;
auto *new_var = sub_scope_->GetVar(input_name);
if (!new_var) return;
if (!new_var->IsType<phi::DenseTensor>()) {
return;
}
auto tensor = var->Get<phi::DenseTensor>();
if (!tensor.initialized()) continue;
auto tensor = new_var->Get<phi::DenseTensor>();
if (!tensor.initialized()) return;
framework::DDim dim = tensor.dims();
std::vector<int32_t> shape(dim.size());
for (size_t i = 0; i < shape.size(); ++i) shape[i] = dim[i];
shape_info_[name].emplace_back(shape);
if (shape.size() >= 1) {
shape_info_[input_name].emplace_back(shape);
} else if (tensor.numel() > 0) {
// This must be a zero dimension tensor.
PADDLE_ENFORCE_EQ(tensor.numel(),
1UL,
platform::errors::PreconditionNotMet(
"This tensor must have one element, but got %ld.",
tensor.numel()));
std::vector<int32_t> zero_shape(1, 1);
shape_info_[input_name].emplace_back(zero_shape);
}
// We need collect value range for shape tensor for Paddle-TRT's use.
// To be noticed, this method to identify all shape tensors is based on
......@@ -2296,9 +2276,41 @@ void AnalysisPredictor::CollectShapeRangeInfo() {
nullptr);
#endif
}
shape_tensor_value_[name].emplace_back(int32_host);
shape_tensor_value_[input_name].emplace_back(int32_host);
}
};
RegisterInputHook(hook);
}
// Runs one inference pass with a caller-supplied runtime configuration.
// `config` must point to a paddle_infer::experimental::XpuRuntimeConfig;
// only XPU builds honor it.
//
// \param config opaque pointer, reinterpret_cast to XpuRuntimeConfig
//               (caller must guarantee the pointee type — no runtime check).
// \return the result of ZeroCopyRun() on XPU builds, false otherwise.
bool AnalysisPredictor::ExpRunWithRuntimeConfig(void *config) {
#ifdef PADDLE_WITH_XPU
  auto xpu_runtime_config =
      reinterpret_cast<paddle_infer::experimental::XpuRuntimeConfig *>(config);
  config_.xpu_config_.context = xpu_runtime_config->context;
  // Switching streams: drain work queued on the previous stream before any
  // later kernels are issued on the new one.
  auto *stream = xpu_runtime_config->stream;
  if (stream != nullptr && stream != predictor_stream_) {
    paddle::platform::XPUStreamSync(
        static_cast<paddle::xpuStream>(predictor_stream_));
    predictor_stream_ = stream;
  }
  // The autotuned portion of L3 cache cannot exceed the total L3 budget.
  auto l3_size = xpu_runtime_config->l3_size;
  auto l3_autotune_size = xpu_runtime_config->l3_autotune_size;
  PADDLE_ENFORCE_LE(
      l3_autotune_size,
      l3_size,
      phi::errors::InvalidArgument(
          "l3_autotune_size(%zu) should be less than or equal to l3_size(%zu).",
          l3_autotune_size,
          l3_size));
  config_.xpu_config_.l3_size = l3_size;
  config_.xpu_config_.l3_ptr = xpu_runtime_config->l3_ptr;
  config_.xpu_config_.l3_autotune_size = l3_autotune_size;
  return ZeroCopyRun();
#else
  // XPU support compiled out: the runtime config cannot be honored. Using
  // #else (instead of a trailing `return false;` after #endif) removes the
  // unreachable return in XPU builds and silences the unused-parameter
  // warning in non-XPU builds.
  (void)config;
  return false;
#endif
}
void AnalysisPredictor::StatisticShapeRangeInfo() {
......@@ -2677,10 +2689,33 @@ void AnalysisPredictor::SaveOptimModel(const std::string &dir) {
exe.Run(save_program, scope(), 0, true, true);
}
// Registers a user callback invoked for every DenseTensor input of every op
// just before the op runs. The executor-level bridge (which walks op inputs
// and fans out to all registered callbacks) is installed exactly once; later
// calls only append to input_hookfuncs_, which the bridge iterates.
// NOTE(review): the once_flag is function-static, so the bridge is installed
// only on the FIRST predictor instance that calls this — a second predictor's
// executor_ would never receive it; confirm the single-predictor assumption.
void AnalysisPredictor::RegisterInputHook(const InputTensorHookFunc &hookfunc) {
  static std::once_flag register_input_hook_flag;
  std::call_once(register_input_hook_flag, [this] {
    executor_->RegisterInputHook(
        [this](framework::OperatorBase *op, framework::Scope *scope) {
          for (auto &input : op->Inputs()) {
            for (auto &var_name : input.second) {
              auto *var = scope->FindVar(var_name);
              if (!var || !var->IsType<phi::DenseTensor>()) continue;
              // Bind by const reference: Get<> returns a reference, and the
              // one copy we need is made by make_shared below — copying into
              // a local first would copy twice.
              const auto &dense_tensor = var->Get<phi::DenseTensor>();
              if (!dense_tensor.initialized()) continue;
              auto tensor = paddle::Tensor(
                  std::make_shared<phi::DenseTensor>(dense_tensor), var_name);
              for (auto &hookfunc : this->input_hookfuncs_) {
                hookfunc(op->Type(), var_name, tensor);
              }
            }
          }
        });
  });
  input_hookfuncs_.push_back(hookfunc);
}
void AnalysisPredictor::RegisterOutputHook(
const OutputTensorHookFunc &hookfunc) {
static std::once_flag register_hook_flag;
std::call_once(register_hook_flag, [this] {
static std::once_flag register_output_hook_flag;
std::call_once(register_output_hook_flag, [this] {
executor_->RegisterOutputHook(
[this](framework::OperatorBase *op, framework::Scope *scope) {
for (auto &output : op->Outputs()) {
......@@ -2691,14 +2726,14 @@ void AnalysisPredictor::RegisterOutputHook(
if (!dense_tensor.initialized()) continue;
auto tensor = paddle::Tensor(
std::make_shared<phi::DenseTensor>(dense_tensor), var_name);
for (auto &hookfunc : this->hookfuncs_) {
for (auto &hookfunc : this->output_hookfuncs_) {
hookfunc(op->Type(), var_name, tensor);
}
}
}
});
});
hookfuncs_.push_back(hookfunc);
output_hookfuncs_.push_back(hookfunc);
}
template <>
......@@ -2987,6 +3022,9 @@ uint64_t Predictor::TryShrinkMemory() { return predictor_->TryShrinkMemory(); }
// Forwards the output-tensor hook to the wrapped AnalysisPredictor; the hook
// fires for each op output after the op executes.
void Predictor::RegisterOutputHook(const OutputTensorHookFunc &hookfunc) {
predictor_->RegisterOutputHook(hookfunc);
}
void Predictor::RegisterInputHook(const OutputTensorHookFunc &hookfunc) {
predictor_->RegisterInputHook(hookfunc);
}
// Returns the device execution stream of the wrapped predictor (nullptr when
// the backend has no stream concept — see the header comment).
void *Predictor::GetExecStream() const { return predictor_->GetExecStream(); }
......
......@@ -328,6 +328,9 @@ class AnalysisPredictor : public PaddlePredictor {
///
void RegisterOutputHook(const OutputTensorHookFunc &hookfunc) override;
/// \brief Same as RegisterOutputHook
void RegisterInputHook(const InputTensorHookFunc &hookfunc) override;
///
/// \brief Initialize mkldnn quantizer and execute mkldnn quantization pass
///
......@@ -498,8 +501,7 @@ class AnalysisPredictor : public PaddlePredictor {
private:
void StatisticShapeRangeInfo();
void CollectShapeRangeInfo();
void HookCollectShapeRangeInfo();
void InitPlace();
void InitDeviceContexts();
void InitResourceManager(void *stream);
......@@ -598,7 +600,8 @@ class AnalysisPredictor : public PaddlePredictor {
private:
std::vector<OutputTensorHookFunc> hookfuncs_;
std::vector<OutputTensorHookFunc> output_hookfuncs_;
std::vector<InputTensorHookFunc> input_hookfuncs_;
// Some status here that help to determine the status inside the predictor.
bool status_is_cloned_{false};
......
......@@ -38,6 +38,7 @@ namespace paddle {
using PaddleDType = paddle_infer::DataType;
using PaddlePlace = paddle_infer::PlaceType;
using PaddleDataLayout = paddle_infer::DataLayout;
using paddle_infer::InputTensorHookFunc;
using paddle_infer::OutputTensorHookFunc;
/// \brief Memory manager for PaddleTensor.
......@@ -323,6 +324,9 @@ class PD_INFER_DECL PaddlePredictor {
///
virtual void RegisterOutputHook(const OutputTensorHookFunc& hookfunc) {}
/// \brief Same as RegisterOutputHook
virtual void RegisterInputHook(const InputTensorHookFunc& hookfunc) {}
/// \brief Clone an existing predictor
/// When using clone, the same network will be created,
/// and the parameters between them are shared.
......
......@@ -200,6 +200,9 @@ class PD_INFER_DECL Predictor {
///
void RegisterOutputHook(const OutputTensorHookFunc& hookfunc);
/// The same as RegisterOutputHook.
void RegisterInputHook(const InputTensorHookFunc& hookfunc);
///
/// \brief Get the execution stream on devices with a concept of stream,
/// otherwise returns nullptr.
......
......@@ -38,6 +38,7 @@ using Strings = std::vector<std::string>;
using OutputTensorHookFunc = std::function<void(
const std::string&, const std::string&, const paddle::Tensor&)>;
using InputTensorHookFunc = OutputTensorHookFunc;
typedef void (*CallbackFunc)(void*);
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册