Unverified commit 989f3dde, authored by 周周周, committed by GitHub

[Paddle-TRT] use hook to collect shape in CollectShapeRangeInfo API. (#54841)

* commit

* commit

* commit

* commit

* final commit

* use hook to collect shape and shape value
Parent b6321350
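For orientation, this is the user-facing flow the change serves: shape-range collection for Paddle-TRT dynamic-shape tuning. A minimal user-side sketch, assuming the public paddle_infer C++ API (model paths are placeholders):

```cpp
// With shape-range collection enabled, each Run() now records input shapes
// via the new per-op input hooks instead of scanning the scope after the run.
#include "paddle_inference_api.h"

int main() {
  paddle_infer::Config config("model.pdmodel", "model.pdiparams");
  // Write observed tensor shape ranges to this file; this is what makes
  // config_.shape_range_info_collected() true inside ZeroCopyRun().
  config.CollectShapeRangeInfo("shape_range_info.pbtxt");

  auto predictor = paddle_infer::CreatePredictor(config);
  // ... set input tensors here ...
  predictor->Run();  // shapes are collected by hooks during this call
  return 0;
}
```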
@@ -69,8 +69,12 @@ void NaiveExecutor::Run() {
                                    platform::NvtxRangeColor::Green);
 #endif
 
+    for (auto &func : input_hookfuncs_) {
+      func(op.get(), scope_);
+    }
+
     if (op->Type() == "while") {
-      op->SetOutputHooks(hookfuncs_);
+      op->SetOutputHooks(output_hookfuncs_);
     }
 
     op->Run(*scope_, place_);
@@ -104,7 +108,7 @@ void NaiveExecutor::Run() {
 #ifdef PADDLE_WITH_INFERENCE_NVTX
     platform::CudaNvtxRangePop();
 #endif
-    for (auto &func : hookfuncs_) {
+    for (auto &func : output_hookfuncs_) {
       func(op.get(), scope_);
     }
   }
@@ -185,7 +189,11 @@ phi::DenseTensor *NaiveExecutor::FindTensor(const std::string &name) {
 }
 
 void NaiveExecutor::RegisterOutputHook(const HookFunc &hookfunc) {
-  hookfuncs_.push_back(hookfunc);
+  output_hookfuncs_.push_back(hookfunc);
+}
+
+void NaiveExecutor::RegisterInputHook(const HookFunc &hookfunc) {
+  input_hookfuncs_.push_back(hookfunc);
 }
 
 void NaiveExecutor::MakeReusePlan(
......
@@ -76,6 +76,7 @@ class NaiveExecutor {
   void CloneLiteEnigne(int num, void* stream);
 
   void RegisterOutputHook(const HookFunc& hookfunc);
+  void RegisterInputHook(const HookFunc& hookfunc);
 
  private:
   void CreateOps(const ProgramDesc& desc,
@@ -88,7 +89,8 @@ class NaiveExecutor {
   std::vector<std::unique_ptr<OperatorBase>> ops_;
   Scope* scope_{nullptr};
 
-  std::vector<HookFunc> hookfuncs_;
+  std::vector<HookFunc> output_hookfuncs_;
+  std::vector<HookFunc> input_hookfuncs_;
 
   // Record information that tensor_a should ShareBufferWith tensor_b.
   std::unordered_map<OperatorBase*, std::unordered_map<phi::DenseTensor*, int>>
......
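The HookFunc type itself is not shown in this diff; from the call sites func(op.get(), scope_) it is a callable over (OperatorBase*, Scope*). A standalone sketch of the dispatch pattern above, using stand-in types rather than Paddle's real classes:

```cpp
// Stand-in OperatorBase/Scope; only the hook-dispatch shape is real here.
#include <functional>
#include <iostream>
#include <memory>
#include <string>
#include <vector>

struct Scope {};  // stand-in for framework::Scope
struct OperatorBase {  // stand-in for framework::OperatorBase
  std::string type;
  const std::string &Type() const { return type; }
};

using HookFunc = std::function<void(OperatorBase *, Scope *)>;

class NaiveExecutorSketch {
 public:
  void RegisterInputHook(const HookFunc &f) { input_hookfuncs_.push_back(f); }
  void RegisterOutputHook(const HookFunc &f) { output_hookfuncs_.push_back(f); }

  void Run() {
    for (auto &op : ops_) {
      // Input hooks fire before the op runs -- this is what lets the
      // shape-collection hook observe every operator input.
      for (auto &func : input_hookfuncs_) func(op.get(), &scope_);
      // ... op->Run(*scope_, place_) would go here ...
      // Output hooks fire after the op runs.
      for (auto &func : output_hookfuncs_) func(op.get(), &scope_);
    }
  }

  std::vector<std::unique_ptr<OperatorBase>> ops_;
  Scope scope_;

 private:
  std::vector<HookFunc> input_hookfuncs_;
  std::vector<HookFunc> output_hookfuncs_;
};

int main() {
  NaiveExecutorSketch exec;
  exec.ops_.push_back(std::make_unique<OperatorBase>(OperatorBase{"conv2d"}));
  exec.RegisterInputHook([](OperatorBase *op, Scope *) {
    std::cout << "before " << op->Type() << "\n";
  });
  exec.Run();
}
```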
@@ -2092,6 +2092,9 @@ bool AnalysisPredictor::ZeroCopyRun() {
   }
 #endif
 
+  if (config_.shape_range_info_collected()) {
+    HookCollectShapeRangeInfo();
+  }
 #ifdef PADDLE_WITH_XPU
   InferXPUContext *infer_xpu_ctx = nullptr;
   if (config_.use_xpu_ && !config_.use_lite_) {
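Note the trigger moves from after the run (removed in the next hunk) to before it: HookCollectShapeRangeInfo only registers a hook, so it must be installed before NaiveExecutor::Run starts firing per-op input hooks during this same call.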
@@ -2126,10 +2129,6 @@ bool AnalysisPredictor::ZeroCopyRun() {
   }
 #endif
 
-  if (config_.shape_range_info_collected()) {
-    CollectShapeRangeInfo();
-  }
-
   // Fix TensorArray reuse not cleaned bug.
   tensor_array_batch_cleaner_.CollectTensorArrays(sub_scope_);
   tensor_array_batch_cleaner_.ResetTensorArray();
@@ -2193,65 +2192,46 @@ bool AnalysisPredictor::ExpRunWithExternalStream(const gpuStream_t stream) {
 }
 #endif
 
-bool AnalysisPredictor::ExpRunWithRuntimeConfig(void *config) {
-#ifdef PADDLE_WITH_XPU
-  auto xpu_runtime_config =
-      reinterpret_cast<paddle_infer::experimental::XpuRuntimeConfig *>(config);
-
-  config_.xpu_config_.context = xpu_runtime_config->context;
-  auto *stream = xpu_runtime_config->stream;
-  if (stream != nullptr && stream != predictor_stream_) {
-    paddle::platform::XPUStreamSync(
-        static_cast<paddle::xpuStream>(predictor_stream_));
-    predictor_stream_ = stream;
-  }
-
-  auto l3_size = xpu_runtime_config->l3_size;
-  auto l3_autotune_size = xpu_runtime_config->l3_autotune_size;
-  PADDLE_ENFORCE_LE(
-      l3_autotune_size,
-      l3_size,
-      phi::errors::InvalidArgument(
-          "l3_autotune_size(%zu) should be less than or equal to l3_size(%zu).",
-          l3_autotune_size,
-          l3_size));
-  config_.xpu_config_.l3_size = l3_size;
-  config_.xpu_config_.l3_ptr = xpu_runtime_config->l3_ptr;
-  config_.xpu_config_.l3_autotune_size = l3_autotune_size;
-
-  return ZeroCopyRun();
-#endif
-  return false;
-}
-
-void AnalysisPredictor::CollectShapeRangeInfo() {
-  // if use gpu, sync first.
-  paddle::platform::DeviceContextPool &pool =
-      paddle::platform::DeviceContextPool::Instance();
-  if (config_.use_gpu()) {
+void AnalysisPredictor::HookCollectShapeRangeInfo() {
+  auto hook = [&](const std::string &op_type,
+                  const std::string &input_name,
+                  const paddle::Tensor &var) -> void {
+    paddle::platform::DeviceContextPool &pool =
+        paddle::platform::DeviceContextPool::Instance();
+    if (config_.use_gpu()) {
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-    auto *dev_ctx = pool.Get(place_);
-    auto stream = static_cast<phi::GPUContext *>(dev_ctx)->stream();
+      auto *dev_ctx = pool.Get(place_);
+      auto stream = static_cast<phi::GPUContext *>(dev_ctx)->stream();
 #ifdef PADDLE_WITH_HIP
-    hipStreamSynchronize(stream);
+      hipStreamSynchronize(stream);
 #else
-    cudaStreamSynchronize(stream);
+      cudaStreamSynchronize(stream);
 #endif
 #endif
-  }
-
-  std::vector<std::string> var_names = sub_scope_->LocalVarNames();
-  for (const auto &name : var_names) {
-    auto *var = sub_scope_->GetVar(name);
-    if (!var->IsType<phi::DenseTensor>()) {
-      continue;
-    }
-    auto tensor = var->Get<phi::DenseTensor>();
-    if (!tensor.initialized()) continue;
-    framework::DDim dim = tensor.dims();
-    std::vector<int32_t> shape(dim.size());
-    for (size_t i = 0; i < shape.size(); ++i) shape[i] = dim[i];
-    shape_info_[name].emplace_back(shape);
+    }
+
+    auto *new_var = sub_scope_->GetVar(input_name);
+    if (!new_var) return;
+    if (!new_var->IsType<phi::DenseTensor>()) {
+      return;
+    }
+    auto tensor = new_var->Get<phi::DenseTensor>();
+    if (!tensor.initialized()) return;
+    framework::DDim dim = tensor.dims();
+    std::vector<int32_t> shape(dim.size());
+    for (size_t i = 0; i < shape.size(); ++i) shape[i] = dim[i];
+    if (shape.size() >= 1) {
+      shape_info_[input_name].emplace_back(shape);
+    } else if (tensor.numel() > 0) {
+      // This must be a zero dimension tensor.
+      PADDLE_ENFORCE_EQ(tensor.numel(),
+                        1UL,
+                        platform::errors::PreconditionNotMet(
+                            "This tensor must have one element, but got %ld.",
+                            tensor.numel()));
+      std::vector<int32_t> zero_shape(1, 1);
+      shape_info_[input_name].emplace_back(zero_shape);
+    }
 
     // We need collect value range for shape tensor for Paddle-TRT's use.
     // To be noticed, this method to identify all shape tensors is based on
@@ -2296,9 +2276,41 @@ void AnalysisPredictor::CollectShapeRangeInfo() {
             nullptr);
 #endif
       }
-      shape_tensor_value_[name].emplace_back(int32_host);
+      shape_tensor_value_[input_name].emplace_back(int32_host);
     }
-  }
-}
+  };
+  RegisterInputHook(hook);
+}
+
+bool AnalysisPredictor::ExpRunWithRuntimeConfig(void *config) {
+#ifdef PADDLE_WITH_XPU
+  auto xpu_runtime_config =
+      reinterpret_cast<paddle_infer::experimental::XpuRuntimeConfig *>(config);
+
+  config_.xpu_config_.context = xpu_runtime_config->context;
+  auto *stream = xpu_runtime_config->stream;
+  if (stream != nullptr && stream != predictor_stream_) {
+    paddle::platform::XPUStreamSync(
+        static_cast<paddle::xpuStream>(predictor_stream_));
+    predictor_stream_ = stream;
+  }
+
+  auto l3_size = xpu_runtime_config->l3_size;
+  auto l3_autotune_size = xpu_runtime_config->l3_autotune_size;
+  PADDLE_ENFORCE_LE(
+      l3_autotune_size,
+      l3_size,
+      phi::errors::InvalidArgument(
+          "l3_autotune_size(%zu) should be less than or equal to l3_size(%zu).",
+          l3_autotune_size,
+          l3_size));
+  config_.xpu_config_.l3_size = l3_size;
+  config_.xpu_config_.l3_ptr = xpu_runtime_config->l3_ptr;
+  config_.xpu_config_.l3_autotune_size = l3_autotune_size;
+
+  return ZeroCopyRun();
+#endif
+  return false;
+}
 
 void AnalysisPredictor::StatisticShapeRangeInfo() {
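Besides rerouting collection through the input hook, the new code adds one behavior: rank-0 (zero dimension) tensors, which previously produced an empty shape entry, are now recorded as shape [1] after checking they hold exactly one element. A standalone sketch of that normalization, with stand-in containers rather than Paddle's:

```cpp
// Rank-0 tensors hold exactly one element, so they are normalized to [1]
// before being written into the shape-range record.
#include <cstdint>
#include <iostream>
#include <map>
#include <stdexcept>
#include <string>
#include <vector>

std::map<std::string, std::vector<std::vector<int32_t>>> shape_info;

void RecordShape(const std::string &name,
                 std::vector<int32_t> shape,
                 int64_t numel) {
  if (!shape.empty()) {
    shape_info[name].push_back(shape);
  } else if (numel > 0) {
    // A zero dimension tensor must hold exactly one element.
    if (numel != 1) throw std::runtime_error("zero-dim tensor, numel != 1");
    shape_info[name].push_back({1});  // normalize rank 0 -> [1]
  }
}

int main() {
  RecordShape("x", {8, 3, 224, 224}, 8LL * 3 * 224 * 224);
  RecordShape("scale", {}, 1);  // rank-0 scalar, recorded as [1]
  std::cout << shape_info["scale"][0][0] << "\n";  // prints 1
}
```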
@@ -2677,10 +2689,33 @@ void AnalysisPredictor::SaveOptimModel(const std::string &dir) {
   exe.Run(save_program, scope(), 0, true, true);
 }
 
+void AnalysisPredictor::RegisterInputHook(const InputTensorHookFunc &hookfunc) {
+  static std::once_flag register_input_hook_flag;
+  std::call_once(register_input_hook_flag, [this] {
+    executor_->RegisterInputHook(
+        [this](framework::OperatorBase *op, framework::Scope *scope) {
+          for (auto &input : op->Inputs()) {
+            for (auto &var_name : input.second) {
+              auto *var = scope->FindVar(var_name);
+              if (!var || !var->IsType<phi::DenseTensor>()) continue;
+              auto dense_tensor = var->Get<phi::DenseTensor>();
+              if (!dense_tensor.initialized()) continue;
+              auto tensor = paddle::Tensor(
+                  std::make_shared<phi::DenseTensor>(dense_tensor), var_name);
+              for (auto &hookfunc : this->input_hookfuncs_) {
+                hookfunc(op->Type(), var_name, tensor);
+              }
+            }
+          }
+        });
+  });
+  input_hookfuncs_.push_back(hookfunc);
+}
+
 void AnalysisPredictor::RegisterOutputHook(
     const OutputTensorHookFunc &hookfunc) {
-  static std::once_flag register_hook_flag;
-  std::call_once(register_hook_flag, [this] {
+  static std::once_flag register_output_hook_flag;
+  std::call_once(register_output_hook_flag, [this] {
     executor_->RegisterOutputHook(
         [this](framework::OperatorBase *op, framework::Scope *scope) {
           for (auto &output : op->Outputs()) {
@@ -2691,14 +2726,14 @@ void AnalysisPredictor::RegisterOutputHook(
               if (!dense_tensor.initialized()) continue;
               auto tensor = paddle::Tensor(
                   std::make_shared<phi::DenseTensor>(dense_tensor), var_name);
-              for (auto &hookfunc : this->hookfuncs_) {
+              for (auto &hookfunc : this->output_hookfuncs_) {
                 hookfunc(op->Type(), var_name, tensor);
               }
             }
           }
         });
   });
-  hookfuncs_.push_back(hookfunc);
+  output_hookfuncs_.push_back(hookfunc);
 }
 
 template <>
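The std::call_once guard means the predictor installs exactly one dispatcher into the executor, however many hooks users register; later RegisterInputHook calls only append to input_hookfuncs_, which that dispatcher iterates. A minimal sketch of the register-once pattern, independent of Paddle's types:

```cpp
// First call installs a single dispatcher; later calls only add callbacks.
#include <functional>
#include <iostream>
#include <mutex>
#include <vector>

std::vector<std::function<void(int)>> user_hooks;

void RegisterHook(std::function<void(int)> hook) {
  static std::once_flag flag;
  std::call_once(flag, [] {
    // Stands in for executor_->RegisterInputHook(dispatcher).
    std::cout << "dispatcher installed once\n";
  });
  user_hooks.push_back(std::move(hook));
}

int main() {
  RegisterHook([](int v) { std::cout << v << "\n"; });
  RegisterHook([](int v) { std::cout << v * 2 << "\n"; });  // no reinstall
  for (auto &h : user_hooks) h(21);  // dispatcher fans out to all hooks
}
```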
@@ -2987,6 +3022,9 @@ uint64_t Predictor::TryShrinkMemory() { return predictor_->TryShrinkMemory(); }
 
 void Predictor::RegisterOutputHook(const OutputTensorHookFunc &hookfunc) {
   predictor_->RegisterOutputHook(hookfunc);
 }
+
+void Predictor::RegisterInputHook(const OutputTensorHookFunc &hookfunc) {
+  predictor_->RegisterInputHook(hookfunc);
+}
 
 void *Predictor::GetExecStream() const { return predictor_->GetExecStream(); }
......
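A hypothetical caller of the new public API (everything here except RegisterInputHook and the hook signature is a placeholder); the lambda matches InputTensorHookFunc, i.e. (op_type, input_name, tensor):

```cpp
// Registering a per-op input hook on a paddle_infer::Predictor.
#include <iostream>
#include <string>
#include "paddle_inference_api.h"

int main() {
  paddle_infer::Config config("model.pdmodel", "model.pdiparams");
  auto predictor = paddle_infer::CreatePredictor(config);

  predictor->RegisterInputHook([](const std::string &op_type,
                                  const std::string &input_name,
                                  const paddle::Tensor &tensor) {
    std::cout << op_type << " reads " << input_name << " with "
              << tensor.shape().size() << " dims\n";
  });

  // ... set input tensors here ...
  predictor->Run();  // the hook fires before each op executes
  return 0;
}
```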
@@ -328,6 +328,9 @@ class AnalysisPredictor : public PaddlePredictor {
   ///
   void RegisterOutputHook(const OutputTensorHookFunc &hookfunc) override;
 
+  /// \brief Same as RegisterOutputHook
+  void RegisterInputHook(const InputTensorHookFunc &hookfunc) override;
+
   ///
   /// \brief Initialize mkldnn quantizer and execute mkldnn quantization pass
   ///
@@ -498,8 +501,7 @@ class AnalysisPredictor : public PaddlePredictor {
  private:
   void StatisticShapeRangeInfo();
-  void CollectShapeRangeInfo();
+  void HookCollectShapeRangeInfo();
   void InitPlace();
   void InitDeviceContexts();
   void InitResourceManager(void *stream);
@@ -598,7 +600,8 @@ class AnalysisPredictor : public PaddlePredictor {
  private:
-  std::vector<OutputTensorHookFunc> hookfuncs_;
+  std::vector<OutputTensorHookFunc> output_hookfuncs_;
+  std::vector<InputTensorHookFunc> input_hookfuncs_;
 
   // Some status here that help to determine the status inside the predictor.
   bool status_is_cloned_{false};
......
@@ -38,6 +38,7 @@ namespace paddle {
 using PaddleDType = paddle_infer::DataType;
 using PaddlePlace = paddle_infer::PlaceType;
 using PaddleDataLayout = paddle_infer::DataLayout;
+using paddle_infer::InputTensorHookFunc;
 using paddle_infer::OutputTensorHookFunc;
 
 /// \brief Memory manager for PaddleTensor.
@@ -323,6 +324,9 @@ class PD_INFER_DECL PaddlePredictor {
   ///
   virtual void RegisterOutputHook(const OutputTensorHookFunc& hookfunc) {}
 
+  /// \brief Same as RegisterOutputHook
+  virtual void RegisterInputHook(const InputTensorHookFunc& hookfunc) {}
+
   /// \brief Clone an existing predictor
   /// When using clone, the same network will be created,
   /// and the parameters between them are shared.
......
@@ -200,6 +200,9 @@ class PD_INFER_DECL Predictor {
   ///
   void RegisterOutputHook(const OutputTensorHookFunc& hookfunc);
 
+  /// The same as RegisterOutputHook.
+  void RegisterInputHook(const InputTensorHookFunc& hookfunc);
+
   ///
   /// \brief Get the execution stream on devices with a concept of stream,
   /// otherwise returns nullptr.
......
@@ -38,6 +38,7 @@ using Strings = std::vector<std::string>;
 using OutputTensorHookFunc = std::function<void(
     const std::string&, const std::string&, const paddle::Tensor&)>;
+using InputTensorHookFunc = OutputTensorHookFunc;
 
 typedef void (*CallbackFunc)(void*);
......