未验证 提交 7c96efed 编写于 作者: W Wilber 提交者: GitHub

[Inference] Add tuned trt_dynamic_shape mode. (#34806)

上级 f5e430c5
......@@ -212,6 +212,12 @@ struct Argument {
bool);
DECL_ARGUMENT_FIELD(tensorrt_use_calib_mode, TensorRtUseCalibMode, bool);
DECL_ARGUMENT_FIELD(tensorrt_use_oss, TensorRtUseOSS, bool);
DECL_ARGUMENT_FIELD(tensorrt_shape_range_info_path,
TensorRtShapeRangeInfoPath, std::string);
DECL_ARGUMENT_FIELD(tensorrt_tuned_dynamic_shape, TensorRtTunedDynamicShape,
bool);
DECL_ARGUMENT_FIELD(tensorrt_allow_build_at_runtime,
TensorRtAllowBuildAtRuntime, bool);
DECL_ARGUMENT_FIELD(use_dlnne, UseDlnne, bool);
DECL_ARGUMENT_FIELD(dlnne_min_subgraph_size, DlnneMinSubgraphSize, int);
......
......@@ -146,6 +146,14 @@ void IRPassManager::CreatePasses(Argument *argument,
pass->Set("gpu_device_id", new int(argument->gpu_device_id()));
pass->Set("use_static_engine", new bool(use_static_engine));
pass->Set("model_from_memory", new bool(argument->model_from_memory()));
// tuned trt dynamic_shape
pass->Set("trt_shape_range_info_path",
new std::string(argument->tensorrt_shape_range_info_path()));
pass->Set("trt_tuned_dynamic_shape",
new bool(argument->tensorrt_tuned_dynamic_shape()));
pass->Set("trt_allow_build_at_runtime",
new bool(argument->tensorrt_allow_build_at_runtime()));
pass->Set("max_input_shape", new std::map<std::string, std::vector<int>>(
argument->max_input_shape()));
pass->Set("min_input_shape", new std::map<std::string, std::vector<int>>(
......@@ -153,17 +161,17 @@ void IRPassManager::CreatePasses(Argument *argument,
pass->Set("optim_input_shape",
new std::map<std::string, std::vector<int>>(
argument->optim_input_shape()));
bool with_dynamic_shape = argument->max_input_shape().size() > 0 &&
argument->min_input_shape().size() > 0 &&
argument->optim_input_shape().size() > 0;
bool with_dynamic_shape = (argument->max_input_shape().size() > 0 &&
argument->min_input_shape().size() > 0 &&
argument->optim_input_shape().size() > 0) ||
argument->tensorrt_tuned_dynamic_shape();
pass->Set("with_dynamic_shape", new bool(with_dynamic_shape));
pass->Set("trt_disabled_ops", new std::vector<std::string>(
argument->tensorrt_disabled_ops()));
pass->Set("trt_use_dla", new bool(argument->tensorrt_use_dla()));
pass->Set("trt_dla_core", new int(argument->tensorrt_dla_core()));
// Setting the disable_trt_plugin_fp16 to true means that TRT plugin will
// not
// run fp16.
// not run fp16.
pass->Set("disable_trt_plugin_fp16",
new bool(argument->disable_trt_plugin_fp16()));
} else if (pass_name == "dlnne_subgraph_pass") {
......
cc_library(subgraph_util SRCS subgraph_util.cc DEPS subgraph_detector)
if (WITH_GPU AND TENSORRT_FOUND)
cc_library(tensorrt_subgraph_pass SRCS tensorrt_subgraph_pass.cc DEPS subgraph_util tensorrt_op_teller)
cc_library(tensorrt_subgraph_pass SRCS tensorrt_subgraph_pass.cc DEPS subgraph_util tensorrt_op_teller infer_io_utils)
set(analysis_deps ${analysis_deps}
subgraph_util tensorrt_subgraph_pass
......
......@@ -22,6 +22,7 @@
#include "paddle/fluid/inference/tensorrt/engine.h"
#include "paddle/fluid/inference/tensorrt/helper.h"
#include "paddle/fluid/inference/tensorrt/op_teller.h"
#include "paddle/fluid/inference/utils/io_utils.h"
namespace paddle {
namespace inference {
......@@ -197,6 +198,17 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
auto opt_input_shape =
Get<std::map<std::string, std::vector<int>>>("optim_input_shape");
auto allow_build_at_runtime = Get<bool>("trt_allow_build_at_runtime");
auto shape_range_info_path = Get<std::string>("trt_shape_range_info_path");
auto trt_tuned_dynamic_shape = Get<bool>("trt_tuned_dynamic_shape");
int max_batch_size = Get<int>("max_batch_size");
if (trt_tuned_dynamic_shape) {
VLOG(1) << "trt dynamic_shape deserialize from " << shape_range_info_path;
inference::DeserializeShapeRangeInfo(shape_range_info_path,
&min_input_shape, &max_input_shape,
&opt_input_shape);
}
// The following procedure is used to rename all the intermediate
// variables and the output variables of the subgraph.
// Why we do this?
......@@ -242,12 +254,14 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
op_desc->SetBlockAttr("sub_block", new_block);
op_desc->SetAttr("subgraph", block_desc.Proto()->SerializeAsString());
op_desc->SetAttr("max_batch_size", Get<int>("max_batch_size"));
op_desc->SetAttr("max_batch_size", max_batch_size);
op_desc->SetAttr("workspace_size", Get<int>("workspace_size"));
op_desc->SetAttr("gpu_id", Get<int>("gpu_device_id"));
op_desc->SetAttr("output_name_mapping", output_mapping);
op_desc->SetAttr("origin_output_dims", renamed_output_dims);
op_desc->SetAttr("parameters", params);
op_desc->SetAttr("allow_build_at_runtime", allow_build_at_runtime);
op_desc->SetAttr("shape_range_info_path", shape_range_info_path);
// we record all inputs' shapes in attr to check if they are consistent
// with the real inputs' shapes retrieved from scope when trt runs.
......@@ -259,6 +273,11 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
}
auto use_static_engine = Get<bool>("use_static_engine");
op_desc->SetAttr("use_static_engine", use_static_engine);
if (use_static_engine)
op_desc->SetAttr("model_opt_cache_dir",
Get<std::string>("model_opt_cache_dir"));
// TODO(NHZlX)
// There are models with the same structure but the different parameters,
// when running in the 'use_serialize' mode, there is a bug.
......@@ -266,12 +285,12 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
// So we use seperate engine keys in serialization and calibration.
auto engine_key = GenerateEngineKey(
input_names_with_id, output_names_with_id, std::to_string(0),
std::to_string(Get<int>("max_batch_size")),
std::to_string(max_batch_size),
std::to_string(static_cast<int>(precision_mode)), false);
auto calibration_engine_key = GenerateEngineKey(
input_names_with_id, output_names_with_id, std::to_string(0),
std::to_string(Get<int>("max_batch_size")),
std::to_string(static_cast<int>(precision_mode)), true);
auto calibration_engine_key =
GenerateEngineKey(input_names_with_id, output_names_with_id,
std::to_string(0), std::to_string(max_batch_size),
std::to_string(static_cast<int>(precision_mode)), true);
auto predictor_id = Get<int>("predictor_id");
// Get "" when there is no cached calibration table data.
......@@ -345,11 +364,10 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
bool disable_trt_plugin_fp16 = Get<bool>("disable_trt_plugin_fp16");
tensorrt::TensorRTEngine *trt_engine =
inference::Singleton<inference::tensorrt::TRTEngineManager>::Global()
.Create(engine_key + std::to_string(predictor_id),
Get<int>("max_batch_size"), Get<int>("workspace_size"),
precision_mode, calibrator.get(), Get<int>("gpu_device_id"),
min_input_shape, max_input_shape, opt_input_shape,
disable_trt_plugin_fp16);
.Create(engine_key + std::to_string(predictor_id), max_batch_size,
Get<int>("workspace_size"), precision_mode, calibrator.get(),
Get<int>("gpu_device_id"), min_input_shape, max_input_shape,
opt_input_shape, disable_trt_plugin_fp16);
trt_engine->SetUseOSS(Get<bool>("use_oss"));
trt_engine->SetUseDLA(Get<bool>("trt_use_dla"));
trt_engine->SetDLACore(Get<int>("trt_dla_core"));
......
......@@ -55,10 +55,17 @@ void IrParamsSyncAmongDevicesPass::RunImpl(Argument *argument) {
// We get all the vars from local_scope instead of the ProgramDesc.
// Because there exists the case that new parameter variables are not added to
// the program in the analysis pass.
bool reserve_cpu_weights = false;
if (argument->tensorrt_allow_build_at_runtime_valid() &&
argument->tensorrt_allow_build_at_runtime()) {
reserve_cpu_weights = true;
}
for (auto &var_name : all_vars) {
if (std::count(repetitive_params.begin(), repetitive_params.end(),
var_name)) {
scope->EraseVars({var_name});
if (!reserve_cpu_weights) {
scope->EraseVars({var_name});
}
continue;
}
auto *var = scope->FindLocalVar(var_name);
......
......@@ -49,7 +49,7 @@ if(WITH_GPU AND TENSORRT_FOUND)
endif()
cc_library(analysis_predictor SRCS analysis_predictor.cc ${mkldnn_quantizer_src} DEPS ${inference_deps}
zero_copy_tensor ir_pass_manager op_compatible_info)
zero_copy_tensor ir_pass_manager op_compatible_info infer_io_utils)
cc_test(test_paddle_inference_api SRCS api_tester.cc DEPS paddle_inference_api)
......
......@@ -158,6 +158,10 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) {
CP_MEMBER(trt_use_static_engine_);
CP_MEMBER(trt_use_calib_mode_);
CP_MEMBER(trt_use_oss_);
CP_MEMBER(trt_tuned_dynamic_shape_);
CP_MEMBER(trt_allow_build_at_runtime_);
CP_MEMBER(collect_shape_range_info_);
CP_MEMBER(shape_range_info_path_);
// Dlnne related
CP_MEMBER(use_dlnne_);
CP_MEMBER(dlnne_min_subgraph_size_);
......@@ -653,8 +657,8 @@ float AnalysisConfig::fraction_of_gpu_memory_for_pool() const {
#endif
}
void AnalysisConfig::EnableMemoryOptim() {
enable_memory_optim_ = true;
void AnalysisConfig::EnableMemoryOptim(bool x) {
enable_memory_optim_ = x;
Update();
}
......@@ -783,6 +787,9 @@ std::string AnalysisConfig::Summary() {
// dynamic_shape
os.InsertRow({"tensorrt_enable_dynamic_shape",
min_input_shape_.empty() ? "false" : "true"});
os.InsertRow({"tensorrt_tuned_dynamic_shape", trt_tuned_dynamic_shape_
? shape_range_info_path_
: "false"});
os.InsertRow({"tensorrt_use_oss", trt_use_oss_ ? "true" : "false"});
os.InsertRow({"tensorrt_use_dla", trt_use_dla_ ? "true" : "false"});
......@@ -812,8 +819,46 @@ std::string AnalysisConfig::Summary() {
os.InsertRow({"memory_optim", enable_memory_optim_ ? "true" : "false"});
os.InsertRow({"enable_profile", with_profile_ ? "true" : "false"});
os.InsertRow({"enable_log", with_glog_info_ ? "true" : "false"});
os.InsertRow({"collect_shape_range_info",
collect_shape_range_info_ ? shape_range_info_path_ : "false"});
return os.PrintTable();
}
void AnalysisConfig::CollectShapeRangeInfo(
const std::string &shape_range_info_path) {
LOG(INFO) << "In CollectShapeInfo mode, we will disable optimizations and "
"collect the shape information of "
<< "all intermediate tensors in the compute graph and calculate "
"the min_shape, max_shape and opt_shape.";
collect_shape_range_info_ = true;
PADDLE_ENFORCE_EQ(shape_range_info_path.empty(), false,
platform::errors::InvalidArgument(
"The shape_range_info_path should not be empty, please "
"re-check the argument."));
shape_range_info_path_ = shape_range_info_path;
}
const std::string &AnalysisConfig::shape_range_info_path() {
return shape_range_info_path_;
}
bool AnalysisConfig::shape_range_info_collected() {
return collect_shape_range_info_;
}
void AnalysisConfig::EnableTunedTensorRtDynamicShape(
const std::string &shape_range_info_path, bool allow_build_at_runtime) {
shape_range_info_path_ = shape_range_info_path;
trt_allow_build_at_runtime_ = allow_build_at_runtime;
trt_tuned_dynamic_shape_ = true;
}
bool AnalysisConfig::tuned_tensorrt_dynamic_shape() {
return trt_tuned_dynamic_shape_;
}
bool AnalysisConfig::trt_allow_build_at_runtime() {
return trt_allow_build_at_runtime_;
}
} // namespace paddle
......@@ -13,7 +13,9 @@
// limitations under the License.
#include "paddle/fluid/inference/api/analysis_predictor.h"
#include <glog/logging.h>
#include <algorithm>
#include <fstream>
#include <memory>
......@@ -21,6 +23,7 @@
#include <string>
#include <utility>
#include <vector>
#include "paddle/fluid/extension/include/ext_op_meta_info.h"
#include "paddle/fluid/framework/feed_fetch_method.h"
#include "paddle/fluid/framework/feed_fetch_type.h"
......@@ -34,6 +37,7 @@
#include "paddle/fluid/inference/analysis/passes/memory_optimize_pass.h"
#include "paddle/fluid/inference/api/helper.h"
#include "paddle/fluid/inference/api/paddle_inference_pass.h"
#include "paddle/fluid/inference/utils/io_utils.h"
#include "paddle/fluid/inference/utils/singleton.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/platform/cpu_helper.h"
......@@ -570,6 +574,11 @@ void AnalysisPredictor::PrepareArgument() {
argument_.SetMaxInputShape(config_.max_input_shape_);
argument_.SetOptimInputShape(config_.optim_input_shape_);
argument_.SetCloseTrtPluginFp16(config_.disable_trt_plugin_fp16_);
argument_.SetTensorRtShapeRangeInfoPath(config_.shape_range_info_path());
argument_.SetTensorRtTunedDynamicShape(
config_.tuned_tensorrt_dynamic_shape());
argument_.SetTensorRtAllowBuildAtRuntime(
config_.trt_allow_build_at_runtime());
}
if (config_.dlnne_enabled()) {
......@@ -915,6 +924,11 @@ bool AnalysisPredictor::ZeroCopyRun() {
#endif
executor_->Run();
if (config_.shape_range_info_collected()) {
CollectShapeRangeInfo();
}
// Fix TensorArray reuse not cleaned bug.
tensor_array_batch_cleaner_.CollectTensorArrays(sub_scope_);
tensor_array_batch_cleaner_.ResetTensorArray();
......@@ -934,6 +948,78 @@ bool AnalysisPredictor::ZeroCopyRun() {
return true;
}
void AnalysisPredictor::CollectShapeRangeInfo() {
// if use gpu, sync first.
if (config_.use_gpu()) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
paddle::platform::DeviceContextPool &pool =
paddle::platform::DeviceContextPool::Instance();
auto gpu_place = BOOST_GET_CONST(paddle::platform::CUDAPlace, place_);
auto *dev_ctx = static_cast<const paddle::platform::CUDADeviceContext *>(
pool.Get(gpu_place));
#ifdef PADDLE_WITH_HIP
hipStreamSynchronize(dev_ctx->stream());
#else
cudaStreamSynchronize(dev_ctx->stream());
#endif
#endif
}
std::vector<std::string> var_names = sub_scope_->LocalVarNames();
for (const auto &name : var_names) {
auto *var = sub_scope_->GetVar(name);
if (!var->IsType<framework::LoDTensor>()) {
continue;
}
framework::DDim dim = var->Get<framework::LoDTensor>().dims();
std::vector<int32_t> shape(dim.size());
for (size_t i = 0; i < shape.size(); ++i) shape[i] = dim[i];
shape_info_[name].emplace_back(shape);
}
}
void AnalysisPredictor::StatisticShapeRangeInfo() {
std::map<std::string, std::vector<int32_t>> min_shapes;
std::map<std::string, std::vector<int32_t>> max_shapes;
std::map<std::string, std::vector<int32_t>> opt_shapes;
for (auto it : shape_info_) {
auto name = it.first;
auto shapes = it.second;
std::vector<int32_t> min_shape(shapes[0].begin(), shapes[0].end());
std::vector<int32_t> max_shape(shapes[0].begin(), shapes[0].end());
std::vector<int32_t> opt_shape(shapes[0].begin(), shapes[0].end());
auto ShapeMaxFreq = [](const std::map<int32_t, int32_t> &m) -> int32_t {
std::vector<std::pair<int32_t, int32_t>> counter;
for (auto &it : m) counter.push_back(it);
std::sort(
counter.begin(), counter.end(),
[](std::pair<int32_t, int32_t> &a, std::pair<int32_t, int32_t> &b) {
return a.second > b.second;
});
return counter[0].first;
};
for (size_t d = 0; d < shapes[0].size(); ++d) {
std::map<int32_t, int32_t> counter;
for (size_t i = 0; i < shapes.size(); ++i) {
counter[shapes[i][d]] += 1;
if (shapes[i][d] < min_shape[d]) min_shape[d] = shapes[i][d];
if (shapes[i][d] > max_shape[d]) max_shape[d] = shapes[i][d];
}
opt_shape[d] = ShapeMaxFreq(counter);
}
min_shapes[name] = min_shape;
max_shapes[name] = max_shape;
opt_shapes[name] = opt_shape;
}
inference::SerializeShapeRangeInfo(config_.shape_range_info_path(),
min_shapes, max_shapes, opt_shapes);
}
bool AnalysisPredictor::LoadProgramDesc() {
// Initialize the inference program
std::string filename;
......@@ -1140,6 +1226,10 @@ AnalysisPredictor::~AnalysisPredictor() {
}
#endif
if (config_.shape_range_info_collected()) {
StatisticShapeRangeInfo();
}
memory::Release(place_);
}
......
......@@ -91,6 +91,10 @@ class AnalysisPredictor : public PaddlePredictor {
/// \param[in] AnalysisConfig config
///
explicit AnalysisPredictor(const AnalysisConfig &config) : config_(config) {
if (config_.shape_range_info_collected()) {
config_.SwitchIrOptim(false);
config_.EnableMemoryOptim(false);
}
predictor_id_ = inference::GetUniqueId();
}
///
......@@ -377,6 +381,10 @@ class AnalysisPredictor : public PaddlePredictor {
FRIEND_TEST(AnalysisPredictor, with_gpu);
#endif
private:
void StatisticShapeRangeInfo();
void CollectShapeRangeInfo();
private:
AnalysisConfig config_;
Argument argument_;
......@@ -419,6 +427,8 @@ class AnalysisPredictor : public PaddlePredictor {
private:
// Some status here that help to determine the status inside the predictor.
bool status_is_cloned_{false};
std::map<std::string, std::vector<std::vector<int32_t>>> shape_info_;
};
} // namespace paddle
......@@ -19,8 +19,10 @@
#include "paddle/fluid/framework/ir/pass.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/inference/api/helper.h"
#include "paddle/fluid/inference/api/paddle_api.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/inference/tests/api/tester_helper.h"
#include "paddle/fluid/inference/utils/io_utils.h"
#include "paddle/fluid/platform/cpu_info.h"
DEFINE_string(dirname, "", "dirname to tests.");
......@@ -32,6 +34,8 @@ TEST(AnalysisPredictor, analysis_off) {
config.SetModel(FLAGS_dirname);
config.SwitchIrOptim(false);
LOG(INFO) << config.Summary();
LOG(INFO) << "Shape Info collected: " << config.shape_range_info_collected()
<< ", path: " << config.shape_range_info_path();
auto _predictor = CreatePaddlePredictor<AnalysisConfig>(config);
auto* predictor = static_cast<AnalysisPredictor*>(_predictor.get());
......@@ -86,10 +90,6 @@ TEST(AnalysisPredictor, analysis_on) {
std::vector<PaddleTensor> outputs;
ASSERT_TRUE(predictor->Run(inputs, &outputs));
for (auto& output : outputs) {
LOG(INFO) << inference::DescribeTensor(output);
}
// compare with NativePredictor
auto naive_predictor =
CreatePaddlePredictor<NativeConfig>(config.ToNativeConfig());
......@@ -139,6 +139,54 @@ TEST(AnalysisPredictor, ZeroCopy) {
predictor->TryShrinkMemory();
}
TEST(AnalysisPredictor, CollectShapeRangeInfo) {
AnalysisConfig config;
config.SetModel(FLAGS_dirname);
config.SwitchUseFeedFetchOps(false);
config.EnableUseGpu(100, 0);
config.CollectShapeRangeInfo(FLAGS_dirname + "/shape_range.pbtxt");
LOG(INFO) << config.Summary();
AnalysisConfig config2(config);
auto predictor = CreatePaddlePredictor<AnalysisConfig>(config2);
auto w0 = predictor->GetInputTensor("firstw");
auto w1 = predictor->GetInputTensor("secondw");
auto w2 = predictor->GetInputTensor("thirdw");
auto w3 = predictor->GetInputTensor("forthw");
w0->Reshape({4, 1});
w1->Reshape({4, 1});
w2->Reshape({4, 1});
w3->Reshape({4, 1});
auto* w0_data = w0->mutable_data<int64_t>(PaddlePlace::kCPU);
auto* w1_data = w1->mutable_data<int64_t>(PaddlePlace::kCPU);
auto* w2_data = w2->mutable_data<int64_t>(PaddlePlace::kCPU);
auto* w3_data = w3->mutable_data<int64_t>(PaddlePlace::kCPU);
for (int i = 0; i < 4; i++) {
w0_data[i] = i;
w1_data[i] = i;
w2_data[i] = i;
w3_data[i] = i;
}
predictor->ZeroCopyRun();
auto out = predictor->GetOutputTensor("fc_1.tmp_2");
PaddlePlace place;
int size = 0;
out->data<float>(&place, &size);
LOG(INFO) << "output size: " << size / sizeof(float);
// TODO(wilber): check for windows
// std::map<std::string, std::vector<int32_t>> min_shape;
// std::map<std::string, std::vector<int32_t>> max_shape;
// std::map<std::string, std::vector<int32_t>> opt_shape;
// inference::DeserializeShapeRangeInfo(FLAGS_dirname + "/shape_range.pbtxt",
// &min_shape, &max_shape, &opt_shape);
// ASSERT_EQ(min_shape.size(), 14u);
}
TEST(AnalysisPredictor, Clone) {
AnalysisConfig config;
config.SetModel(FLAGS_dirname);
......
......@@ -380,6 +380,50 @@ struct PD_INFER_DECL AnalysisConfig {
bool tensorrt_dynamic_shape_enabled() const {
return !min_input_shape_.empty();
}
///
/// \brief Enable tuned tensorrt dynamic shape.
///
/// \param shape_range_info_path the path to shape_info file got in
/// CollectShapeInfo
/// mode.
/// \param allow_build_at_runtime allow build trt engine at runtime.
///
void EnableTunedTensorRtDynamicShape(const std::string& shape_range_info_path,
bool allow_build_at_runtime = true);
///
/// \brief A boolean state telling whether to use tuned tensorrt dynamic
/// shape.
///
bool tuned_tensorrt_dynamic_shape();
///
/// \brief A boolean state telling whether to allow building trt engine at
/// runtime.
///
bool trt_allow_build_at_runtime();
///
/// \brief Collect shape info of all tensors in compute graph.
///
/// \param shape_range_info_path the path to save shape info.
///
void CollectShapeRangeInfo(const std::string& shape_range_info_path);
///
/// \brief the shape info path in CollectShapeInfo mode.
///
/// \return the shape info path.
///
const std::string& shape_range_info_path();
///
/// \brief A boolean state telling whether to collect shape info.
///
/// \return bool Whether to collect shape info.
///
bool shape_range_info_collected();
///
/// \brief Prevent ops running in Paddle-TRT
/// NOTE: just experimental, not an official stable API, easy to be broken.
......@@ -573,7 +617,9 @@ struct PD_INFER_DECL AnalysisConfig {
/// \brief Turn on memory optimize
/// NOTE still in development.
///
void EnableMemoryOptim();
/// \param x Whether to enable memory optimize.
///
void EnableMemoryOptim(bool x = true);
///
/// \brief A boolean state telling whether the memory optimization is
/// activated.
......@@ -693,6 +739,15 @@ struct PD_INFER_DECL AnalysisConfig {
std::map<std::string, std::vector<int>> optim_input_shape_{};
std::vector<std::string> trt_disabled_ops_{};
bool disable_trt_plugin_fp16_{false};
bool trt_allow_build_at_runtime_{false};
// tune to get dynamic_shape info.
bool trt_tuned_dynamic_shape_{false};
// In CollectShapeInfo mode, we will collect the shape information of
// all intermediate tensors in the compute graph and calculate the
// min_shape, max_shape and opt_shape and save in shape_range_info_path_;
bool collect_shape_range_info_{false};
std::string shape_range_info_path_;
// dlnne related.
bool use_dlnne_{false};
......
......@@ -196,6 +196,10 @@ void TensorRTEngine::FreezeNetwork() {
#if IS_TRT_VERSION_GE(6000)
LOG(INFO) << "Run Paddle-TRT Dynamic Shape mode.";
for (auto &input : min_input_shape_) {
VLOG(4) << "TRT dynamic_shape set " << input.first
<< " min: " << Vec2Str(input.second)
<< ", max: " << Vec2Str(max_input_shape_[input.first])
<< ", opt: " << Vec2Str(optim_input_shape_[input.first]);
optim_profile_->setDimensions(
input.first.c_str(), nvinfer1::OptProfileSelector::kMIN,
Vec2TRT_Dims(input.second, input.first, true));
......
......@@ -32,6 +32,7 @@ limitations under the License. */
#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"
#include "paddle/fluid/inference/tensorrt/trt_int8_calibrator.h"
#include "paddle/fluid/inference/utils/singleton.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/utils/any.h"
namespace paddle {
......@@ -226,6 +227,7 @@ class TensorRTEngine {
const std::string& name);
// Set the itensor_map_[name] as the network's output, and set its name.
void DeclareOutput(const std::string& name);
void ClearTensorMap() { itensor_map_.clear(); }
void SetITensor(const std::string& name, nvinfer1::ITensor* tensor);
// Get an ITensor called name.
......@@ -244,6 +246,16 @@ class TensorRTEngine {
}
return infer_context_[tid].get();
}
void ResetContext() {
std::unique_lock<std::mutex> lock(mutex_);
const std::thread::id tid = std::this_thread::get_id();
PADDLE_ENFORCE_NOT_NULL(
infer_engine_,
platform::errors::InvalidArgument(
"You should build engine first and then set the context."));
infer_context_[tid].reset(nullptr);
infer_context_.erase(tid);
}
nvinfer1::IHostMemory* Serialize() {
PADDLE_ENFORCE_NOT_NULL(
......@@ -364,6 +376,55 @@ class TensorRTEngine {
ShapeMapType min_input_shape() { return min_input_shape_; }
ShapeMapType max_input_shape() { return max_input_shape_; }
ShapeMapType optim_input_shape() { return optim_input_shape_; }
bool AdjustDynamicShapeRange(const ShapeMapType& runtime_input_shape,
std::vector<std::string>* changed) {
bool ret = false;
changed->clear();
for (const auto& it : runtime_input_shape) {
auto name = it.first;
auto input_shape = it.second;
PADDLE_ENFORCE_EQ(
min_input_shape_.count(name), true,
platform::errors::InvalidArgument(
"TRT dynamic_shape min_input_shape %s not found.", name));
PADDLE_ENFORCE_EQ(min_input_shape_[name].size(), input_shape.size(),
platform::errors::InvalidArgument(
"TRT dynamic_shape min_input_shape %s size not "
"equal, the min_input_shape[%s].size()=%d"
", but the runtime_input_shape[%s].size()=%d.",
name, name, min_input_shape_[name].size(), name,
input_shape.size()));
auto bak_min_shape = min_input_shape_[name];
auto bak_max_shape = max_input_shape_[name];
bool min_change = false;
bool max_change = false;
for (size_t d = 0; d < input_shape.size(); ++d) {
if (input_shape[d] < min_input_shape_[name][d]) {
ret = true;
min_change = true;
min_input_shape_[name][d] = input_shape[d];
}
if (input_shape[d] > max_input_shape_[name][d]) {
ret = true;
max_change = true;
max_input_shape_[name][d] = input_shape[d];
}
}
if (min_change)
LOG(INFO) << "refactor shape range: " << name << ", min_shape from "
<< Vec2Str(bak_min_shape) << " to "
<< Vec2Str(min_input_shape_[name]);
if (max_change)
LOG(INFO) << "refactor shape range: " << name << ", max_shape from "
<< Vec2Str(bak_max_shape) << " to "
<< Vec2Str(max_input_shape_[name]);
if (min_change || max_change) changed->push_back(name);
}
return ret;
}
bool use_oss() { return use_oss_; }
bool with_ernie() { return with_ernie_; }
bool disable_trt_plugin_fp16() { return disable_trt_plugin_fp16_; }
......
......@@ -154,6 +154,16 @@ inline void PrintITensorShape(nvinfer1::ITensor* X) {
std::cout << "]\n";
}
template <typename T>
inline std::string Vec2Str(const std::vector<T>& vec) {
std::ostringstream os;
os << "(";
for (size_t i = 0; i < vec.size() - 1; ++i) {
os << vec[i] << ",";
}
os << vec[vec.size() - 1] << ")";
return os.str();
}
} // namespace tensorrt
} // namespace inference
} // namespace paddle
......@@ -143,6 +143,70 @@ void TestDynamic2() {
}
}
void TestTunedDynamic() {
std::string model_dir =
FLAGS_infer_model + "/complex_model_dynamic/complex_model_dynamic2";
AnalysisConfig config_tuned;
const std::string shape_range = "shape_range.pbtxt";
config_tuned.EnableUseGpu(100, 0);
config_tuned.SetModel(model_dir + "/model", model_dir + "/params");
config_tuned.SwitchUseFeedFetchOps(false);
config_tuned.CollectShapeRangeInfo(shape_range);
int batch_size = 1;
auto predictor_tuned = CreatePaddlePredictor(config_tuned);
auto check_func = [batch_size](PaddlePredictor *predictor) {
int channels = 3;
int height = 5;
int width = 5;
int input_num = channels * height * width * 1;
float *input = new float[input_num];
memset(input, 0, input_num * sizeof(float));
auto input_names = predictor->GetInputNames();
auto input_t = predictor->GetInputTensor(input_names[0]);
input_t->Reshape({batch_size, channels, height, width});
input_t->copy_from_cpu(input);
auto input_t1 = predictor->GetInputTensor(input_names[1]);
input_t1->Reshape({batch_size, 2, 1, 1});
std::vector<float> first;
for (int i = 0; i < batch_size * 2; i++) first.push_back(1.0);
input_t1->copy_from_cpu(first.data());
auto input_t2 = predictor->GetInputTensor(input_names[2]);
input_t2->Reshape({batch_size, 2, 1, 1});
input_t2->copy_from_cpu(first.data());
ASSERT_TRUE(predictor->ZeroCopyRun());
std::vector<float> out_data;
auto output_names = predictor->GetOutputNames();
auto output_t = predictor->GetOutputTensor(output_names[0]);
std::vector<int> output_shape = output_t->shape();
int out_num = std::accumulate(output_shape.begin(), output_shape.end(), 1,
std::multiplies<int>());
out_data.resize(out_num);
output_t->copy_to_cpu(out_data.data());
};
check_func(predictor_tuned.get());
// check tuned_dynamic_shape
AnalysisConfig config;
config.EnableUseGpu(100, 0);
std::string cache_dir = "tuned_cache";
config.SetOptimCacheDir(cache_dir);
delete_cache_files(cache_dir);
config.SetModel(model_dir + "/model", model_dir + "/params");
config.SwitchUseFeedFetchOps(false);
config.EnableTunedTensorRtDynamicShape(shape_range, true);
config.EnableTensorRtEngine(1 << 30, batch_size, 0,
AnalysisConfig::Precision::kFloat32, true, false);
auto test_predictor = CreatePaddlePredictor(config);
check_func(test_predictor.get());
}
TEST(AnalysisPredictor, trt_dynamic) { TestDynamic(true); }
TEST(AnalysisPredictor, trt_static) { TestDynamic(false); }
TEST(AnalysisPredictor, trt_memory_serialize) {
......@@ -153,5 +217,7 @@ TEST(AnalysisPredictor, trt_memory_serialize) {
}
TEST(AnalysisPredictor, trt_dynamic2) { TestDynamic2(); }
TEST(AnalysisPredictor, trt_tuned_dynamic) { TestTunedDynamic(); }
} // namespace inference
} // namespace paddle
......@@ -47,6 +47,24 @@ TEST(AnalysisPredictor, use_gpu) {
}
}
TEST(AnalysisPredictor, collect_shape_range) {
std::string model_dir = FLAGS_infer_model + "/" + "mobilenet";
AnalysisConfig config;
config.EnableUseGpu(100, 0);
config.SetModel(model_dir);
config.CollectShapeRangeInfo("shape_range.pbtxt");
std::vector<std::vector<PaddleTensor>> inputs_all;
auto predictor = CreatePaddlePredictor(config);
SetFakeImageInput(&inputs_all, model_dir, false, "__model__", "");
std::vector<PaddleTensor> outputs;
for (auto &input : inputs_all) {
ASSERT_TRUE(predictor->Run(input, &outputs));
predictor->ClearIntermediateTensor();
}
}
} // namespace inference
} // namespace paddle
......
cc_library(benchmark SRCS benchmark.cc DEPS enforce)
cc_test(test_benchmark SRCS benchmark_tester.cc DEPS benchmark)
cc_library(infer_io_utils SRCS io_utils.cc DEPS paddle_inference_api lod_tensor)
cc_library(infer_io_utils SRCS io_utils.cc DEPS paddle_inference_api lod_tensor shape_range_info_proto)
cc_test(infer_io_utils_tester SRCS io_utils_tester.cc DEPS infer_io_utils)
cc_library(table_printer SRCS table_printer.cc)
cc_test(test_table_printer SRCS table_printer_tester.cc DEPS table_printer)
proto_library(shape_range_info_proto SRCS shape_range_info.proto)
......@@ -13,7 +13,15 @@
// limitations under the License.
#include "paddle/fluid/inference/utils/io_utils.h"
#include <fcntl.h>
#include <utility>
#include "google/protobuf/io/zero_copy_stream_impl.h"
#include "google/protobuf/text_format.h"
#include "paddle/fluid/inference/analysis/helper.h"
#include "paddle/fluid/inference/utils/shape_range_info.pb.h"
namespace paddle {
namespace inference {
......@@ -157,5 +165,102 @@ void DeserializePDTensorsToFile(const std::string &path,
fin.close();
}
void SerializeShapeRangeInfo(
const std::string &path,
const paddle::inference::proto::ShapeRangeInfos &info) {
int out_fd = open(path.c_str(), O_WRONLY | O_CREAT | O_TRUNC, 0644);
google::protobuf::io::FileOutputStream *os =
new google::protobuf::io::FileOutputStream(out_fd);
google::protobuf::TextFormat::Print(info, os);
delete os;
close(out_fd);
}
void SerializeShapeRangeInfo(
const std::string &path,
const std::map<std::string, std::vector<int32_t>> &min_shape,
const std::map<std::string, std::vector<int32_t>> &max_shape,
const std::map<std::string, std::vector<int32_t>> &opt_shape) {
paddle::inference::proto::ShapeRangeInfos shape_range_infos;
for (auto it : min_shape) {
auto *s = shape_range_infos.add_shape_range_info();
s->set_name(it.first);
for (size_t i = 0; i < it.second.size(); ++i) {
s->add_min_shape(it.second[i]);
s->add_max_shape(max_shape.at(it.first)[i]);
s->add_opt_shape(opt_shape.at(it.first)[i]);
}
}
inference::SerializeShapeRangeInfo(path, shape_range_infos);
}
void DeserializeShapeRangeInfo(
const std::string &path, paddle::inference::proto::ShapeRangeInfos *info) {
int fd = open(path.c_str(), O_RDONLY);
google::protobuf::io::FileInputStream *is =
new google::protobuf::io::FileInputStream(fd);
google::protobuf::TextFormat::Parse(is, info);
delete is;
close(fd);
}
void DeserializeShapeRangeInfo(
const std::string &path,
std::map<std::string, std::vector<int32_t>> *min_shape,
std::map<std::string, std::vector<int32_t>> *max_shape,
std::map<std::string, std::vector<int32_t>> *opt_shape) {
paddle::inference::proto::ShapeRangeInfos shape_range_infos;
DeserializeShapeRangeInfo(path, &shape_range_infos);
for (int i = 0; i < shape_range_infos.shape_range_info_size(); ++i) {
auto info = shape_range_infos.shape_range_info(i);
auto name = info.name();
if (min_shape->count(name) || max_shape->count(name) ||
opt_shape->count(name)) {
continue;
} else {
std::vector<int32_t> tmp(info.min_shape_size());
for (size_t k = 0; k < tmp.size(); ++k) tmp[k] = info.min_shape(k);
min_shape->insert(std::make_pair(name, tmp));
tmp.resize(info.max_shape_size());
for (size_t k = 0; k < tmp.size(); ++k) tmp[k] = info.max_shape(k);
max_shape->insert(std::make_pair(name, tmp));
tmp.resize(info.opt_shape_size());
for (size_t k = 0; k < tmp.size(); ++k) tmp[k] = info.opt_shape(k);
opt_shape->insert(std::make_pair(name, tmp));
}
}
}
void UpdateShapeRangeInfo(
const std::string &path,
const std::map<std::string, std::vector<int32_t>> &min_shape,
const std::map<std::string, std::vector<int32_t>> &max_shape,
const std::map<std::string, std::vector<int32_t>> &opt_shape,
const std::vector<std::string> &names) {
paddle::inference::proto::ShapeRangeInfos shape_range_infos;
DeserializeShapeRangeInfo(path, &shape_range_infos);
for (int i = 0; i < shape_range_infos.shape_range_info_size(); ++i) {
auto *info = shape_range_infos.mutable_shape_range_info(i);
for (const auto &name : names) {
if (info->name() == name) {
info->clear_min_shape();
info->clear_max_shape();
info->clear_opt_shape();
for (size_t j = 0; j < min_shape.at(name).size(); ++j)
info->add_min_shape(min_shape.at(name)[j]);
for (size_t j = 0; j < max_shape.at(name).size(); ++j)
info->add_max_shape(max_shape.at(name)[j]);
for (size_t j = 0; j < opt_shape.at(name).size(); ++j)
info->add_opt_shape(opt_shape.at(name)[j]);
break;
}
}
}
inference::SerializeShapeRangeInfo(path, shape_range_infos);
}
} // namespace inference
} // namespace paddle
......@@ -19,6 +19,7 @@
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/inference/api/paddle_api.h"
#include "paddle/fluid/inference/utils/shape_range_info.pb.h"
namespace paddle {
struct PaddleTensor;
......@@ -41,5 +42,28 @@ void SerializePDTensorsToFile(const std::string& path,
const std::vector<PaddleTensor>& tensors);
void DeserializePDTensorsToFile(const std::string& path,
std::vector<PaddleTensor>* tensors);
void SerializeShapeRangeInfo(
const std::string& path,
const paddle::inference::proto::ShapeRangeInfos& info);
void SerializeShapeRangeInfo(
const std::string& path,
const std::map<std::string, std::vector<int32_t>>& min_shape,
const std::map<std::string, std::vector<int32_t>>& max_shape,
const std::map<std::string, std::vector<int32_t>>& opt_shape);
void DeserializeShapeRangeInfo(const std::string& path,
paddle::inference::proto::ShapeRangeInfos* info);
void DeserializeShapeRangeInfo(
const std::string& path,
std::map<std::string, std::vector<int32_t>>* min_shape,
std::map<std::string, std::vector<int32_t>>* max_shape,
std::map<std::string, std::vector<int32_t>>* opt_shape);
void UpdateShapeRangeInfo(
const std::string& path,
const std::map<std::string, std::vector<int32_t>>& min_shape,
const std::map<std::string, std::vector<int32_t>>& max_shape,
const std::map<std::string, std::vector<int32_t>>& opt_shape,
const std::vector<std::string>& names);
} // namespace inference
} // namespace paddle
......@@ -15,6 +15,7 @@
#include "paddle/fluid/inference/utils/io_utils.h"
#include <glog/logging.h>
#include <gtest/gtest.h>
#include <utility>
#include "paddle/fluid/inference/api/helper.h"
namespace paddle {
......@@ -93,3 +94,28 @@ TEST(infer_io_utils, tensors) {
paddle::inference::pd_tensor_equal(tensors_in[i], tensors_out[i]));
}
}
TEST(shape_info_io, read_and_write) {
const std::string path = "test_shape_info_io";
std::map<std::string, std::vector<int32_t>> min_shape, max_shape, opt_shape;
min_shape.insert(
std::make_pair("test1", std::vector<int32_t>{1, 3, 112, 112}));
max_shape.insert(
std::make_pair("test1", std::vector<int32_t>{1, 3, 224, 224}));
opt_shape.insert(
std::make_pair("test1", std::vector<int32_t>{1, 3, 224, 224}));
paddle::inference::SerializeShapeRangeInfo(path, min_shape, max_shape,
opt_shape);
min_shape.clear();
max_shape.clear();
opt_shape.clear();
opt_shape.insert(
std::make_pair("test2", std::vector<int32_t>{1, 3, 224, 224}));
paddle::inference::DeserializeShapeRangeInfo(path, &min_shape, &max_shape,
&opt_shape);
min_shape.insert(std::make_pair("test1", std::vector<int32_t>{1, 3, 56, 56}));
std::vector<std::string> names{"test1"};
paddle::inference::UpdateShapeRangeInfo(path, min_shape, max_shape, opt_shape,
names);
}
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
syntax = "proto2";
package paddle.inference.proto;
// To support trt dynamic shape, record the runtime shape
// information of all tmp tensors in the Compution graph.
message ShapeRangeInfos {
message ShapeRangeInfo {
required string name = 1;
repeated int32 min_shape = 2;
repeated int32 max_shape = 3;
repeated int32 opt_shape = 4;
}
repeated ShapeRangeInfo shape_range_info = 1;
}
op_library(tensorrt_engine_op DEPS tensorrt_engine tensorrt_converter)
op_library(tensorrt_engine_op DEPS tensorrt_engine tensorrt_converter infer_io_utils analysis_helper)
file(APPEND ${pybind_file} "USE_NO_KERNEL_OP(tensorrt_engine);\n")
nv_test(test_tensorrt_engine_op SRCS tensorrt_engine_op_test.cc
DEPS tensorrt_engine_op
......
......@@ -14,6 +14,7 @@
#pragma once
#include "paddle/fluid/framework/scope.h"
#ifdef PADDLE_WITH_CUDA
#include <memory>
......@@ -30,6 +31,7 @@
#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
#include "paddle/fluid/inference/tensorrt/engine.h"
#include "paddle/fluid/inference/tensorrt/helper.h"
#include "paddle/fluid/inference/utils/io_utils.h"
namespace paddle {
namespace inference {
......@@ -77,18 +79,19 @@ static void RuntimeStaticShapeCheck(std::vector<int64_t> runtime_input_shape,
}
static void RuntimeDynamicShapeCheck(
const std::string &x, const std::vector<int64_t> &runtime_input_shape,
const std::vector<int> &min_input_shape,
const std::vector<int> &max_input_shape) {
PADDLE_ENFORCE_EQ(runtime_input_shape.size(), min_input_shape.size(),
platform::errors::InvalidArgument(
"TRT engine runtime input dims size(%d) inconsistent "
"with the dynamic shape size(%d)",
runtime_input_shape.size(), min_input_shape.size()));
const std::string &x, const std::vector<int32_t> &runtime_input_shape,
const std::vector<int32_t> &min_input_shape,
const std::vector<int32_t> &max_input_shape) {
// PADDLE_ENFORCE_EQ(
// runtime_input_shape.size(), min_input_shape.size(),
// platform::errors::InvalidArgument(
// "TRT engine runtime input %s dims size(%d) inconsistent "
// "with the dynamic shape size(%d)",
// x, runtime_input_shape.size(), min_input_shape.size()));
auto is_input_shape_valid = [&](
const std::vector<int64_t> &runtime_input_shape,
const std::vector<int> &min_input_shape,
const std::vector<int> &max_input_shape) -> bool {
const std::vector<int32_t> &runtime_input_shape,
const std::vector<int32_t> &min_input_shape,
const std::vector<int32_t> &max_input_shape) -> bool {
for (size_t i = 0; i < runtime_input_shape.size(); i++) {
if (runtime_input_shape[i] <= max_input_shape[i] &&
runtime_input_shape[i] >= min_input_shape[i]) {
......@@ -128,6 +131,7 @@ class TensorRTEngineOp : public framework::OperatorBase {
private:
std::vector<std::string> input_names_;
std::unordered_set<std::string> param_names_;
std::vector<std::string> runtime_input_names_;
mutable TensorRTEngine *trt_engine_{nullptr};
int max_batch_size_;
int workspace_size_;
......@@ -141,7 +145,14 @@ class TensorRTEngineOp : public framework::OperatorBase {
bool calibration_mode_;
int predictor_id_;
int device_id_;
bool allow_build_at_runtime_{false};
std::string shape_range_info_path_;
std::string model_opt_cache_dir_;
bool use_static_engine_;
AnalysisConfig::Precision precision_mode_;
std::map<std::string, std::vector<int>> min_input_shape_{};
std::map<std::string, std::vector<int>> max_input_shape_{};
std::map<std::string, std::vector<int>> opt_input_shape_{};
public:
TensorRTEngineOp(const std::string &type,
......@@ -160,11 +171,57 @@ class TensorRTEngineOp : public framework::OperatorBase {
engine_key_ = Attr<std::string>("engine_key");
calibration_engine_key_ = Attr<std::string>("calibration_engine_key");
predictor_id_ = Attr<int>("predictor_id");
shape_range_info_path_ = Attr<std::string>("shape_range_info_path");
allow_build_at_runtime_ = Attr<bool>("allow_build_at_runtime");
use_static_engine_ = Attr<bool>("use_static_engine");
if (use_static_engine_) {
model_opt_cache_dir_ = Attr<std::string>("model_opt_cache_dir");
}
if (HasAttr("dynamic_shape_names") && HasAttr("min_input_shape") &&
HasAttr("max_input_shape") && HasAttr("opt_input_shape")) {
std::vector<std::string> dynamic_shape_names;
std::vector<std::vector<int>> min_input_shapes;
std::vector<std::vector<int>> max_input_shapes;
std::vector<std::vector<int>> opt_input_shapes;
std::vector<int> dynamic_shape_lens;
dynamic_shape_names =
Attr<std::vector<std::string>>("dynamic_shape_names");
std::vector<int> min_shapes = Attr<std::vector<int>>("min_input_shape");
std::vector<int> max_shapes = Attr<std::vector<int>>("max_input_shape");
std::vector<int> opt_shapes = Attr<std::vector<int>>("opt_input_shape");
dynamic_shape_lens = Attr<std::vector<int>>("dynamic_shape_lens");
int idx = 0;
for (size_t i = 0; i < dynamic_shape_lens.size(); ++i) {
std::vector<int> tmp1, tmp2, tmp3;
for (int j = 0; j < dynamic_shape_lens[i]; ++j) {
tmp1.push_back(min_shapes[idx]);
tmp2.push_back(max_shapes[idx]);
tmp3.push_back(opt_shapes[idx++]);
}
min_input_shapes.emplace_back(tmp1);
max_input_shapes.emplace_back(tmp2);
opt_input_shapes.emplace_back(tmp3);
}
for (size_t i = 0; i < dynamic_shape_names.size(); ++i) {
min_input_shape_.insert(
std::make_pair(dynamic_shape_names[i], min_input_shapes[i]));
max_input_shape_.insert(
std::make_pair(dynamic_shape_names[i], max_input_shapes[i]));
opt_input_shape_.insert(
std::make_pair(dynamic_shape_names[i], opt_input_shapes[i]));
}
}
auto params = Attr<std::vector<std::string>>("parameters");
for (const auto &param : params) {
param_names_.insert(param);
}
for (auto &x : input_names_) {
if (param_names_.count(x)) continue;
runtime_input_names_.emplace_back(x);
}
// calibration_mode is ture represents we need to
// generate the calibration table data.
calibration_mode_ =
......@@ -210,6 +267,78 @@ class TensorRTEngineOp : public framework::OperatorBase {
return;
}
auto *trt_engine = GetEngine(scope, dev_place);
if (trt_engine->with_dynamic_shape()) {
// get runtime input shapes.
std::map<std::string, std::vector<int32_t>> runtime_input_shape;
for (auto name : runtime_input_names_) {
auto &t = inference::analysis::GetFromScope<framework::LoDTensor>(scope,
name);
VLOG(4) << "trt engine runtime input name(" << name << "), dims("
<< t.dims() << ")";
auto t_shape = framework::vectorize<int32_t>(t.dims());
runtime_input_shape.insert(std::make_pair(name, t_shape));
}
if (!allow_build_at_runtime_) {
std::map<std::string, std::vector<int>> min_input_shape =
trt_engine->min_input_shape();
std::map<std::string, std::vector<int>> max_input_shape =
trt_engine->max_input_shape();
for (auto &x : runtime_input_names_) {
PADDLE_ENFORCE_EQ(
min_input_shape.count(x), true,
platform::errors::InvalidArgument(
"Input %s not found in TRT engine min_input_shape.", x));
PADDLE_ENFORCE_EQ(
max_input_shape.count(x), true,
platform::errors::InvalidArgument(
"Input %s not found in TRT engine max_input_shape.", x));
RuntimeDynamicShapeCheck(x, runtime_input_shape[x],
min_input_shape[x], max_input_shape[x]);
}
} else {
// compare runtime_input_shape and trt_engine dynamic shapes.
std::vector<std::string> shape_changed_name;
bool is_adjusted = trt_engine->AdjustDynamicShapeRange(
runtime_input_shape, &shape_changed_name);
if (is_adjusted) {
LOG(INFO) << "Adjust dynamic shape range, rebuild trt engine!";
trt_engine->ResetContext();
trt_engine->ClearTensorMap();
auto *anc = scope.parent();
while (anc && anc->parent()) {
anc = anc->parent();
}
if (anc == nullptr) {
anc = &scope;
}
PrepareTRTEngine(*anc, trt_engine);
// update shape_range_info_pbtxt
if (!shape_range_info_path_.empty()) {
inference::UpdateShapeRangeInfo(
shape_range_info_path_, trt_engine->min_input_shape(),
trt_engine->max_input_shape(), trt_engine->optim_input_shape(),
shape_changed_name);
}
if (use_static_engine_) {
nvinfer1::IHostMemory *serialized_engine_data =
trt_engine->Serialize();
std::string trt_engine_serialized_data =
std::string((const char *)serialized_engine_data->data(),
serialized_engine_data->size());
inference::analysis::SaveTrtEngineSerializedDataToFile(
inference::analysis::GetTrtEngineSerializedPath(
model_opt_cache_dir_, engine_key_),
trt_engine_serialized_data);
LOG(INFO) << "Save TRT Optimized Info to "
<< inference::analysis::GetTrtEngineSerializedPath(
model_opt_cache_dir_, engine_key_);
}
}
}
}
RunTrt(scope, dev_place, trt_engine);
}
......@@ -273,7 +402,7 @@ class TensorRTEngineOp : public framework::OperatorBase {
reinterpret_cast<const platform::CUDADeviceContext &>(dev_ctx).stream();
PADDLE_ENFORCE_EQ(
input_names_.empty(), false,
runtime_input_names_.empty(), false,
platform::errors::PreconditionNotMet(
"TensorRT engine needs at least one input, but no input is found. "
"Please check if you set the input correctly."));
......@@ -283,16 +412,12 @@ class TensorRTEngineOp : public framework::OperatorBase {
int num_inputs = 0;
for (const auto &x : Inputs("Xs")) {
if (param_names_.count(x)) continue;
num_inputs += 1;
}
num_inputs += runtime_input_names_.size();
const int num_bindings = num_inputs + Outputs("Ys").size();
std::vector<void *> buffers(num_bindings);
// Bind input tensor to TRT.
for (const auto &x : Inputs("Xs")) {
if (param_names_.count(x)) continue;
for (const auto &x : runtime_input_names_) {
// convert input and copy to TRT engine's buffer
auto &t =
inference::analysis::GetFromScope<framework::LoDTensor>(scope, x);
......@@ -341,22 +466,6 @@ class TensorRTEngineOp : public framework::OperatorBase {
}
} else {
#if IS_TRT_VERSION_GE(6000)
std::map<std::string, std::vector<int>> min_input_shape =
engine->min_input_shape();
std::map<std::string, std::vector<int>> max_input_shape =
engine->max_input_shape();
PADDLE_ENFORCE_EQ(
min_input_shape.count(x), true,
platform::errors::InvalidArgument(
"Input %s not found in TRT engine min_input_shape.", x));
PADDLE_ENFORCE_EQ(
max_input_shape.count(x), true,
platform::errors::InvalidArgument(
"Input %s not found in TRT engine max_input_shape.", x));
auto x_min_input_shape = min_input_shape[x];
auto x_max_input_shape = max_input_shape[x];
RuntimeDynamicShapeCheck(x, t_shape, x_min_input_shape,
x_max_input_shape);
auto *trt_context = engine->context();
trt_context->setBindingDimensions(
bind_index, inference::tensorrt::Vec2TRT_Dims(t_shape, x, true));
......@@ -460,7 +569,8 @@ class TensorRTEngineOp : public framework::OperatorBase {
inference::Singleton<inference::tensorrt::TRTEngineManager>::Global()
.Create(engine_key_ + std::to_string(predictor_id_),
max_batch_size_, workspace_size_, precision_mode_,
calibrator_.get(), device_id_);
calibrator_.get(), device_id_, min_input_shape_,
max_input_shape_, opt_input_shape_);
PrepareTRTEngine(scope, trt_engine_);
}
return trt_engine_;
......
......@@ -56,7 +56,7 @@ void AddTensorToBlockDesc(framework::proto::BlockDesc* block,
using inference::analysis::SetAttr;
TEST(TensorRTEngineOp, manual) {
void DynamicShapeTest(bool allow_build_at_runtime) {
framework::ProgramDesc program;
auto* block_ = program.Proto()->add_blocks();
block_->set_idx(0);
......@@ -116,6 +116,15 @@ TEST(TensorRTEngineOp, manual) {
engine_op_desc.SetAttr("engine_serialized_data", std::string(""));
int device_id = 0;
engine_op_desc.SetAttr("gpu_id", device_id);
engine_op_desc.SetAttr("shape_range_info_path", std::string(""));
engine_op_desc.SetAttr("model_opt_cache_dir", std::string(""));
engine_op_desc.SetAttr("allow_build_at_runtime", allow_build_at_runtime);
engine_op_desc.SetAttr("use_static_engine", true);
engine_op_desc.SetAttr("dynamic_shape_names", std::vector<std::string>{"x"});
engine_op_desc.SetAttr("dynamic_shape_lens", std::vector<int>{4});
engine_op_desc.SetAttr("min_input_shape", std::vector<int>{1, 4, 1, 1});
engine_op_desc.SetAttr("max_input_shape", std::vector<int>{2, 4, 1, 1});
engine_op_desc.SetAttr("opt_input_shape", std::vector<int>{2, 4, 1, 1});
LOG(INFO) << "create engine op";
auto engine_op = framework::OpRegistry::CreateOp(engine_op_desc);
......@@ -125,7 +134,10 @@ TEST(TensorRTEngineOp, manual) {
platform::CUDAPlace place;
platform::CUDADeviceContext ctx(place);
// Prepare variables.
CreateCUDATensor(&scope, "x", std::vector<int64_t>({2, 4}));
if (allow_build_at_runtime)
CreateCUDATensor(&scope, "x", std::vector<int64_t>({3, 4, 1, 1}));
else
CreateCUDATensor(&scope, "x", std::vector<int64_t>({2, 4, 1, 1}));
CreateCUDATensor(&scope, "y", std::vector<int64_t>({4, 6}));
CreateCUDATensor(&scope, "z", std::vector<int64_t>({2, 6}));
......@@ -137,6 +149,11 @@ TEST(TensorRTEngineOp, manual) {
engine_op->Run(scope, place);
}
TEST(TensorRTEngineOp, manual) {
DynamicShapeTest(false);
DynamicShapeTest(true);
}
void Execute(int batch_size, int input_dim, int output_dim, int nlayers = 1) {
framework::ProgramDesc program;
framework::Scope scope;
......@@ -220,6 +237,10 @@ void Execute(int batch_size, int input_dim, int output_dim, int nlayers = 1) {
engine_op_desc.SetAttr("engine_serialized_data", std::string(""));
int device_id = 0;
engine_op_desc.SetAttr("gpu_id", device_id);
engine_op_desc.SetAttr("shape_range_info_path", std::string(""));
engine_op_desc.SetAttr("model_opt_cache_dir", std::string(""));
engine_op_desc.SetAttr("allow_build_at_runtime", false);
engine_op_desc.SetAttr("use_static_engine", false);
auto engine_op = framework::OpRegistry::CreateOp(engine_op_desc);
......@@ -228,7 +249,7 @@ void Execute(int batch_size, int input_dim, int output_dim, int nlayers = 1) {
}
// Test with a larger FC layer.
TEST(TensorRTEngineOp, fc) { Execute(40, 28, 28); }
// TEST(TensorRTEngineOp, fc) { Execute(40, 28, 28); }
} // namespace operators
} // namespace paddle
......
......@@ -538,7 +538,8 @@ void BindAnalysisConfig(py::module *m) {
.def("switch_ir_optim", &AnalysisConfig::SwitchIrOptim,
py::arg("x") = true)
.def("ir_optim", &AnalysisConfig::ir_optim)
.def("enable_memory_optim", &AnalysisConfig::EnableMemoryOptim)
.def("enable_memory_optim", &AnalysisConfig::EnableMemoryOptim,
py::arg("x") = true)
.def("enable_profile", &AnalysisConfig::EnableProfile)
.def("disable_glog_info", &AnalysisConfig::DisableGlogInfo)
.def("glog_info_disabled", &AnalysisConfig::glog_info_disabled)
......@@ -569,6 +570,16 @@ void BindAnalysisConfig(py::module *m) {
&AnalysisConfig::tensorrt_dynamic_shape_enabled)
.def("enable_tensorrt_oss", &AnalysisConfig::EnableTensorRtOSS)
.def("tensorrt_oss_enabled", &AnalysisConfig::tensorrt_oss_enabled)
.def("collect_shape_range_info", &AnalysisConfig::CollectShapeRangeInfo)
.def("shape_range_info_path", &AnalysisConfig::shape_range_info_path)
.def("shape_range_info_collected",
&AnalysisConfig::shape_range_info_collected)
.def("enable_tuned_tensorrt_dynamic_shape",
&AnalysisConfig::EnableTunedTensorRtDynamicShape)
.def("tuned_tensorrt_dynamic_shape",
&AnalysisConfig::tuned_tensorrt_dynamic_shape)
.def("trt_allow_build_at_runtime",
&AnalysisConfig::trt_allow_build_at_runtime)
.def("exp_disable_tensorrt_ops", &AnalysisConfig::Exp_DisableTensorRtOPs)
.def("enable_tensorrt_dla", &AnalysisConfig::EnableTensorRtDLA,
py::arg("dla_core") = 0)
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import numpy as np
import paddle
paddle.enable_static()
import paddle.fluid as fluid
from paddle.inference import Config, Predictor, create_predictor
class TRTTunedDynamicShapeTest(unittest.TestCase):
def get_model(self):
place = fluid.CUDAPlace(0)
exe = fluid.Executor(place)
main_program = fluid.Program()
startup_program = fluid.Program()
with fluid.program_guard(main_program, startup_program):
data = fluid.data(
name="data", shape=[-1, 6, 64, 64], dtype="float32")
conv_out = fluid.layers.conv2d(
input=data,
num_filters=3,
filter_size=3,
groups=1,
padding=0,
bias_attr=False,
act=None)
exe.run(startup_program)
serialized_program = paddle.static.serialize_program(
data, conv_out, program=main_program)
serialized_params = paddle.static.serialize_persistables(
data, conv_out, executor=exe, program=main_program)
return serialized_program, serialized_params
def get_config(self, model, params, tuned=False):
config = Config()
config.set_model_buffer(model, len(model), params, len(params))
config.enable_use_gpu(100, 0)
config.set_optim_cache_dir('tuned_test')
if tuned:
config.collect_shape_range_info('shape_range.pbtxt')
else:
config.enable_tensorrt_engine(
workspace_size=1024,
max_batch_size=1,
min_subgraph_size=0,
precision_mode=paddle.inference.PrecisionType.Float32,
use_static=True,
use_calib_mode=False)
config.enable_tuned_tensorrt_dynamic_shape('shape_range.pbtxt',
True)
return config
def predictor_run(self, config, in_data):
predictor = create_predictor(config)
in_names = predictor.get_input_names()
in_handle = predictor.get_input_handle(in_names[0])
in_handle.copy_from_cpu(in_data)
predictor.run()
def test_tuned_dynamic_shape_run(self):
program, params = self.get_model()
config = self.get_config(program, params, tuned=True)
self.predictor_run(config, np.ones((1, 6, 64, 64)).astype(np.float32))
config2 = self.get_config(program, params, tuned=False)
self.predictor_run(config2, np.ones((1, 6, 32, 32)).astype(np.float32))
if __name__ == '__main__':
unittest.main()
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册