Unverified commit ae576f3c authored by Zhaolong Xing, committed by GitHub

fix: when using the load-model-from-memory mode, RAM usage is high (#17788)

test=develop
Parent 5efe8c72
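
For context, a minimal sketch of the load-model-from-memory mode this commit targets. It assumes the public AnalysisConfig::SetModelBuffer / CreatePaddlePredictor API; the header path, the model file names, and the ReadWholeFile helper are placeholders, not part of this change:

#include <fstream>
#include <memory>
#include <sstream>
#include <string>

#include "paddle_inference_api.h"  // paddle::AnalysisConfig, paddle::CreatePaddlePredictor

// Hypothetical helper: read a whole file into a std::string buffer.
static std::string ReadWholeFile(const std::string &path) {
  std::ifstream fin(path, std::ios::binary);
  std::stringstream ss;
  ss << fin.rdbuf();
  return ss.str();
}

std::unique_ptr<paddle::PaddlePredictor> BuildPredictorFromMemory() {
  std::string prog_buf = ReadWholeFile("model/__model__");    // placeholder paths
  std::string params_buf = ReadWholeFile("model/__params__");

  paddle::AnalysisConfig config;
  // Load the model from in-memory buffers instead of files.
  config.SetModelBuffer(prog_buf.data(), prog_buf.size(),
                        params_buf.data(), params_buf.size());
  // Creating the predictor marks the config invalid (SetInValid below), and the
  // program/params buffers copied into the config and argument are released by
  // the PartiallyRelease() calls this commit adds.
  return paddle::CreatePaddlePredictor(config);
}
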
......@@ -63,6 +63,16 @@ struct Argument {
using anakin_max_shape_t = std::map<std::string, std::vector<int>>;
bool Has(const std::string& key) const { return valid_fields_.count(key); }
void PartiallyRelease() {
if (Has("model_program_path")) {
if (Has("model_from_memory") && model_from_memory()) {
model_program_path().clear();
model_program_path().shrink_to_fit();
model_params_path().clear();
model_params_path().shrink_to_fit();
}
}
}
#define DECL_ARGUMENT_FIELD(field__, Field, type__) \
public: \
......
......@@ -87,6 +87,7 @@ void IRPassManager::CreatePasses(Argument *argument,
bool enable_int8 = argument->tensorrt_precision_mode() ==
AnalysisConfig::Precision::kInt8;
pass->Set("predictor_id", new int(argument->predictor_id()));
bool use_calib_mode = argument->tensorrt_use_calib_mode();
pass->Set("enable_int8", new bool(enable_int8));
pass->Set("use_calib_mode", new bool(use_calib_mode));
......
......@@ -199,8 +199,12 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
SetAttr(op_desc->Proto(), "parameters", params);
auto use_static_engine = Get<bool>("use_static_engine");
// TODO(NHZlX)
// There are models with the same structure but different parameters;
// when running in 'use_serialize' mode, there is a bug.
auto engine_key = GenerateEngineKey(input_names_with_id, output_names_with_id,
std::to_string(0));
auto predictor_id = Get<int>("predictor_id");
// Get "" when there is no cached calibration table data.
bool load_from_memory = Get<bool>("model_from_memory");
......@@ -214,6 +218,7 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
SetAttr(op_desc->Proto(), "enable_int8", enable_int8);
SetAttr(op_desc->Proto(), "use_calib_mode", use_calib_mode);
SetAttr(op_desc->Proto(), "engine_key", engine_key);
SetAttr(op_desc->Proto(), "predictor_id", predictor_id);
std::string trt_engine_serialized_data = "";
SetAttr(op_desc->Proto(), "engine_serialized_data",
trt_engine_serialized_data);
......@@ -233,15 +238,20 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
std::copy(params.begin(), params.end(),
std::back_inserter(*repetitive_params));
bool need_serialize = (use_static_engine && !load_from_memory);
tensorrt::TensorRTEngine *trt_engine =
inference::Singleton<inference::tensorrt::TRTEngineManager>::Global()
.Create(engine_key + std::to_string(predictor_id),
Get<int>("max_batch_size"), Get<int>("workspace_size"),
enable_int8, calibrator.get(), Get<int>("gpu_device_id"));
bool need_serialize = (use_static_engine && !load_from_memory);
if (need_serialize) {
trt_engine_serialized_data = GetTrtEngineSerializedData(
Get<std::string>("model_opt_cache_dir"), engine_key);
// We can load the engine info that was serialized to disk earlier.
if (!trt_engine_serialized_data.empty()) {
SetAttr(op_desc->Proto(), "engine_serialized_data",
trt_engine_serialized_data);
trt_engine->Deserialize(trt_engine_serialized_data);
LOG(INFO) << "Load TRT Optimized Info from "
<< GetTrtEngineSerializedPath(
Get<std::string>("model_opt_cache_dir"), engine_key);
......@@ -254,10 +264,7 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
// 2. already load serialized trt engine info.
LOG(INFO) << "Prepare TRT engine (Optimize model structure, Select OP "
"kernel etc). This process may cost a lot of time.";
std::unique_ptr<tensorrt::TensorRTEngine> trt_engine(
new tensorrt::TensorRTEngine(
Get<int>("max_batch_size"), Get<int>("workspace_size"), enable_int8,
calibrator.get(), Get<int>("gpu_device_id")));
auto *scope = param_scope();
framework::BlockDesc block_desc_temp(nullptr, block_desc.Proto());
std::unordered_set<std::string> param_set(params.begin(), params.end());
......@@ -265,20 +272,18 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
.ConvertBlockToTRTEngine(
&block_desc_temp, *scope,
std::vector<std::string>(input_names.begin(), input_names.end()),
param_set, output_mapping, trt_engine.get());
param_set, output_mapping, trt_engine);
if (need_serialize) {
nvinfer1::IHostMemory *serialized_engine_data = trt_engine->Serialize();
trt_engine_serialized_data =
std::string((const char *)serialized_engine_data->data(),
serialized_engine_data->size());
if (need_serialize) {
SaveTrtEngineSerializedDataToFile(
GetTrtEngineSerializedPath(Get<std::string>("model_opt_cache_dir"),
engine_key),
trt_engine_serialized_data);
}
SetAttr(op_desc->Proto(), "engine_serialized_data",
trt_engine_serialized_data);
}
} // namespace analysis
......
......@@ -69,7 +69,7 @@ void IrParamsSyncAmongDevicesPass::RunImpl(Argument *argument) {
// Copy the parameter data to a tmp tensor.
TensorCopySync(*t, cpu_place, &temp_tensor);
// Reallocate the space on GPU
t->mutable_data<float>(place);
t->clear();
// Copy parameter data to newly allocated GPU space.
TensorCopySync(temp_tensor, place, t);
......
......@@ -87,10 +87,12 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) {
// Model related.
CP_MEMBER(model_dir_);
CP_MEMBER(prog_file_);
CP_MEMBER(params_file_);
CP_MEMBER(model_from_memory_); // the memory model reuses prog_file_ and
// params_file_ fields.
prog_file_ = std::move(other.prog_file_);
params_file_ = std::move(other.params_file_);
// Gpu related.
CP_MEMBER(use_gpu_);
CP_MEMBER(device_id_);
......@@ -439,4 +441,12 @@ void AnalysisConfig::EnableAnakinEngine(
anakin_auto_config_layout_ = auto_config_layout;
Update();
}
void AnalysisConfig::PartiallyRelease() {
prog_file_.clear();
prog_file_.shrink_to_fit();
params_file_.clear();
params_file_.shrink_to_fit();
}
} // namespace paddle
......@@ -444,6 +444,10 @@ void AnalysisPredictor::OptimizeInferenceProgram() {
ARGUMENT_CHECK_FIELD((&argument_), ir_analyzed_program);
inference_program_.reset(
new framework::ProgramDesc(argument_.ir_analyzed_program()));
// The config and the argument take up a lot of storage;
// once the predictor setup is complete, we release that storage.
argument_.PartiallyRelease();
config_.PartiallyRelease();
LOG(INFO) << "== optimize end ==";
}
......@@ -451,6 +455,8 @@ template <>
std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<
AnalysisConfig, PaddleEngineKind::kAnalysis>(const AnalysisConfig &config) {
VLOG(3) << "create AnalysisConfig";
PADDLE_ENFORCE(config.is_valid(),
"Note: Each config can only be used for one predictor.");
if (config.use_gpu()) {
// 1. GPU memory
PADDLE_ENFORCE_GE(config.memory_pool_init_size_mb(), 0.f);
......@@ -480,6 +486,8 @@ std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<
}
std::unique_ptr<PaddlePredictor> predictor(new AnalysisPredictor(config));
// Each config can only be used for one predictor.
config.SetInValid();
auto predictor_p = dynamic_cast<AnalysisPredictor *>(predictor.get());
if (!predictor_p->Init(nullptr)) {
......
......@@ -232,6 +232,8 @@ struct AnalysisConfig {
bool force_update_static_cache = false);
/** Tell whether the memory optimization is activated. */
bool enable_memory_optim() const;
void SetInValid() const { is_valid_ = false; }
bool is_valid() const { return is_valid_; }
friend class ::paddle::AnalysisPredictor;
......@@ -239,6 +241,7 @@ struct AnalysisConfig {
* Get a pass builder for customize the passes in IR analysis phase.
*/
PassStrategy* pass_builder() const;
void PartiallyRelease();
protected:
// Update the config.
......@@ -249,8 +252,8 @@ struct AnalysisConfig {
protected:
// Model pathes.
std::string model_dir_;
std::string prog_file_;
std::string params_file_;
mutable std::string prog_file_;
mutable std::string params_file_;
// GPU related.
bool use_gpu_{false};
......@@ -312,6 +315,11 @@ struct AnalysisConfig {
bool use_mkldnn_quantizer_{false};
std::shared_ptr<MkldnnQuantizerConfig> mkldnn_quantizer_config_;
// Once the config has been used by a predictor, it becomes invalid.
mutable bool is_valid_{true};
// Each config can only be used with one predictor.
// The variables held by the config can take up a lot of memory in some cases,
// so we release that memory once the predictor is set up.
};
} // namespace paddle
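
A usage sketch of the single-use contract introduced by the SetInValid()/is_valid() changes above and the PADDLE_ENFORCE added in CreatePaddlePredictor; "model_dir" is a placeholder path and the includes from the earlier sketch are assumed:

paddle::AnalysisConfig cfg;
cfg.SetModel("model_dir");                        // placeholder model directory
auto p1 = paddle::CreatePaddlePredictor(cfg);     // cfg is marked invalid here
// auto p2 = paddle::CreatePaddlePredictor(cfg);  // would now fail the is_valid() check

paddle::AnalysisConfig cfg2;                      // each predictor needs a fresh config
cfg2.SetModel("model_dir");
auto p2 = paddle::CreatePaddlePredictor(cfg2);
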
......@@ -109,6 +109,7 @@ GpuPassStrategy::GpuPassStrategy() : PassStrategy({}) {
"conv_affine_channel_fuse_pass", //
"conv_eltwiseadd_affine_channel_fuse_pass", //
"conv_bn_fuse_pass", //
"conv_eltwiseadd_bn_fuse_pass", //
#if CUDNN_VERSION >= 7100 // To run conv_fusion, the version of cudnn must be
// guaranteed at least v7
"conv_elementwise_add_act_fuse_pass", //
......
......@@ -170,6 +170,7 @@ class OpConverter {
engine->DeclareOutput(output);
}
engine->FreezeNetwork();
engine->ClearWeights();
}
void RreplenishLayerAndOutput(
......
......@@ -149,6 +149,12 @@ class TensorRTEngine {
std::unordered_map<std::string /*name*/, std::unique_ptr<framework::Tensor>>
weight_map;
void ClearWeights() {
for (auto& weight_pair : weight_map) {
weight_pair.second.reset(nullptr);
}
}
private:
// Each ICudaEngine object is bound to a specific GPU when it is instantiated,
// ensure that the thread is associated with the correct device by calling
......@@ -213,6 +219,39 @@ class TensorRTEngine {
#define TRT_ENGINE_ADD_LAYER(engine__, layer__, ARGS...) \
engine__->network()->add##layer__(ARGS);
class TRTEngineManager {
public:
bool Empty() const { return engines_.size() == 0; }
bool Has(const std::string& name) const {
if (engines_.count(name) == 0) return false;
return engines_.at(name).get() != nullptr;
}
TensorRTEngine* Get(const std::string& name) const {
return engines_.at(name).get();
}
TensorRTEngine* Create(std::string name, int max_batch, int max_workspace,
bool enable_int8 = false,
TRTInt8Calibrator* calibrator = nullptr,
int device_id = 0,
nvinfer1::ILogger& logger = NaiveLogger::Global()) {
auto* p = new TensorRTEngine(max_batch, max_workspace, enable_int8,
calibrator, device_id, logger);
engines_[name].reset(p);
return p;
}
void DeleteAll() {
for (auto& item : engines_) {
item.second.reset(nullptr);
}
}
private:
std::unordered_map<std::string, std::unique_ptr<TensorRTEngine>> engines_;
};
} // namespace tensorrt
} // namespace inference
} // namespace paddle
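
For illustration, a small sketch of how the subgraph pass and TensorRTEngineOp now share a single engine through TRTEngineManager, keyed by engine_key + predictor_id as in the hunks above. The class and singleton names come from this diff; the include paths and the two wrapper functions are assumptions:

#include <string>

#include "paddle/fluid/inference/tensorrt/engine.h"   // TensorRTEngine, TRTEngineManager
#include "paddle/fluid/inference/utils/singleton.h"   // inference::Singleton

namespace pit = paddle::inference::tensorrt;
using paddle::inference::Singleton;

// Analysis time: the subgraph pass creates the engine once under a unique key.
pit::TensorRTEngine *CreateSharedEngine(const std::string &engine_key,
                                        int predictor_id) {
  return Singleton<pit::TRTEngineManager>::Global().Create(
      engine_key + std::to_string(predictor_id),
      /*max_batch=*/1, /*max_workspace=*/1 << 20);
}

// Run time: the op looks the same engine up instead of deserializing its own
// copy, which is what removes the duplicated engine memory.
pit::TensorRTEngine *GetSharedEngine(const std::string &engine_key,
                                     int predictor_id) {
  auto &manager = Singleton<pit::TRTEngineManager>::Global();
  const std::string key = engine_key + std::to_string(predictor_id);
  return manager.Has(key) ? manager.Get(key) : nullptr;
}
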
......@@ -31,7 +31,7 @@ struct SimpleOpTypeSetTeller : public Teller {
std::unordered_set<std::string> teller_set{
{"mul", "conv2d", "pool2d", "relu", "softmax", "sigmoid",
"depthwise_conv2d", "batch_norm", "concat", "tanh", "pad",
"elementwise_add", "elementwise_mul", "dropout", "split", "prelu",
"elementwise_add", "elementwise_mul", "dropout", "prelu",
"conv2d_transpose", "leaky_relu", "fc"}};
};
......
......@@ -177,11 +177,15 @@ TEST(Analyzer_Pyramid_DNN, compare_zero_copy) {
AnalysisConfig cfg;
SetConfig(&cfg);
AnalysisConfig cfg1;
SetConfig(&cfg1);
std::vector<std::vector<PaddleTensor>> input_slots_all;
SetInput(&input_slots_all);
std::vector<std::string> outputs_name;
outputs_name.emplace_back("cos_sim_2.tmp_0");
CompareAnalysisAndZeroCopy(reinterpret_cast<PaddlePredictor::Config *>(&cfg),
reinterpret_cast<PaddlePredictor::Config *>(&cfg1),
input_slots_all, outputs_name);
}
......
......@@ -293,11 +293,15 @@ TEST(Analyzer_rnn1, compare_zero_copy) {
AnalysisConfig cfg;
SetConfig(&cfg);
AnalysisConfig cfg1;
SetConfig(&cfg1);
std::vector<std::vector<PaddleTensor>> input_slots_all;
SetInput(&input_slots_all);
std::vector<std::string> outputs_name;
outputs_name.emplace_back("final_output.tmp_1");
CompareAnalysisAndZeroCopy(reinterpret_cast<PaddlePredictor::Config *>(&cfg),
reinterpret_cast<PaddlePredictor::Config *>(&cfg1),
input_slots_all, outputs_name);
}
......
......@@ -39,10 +39,17 @@ TEST(Analyzer, save_model) {
mkdir(optimModelPath.c_str(), 0777);
SaveOptimModel(&cfg, optimModelPath);
cfg.pass_builder()->ClearPasses();
int origin_num_ops = GetNumOps(cfg);
cfg.SetModel(optimModelPath + "/model", optimModelPath + "/params");
int fused_num_ops = GetNumOps(cfg);
// Each config can only be applied to one predictor.
AnalysisConfig cfg2;
SetConfig(&cfg2);
cfg2.pass_builder()->ClearPasses();
cfg2.SetModel(optimModelPath + "/model", optimModelPath + "/params");
int origin_num_ops = GetNumOps(cfg2);
AnalysisConfig cfg3;
SetConfig(&cfg3);
cfg3.SetModel(optimModelPath + "/model", optimModelPath + "/params");
int fused_num_ops = GetNumOps(cfg3);
CHECK_LE(fused_num_ops, origin_num_ops);
}
......
......@@ -215,11 +215,15 @@ TEST(Analyzer_seq_pool1, compare_zero_copy) {
AnalysisConfig cfg;
SetConfig(&cfg);
AnalysisConfig cfg1;
SetConfig(&cfg1);
std::vector<std::vector<PaddleTensor>> input_slots_all;
SetInput(&input_slots_all);
std::vector<std::string> outputs_name;
outputs_name.emplace_back(out_var_name);
CompareAnalysisAndZeroCopy(reinterpret_cast<PaddlePredictor::Config *>(&cfg),
reinterpret_cast<PaddlePredictor::Config *>(&cfg1),
input_slots_all, outputs_name);
}
......
......@@ -534,7 +534,7 @@ void CompareNativeAndAnalysis(
}
void CompareAnalysisAndZeroCopy(
PaddlePredictor::Config *config,
PaddlePredictor::Config *config, PaddlePredictor::Config *config1,
const std::vector<std::vector<PaddleTensor>> &inputs,
const std::vector<std::string> &outputs_name) {
int batch_size = FLAGS_batch_size;
......@@ -544,8 +544,8 @@ void CompareAnalysisAndZeroCopy(
predictor->Run(inputs[0], &analysis_outputs, batch_size);
// analysis + zero_copy
std::vector<ZeroCopyTensor> zerocopy_outputs;
reinterpret_cast<AnalysisConfig *>(config)->SwitchUseFeedFetchOps(false);
predictor = CreateTestPredictor(config, true);
reinterpret_cast<AnalysisConfig *>(config1)->SwitchUseFeedFetchOps(false);
predictor = CreateTestPredictor(config1, true);
ConvertPaddleTensorToZeroCopyTensor(predictor.get(), inputs[0]);
predictor->ZeroCopyRun();
for (size_t i = 0; i < outputs_name.size(); i++) {
......
......@@ -43,7 +43,7 @@ class TensorRTEngineOp : public framework::OperatorBase {
private:
std::vector<std::string> input_names_;
std::unordered_set<std::string> param_names_;
mutable std::unique_ptr<TensorRTEngine> trt_engine_;
mutable TensorRTEngine *trt_engine_{nullptr};
int max_batch_size_;
int workspace_size_;
std::unique_ptr<TRTInt8Calibrator> calibrator_;
......@@ -51,8 +51,8 @@ class TensorRTEngineOp : public framework::OperatorBase {
bool use_calib_mode_;
std::string calibration_data_;
std::string engine_key_;
std::string engine_serialized_data_;
bool calibration_mode_;
int predictor_id_;
int device_id_;
public:
......@@ -69,7 +69,7 @@ class TensorRTEngineOp : public framework::OperatorBase {
use_calib_mode_ = Attr<bool>("use_calib_mode");
calibration_data_ = Attr<std::string>("calibration_data");
engine_key_ = Attr<std::string>("engine_key");
engine_serialized_data_ = Attr<std::string>("engine_serialized_data");
predictor_id_ = Attr<int>("predictor_id");
auto params = Attr<std::vector<std::string>>("parameters");
for (const auto &param : params) {
......@@ -84,16 +84,14 @@ class TensorRTEngineOp : public framework::OperatorBase {
if (enable_int8_ && calibration_data_.size()) {
calibrator_.reset(new TRTInt8Calibrator(calibration_data_));
}
if (!calibration_mode_ && !engine_serialized_data_.empty()) {
trt_engine_.reset(new inference::tensorrt::TensorRTEngine(
max_batch_size_, workspace_size_, enable_int8_, calibrator_.get(),
device_id_));
PADDLE_ENFORCE(engine_serialized_data_.size(),
"TRT serialized data should not be empty here,"
"there must be error when generate serialized data in TRT "
"subgraph detect pass.");
trt_engine_->Deserialize(engine_serialized_data_);
bool has_engine =
inference::Singleton<inference::tensorrt::TRTEngineManager>::Global()
.Has(engine_key_ + std::to_string(predictor_id_));
if (!calibration_mode_ && has_engine) {
trt_engine_ =
inference::Singleton<inference::tensorrt::TRTEngineManager>::Global()
.Get(engine_key_ + std::to_string(predictor_id_));
}
}
......@@ -239,12 +237,14 @@ class TensorRTEngineOp : public framework::OperatorBase {
TensorRTEngine *GetEngine(const framework::Scope &scope,
const platform::Place &dev_place) const {
if (!trt_engine_) {
trt_engine_.reset(new inference::tensorrt::TensorRTEngine(
max_batch_size_, workspace_size_, enable_int8_, calibrator_.get(),
device_id_));
PrepareTRTEngine(scope, trt_engine_.get());
trt_engine_ =
inference::Singleton<inference::tensorrt::TRTEngineManager>::Global()
.Create(engine_key_ + std::to_string(predictor_id_),
max_batch_size_, workspace_size_, enable_int8_,
calibrator_.get(), device_id_);
PrepareTRTEngine(scope, trt_engine_);
}
return trt_engine_.get();
return trt_engine_;
}
void PrepareTRTEngine(const framework::Scope &scope,
......
......@@ -102,6 +102,7 @@ TEST(TensorRTEngineOp, manual) {
engine_op_desc.SetAttr("workspace_size", static_cast<int>(1 << 20));
engine_op_desc.SetAttr("parameters", std::vector<std::string>({}));
engine_op_desc.SetAttr("engine_key", std::string("a_engine"));
engine_op_desc.SetAttr("predictor_id", 1);
engine_op_desc.SetAttr("calibration_data", std::string(""));
engine_op_desc.SetAttr("enable_int8", static_cast<bool>(false));
engine_op_desc.SetAttr("use_calib_mode", static_cast<bool>(false));
......@@ -201,6 +202,7 @@ void Execute(int batch_size, int input_dim, int output_dim, int nlayers = 1) {
engine_op_desc.SetAttr("parameters",
std::vector<std::string>({"y0", "y1", "y2", "y3"}));
engine_op_desc.SetAttr("engine_key", std::string("b_engine"));
engine_op_desc.SetAttr("predictor_id", 1);
engine_op_desc.SetAttr("calibration_data", std::string(""));
engine_op_desc.SetAttr("enable_int8", static_cast<bool>(false));
engine_op_desc.SetAttr("use_calib_mode", static_cast<bool>(false));
......