Merge pull request #14734 from luotao1/memory_load

support loading from memory

Merge pull request #14734 from luotao1/memory_load
support loading from memory
cf661338 · Tao Luo · GitHub · aebc175c · 743cb840 · cf661338
12 changed files
--- a/paddle/fluid/framework/executor_thread_worker.cc
+++ b/paddle/fluid/framework/executor_thread_worker.cc
@@ -97,7 +97,7 @@ void ExecutorThreadWorker::SetDevice() {
  static unsigned concurrency_cap = std::thread::hardware_concurrency();
  int thread_id = this->thread_id_;
-  if (thread_id < concurrency_cap) {
+  if (static_cast<unsigned>(thread_id) < concurrency_cap) {
    unsigned proc = thread_id;
    cpu_set_t mask;

--- a/paddle/fluid/inference/analysis/argument.h
+++ b/paddle/fluid/inference/analysis/argument.h
@@ -103,6 +103,7 @@ struct Argument {
  // Model specified with program and parameters files.
  DECL_ARGUMENT_FIELD(model_program_path, ModelProgramPath, std::string);
  DECL_ARGUMENT_FIELD(model_params_path, ModelParamsPath, std::string);
+  DECL_ARGUMENT_FIELD(model_from_memory, ModelFromMemory, bool);
  // The overall graph to work on.
  DECL_ARGUMENT_UNIQUE_FIELD(main_graph, MainGraph, framework::ir::Graph);

--- a/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.cc
+++ b/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.cc
@@ -46,7 +46,7 @@ void IrGraphBuildPass::RunImpl(Argument *argument) {
             argument->model_params_path_valid()) {
    auto program =
        LoadModel(argument->model_program_path(), argument->model_params_path(),
-                  argument->scope_ptr(), place);
+                  argument->scope_ptr(), place, argument->model_from_memory());
    argument->SetMainProgram(program.release());
  } else {
    PADDLE_THROW(
@@ -68,9 +68,14 @@ std::unique_ptr<framework::ProgramDesc> IrGraphBuildPass::LoadModel(
 std::unique_ptr<framework::ProgramDesc> IrGraphBuildPass::LoadModel(
    const std::string &program_path, const std::string &params_path,
-    framework::Scope *scope, const platform::Place &place) {
+    framework::Scope *scope, const platform::Place &place,
+    bool model_from_memory) {
  framework::Executor exe(place);
-  return Load(&exe, scope, program_path, params_path);
+  if (!model_from_memory) {
+    return Load(&exe, scope, program_path, params_path);
+  } else {
+    return LoadFromMemory(&exe, scope, program_path, params_path);
+  }
 }
 std::string IrGraphBuildPass::repr() const { return "ir-graph-build-pass"; }

--- a/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.h
+++ b/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.h
@@ -24,7 +24,7 @@ namespace inference {
 namespace analysis {
 /*
- * Load program and parameter to memory from the disk.
+ * Load program and parameter to memory from the disk or directly from memory.
 */
 class IrGraphBuildPass : public AnalysisPass {
 public:
@@ -38,7 +38,8 @@ class IrGraphBuildPass : public AnalysisPass {
      const platform::Place &place);
  std::unique_ptr<framework::ProgramDesc> LoadModel(
      const std::string &program_path, const std::string &params_path,
-      framework::Scope *scope, const platform::Place &place);
+      framework::Scope *scope, const platform::Place &place,
+      bool model_from_memory);
  std::string model_binary_str_;
 };

--- a/paddle/fluid/inference/api/analysis_config.cc
+++ b/paddle/fluid/inference/api/analysis_config.cc
@@ -53,6 +53,7 @@ contrib::AnalysisConfig::AnalysisConfig(const contrib::AnalysisConfig &other) {
  use_tensorrt_ = other.use_tensorrt_;
  tensorrt_max_batchsize_ = other.tensorrt_max_batchsize_;
  tensorrt_workspace_size_ = other.tensorrt_workspace_size_;
+  model_from_memory_ = other.model_from_memory_;
  if (use_gpu) {
    pass_builder_.reset(new GpuPassStrategy(
@@ -80,6 +81,8 @@ contrib::AnalysisConfig::AnalysisConfig(contrib::AnalysisConfig &&other) {
  use_tensorrt_ = other.use_tensorrt_;
  tensorrt_max_batchsize_ = other.tensorrt_max_batchsize_;
  tensorrt_workspace_size_ = other.tensorrt_workspace_size_;
+  model_from_memory_ = other.model_from_memory_;
  pass_builder_ = std::move(other.pass_builder_);
 }
@@ -102,4 +105,13 @@ void contrib::AnalysisConfig::EnableTensorRtEngine(int workspace_size,
  pass_builder()->InsertPass(1, "tensorrt_subgraph_pass");
 }
+void contrib::AnalysisConfig::SetModelBuffer(const char *prog_buffer,
+                                             size_t prog_buffer_size,
+                                             const char *param_buffer,
+                                             size_t param_buffer_size) {
+  prog_file = std::string(prog_buffer, prog_buffer + prog_buffer_size);
+  param_file = std::string(param_buffer, param_buffer + param_buffer_size);
+  model_from_memory_ = true;
+}
 }  // namespace paddle
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -308,6 +308,7 @@ void AnalysisPredictor::OptimizeInferenceProgram() {
  argument_.SetUseGPU(config_.use_gpu);
  argument_.SetGPUDeviceId(config_.device);
+  argument_.SetModelFromMemory(config_.model_from_memory_);
  // Analyze inference_program
  if (!config_.model_dir.empty()) {
    argument_.SetModelDir(config_.model_dir);
@@ -448,20 +449,24 @@ bool AnalysisPredictor::LoadProgramDesc() {
    return false;
  }
-  std::string pb_content;
-  // Read binary
-  std::ifstream fin(filename, std::ios::in | std::ios::binary);
-  PADDLE_ENFORCE(static_cast<bool>(fin), "Cannot open file %s", filename);
-  fin.seekg(0, std::ios::end);
-  pb_content.resize(fin.tellg());
-  fin.seekg(0, std::ios::beg);
-  fin.read(&(pb_content.at(0)), pb_content.size());
-  fin.close();
  // Create ProgramDesc
  framework::proto::ProgramDesc proto;
-  proto.ParseFromString(pb_content);
+  if (!config_.model_from_memory()) {
+    std::string pb_content;
+    // Read binary
+    std::ifstream fin(filename, std::ios::in | std::ios::binary);
+    PADDLE_ENFORCE(static_cast<bool>(fin.is_open()), "Cannot open file %s",
+                   filename);
+    fin.seekg(0, std::ios::end);
+    pb_content.resize(fin.tellg());
+    fin.seekg(0, std::ios::beg);
+    fin.read(&(pb_content.at(0)), pb_content.size());
+    fin.close();
+    proto.ParseFromString(pb_content);
+  } else {
+    proto.ParseFromString(config_.prog_file);
+  }
  inference_program_.reset(new framework::ProgramDesc(proto));
  return true;
 }
@@ -469,6 +474,7 @@ bool AnalysisPredictor::LoadProgramDesc() {
 bool AnalysisPredictor::LoadParameters() {
  PADDLE_ENFORCE_NOT_NULL(inference_program_.get(),
                          "The inference program should be loaded first.");
  const auto &global_block = inference_program_->MutableBlock(0);
  // create a temporary program to load parameters.

--- a/paddle/fluid/inference/api/paddle_analysis_config.h
+++ b/paddle/fluid/inference/api/paddle_analysis_config.h
@@ -52,10 +52,13 @@ struct AnalysisConfig : public NativeConfig {
  bool use_tensorrt() const { return use_tensorrt_; }
  void EnableMKLDNN();
-  // NOTE this is just for internal development, please not use it.
-  // NOT stable yet.
  bool use_mkldnn() const { return use_mkldnn_; }
+  // Specify the in-memory buffers holding the serialized program and the
+  // combined parameters. Both buffers are copied into the config, so the
+  // caller may release them once this call returns.
+  void SetModelBuffer(const char* prog_buffer, size_t prog_buffer_size,
+                      const char* param_buffer, size_t param_buffer_size);
+  bool model_from_memory() const { return model_from_memory_; }
  friend class ::paddle::AnalysisPredictor;
 protected:
@@ -64,6 +67,7 @@ struct AnalysisConfig : public NativeConfig {
  int tensorrt_workspace_size_;
  int tensorrt_max_batchsize_;
  std::unique_ptr<PassStrategy> pass_builder_;
+  bool model_from_memory_{false};
 };
 // Configurations for Anakin engine.

--- a/paddle/fluid/inference/io.cc
+++ b/paddle/fluid/inference/io.cc
@@ -69,7 +69,8 @@ bool IsPersistable(const framework::VarDesc* var) {
 void LoadPersistables(framework::Executor* executor, framework::Scope* scope,
                      const framework::ProgramDesc& main_program,
                      const std::string& dirname,
-                      const std::string& param_filename) {
+                      const std::string& param_filename,
+                      bool model_from_memory = false) {
  const framework::BlockDesc& global_block = main_program.Block(0);
  framework::ProgramDesc* load_program = new framework::ProgramDesc();
@@ -108,6 +109,7 @@ void LoadPersistables(framework::Executor* executor, framework::Scope* scope,
    op->SetType("load_combine");
    op->SetOutput("Out", paramlist);
    op->SetAttr("file_path", {param_filename});
+    op->SetAttr("model_from_memory", {model_from_memory});
    op->CheckAttrs();
  }
@@ -130,16 +132,17 @@ std::unique_ptr<framework::ProgramDesc> Load(framework::Executor* executor,
                 "model version %ld is not supported.",
                 main_program->Version());
-  LoadPersistables(executor, scope, *main_program, dirname, "");
+  // model_from_memory is false when parameters are stored separately.
+  LoadPersistables(executor, scope, *main_program, dirname, "",
+                   false /* model_from_memory */);
  return main_program;
 }
 std::unique_ptr<framework::ProgramDesc> Load(
    framework::Executor* executor, framework::Scope* scope,
    const std::string& prog_filename, const std::string& param_filename) {
-  std::string model_filename = prog_filename;
  std::string program_desc_str;
-  ReadBinaryFile(model_filename, &program_desc_str);
+  ReadBinaryFile(prog_filename, &program_desc_str);
  std::unique_ptr<framework::ProgramDesc> main_program(
      new framework::ProgramDesc(program_desc_str));
@@ -147,7 +150,22 @@ std::unique_ptr<framework::ProgramDesc> Load(
                 "model version %ld is not supported.",
                 main_program->Version());
-  LoadPersistables(executor, scope, *main_program, "", param_filename);
+  LoadPersistables(executor, scope, *main_program, "", param_filename,
+                   false /* model_from_memory */);
+  return main_program;
+}
+std::unique_ptr<framework::ProgramDesc> LoadFromMemory(
+    framework::Executor* executor, framework::Scope* scope,
+    const std::string& prog_buffer, const std::string& param_buffer) {
+  std::unique_ptr<framework::ProgramDesc> main_program(
+      new framework::ProgramDesc(prog_buffer));
+  PADDLE_ENFORCE(framework::IsProgramVersionSupported(main_program->Version()),
+                 "model version %ld is not supported.",
+                 main_program->Version());
+  LoadPersistables(executor, scope, *main_program, "", param_buffer,
+                   true /* model_from_memory */);
  return main_program;
 }

--- a/paddle/fluid/inference/io.h
+++ b/paddle/fluid/inference/io.h
@@ -30,7 +30,8 @@ void Init(const std::vector<std::string> argv);
 void LoadPersistables(framework::Executor* executor, framework::Scope* scope,
                      const framework::ProgramDesc& main_program,
                      const std::string& dirname,
-                      const std::string& param_filename);
+                      const std::string& param_filename,
+                      bool model_from_memory);
 std::unique_ptr<framework::ProgramDesc> Load(framework::Executor* executor,
                                             framework::Scope* scope,
@@ -41,6 +42,10 @@ std::unique_ptr<framework::ProgramDesc> Load(framework::Executor* executor,
                                             const std::string& prog_filename,
                                             const std::string& param_filename);
+std::unique_ptr<framework::ProgramDesc> LoadFromMemory(
+    framework::Executor* executor, framework::Scope* scope,
+    const std::string& prog_buffer, const std::string& param_buffer);
 // Save the variables from a scope to disk.
 void SaveVars(const framework::Scope& scope,
              const std::vector<std::string>& vars, const std::string& dirname,

--- a/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc
@@ -93,9 +93,17 @@ void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
  }
 }
-void SetConfig(contrib::AnalysisConfig *cfg) {
+void SetConfig(contrib::AnalysisConfig *cfg, bool memory_load = false) {
-  cfg->prog_file = FLAGS_infer_model + "/__model__";
+  if (memory_load) {
-  cfg->param_file = FLAGS_infer_model + "/param";
+    std::string buffer_prog, buffer_param;
+    ReadBinaryFile(FLAGS_infer_model + "/__model__", &buffer_prog);
+    ReadBinaryFile(FLAGS_infer_model + "/param", &buffer_param);
+    cfg->SetModelBuffer(&buffer_prog[0], buffer_prog.size(), &buffer_param[0],
+                        buffer_param.size());
+  } else {
+    cfg->prog_file = FLAGS_infer_model + "/__model__";
+    cfg->param_file = FLAGS_infer_model + "/param";
+  }
  cfg->use_gpu = false;
  cfg->device = 0;
  cfg->specify_input_name = true;
@@ -114,9 +122,9 @@ void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
 }
 // Easy for profiling independently.
-TEST(Analyzer_Chinese_ner, profile) {
+void profile(bool memory_load = false) {
  contrib::AnalysisConfig cfg;
-  SetConfig(&cfg);
+  SetConfig(&cfg, memory_load);
  std::vector<PaddleTensor> outputs;
  std::vector<std::vector<PaddleTensor>> input_slots_all;
@@ -138,6 +146,12 @@ TEST(Analyzer_Chinese_ner, profile) {
  }
 }
+TEST(Analyzer_Chinese_ner, profile) { profile(); }
+TEST(Analyzer_Chinese_ner, profile_memory_load) {
+  profile(true /* memory_load */);
+}
 // Check the fuse status
 TEST(Analyzer_Chinese_ner, fuse_statis) {
  contrib::AnalysisConfig cfg;

--- a/paddle/fluid/inference/tests/api/config_printer.h
+++ b/paddle/fluid/inference/tests/api/config_printer.h
@@ -49,8 +49,6 @@ std::ostream &operator<<(std::ostream &os, const NativeConfig &config) {
  os << GenSpaces(num_spaces) << "device: " << config.device << "\n";
  os << GenSpaces(num_spaces)
     << "fraction_of_gpu_memory: " << config.fraction_of_gpu_memory << "\n";
-  os << GenSpaces(num_spaces) << "prog_file: " << config.prog_file << "\n";
-  os << GenSpaces(num_spaces) << "param_file: " << config.param_file << "\n";
  os << GenSpaces(num_spaces)
     << "specify_input_name: " << config.specify_input_name << "\n";
  os << GenSpaces(num_spaces)
@@ -65,6 +63,13 @@ std::ostream &operator<<(std::ostream &os,
  os << GenSpaces(num_spaces) << "contrib::AnalysisConfig {\n";
  num_spaces++;
  os << *reinterpret_cast<const NativeConfig *>(&config);
+  if (!config.model_from_memory()) {
+    os << GenSpaces(num_spaces) << "prog_file: " << config.prog_file << "\n";
+    os << GenSpaces(num_spaces) << "param_file: " << config.param_file << "\n";
+  } else {
+    os << GenSpaces(num_spaces)
+       << "prog_file and param_file: load from memory \n";
+  }
  os << GenSpaces(num_spaces) << "enable_ir_optim: " << config.enable_ir_optim
     << "\n";
  os << GenSpaces(num_spaces)

--- a/paddle/fluid/operators/load_combine_op.cc
+++ b/paddle/fluid/operators/load_combine_op.cc
@@ -32,16 +32,26 @@ class LoadCombineOp : public framework::OperatorBase {
               const platform::Place &place) const override {
    auto filename = Attr<std::string>("file_path");
    auto load_as_fp16 = Attr<bool>("load_as_fp16");
+    auto model_from_memory = Attr<bool>("model_from_memory");
-    std::ifstream fin(filename);
-    PADDLE_ENFORCE(static_cast<bool>(fin),
-                   "Cannot open file %s for load_combine op", filename);
    auto out_var_names = Outputs("Out");
    PADDLE_ENFORCE_GT(
        static_cast<int>(out_var_names.size()), 0,
        "The number of output variables should be greater than 0.");
+    if (!model_from_memory) {
+      std::ifstream fin(filename);
+      PADDLE_ENFORCE(static_cast<bool>(fin),
+                     "Cannot open file %s for load_combine op", filename);
+      LoadParamsFromBuffer(scope, place, &fin, load_as_fp16, out_var_names);
+    } else {
+      PADDLE_ENFORCE(!filename.empty(), "Cannot load file from memory");
+      std::stringstream fin(filename);
+      LoadParamsFromBuffer(scope, place, &fin, load_as_fp16, out_var_names);
+    }
+  }
+  void LoadParamsFromBuffer(
+      const framework::Scope &scope, const platform::Place &place,
+      std::istream *buffer, bool load_as_fp16,
+      const std::vector<std::string> &out_var_names) const {
    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
    auto &dev_ctx = *pool.Get(place);
@@ -54,11 +64,10 @@ class LoadCombineOp : public framework::OperatorBase {
      auto *tensor = out_var->GetMutable<framework::LoDTensor>();
      // Error checking
-      PADDLE_ENFORCE(static_cast<bool>(fin), "Cannot read more from file %s",
+      // Dereference before converting to bool: `buffer` is a pointer, so
+      // testing it directly would only check for null and would never notice
+      // a stream that has failed or run out of data.
+      PADDLE_ENFORCE(static_cast<bool>(*buffer), "Cannot read more");
-                     filename);
      // Get data from fin to tensor
-      DeserializeFromStream(fin, tensor, dev_ctx);
+      DeserializeFromStream(*buffer, tensor, dev_ctx);
      auto in_dtype = framework::ToDataType(tensor->type());
      auto out_dtype =
@@ -103,11 +112,17 @@ class LoadCombineOpProtoMaker : public framework::OpProtoAndCheckerMaker {
                         "LoDTensors will be loaded from \"file_path\".")
        .AddCustomChecker(
            [](const std::string &path) { return !path.empty(); });
+    AddAttr<bool>("model_from_memory",
+                  "(boolean, default false)"
+                  "If true, file_path is in memory, and LoDTensors will be "
+                  "loaded directly from memory")
+        .SetDefault(false);
    AddComment(R"DOC(
 LoadCombine Operator.
-LoadCombine operator loads LoDTensor variables from a file. The file should 
+LoadCombine operator loads LoDTensor variables from a file, which could be 
-contain one or more LoDTensors serialized using the SaveCombine operator. The 
+loaded in memory already. The file should contain one or more LoDTensors 
+serialized using the SaveCombine operator. The
 LoadCombine operator applies a deserialization strategy to appropriately load 
 the LodTensors, and this strategy complements the serialization strategy used 
 in the SaveCombine operator. Hence, the LoadCombine operator is tightly coupled