Unverified commit 75122319, authored by W WangZhen, committed by GitHub

[JitLayer] Polish PEFunction to speed up JitLayer and fix memory leak (#44738)

* Polish PEFunction to speed up JitLayer

* Polish PEFunction code

* Fix comments
Parent commit: 212f015f
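In short (a minimal caller-side sketch, assuming only the PEFunction constructor and call operator shown in the diff below; info, params_dict, inputs, and num_steps are hypothetical placeholders): previously every PEFunction::operator() call serialized the ProgramDesc, hashed it into a program id, and looked up a ParallelExecutor in the global ExecutorInfoCache; now the Graph and ParallelExecutor are created once in the constructor via CreateGraphAndPE() and reused on each call, and scope_.DropKids() releases the per-run child scopes, which is presumably the leak the title refers to.

    // Construction builds the Graph and ParallelExecutor exactly once.
    paddle::jit::PEFunction forward(info, params_dict, phi::CPUPlace());
    for (int step = 0; step < num_steps; ++step) {
      // Each call reuses the cached inner_pe_ instead of re-serializing the
      // ProgramDesc and querying ExecutorInfoCache; child scopes created for
      // this run are dropped after the outputs are fetched.
      auto outs = forward(inputs);
    }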
@@ -22,6 +22,7 @@
 #include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/framework/variable.h"
+#include "paddle/phi/core/enforce.h"

 #include "paddle/fluid/jit/base_function.h"
 #include "paddle/fluid/jit/function_schema.h"
@@ -36,9 +37,14 @@ class ExecutorFunction : public BaseFunction {
                    const Name2VariableMap &params_dict,
                    const phi::Place &place)
       : info_(info), place_(place), inner_exe_(place_) {
+    info_->RemoveDescFeedFetch();
+    PADDLE_ENFORCE_GT(
+        static_cast<int64_t>(info_->ProgramDesc().Block(0).OpSize()),
+        0,
+        platform::errors::PreconditionNotMet(
+            "There is no operator in ProgramDesc."));
     utils::ShareParamsIntoScope(info_->ParamNames(), params_dict, &scope_);
     VLOG(6) << framework::GenScopeTreeDebugInfo(&scope_);
-    info_->RemoveDescFeedFetch();
   }

   ~ExecutorFunction() noexcept {}
@@ -56,9 +62,9 @@ class ExecutorFunction : public BaseFunction {
                      false,
                      true,
                      info_->OutputArgNames());
-    std::vector<DenseTensor> res;
-    utils::FetchOuts(info_->OutputArgNames(), scope_, &res);
-    return res;
+    std::vector<DenseTensor> outputs;
+    utils::FetchOuts(info_->OutputArgNames(), scope_, &outputs);
+    return outputs;
   }

   const std::shared_ptr<FunctionInfo> &Info() const { return info_; }
...
@@ -19,10 +19,14 @@
 #include <vector>

 #include "paddle/fluid/framework/block_desc.h"
+#include "paddle/fluid/framework/details/build_strategy.h"
+#include "paddle/fluid/framework/details/execution_strategy.h"
 #include "paddle/fluid/framework/executor_cache.h"
+#include "paddle/fluid/framework/ir/graph.h"
 #include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/framework/variable.h"
+#include "paddle/phi/core/enforce.h"

 #include "paddle/fluid/jit/base_function.h"
 #include "paddle/fluid/jit/function_schema.h"
@@ -31,72 +35,99 @@
 namespace paddle {
 namespace jit {

+using ExecutionStrategy = framework::details::ExecutionStrategy;
+using ParallelExecutor = framework::ParallelExecutor;
+using Graph = framework::ir::Graph;
+
 class PEFunction : public BaseFunction {
  public:
   PEFunction(const std::shared_ptr<FunctionInfo> &info,
              const Name2VariableMap &params_dict,
              const phi::Place &place)
       : info_(info), place_(place) {
+    info_->RemoveDescFeedFetch();
+    PADDLE_ENFORCE_GT(
+        static_cast<int64_t>(info_->ProgramDesc().Block(0).OpSize()),
+        0,
+        platform::errors::PreconditionNotMet(
+            "There is no operator in ProgramDesc."));
     utils::ShareParamsIntoScope(info_->ParamNames(), params_dict, &scope_);
     VLOG(6) << framework::GenScopeTreeDebugInfo(&scope_);
-    info_->RemoveDescFeedFetch();
+    CreateGraphAndPE();
   }

   ~PEFunction() noexcept {}

+  static ExecutionStrategy GetExecutionStrategy(const platform::Place &place) {
+    ExecutionStrategy execution_strategy;
+
+    auto device_type = platform::Place2DeviceType(place);
+    switch (device_type) {
+      case platform::DeviceType::CPU: {
+        execution_strategy.num_threads_ = 2;
+        break;
+      }
+      case platform::DeviceType::CUDA: {
+        // NOTE: According experiments, one thread is faster in
+        // most model training.
+        execution_strategy.num_threads_ = 1;
+        break;
+      }
+      case platform::DeviceType::XPU: {
+        execution_strategy.num_threads_ = 1;
+        break;
+      }
+      case platform::DeviceType::IPU: {
+        execution_strategy.num_threads_ = 1;
+        break;
+      }
+      default:
+        PADDLE_THROW(platform::errors::Unavailable(
+            "Unsupported Device type %d.", device_type));
+    }
+    execution_strategy.use_device_ = device_type;
+
+    return execution_strategy;
+  }
+
+  void CreateGraphAndPE() {
+    framework::details::BuildStrategy build_strategy;
+    auto execution_strategy = GetExecutionStrategy(place_);
+
+    auto &program_desc = info_->ProgramDesc();
+    const framework::BlockDesc &global_block = program_desc.Block(0);
+    int64_t start_op_index = 0;
+    int64_t end_op_index = static_cast<int64_t>(global_block.OpSize());
+
+    graph_ =
+        std::make_shared<Graph>(program_desc, start_op_index, end_op_index);
+    inner_pe_ = std::make_shared<ParallelExecutor>(
+        place_, &scope_, execution_strategy, build_strategy, graph_.get());
+    inner_pe_->PrepareVariables(&scope_);
+    inner_pe_->SkipMemoryReuse(/*scope_idx=*/0, info_->InputArgNames());
+  }
+
   std::vector<Tensor> operator()(const std::vector<Tensor> &inputs) {
     auto dense_tensors = utils::ToDenseTensors(inputs);
     return utils::ToTensors(this->operator()(dense_tensors));
   }

   std::vector<DenseTensor> operator()(const std::vector<DenseTensor> &inputs) {
-    std::string prog_string;
-    std::hash<std::string> string_hash;
-    auto &program_desc = info_->ProgramDesc();
-    // TODO(dev): Serialize is very slow.
-    const_cast<framework::ProgramDesc *>(&program_desc)
-        ->Proto()
-        ->SerializePartialToString(&prog_string);
-    int64_t program_id = static_cast<int64_t>(string_hash(prog_string));
-    const framework::BlockDesc &global_block = program_desc.Block(0);
-    int64_t start_op_index = 0;
-    int64_t end_op_index = static_cast<int64_t>(global_block.OpSize());
     utils::ShareIntoScope(info_->InputArgNames(), inputs, &scope_);
-    std::vector<std::string> input_var_names = info_->InputArgNames();
-    std::vector<std::string> output_var_names = info_->OutputArgNames();
-
-    if (end_op_index > start_op_index) {
-      auto cache_info = framework::GetExecutorInfoFromCache(program_desc,
-                                                            place_,
-                                                            start_op_index,
-                                                            end_op_index,
-                                                            /*is_grad=*/false,
-                                                            program_id,
-                                                            &scope_);
-      auto &parallel_executor = cache_info.first;
-      auto &skip_eager_delete_vars =
-          framework::ExecutorInfoCache::Instance().SkipEagerDeleteVars(
-              program_id, false);
-      if (cache_info.second /*is_new_created*/) {
-        parallel_executor->SkipMemoryReuse(/*scope_idx=*/0, input_var_names);
-        skip_eager_delete_vars.insert(skip_eager_delete_vars.end(),
-                                      output_var_names.begin(),
-                                      output_var_names.end());
-        framework::details::ParseSafeEagerDeletionSkipVars(
-            program_desc,
-            end_op_index,
-            output_var_names,
-            &skip_eager_delete_vars);
-      }
-      parallel_executor->RunWithoutFetch(skip_eager_delete_vars);
-    }
-    std::vector<DenseTensor> res;
-    utils::FetchOuts(info_->OutputArgNames(), scope_, &res);
-    return res;
+    // update op_handle scope_map in pe->executor_->Graph
+    std::unordered_map<framework::Scope *, framework::Scope *> scope_map = {
+        {inner_pe_->GetLocalScopes().front(), &scope_}};
+    inner_pe_->ResetOpHandleScopeMapOfGraphs(scope_map);
+    // need to recreate tmp variables in new scope
+    inner_pe_->PrepareVariables(&scope_);
+    inner_pe_->RunWithoutFetch(info_->OutputArgNames());
+    std::vector<DenseTensor> outputs;
+    utils::FetchOuts(info_->OutputArgNames(), scope_, &outputs);
+    scope_.DropKids();
+    return outputs;
   }

   const std::shared_ptr<FunctionInfo> &Info() const { return info_; }
@@ -105,6 +136,8 @@ class PEFunction : public BaseFunction {
   std::shared_ptr<FunctionInfo> info_;
   framework::Scope scope_;
   phi::Place place_;
+  std::shared_ptr<ParallelExecutor> inner_pe_;
+  std::shared_ptr<Graph> graph_;
 };

 }  // namespace jit
...