parallel_executor.h 5.7 KB
Newer Older
Y
Yang Yang 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#pragma once

17
#include <map>
Q
Qiao Longfei 已提交
18
#include <memory>
X
Xin Pan 已提交
19
#include <string>
S
sneaxiy 已提交
20
#include <unordered_map>
Y
Yang Yang 已提交
21
#include <unordered_set>
Y
Yan Xu 已提交
22
#include <utility>
X
Xin Pan 已提交
23
#include <vector>
24 25

#include "paddle/fluid/framework/details/build_strategy.h"
Y
yuyang18 已提交
26
#include "paddle/fluid/framework/details/execution_strategy.h"
27
#include "paddle/fluid/framework/details/op_handle_base.h"
28
#include "paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h"
Y
Yang Yang 已提交
29
#include "paddle/fluid/framework/executor.h"
30
#include "paddle/fluid/framework/feed_fetch_type.h"
Y
Yang Yang 已提交
31 32 33 34 35
#include "paddle/fluid/framework/op_info.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/platform/device_context.h"
36

37
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
Y
Yancey1989 已提交
38 39 40
#include "paddle/fluid/platform/nccl_helper.h"
#endif

Y
Yang Yang 已提交
41 42 43
namespace paddle {
namespace framework {

Y
Yu Yang 已提交
44
class ParallelExecutorPrivate;
Y
Use mtx  
Yu Yang 已提交
45

46
using details::VariableInfo;
Y
yuyang18 已提交
47
using details::BuildStrategy;
Y
yuyang18 已提交
48
using details::ExecutionStrategy;
49 50
namespace p = paddle::platform;
using DeviceType = paddle::platform::DeviceType;
Y
yuyang18 已提交
51

Y
Yang Yang 已提交
52
class ParallelExecutor {
Y
Yu Yang 已提交
53 54
  DISABLE_COPY_AND_ASSIGN(ParallelExecutor);

Y
Yu Yang 已提交
55
 public:
Y
yuyang18 已提交
56
  explicit ParallelExecutor(const std::vector<platform::Place> &places,
Y
Yan Xu 已提交
57
                            const std::vector<std::string> &bcast_vars,
Y
yuyang18 已提交
58 59
                            const std::string &loss_var_name, Scope *scope,
                            const std::vector<Scope *> &local_scopes,
Y
yuyang18 已提交
60
                            const ExecutionStrategy &exec_strategy,
X
Xin Pan 已提交
61
                            const BuildStrategy &build_strategy,
Q
Qiao Longfei 已提交
62
                            ir::Graph *graph);
Y
Yu Yang 已提交
63

64 65 66 67 68 69
  // NOTE(Aurelius84): Construct a PE running on single device for @to_static
  explicit ParallelExecutor(const platform::Place &place, Scope *scope,
                            const ExecutionStrategy &exec_strategy,
                            const BuildStrategy &build_strategy,
                            ir::Graph *graph);

70 71
  ~ParallelExecutor();

72 73
  size_t DeviceCount() const;

Y
yuyang18 已提交
74
  std::vector<Scope *> &GetLocalScopes();
75

76 77 78 79 80
  void DropLocalExeScopes();

  // This API is used to check whether DropLocalExeScopes work.
  bool NeedCreateLocalExeScope();

Y
Yu Yang 已提交
81 82 83 84 85
  /**
   * Feed tensors to local scopes. The size of tensors should be equal to the
   * size of local scopes.
   */
  void FeedTensorsIntoLocalScopes(
Y
yuyang18 已提交
86
      const std::vector<std::unordered_map<std::string, LoDTensor>> &tensors);
Y
Yu Yang 已提交
87 88

  void FeedAndSplitTensorIntoLocalScopes(
Y
yuyang18 已提交
89
      const std::unordered_map<std::string, LoDTensor> &tensors);
Y
Yu Yang 已提交
90

Z
Zhen Wang 已提交
91 92
  FetchResultType Run(const std::vector<std::string> &fetch_tensors,
                      bool return_merged = true);
Y
Yang Yang 已提交
93

94 95
  void RunWithoutFetch(const std::vector<std::string> &skip_eager_vars);

96 97 98 99
  FetchResultType RunFromCinn(
      const std::unordered_map<std::string, LoDTensor> &feed_tensors,
      const std::vector<std::string> &fetch_names);

100 101 102
  void ResetOpHandleScopeMapOfGraphs(
      const std::unordered_map<Scope *, Scope *> &scope_map);

103
  const ir::Graph &Graph() const;
104 105 106 107
  void PrepareVariables(Scope *scope);

  void SkipMemoryReuse(size_t scope_idx,
                       const std::vector<std::string> &skip_vars);
108

X
Xin Pan 已提交
109
 private:
Y
Yan Xu 已提交
110 111 112 113
  // broadcast the parameters from the 0th device.
  // trainer_id the trainer index in nccl distributed training.
  void BCastParamsToDevices(const std::vector<std::string> &vars,
                            int trainer_id = 0) const;
X
Xin Pan 已提交
114 115 116
  bool EnableParallelGraphExecution(const ir::Graph &graph,
                                    const ExecutionStrategy &exec_strategy,
                                    const BuildStrategy &build_strategy) const;
T
typhoonzero 已提交
117

118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151
  void InitExecutorPrivateMemberInfo(const ExecutionStrategy &exec_strategy,
                                     const BuildStrategy &build_strategy,
                                     size_t device_count,
                                     const ir::Graph &graph);

  void CreateLocalScopes(Scope *global_scope,
                         const std::vector<Scope *> &local_scopes,
                         bool create_new);

  std::unordered_map<Scope *, Scope *> CreateLocalExecScopes(
      const std::vector<Scope *> &local_scopes, bool create_new);

  std::vector<ir::Graph *> CloneGraphToMultiDevices(ir::Graph *graph);

  void PrepareNCCLCommunicator(Scope *global_scope);

  std::vector<ir::Graph *> CompileGraphWithBuildStrategy(
      ir::Graph *graph, std::vector<ir::Graph *> *graphs,
      const std::string &loss_var_name);

  void CreateVariableInfos(std::vector<VariableInfo> *var_infos,
                           ir::Graph *graph);

  std::vector<ir::Graph *> CreateSSAGraphExecutor(
      const ExecutionStrategy &exec_strategy,
      std::vector<ir::Graph *> *async_graphs, ir::Graph *graph);

  void ResetOpHandleScopeMapOfGraphs(
      const std::vector<ir::Graph *> &final_graphs,
      const std::unordered_map<Scope *, Scope *> &scope_map);

  void SetReaderOpDeviceInfoOfGraphs(
      const std::vector<ir::Graph *> &final_graphs);

152 153
  void PrepareForCUDAGraphCapture(ir::Graph *graph);

S
sneaxiy 已提交
154
  ParallelExecutorPrivate *member_;
Q
Qiao Longfei 已提交
155
  std::vector<std::unique_ptr<ir::Graph>> async_graphs_;
156
  std::vector<VariableInfo> var_infos_;
157
};
Y
Yang Yang 已提交
158 159
}  // namespace framework
}  // namespace paddle