parallel_executor.h 4.8 KB
Newer Older
Y
Yang Yang 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#pragma once

Q
Qiao Longfei 已提交
17
#include <memory>
X
Xin Pan 已提交
18
#include <string>
S
sneaxiy 已提交
19
#include <unordered_map>
Y
Yang Yang 已提交
20
#include <unordered_set>
Y
Yan Xu 已提交
21
#include <utility>
X
Xin Pan 已提交
22
#include <vector>
23 24

#include "paddle/fluid/framework/details/build_strategy.h"
Y
yuyang18 已提交
25
#include "paddle/fluid/framework/details/execution_strategy.h"
26
#include "paddle/fluid/framework/details/op_handle_base.h"
27
#include "paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h"
Y
Yang Yang 已提交
28
#include "paddle/fluid/framework/executor.h"
29
#include "paddle/fluid/framework/feed_fetch_type.h"
Y
Yang Yang 已提交
30 31 32 33 34
#include "paddle/fluid/framework/op_info.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/platform/device_context.h"
35

36
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
Y
Yancey1989 已提交
37 38 39
#include "paddle/fluid/platform/nccl_helper.h"
#endif

Y
Yang Yang 已提交
40 41 42
namespace paddle {
namespace framework {

Y
Yu Yang 已提交
43
class ParallelExecutorPrivate;
Y
Use mtx  
Yu Yang 已提交
44

45
using details::VariableInfo;
Y
yuyang18 已提交
46
using details::BuildStrategy;
Y
yuyang18 已提交
47
using details::ExecutionStrategy;
48 49
namespace p = paddle::platform;
using DeviceType = paddle::platform::DeviceType;
Y
yuyang18 已提交
50

Y
Yang Yang 已提交
51
class ParallelExecutor {
Y
Yu Yang 已提交
52 53
  DISABLE_COPY_AND_ASSIGN(ParallelExecutor);

Y
Yu Yang 已提交
54
 public:
Y
yuyang18 已提交
55
  explicit ParallelExecutor(const std::vector<platform::Place> &places,
Y
Yan Xu 已提交
56
                            const std::vector<std::string> &bcast_vars,
Y
yuyang18 已提交
57 58
                            const std::string &loss_var_name, Scope *scope,
                            const std::vector<Scope *> &local_scopes,
Y
yuyang18 已提交
59
                            const ExecutionStrategy &exec_strategy,
X
Xin Pan 已提交
60
                            const BuildStrategy &build_strategy,
Q
Qiao Longfei 已提交
61
                            ir::Graph *graph);
Y
Yu Yang 已提交
62

63 64
  ~ParallelExecutor();

65 66
  size_t DeviceCount() const;

Y
yuyang18 已提交
67
  std::vector<Scope *> &GetLocalScopes();
68

69 70 71 72 73
  void DropLocalExeScopes();

  // This API is used to check whether DropLocalExeScopes work.
  bool NeedCreateLocalExeScope();

Y
Yu Yang 已提交
74 75 76 77 78
  /**
   * Feed tensors to local scopes. The size of tensors should be equal to the
   * size of local scopes.
   */
  void FeedTensorsIntoLocalScopes(
Y
yuyang18 已提交
79
      const std::vector<std::unordered_map<std::string, LoDTensor>> &tensors);
Y
Yu Yang 已提交
80 81

  void FeedAndSplitTensorIntoLocalScopes(
Y
yuyang18 已提交
82
      const std::unordered_map<std::string, LoDTensor> &tensors);
Y
Yu Yang 已提交
83

Z
Zhen Wang 已提交
84 85
  FetchResultType Run(const std::vector<std::string> &fetch_tensors,
                      bool return_merged = true);
Y
Yang Yang 已提交
86

87 88
  const ir::Graph &Graph() const;

X
Xin Pan 已提交
89
 private:
Y
Yan Xu 已提交
90 91 92 93
  // broadcast the parameters from the 0th device.
  // trainer_id the trainer index in nccl distributed training.
  void BCastParamsToDevices(const std::vector<std::string> &vars,
                            int trainer_id = 0) const;
X
Xin Pan 已提交
94 95 96
  bool EnableParallelGraphExecution(const ir::Graph &graph,
                                    const ExecutionStrategy &exec_strategy,
                                    const BuildStrategy &build_strategy) const;
T
typhoonzero 已提交
97

98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131
  void InitExecutorPrivateMemberInfo(const ExecutionStrategy &exec_strategy,
                                     const BuildStrategy &build_strategy,
                                     size_t device_count,
                                     const ir::Graph &graph);

  void CreateLocalScopes(Scope *global_scope,
                         const std::vector<Scope *> &local_scopes,
                         bool create_new);

  std::unordered_map<Scope *, Scope *> CreateLocalExecScopes(
      const std::vector<Scope *> &local_scopes, bool create_new);

  std::vector<ir::Graph *> CloneGraphToMultiDevices(ir::Graph *graph);

  void PrepareNCCLCommunicator(Scope *global_scope);

  std::vector<ir::Graph *> CompileGraphWithBuildStrategy(
      ir::Graph *graph, std::vector<ir::Graph *> *graphs,
      const std::string &loss_var_name);

  void CreateVariableInfos(std::vector<VariableInfo> *var_infos,
                           ir::Graph *graph);

  std::vector<ir::Graph *> CreateSSAGraphExecutor(
      const ExecutionStrategy &exec_strategy,
      std::vector<ir::Graph *> *async_graphs, ir::Graph *graph);

  void ResetOpHandleScopeMapOfGraphs(
      const std::vector<ir::Graph *> &final_graphs,
      const std::unordered_map<Scope *, Scope *> &scope_map);

  void SetReaderOpDeviceInfoOfGraphs(
      const std::vector<ir::Graph *> &final_graphs);

S
sneaxiy 已提交
132
  ParallelExecutorPrivate *member_;
Q
Qiao Longfei 已提交
133
  std::vector<std::unique_ptr<ir::Graph>> async_graphs_;
134
};
Y
Yang Yang 已提交
135 136
}  // namespace framework
}  // namespace paddle