parallel_executor.cc 13.2 KB
Newer Older
Y
Yang Yang 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/framework/parallel_executor.h"
Q
qiaolongfei 已提交
16

C
chengduoZH 已提交
17
#include <string>
18
#include <tuple>
Q
qiaolongfei 已提交
19
#include <vector>
Y
Yu Yang 已提交
20

X
clean  
Xin Pan 已提交
21
#include "paddle/fluid/framework/ir/graph.h"
X
Xin Pan 已提交
22
#include "paddle/fluid/framework/ir/graph_viz_pass.h"
X
Xin Pan 已提交
23

Y
Yu Yang 已提交
24
#ifdef PADDLE_WITH_CUDA
Y
Yu Yang 已提交
25
#include "paddle/fluid/platform/nccl_helper.h"
Y
Yu Yang 已提交
26
#endif
Y
Yang Yang 已提交
27

X
Xin Pan 已提交
28 29
#include "paddle/fluid/framework/details/multi_devices_graph_check_pass.h"
#include "paddle/fluid/framework/details/multi_devices_graph_print_pass.h"
Y
yuyang18 已提交
30
#include "paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h"
Y
Yu Yang 已提交
31
#include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h"
32
#include "paddle/fluid/platform/profiler.h"
Y
Yu Yang 已提交
33

Y
Yang Yang 已提交
34
namespace paddle {
Y
Yu Yang 已提交
35 36
namespace framework {

X
Xin Pan 已提交
37 38 39 40 41 42 43 44 45 46
// Builds an ir::Graph from `main_program` and runs the ParallelExecutor pass
// pipeline over it, returning the resulting graph.
//
// Pipeline, in order:
//   1. "graph_viz_pass"           (only if strategy.debug_graphviz_path_ is set)
//   2. "multi_devices_pass"
//   3. "multi_devices_print_pass" (only if strategy.debug_graphviz_path_ is set)
//   4. "multi_devices_check_pass"
//
// All reference/pointer arguments are handed to the passes through
// SetNotOwned, so they have to stay alive at least until this function
// returns. In CUDA builds `nccl_ctxs` is forwarded only when `use_cuda` is
// true; otherwise the pass receives a null pointer.
std::unique_ptr<ir::Graph> ApplyParallelExecutorPass(
    const ProgramDesc &main_program, const std::vector<platform::Place> &places,
    const std::string &loss_var_name,
    const std::unordered_set<std::string> &param_names,
    const std::vector<Scope *> &local_scopes, const bool use_cuda,
#ifdef PADDLE_WITH_CUDA
    const BuildStrategy &strategy, platform::NCCLContextMap *nccl_ctxs) {
#else
    const BuildStrategy &strategy) {
#endif
  // Debug dumps are produced only when a graphviz path has been configured.
  const bool dump_graphs = !strategy.debug_graphviz_path_.empty();

  // Start from the plain program graph.
  std::unique_ptr<ir::Graph> result(new ir::Graph(main_program));

  // Optionally record the untouched graph under "<path>_original_graph".
  if (dump_graphs) {
    auto original_viz = ir::PassRegistry::Instance().Get("graph_viz_pass");
    // Handed over with Set (not SetNotOwned); presumably the pass takes
    // ownership of the heap-allocated string -- confirm against ir::Pass.
    original_viz->Set<std::string>(
        "graph_viz_path",
        new std::string(string::Sprintf("%s%s",
                                        strategy.debug_graphviz_path_.c_str(),
                                        "_original_graph")));
    result = original_viz->Apply(std::move(result));
  }

  // Rewrite the graph so that it can run on several devices.
  auto to_multi_dev = ir::PassRegistry::Instance().Get("multi_devices_pass");
  to_multi_dev->SetNotOwned<const std::vector<platform::Place>>("places",
                                                                &places);
  to_multi_dev->SetNotOwned<const std::string>("loss_var_name",
                                               &loss_var_name);
  to_multi_dev->SetNotOwned<const std::unordered_set<std::string>>(
      "params", &param_names);
  to_multi_dev->SetNotOwned<const std::vector<Scope *>>("local_scopes",
                                                        &local_scopes);
  to_multi_dev->SetNotOwned<const BuildStrategy>("strategy", &strategy);

#ifdef PADDLE_WITH_CUDA
  // The NCCL contexts are only exposed to the pass for CUDA execution.
  platform::NCCLContextMap *forwarded_ctxs = nullptr;
  if (use_cuda) {
    forwarded_ctxs = nccl_ctxs;
  }
  to_multi_dev->SetNotOwned<platform::NCCLContextMap>("nccl_ctxs",
                                                      forwarded_ctxs);
#endif
  result = to_multi_dev->Apply(std::move(result));

  // Optionally record the multi-device graph, including device information.
  if (dump_graphs) {
    auto device_printer =
        ir::PassRegistry::Instance().Get("multi_devices_print_pass");
    device_printer->SetNotOwned<const std::string>(
        "debug_graphviz_path", &strategy.debug_graphviz_path_);
    device_printer->Set<details::GraphvizSSAGraphPrinter>(
        "graph_printer", new details::GraphvizSSAGraphPrinter);
    result = device_printer->Apply(std::move(result));
  }

  // Finally, let the check pass validate the graph for multi-device execution.
  auto validator = ir::PassRegistry::Instance().Get("multi_devices_check_pass");
  result = validator->Apply(std::move(result));
  return result;
}

Y
Yu Yang 已提交
96 97 98
class ParallelExecutorPrivate {
 public:
  explicit ParallelExecutorPrivate(const std::vector<platform::Place> &places)
Y
Yu Yang 已提交
99
      : places_(places) {}
Y
Yu Yang 已提交
100 101 102 103

  std::vector<platform::Place> places_;
  std::vector<Scope *> local_scopes_;
  Scope *global_scope_;
Y
Yu Yang 已提交
104
  std::unique_ptr<details::SSAGraphExecutor> executor_;
Y
Yu Yang 已提交
105

Y
Yu Yang 已提交
106
#ifdef PADDLE_WITH_CUDA
Y
Yu Yang 已提交
107
  std::unique_ptr<platform::NCCLContextMap> nccl_ctxs_;
Y
Yu Yang 已提交
108
#endif
C
chengduoZH 已提交
109 110
  bool own_local_scope_;
  bool use_cuda_;
111
  bool use_all_reduce_;
Y
Yu Yang 已提交
112 113
};

114 115 116 117
// Gives mutable access to the per-device scopes stored in the private state.
std::vector<Scope *> &ParallelExecutor::GetLocalScopes() {
  auto &per_device_scopes = member_->local_scopes_;
  return per_device_scopes;
}

Y
Yu Yang 已提交
118
// Constructs a ParallelExecutor for `main_program` running on `places`.
//
// Parameters:
//   places         devices to run on; more than one is required unless
//                  build_strategy.reduce_ is kAllReduce.
//   params         parameter names forwarded to the multi-device graph pass.
//   bcast_vars     variables broadcast to all devices during construction
//                  (only when the executor creates its own local scopes and
//                  there is more than one of them).
//   main_program   program converted into the execution graph.
//   loss_var_name  forwarded to the multi-device graph pass.
//   scope          global scope; also used as the scope of device 0 when
//                  `local_scopes` is empty.
//   local_scopes   optional caller-provided scopes, one per place. When given,
//                  a child scope of each is used instead of children of
//                  `scope`.
//   exec_strategy / build_strategy
//                  execution and graph-building options.
//   num_trainers / trainer_id
//                  forwarded to the NCCL context map in CUDA builds.
//
// Fails through PADDLE_ENFORCE / PADDLE_THROW when the place count is invalid
// for the chosen reduce strategy, when `local_scopes` does not match `places`
// in size, or when CUDA execution is requested in a build without CUDA.
ParallelExecutor::ParallelExecutor(
    const std::vector<platform::Place> &places,
    const std::unordered_set<std::string> &params,
    const std::unordered_set<std::string> &bcast_vars,
    const ProgramDesc &main_program, const std::string &loss_var_name,
    Scope *scope, const std::vector<Scope *> &local_scopes,
    const ExecutionStrategy &exec_strategy, const BuildStrategy &build_strategy,
    size_t num_trainers, size_t trainer_id)
    : member_(new ParallelExecutorPrivate(places)) {
  member_->global_scope_ = scope;
  member_->use_cuda_ = exec_strategy.use_cuda_;
  member_->use_all_reduce_ =
      build_strategy.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce;

  if (!member_->use_all_reduce_) {
    // The two literals are concatenated, so the space after the comma has to
    // be spelled out explicitly.
    PADDLE_ENFORCE(places.size() > 1,
                   "If you set build_strategy.reduce with 'Reduce', "
                   "the number of places must be greater than 1.");
  }

  // Step 1. Bcast the params to devs.
  // Create local scopes
  if (local_scopes.empty()) {
    // Device 0 works directly in the global scope; every other device gets a
    // child scope that the destructor deletes again.
    member_->own_local_scope_ = true;
    member_->local_scopes_.emplace_back(member_->global_scope_);
    for (size_t i = 1; i < member_->places_.size(); ++i) {
      member_->local_scopes_.emplace_back(&scope->NewScope());
    }
  } else {
    // The caller supplied one scope per place; work in a child of each.
    member_->own_local_scope_ = false;
    PADDLE_ENFORCE_EQ(member_->places_.size(), local_scopes.size());
    for (size_t i = 0; i < member_->places_.size(); ++i) {
      member_->local_scopes_.emplace_back(&local_scopes[i]->NewScope());
    }
  }

  if (member_->use_cuda_) {
    // Bcast Parameters to all GPUs
#ifdef PADDLE_WITH_CUDA
    // An NCCL id stored in the global scope is handed to the context map;
    // when the variable is absent a null id is passed instead.
    auto *nccl_id_var = scope->FindVar(NCCL_ID_VARNAME);
    ncclUniqueId *nccl_id = nullptr;
    if (nccl_id_var != nullptr) {
      nccl_id = nccl_id_var->GetMutable<ncclUniqueId>();
    }
    member_->nccl_ctxs_.reset(new platform::NCCLContextMap(
        member_->places_, nccl_id, num_trainers, trainer_id));
#else
    PADDLE_THROW("Not compiled with CUDA");
#endif
  }

  // Broadcast only when this executor created the local scopes itself and
  // there is more than one of them.
  if (member_->local_scopes_.size() != 1 && local_scopes.empty()) {
    BCastParamsToDevices(bcast_vars);
  }
  // Startup Program has been run. All local scopes has correct parameters.

  // Step 2. Create vars in each scope;
  // Binding by const reference works whether AllVars() returns a temporary or
  // a reference, and lets us size the vector up front.
  const auto &block_vars = main_program.Block(0).AllVars();
  std::vector<details::VariableInfo> var_infos;
  var_infos.reserve(block_vars.size());
  for (auto *var : block_vars) {
    var_infos.emplace_back();
    auto &info = var_infos.back();
    info.name_ = var->Name();
    info.type_ = var->GetType();
    info.persistable_ = var->Persistable();
  }

  // Step 3. Convert main_program to SSA form and dependency graph. Also, insert
  // ncclOp
#ifdef PADDLE_WITH_CUDA
  std::unique_ptr<ir::Graph> graph = ApplyParallelExecutorPass(
      main_program, member_->places_, loss_var_name, params,
      member_->local_scopes_, member_->use_cuda_, build_strategy,
      member_->nccl_ctxs_.get());
#else
  std::unique_ptr<ir::Graph> graph = ApplyParallelExecutorPass(
      main_program, member_->places_, loss_var_name, params,
      member_->local_scopes_, member_->use_cuda_, build_strategy);
#endif

  // The threaded executor is built first and then wrapped by the
  // scope-buffered executor, which takes it over via std::move.
  member_->executor_.reset(new details::ThreadedSSAGraphExecutor(
      exec_strategy, member_->local_scopes_, places, std::move(graph)));
  member_->executor_.reset(new details::ScopeBufferedSSAGraphExecutor(
      exec_strategy, member_->local_scopes_, std::move(var_infos),
      member_->places_, std::move(member_->executor_)));
}

Y
Yancey1989 已提交
203
void ParallelExecutor::BCastParamsToDevices(
204
    const std::unordered_set<std::string> &vars) const {
205
  // the initializing bcast, all vars would be bcast from device(0),
Y
yi.wu 已提交
206
  // otherwise
207
  // bcast from the specified device.
X
Xin Pan 已提交
208
  bool initializing = member_->executor_ ? false : true;
209
  for (auto &var : vars) {
X
Xin Pan 已提交
210 211 212 213
    int var_dev_id = -1;
    if (member_->executor_) {
      auto &sharded_var_device =
          member_->executor_->Graph().Get<details::ShardedVarDevice>(
X
Xin Pan 已提交
214
              details::kShardedVarDevice);
X
Xin Pan 已提交
215 216 217 218 219
      if (sharded_var_device.find(var) != sharded_var_device.end()) {
        var_dev_id = sharded_var_device.at(var);
      }
    }

Y
yi.wu 已提交
220
    if (!initializing && var_dev_id == -1) continue;
221 222

    framework::Variable *main_var = nullptr;
Y
yi.wu 已提交
223
    if (initializing) {
224 225 226 227 228
      main_var = member_->local_scopes_[0]->FindVar(var);
    } else {
      main_var = member_->local_scopes_[var_dev_id]->FindVar(var);
    }

J
JiayiFeng 已提交
229
    if (main_var == nullptr || !main_var->IsType<LoDTensor>()) {
230 231 232 233 234 235
      continue;
    }

    auto &main_tensor = main_var->Get<LoDTensor>();
    auto &dims = main_tensor.dims();
    if (paddle::platform::is_gpu_place(main_tensor.place())) {
C
chengduoZH 已提交
236
#ifdef PADDLE_WITH_CUDA
237
      std::vector<void *> buffers;
238 239 240 241 242
      size_t numel = main_tensor.numel();
      ncclDataType_t data_type = platform::ToNCCLDataType(main_tensor.type());
      for (size_t i = 0; i < member_->places_.size(); ++i) {
        auto place = member_->places_[i];
        void *buffer;
243

Y
yi.wu 已提交
244
        if ((initializing && i == 0) ||
Y
update  
yi.wu 已提交
245
            (!initializing && static_cast<int>(i) == var_dev_id)) {
246 247
          buffer = const_cast<void *>(main_tensor.data<void>());
        } else {
Y
Yu Yang 已提交
248
          auto local_scope = member_->local_scopes_[i];
249
          auto *t = local_scope->Var(var)->GetMutable<LoDTensor>();
Y
Update  
Yu Yang 已提交
250
          t->Resize(dims);
251
          buffer = t->mutable_data(place, main_tensor.type());
Y
Update  
Yu Yang 已提交
252
        }
253
        buffers.push_back(buffer);
254
      }
255

256 257 258 259 260 261
      PADDLE_ENFORCE_EQ(member_->places_.size(), buffers.size(),
                        "variables' buffer size to bcast NOT equal to places");
      {
        platform::NCCLGroupGuard guard;
        for (size_t i = 0; i < member_->places_.size(); ++i) {
          auto &nccl_ctx = member_->nccl_ctxs_->at(member_->places_[i]);
Y
yi.wu 已提交
262 263 264 265
          if (initializing) {
            platform::dynload::ncclBcast(buffers[i], numel, data_type, 0,
                                         nccl_ctx.comm_, nccl_ctx.stream());
          } else {
Y
update  
yi.wu 已提交
266
            if (var_dev_id >= 0) {
Y
yi.wu 已提交
267 268 269 270 271
              platform::dynload::ncclBcast(buffers[i], numel, data_type,
                                           var_dev_id, nccl_ctx.comm_,
                                           nccl_ctx.stream());
            }
          }
272
        }
273
        member_->nccl_ctxs_->WaitAll();
274
      }
275

C
chengduoZH 已提交
276 277 278
#else
      PADDLE_THROW("Not compiled with CUDA");
#endif
279 280
    } else {
      platform::CPUPlace cpu;
Y
Yancey1989 已提交
281 282 283 284 285
      for (size_t i = 0; i < member_->places_.size(); ++i) {
        if ((initializing && i == 0) ||
            (!initializing && static_cast<int>(i) == var_dev_id))
          continue;

286 287
        auto local_scope = member_->local_scopes_[i];
        auto *t = local_scope->Var(var)->GetMutable<LoDTensor>();
C
chengduo 已提交
288 289 290 291

        // FIXME(zcd): LR_DECAY_COUNTER should not be shared. This is a hot fix.
        if (member_->use_all_reduce_ || member_->use_cuda_ ||
            var == "@LR_DECAY_COUNTER@") {
292 293 294 295 296 297
          t->Resize(dims);
          t->mutable_data(cpu, main_tensor.type());
          paddle::framework::TensorCopy(main_tensor, cpu, t);
        } else {
          t->ShareDataWith(main_tensor);
        }
Y
Yu Yang 已提交
298
      }
Y
Stash  
Yu Yang 已提交
299 300
    }
  }
Y
Yu Yang 已提交
301
}
Y
Yu Yang 已提交
302

Y
Yu Yang 已提交
303 304
void ParallelExecutor::Run(const std::vector<std::string> &fetch_tensors,
                           const std::string &fetched_var_name) {
X
Xin Pan 已提交
305
  platform::RecordBlock b(0);
Y
Yu Yang 已提交
306 307 308
  auto fetch_data = member_->executor_->Run(fetch_tensors);
  *member_->global_scope_->Var(fetched_var_name)->GetMutable<FeedFetchList>() =
      fetch_data;
Y
Yu Yang 已提交
309
}
Y
Yu Yang 已提交
310

Y
Yu Yang 已提交
311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329
// Feeds already-split inputs: tensors[i] maps variable names to the tensors
// destined for local scope i. Each target variable shares the data of the
// given tensor and takes over its LoD.
//
// The number of maps must equal the number of local scopes.
void ParallelExecutor::FeedTensorsIntoLocalScopes(
    const std::vector<std::unordered_map<std::string, LoDTensor>> &tensors) {
  PADDLE_ENFORCE_EQ(member_->local_scopes_.size(), tensors.size());

  for (size_t dev = 0; dev < tensors.size(); ++dev) {
    auto *dev_scope = member_->local_scopes_[dev];
    for (auto &named_tensor : tensors[dev]) {
      const auto &feed_name = named_tensor.first;
      const auto &feed_value = named_tensor.second;

      auto *target = dev_scope->Var(feed_name)->GetMutable<LoDTensor>();
      target->ShareDataWith(feed_value);
      target->set_lod(feed_value.lod());
    }
  }
}

void ParallelExecutor::FeedAndSplitTensorIntoLocalScopes(
    const std::unordered_map<std::string, LoDTensor> &tensors) {
  for (auto pair : tensors) {
    auto lod_tensors = pair.second.SplitLoDTensor(member_->places_);
330 331 332 333 334
    PADDLE_ENFORCE_EQ(
        member_->places_.size(), lod_tensors.size(),
        "The number of samples of current batch is less than the count of "
        "devices, currently, it is not allowed. (%d vs %d)",
        member_->places_.size(), lod_tensors.size());
X
Xin Pan 已提交
335 336
    for (size_t j = 0; j < member_->places_.size(); ++j) {
      // TODO(panxy0718): Do I need to delete this var?
337
      auto t =
Y
Yu Yang 已提交
338
          member_->local_scopes_[j]->Var(pair.first)->GetMutable<LoDTensor>();
339 340
      t->ShareDataWith(lod_tensors[j]);
      t->set_lod(lod_tensors[j].lod());
X
Xin Pan 已提交
341 342 343 344
    }
  }
}

345
// Deletes the child scopes this executor created under the global scope.
// Nothing is deleted when the local scopes were derived from caller-provided
// scopes. Index 0 is skipped in either case.
//
// NOTE(review): `member_` itself is not released here. If it is declared as a
// raw pointer in the header, the private state leaks -- confirm.
ParallelExecutor::~ParallelExecutor() {
  if (!member_->own_local_scope_) {
    return;
  }

  auto &scopes = member_->local_scopes_;
  for (size_t idx = 1; idx < scopes.size(); ++idx) {
    member_->global_scope_->DeleteScope(scopes[idx]);
  }
}

Y
Yu Yang 已提交
353
}  // namespace framework
Y
Yang Yang 已提交
354
}  // namespace paddle
X
Xin Pan 已提交
355 356

// "graph_viz_pass" is fetched by name from ir::PassRegistry in
// ApplyParallelExecutorPass; presumably USE_PASS makes sure its registration
// is linked into this translation unit -- confirm against the macro.
USE_PASS(graph_viz_pass);
X
Xin Pan 已提交
357 358 359
// The multi-device passes below are fetched by name from ir::PassRegistry in
// ApplyParallelExecutorPass; presumably USE_PASS makes sure their
// registrations are linked into this translation unit -- confirm against the
// macro.
USE_PASS(multi_devices_pass);
USE_PASS(multi_devices_check_pass);
USE_PASS(multi_devices_print_pass);