/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Copyright (c) 2022 NVIDIA Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/framework/details/build_strategy.h"

#include <glog/logging.h>

#include "paddle/fluid/framework/details/reduce_op_handle.h"
#include "paddle/fluid/framework/ir/graph_printer.h"
#include "paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.h"

DECLARE_bool(convert_all_blocks);
DECLARE_bool(use_mkldnn);
#ifdef PADDLE_WITH_CINN
DECLARE_bool(use_cinn);
#endif

namespace paddle {
namespace framework {
namespace details {

static inline bool SeqOnlyAllReduceOps(const BuildStrategy &strategy) {
  // Should fix the allreduce op order if scheduling
  // them in multiple threads or processes to avoid hang.
  // NOTE: ParallelGraph would execute this pass on each graph, so
  // don't need to append it here.
  return (!strategy.enable_sequential_execution_ &&
          strategy.num_trainers_ > 1) &&
         !strategy.enable_parallel_graph_;
}

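// Treats an unset tri-state option (paddle::none) as enabled by default.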
static inline void ConvertDefaultValue(paddle::optional<bool> *default_value) {
  if (*default_value == paddle::none) {
    *default_value = true;
  }
}

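// Assembles the ParallelExecutor pass pipeline described by a BuildStrategy:
// conflicting options are resolved first, then the passes are appended in a
// fixed order and verified by multi_devices_check_pass at the end.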
class ParallelExecutorPassBuilder : public ir::PassBuilder {
 public:
  explicit ParallelExecutorPassBuilder(const BuildStrategy &strategy)
      : ir::PassBuilder(), strategy_(strategy) {
    ResolveOptionConfliction();

    AppendPrintGraphPass("graph_viz_pass", "_original_graph");

#ifdef PADDLE_WITH_CINN
    if (FLAGS_use_cinn || strategy.build_cinn_pass_) {
      // Note: This pass is used to enable cinn.
      AppendPass("build_cinn_pass");
      AppendPrintGraphPass("graph_viz_pass", "_build_cinn_graph");
    }
#endif

    AppendPassWithCheck(strategy_.enable_sequential_execution_,
                        "sequential_execution_pass");
    AppendPassWithCheck(strategy_.sync_batch_norm_, "sync_batch_norm_pass");

    AppendOpFusePasses();
    AppendPrintGraphPass("graph_viz_pass", "_fused_graph");

    AppendAddReaderDependencyPass();
    AppendMultiDevPass();
    AppendMultiGraphOptPasses();

    AppendPassToSetMkldnnAttr("mkldnn_placement_pass");
    // runtime_context_cache pass should be the last pass, so that the attr
    // can be enabled on all original and fused operators. No operator can
    // have this attr enabled if the pass is placed after MultiDevPass.
    AppendPassWithCheck(strategy_.cache_runtime_context_,
                        "runtime_context_cache_pass");
    AppendPassWithCheck(strategy_.remove_unnecessary_lock_,
                        "modify_op_lock_and_record_event_pass");
    // Note: This pass is used to check whether the multi_device_graph is
    // right.
    AppendPass("multi_devices_check_pass");

    SetCollectiveContext();
  }

  void ResolveOptionConfliction() {
    // Specifies the restrictions between different passes.
    if (strategy_.enable_parallel_graph_) {
      LOG_IF(WARNING, strategy_.fuse_all_optimizer_ops_ == true)
          << "Currently, fuse_all_optimizer_ops doesn't work under "
             "parallel_graph.";
      strategy_.fuse_all_optimizer_ops_ = false;
      LOG_IF(WARNING, strategy_.fuse_all_reduce_ops_ == true)
          << "fuse_all_reduce_ops doesn't work under "
             "parallel_graph.";
      strategy_.fuse_all_reduce_ops_ = false;
    }
    if (strategy_.is_distribution_) {
      LOG_IF(WARNING, strategy_.fuse_all_optimizer_ops_ == true)
          << "Currently, fuse_all_optimizer_ops only works under "
             "Non-distributed mode.";
      strategy_.fuse_all_optimizer_ops_ = false;
      LOG_IF(WARNING, strategy_.fuse_all_reduce_ops_ == true)
          << "Currently, fuse_all_reduce_ops_ only works under "
             "Non-distributed mode.";
      strategy_.fuse_all_reduce_ops_ = false;
    }
    if (strategy_.reduce_ == BuildStrategy::ReduceStrategy::kReduce) {
      LOG_IF(WARNING, strategy_.fuse_all_optimizer_ops_ == true)
          << "Currently, fuse_all_optimizer_ops only works under AllReduce "
             "mode.";
      strategy_.fuse_all_optimizer_ops_ = false;
      LOG_IF(WARNING, strategy_.fuse_all_reduce_ops_ == true)
          << "fuse_all_reduce_ops only works under AllReduce "
             "mode.";
      strategy_.fuse_all_reduce_ops_ = false;
    }
    if (strategy_.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce) {
      LOG_IF(WARNING, strategy_.fuse_broadcast_ops_ == true)
          << "Currently, fuse_broadcast_ops only works under Reduce "
             "mode.";
      strategy_.fuse_broadcast_ops_ = false;
    }

    ConvertDefaultValue(&strategy_.fuse_all_optimizer_ops_);
    ConvertDefaultValue(&strategy_.fuse_all_reduce_ops_);
    ConvertDefaultValue(&strategy_.fuse_broadcast_ops_);

    if (strategy_.fuse_all_optimizer_ops_ == true) {
      LOG_IF(WARNING, strategy_.async_mode_)
          << "Currently, fuse_all_optimizer_ops doesn't work under "
             "async mode.";
      strategy_.fuse_all_optimizer_ops_ = !strategy_.async_mode_;
    }
    if (strategy_.fuse_all_reduce_ops_ == true) {
      LOG_IF(WARNING, strategy_.async_mode_)
          << "Currently, fuse_all_reduce_ops doesn't work under "
             "async mode.";
      strategy_.fuse_all_reduce_ops_ = !strategy_.async_mode_;
    }
  }

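  // Appends the passes that optimize the already-built multi-device graph:
  // all_reduce fusion, optional graph printing, and the dependency passes.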
  void AppendMultiGraphOptPasses() {
    // NOTE: fuse_all_reduce_ops will count the number of all_reduce operators
    // first; if the number is zero, fuse_all_reduce_ops will do nothing.
    AppendPassWithCheck(strategy_.fuse_all_reduce_ops_,
                        "fuse_all_reduce_op_pass");
    AppendPrintGraphPass("multi_devices_print_pass", "_multi_devices_graph");

    // Experiments show that the program runs faster if
    // all_reduce_deps_pass is appended here.
    bool append_all_reduce_deps_pass =
        !strategy_.enable_parallel_graph_ &&
        (SeqOnlyAllReduceOps(strategy_) ||
         strategy_.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce);
    AppendPassWithCheck(append_all_reduce_deps_pass, "all_reduce_deps_pass");

    bool append_backward_optimizer_op_deps_pass =
        strategy_.num_trainers_ > 1 && !strategy_.async_mode_ &&
        !strategy_.is_distribution_ &&
        strategy_.enable_backward_optimizer_op_deps_;
    AppendPassWithCheck(append_backward_optimizer_op_deps_pass,
                        "backward_optimizer_op_deps_pass");
  }

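  // Appends the operator-fusion passes: inference-only passes first, then the
  // training fusions, each guarded by the corresponding strategy option and
  // build flags.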
  void AppendOpFusePasses() {
    // 1. Inference passes, if enabled.
    AppendPassWithCheck(
        strategy_.enable_inference_pass_ && strategy_.delete_dropout_,
        "delete_dropout_op_x_pass");
    AppendPassWithCheck(
        strategy_.enable_inference_pass_ && strategy_.use_mkldnn_,
        "mkldnn_placement_pass");

    // 2. Training passes.
    AppendPassWithCheck(strategy_.fuse_relu_depthwise_conv_,
                        "fuse_relu_depthwise_conv_pass");
    AppendPassWithCheck(strategy_.fuse_bn_act_ops_, "fuse_bn_act_pass");
    AppendPassWithCheck(strategy_.fuse_bn_add_act_ops_, "fuse_bn_add_act_pass");
#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && \
    !defined(_WIN32) && !defined(__APPLE__)
    AppendPassWithCheck(strategy_.enable_auto_fusion_, "fusion_group_pass");
#endif

#if (defined(PADDLE_WITH_CUDA) && CUDA_VERSION >= 11060)
    AppendPassWithCheck(strategy_.fuse_gemm_epilogue_,
                        "fuse_gemm_epilogue_pass");
#endif
    AppendPassWithCheck(strategy_.fuse_elewise_add_act_ops_,
                        "fuse_elewise_add_act_pass");
    // For single-card training, fuse_all_reduce_ops is unnecessary.
    // coalesce_grad_tensor_pass should be placed before MultiDevPass.
    AppendPassWithCheck(strategy_.fuse_all_reduce_ops_,
                        "coalesce_grad_tensor_pass");
    // Fuse all the optimization operators.
    // NOTE: fuse_all_xx_ops will count the number of xx operators first;
    // if the number is zero, the pass will do nothing.
    // Currently, only one type of optimization algorithm can be fused.
    if (strategy_.fuse_all_optimizer_ops_ == true) {
      AppendPass("fuse_adam_op_pass");
      AppendPass("fuse_sgd_op_pass");
      AppendPass("fuse_momentum_op_pass");
    }
  }

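  // Publishes the trainer endpoints and trainer id to the process-wide
  // CollectiveContext singleton, validating the id against the endpoint list.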
  void SetCollectiveContext() const {
    CollectiveContext *context = CollectiveContext::GetInstance();
    context->endpoints_ = strategy_.trainers_endpoints_;
    context->trainer_id_ = strategy_.trainer_id_;
    PADDLE_ENFORCE_GE(
        strategy_.trainer_id_,
        0,
        platform::errors::InvalidArgument(
            "The trainer_id_ of strategy_ must be greater than or equal to 0, "
            "but received strategy_.trainer_id_ = %d.",
            strategy_.trainer_id_));

    if (strategy_.trainer_id_ > 0 && strategy_.trainers_endpoints_.size() > 0) {
      PADDLE_ENFORCE_LT(
          static_cast<size_t>(strategy_.trainer_id_),
          strategy_.trainers_endpoints_.size(),
          platform::errors::InvalidArgument(
              "The trainer_id_ of strategy_ must be less than the "
              "size of vector strategy_.trainers_endpoints_, "
              "but received strategy_.trainer_id_ = %d, "
              "the size of strategy_.trainers_endpoints_ is %d.",
              static_cast<size_t>(strategy_.trainer_id_),
              strategy_.trainers_endpoints_.size()));
    }
    VLOG(1) << "CollectiveContext:" << context->String();
  }

  void AppendAddReaderDependencyPass() {
    AppendPass("add_reader_dependency_pass");
  }

  // Convert graph to run on multi-devices.
  void AppendMultiDevPass() {
    ir::Pass *multi_devices_pass = nullptr;
    if (strategy_.async_mode_) {
      multi_devices_pass = AppendPass("async_multi_devices_pass").get();
    } else if (strategy_.is_distribution_) {
      multi_devices_pass = AppendPass("dist_multi_devices_pass").get();
    } else {
      switch (strategy_.reduce_) {
        case BuildStrategy::ReduceStrategy::kAllReduce:
          multi_devices_pass =
              AppendPass("all_reduce_mode_multi_devices_pass").get();
          break;
        case BuildStrategy::ReduceStrategy::kReduce:
          multi_devices_pass =
              AppendPass("reduce_mode_multi_devices_pass").get();
          break;
        case BuildStrategy::ReduceStrategy::kNoReduce:
          multi_devices_pass = AppendPass("no_reduce_multi_devices_pass").get();
          break;
        default:
          PADDLE_THROW(
              platform::errors::Unimplemented("Unknown reduce strategy."));
      }
    }
    multi_devices_pass->SetNotOwned<const BuildStrategy>("strategy",
                                                         &strategy_);
  }

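  // Appends the given graph printing pass only when debug_graphviz_path_ is
  // set; the dump file name is the configured path plus the given suffix.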
  void AppendPrintGraphPass(const std::string &pass_name,
                            const std::string &debug_file_suffix) {
    if (!strategy_.debug_graphviz_path_.empty()) {
      auto viz_pass = AppendPass(pass_name);
      const std::string graph_path = string::Sprintf(
          "%s%s", strategy_.debug_graphviz_path_.c_str(), debug_file_suffix);
      viz_pass->Set<std::string>(ir::kGraphvizPath,
                                 new std::string(graph_path));
    }
  }

  void AppendPassWithCheck(const paddle::optional<bool> &append_pass,
                           const std::string &pass_name) {
    AppendPassWithCheck(append_pass == true, pass_name);
  }

  void AppendPassWithCheck(bool append_pass, const std::string &pass_name) {
    if (append_pass) {
      AppendPass(pass_name);
    }
  }

  void AppendPassToSetMkldnnAttr(const std::string &pass_name) {
#ifdef PADDLE_WITH_MKLDNN
    if (FLAGS_use_mkldnn) {
      AppendPass(pass_name);
    } else if (!strategy_.mkldnn_enabled_op_types_.empty()) {
      VLOG(1) << "mkldnn_enabled_op_types specifies the list of operator "
                 "types that use MKLDNN acceleration. It is empty by "
                 "default, which means that all operators supported by "
                 "MKLDNN will be accelerated. It should not be set when "
                 "FLAGS_use_mkldnn=false.";
    }
#else
    PADDLE_ENFORCE_NE(FLAGS_use_mkldnn,
                      true,
                      platform::errors::PreconditionNotMet(
                          "FLAGS_use_mkldnn has been set to True, but "
                          "PaddlePaddle is compiled without MKLDNN. "
                          "Please compile PaddlePaddle with MKLDNN first."));
#endif
  }

 private:
  BuildStrategy strategy_;
};

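// Builds (or returns the cached) pass pipeline for this strategy. Once
// finalize_strategy is passed as true, is_finalized_ is set and later calls
// return the same builder. Usage sketch (illustrative only):
//   BuildStrategy strategy;
//   auto builder =
//       strategy.CreatePassesFromStrategy(/*finalize_strategy=*/true);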
std::shared_ptr<ir::PassBuilder> BuildStrategy::CreatePassesFromStrategy(
    bool finalize_strategy) const {
  if (is_finalized_) {
    return pass_builder_;
  }
  pass_builder_.reset(new ParallelExecutorPassBuilder(*this));
  if (finalize_strategy) {
    is_finalized_ = true;
  }
  return pass_builder_;
}

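// Returns true if pass_name is one of the registered multi-device graph
// builder passes.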
bool BuildStrategy::IsMultiDevPass(const std::string &pass_name) const {
  return framework::ir::MultiDevSSAGraphBuilder().count(pass_name) > 0;
}

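// Applies every pass in the pipeline to the graph, wiring the per-pass
// attributes (places, scopes, nranks, communicator contexts) before each
// Apply call. Call-site sketch (illustrative; `places`, `scopes`, and
// `nranks` are assumed to be prepared by the caller, and the trailing
// arguments depend on the build flags below):
//   graph = strategy.Apply(graph, places, loss_var_name, scopes, nranks,
//                          use_device /*, nccl_ctxs or bkcl_ctxs */);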
ir::Graph *BuildStrategy::Apply(ir::Graph *graph,
                                const std::vector<platform::Place> &places,
                                const std::string &loss_var_name,
                                const std::vector<Scope *> &local_scopes,
                                const size_t &nranks,
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
                                DeviceType use_device,
                                platform::NCCLCommunicator *nccl_ctxs) const {
#elif defined(PADDLE_WITH_XPU) && defined(PADDLE_WITH_XPU_BKCL)
                                DeviceType use_device,
                                platform::BKCLCommunicator *bkcl_ctxs) const {
#else
                                DeviceType use_device) const {
#endif
  VLOG(1) << "apply all passes";
  if (FLAGS_convert_all_blocks) {
    PADDLE_ENFORCE_EQ(
        graph->IsMainGraph(),
        true,
        platform::errors::InvalidArgument("This graph is not main_graph"));
  }
  // Create a default one if not finalized by user.
  CreatePassesFromStrategy(false);

  for (std::shared_ptr<ir::Pass> &pass : pass_builder_->AllPasses()) {
    VLOG(1) << "BuildStrategy::Apply pass:" << pass->Type();
    if (IsMultiDevPass(pass->Type())) {
      pass->Erase(kPlaces);
      pass->SetNotOwned<const std::vector<platform::Place>>(kPlaces, &places);
      pass->Erase(ir::kLossVarName);
      pass->SetNotOwned<const std::string>(ir::kLossVarName, &loss_var_name);
      pass->Erase(kLocalScopes);
      pass->SetNotOwned<const std::vector<Scope *>>(kLocalScopes,
                                                    &local_scopes);
      pass->Erase(kNRanks);
      pass->Set<size_t>(kNRanks, new size_t(nranks));

#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
      platform::NCCLCommunicator *nctx =
          (use_device == p::kCUDA) ? nccl_ctxs : nullptr;
      pass->Erase(kNCCLCtxs);
      pass->SetNotOwned<platform::NCCLCommunicator>(kNCCLCtxs, nctx);
#elif defined(PADDLE_WITH_XPU) && defined(PADDLE_WITH_XPU_BKCL)
      // TODO: add more checks.
      platform::BKCLCommunicator *bkcl_ctx =
          (use_device == p::kXPU) ? bkcl_ctxs : nullptr;
      pass->Erase(kBKCLCtxs);
      pass->SetNotOwned<platform::BKCLCommunicator>(kBKCLCtxs, bkcl_ctx);
#endif
    } else if (pass->Type() == "fuse_all_reduce_op_pass") {
      pass->Erase(kNRanks);
      pass->Set<size_t>(kNRanks, new size_t(nranks));
      pass->Erase(kPlaces);
      pass->SetNotOwned<const std::vector<platform::Place>>(kPlaces, &places);
      pass->Erase(kLocalScopes);
      pass->SetNotOwned<const std::vector<Scope *>>(kLocalScopes,
                                                    &local_scopes);
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
      platform::NCCLCommunicator *nctx =
          (use_device == p::kCUDA) ? nccl_ctxs : nullptr;
      pass->Erase(kNCCLCtxs);
      pass->SetNotOwned<platform::NCCLCommunicator>(kNCCLCtxs, nctx);
      pass->Erase(kUseHierarchicalAllReduce);
      pass->Set<bool>(kUseHierarchicalAllReduce,
                      new bool(use_hierarchical_allreduce_));
#elif defined(PADDLE_WITH_XPU) && defined(PADDLE_WITH_XPU_BKCL)
      platform::BKCLCommunicator *nctx =
          (use_device == p::kXPU) ? bkcl_ctxs : nullptr;
      pass->Erase(kBKCLCtxs);
      pass->SetNotOwned<platform::BKCLCommunicator>(kBKCLCtxs, nctx);
      pass->Erase(kUseHierarchicalAllReduce);
      PADDLE_ENFORCE_EQ(use_hierarchical_allreduce_,
                        false,
                        platform::errors::Unimplemented(
                            "xpu doesn't support hierarchical_allreduce"));
      pass->Set<bool>(kUseHierarchicalAllReduce,
                      new bool(use_hierarchical_allreduce_));
#endif
    } else if (pass->Type() == "coalesce_grad_tensor_pass") {
      pass->Erase(kNRanks);
      pass->Set<size_t>(kNRanks, new size_t(nranks));
    } else if (pass->Type() == "sequential_execution_pass") {
      LOG(INFO) << "set enable_sequential_execution:"
                << enable_sequential_execution_;
    } else if (pass->Type() == "all_reduce_deps_pass") {
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
      platform::NCCLCommunicator *nctx =
          (use_device == p::kCUDA) ? nccl_ctxs : nullptr;
      pass->Erase(kNCCLCtxs);
      pass->SetNotOwned<platform::NCCLCommunicator>(kNCCLCtxs, nctx);
      pass->Erase(kUseHierarchicalAllReduce);
      pass->Set<bool>(kUseHierarchicalAllReduce,
                      new bool(use_hierarchical_allreduce_));
#elif defined(PADDLE_WITH_XPU) && defined(PADDLE_WITH_XPU_BKCL)
      platform::BKCLCommunicator *nctx =
          (use_device == p::kXPU) ? bkcl_ctxs : nullptr;
      pass->Erase(kBKCLCtxs);
      pass->SetNotOwned<platform::BKCLCommunicator>(kBKCLCtxs, nctx);
      pass->Erase(kUseHierarchicalAllReduce);
      PADDLE_ENFORCE_EQ(use_hierarchical_allreduce_,
                        false,
                        platform::errors::Unimplemented(
                            "xpu doesn't support hierarchical_allreduce"));
      pass->Set<bool>(kUseHierarchicalAllReduce,
                      new bool(use_hierarchical_allreduce_));
#endif
      VLOG(1) << "SeqOnlyAllReduceOps:" << SeqOnlyAllReduceOps(*this)
              << ", num_trainers:" << num_trainers_;
    } else if (pass->Type() == "fuse_relu_depthwise_conv_pass") {
      if (use_device != p::kCUDA) {
        VLOG(1) << "fuse_relu_depthwise_conv_pass is only supported on "
                   "GPU, skipped.";
        continue;
      }
    } else if (pass->Type() == "fusion_group_pass") {
      pass->Set<bool>("use_gpu", new bool((use_device == p::kCUDA)));
      if (use_device != p::kCUDA) {
        VLOG(1) << "fusion_group_pass is only supported on GPU, skipped.";
        continue;
      }
    } else if (pass->Type() == "fuse_bn_act_pass") {
      if (use_device != p::kCUDA) {
        VLOG(1) << "fuse_bn_act_pass is only supported on "
                   "GPU, skipped.";
        continue;
      }
    } else if (pass->Type() == "fuse_bn_add_act_pass") {
      if (use_device != p::kCUDA) {
        VLOG(1) << "fuse_bn_add_act_pass is only supported on "
                   "GPU, skipped.";
        continue;
      }
    } else if (pass->Type() == "mkldnn_placement_pass") {
      pass->Set("mkldnn_enabled_op_types",
                new std::unordered_set<std::string>(mkldnn_enabled_op_types_));
    } else if (pass->Type() == "backward_optimizer_op_deps_pass") {
      if (use_device != p::kCUDA) {
        VLOG(1) << "backward_optimizer_op_deps_pass is only supported on "
                   "GPU, skipped.";
        continue;
      }
    }
    VLOG(1) << "Start Apply Pass " << pass->Type();
    if (FLAGS_convert_all_blocks) {
      for (size_t i = 0; i < graph->SubGraphsSize(); ++i) {
        VLOG(3) << "Apply Pass " << pass->Type() << " to SubGraph " << i;
        pass->Apply(graph->GetSubGraph(i));
      }
    } else {
      graph = pass->Apply(graph);
    }
    VLOG(1) << "Finish Apply Pass " << pass->Type();
  }
  VLOG(1) << "All Passes Applied";
  return graph;
}

}  // namespace details
}  // namespace framework
}  // namespace paddle

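// Reference each pass so the static pass registry keeps these symbols linked
// into this translation unit.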
USE_PASS(sync_batch_norm_pass);
USE_PASS(fuse_relu_depthwise_conv_pass);
USE_PASS(fuse_elewise_add_act_pass);
USE_PASS(fuse_bn_act_pass);
USE_PASS(fuse_bn_add_act_pass);
USE_PASS(graph_viz_pass);
USE_PASS(multi_batch_merge_pass);
USE_PASS(no_reduce_multi_devices_pass);
USE_PASS(reduce_mode_multi_devices_pass);
USE_PASS(all_reduce_mode_multi_devices_pass);
USE_PASS(dist_multi_devices_pass);
USE_PASS(multi_devices_check_pass);
USE_PASS(multi_devices_print_pass);
USE_PASS(sequential_execution_pass);
USE_PASS(all_reduce_deps_pass);
USE_PASS(backward_optimizer_op_deps_pass);
USE_PASS(modify_op_lock_and_record_event_pass);
USE_PASS(lock_free_optimize_pass);
USE_PASS(coalesce_grad_tensor_pass);
USE_PASS(graph_to_program_pass);
USE_PASS(fuse_adam_op_pass);
USE_PASS(fuse_sgd_op_pass);
USE_PASS(fuse_momentum_op_pass);
USE_PASS(fuse_all_reduce_op_pass);
USE_PASS(runtime_context_cache_pass);
USE_PASS(add_reader_dependency_pass);
USE_PASS(delete_dropout_op_x_pass);
#ifdef PADDLE_WITH_CINN
USE_PASS(build_cinn_pass);
#endif
#ifdef PADDLE_WITH_MKLDNN
USE_PASS(mkldnn_placement_pass);
#endif
#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && \
    !defined(_WIN32) && !defined(__APPLE__)
USE_PASS(fusion_group_pass);
#endif
#if (defined(PADDLE_WITH_CUDA) && CUDA_VERSION >= 11060)
USE_PASS(fuse_gemm_epilogue_pass);
#endif