build_strategy.cc 21.5 KB
Newer Older
1
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
2
Copyright (c) 2022 NVIDIA Authors. All Rights Reserved.
3 4 5 6 7 8 9 10 11 12 13 14 15 16 17

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/framework/details/build_strategy.h"

D
dzhwinter 已提交
18
#include <glog/logging.h>
19

20
#include "paddle/fluid/framework/details/reduce_op_handle.h"
C
chengduo 已提交
21
#include "paddle/fluid/framework/ir/graph_printer.h"
22
#include "paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.h"
23

24
DECLARE_bool(convert_all_blocks);
25
DECLARE_bool(use_mkldnn);
26 27 28
#ifdef PADDLE_WITH_CINN
DECLARE_bool(use_cinn);
#endif
29

30 31 32 33
namespace paddle {
namespace framework {
namespace details {

34
static inline bool SeqOnlyAllReduceOps(const BuildStrategy &strategy) {
Y
Yancey1989 已提交
35 36
  // Should fix the allreduce op order if scheduling
  // them in multiple threads or processes to avoid hang.
Y
Yancey1989 已提交
37
  // NOTE: ParallelGraph would execute this pass on each graph, so
Y
Yancey1989 已提交
38
  // don't need to append it here.
Y
Yancey1989 已提交
39
  return (!strategy.enable_sequential_execution_ &&
Y
Yancey1989 已提交
40 41
          strategy.num_trainers_ > 1) &&
         !strategy.enable_parallel_graph_;
42 43
}

44 45
// Collapses an unset tri-state option to its default: an option the user
// never touched (paddle::none) is treated as enabled (true).
static inline void ConvertDefaultValue(paddle::optional<bool> *default_value) {
  const bool is_unset = (*default_value == paddle::none);
  if (is_unset) {
    *default_value = true;
  }
}

50 51 52 53
// Builds the ordered pass pipeline implied by a BuildStrategy.
// The constructor appends every pass in a fixed order; that order is load-
// bearing (e.g. runtime_context_cache_pass must come before the multi-device
// passes, multi_devices_check_pass must come last), so do not reorder calls.
class ParallelExecutorPassBuilder : public ir::PassBuilder {
 public:
  explicit ParallelExecutorPassBuilder(const BuildStrategy &strategy)
      : ir::PassBuilder(), strategy_(strategy) {
    // Reconcile mutually-exclusive strategy options before any pass is added.
    ResolveOptionConfliction();

    AppendPrintGraphPass("graph_viz_pass", "_original_graph");

#ifdef PADDLE_WITH_CINN
    if (FLAGS_use_cinn || strategy.build_cinn_pass_) {
      // Note: This pass is used to enable cinn.
      AppendPass("build_cinn_pass");
      AppendPrintGraphPass("graph_viz_pass", "_build_cinn_graph");
    }
#endif

    AppendPassWithCheck(strategy_.enable_sequential_execution_,
                        "sequential_execution_pass");
    AppendPassWithCheck(strategy_.sync_batch_norm_, "sync_batch_norm_pass");

    AppendOpFusePasses();
    AppendPrintGraphPass("graph_viz_pass", "_fused_graph");

    AppendAddReaderDependencyPass();
    AppendMultiDevPass();
    AppendMultiGraphOptPasses();

    AppendPassToSetMkldnnAttr("mkldnn_placement_pass");
    // runtime_context_cache pass should be the last pass to enable the attr of
    // all original and fused operators. But no operators can be enabled this
    // attr if putting it after MultiDevPass.
    AppendPassWithCheck(strategy_.cache_runtime_context_,
                        "runtime_context_cache_pass");
    AppendPassWithCheck(strategy_.remove_unnecessary_lock_,
                        "modify_op_lock_and_record_event_pass");
    // Note: This pass is used to check whether the multi_device_graph is right.
    AppendPass("multi_devices_check_pass");

    SetCollectiveContext();
  }

  // Disables strategy options that conflict with each other, warning when a
  // user-enabled option has to be turned off, and resolves unset (paddle::none)
  // fuse options to their defaults.
  void ResolveOptionConfliction() {
    // Specifies the restrictions between different pass.
    if (strategy_.enable_parallel_graph_) {
      LOG_IF(WARNING, strategy_.fuse_all_optimizer_ops_ == true)
          << "Currently, fuse_all_optimizer_ops doesn't work under "
             "parallel_graph.";
      strategy_.fuse_all_optimizer_ops_ = false;
      LOG_IF(WARNING, strategy_.fuse_all_reduce_ops_ == true)
          << "fuse_all_reduce_ops doesn't work under "
             "parallel_graph.";
      strategy_.fuse_all_reduce_ops_ = false;
    }
    if (strategy_.is_distribution_) {
      LOG_IF(WARNING, strategy_.fuse_all_optimizer_ops_ == true)
          << "Currently, fuse_all_optimizer_ops only works under "
             "Non-distributed mode.";
      strategy_.fuse_all_optimizer_ops_ = false;
      LOG_IF(WARNING, strategy_.fuse_all_reduce_ops_ == true)
          << "Currently, fuse_all_reduce_ops_ only works under "
             "Non-distributed mode.";
      strategy_.fuse_all_reduce_ops_ = false;
    }
    if (strategy_.reduce_ == BuildStrategy::ReduceStrategy::kReduce) {
      LOG_IF(WARNING, strategy_.fuse_all_optimizer_ops_ == true)
          << "Currently, fuse_all_optimizer_ops only works under AllReduce "
             "mode.";
      strategy_.fuse_all_optimizer_ops_ = false;
      // NOTE(review): this message names "fuse_all_optimizer_ops" but the
      // guarded flag is fuse_all_reduce_ops_ — looks like a copy-paste in the
      // log text; confirm before changing.
      LOG_IF(WARNING, strategy_.fuse_all_reduce_ops_ == true)
          << "fuse_all_optimizer_ops only works under AllReduce "
             "mode.";
      strategy_.fuse_all_reduce_ops_ = false;
    }
    if (strategy_.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce) {
      LOG_IF(WARNING, strategy_.fuse_broadcast_ops_ == true)
          << "Currently, fuse_broadcast_ops only works under Reduce "
             "mode.";
      strategy_.fuse_broadcast_ops_ = false;
    }

    // Unset fuse options default to enabled (see ConvertDefaultValue).
    ConvertDefaultValue(&strategy_.fuse_all_optimizer_ops_);
    ConvertDefaultValue(&strategy_.fuse_all_reduce_ops_);
    ConvertDefaultValue(&strategy_.fuse_broadcast_ops_);

    if (strategy_.fuse_all_optimizer_ops_ == true) {
      LOG_IF(WARNING, strategy_.async_mode_)
          << "Currently, fuse_all_optimizer_ops doesn't work under "
             "async mode.";
      strategy_.fuse_all_optimizer_ops_ = !strategy_.async_mode_;
    }
    if (strategy_.fuse_all_reduce_ops_ == true) {
      LOG_IF(WARNING, strategy_.async_mode_)
          << "Currently, fuse_all_reduce_ops doesn't work under "
             "async mode.";
      strategy_.fuse_all_reduce_ops_ = !strategy_.async_mode_;
    }
  }

  // Appends passes that optimize the already-built multi-device graph
  // (all_reduce fusion, debug printing, dependency-ordering passes).
  void AppendMultiGraphOptPasses() {
    // NOTE: fuse_all_reduce_ops will count the number of all_reduce operator
    // first, if the number is zero, fuse_all_reduce_ops will do nothing.
    AppendPassWithCheck(strategy_.fuse_all_reduce_ops_,
                        "fuse_all_reduce_op_pass");
    AppendPrintGraphPass("multi_devices_print_pass", "_multi_devices_graph");

    // experimental shows that the program will be faster if append
    // all_reduce_deps_pass here.
    bool append_all_reduce_deps_pass =
        !strategy_.enable_parallel_graph_ &&
        (SeqOnlyAllReduceOps(strategy_) ||
         strategy_.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce);
    AppendPassWithCheck(append_all_reduce_deps_pass, "all_reduce_deps_pass");

    bool append_backward_optimizer_op_deps_pass =
        strategy_.num_trainers_ > 1 && !strategy_.async_mode_ &&
        !strategy_.is_distribution_ &&
        strategy_.enable_backward_optimizer_op_deps_;
    AppendPassWithCheck(append_backward_optimizer_op_deps_pass,
                        "backward_optimizer_op_deps_pass");
  }

  // Appends operator-fusion passes: inference-only passes first, then the
  // training fusions, then (if enabled) the optimizer-op fusions.
  void AppendOpFusePasses() {
    // 1. infernce pass if enabled.
    AppendPassWithCheck(
        strategy_.enable_inference_pass_ && strategy_.delete_dropout_,
        "delete_dropout_op_x_pass");
    AppendPassWithCheck(
        strategy_.enable_inference_pass_ && strategy_.use_mkldnn_,
        "mkldnn_placement_pass");

    // 2. trainning pass
    AppendPassWithCheck(strategy_.fuse_relu_depthwise_conv_,
                        "fuse_relu_depthwise_conv_pass");
    AppendPassWithCheck(strategy_.fuse_bn_act_ops_, "fuse_bn_act_pass");
    AppendPassWithCheck(strategy_.fuse_bn_add_act_ops_, "fuse_bn_add_act_pass");
#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && \
    !defined(_WIN32) && !defined(__APPLE__)
    AppendPassWithCheck(strategy_.enable_auto_fusion_, "fusion_group_pass");
#endif

#ifdef PADDLE_WITH_CUDA
    AppendPassWithCheck(strategy_.fused_attention_, "fused_attention_pass");
#endif

#if (defined(PADDLE_WITH_CUDA) && CUDA_VERSION >= 11060)
    AppendPassWithCheck(strategy_.fuse_gemm_epilogue_,
                        "fuse_gemm_epilogue_pass");
#endif
    AppendPassWithCheck(strategy_.fuse_elewise_add_act_ops_,
                        "fuse_elewise_add_act_pass");
    // for single card training, fuse_all_reduce_ops is unnecessary.
    // coalesce_grad_tensor_pass should be before of MultiDevPass.
    AppendPassWithCheck(strategy_.fuse_all_reduce_ops_,
                        "coalesce_grad_tensor_pass");
    // Fuse all the optimization operators.
    // NOTE: fuse_all_xx_ops will count the number of xx operator first,
    // if the number is zero, fuse_all_reduce_ops will do nothing.
    // Currently, only one type of optimization algorithm can be fused.
    if (strategy_.fuse_all_optimizer_ops_ == true) {
      AppendPass("fuse_adam_op_pass");
      AppendPass("fuse_sgd_op_pass");
      AppendPass("fuse_momentum_op_pass");
    }
  }

  // Publishes trainer endpoints/id from the strategy into the process-wide
  // CollectiveContext singleton, validating trainer_id_ against the endpoint
  // list.
  void SetCollectiveContext() const {
    CollectiveContext *context = CollectiveContext::GetInstance();
    context->endpoints_ = strategy_.trainers_endpoints_;
    context->trainer_id_ = strategy_.trainer_id_;
    PADDLE_ENFORCE_GE(
        strategy_.trainer_id_,
        0,
        platform::errors::InvalidArgument(
            "The trainer_id_ of strategy_ must be greater than or equal to 0, "
            "but received strategy_.trainer_id_ = %d.",
            strategy_.trainer_id_));

    if (strategy_.trainer_id_ > 0 && strategy_.trainers_endpoints_.size() > 0) {
      PADDLE_ENFORCE_LT(
          static_cast<size_t>(strategy_.trainer_id_),
          strategy_.trainers_endpoints_.size(),
          platform::errors::InvalidArgument(
              "The trainer_id_ of strategy_ must be less than the "
              "size of vector strategy_.trainers_endpoints_, "
              "but received strategy_.trainer_id_ = %d, "
              "the size of strategy_.trainers_endpoints_ is %d.",
              static_cast<size_t>(strategy_.trainer_id_),
              strategy_.trainers_endpoints_.size()));
    }
    VLOG(1) << "CollectiveContext:" << context->String();
  }

  // Ensures reader ops are ordered before their consumers.
  void AppendAddReaderDependencyPass() {
    AppendPass("add_reader_dependency_pass");
  }

  // Convert graph to run on multi-devices.
  // Selects the multi-device conversion pass from the strategy's mode
  // (async / distribution / reduce strategy) and hands it the strategy.
  void AppendMultiDevPass() {
    ir::Pass *multi_devices_pass = nullptr;
    if (strategy_.async_mode_) {
      multi_devices_pass = AppendPass("async_multi_devices_pass").get();
    } else if (strategy_.is_distribution_) {
      multi_devices_pass = AppendPass("dist_multi_devices_pass").get();
    } else {
      switch (strategy_.reduce_) {
        case BuildStrategy::ReduceStrategy::kAllReduce:
          multi_devices_pass =
              AppendPass("all_reduce_mode_multi_devices_pass").get();
          break;
        case BuildStrategy::ReduceStrategy::kReduce:
          multi_devices_pass =
              AppendPass("reduce_mode_multi_devices_pass").get();
          break;
        case BuildStrategy::ReduceStrategy::kNoReduce:
          multi_devices_pass = AppendPass("no_reduce_multi_devices_pass").get();
          break;
        default:
          PADDLE_THROW(
              platform::errors::Unimplemented("Unknown reduce strategy."));
      }
    }
    multi_devices_pass->SetNotOwned<const BuildStrategy>("strategy",
                                                         &strategy_);
  }

  // Appends a graphviz-dump pass writing to debug_graphviz_path_ + suffix;
  // no-op when no debug path was configured.
  void AppendPrintGraphPass(const std::string &pass_name,
                            const std::string &debug_file_suffix) {
    if (!strategy_.debug_graphviz_path_.empty()) {
      auto viz_pass = AppendPass(pass_name);
      const std::string graph_path = string::Sprintf(
          "%s%s", strategy_.debug_graphviz_path_.c_str(), debug_file_suffix);
      viz_pass->Set<std::string>(ir::kGraphvizPath,
                                 new std::string(graph_path));
    }
  }

  // Tri-state overload: paddle::none is treated as "do not append".
  void AppendPassWithCheck(const paddle::optional<bool> &append_pass,
                           const std::string &pass_name) {
    AppendPassWithCheck(append_pass == true, pass_name);
  }

  void AppendPassWithCheck(bool append_pass, const std::string &pass_name) {
    if (append_pass) {
      AppendPass(pass_name);
    }
  }

  // Appends the MKLDNN placement pass when MKLDNN is compiled in and enabled;
  // errors out if FLAGS_use_mkldnn is set in a non-MKLDNN build.
  void AppendPassToSetMkldnnAttr(const std::string &pass_name) {
#ifdef PADDLE_WITH_MKLDNN
    if (FLAGS_use_mkldnn) {
      AppendPass(pass_name);
    } else if (!strategy_.mkldnn_enabled_op_types_.empty()) {
      VLOG(1) << "mkldnn_enabled_op_types specify the operator type list to "
                 "use MKLDNN acceleration. It is null in default, means "
                 "that all the operators supported by MKLDNN will be "
                 "accelerated. And it should not be set when "
                 "FLAGS_use_mkldnn=false.";
    }
#else
    PADDLE_ENFORCE_NE(FLAGS_use_mkldnn,
                      true,
                      platform::errors::PreconditionNotMet(
                          "FLAGS_use_mkldnn has been set to True, but "
                          "PaddlePaddle is compiled without MKLDNN. "
                          "Please compile PaddlePaddle with MKLDNN first."));
#endif
  }

 private:
  // Local mutable copy — ResolveOptionConfliction edits it freely without
  // touching the caller's BuildStrategy.
  BuildStrategy strategy_;
};

322
std::shared_ptr<ir::PassBuilder> BuildStrategy::CreatePassesFromStrategy(
X
Xin Pan 已提交
323 324
    bool finalize_strategy) const {
  if (is_finalized_) {
325 326
    return pass_builder_;
  }
327
  pass_builder_.reset(new ParallelExecutorPassBuilder(*this));
X
Xin Pan 已提交
328 329
  if (finalize_strategy) {
    is_finalized_ = true;
330
  }
X
fix  
Xin Pan 已提交
331
  return pass_builder_;
332 333
}

334
bool BuildStrategy::IsMultiDevPass(const std::string &pass_name) const {
335
  return framework::ir::MultiDevSSAGraphBuilder().count(pass_name) > 0;
336 337
}

338 339 340 341 342
// Runs every pass of the (lazily created) pipeline over `graph`, wiring the
// pass-specific attributes (places, scopes, nranks, communicator contexts)
// into each pass before applying it. The trailing parameters differ by build:
// NCCL/RCCL builds take an NCCLCommunicator, XPU+BKCL builds a
// BKCLCommunicator, CPU-only builds neither. Returns the (possibly replaced)
// graph.
ir::Graph *BuildStrategy::Apply(ir::Graph *graph,
                                const std::vector<platform::Place> &places,
                                const std::string &loss_var_name,
                                const std::vector<Scope *> &local_scopes,
                                const size_t &nranks,
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
                                DeviceType use_device,
                                platform::NCCLCommunicator *nccl_ctxs) const {
#elif defined(PADDLE_WITH_XPU) && defined(PADDLE_WITH_XPU_BKCL)
                                DeviceType use_device,
                                platform::BKCLCommunicator *bkcl_ctxs) const {
#else
                                DeviceType use_device) const {
#endif
  VLOG(1) << "apply all passes";
  if (FLAGS_convert_all_blocks) {
    PADDLE_ENFORCE_EQ(
        graph->IsMainGraph(),
        true,
        platform::errors::InvalidArgument("This graph is not main_graph"));
  }
  // Create a default one if not finalized by user.
  CreatePassesFromStrategy(false);

  for (std::shared_ptr<ir::Pass> &pass : pass_builder_->AllPasses()) {
    VLOG(1) << "BuildStrategy::Apply pass:" << pass->Type();
    if (IsMultiDevPass(pass->Type())) {
      // Multi-device conversion passes need the full placement context.
      // Erase-before-Set clears any attribute left by a previous Apply.
      pass->Erase(kPlaces);
      pass->SetNotOwned<const std::vector<platform::Place>>(kPlaces, &places);
      pass->Erase(ir::kLossVarName);
      pass->SetNotOwned<const std::string>(ir::kLossVarName, &loss_var_name);
      pass->Erase(kLocalScopes);
      pass->SetNotOwned<const std::vector<Scope *>>(kLocalScopes,
                                                    &local_scopes);
      pass->Erase(kNRanks);
      pass->Set<size_t>(kNRanks, new size_t(nranks));

#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
      // Communicator is only handed over when actually running on that device.
      platform::NCCLCommunicator *nctx =
          (use_device == p::kCUDA) ? nccl_ctxs : nullptr;
      pass->Erase(kNCCLCtxs);
      pass->SetNotOwned<platform::NCCLCommunicator>(kNCCLCtxs, nctx);
#elif defined(PADDLE_WITH_XPU) && defined(PADDLE_WITH_XPU_BKCL)
      // ToDo: more check
      platform::BKCLCommunicator *bkcl_ctx =
          (use_device == p::kXPU) ? bkcl_ctxs : nullptr;
      pass->Erase(kBKCLCtxs);
      pass->SetNotOwned<platform::BKCLCommunicator>(kBKCLCtxs, bkcl_ctx);
#endif
    } else if (pass->Type() == "fuse_all_reduce_op_pass") {
      pass->Erase(kNRanks);
      pass->Set<size_t>(kNRanks, new size_t(nranks));
      pass->Erase(kPlaces);
      pass->SetNotOwned<const std::vector<platform::Place>>(kPlaces, &places);
      pass->Erase(kLocalScopes);
      pass->SetNotOwned<const std::vector<Scope *>>(kLocalScopes,
                                                    &local_scopes);
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
      platform::NCCLCommunicator *nctx =
          (use_device == p::kCUDA) ? nccl_ctxs : nullptr;
      pass->Erase(kNCCLCtxs);
      pass->SetNotOwned<platform::NCCLCommunicator>(kNCCLCtxs, nctx);
      pass->Erase(kUseHierarchicalAllReduce);
      pass->Set<bool>(kUseHierarchicalAllReduce,
                      new bool(use_hierarchical_allreduce_));
#elif defined(PADDLE_WITH_XPU) && defined(PADDLE_WITH_XPU_BKCL)
      platform::BKCLCommunicator *nctx =
          (use_device == p::kXPU) ? bkcl_ctxs : nullptr;
      pass->Erase(kBKCLCtxs);
      pass->SetNotOwned<platform::BKCLCommunicator>(kBKCLCtxs, nctx);
      pass->Erase(kUseHierarchicalAllReduce);
      PADDLE_ENFORCE_EQ(use_hierarchical_allreduce_,
                        false,
                        platform::errors::Unimplemented(
                            "xpu doesn't support hierarchical_allreduce"));
      pass->Set<bool>(kUseHierarchicalAllReduce,
                      new bool(use_hierarchical_allreduce_));
#endif
    } else if (pass->Type() == "coalesce_grad_tensor_pass") {
      pass->Erase(kNRanks);
      pass->Set<size_t>(kNRanks, new size_t(nranks));
    } else if (pass->Type() == "sequential_execution_pass") {
      LOG(INFO) << "set enable_sequential_execution:"
                << enable_sequential_execution_;
    } else if (pass->Type() == "all_reduce_deps_pass") {
      // Same communicator wiring as fuse_all_reduce_op_pass above.
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
      platform::NCCLCommunicator *nctx =
          (use_device == p::kCUDA) ? nccl_ctxs : nullptr;
      pass->Erase(kNCCLCtxs);
      pass->SetNotOwned<platform::NCCLCommunicator>(kNCCLCtxs, nctx);
      pass->Erase(kUseHierarchicalAllReduce);
      pass->Set<bool>(kUseHierarchicalAllReduce,
                      new bool(use_hierarchical_allreduce_));
#elif defined(PADDLE_WITH_XPU) && defined(PADDLE_WITH_XPU_BKCL)
      platform::BKCLCommunicator *nctx =
          (use_device == p::kXPU) ? bkcl_ctxs : nullptr;
      pass->Erase(kBKCLCtxs);
      pass->SetNotOwned<platform::BKCLCommunicator>(kBKCLCtxs, nctx);
      pass->Erase(kUseHierarchicalAllReduce);
      PADDLE_ENFORCE_EQ(use_hierarchical_allreduce_,
                        false,
                        platform::errors::Unimplemented(
                            "xpu doesn't support hierarchical_allreduce"));
      pass->Set<bool>(kUseHierarchicalAllReduce,
                      new bool(use_hierarchical_allreduce_));
#endif
      VLOG(1) << "SeqOnlyAllReduceOps:" << SeqOnlyAllReduceOps(*this)
              << ", num_trainers:" << num_trainers_;
    } else if (pass->Type() == "fuse_relu_depthwise_conv_pass") {
      if (use_device != p::kCUDA) {
        VLOG(1) << "fuse_relu_depthwise_conv_pass is only supported on "
                   "GPU, skipped.";
        continue;
      }
    } else if (pass->Type() == "fusion_group_pass") {
      pass->Set<bool>("use_gpu", new bool((use_device == p::kCUDA)));
      if (use_device != p::kCUDA) {
        VLOG(1) << "fusion_group_pass is only supported on GPU, skipped.";
        continue;
      }
    } else if (pass->Type() == "fuse_bn_act_pass") {
      if (use_device != p::kCUDA) {
        VLOG(1) << "fuse_bn_act_pass is only supported on "
                   "GPU, skipped.";
        continue;
      }
    } else if (pass->Type() == "fuse_bn_add_act_pass") {
      if (use_device != p::kCUDA) {
        VLOG(1) << "fuse_bn_add_act_pass is only supported on "
                   "GPU, skipped.";
        continue;
      }
    } else if (pass->Type() == "mkldnn_placement_pass") {
      pass->Set("mkldnn_enabled_op_types",
                new std::unordered_set<std::string>(mkldnn_enabled_op_types_));
    } else if (pass->Type() == "backward_optimizer_op_deps_pass") {
      if (use_device != p::kCUDA) {
        VLOG(1) << "backward_optimizer_op_deps_pass is only supported on "
                   "GPU, skipped.";
        continue;
      }
    }
    VLOG(1) << "Start Apply Pass " << pass->Type();
    if (FLAGS_convert_all_blocks) {
      for (size_t i = 0; i < graph->SubGraphsSize(); ++i) {
        // NOTE(review): missing space before "to SubGraph" in this log line;
        // left as-is to keep log output unchanged.
        VLOG(3) << "Apply Pass " << pass->Type() << "to SubGraph " << i;
        pass->Apply(graph->GetSubGraph(i));
      }
    } else {
      graph = pass->Apply(graph);
    }
    VLOG(1) << "Finish Apply Pass " << pass->Type();
  }
  VLOG(1) << "All Passes Applied";
  return graph;
}
D
dzhwinter 已提交
494

495 496 497 498
}  // namespace details
}  // namespace framework
}  // namespace paddle

Q
qingqing01 已提交
499
// Force-link every pass that ParallelExecutorPassBuilder may append, so the
// static pass registry contains them in this translation unit. Conditional
// registrations mirror the #ifdef guards used when the passes are appended.
USE_PASS(sync_batch_norm_pass);
USE_PASS(fuse_relu_depthwise_conv_pass);
USE_PASS(fuse_elewise_add_act_pass);
USE_PASS(fuse_bn_act_pass);
USE_PASS(fuse_bn_add_act_pass);
USE_PASS(graph_viz_pass);
USE_PASS(multi_batch_merge_pass);
USE_PASS(no_reduce_multi_devices_pass);
USE_PASS(reduce_mode_multi_devices_pass);
USE_PASS(all_reduce_mode_multi_devices_pass);
USE_PASS(dist_multi_devices_pass);
USE_PASS(multi_devices_check_pass);
USE_PASS(multi_devices_print_pass);
USE_PASS(sequential_execution_pass);
USE_PASS(all_reduce_deps_pass);
USE_PASS(backward_optimizer_op_deps_pass);
USE_PASS(modify_op_lock_and_record_event_pass);
USE_PASS(lock_free_optimize_pass);
USE_PASS(coalesce_grad_tensor_pass);
USE_PASS(graph_to_program_pass);
USE_PASS(fuse_adam_op_pass);
USE_PASS(fuse_sgd_op_pass);
USE_PASS(fuse_momentum_op_pass);
USE_PASS(fuse_all_reduce_op_pass);
USE_PASS(runtime_context_cache_pass);
USE_PASS(add_reader_dependency_pass);
USE_PASS(delete_dropout_op_x_pass);
#ifdef PADDLE_WITH_CUDA
USE_PASS(fused_attention_pass);
#endif
#ifdef PADDLE_WITH_CINN
USE_PASS(build_cinn_pass);
#endif
#ifdef PADDLE_WITH_MKLDNN
USE_PASS(mkldnn_placement_pass);
#endif
#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && \
    !defined(_WIN32) && !defined(__APPLE__)
USE_PASS(fusion_group_pass);
#endif
#if (defined(PADDLE_WITH_CUDA) && CUDA_VERSION >= 11060)
USE_PASS(fuse_gemm_epilogue_pass);
#endif