/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Copyright (c) 2022 NVIDIA Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/framework/details/build_strategy.h"

#include <glog/logging.h>

#include "paddle/fluid/framework/details/reduce_op_handle.h"
#include "paddle/fluid/framework/ir/graph_printer.h"
#include "paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.h"

DECLARE_bool(convert_all_blocks);
DECLARE_bool(use_mkldnn);
#ifdef PADDLE_WITH_CINN
DECLARE_bool(use_cinn);
#endif

namespace paddle {
namespace framework {
namespace details {

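// True when the all-reduce op order must be fixed: multi-trainer runs that
// use neither sequential execution nor the parallel-graph executor.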
static inline bool SeqOnlyAllReduceOps(const BuildStrategy &strategy) {
  // The all-reduce op order must be fixed when the ops are scheduled across
  // multiple threads or processes; otherwise execution may hang.
  // NOTE: ParallelGraph executes this pass on each sub-graph, so there is
  // no need to append it here.
  return (!strategy.enable_sequential_execution_ &&
          strategy.num_trainers_ > 1) &&
         !strategy.enable_parallel_graph_;
}

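// An option the user left unset (paddle::none) defaults to true.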
static inline void ConvertDefaultValue(paddle::optional<bool> *default_value) {
  if (*default_value == paddle::none) {
    *default_value = true;
  }
}

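// Assembles the pass pipeline that ParallelExecutor runs over the graph:
// resolves conflicting strategy options first, then appends the fusion,
// multi-device, and validation passes in a fixed order.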
class ParallelExecutorPassBuilder : public ir::PassBuilder {
 public:
  explicit ParallelExecutorPassBuilder(const BuildStrategy &strategy)
      : ir::PassBuilder(), strategy_(strategy) {
    ResolveOptionConfliction();

    AppendPrintGraphPass("graph_viz_pass", "_original_graph");

#ifdef PADDLE_WITH_CINN
    if (FLAGS_use_cinn || strategy.build_cinn_pass_) {
      // Note: This is a trick to support 0D-Tensor for CINN. This pass will
      // be removed in the near future.
      AppendPass("cinn_zero_tensor_trick_pass");
      // Note: This pass enables CINN.
      AppendPass("build_cinn_pass");
      AppendPrintGraphPass("graph_viz_pass", "_build_cinn_graph");
    }
#endif

    AppendPassWithCheck(strategy_.enable_sequential_execution_,
                        "sequential_execution_pass");
    AppendPassWithCheck(strategy_.sync_batch_norm_, "sync_batch_norm_pass");

    AppendOpFusePasses();
    AppendPrintGraphPass("graph_viz_pass", "_fused_graph");

    AppendAddReaderDependencyPass();
    AppendMultiDevPass();
    AppendMultiGraphOptPasses();

    AppendPassToSetMkldnnAttr("mkldnn_placement_pass");
    // runtime_context_cache_pass should be the last pass, so that the attr
    // is enabled for all original and fused operators. But no operator can
    // have this attr enabled if the pass is placed after MultiDevPass.
    AppendPassWithCheck(strategy_.cache_runtime_context_,
                        "runtime_context_cache_pass");
    AppendPassWithCheck(strategy_.remove_unnecessary_lock_,
                        "modify_op_lock_and_record_event_pass");
    // Note: This pass checks whether the multi_device_graph is correct.
    AppendPass("multi_devices_check_pass");

    SetCollectiveContext();
  }

  void ResolveOptionConfliction() {
    // Resolve conflicts between different pass options.
    if (strategy_.enable_parallel_graph_) {
      LOG_IF(WARNING, strategy_.fuse_all_optimizer_ops_ == true)
          << "Currently, fuse_all_optimizer_ops doesn't work under "
             "parallel_graph.";
      strategy_.fuse_all_optimizer_ops_ = false;
      LOG_IF(WARNING, strategy_.fuse_all_reduce_ops_ == true)
          << "fuse_all_reduce_ops doesn't work under "
             "parallel_graph.";
      strategy_.fuse_all_reduce_ops_ = false;
    }
    if (strategy_.is_distribution_) {
      LOG_IF(WARNING, strategy_.fuse_all_optimizer_ops_ == true)
          << "Currently, fuse_all_optimizer_ops only works under "
             "non-distributed mode.";
      strategy_.fuse_all_optimizer_ops_ = false;
      LOG_IF(WARNING, strategy_.fuse_all_reduce_ops_ == true)
          << "Currently, fuse_all_reduce_ops only works under "
             "non-distributed mode.";
      strategy_.fuse_all_reduce_ops_ = false;
    }
    if (strategy_.reduce_ == BuildStrategy::ReduceStrategy::kReduce) {
      LOG_IF(WARNING, strategy_.fuse_all_optimizer_ops_ == true)
          << "Currently, fuse_all_optimizer_ops only works under AllReduce "
             "mode.";
      strategy_.fuse_all_optimizer_ops_ = false;
      LOG_IF(WARNING, strategy_.fuse_all_reduce_ops_ == true)
          << "fuse_all_reduce_ops only works under AllReduce "
             "mode.";
      strategy_.fuse_all_reduce_ops_ = false;
    }
    if (strategy_.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce) {
      LOG_IF(WARNING, strategy_.fuse_broadcast_ops_ == true)
          << "Currently, fuse_broadcast_ops only works under Reduce "
             "mode.";
      strategy_.fuse_broadcast_ops_ = false;
    }

    ConvertDefaultValue(&strategy_.fuse_all_optimizer_ops_);
    ConvertDefaultValue(&strategy_.fuse_all_reduce_ops_);
    ConvertDefaultValue(&strategy_.fuse_broadcast_ops_);

    if (strategy_.fuse_all_optimizer_ops_ == true) {
      LOG_IF(WARNING, strategy_.async_mode_)
          << "Currently, fuse_all_optimizer_ops doesn't work under "
             "async mode.";
      strategy_.fuse_all_optimizer_ops_ = !strategy_.async_mode_;
    }
    if (strategy_.fuse_all_reduce_ops_ == true) {
      LOG_IF(WARNING, strategy_.async_mode_)
          << "Currently, fuse_all_reduce_ops doesn't work under "
             "async mode.";
      strategy_.fuse_all_reduce_ops_ = !strategy_.async_mode_;
    }
  }

  void AppendMultiGraphOptPasses() {
    // NOTE: fuse_all_reduce_op_pass first counts the all_reduce operators;
    // if the count is zero, the pass does nothing.
    AppendPassWithCheck(strategy_.fuse_all_reduce_ops_,
                        "fuse_all_reduce_op_pass");
    AppendPrintGraphPass("multi_devices_print_pass", "_multi_devices_graph");

    // Experiments show that the program runs faster if
    // all_reduce_deps_pass is appended here.
    bool append_all_reduce_deps_pass =
        !strategy_.enable_parallel_graph_ &&
        (SeqOnlyAllReduceOps(strategy_) ||
         strategy_.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce);
    AppendPassWithCheck(append_all_reduce_deps_pass, "all_reduce_deps_pass");

    bool append_backward_optimizer_op_deps_pass =
        strategy_.num_trainers_ > 1 && !strategy_.async_mode_ &&
        !strategy_.is_distribution_ &&
        strategy_.enable_backward_optimizer_op_deps_;
    AppendPassWithCheck(append_backward_optimizer_op_deps_pass,
                        "backward_optimizer_op_deps_pass");
  }

  void AppendOpFusePasses() {
    // 1. Inference passes, if enabled.
    AppendPassWithCheck(
        strategy_.enable_inference_pass_ && strategy_.delete_dropout_,
        "delete_dropout_op_x_pass");
    AppendPassWithCheck(
        strategy_.enable_inference_pass_ && strategy_.use_mkldnn_,
        "mkldnn_placement_pass");

    // 2. Training passes.
    AppendPassWithCheck(strategy_.fuse_relu_depthwise_conv_,
                        "fuse_relu_depthwise_conv_pass");
    AppendPassWithCheck(strategy_.fuse_bn_act_ops_, "fuse_bn_act_pass");
    AppendPassWithCheck(strategy_.fuse_bn_add_act_ops_, "fuse_bn_add_act_pass");
#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && \
    !defined(_WIN32) && !defined(__APPLE__)
    AppendPassWithCheck(strategy_.enable_auto_fusion_, "fusion_group_pass");
#endif

#ifdef PADDLE_WITH_CUDA
    AppendPassWithCheck(strategy_.fused_attention_, "fused_attention_pass");
    AppendPassWithCheck(strategy_.fuse_adamw_, "fuse_adamw_op_pass");
#endif

#if (defined(PADDLE_WITH_CUDA) && CUDA_VERSION >= 11060)
    AppendPassWithCheck(strategy_.fuse_gemm_epilogue_,
                        "fuse_gemm_epilogue_pass");
#endif
    AppendPassWithCheck(strategy_.fuse_elewise_add_act_ops_,
                        "fuse_elewise_add_act_pass");
    // For single-card training, fuse_all_reduce_ops is unnecessary.
    // coalesce_grad_tensor_pass should come before MultiDevPass.
    AppendPassWithCheck(strategy_.fuse_all_reduce_ops_,
                        "coalesce_grad_tensor_pass");
    // Fuse all the optimization operators.
    // NOTE: each fuse_xx_op_pass first counts the number of xx operators;
    // if the count is zero, the pass does nothing.
    // Currently, only one type of optimization algorithm can be fused.
    if (strategy_.fuse_all_optimizer_ops_ == true) {
      AppendPass("fuse_adam_op_pass");
      AppendPass("fuse_sgd_op_pass");
      AppendPass("fuse_momentum_op_pass");
    }
#ifdef PADDLE_WITH_CUDA
    AppendPassWithCheck(strategy_.fused_feedforward_, "fused_feedforward_pass");
#endif
  }

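  // Publishes the trainer endpoints and trainer id to the process-wide
  // CollectiveContext singleton, validating trainer_id_ against the
  // endpoint list.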
  void SetCollectiveContext() const {
    CollectiveContext *context = CollectiveContext::GetInstance();
    context->endpoints_ = strategy_.trainers_endpoints_;
    context->trainer_id_ = strategy_.trainer_id_;
    PADDLE_ENFORCE_GE(
        strategy_.trainer_id_,
        0,
        platform::errors::InvalidArgument(
            "The trainer_id_ of strategy_ must be greater than or equal to 0, "
            "but received strategy_.trainer_id_ = %d.",
            strategy_.trainer_id_));

    if (strategy_.trainer_id_ > 0 && strategy_.trainers_endpoints_.size() > 0) {
      PADDLE_ENFORCE_LT(
          static_cast<size_t>(strategy_.trainer_id_),
          strategy_.trainers_endpoints_.size(),
          platform::errors::InvalidArgument(
              "The trainer_id_ of strategy_ must be less than the "
              "size of vector strategy_.trainers_endpoints_, "
              "but received strategy_.trainer_id_ = %d, "
              "the size of strategy_.trainers_endpoints_ is %d.",
              static_cast<size_t>(strategy_.trainer_id_),
              strategy_.trainers_endpoints_.size()));
    }
    VLOG(1) << "CollectiveContext:" << context->String();
  }

  void AppendAddReaderDependencyPass() {
    AppendPass("add_reader_dependency_pass");
  }

  // Convert the graph to run on multiple devices; the concrete pass depends
  // on async mode, distribution, and the reduce strategy.
  void AppendMultiDevPass() {
    ir::Pass *multi_devices_pass = nullptr;
    if (strategy_.async_mode_) {
      multi_devices_pass = AppendPass("async_multi_devices_pass").get();
    } else if (strategy_.is_distribution_) {
      multi_devices_pass = AppendPass("dist_multi_devices_pass").get();
    } else {
      switch (strategy_.reduce_) {
        case BuildStrategy::ReduceStrategy::kAllReduce:
          multi_devices_pass =
              AppendPass("all_reduce_mode_multi_devices_pass").get();
          break;
        case BuildStrategy::ReduceStrategy::kReduce:
          multi_devices_pass =
              AppendPass("reduce_mode_multi_devices_pass").get();
          break;
        case BuildStrategy::ReduceStrategy::kNoReduce:
          multi_devices_pass = AppendPass("no_reduce_multi_devices_pass").get();
          break;
        default:
          PADDLE_THROW(
              platform::errors::Unimplemented("Unknown reduce strategy."));
      }
    }
    multi_devices_pass->SetNotOwned<const BuildStrategy>("strategy",
                                                         &strategy_);
  }

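  // Appends `pass_name` configured to dump the graph to
  // debug_graphviz_path_ + debug_file_suffix; a no-op unless a debug
  // graphviz path is set in the strategy.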
  void AppendPrintGraphPass(const std::string &pass_name,
                            const std::string &debug_file_suffix) {
    if (!strategy_.debug_graphviz_path_.empty()) {
      auto viz_pass = AppendPass(pass_name);
      const std::string graph_path = string::Sprintf(
          "%s%s", strategy_.debug_graphviz_path_.c_str(), debug_file_suffix);
      viz_pass->Set<std::string>(ir::kGraphvizPath,
                                 new std::string(graph_path));
    }
  }

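  // An unset optional (paddle::none) is treated as false here: the pass is
  // appended only when the option is explicitly true.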
  void AppendPassWithCheck(const paddle::optional<bool> &append_pass,
                           const std::string &pass_name) {
    AppendPassWithCheck(append_pass == true, pass_name);
  }

  void AppendPassWithCheck(bool append_pass, const std::string &pass_name) {
    if (append_pass) {
      AppendPass(pass_name);
    }
  }

  void AppendPassToSetMkldnnAttr(const std::string &pass_name) {
#ifdef PADDLE_WITH_MKLDNN
    if (FLAGS_use_mkldnn) {
      AppendPass(pass_name);
    } else if (!strategy_.mkldnn_enabled_op_types_.empty()) {
      VLOG(1) << "mkldnn_enabled_op_types specify the operator type list to "
                 "use MKLDNN acceleration. It is null in default, means "
                 "that all the operators supported by MKLDNN will be "
                 "accelerated. And it should not be set when "
                 "FLAGS_use_mkldnn=false.";
C
chengduo 已提交
314 315
    }
#else
316 317
    PADDLE_ENFORCE_NE(FLAGS_use_mkldnn,
                      true,
318 319 320 321
                      platform::errors::PreconditionNotMet(
                          "FLAGS_use_mkldnn has been set to True, but "
                          "PaddlePaddle is compiled without MKLDNN. "
                          "Please compile PaddlePaddle with MKLDNN first."));
C
chengduo 已提交
322 323 324
#endif
  }

 private:
  BuildStrategy strategy_;
};

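// Lazily builds the pass pipeline from this strategy. Once the strategy is
// finalized, later calls return the cached builder unchanged.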
std::shared_ptr<ir::PassBuilder> BuildStrategy::CreatePassesFromStrategy(
    bool finalize_strategy) const {
  if (is_finalized_) {
    return pass_builder_;
  }
  pass_builder_.reset(new ParallelExecutorPassBuilder(*this));
  if (finalize_strategy) {
    is_finalized_ = true;
  }
  return pass_builder_;
}

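// A pass counts as a multi-device pass iff its name is registered in
// MultiDevSSAGraphBuilder.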
bool BuildStrategy::IsMultiDevPass(const std::string &pass_name) const {
  return framework::ir::MultiDevSSAGraphBuilder().count(pass_name) > 0;
}

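// Applies every pass in the builder to `graph` in order, wiring the
// per-pass inputs (places, loss variable, scopes, nranks, and the
// NCCL/BKCL context when compiled in) before each pass runs. A minimal
// sketch of a call site (illustrative only, not code from this repo):
//
//   BuildStrategy strategy;
//   strategy.fuse_elewise_add_act_ops_ = true;
//   strategy.CreatePassesFromStrategy(/*finalize_strategy=*/true);
//   graph = strategy.Apply(graph, places, loss_var_name, local_scopes,
//                          places.size(), use_device, nccl_ctxs);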
ir::Graph *BuildStrategy::Apply(ir::Graph *graph,
                                const std::vector<platform::Place> &places,
                                const std::string &loss_var_name,
                                const std::vector<Scope *> &local_scopes,
                                const size_t &nranks,
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
                                DeviceType use_device,
                                platform::NCCLCommunicator *nccl_ctxs) const {
#elif defined(PADDLE_WITH_XPU) && defined(PADDLE_WITH_XPU_BKCL)
                                DeviceType use_device,
                                platform::BKCLCommunicator *bkcl_ctxs) const {
#else
                                DeviceType use_device) const {
#endif
  VLOG(1) << "apply all passes";
  if (FLAGS_convert_all_blocks) {
    PADDLE_ENFORCE_EQ(
        graph->IsMainGraph(),
        true,
        platform::errors::InvalidArgument("This graph is not main_graph"));
  }
  // Create a default pass builder if the user has not finalized one.
  CreatePassesFromStrategy(false);

  for (std::shared_ptr<ir::Pass> &pass : pass_builder_->AllPasses()) {
    VLOG(1) << "BuildStrategy::Apply pass:" << pass->Type();
    if (IsMultiDevPass(pass->Type())) {
      pass->Erase(kPlaces);
      pass->SetNotOwned<const std::vector<platform::Place>>(kPlaces, &places);
      pass->Erase(ir::kLossVarName);
      pass->SetNotOwned<const std::string>(ir::kLossVarName, &loss_var_name);
      pass->Erase(kLocalScopes);
      pass->SetNotOwned<const std::vector<Scope *>>(kLocalScopes,
                                                    &local_scopes);
      pass->Erase(kNRanks);
      pass->Set<size_t>(kNRanks, new size_t(nranks));

#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
      platform::NCCLCommunicator *nctx =
          (use_device == p::kCUDA) ? nccl_ctxs : nullptr;
      pass->Erase(kNCCLCtxs);
      pass->SetNotOwned<platform::NCCLCommunicator>(kNCCLCtxs, nctx);
#elif defined(PADDLE_WITH_XPU) && defined(PADDLE_WITH_XPU_BKCL)
      // TODO: add more checks.
      platform::BKCLCommunicator *bkcl_ctx =
          (use_device == p::kXPU) ? bkcl_ctxs : nullptr;
      pass->Erase(kBKCLCtxs);
      pass->SetNotOwned<platform::BKCLCommunicator>(kBKCLCtxs, bkcl_ctx);
#endif
    } else if (pass->Type() == "fuse_all_reduce_op_pass") {
      pass->Erase(kNRanks);
      pass->Set<size_t>(kNRanks, new size_t(nranks));
      pass->Erase(kPlaces);
      pass->SetNotOwned<const std::vector<platform::Place>>(kPlaces, &places);
      pass->Erase(kLocalScopes);
      pass->SetNotOwned<const std::vector<Scope *>>(kLocalScopes,
                                                    &local_scopes);
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
      platform::NCCLCommunicator *nctx =
          (use_device == p::kCUDA) ? nccl_ctxs : nullptr;
      pass->Erase(kNCCLCtxs);
      pass->SetNotOwned<platform::NCCLCommunicator>(kNCCLCtxs, nctx);
      pass->Erase(kUseHierarchicalAllReduce);
      pass->Set<bool>(kUseHierarchicalAllReduce,
                      new bool(use_hierarchical_allreduce_));
#elif defined(PADDLE_WITH_XPU) && defined(PADDLE_WITH_XPU_BKCL)
      platform::BKCLCommunicator *nctx =
          (use_device == p::kXPU) ? bkcl_ctxs : nullptr;
      pass->Erase(kBKCLCtxs);
      pass->SetNotOwned<platform::BKCLCommunicator>(kBKCLCtxs, nctx);
      pass->Erase(kUseHierarchicalAllReduce);
      PADDLE_ENFORCE_EQ(use_hierarchical_allreduce_,
                        false,
                        platform::errors::Unimplemented(
                            "xpu doesn't support hierarchical_allreduce"));
      pass->Set<bool>(kUseHierarchicalAllReduce,
                      new bool(use_hierarchical_allreduce_));
#endif
    } else if (pass->Type() == "coalesce_grad_tensor_pass") {
      pass->Erase(kNRanks);
      pass->Set<size_t>(kNRanks, new size_t(nranks));
    } else if (pass->Type() == "sequential_execution_pass") {
      LOG(INFO) << "set enable_sequential_execution:"
                << enable_sequential_execution_;
    } else if (pass->Type() == "all_reduce_deps_pass") {
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
      platform::NCCLCommunicator *nctx =
          (use_device == p::kCUDA) ? nccl_ctxs : nullptr;
      pass->Erase(kNCCLCtxs);
      pass->SetNotOwned<platform::NCCLCommunicator>(kNCCLCtxs, nctx);
      pass->Erase(kUseHierarchicalAllReduce);
      pass->Set<bool>(kUseHierarchicalAllReduce,
                      new bool(use_hierarchical_allreduce_));
#elif defined(PADDLE_WITH_XPU) && defined(PADDLE_WITH_XPU_BKCL)
      platform::BKCLCommunicator *nctx =
          (use_device == p::kXPU) ? bkcl_ctxs : nullptr;
      pass->Erase(kBKCLCtxs);
      pass->SetNotOwned<platform::BKCLCommunicator>(kBKCLCtxs, nctx);
      pass->Erase(kUseHierarchicalAllReduce);
      PADDLE_ENFORCE_EQ(use_hierarchical_allreduce_,
                        false,
                        platform::errors::Unimplemented(
                            "xpu doesn't support hierarchical_allreduce"));
      pass->Set<bool>(kUseHierarchicalAllReduce,
                      new bool(use_hierarchical_allreduce_));
#endif
      VLOG(1) << "SeqOnlyAllReduceOps:" << SeqOnlyAllReduceOps(*this)
              << ", num_trainers:" << num_trainers_;
    } else if (pass->Type() == "fuse_relu_depthwise_conv_pass") {
      if (use_device != p::kCUDA) {
        VLOG(1) << "fuse_relu_depthwise_conv_pass is only supported on "
                   "GPU, skipped.";
        continue;
      }
    } else if (pass->Type() == "fusion_group_pass") {
      pass->Set<bool>("use_gpu", new bool((use_device == p::kCUDA)));
      if (use_device != p::kCUDA) {
        VLOG(1) << "fusion_group_pass is only supported on GPU, skipped.";
        continue;
      }
    } else if (pass->Type() == "fuse_bn_act_pass") {
      if (use_device != p::kCUDA) {
        VLOG(1) << "fuse_bn_act_pass is only supported on "
                   "GPU, skipped.";
        continue;
      }
    } else if (pass->Type() == "fuse_bn_add_act_pass") {
      if (use_device != p::kCUDA) {
        VLOG(1) << "fuse_bn_add_act_pass is only supported on "
                   "GPU, skipped.";
        continue;
      }
    } else if (pass->Type() == "mkldnn_placement_pass") {
      pass->Set("mkldnn_enabled_op_types",
                new std::unordered_set<std::string>(mkldnn_enabled_op_types_));
    } else if (pass->Type() == "backward_optimizer_op_deps_pass") {
      if (use_device != p::kCUDA) {
        VLOG(1) << "backward_optimizer_op_deps_pass is only supported on "
                   "GPU, skipped.";
        continue;
      }
    }
    VLOG(1) << "Start Apply Pass " << pass->Type();
    if (FLAGS_convert_all_blocks) {
      for (size_t i = 0; i < graph->SubGraphsSize(); ++i) {
        VLOG(3) << "Apply Pass " << pass->Type() << " to SubGraph " << i;
        pass->Apply(graph->GetSubGraph(i));
      }
    } else {
      graph = pass->Apply(graph);
    }
    VLOG(1) << "Finish Apply Pass " << pass->Type();
  }
  VLOG(1) << "All Passes Applied";
  return graph;
}

}  // namespace details
}  // namespace framework
}  // namespace paddle

USE_PASS(sync_batch_norm_pass);
USE_PASS(fuse_relu_depthwise_conv_pass);
USE_PASS(fuse_elewise_add_act_pass);
USE_PASS(fuse_bn_act_pass);
USE_PASS(fuse_bn_add_act_pass);
USE_PASS(graph_viz_pass);
USE_PASS(multi_batch_merge_pass);
USE_PASS(no_reduce_multi_devices_pass);
USE_PASS(reduce_mode_multi_devices_pass);
USE_PASS(all_reduce_mode_multi_devices_pass);
USE_PASS(dist_multi_devices_pass);
USE_PASS(multi_devices_check_pass);
USE_PASS(multi_devices_print_pass);
USE_PASS(sequential_execution_pass);
USE_PASS(all_reduce_deps_pass);
USE_PASS(backward_optimizer_op_deps_pass);
USE_PASS(modify_op_lock_and_record_event_pass);
USE_PASS(lock_free_optimize_pass);
USE_PASS(coalesce_grad_tensor_pass);
USE_PASS(graph_to_program_pass);
USE_PASS(fuse_adam_op_pass);
USE_PASS(fuse_sgd_op_pass);
USE_PASS(fuse_momentum_op_pass);
USE_PASS(fuse_all_reduce_op_pass);
USE_PASS(runtime_context_cache_pass);
USE_PASS(add_reader_dependency_pass);
USE_PASS(delete_dropout_op_x_pass);
#ifdef PADDLE_WITH_CUDA
USE_PASS(fused_attention_pass);
USE_PASS(fuse_adamw_op_pass);
#endif
#ifdef PADDLE_WITH_CINN
USE_PASS(cinn_zero_tensor_trick_pass);
USE_PASS(build_cinn_pass);
#endif
#ifdef PADDLE_WITH_CUDA
USE_PASS(fused_feedforward_pass);
#endif
#ifdef PADDLE_WITH_MKLDNN
USE_PASS(mkldnn_placement_pass);
#endif
#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && \
    !defined(_WIN32) && !defined(__APPLE__)
USE_PASS(fusion_group_pass);
#endif
#if (defined(PADDLE_WITH_CUDA) && CUDA_VERSION >= 11060)
USE_PASS(fuse_gemm_epilogue_pass);
#endif