Unverified · Commit 5489216e · authored by chengduo · committed by GitHub

Clean build strategy (#18148)

* clean build_strategy
test=develop

* DataBalanceOpHandle has been removed
test=develop

* debug

* update build_strategy.
test=develop
Parent commit: 14e1e165
......@@ -59,47 +59,63 @@ struct BuildStrategy {
enum class GradientScaleStrategy {
kCoeffNumDevice = 0,
kOne = 1,
// user can customize gradient scale to use, and just feed
// it into exe.run().
kCustomized = 2,
};
enum class OptimizeStrategy {
// To be Implemented,bruteforce, recursive compute unused var names.
kBruteForce = 0,
kControlFlowGraph = 1, // use cfg_graph algorithm, faster speed.
};
ReduceStrategy reduce_{ReduceStrategy::kAllReduce};
GradientScaleStrategy gradient_scale_{GradientScaleStrategy::kCoeffNumDevice};
OptimizeStrategy strategy_{OptimizeStrategy::kControlFlowGraph};
std::string debug_graphviz_path_{""};
bool fuse_elewise_add_act_ops_{false};
// Add dependency between backward ops and optimization ops, make sure that
// all the backward ops are finished before running the optimization ops.
// It might make the training speed of data parallelism faster.
bool enable_backward_optimizer_op_deps_{false};
// TODO(dev-paddle): enable_sequential_execution depends on
// kStaleProgramOpDescs, it is not appropriate, because kStaleProgramOpDescs
// will be removed in the near future.
bool enable_sequential_execution_{false};
bool remove_unnecessary_lock_{true};
// TODO(dev-paddle): cache_runtime_context may cause some models to hang up
// while running.
bool cache_runtime_context_{false};
// Operator fusion
// TODO(dev-paddle): fuse_elewise_add_act_ops may cause some models have
// cycle.
bool fuse_elewise_add_act_ops_{false};
// Fuse_all_optimizer_ops and fuse_all_reduce_ops require that gradients
// should not be sparse types
bool fuse_all_optimizer_ops_{false};
bool fuse_all_reduce_ops_{false};
bool enable_backward_optimizer_op_deps_{false};
// fuse_relu_depthwise_conv can fuse the `relu ->
// depthwise_conv`
bool fuse_relu_depthwise_conv_{false};
// NOTE(zcd): In reduce mode, fusing broadcast ops may make the program
// faster. Because fusing broadcast OP equals delaying the execution of all
// broadcast Ops, in this case, all nccl streams are used only for reduce
// operations for a period of time.
bool fuse_broadcast_ops_{false};
// replace batch_norm with sync_batch_norm.
bool sync_batch_norm_{false};
// mkldnn_enabled_op_types specify the operator type list to
// use MKLDNN acceleration. It is null in default, means
// that all the operators supported by MKLDNN will be
// accelerated. And it should not be set when
// FLAGS_use_mkldnn=false
std::unordered_set<std::string> mkldnn_enabled_op_types_;
// FIXME(liuwei1031) disable memory_optimzie and enable_inplace in 1.4
// to open them by default, we need to solve the fetch variable issue
// TODO(liuwei1031): memory_optimize depends on kStaleProgramOpDescs,
// it is not appropriate, because kStaleProgramOpDescs will be removed in the
// near future.
bool memory_optimize_{false};
bool enable_inplace_{false};
bool enable_sequential_execution_{false};
// NOTE(zcd): In reduce mode, fusing broadcast ops may make the program
// faster. Because fusing broadcast OP equals delaying the execution of all
// broadcast Ops, in this case, all nccl streams are used only for reduce
// operations for a period of time.
bool fuse_broadcast_ops_{false};
// FIXME(zcd): is_distribution_ is a temporary field, because in pserver mode,
// num_trainers is 1, so the current fields of build_strategy doesn't tell if
// it's distributed model.
......@@ -108,11 +124,8 @@ struct BuildStrategy {
int num_trainers_{1};
int trainer_id_{0};
std::vector<std::string> trainers_endpoints_;
bool remove_unnecessary_lock_{true};
bool cache_runtime_context_{false};
std::unordered_set<std::string> mkldnn_enabled_op_types_;
// NCCL config
size_t nccl_comm_num_{1};
// The picture is here:
// https://github.com/PaddlePaddle/Paddle/pull/17263#discussion_r285411396
......
......@@ -22,17 +22,24 @@ namespace details {
// NOTE(review): this span was extracted from a unified diff with the +/-
// markers stripped, so pre-change and post-change lines are interleaved
// (e.g. num_iteration_per_run_ and its comment appear twice, and two
// versions of the num_iteration_per_drop_scope_ comment coexist). It is
// not compilable as shown; consult the actual source file for the final
// version. Code lines below are left byte-identical.
struct ExecutionStrategy {
// Selects the parallel-executor implementation; kExperimental is noted
// below as the faster choice for most models.
enum ExecutorType { kDefault = 0, kExperimental = 1 };
// num_threads indicates the size of thread pool.
size_t num_threads_{0};
// Whether execution runs on CUDA devices.
bool use_cuda_{true};
// Note that allow_op_delay is invalid now.
bool allow_op_delay_{false};
// (pre-change comment retained by the diff view:)
// If we set this to 1, we will delete all variables when finish a batch. and
// this will loss 15%+ performance.
// Please be aware about this parameters.
// (post-change comment retained by the diff view:)
// num_iteration_per_drop_scope indicates how many
// iterations the framework cleans up a local execution scope.
// In some models, the value of this parameter has a great
// influence on the performance(about 15%) of the program.
size_t num_iteration_per_drop_scope_{1};
// At present, the kExperimental executor is the fastest in most models.
ExecutorType type_{kExperimental};
// This debug option.
bool dry_run_{false};
// (pre-change declaration retained by the diff view — duplicates the
// post-change one below:)
size_t num_iteration_per_run_{1}; // only use with async_ssa_graph_executor
// and pyreader with data queue
// only use with async_ssa_graph_executor
// and pyreader with data queue
size_t num_iteration_per_run_{1};
};
} // namespace details
......
......@@ -86,18 +86,8 @@ class ReadOp : public framework::OperatorBase {
reader->ReadNext(&ins);
if (ins.empty()) {
VLOG(3) << "read empty data in";
if (Attr<bool>("throw_eof_exp")) {
VLOG(3) << "throw_eof_exp";
PADDLE_THROW_EOF();
} else {
ins.resize(out_arg_names.size());
for (auto& tensor : ins) {
// data type is not important for subsequent DataBalanceOpHandle
tensor.mutable_data<float>(framework::make_ddim({0}), dev_place);
}
}
VLOG(3) << "read empty data out";
VLOG(3) << "throw_eof_exp";
PADDLE_THROW_EOF();
}
PADDLE_ENFORCE_EQ(ins.size(), out_arg_names.size());
for (size_t i = 0; i < out_arg_names.size(); ++i) {
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
To comment, please register or sign in.