From 5489216ebaaf5de951ea0d1412b16f5e5609fcc6 Mon Sep 17 00:00:00 2001 From: chengduo Date: Mon, 24 Jun 2019 19:02:15 +0800 Subject: [PATCH] Clean build strategy (#18148) * clean build_strategy test=develop * DataBalanceOpHandle has been removed test=develop * debug * update build_strategy. test=develop --- .../fluid/framework/details/build_strategy.h | 65 +++++++++++-------- .../framework/details/execution_strategy.h | 17 +++-- paddle/fluid/operators/reader/read_op.cc | 14 +--- 3 files changed, 53 insertions(+), 43 deletions(-) diff --git a/paddle/fluid/framework/details/build_strategy.h b/paddle/fluid/framework/details/build_strategy.h index 8eaace17bb1..ae28a2cc6f9 100644 --- a/paddle/fluid/framework/details/build_strategy.h +++ b/paddle/fluid/framework/details/build_strategy.h @@ -59,47 +59,63 @@ struct BuildStrategy { enum class GradientScaleStrategy { kCoeffNumDevice = 0, kOne = 1, + // user can customize gradient scale to use, and just feed + // it into exe.run(). kCustomized = 2, }; - enum class OptimizeStrategy { - // To be Implemented,bruteforce, recursive compute unused var names. - kBruteForce = 0, - kControlFlowGraph = 1, // use cfg_graph algorithm, faster speed. - }; - ReduceStrategy reduce_{ReduceStrategy::kAllReduce}; GradientScaleStrategy gradient_scale_{GradientScaleStrategy::kCoeffNumDevice}; - OptimizeStrategy strategy_{OptimizeStrategy::kControlFlowGraph}; std::string debug_graphviz_path_{""}; - bool fuse_elewise_add_act_ops_{false}; + // Add dependency between backward ops and optimization ops, make sure that + // all the backward ops are finished before running the optimization ops. + // It might make the training speed of data parallelism faster. + bool enable_backward_optimizer_op_deps_{false}; + // TODO(dev-paddle): enable_sequential_execution depends on + // kStaleProgramOpDescs, it is not appropriate, because kStaleProgramOpDescs + // will be removed in the near future. 
+ bool enable_sequential_execution_{false}; + bool remove_unnecessary_lock_{true}; + // TODO(dev-paddle): cache_runtime_context may cause some models to hang up + // while running. + bool cache_runtime_context_{false}; + // Operator fusion + // TODO(dev-paddle): fuse_elewise_add_act_ops may cause some models have + // cycle. + bool fuse_elewise_add_act_ops_{false}; + // Fuse_all_optimizer_ops and fuse_all_reduce_ops require that gradients + // should not be sparse types bool fuse_all_optimizer_ops_{false}; - bool fuse_all_reduce_ops_{false}; - - bool enable_backward_optimizer_op_deps_{false}; - + // fuse_relu_depthwise_conv can fuse the `relu -> + // depthwise_conv` bool fuse_relu_depthwise_conv_{false}; - + // NOTE(zcd): In reduce mode, fusing broadcast ops may make the program + // faster. Because fusing broadcast OP equals delaying the execution of all + // broadcast Ops, in this case, all nccl streams are used only for reduce + // operations for a period of time. + bool fuse_broadcast_ops_{false}; + // replace batch_norm with sync_batch_norm. bool sync_batch_norm_{false}; + // mkldnn_enabled_op_types specify the operator type list to + // use MKLDNN acceleration. It is null in default, means + // that all the operators supported by MKLDNN will be + // accelerated. And it should not be set when + // FLAGS_use_mkldnn=false + std::unordered_set mkldnn_enabled_op_types_; + // FIXME(liuwei1031) disable memory_optimzie and enable_inplace in 1.4 // to open them by default, we need to solve the fetch variable issue + // TODO(liuwei1031): memory_optimize depends on kStaleProgramOpDescs, + // it is not appropriate, because kStaleProgramOpDescs will be removed in the + // near future. bool memory_optimize_{false}; - bool enable_inplace_{false}; - bool enable_sequential_execution_{false}; - - // NOTE(zcd): In reduce mode, fusing broadcast ops may make the program - // faster. 
Because fusing broadcast OP equals delaying the execution of all - // broadcast Ops, in this case, all nccl streams are used only for reduce - // operations for a period of time. - bool fuse_broadcast_ops_{false}; - // FIXME(zcd): is_distribution_ is a temporary field, because in pserver mode, // num_trainers is 1, so the current fields of build_strategy doesn't tell if // it's distributed model. @@ -108,11 +124,8 @@ struct BuildStrategy { int num_trainers_{1}; int trainer_id_{0}; std::vector trainers_endpoints_; - bool remove_unnecessary_lock_{true}; - - bool cache_runtime_context_{false}; - std::unordered_set mkldnn_enabled_op_types_; + // NCCL config size_t nccl_comm_num_{1}; // The picture is here: // https://github.com/PaddlePaddle/Paddle/pull/17263#discussion_r285411396 diff --git a/paddle/fluid/framework/details/execution_strategy.h b/paddle/fluid/framework/details/execution_strategy.h index 4f074323f9b..68de1580e20 100644 --- a/paddle/fluid/framework/details/execution_strategy.h +++ b/paddle/fluid/framework/details/execution_strategy.h @@ -22,17 +22,24 @@ namespace details { struct ExecutionStrategy { enum ExecutorType { kDefault = 0, kExperimental = 1 }; + // num_threads indicates the size of thread pool. size_t num_threads_{0}; bool use_cuda_{true}; + // Note that allow_op_delay is invalid now. bool allow_op_delay_{false}; - // If we set this to 1, we will delete all variables when finish a batch. and - // this will loss 15%+ performance. - // Please be aware about this parameters. + // num_iteration_per_drop_scope indicates how many + // iterations the framework cleans up a local execution scope. + // In some models, the value of this parameter has a great + // influence on the performance (about 15%) of the program. size_t num_iteration_per_drop_scope_{1}; + // At present, the kExperimental executor is the fastest in most models. ExecutorType type_{kExperimental}; + // This is a debug option.
bool dry_run_{false}; - size_t num_iteration_per_run_{1}; // only use with async_ssa_graph_executor - // and pyreader with data queue + + // only use with async_ssa_graph_executor + // and pyreader with data queue + size_t num_iteration_per_run_{1}; }; } // namespace details diff --git a/paddle/fluid/operators/reader/read_op.cc b/paddle/fluid/operators/reader/read_op.cc index 33a69ad5fec..5fa6bf73aa8 100644 --- a/paddle/fluid/operators/reader/read_op.cc +++ b/paddle/fluid/operators/reader/read_op.cc @@ -86,18 +86,8 @@ class ReadOp : public framework::OperatorBase { reader->ReadNext(&ins); if (ins.empty()) { - VLOG(3) << "read empty data in"; - if (Attr("throw_eof_exp")) { - VLOG(3) << "throw_eof_exp"; - PADDLE_THROW_EOF(); - } else { - ins.resize(out_arg_names.size()); - for (auto& tensor : ins) { - // data type is not important for subsequent DataBalanceOpHandle - tensor.mutable_data(framework::make_ddim({0}), dev_place); - } - } - VLOG(3) << "read empty data out"; + VLOG(3) << "throw_eof_exp"; + PADDLE_THROW_EOF(); } PADDLE_ENFORCE_EQ(ins.size(), out_arg_names.size()); for (size_t i = 0; i < out_arg_names.size(); ++i) { -- GitLab