diff --git a/paddle/fluid/framework/details/build_strategy.h b/paddle/fluid/framework/details/build_strategy.h index 8eaace17bb1a59bc5033e632511886c7630d0cd2..ae28a2cc6f9a4979eabff69a36b5fac87c096f87 100644 --- a/paddle/fluid/framework/details/build_strategy.h +++ b/paddle/fluid/framework/details/build_strategy.h @@ -59,47 +59,63 @@ struct BuildStrategy { enum class GradientScaleStrategy { kCoeffNumDevice = 0, kOne = 1, + // The user can customize the gradient scale to use and just feed + // it into exe.run(). kCustomized = 2, }; - enum class OptimizeStrategy { - // To be Implemented,bruteforce, recursive compute unused var names. - kBruteForce = 0, - kControlFlowGraph = 1, // use cfg_graph algorithm, faster speed. - }; - ReduceStrategy reduce_{ReduceStrategy::kAllReduce}; GradientScaleStrategy gradient_scale_{GradientScaleStrategy::kCoeffNumDevice}; - OptimizeStrategy strategy_{OptimizeStrategy::kControlFlowGraph}; std::string debug_graphviz_path_{""}; - bool fuse_elewise_add_act_ops_{false}; + // Add dependency between backward ops and optimization ops to make sure that + // all the backward ops are finished before running the optimization ops. + // It might make the training speed of data parallelism faster. + bool enable_backward_optimizer_op_deps_{false}; + // TODO(dev-paddle): enable_sequential_execution depends on + // kStaleProgramOpDescs, which is inappropriate because kStaleProgramOpDescs + // will be removed in the near future. + bool enable_sequential_execution_{false}; + bool remove_unnecessary_lock_{true}; + // TODO(dev-paddle): cache_runtime_context may cause some models to hang up + // while running. + bool cache_runtime_context_{false}; + // Operator fusion + // TODO(dev-paddle): fuse_elewise_add_act_ops may cause some models to have + // cycles. 
+ bool fuse_elewise_add_act_ops_{false}; + // fuse_all_optimizer_ops and fuse_all_reduce_ops require that gradients + // are not sparse types. bool fuse_all_optimizer_ops_{false}; - bool fuse_all_reduce_ops_{false}; - - bool enable_backward_optimizer_op_deps_{false}; - + // fuse_relu_depthwise_conv can fuse the `relu -> + // depthwise_conv` bool fuse_relu_depthwise_conv_{false}; - + // NOTE(zcd): In reduce mode, fusing broadcast ops may make the program + // faster. Because fusing broadcast OP equals delaying the execution of all + // broadcast Ops, in this case, all nccl streams are used only for reduce + // operations for a period of time. + bool fuse_broadcast_ops_{false}; + // replace batch_norm with sync_batch_norm. bool sync_batch_norm_{false}; + // mkldnn_enabled_op_types specifies the operator type list to + // use MKLDNN acceleration. It is empty by default, which means + // that all the operators supported by MKLDNN will be + // accelerated. It should not be set when + // FLAGS_use_mkldnn=false. + std::unordered_set mkldnn_enabled_op_types_; + // FIXME(liuwei1031) disable memory_optimzie and enable_inplace in 1.4 // to open them by default, we need to solve the fetch variable issue + // TODO(liuwei1031): memory_optimize depends on kStaleProgramOpDescs, + // which is inappropriate because kStaleProgramOpDescs will be removed in the + // near future. bool memory_optimize_{false}; - bool enable_inplace_{false}; - bool enable_sequential_execution_{false}; - - // NOTE(zcd): In reduce mode, fusing broadcast ops may make the program - // faster. Because fusing broadcast OP equals delaying the execution of all - // broadcast Ops, in this case, all nccl streams are used only for reduce - // operations for a period of time. - bool fuse_broadcast_ops_{false}; - // FIXME(zcd): is_distribution_ is a temporary field, because in pserver mode, // num_trainers is 1, so the current fields of build_strategy doesn't tell if // it's distributed model. 
@@ -108,11 +124,8 @@ struct BuildStrategy { int num_trainers_{1}; int trainer_id_{0}; std::vector trainers_endpoints_; - bool remove_unnecessary_lock_{true}; - - bool cache_runtime_context_{false}; - std::unordered_set mkldnn_enabled_op_types_; + // NCCL config size_t nccl_comm_num_{1}; // The picture is here: // https://github.com/PaddlePaddle/Paddle/pull/17263#discussion_r285411396 diff --git a/paddle/fluid/framework/details/execution_strategy.h b/paddle/fluid/framework/details/execution_strategy.h index 4f074323f9bb8f0aa909c4a95ae04464bdaeb9ad..68de1580e20a0221b9c9855c50849369eaaff871 100644 --- a/paddle/fluid/framework/details/execution_strategy.h +++ b/paddle/fluid/framework/details/execution_strategy.h @@ -22,17 +22,24 @@ namespace details { struct ExecutionStrategy { enum ExecutorType { kDefault = 0, kExperimental = 1 }; + // num_threads indicates the size of the thread pool. size_t num_threads_{0}; bool use_cuda_{true}; + // Note that allow_op_delay is invalid now. bool allow_op_delay_{false}; - // If we set this to 1, we will delete all variables when finish a batch. and - // this will loss 15%+ performance. - // Please be aware about this parameters. + // num_iteration_per_drop_scope indicates after how many + // iterations the framework cleans up a local execution scope. + // In some models, the value of this parameter has a great + // influence on the performance (about 15%) of the program. size_t num_iteration_per_drop_scope_{1}; + // At present, the kExperimental executor is the fastest in most models. ExecutorType type_{kExperimental}; + // This is a debug option. 
bool dry_run_{false}; - size_t num_iteration_per_run_{1}; // only use with async_ssa_graph_executor - // and pyreader with data queue + + // Only used with async_ssa_graph_executor + // and pyreader with data queue. + size_t num_iteration_per_run_{1}; }; } // namespace details diff --git a/paddle/fluid/operators/reader/read_op.cc b/paddle/fluid/operators/reader/read_op.cc index 33a69ad5fec2b850cae070ca3f113f12c4e835f9..5fa6bf73aa839ffcd307a15eb4fc8dd5a2ea16af 100644 --- a/paddle/fluid/operators/reader/read_op.cc +++ b/paddle/fluid/operators/reader/read_op.cc @@ -86,18 +86,8 @@ class ReadOp : public framework::OperatorBase { reader->ReadNext(&ins); if (ins.empty()) { - VLOG(3) << "read empty data in"; - if (Attr("throw_eof_exp")) { - VLOG(3) << "throw_eof_exp"; - PADDLE_THROW_EOF(); - } else { - ins.resize(out_arg_names.size()); - for (auto& tensor : ins) { - // data type is not important for subsequent DataBalanceOpHandle - tensor.mutable_data(framework::make_ddim({0}), dev_place); - } - } - VLOG(3) << "read empty data out"; + VLOG(3) << "throw_eof_exp"; + PADDLE_THROW_EOF(); } PADDLE_ENFORCE_EQ(ins.size(), out_arg_names.size()); for (size_t i = 0; i < out_arg_names.size(); ++i) {