Unverified commit 5489216e, authored by chengduo, committed by GitHub

Clean build strategy (#18148)

* clean build_strategy
test=develop

* DataBalanceOpHandle has been removed
test=develop

* debug

* update build_strategy.
test=develop
Parent 14e1e165
@@ -59,47 +59,63 @@ struct BuildStrategy {
   enum class GradientScaleStrategy {
     kCoeffNumDevice = 0,
     kOne = 1,
+    // user can customize gradient scale to use, and just feed
+    // it into exe.run().
     kCustomized = 2,
   };
-  enum class OptimizeStrategy {
-    // To be Implemented,bruteforce, recursive compute unused var names.
-    kBruteForce = 0,
-    kControlFlowGraph = 1,  // use cfg_graph algorithm, faster speed.
-  };
   ReduceStrategy reduce_{ReduceStrategy::kAllReduce};
   GradientScaleStrategy gradient_scale_{GradientScaleStrategy::kCoeffNumDevice};
-  OptimizeStrategy strategy_{OptimizeStrategy::kControlFlowGraph};
   std::string debug_graphviz_path_{""};
-  bool fuse_elewise_add_act_ops_{false};
+  // Add dependency between backward ops and optimization ops, make sure that
+  // all the backward ops are finished before running the optimization ops.
+  // It might make the training speed of data parallelism faster.
+  bool enable_backward_optimizer_op_deps_{false};
+  // TODO(dev-paddle): enable_sequential_execution depends on
+  // kStaleProgramOpDescs, it is not appropriate, because kStaleProgramOpDescs
+  // will be removed in the near future.
+  bool enable_sequential_execution_{false};
+  bool remove_unnecessary_lock_{true};
+  // TODO(dev-paddle): cache_runtime_context may cause some models to hang up
+  // while running.
+  bool cache_runtime_context_{false};
+  // Operator fusion
+  // TODO(dev-paddle): fuse_elewise_add_act_ops may cause some models have
+  // cycle.
+  bool fuse_elewise_add_act_ops_{false};
+  // Fuse_all_optimizer_ops and fuse_all_reduce_ops require that gradients
+  // should not be sparse types
   bool fuse_all_optimizer_ops_{false};
   bool fuse_all_reduce_ops_{false};
-  bool enable_backward_optimizer_op_deps_{false};
+  // fuse_relu_depthwise_conv can fuse the `relu ->
+  // depthwise_conv`
   bool fuse_relu_depthwise_conv_{false};
+  // NOTE(zcd): In reduce mode, fusing broadcast ops may make the program
+  // faster. Because fusing broadcast OP equals delaying the execution of all
+  // broadcast Ops, in this case, all nccl streams are used only for reduce
+  // operations for a period of time.
+  bool fuse_broadcast_ops_{false};
+  // replace batch_norm with sync_batch_norm.
   bool sync_batch_norm_{false};
+  // mkldnn_enabled_op_types specify the operator type list to
+  // use MKLDNN acceleration. It is null in default, means
+  // that all the operators supported by MKLDNN will be
+  // accelerated. And it should not be set when
+  // FLAGS_use_mkldnn=false
+  std::unordered_set<std::string> mkldnn_enabled_op_types_;
   // FIXME(liuwei1031) disable memory_optimzie and enable_inplace in 1.4
   // to open them by default, we need to solve the fetch variable issue
+  // TODO(liuwei1031): memory_optimize depends on kStaleProgramOpDescs,
+  // it is not appropriate, because kStaleProgramOpDescs will be removed in the
+  // near future.
   bool memory_optimize_{false};
   bool enable_inplace_{false};
-  bool enable_sequential_execution_{false};
-  // NOTE(zcd): In reduce mode, fusing broadcast ops may make the program
-  // faster. Because fusing broadcast OP equals delaying the execution of all
-  // broadcast Ops, in this case, all nccl streams are used only for reduce
-  // operations for a period of time.
-  bool fuse_broadcast_ops_{false};
   // FIXME(zcd): is_distribution_ is a temporary field, because in pserver mode,
   // num_trainers is 1, so the current fields of build_strategy doesn't tell if
   // it's distributed model.
@@ -108,11 +124,8 @@ struct BuildStrategy {
   int num_trainers_{1};
   int trainer_id_{0};
   std::vector<std::string> trainers_endpoints_;
-  bool remove_unnecessary_lock_{true};
-  bool cache_runtime_context_{false};
-  std::unordered_set<std::string> mkldnn_enabled_op_types_;
+  // NCCL config
   size_t nccl_comm_num_{1};
   // The picture is here:
   // https://github.com/PaddlePaddle/Paddle/pull/17263#discussion_r285411396
...
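For reference, a minimal sketch of how the BuildStrategy fields in this hunk might be set from C++. The header path and the paddle::framework::details namespace are assumptions inferred from the companion execution_strategy.h hunk below, and the kReduce enum value does not appear in this excerpt; only the field names come from the diff itself.

#include "paddle/fluid/framework/details/build_strategy.h"

namespace fd = paddle::framework::details;

// Sketch: enable the fusion options documented in the new comments.
fd::BuildStrategy MakeBuildStrategy() {
  fd::BuildStrategy strategy;
  // kReduce is assumed here; the diff only shows the kAllReduce default.
  strategy.reduce_ = fd::BuildStrategy::ReduceStrategy::kReduce;
  // Per the NOTE(zcd) above, fusing broadcast ops in reduce mode can
  // make the program faster by delaying all broadcast Ops.
  strategy.fuse_broadcast_ops_ = true;
  // Per the new comment, these two require that no gradient is sparse.
  strategy.fuse_all_optimizer_ops_ = true;
  strategy.fuse_all_reduce_ops_ = true;
  return strategy;
}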
@@ -22,17 +22,24 @@ namespace details {
 struct ExecutionStrategy {
   enum ExecutorType { kDefault = 0, kExperimental = 1 };
+  // num_threads indicates the size of thread pool.
   size_t num_threads_{0};
   bool use_cuda_{true};
+  // Note that allow_op_delay is invalid now.
   bool allow_op_delay_{false};
-  // If we set this to 1, we will delete all variables when finish a batch. and
-  // this will loss 15%+ performance.
-  // Please be aware about this parameters.
+  // num_iteration_per_drop_scope indicates how many
+  // iterations the framework cleans up a local execution scope.
+  // In some models, the value of this parameter has a great
+  // influence on the performance(about 15%) of the program.
   size_t num_iteration_per_drop_scope_{1};
+  // At present, the kExperimental executor is the fastest in most models.
   ExecutorType type_{kExperimental};
+  // This debug option.
   bool dry_run_{false};
-  size_t num_iteration_per_run_{1};  // only use with async_ssa_graph_executor
-                                     // and pyreader with data queue
+  // only use with async_ssa_graph_executor
+  // and pyreader with data queue
+  size_t num_iteration_per_run_{1};
 };
 }  // namespace details
...
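Likewise, a small sketch of tuning ExecutionStrategy based only on the fields and new comments in this hunk; the header path is an assumption, and the values 4 and 100 are purely illustrative.

#include "paddle/fluid/framework/details/execution_strategy.h"

namespace fd = paddle::framework::details;

// Sketch: trade local-scope cleanup frequency for throughput.
fd::ExecutionStrategy MakeExecutionStrategy() {
  fd::ExecutionStrategy exec;
  exec.num_threads_ = 4;  // size of the executor thread pool
  exec.use_cuda_ = true;
  // Dropping local scopes every iteration (the default of 1) can cost
  // about 15% performance in some models; cleaning up less often keeps
  // variables alive longer but runs faster.
  exec.num_iteration_per_drop_scope_ = 100;
  // Per the new comment, kExperimental is the fastest in most models.
  exec.type_ = fd::ExecutionStrategy::kExperimental;
  return exec;
}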
@@ -86,18 +86,8 @@ class ReadOp : public framework::OperatorBase {
     reader->ReadNext(&ins);
     if (ins.empty()) {
-      VLOG(3) << "read empty data in";
-      if (Attr<bool>("throw_eof_exp")) {
       VLOG(3) << "throw_eof_exp";
       PADDLE_THROW_EOF();
-      } else {
-        ins.resize(out_arg_names.size());
-        for (auto& tensor : ins) {
-          // data type is not important for subsequent DataBalanceOpHandle
-          tensor.mutable_data<float>(framework::make_ddim({0}), dev_place);
-        }
-      }
-      VLOG(3) << "read empty data out";
     }
     PADDLE_ENFORCE_EQ(ins.size(), out_arg_names.size());
     for (size_t i = 0; i < out_arg_names.size(); ++i) {
...