// build_strategy.h
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include <memory>
#include <string>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/ir/pass_builder.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/enforce.h"

#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
#include "paddle/fluid/platform/nccl_helper.h"
#endif

namespace paddle {
namespace framework {
namespace details {

struct BuildStrategy {
C
chengduo 已提交
36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55
  // ParallelExecutor supports two modes of ReduceStrategy, kAllReduce and
  // kReduce, for CPU and GPU. If you use kAllReduce, different threads
  // optimize their parameters separately. If you use kReduce, the optimizations
  // of parameters are distributed to different threads.
  // For example, a model has 100 parameters and is running with four threads,
  // if you choose kAllReduce, every thread is to optimize 100 parameters
  // separately, if you choose kReduce, every thread is to optimize 25
  // parameters.
  // Of particular note is, if you use kReduce when using CPU training,
  // all the parameters are shared between different threads. This feature will
  // save memory.
  // FIXME(zcd): The result of the two modes(kAllReduce and kReduce) maybe not
  // equal for GPU. Because, the result of the different order of summing maybe
  // different, for example, the result of `a+b+c+d` may be different with the
  // result of `c+a+b+d`.
  // For GPU, the implementation of kAllReduce and kReduce is adopted NCCL,
  // so the result of kAllReduce and kReduce maybe not equal.
  // For CPU, if you want to fix the order of summing to make the result
  // of kAllReduce and kReduce no diff, you can add
  // `FLAGS_cpu_deterministic=true` to env.
Y
yuyang18 已提交
56 57 58 59 60 61 62 63
  enum class ReduceStrategy { kAllReduce = 0, kReduce = 1 };

  enum class GradientScaleStrategy {
    kCoeffNumDevice = 0,
    kOne = 1,
    kCustomized = 2,
  };

D
dzhwinter 已提交
64 65 66 67 68 69
  enum class OptimizeStrategy {
    // To be Implemented,bruteforce, recursive compute unused var names.
    kBruteForce = 0,
    kControlFlowGraph = 1,  // use cfg_graph algorithm, faster speed.
  };

Y
yuyang18 已提交
70
  ReduceStrategy reduce_{ReduceStrategy::kAllReduce};
Y
yuyang18 已提交
71
  GradientScaleStrategy gradient_scale_{GradientScaleStrategy::kCoeffNumDevice};
D
dzhwinter 已提交
72
  OptimizeStrategy strategy_{OptimizeStrategy::kControlFlowGraph};
Y
yuyang18 已提交
73 74

  std::string debug_graphviz_path_{""};
F
fengjiayi 已提交
75

C
chengduo 已提交
76 77
  bool fuse_elewise_add_act_ops_{false};

C
chengduo 已提交
78 79
  bool fuse_all_optimizer_ops_{false};

C
chengduo 已提交
80 81
  bool fuse_all_reduce_ops_{false};

82 83
  bool fuse_relu_depthwise_conv_{false};

Q
qingqing01 已提交
84 85
  bool sync_batch_norm_{false};

86
  bool memory_optimize_{true};
D
dzhwinter 已提交
87 88 89
  // TODO(dzhwinter):
  // make enable_inplace, memory_optimize_
  // memory_early_delete_ true by default
90
  bool enable_inplace_{true};
D
dzhwinter 已提交
91

S
sneaxiy 已提交
92
  bool enable_sequential_execution_{false};
S
sneaxiy 已提交
93

94 95
  bool fuse_broadcast_op_{false};

96 97 98 99
  // FIXME(zcd): is_distribution_ is a temporary field, because in pserver mode,
  // num_trainers is 1, so the current fields of build_strategy doesn't tell if
  // it's distributed model.
  bool is_distribution_{false};
100
  int num_trainers_{1};
101 102
  int trainer_id_{0};
  std::vector<std::string> trainers_endpoints_;
S
sneaxiy 已提交
103
  bool remove_unnecessary_lock_{true};
S
sneaxiy 已提交
104

X
Xin Pan 已提交
105 106 107 108 109
  // NOTE:
  // Before you add new options, think if it's a general strategy that works
  // with other strategy. If not, the strategy should be created through
  // CreatePassesFromStrategy and the pass can be managed separately.

X
Xin Pan 已提交
110
  // User normally doesn't need to call this API.
X
Xin Pan 已提交
111
  // The PassBuilder allows for more customized insert, remove of passes
X
Xin Pan 已提交
112 113 114
  // from python side.
  // A new PassBuilder is created based on configs defined above and
  // passes are owned by the PassBuilder.
115
  std::shared_ptr<ir::PassBuilder> CreatePassesFromStrategy(
X
Xin Pan 已提交
116 117 118
      bool finalize_strategy) const;

  bool IsFinalized() const { return is_finalized_; }
119

120 121
  bool IsMultiDevPass(const std::string &pass_name) const;

X
Xin Pan 已提交
122 123
  // Apply the passes built by the pass_builder_. The passes will be
  // applied to the Program and output an ir::Graph.
124 125 126 127
  ir::Graph *Apply(ir::Graph *graph, const std::vector<platform::Place> &places,
                   const std::string &loss_var_name,
                   const std::vector<Scope *> &local_scopes,
                   const size_t &nranks,
P
peizhilin 已提交
128
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
129 130
                   const bool use_cuda,
                   platform::NCCLContextMap *nccl_ctxs) const;
131
#else
132
                   const bool use_cuda) const;
133 134
#endif

135 136 137 138 139 140 141
  // If set true, ParallelExecutor would build the main_program into multiple
  // graphs,
  // each of the graphs would run with one device. This approach can achieve
  // better performance
  // on some scenarios.
  mutable bool enable_parallel_graph_ = false;

142
 private:
X
Xin Pan 已提交
143
  mutable bool is_finalized_ = false;
144
  mutable std::shared_ptr<ir::PassBuilder> pass_builder_;
Y
yuyang18 已提交
145 146 147 148 149
};

}  // namespace details
}  // namespace framework
}  // namespace paddle