// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

syntax = "proto2";
package paddle.fleet;

// Values accepted by DistributedStrategy.mode. This is a proto2 enum, so the
// numbering starts at 1 and there is no zero value.
enum Mode {
  COLLECTIVE = 1;
  PS = 2;
  PIPELINE = 3;
  // Support XPU and GPU computing server.
  HETER = 4;
}

// Settings carried in DistributedStrategy.recompute_configs.
message RecomputeConfig {
  // List of checkpoint identifiers; empty when unset.
  repeated string checkpoints = 1;
  // Disabled unless explicitly set.
  optional bool enable_offload = 2 [default = false];
  // NOTE(review): presumably the shape of the checkpoints above -- confirm
  // against the consumer of this message.
  repeated int32 checkpoint_shape = 3;
}
// Settings carried in DistributedStrategy.sharding_configs.
message ShardingConfig {
  // The default value is the name of field 2 below.
  optional string sharding_segment_strategy = 1
      [default = 'segment_broadcast_MB'];
  optional float segment_broadcast_MB = 2 [default = 32.0];
  // NOTE(review): presumably consulted by a segment strategy other than the
  // default one -- confirm against the consumer of this message.
  repeated string segment_anchors = 3;
  optional int32 sharding_degree = 4 [default = 8];
  optional int32 mp_degree = 5 [default = 1];
  optional int32 dp_degree = 6 [default = 1];
  optional bool hybrid_dp = 7 [default = false];
  optional int32 gradient_merge_acc_step = 8 [default = 1];
  optional bool optimize_offload = 9 [default = false];
  optional bool pp_allreduce_in_optimize = 10 [default = false];
  optional int32 pp_degree = 11 [default = 1];
}

// Settings carried in DistributedStrategy.hybrid_configs.
message HybridConfig {
  // Unlike the other degrees in this message, this one defaults to -1.
  optional int32 dp_degree = 1 [default = -1];
  optional int32 mp_degree = 2 [default = 1];
  optional int32 pp_degree = 3 [default = 1];
  optional int32 sharding_degree = 4 [default = 1];
}

// Settings carried in DistributedStrategy.amp_configs.
message AMPConfig {
  // Loss-scaling parameters.
  optional float init_loss_scaling = 1 [default = 32768.0];
  optional int32 incr_every_n_steps = 2 [default = 1000];
  optional int32 decr_every_n_nan_or_inf = 3 [default = 2];
  optional float incr_ratio = 4 [default = 2.0];
  optional float decr_ratio = 5 [default = 0.8];
  optional bool use_dynamic_loss_scaling = 6 [default = true];

  // Custom lists; all are empty when unset.
  repeated string custom_white_list = 7;
  repeated string custom_black_list = 8;
  repeated string custom_black_varnames = 9;

  // fp16 switches.
  optional bool use_pure_fp16 = 10 [default = false];
  optional bool use_fp16_guard = 11 [default = true];
}
// Settings carried in DistributedStrategy.localsgd_configs.
// Both fields default to 1.
message LocalSGDConfig {
  optional int32 k_steps = 1 [default = 1];
  optional int32 begin_step = 2 [default = 1];
}
// Settings carried in DistributedStrategy.adaptive_localsgd_configs.
// Both fields default to 1.
message AdaptiveLocalSGDConfig {
  optional int32 init_k_steps = 1 [default = 1];
  optional int32 begin_step = 2 [default = 1];
}

// Settings carried in DistributedStrategy.gradient_merge_configs.
message GradientMergeConfig {
  optional int32 k_steps = 1 [default = 1];
  // Enabled unless explicitly cleared.
  optional bool avg = 2 [default = true];
}

// Settings carried in DistributedStrategy.dgc_configs.
message DGCConfig {
  optional int32 rampup_begin_step = 1 [default = 0];
  optional int32 rampup_step = 2 [default = 1];
  // No default is declared; the list is empty when unset.
  repeated float sparsity = 3;
}

// Settings carried in DistributedStrategy.lars_configs.
message LarsConfig {
  optional float lars_coeff = 1 [default = 0.001];
  optional float lars_weight_decay = 2 [default = 0.0005];
  optional float epsilon = 3 [default = 0.0];
  // Empty when unset.
  repeated string exclude_from_weight_decay = 4;
}

// Settings carried in DistributedStrategy.lamb_configs.
message LambConfig {
  optional float lamb_weight_decay = 1 [default = 0.01];
  // Empty when unset.
  repeated string exclude_from_weight_decay = 2;
}

// Settings carried in DistributedStrategy.build_strategy.
// Every flag defaults to false except enable_backward_optimizer_op_deps and
// fuse_bn_add_act_ops, which default to true.
message BuildStrategy {
  optional bool enable_sequential_execution = 1 [default = false];
  optional bool fuse_elewise_add_act_ops = 2 [default = false];
  optional bool fuse_bn_act_ops = 3 [default = false];
  optional bool fuse_relu_depthwise_conv = 4 [default = false];
  optional bool fuse_broadcast_ops = 5 [default = false];
  optional bool fuse_all_optimizer_ops = 6 [default = false];
  optional bool enable_inplace = 7 [default = false];
  optional bool enable_backward_optimizer_op_deps = 8 [default = true];
  optional bool cache_runtime_context = 9 [default = false];
  optional bool fuse_bn_add_act_ops = 10 [default = true];
  optional bool enable_auto_fusion = 11 [default = false];
  optional bool enable_addto = 12 [default = false];
}
// Settings carried in DistributedStrategy.execution_strategy.
message ExecutionStrategy {
  optional int32 num_threads = 1 [default = 1];
  optional int32 num_iteration_per_drop_scope = 2 [default = 10];
  optional int32 num_iteration_per_run = 3 [default = 1];
  // Disabled unless explicitly set.
  optional bool use_thread_barrier = 4 [default = false];
}

// Settings carried in DistributedStrategy.a_sync_configs.
message AsyncConfig {
  // Defaults to -1.
  // NOTE(review): -1 looks like a sentinel value -- confirm its meaning
  // against the consumer of this message.
  optional int32 k_steps = 1 [default = -1];
  optional int32 max_merge_var_num = 2 [default = 1];
  optional int32 send_queue_size = 3 [default = 16];
  optional bool independent_recv_thread = 4 [default = false];
  optional int32 min_send_grad_num_before_recv = 5 [default = 1];
  optional int32 thread_pool_size = 6 [default = 1];
  optional int32 send_wait_times = 7 [default = 1];
  optional bool runtime_split_send_recv = 8 [default = false];
  optional bool launch_barrier = 9 [default = true];
  optional string heter_worker_device_guard = 10 [default = 'cpu'];
  optional int32 lr_decay_steps = 11 [default = 10];
  // Declared as int32 rather than bool; defaults to 0.
  optional int32 use_ps_gpu = 12 [default = 0];
}

// Settings carried in DistributedStrategy.pipeline_configs.
message PipelineConfig {
  optional int32 micro_batch_size = 1 [default = 1];
  optional int32 accumulate_steps = 2 [default = 1];
  // Schedule name as a free-form string; defaults to '1F1B'.
  optional string schedule_mode = 3 [default = '1F1B'];
}
// Settings carried in DistributedStrategy.tensor_parallel_configs.
message TensorParallelConfig {
  optional int32 tensor_parallel_degree = 1 [default = 1];
  // Defaults to -1.
  // NOTE(review): -1 looks like a "no explicit seed" sentinel -- confirm
  // against the consumer of this message.
  optional int32 tensor_init_seed = 2 [default = -1];
}

// Top-level strategy message. It groups three field-number ranges:
//   1-32     mode, feature switches and scalar options
//   101-113  per-feature configuration messages
//   201-202  build and execution strategies
// DistributedJobInfo.strategy embeds this message.
message DistributedStrategy {
  // Mode, feature switches and scalar options.
  optional Mode mode = 1 [default = COLLECTIVE];
  optional bool amp = 2 [default = false];
  optional bool recompute = 3 [default = false];
  optional bool localsgd = 4 [default = false];
  optional bool dgc = 5 [default = false];
  optional bool gradient_merge = 6 [default = false];
  optional bool lars = 7 [default = false];
  optional bool lamb = 8 [default = false];
  optional bool pipeline = 9 [default = false];
  optional bool elastic = 10 [default = false];
  optional bool auto = 11 [default = false];
  optional bool a_sync = 12 [default = true];
  optional bool sync_nccl_allreduce = 13 [default = true];
  optional int32 nccl_comm_num = 14 [default = 1];
  optional bool use_hierarchical_allreduce = 15 [default = false];
  optional int32 hierarchical_allreduce_inter_nranks = 16 [default = 1];
  optional bool sync_batch_norm = 17 [default = false];
  optional bool fuse_all_reduce_ops = 18 [default = true];
  optional int32 fuse_grad_size_in_MB = 19 [default = 32];
  optional float fuse_grad_size_in_TFLOPS = 20 [default = 50];
  optional bool cudnn_exhaustive_search = 21 [default = false];
  optional int32 conv_workspace_size_limit = 22 [default = 512];
  optional bool cudnn_batchnorm_spatial_persistent = 23 [default = false];
  optional bool adaptive_localsgd = 24 [default = false];
  optional bool fp16_allreduce = 25 [default = false];
  optional bool sharding = 26 [default = false];
  optional float last_comm_group_size_MB = 27 [default = 1];
  optional bool find_unused_parameters = 28 [default = false];
  optional bool tensor_parallel = 29 [default = false];
  optional bool without_graph_optimization = 30 [default = false];
  optional int32 fuse_grad_size_in_num = 31 [default = 1];
  optional bool calc_comm_same_stream = 32 [default = false];

  // Per-feature configuration messages.
  optional RecomputeConfig recompute_configs = 101;
  optional AMPConfig amp_configs = 102;
  optional LocalSGDConfig localsgd_configs = 103;
  optional GradientMergeConfig gradient_merge_configs = 104;
  optional DGCConfig dgc_configs = 105;
  optional PipelineConfig pipeline_configs = 106;
  optional AsyncConfig a_sync_configs = 107;
  optional LarsConfig lars_configs = 108;
  optional LambConfig lamb_configs = 109;
  optional AdaptiveLocalSGDConfig adaptive_localsgd_configs = 110;
  optional ShardingConfig sharding_configs = 111;
  optional HybridConfig hybrid_configs = 112;
  optional TensorParallelConfig tensor_parallel_configs = 113;

  // Build and execution strategies.
  optional BuildStrategy build_strategy = 201;
  optional ExecutionStrategy execution_strategy = 202;
}

// Describes a distributed job: worker/server counts and addresses, program
// strings, the optimizer name and the strategy in use.
message DistributedJobInfo {
  optional int32 worker_num = 1;
  optional int32 server_num = 2;
  repeated string worker_ips = 3;
  repeated string server_endpoints = 4;
  optional string origin_startup = 5;
  // Without backpropagation and optimization.
  optional string origin_main = 6;
  // With backpropagation and optimization.
  optional string distributed_main = 7;
  // Optimizer name.
  optional string optimizer_name = 8;
  optional DistributedStrategy strategy = 101;
}