// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
// Copyright (c) 2021 NVIDIA Corporation. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

syntax = "proto2";
package paddle.fleet;

// Training mode selected through DistributedStrategy.mode (field 1), whose
// declared default is COLLECTIVE.
// NOTE: numbering starts at 1; no zero value is declared (legal in proto2).
enum Mode {
  COLLECTIVE = 1;
  PS = 2;
  PIPELINE = 3;
  HETER = 4; // support XPU and GPU computing server
}

// Option group carried by DistributedStrategy.recompute_configs (field 101).
message RecomputeConfig {
  // Checkpoint identifiers, given as strings.
  repeated string checkpoints = 1;
  // Offload switch; off unless set explicitly.
  optional bool enable_offload = 2 [ default = false ];
  // NOTE(review): presumably the shape of the checkpoints; the exact layout
  // is not visible in this file -- confirm against the consumer.
  repeated int32 checkpoint_shape = 3;
}

// Option group carried by DistributedStrategy.sharding_configs (field 111).
message ShardingConfig {
  // Name of the segmentation strategy; defaults to 'segment_broadcast_MB'.
  optional string sharding_segment_strategy = 1
      [ default = 'segment_broadcast_MB' ];
  // Defaults to 32.0. NOTE(review): the MB suffix suggests megabytes -- confirm.
  optional float segment_broadcast_MB = 2 [ default = 32.0 ];
  repeated string segment_anchors = 3;
  // Note: sharding_degree defaults to 8 here, whereas
  // HybridConfig.sharding_degree defaults to 1.
  optional int32 sharding_degree = 4 [ default = 8 ];
  optional int32 mp_degree = 5 [ default = 1 ];
  optional int32 dp_degree = 6 [ default = 1 ];
  optional bool hybrid_dp = 7 [ default = false ];
  optional int32 gradient_merge_acc_step = 8 [ default = 1 ];
  optional bool optimize_offload = 9 [ default = false ];
  optional bool pp_allreduce_in_optimize = 10 [ default = false ];
  optional int32 pp_degree = 11 [ default = 1 ];
  optional bool optimize_cast = 12 [ default = false ];
  // Optimizer sharding. Temporary plan; may be deprecated.
  optional bool _dp_as_optimizer_sharding = 13 [ default = false ];
}

// Option group carried by DistributedStrategy.hybrid_configs (field 112).
// dp_degree defaults to -1 while every other degree defaults to 1.
// NOTE(review): -1 presumably means "derive automatically" -- confirm.
message HybridConfig {
  optional int32 dp_degree = 1 [ default = -1 ];
  optional int32 mp_degree = 2 [ default = 1 ];
  optional int32 pp_degree = 3 [ default = 1 ];
  optional int32 sharding_degree = 4 [ default = 1 ];
}

// Option group carried by DistributedStrategy.amp_configs (field 102).
message AMPConfig {
  // Loss-scaling parameters and their declared defaults.
  optional float init_loss_scaling = 1 [ default = 32768.0 ];
  optional int32 incr_every_n_steps = 2 [ default = 1000 ];
  optional int32 decr_every_n_nan_or_inf = 3 [ default = 2 ];
  optional float incr_ratio = 4 [ default = 2.0 ];
  optional float decr_ratio = 5 [ default = 0.8 ];
  optional bool use_dynamic_loss_scaling = 6 [ default = true ];
  // User-supplied white/black lists; empty unless populated.
  repeated string custom_white_list = 7;
  repeated string custom_black_list = 8;
  repeated string custom_black_varnames = 9;
  // Pure-fp16 switch (default off) and fp16 guard switch (default on).
  optional bool use_pure_fp16 = 10 [ default = false ];
  optional bool use_fp16_guard = 11 [ default = true ];
}

// Option group carried by DistributedStrategy.localsgd_configs (field 103).
// Both fields default to 1.
message LocalSGDConfig {
  optional int32 k_steps = 1 [ default = 1 ];
  optional int32 begin_step = 2 [ default = 1 ];
}

// Option group carried by DistributedStrategy.adaptive_localsgd_configs
// (field 110). Both fields default to 1.
message AdaptiveLocalSGDConfig {
  optional int32 init_k_steps = 1 [ default = 1 ];
  optional int32 begin_step = 2 [ default = 1 ];
}

// Option group carried by DistributedStrategy.gradient_merge_configs
// (field 104).
message GradientMergeConfig {
  optional int32 k_steps = 1 [ default = 1 ];
  // Averaging switch; on by default.
  optional bool avg = 2 [ default = true ];
}

// Option group carried by DistributedStrategy.dgc_configs (field 105).
message DGCConfig {
  optional int32 rampup_begin_step = 1 [ default = 0 ];
  optional int32 rampup_step = 2 [ default = 1 ];
  // List of sparsity values; empty unless populated.
  repeated float sparsity = 3;
}

// Option group carried by DistributedStrategy.lars_configs (field 108).
message LarsConfig {
  optional float lars_coeff = 1 [ default = 0.001 ];
  optional float lars_weight_decay = 2 [ default = 0.0005 ];
  optional float epsilon = 3 [ default = 0.0 ];
  // Names excluded from weight decay; empty unless populated.
  repeated string exclude_from_weight_decay = 4;
}

// Option group carried by DistributedStrategy.lamb_configs (field 109).
message LambConfig {
  optional float lamb_weight_decay = 1 [ default = 0.01 ];
  // Names excluded from weight decay; empty unless populated.
  repeated string exclude_from_weight_decay = 2;
}

// Carried by DistributedStrategy.build_strategy (field 201).
// Every switch defaults to false except enable_backward_optimizer_op_deps
// and fuse_bn_add_act_ops, which default to true.
message BuildStrategy {
  optional bool enable_sequential_execution = 1 [ default = false ];
  optional bool fuse_elewise_add_act_ops = 2 [ default = false ];
  optional bool fuse_bn_act_ops = 3 [ default = false ];
  optional bool fuse_relu_depthwise_conv = 4 [ default = false ];
  optional bool fuse_broadcast_ops = 5 [ default = false ];
  optional bool fuse_all_optimizer_ops = 6 [ default = false ];
  optional bool enable_inplace = 7 [ default = false ];
  optional bool enable_backward_optimizer_op_deps = 8 [ default = true ];
  optional bool cache_runtime_context = 9 [ default = false ];
  optional bool fuse_bn_add_act_ops = 10 [ default = true ];
  optional bool enable_auto_fusion = 11 [ default = false ];
  optional bool enable_addto = 12 [ default = false ];
  optional bool fix_op_run_order = 13 [ default = false ];
  optional bool allow_cuda_graph_capture = 14 [ default = false ];
}

// Carried by DistributedStrategy.execution_strategy (field 202).
message ExecutionStrategy {
  optional int32 num_threads = 1 [ default = 1 ];
  optional int32 num_iteration_per_drop_scope = 2 [ default = 10 ];
  optional int32 num_iteration_per_run = 3 [ default = 1 ];
  optional bool use_thread_barrier = 4 [ default = false ];
}

// Option group carried by DistributedStrategy.gradient_scale_configs
// (field 203).
message GradientScaleConfig {
  // Allowed values: 'avg', 'sum', 'customized'.
  // If 'avg', loss@grad will be divided by the number of devices, that is,
  // the gradient will be accumulated and averaged among multiple devices.
  // If 'sum', the gradient will be accumulated among multiple devices.
  // ('customized' is listed as allowed but is not described here.)
  optional string scale_strategy = 1 [ default = 'avg' ];
  // The scale_gradient flag determines where the averaging happens.
  // If scale_gradient is false, loss@Grad is averaged before grad merge.
  // Otherwise, grad merge is done first and the merged grad is averaged.
  optional bool scale_gradient = 2 [ default = false ];
}

// Option group carried by DistributedStrategy.a_sync_configs (field 107).
message AsyncConfig {
  // Defaults to -1.
  // NOTE(review): the meaning of a negative value is not visible here.
  optional int32 k_steps = 1 [ default = -1 ];
  optional int32 max_merge_var_num = 2 [ default = 1 ];
  optional int32 send_queue_size = 3 [ default = 16 ];
  optional bool independent_recv_thread = 4 [ default = false ];
  optional int32 min_send_grad_num_before_recv = 5 [ default = 1 ];
  optional int32 thread_pool_size = 6 [ default = 1 ];
  optional int32 send_wait_times = 7 [ default = 1 ];
  optional bool runtime_split_send_recv = 8 [ default = false ];
  optional bool launch_barrier = 9 [ default = true ];
  optional string heter_worker_device_guard = 10 [ default = 'cpu' ];
  optional int32 lr_decay_steps = 11 [ default = 10 ];
  // Declared as int32 (default 0), not bool.
  optional int32 use_ps_gpu = 12 [ default = 0 ];
}

// Option group carried by DistributedStrategy.trainer_desc_configs
// (field 114). No field declares an explicit default.
message TrainerDescConfig {
  optional string dump_fields_path = 1;
  repeated string dump_fields = 2;
  repeated string dump_param = 3;
  repeated string stat_var_names = 4;
}

// Option group carried by DistributedStrategy.pipeline_configs (field 106).
message PipelineConfig {
  optional int32 micro_batch_size = 1 [ default = 1 ];
  optional int32 accumulate_steps = 2 [ default = 1 ];
  // Schedule name; defaults to '1F1B'.
  optional string schedule_mode = 3 [ default = '1F1B' ];
  optional bool p2p_cache_shape = 4 [ default = true ];
}

// Option group carried by DistributedStrategy.tensor_parallel_configs
// (field 113).
message TensorParallelConfig {
  optional int32 tensor_parallel_degree = 1 [ default = 1 ];
  // Defaults to -1.
  // NOTE(review): presumably "no fixed seed" -- confirm with the consumer.
  optional int32 tensor_init_seed = 2 [ default = -1 ];
}

// Kind of table, used by TableParameter.type.
// PS_SPARSE_TABLE is the first declared value, so in proto2 it is what an
// unset TableParameter.type reads as.
enum TableType {
  PS_SPARSE_TABLE = 0;
  PS_DENSE_TABLE = 1;
}

// Table description carried by DistributedStrategy.downpour_table_param
// (field 115). No field declares an explicit default.
message TableParameter {
  optional uint64 table_id = 1;
  optional string table_class = 2;
  optional uint64 shard_num = 3;
  // Sparse or dense; see TableType.
  optional TableType type = 4;
  // Accessor settings; TableAccessorParameter is declared later in this file.
  optional TableAccessorParameter accessor = 5;
}

// Accessor settings referenced from TableParameter.accessor.
message TableAccessorParameter {
  optional string accessor_class = 1;
  optional SGDParameter embed_sgd_param = 2;
  optional SGDParameter embedx_sgd_param = 3;
  // For a sparse table this is the field size of one value; for a dense
  // table it is the total number of values.
  optional uint32 fea_dim = 4;
  // embedx feature size
  optional uint32 embedx_dim = 5;
  // embedx feature create threshold
  optional uint32 embedx_threshold = 6;
  optional CtrAccessorParameter ctr_accessor_param = 7;
}

// SGD settings used by TableAccessorParameter.embed_sgd_param and
// TableAccessorParameter.embedx_sgd_param. Only an adagrad rule is declared.
// TODO(guanqun): add NaiveSGD/Adam...
message SGDParameter {
  optional string name = 1;
  optional SGDRuleParameter adagrad = 2;
}

// Rule parameters referenced from SGDParameter.adagrad.
// Only initial_range declares an explicit default (0).
message SGDRuleParameter {
  optional double learning_rate = 1;
  optional double initial_g2sum = 2;
  optional double initial_range = 3 [ default = 0 ];
  // NOTE(review): presumably a [lower, upper] pair -- confirm with the consumer.
  repeated float weight_bounds = 4;
}

// CTR accessor settings referenced from
// TableAccessorParameter.ctr_accessor_param.
message CtrAccessorParameter {
  // Coefficient used to calculate show_click_score.
  optional float nonclk_coeff = 1;
  // Coefficient used to calculate show_click_score.
  optional float click_coeff = 2;
  // A feature can be saved when show_click_score > base_threshold.
  optional float base_threshold = 3;
  // A feature can be saved when delta_score > delta_threshold.
  optional float delta_threshold = 4;
  // A feature can be saved when unseen_day < delta_keep_days.
  optional float delta_keep_days = 5;
  // After a day, show/click is updated to show/click * show_click_decay_rate.
  optional float show_click_decay_rate = 6;
  // Threshold to shrink a feasign.
  optional float delete_threshold = 7;
  optional float delete_after_unseen_days = 8;
  optional int32 ssd_unseenday_threshold = 9;
}

// File-system client settings carried by DistributedStrategy.fs_client_param
// (field 116). All fields are plain strings with no explicit default.
message FsClientParameter {
  optional string uri = 1;
  optional string user = 2;
  // NOTE(review): stored as a plain string field in the message.
  optional string passwd = 3;
  optional string hadoop_bin = 4;
}

// Top-level strategy message; referenced by DistributedJobInfo.strategy.
// Field numbers fall into three ranges:
//   1-36     mode, feature switches and scalar tuning options
//   101-116  per-feature option groups (the *Config / *Parameter messages)
//   201-203  build/execution strategies and gradient scaling
message DistributedStrategy {
  // Mode, feature switches and scalar options
  optional Mode mode = 1 [ default = COLLECTIVE ];
  optional bool amp = 2 [ default = false ];
  optional bool recompute = 3 [ default = false ];
  optional bool localsgd = 4 [ default = false ];
  optional bool dgc = 5 [ default = false ];
  optional bool gradient_merge = 6 [ default = false ];
  optional bool lars = 7 [ default = false ];
  optional bool lamb = 8 [ default = false ];
  optional bool pipeline = 9 [ default = false ];
  optional bool elastic = 10 [ default = false ];
  // NOTE: `auto` is a C++ keyword, so the generated C++ accessor name is
  // adjusted by protoc.
  optional bool auto = 11 [ default = false ];
  optional bool a_sync = 12 [ default = true ];
  optional bool sync_nccl_allreduce = 13 [ default = true ];
  optional int32 nccl_comm_num = 14 [ default = 1 ];
  optional bool use_hierarchical_allreduce = 15 [ default = false ];
  optional int32 hierarchical_allreduce_inter_nranks = 16 [ default = 1 ];
  optional bool sync_batch_norm = 17 [ default = false ];
  optional bool fuse_all_reduce_ops = 18 [ default = true ];
  optional int32 fuse_grad_size_in_MB = 19 [ default = 32 ];
  optional float fuse_grad_size_in_TFLOPS = 20 [ default = 50 ];
  optional bool cudnn_exhaustive_search = 21 [ default = false ];
  optional int32 conv_workspace_size_limit = 22 [ default = 512 ];
  optional bool cudnn_batchnorm_spatial_persistent = 23 [ default = false ];
  optional bool adaptive_localsgd = 24 [ default = false ];
  optional bool fp16_allreduce = 25 [ default = false ];
  optional bool sharding = 26 [ default = false ];
  optional float last_comm_group_size_MB = 27 [ default = 1 ];
  optional bool find_unused_parameters = 28 [ default = false ];
  optional bool tensor_parallel = 29 [ default = false ];
  optional bool without_graph_optimization = 30 [ default = false ];
  optional int32 fuse_grad_size_in_num = 31 [ default = 8 ];
  optional bool calc_comm_same_stream = 32 [ default = false ];
  optional bool asp = 33 [ default = false ];
  optional bool fuse_grad_merge = 34 [ default = false ];
  optional bool semi_auto = 35 [ default = false ];
  optional bool adam_d2sum = 36 [ default = true ];

  // Per-feature option groups
  optional RecomputeConfig recompute_configs = 101;
  optional AMPConfig amp_configs = 102;
  optional LocalSGDConfig localsgd_configs = 103;
  optional GradientMergeConfig gradient_merge_configs = 104;
  optional DGCConfig dgc_configs = 105;
  optional PipelineConfig pipeline_configs = 106;
  optional AsyncConfig a_sync_configs = 107;
  optional LarsConfig lars_configs = 108;
  optional LambConfig lamb_configs = 109;
  optional AdaptiveLocalSGDConfig adaptive_localsgd_configs = 110;
  optional ShardingConfig sharding_configs = 111;
  optional HybridConfig hybrid_configs = 112;
  optional TensorParallelConfig tensor_parallel_configs = 113;
  optional TrainerDescConfig trainer_desc_configs = 114;
  optional TableParameter downpour_table_param = 115;
  optional FsClientParameter fs_client_param = 116;

  // Build/execution strategies and gradient scaling
  optional BuildStrategy build_strategy = 201;
  optional ExecutionStrategy execution_strategy = 202;
  optional GradientScaleConfig gradient_scale_configs = 203;
}

// Description of a distributed job: worker/server counts and addresses,
// program strings, the optimizer name, and the DistributedStrategy in use.
// No field declares an explicit default.
message DistributedJobInfo {
  optional int32 worker_num = 1;
  optional int32 server_num = 2;
  repeated string worker_ips = 3;
  repeated string server_endpoints = 4;
  // NOTE(review): the origin_*/distributed_main strings presumably hold
  // serialized programs -- confirm the encoding with the producer.
  optional string origin_startup = 5;
  optional string origin_main = 6; // without backpropagation and optimization
  optional string distributed_main = 7; // with backpropagation and optimization
  optional string optimizer_name = 8;   // optimizer name
  // Numbered 101, apart from the scalar fields 1-8.
  optional DistributedStrategy strategy = 101;
}