// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
// Copyright (c) 2021 NVIDIA Corporation. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

syntax = "proto2";
package paddle.fleet;

// Execution mode; the type of DistributedStrategy.mode.
enum Mode {
  COLLECTIVE = 1;
  PS = 2;
  PIPELINE = 3;
  // Supports XPU and GPU computing servers.
  HETER = 4;
}

// Recompute options; carried in DistributedStrategy.recompute_configs.
message RecomputeConfig {
  repeated string checkpoints = 1;
  optional bool enable_offload = 2 [default = false];
  repeated int32 checkpoint_shape = 3;
  // Incubating: for auto parallel.
  optional bool enable_tuning = 4 [default = false];
}
// Sharding options; carried in DistributedStrategy.sharding_configs.
message ShardingConfig {
  optional string sharding_segment_strategy = 1
      [default = 'segment_broadcast_MB'];
  optional float segment_broadcast_MB = 2 [default = 32.0];
  repeated string segment_anchors = 3;
  optional int32 sharding_degree = 4 [default = 8];
  optional int32 mp_degree = 5 [default = 1];
  optional int32 dp_degree = 6 [default = 1];
  optional bool hybrid_dp = 7 [default = false];
  optional int32 gradient_merge_acc_step = 8 [default = 1];
  optional bool optimize_offload = 9 [default = false];
  optional bool pp_allreduce_in_optimize = 10 [default = false];
  optional int32 pp_degree = 11 [default = 1];
  optional bool optimize_cast = 12 [default = false];
  // Optimizer sharding. Temporary plan; may be deprecated.
  optional bool _dp_as_optimizer_sharding = 13 [default = false];
  optional int32 stage = 14 [default = 1];
  // Incubating: for auto parallel.
  optional bool enable_tuning = 15 [default = false];
}

// For dygraph. Carried in HybridConfig.mp_configs.
message MpConfig {
  optional bool sync_param = 1 [default = true];
  optional bool sync_grad = 2 [default = false];
  optional bool sync_moment = 3 [default = false];
  optional string sync_mode = 4 [default = 'broadcast'];
}

// Carried in HybridConfig.pp_configs. Every switch is off by default.
message PpConfig {
  optional bool dp_comm_overlap = 1 [default = false];
  optional bool delay_scale_loss = 2 [default = false];
  optional bool enable_timer = 3 [default = false];
  optional bool sharding_comm_overlap = 4 [default = false];
  optional bool profiling = 5 [default = false];
}

// Carried in HybridConfig.sharding_configs.
message DygraphShardingConfig {
  // Off unless explicitly set.
  optional bool tensor_fusion = 1 [default = false];
}

// Parallelism degrees plus nested per-dimension configs; carried in
// DistributedStrategy.hybrid_configs.
message HybridConfig {
  // NOTE(review): the -1 default presumably means "not set / derive
  // automatically" -- confirm against the consumer.
  optional int32 dp_degree = 1 [default = -1];
  optional int32 mp_degree = 2 [default = 1];
  optional int32 pp_degree = 3 [default = 1];
  optional int32 sharding_degree = 4 [default = 1];
  optional MpConfig mp_configs = 5;
  optional PpConfig pp_configs = 6;
  optional DygraphShardingConfig sharding_configs = 7;
}

// AMP options; carried in DistributedStrategy.amp_configs.
message AMPConfig {
  optional float init_loss_scaling = 1 [default = 32768.0];
  optional int32 incr_every_n_steps = 2 [default = 1000];
  optional int32 decr_every_n_nan_or_inf = 3 [default = 2];
  optional float incr_ratio = 4 [default = 2.0];
  optional float decr_ratio = 5 [default = 0.8];
  optional bool use_dynamic_loss_scaling = 6 [default = true];
  repeated string custom_white_list = 7;
  repeated string custom_black_list = 8;
  repeated string custom_black_varnames = 9;
  optional bool use_pure_fp16 = 10 [default = false];
  optional bool use_fp16_guard = 11 [default = true];
  // Effective for auto parallel only.
  optional bool use_optimizer_fp16 = 12 [default = false];
}
// Carried in DistributedStrategy.localsgd_configs.
message LocalSGDConfig {
  optional int32 k_steps = 1 [default = 1];
  optional int32 begin_step = 2 [default = 1];
}
// Carried in DistributedStrategy.adaptive_localsgd_configs.
message AdaptiveLocalSGDConfig {
  optional int32 init_k_steps = 1 [default = 1];
  optional int32 begin_step = 2 [default = 1];
}

// Carried in DistributedStrategy.gradient_merge_configs.
message GradientMergeConfig {
  optional int32 k_steps = 1 [default = 1];
  // On by default.
  optional bool avg = 2 [default = true];
}

// Carried in DistributedStrategy.dgc_configs.
message DGCConfig {
  optional int32 rampup_begin_step = 1 [default = 0];
  optional int32 rampup_step = 2 [default = 1];
  // No default; empty unless populated by the caller.
  repeated float sparsity = 3;
}

// Carried in DistributedStrategy.lars_configs.
message LarsConfig {
  optional float lars_coeff = 1 [default = 0.001];
  optional float lars_weight_decay = 2 [default = 0.0005];
  optional float epsilon = 3 [default = 0.0];
  repeated string exclude_from_weight_decay = 4;
}

// Carried in DistributedStrategy.lamb_configs.
message LambConfig {
  optional float lamb_weight_decay = 1 [default = 0.01];
  repeated string exclude_from_weight_decay = 2;
}

// Build-time switches; carried in DistributedStrategy.build_strategy.
// Most switches default to false; the exceptions are
// enable_backward_optimizer_op_deps and fuse_bn_add_act_ops (true).
message BuildStrategy {
  optional bool enable_sequential_execution = 1 [default = false];
  optional bool fuse_elewise_add_act_ops = 2 [default = false];
  optional bool fuse_bn_act_ops = 3 [default = false];
  optional bool fuse_relu_depthwise_conv = 4 [default = false];
  optional bool fuse_broadcast_ops = 5 [default = false];
  optional bool fuse_all_optimizer_ops = 6 [default = false];
  optional bool enable_inplace = 7 [default = false];
  optional bool enable_backward_optimizer_op_deps = 8 [default = true];
  optional bool cache_runtime_context = 9 [default = false];
  optional bool fuse_bn_add_act_ops = 10 [default = true];
  optional bool enable_auto_fusion = 11 [default = false];
  optional bool enable_addto = 12 [default = false];
  optional bool fix_op_run_order = 13 [default = false];
  optional bool allow_cuda_graph_capture = 14 [default = false];
  optional int32 reduce_strategy = 15 [default = 0];
  optional bool fuse_gemm_epilogue = 16 [default = false];
  // No explicit default.
  optional string debug_graphviz_path = 17;
  optional bool fused_attention = 18 [default = false];
  optional bool fused_feedforward = 19 [default = false];
}
// Carried in DistributedStrategy.execution_strategy.
message ExecutionStrategy {
  optional int32 num_threads = 1 [default = 1];
  optional int32 num_iteration_per_drop_scope = 2 [default = 10];
  optional int32 num_iteration_per_run = 3 [default = 1];
  optional bool use_thread_barrier = 4 [default = false];
}

// Gradient scaling options; carried in
// DistributedStrategy.gradient_scale_configs.
message GradientScaleConfig {
  // One of 'avg', 'sum' or 'customized'.
  // 'avg': loss@grad is divided by the number of devices, i.e. the
  // gradient is accumulated and averaged among multiple devices.
  // 'sum': the gradient is accumulated among multiple devices.
  optional string scale_strategy = 1 [default = 'avg'];
  // Determines where the averaging happens relative to grad merge.
  // If false, loss@Grad is averaged before grad merge. Otherwise grad
  // merge is done first and the merged grad is averaged afterwards.
  optional bool scale_gradient = 2 [default = false];
}

// Carried in DistributedStrategy.a_sync_configs.
message AsyncConfig {
  // NOTE(review): meaning of the -1 default is not visible here --
  // confirm against the consumer.
  optional int32 k_steps = 1 [default = -1];
  optional int32 max_merge_var_num = 2 [default = 1];
  optional int32 send_queue_size = 3 [default = 16];
  optional bool independent_recv_thread = 4 [default = false];
  optional int32 min_send_grad_num_before_recv = 5 [default = 1];
  optional int32 thread_pool_size = 6 [default = 1];
  optional int32 send_wait_times = 7 [default = 1];
  optional bool runtime_split_send_recv = 8 [default = false];
  optional bool launch_barrier = 9 [default = true];
  optional string heter_worker_device_guard = 10 [default = 'cpu'];
  optional int32 lr_decay_steps = 11 [default = 10];
  // Declared as int32 (not bool); defaults to 0.
  optional int32 use_ps_gpu = 12 [default = 0];
}

// Carried in DistributedStrategy.trainer_desc_configs. No field has an
// explicit default.
message TrainerDescConfig {
  optional string dump_fields_path = 1;
  repeated string dump_fields = 2;
  repeated string dump_param = 3;
  repeated string stat_var_names = 4;
  optional string trainer = 5;
  optional string device_worker = 6;
  repeated string local_sparse = 7;
  repeated string remote_sparse = 8;
}

// Carried in DistributedStrategy.pipeline_configs.
message PipelineConfig {
  optional int32 micro_batch_size = 1 [default = 1];
  optional int32 accumulate_steps = 2 [default = 1];
  optional string schedule_mode = 3 [default = '1F1B'];
  optional bool p2p_cache_shape = 4 [default = true];
  optional bool enable_partial_send_recv = 5 [default = true];
}
// Carried in DistributedStrategy.tensor_parallel_configs.
message TensorParallelConfig {
  optional int32 tensor_parallel_degree = 1 [default = 1];
  // NOTE(review): -1 presumably means "no fixed seed" -- confirm.
  optional int32 tensor_init_seed = 2 [default = -1];
}

// Carried in DistributedStrategy.qat_configs.
message QatConfig {
  optional bool channel_wise_abs_max = 1 [default = true];
  // Weight and activation bit widths both default to 8.
  optional int32 weight_bits = 2 [default = 8];
  optional int32 activation_bits = 3 [default = 8];
  repeated string not_quant_pattern = 4;
  // No explicit default.
  optional string algo = 5;
}

// Table kind; the type of TableParameter.type.
enum TableType {
  PS_SPARSE_TABLE = 0;
  PS_DENSE_TABLE = 1;
}

// One table description; repeated in
// DistributedStrategy.downpour_table_param.
// Field numbers 8 and 9 are not used by this message.
message TableParameter {
  optional uint64 table_id = 1;
  optional string table_name = 2;
  optional string table_class = 3;
  optional uint64 shard_num = 4 [default = 1000];
  optional TableType type = 5;
  optional TableAccessorParameter accessor = 6;
  optional bool compress_in_save = 7 [default = false];
  // For cache model.
  optional bool enable_sparse_table_cache = 10 [default = true];
  optional double sparse_table_cache_rate = 11 [default = 0.00055];
  optional uint32 sparse_table_cache_file_num = 12 [default = 16];
  // For patch model.
  optional bool enable_revert = 13 [default = false];
  optional float shard_merge_rate = 14 [default = 1.0];
}

// Accessor settings for a table; the type of TableParameter.accessor.
// Field numbers 2, 3 and 9 are not used by this message.
message TableAccessorParameter {
  optional string accessor_class = 1;
  // Field size of one value.
  optional uint32 fea_dim = 4 [default = 11];
  // Embedx feature size.
  optional uint32 embedx_dim = 5 [default = 8];
  // Embedx feature create threshold.
  optional uint32 embedx_threshold = 6 [default = 10];
  optional CtrAccessorParameter ctr_accessor_param = 7;
  repeated TableAccessorSaveParameter table_accessor_save_param = 8;
  optional SGDParameter embed_sgd_param = 10;
  optional SGDParameter embedx_sgd_param = 11;
  optional GraphSGDParameter graph_sgd_param = 12;
}

// The type of TableAccessorParameter.graph_sgd_param.
message GraphSGDParameter {
  optional uint32 nodeid_slot = 1 [default = 9008];
  optional float feature_learning_rate = 2 [default = 0.05];
}

// SGD rule selection with one optional sub-message per rule family; the
// type of TableAccessorParameter.embed_sgd_param / embedx_sgd_param.
// NOTE(review): `name` presumably selects which sub-message applies --
// confirm against the consumer.
message SGDParameter {
  optional string name = 1;
  optional SparseNaiveSGDRuleParameter naive = 2;
  optional SparseAdagradSGDRuleParameter adagrad = 3;
  optional SparseAdamSGDParameter adam = 4;
}

// Rule: SparseNaiveSGDRule. The type of SGDParameter.naive.
message SparseNaiveSGDRuleParameter {
  optional double learning_rate = 1 [default = 0.05];
  optional double initial_range = 2 [default = 0.0001];
  repeated float weight_bounds = 3;
}

// Rule(s): SparseAdaGradSGDRule | StdAdaGradSGDRule.
// The type of SGDParameter.adagrad.
message SparseAdagradSGDRuleParameter {
  optional double learning_rate = 1 [default = 0.05];
  optional double initial_g2sum = 2 [default = 3.0];
  optional double initial_range = 3 [default = 0.0001];
  repeated float weight_bounds = 4;
}

// Rule(s): SparseAdamSGDRule | SparseSharedAdamSGDRule.
// The type of SGDParameter.adam.
message SparseAdamSGDParameter {
  optional double learning_rate = 1 [default = 0.001];
  optional double initial_range = 2 [default = 0.0001];
  optional double beta1_decay_rate = 3 [default = 0.9];
  optional double beta2_decay_rate = 4 [default = 0.999];
  optional double ada_epsilon = 5 [default = 1e-08];
  repeated float weight_bounds = 6;
}

// The type of TableAccessorParameter.ctr_accessor_param.
message CtrAccessorParameter {
  // Used to calculate show_click_score.
  optional float nonclk_coeff = 1 [default = 0.1];
  // Used to calculate show_click_score.
  optional float click_coeff = 2 [default = 1];
  // If show_click_score > base_threshold, this feature can be saved.
  optional float base_threshold = 3 [default = 1.5];
  // If delta_score > delta_threshold, this feature can be saved.
  optional float delta_threshold = 4 [default = 0.25];
  // If unseen_day < delta_keep_days, this feature can be saved.
  optional float delta_keep_days = 5 [default = 16];
  // show/click is updated to show/click * show_click_decay_rate after
  // a day.
  optional float show_click_decay_rate = 6 [default = 0.98];
  // Threshold to shrink a feasign.
  optional float delete_threshold = 7 [default = 0.8];
  optional float delete_after_unseen_days = 8 [default = 30];
  optional int32 ssd_unseenday_threshold = 9 [default = 1];
  optional bool show_scale = 10 [default = true];
  repeated float load_filter_slots = 11;
}

// Repeated in TableAccessorParameter.table_accessor_save_param.
message TableAccessorSaveParameter {
  optional uint32 param = 1;
  optional string converter = 2;
  optional string deconverter = 3;
}

// The type of DistributedStrategy.fs_client_param. All fields are
// optional strings with no explicit default.
message FsClientParameter {
  optional string uri = 1;
  optional string user = 2;
  // NOTE(review): stored as a plain string in the message.
  optional string passwd = 3;
  optional string hadoop_bin = 4;
}

// Top-level strategy message. Fields are grouped by number range:
//   1-42    mode, feature switches and scalar knobs
//   101-117 per-feature configuration messages
//   201-203 build, execution and gradient-scale settings
message DistributedStrategy {
  // Mode, feature switches and scalar knobs.
  optional Mode mode = 1 [default = COLLECTIVE];
  optional bool amp = 2 [default = false];
  optional bool recompute = 3 [default = false];
  optional bool localsgd = 4 [default = false];
  optional bool dgc = 5 [default = false];
  optional bool gradient_merge = 6 [default = false];
  optional bool lars = 7 [default = false];
  optional bool lamb = 8 [default = false];
  optional bool pipeline = 9 [default = false];
  optional bool elastic = 10 [default = false];
  optional bool auto = 11 [default = false];
  optional bool a_sync = 12 [default = true];
  optional bool sync_nccl_allreduce = 13 [default = true];
  optional int32 nccl_comm_num = 14 [default = 1];
  optional bool use_hierarchical_allreduce = 15 [default = false];
  optional int32 hierarchical_allreduce_inter_nranks = 16 [default = 1];
  optional bool sync_batch_norm = 17 [default = false];
  optional bool fuse_all_reduce_ops = 18 [default = true];
  optional int32 fuse_grad_size_in_MB = 19 [default = 32];
  optional float fuse_grad_size_in_TFLOPS = 20 [default = 50];
  optional bool cudnn_exhaustive_search = 21 [default = false];
  optional int32 conv_workspace_size_limit = 22 [default = 512];
  optional bool cudnn_batchnorm_spatial_persistent = 23 [default = false];
  optional bool adaptive_localsgd = 24 [default = false];
  optional bool fp16_allreduce = 25 [default = false];
  optional bool sharding = 26 [default = false];
  optional float last_comm_group_size_MB = 27 [default = 1];
  optional bool find_unused_parameters = 28 [default = false];
  optional bool tensor_parallel = 29 [default = false];
  optional bool without_graph_optimization = 30 [default = true];
  optional int32 fuse_grad_size_in_num = 31 [default = 8];
  optional bool calc_comm_same_stream = 32 [default = false];
  optional bool asp = 33 [default = false];
  optional bool fuse_grad_merge = 34 [default = false];
  optional bool semi_auto = 35 [default = false];
  optional bool adam_d2sum = 36 [default = false];
  optional bool auto_search = 37 [default = false];
  optional bool heter_ccl_mode = 38 [default = false];
  optional bool is_fl_ps_mode = 39 [default = false];
  optional bool with_coordinator = 40 [default = false];
  optional bool qat = 41 [default = false];
  optional bool split_data = 42 [default = true];

  // Per-feature configuration messages.
  optional RecomputeConfig recompute_configs = 101;
  optional AMPConfig amp_configs = 102;
  optional LocalSGDConfig localsgd_configs = 103;
  optional GradientMergeConfig gradient_merge_configs = 104;
  optional DGCConfig dgc_configs = 105;
  optional PipelineConfig pipeline_configs = 106;
  optional AsyncConfig a_sync_configs = 107;
  optional LarsConfig lars_configs = 108;
  optional LambConfig lamb_configs = 109;
  optional AdaptiveLocalSGDConfig adaptive_localsgd_configs = 110;
  optional ShardingConfig sharding_configs = 111;
  optional HybridConfig hybrid_configs = 112;
  optional TensorParallelConfig tensor_parallel_configs = 113;
  optional TrainerDescConfig trainer_desc_configs = 114;
  repeated TableParameter downpour_table_param = 115;
  optional FsClientParameter fs_client_param = 116;
  optional QatConfig qat_configs = 117;

  // Build, execution and gradient-scale settings.
  optional BuildStrategy build_strategy = 201;
  optional ExecutionStrategy execution_strategy = 202;
  optional GradientScaleConfig gradient_scale_configs = 203;
}

// Job-level information: worker/server counts and addresses, several
// string payloads, and the DistributedStrategy in use.
message DistributedJobInfo {
  optional int32 worker_num = 1;
  optional int32 server_num = 2;
  repeated string worker_ips = 3;
  repeated string server_endpoints = 4;
  optional string origin_startup = 5;
  // Without backpropagation and optimization.
  optional string origin_main = 6;
  // With backpropagation and optimization.
  optional string distributed_main = 7;
  // Optimizer name.
  optional string optimizer_name = 8;
  optional DistributedStrategy strategy = 101;
}