diff --git a/paddle/fluid/operators/optimizers/distributed_fused_lamb_init_op.cc b/paddle/fluid/operators/optimizers/distributed_fused_lamb_init_op.cc
index efec50efa92ea68cb68934bde32e1f56570b0868..95b45934ea6d2e7d5472f7552a86e27773509105 100644
--- a/paddle/fluid/operators/optimizers/distributed_fused_lamb_init_op.cc
+++ b/paddle/fluid/operators/optimizers/distributed_fused_lamb_init_op.cc
@@ -94,6 +94,7 @@ class DistributedFusedLambInitOpMaker
     AddOutput("GradOut", "The output gradient list.").AsDuplicable();
     AddOutput("GlobalScale",
               "The global scale. It is usually the scale factor for AMP.");
+    AddOutput("Step", "The global step which excludes the NaN/Inf step.");
 
     AddAttr<float>("beta1", "The initial value of Beta1Pow.");
     AddAttr<float>("beta2", "The initial value of Beta2Pow.");
diff --git a/paddle/fluid/operators/optimizers/distributed_fused_lamb_init_op.cu b/paddle/fluid/operators/optimizers/distributed_fused_lamb_init_op.cu
index 7d8a7186d58b402e208fc749524d996b351abeef..3688b8067c23143a44612ab5c5d4e9700ef982b9 100644
--- a/paddle/fluid/operators/optimizers/distributed_fused_lamb_init_op.cu
+++ b/paddle/fluid/operators/optimizers/distributed_fused_lamb_init_op.cu
@@ -698,6 +698,10 @@ class DistributedFusedLambInitOpKernel
       TensorFillConstant<float>(dev_ctx, global_scale, {1}, 1.0f);
     }
     VLOG(10) << "Init global scale ends";
+
+    TensorFillConstant<int64_t>(dev_ctx, ctx.Output<framework::Tensor>("Step"),
+                                {1}, static_cast<int64_t>(0));
+
     dev_ctx.Wait();
     VLOG(10) << "Wait for H2D copy";
   }
diff --git a/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cc b/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cc
index 8f7c87912e93aa1bb3178d37afa641047e15a82b..161483c3420fc2a2161f102a615f72df9bf9807b 100644
--- a/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cc
+++ b/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cc
@@ -110,6 +110,7 @@ class DistributedFusedLambOpMaker : public framework::OpProtoAndCheckerMaker {
         .AsDuplicable();
     AddOutput("FoundInf", "Whether there is NaN/Inf");
+    AddOutput("Step", "The global step which excludes the NaN/Inf step.");
 
     AddAttr<float>("beta1", "The initial Beta1Pow value.");
     AddAttr<float>("beta2", "The initial Beta2Pow value.");
diff --git a/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu b/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu
index 5b60f65442b55dc89a845859f153048e89704f70..f445a140f27a306efa71d7fd60a1cd48ec44c700 100644
--- a/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu
+++ b/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu
@@ -381,8 +381,9 @@ static __global__ void UpdateLambMomentAndTrustRatioDivCUDAKernel(
     const T *__restrict__ square_grad_norm_p,
     const T *__restrict__ global_scale, const T *__restrict__ beta1pow_p,
     const T *__restrict__ beta2pow_p, T *__restrict__ mom1_p,
-    T *__restrict__ mom2_p, T *__restrict__ trust_ratio_div_p, bool *found_inf,
-    T weight_decay, int weight_decay_end_numel, T beta1, T beta2, T epsilon,
+    T *__restrict__ mom2_p, T *__restrict__ trust_ratio_div_p,
+    bool *__restrict__ found_inf, int64_t *__restrict__ step, T weight_decay,
+    int weight_decay_end_numel, T beta1, T beta2, T epsilon,
     T max_global_grad_norm, int num, T rescale_grad) {
   T square_grad_norm = *square_grad_norm_p;
   bool need_update_found_inf =
@@ -392,6 +393,7 @@ static __global__ void UpdateLambMomentAndTrustRatioDivCUDAKernel(
     return;
   } else if (need_update_found_inf) {
     *found_inf = false;
+    ++(*step);
   }
 
   T scale = rescale_grad / global_scale[0];
@@ -467,8 +469,8 @@ static void MultiTensorUpdateLambMomentAndTrustRatioDiv(
     const platform::CUDADeviceContext &dev_ctx, const int *offsets, int n,
     const T *param_p, const GradT *grad_p, const T *square_grad_norm_p,
     const T *global_scale, const T *beta1pow_p, const T *beta2pow_p, T *mom1_p,
-    T *mom2_p, T *trust_ratio_div_p, bool *found_inf_p, T weight_decay,
-    int weight_decay_end_idx, T beta1, T beta2, T epsilon,
+    T *mom2_p, T *trust_ratio_div_p, bool *found_inf_p, int64_t *step,
+    T weight_decay, int weight_decay_end_idx, T beta1, T beta2, T epsilon,
     T max_global_grad_norm, T rescale_grad) {
   if (n <= 0) return;
   int numel = offsets[n] - offsets[0];
@@ -496,15 +498,24 @@ static void MultiTensorUpdateLambMomentAndTrustRatioDiv(
   auto stream = dev_ctx.stream();
   auto config = platform::GetGpuLaunchConfig1D(dev_ctx, numel, vec_size);
 
+  if (found_inf_p == nullptr) {
+    PADDLE_ENFORCE_EQ(
+        step, nullptr,
+        platform::errors::InvalidArgument(
+            "Output(Step) cannot be updated twice in one mini-batch."));
+  } else {
+    PADDLE_ENFORCE_NOT_NULL(step, platform::errors::InvalidArgument(
+                                      "Output(Step) cannot be nullptr."));
+  }
-#define PD_LAUNCH_LAMB_MOM_TRUST_RATIO_DIV_KERNEL                          \
-  do {                                                                     \
-    UpdateLambMomentAndTrustRatioDivCUDAKernel<T, GradT, kVecSize><<<      \
-        config.block_per_grid, config.thread_per_block, 0, stream>>>(      \
-        param_p, grad_p, square_grad_norm_p, global_scale, beta1pow_p,     \
-        beta2pow_p, mom1_p, mom2_p, trust_ratio_div_p, found_inf_p,        \
-        weight_decay, weight_decay_end_numel, beta1, beta2, epsilon,       \
-        max_global_grad_norm, numel, rescale_grad);                        \
+#define PD_LAUNCH_LAMB_MOM_TRUST_RATIO_DIV_KERNEL                           \
+  do {                                                                      \
+    UpdateLambMomentAndTrustRatioDivCUDAKernel<T, GradT, kVecSize><<<       \
+        config.block_per_grid, config.thread_per_block, 0, stream>>>(       \
+        param_p, grad_p, square_grad_norm_p, global_scale, beta1pow_p,      \
+        beta2pow_p, mom1_p, mom2_p, trust_ratio_div_p, found_inf_p, step,   \
+        weight_decay, weight_decay_end_numel, beta1, beta2, epsilon,        \
+        max_global_grad_norm, numel, rescale_grad);                         \
   } while (0)
 
   PD_VEC_LAUNCH_KERNEL(vec_size, PD_LAUNCH_LAMB_MOM_TRUST_RATIO_DIV_KERNEL);
@@ -1315,6 +1326,8 @@ class DistributedFusedLambOpKernel
     const auto *fp16_partial_fused_offsets =
         fp16_partial_fused_offsets_t->data<int>();
 
+    auto *step = ctx.Output<framework::Tensor>("Step")->data<int64_t>();
+
     VLOG(1) << "FusedParamOffsets: "
             << FlattenToString(fused_offsets, fused_offsets_t->numel(),
                                fused_offsets_t->place());
@@ -1337,8 +1350,8 @@ class DistributedFusedLambOpKernel
           dev_ctx, fp32_partial_fused_offsets, fp32_local_param_num,
           fp32_param + fp32_offset, fp32_sum_grad, fp32_square_grad_norm,
           global_scale, beta1pow, beta2pow, moment1, moment2, trust_ratio_div,
-          found_inf, weight_decay, fp32_weight_decay_end_idx, beta1, beta2,
-          epsilon, max_global_grad_norm, rescale_grad);
+          found_inf, step, weight_decay, fp32_weight_decay_end_idx, beta1,
+          beta2, epsilon, max_global_grad_norm, rescale_grad);
       VLOG(10) << "Update FP32 Moment and TrustRatioDiv done";
     }
     float *master_param = nullptr;
@@ -1346,13 +1359,14 @@ class DistributedFusedLambOpKernel
       master_param = fp32_param + fp32_numel;
       VLOG(10) << "Update FP16 Moment and TrustRatioDiv starts";
       auto tmp_found_inf = has_fp32_param ? nullptr : found_inf;
+      auto tmp_step = has_fp32_param ? nullptr : step;
       MultiTensorUpdateLambMomentAndTrustRatioDiv(
           dev_ctx, fp16_partial_fused_offsets, fp16_local_param_num,
           master_param + fp16_offset, fp16_sum_grad, fp32_square_grad_norm,
           global_scale, beta1pow, beta2pow,
           moment1 + fp32_numel_each_device, moment2 + fp32_numel_each_device,
-          trust_ratio_div + fp32_numel_each_device, tmp_found_inf, weight_decay,
-          fp16_weight_decay_end_idx, beta1, beta2, epsilon,
+          trust_ratio_div + fp32_numel_each_device, tmp_found_inf, tmp_step,
+          weight_decay, fp16_weight_decay_end_idx, beta1, beta2, epsilon,
           max_global_grad_norm, rescale_grad);
       VLOG(10) << "Update FP16 Moment and TrustRatioDiv done";
     }
diff --git a/python/paddle/incubate/optimizer/distributed_fused_lamb.py b/python/paddle/incubate/optimizer/distributed_fused_lamb.py
index 00a39dfba0f180da785b3a76692dee676a88736f..12a88106a44cda42a29f2942755c81f670432b2e 100644
--- a/python/paddle/incubate/optimizer/distributed_fused_lamb.py
+++ b/python/paddle/incubate/optimizer/distributed_fused_lamb.py
@@ -75,9 +75,18 @@ class DistributedFusedLamb(Optimizer):
             name=unique_name.generate('found_inf'),
             shape=[1],
             dtype=core.VarDesc.VarType.BOOL)
+        self._step = None
 
         self._param_to_master_param = {}
 
+    def _set_step(self, step):
+        self._step = step
+
+    def _get_or_create_step(self):
+        if self._step is None:
+            self._step = self._create_persistable_var('step', dtype='int64')
+        return self._step
+
     def _set_scale(self, scale):
         assert scale is not None
         if not isinstance(scale, Variable):
@@ -189,6 +198,8 @@ class DistributedFusedLamb(Optimizer):
         param_order = self._create_persistable_var('param_order', dtype='int32')
         param_order.is_distributed = True
 
+        step = self._get_or_create_step()
+
         rank = get_rank()
         nranks = get_world_size()
         scale = self._get_or_create_scale()
@@ -234,6 +245,7 @@ class DistributedFusedLamb(Optimizer):
                 'FP16ShardFusedParamOffsets': [fp16_partial_fused_offsets],
                 'FusedParamOffsets': [fused_offsets],
                 'ParamOrder': [param_order],
+                'Step': [step],
             },
             attrs={
                 'alignment': self._alignment,
@@ -290,6 +302,7 @@ class DistributedFusedLamb(Optimizer):
                 'ParamOut': params,
                 'GradOut': grads,
                 'FoundInf': [self._found_inf],
+                'Step': [step],
            },
            attrs={
                'weight_decay': self._weight_decay,
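
Note: the snippet below is a plain-Python sketch of the bookkeeping this patch adds, not Paddle code; the function and variable names (fused_lamb_step, state) are hypothetical. It assumes only the semantics visible in the CUDA kernel above: FoundInf is cleared and Step is incremented only when the squared global gradient norm is finite, so Step counts the parameter updates that were actually applied and excludes the skipped NaN/Inf mini-batches.

import math

def fused_lamb_step(square_grad_norm, state):
    # Mirrors the kernel's early-exit path: a non-finite global gradient norm
    # marks found_inf and skips the whole update, leaving the step untouched.
    if not math.isfinite(square_grad_norm):
        state['found_inf'] = True
        return False
    state['found_inf'] = False
    state['step'] += 1  # only applied updates are counted
    # ... moment / trust-ratio / parameter updates would happen here ...
    return True

state = {'step': 0, 'found_inf': False}
for norm in [1.0, float('inf'), 2.5, float('nan'), 0.3]:
    fused_lamb_step(norm, state)
print(state['step'])  # prints 3: the inf and nan mini-batches are not counted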