diff --git a/paddle/fluid/operators/dgc_clip_by_norm_op.h b/paddle/fluid/operators/dgc_clip_by_norm_op.h
index bd22d16f7a21877af4e78c30f7e0985c64b543f2..197bf59b2a470e1f6e4e31c6706d1e3f8e73fbbc 100644
--- a/paddle/fluid/operators/dgc_clip_by_norm_op.h
+++ b/paddle/fluid/operators/dgc_clip_by_norm_op.h
@@ -24,18 +24,21 @@ class DGCClipByNormKernel : public ClipByNormKernel<DeviceContext, T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     auto rampup_begin_step = context.Attr<float>("rampup_begin_step");
-    if (static_cast<int>(rampup_begin_step) >= 0) {
-      auto current_step_tensor =
-          context.Input<framework::Tensor>("current_step");
-      auto* current_step = current_step_tensor->data<T>();
-
-      if (static_cast<int>(*current_step) <
-          static_cast<int>(rampup_begin_step)) {
-        VLOG(10) << "current_step:" << *current_step
-                 << " < rampup_begin_step:" << rampup_begin_step
-                 << " so does't use dgc_clip_by_norm";
-        return;
-      }
+    if (static_cast<int>(rampup_begin_step) < 0) {
+      return;
+    }
+
+    auto current_step_tensor = context.Input<framework::Tensor>("current_step");
+    auto* current_step = current_step_tensor->data<T>();
+
+    VLOG(10) << "current_step:" << *current_step
+             << ", rampup_begin_step:" << rampup_begin_step;
+
+    if (static_cast<int>(*current_step) < static_cast<int>(rampup_begin_step)) {
+      VLOG(10) << "current_step:" << *current_step
+               << " < rampup_begin_step:" << rampup_begin_step
+               << " so doesn't use dgc_clip_by_norm";
+      return;
     }
 
     return ClipByNormKernel<DeviceContext, T>::Compute(context);
diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py
index 79accabe87869c832b7467acbaf70d11cbca8a96..7e6e37116fe23f26eb14dd0573dbe031aec98dd8 100644
--- a/python/paddle/fluid/optimizer.py
+++ b/python/paddle/fluid/optimizer.py
@@ -832,7 +832,7 @@ class DGCMomentumOptimizer(MomentumOptimizer):
             type=x.type, name=name, dtype=x.dtype, persistable=False)
 
         helper.append_op(
-            type="clip_by_norm",
+            type="dgc_clip_by_norm",
             inputs={"X": x,
                     "current_step": self._global_step_var},
             attrs={
@@ -845,7 +845,7 @@ class DGCMomentumOptimizer(MomentumOptimizer):
     def _append_clip_norm(self, grad_var, clip_norm):
         with grad_var.block.program._backward_role_guard():
             return self._clip_by_norm(
-                x=grad_var, max_norm=clip_norm, name=grad_var.name + "@DGC")
+                x=grad_var, max_norm=clip_norm, name=grad_var.name)
 
     def _dgc_op(self, param_var, clip_var, grad_var, u_var, v_var, k_var,
                 encoded_var):
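
Note: the kernel hunk above restructures the early returns so that a negative rampup_begin_step now skips dgc_clip_by_norm entirely, instead of falling through to the norm clip as before; clipping also remains disabled until current_step reaches rampup_begin_step. Below is a minimal standalone Python sketch of that gating rule, not Paddle code; the helper name dgc_clip_applies is made up for illustration.

def dgc_clip_applies(rampup_begin_step, current_step):
    """Mirror the control flow of the patched DGCClipByNormKernel::Compute."""
    if int(rampup_begin_step) < 0:
        # Rampup disabled: skip dgc_clip_by_norm altogether.
        return False
    if int(current_step) < int(rampup_begin_step):
        # Still before the rampup window: pass the gradient through unclipped.
        return False
    # From rampup_begin_step onward, fall through to the regular norm clip.
    return True

assert not dgc_clip_applies(rampup_begin_step=-1.0, current_step=100.0)
assert not dgc_clip_applies(rampup_begin_step=10.0, current_step=3.0)
assert dgc_clip_applies(rampup_begin_step=10.0, current_step=10.0)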