diff --git a/paddle/fluid/imperative/layer.cc b/paddle/fluid/imperative/layer.cc
index 70359dc3fd25bf2fb55f7c4f6e9f7974e3525028..a4af3117d3e32ea8db37881bef9c4423ba0173ca 100644
--- a/paddle/fluid/imperative/layer.cc
+++ b/paddle/fluid/imperative/layer.cc
@@ -187,6 +187,7 @@ size_t VarBase::GradOpNum() const {
 }
 
 void VarBase::ClearGradient() {
+  VLOG(4) << "ClearGradient " << Name();
   if (grad_var_) {
     if (grad_var_->Var().IsType<framework::SelectedRows>()) {
       auto* grad_t =
diff --git a/python/paddle/amp/grad_scaler.py b/python/paddle/amp/grad_scaler.py
index 64b34ce8345635711753532d6081e414844fa3fc..72a67a92c495863aba62bdaa93811e59780ed846 100644
--- a/python/paddle/amp/grad_scaler.py
+++ b/python/paddle/amp/grad_scaler.py
@@ -62,6 +62,7 @@ class GradScaler(AmpScaler):
             scaled = scaler.scale(loss)  # scale the loss
             scaled.backward()            # do backward
             scaler.minimize(optimizer, scaled)  # update parameters
+            optimizer.clear_grad()
     """
 
     def __init__(self,
@@ -105,6 +106,7 @@ class GradScaler(AmpScaler):
                 scaled = scaler.scale(loss)  # scale the loss
                 scaled.backward()            # do backward
                 scaler.minimize(optimizer, scaled)  # update parameters
+                optimizer.clear_grad()
         """
         return super(GradScaler, self).scale(var)
 
@@ -140,5 +142,6 @@ class GradScaler(AmpScaler):
                 scaled = scaler.scale(loss)  # scale the loss
                 scaled.backward()            # do backward
                 scaler.minimize(optimizer, scaled)  # update parameters
+                optimizer.clear_grad()
         """
         return super(GradScaler, self).minimize(optimizer, *args, **kwargs)
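
For reviewers who want to try the amended docstring example end to end, below is a minimal sketch of the training step the updated examples describe (scale, backward, minimize, then clear the gradients). The model, optimizer, and input shape are illustrative assumptions and are not part of this patch; only the scaler calls and the added optimizer.clear_grad() come from the diff.

    # Illustrative sketch only (not part of the patch): the layer, optimizer,
    # and input shape are assumptions; the scaler workflow mirrors the docstring.
    import paddle

    model = paddle.nn.Linear(10, 10)                       # hypothetical model
    optimizer = paddle.optimizer.SGD(
        learning_rate=0.01, parameters=model.parameters())
    scaler = paddle.amp.GradScaler(init_loss_scaling=1024)

    data = paddle.rand([4, 10])                            # hypothetical input
    with paddle.amp.auto_cast():
        loss = paddle.mean(model(data))

    scaled = scaler.scale(loss)          # scale the loss
    scaled.backward()                    # do backward
    scaler.minimize(optimizer, scaled)   # unscale gradients and update parameters
    optimizer.clear_grad()               # the step this change adds to the examples

Calling optimizer.clear_grad() after minimize resets the accumulated gradients so the next iteration does not reuse them, which is why the docstring examples add it; the new VLOG(4) line in VarBase::ClearGradient makes that clearing visible in verbose logs.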