diff --git a/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cc b/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cc
index 0159e250d317e0c97f6ee908290e0ccfb9626de7..0f9bcc4c2d977978132d8030c4a3753217efd8f3 100644
--- a/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cc
+++ b/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cc
@@ -141,6 +141,9 @@ class DistributedFusedLambOpMaker : public framework::OpProtoAndCheckerMaker {
         "NCCL communication data. If it is false, it would be less accurate "
         "and be less NCCL communication data.")
         .SetDefault(true);
+    AddAttr<bool>("use_master_acc_grad",
+                  "Whether to use master gradient when acc_steps > 1.")
+        .SetDefault(true);
     AddAttr<bool>("is_grad_scaled_by_nranks",
                   "Whether the input gradient has been scaled by nranks.")
         .SetDefault(true);
diff --git a/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu b/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu
index eb354ef6d7576c0ce6ab64d9e477b0d04c184f05..e7f6223968f4376ac8dbd3e1b6486eb2afe9981c 100644
--- a/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu
+++ b/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu
@@ -1193,7 +1193,9 @@ class DistributedFusedLambOpKernel
 
       platform::float16 *fp16_acc_grad = nullptr;
       float *master_acc_grad = nullptr;
+      bool use_master_acc_grad = false;
       if (has_fp16_param) {
+        use_master_acc_grad = ctx.Attr<bool>("use_master_acc_grad");
         auto *fp16_acc_grad_t =
             ctx.Output<framework::Tensor>("FP16AccFusedGrad");
         PADDLE_ENFORCE_NOT_NULL(
@@ -1201,13 +1203,18 @@ class DistributedFusedLambOpKernel
             "Output(FP16AccFusedGrad) cannot be nullptr "
             "when Attr(acc_steps) > 1."));
         if (!fp16_acc_grad_t->IsInitialized()) {
-          fp16_acc_grad_t->Resize({static_cast<int64_t>(3 * fp16_numel)});
+          auto acc_grad_size =
+              use_master_acc_grad ? (3 * fp16_numel) : fp16_numel;
+          fp16_acc_grad_t->Resize({static_cast<int64_t>(acc_grad_size)});
           fp16_acc_grad =
               fp16_acc_grad_t->mutable_data<platform::float16>(place);
         } else {
           fp16_acc_grad = fp16_acc_grad_t->data<platform::float16>();
         }
-        master_acc_grad = reinterpret_cast<float *>(fp16_acc_grad + fp16_numel);
+        if (use_master_acc_grad) {
+          master_acc_grad =
+              reinterpret_cast<float *>(fp16_acc_grad + fp16_numel);
+        }
       }
 
       // Inplace addto
@@ -1222,8 +1229,8 @@ class DistributedFusedLambOpKernel
       }
 
       if (has_fp16_param) {
-        if (acc_steps == 2) {
-          if (rounded_step == 0) {
+        if (acc_steps == 2 || !use_master_acc_grad) {
+          if (rounded_step != 1) {
             LaunchElementwiseAddWithCastKernel(dev_ctx, fp16_acc_grad,
                                                fp16_grad, fp16_acc_grad,
                                                fp16_numel, stream);
diff --git a/python/paddle/fluid/tests/unittests/distributed_fused_lamb_test_base.py b/python/paddle/fluid/tests/unittests/distributed_fused_lamb_test_base.py
index ee2b180586dd2a5d73673eb1568a6032fc162db1..611d3d2891bac35d6c5f86445ba3191ba51d884e 100644
--- a/python/paddle/fluid/tests/unittests/distributed_fused_lamb_test_base.py
+++ b/python/paddle/fluid/tests/unittests/distributed_fused_lamb_test_base.py
@@ -162,6 +162,7 @@ def run_model(use_distributed_lamb, use_fp16, use_master_param_norm, **kwargs):
             kwargs = dict(kwargs)
             kwargs.pop('clip_after_allreduce', None)
             kwargs.pop('alignment', None)
+            kwargs.pop('use_master_acc_grad', None)
            base_clip = grad_clip if grad_clip is not None else IdentityGradClip(
            )
            kwargs['grad_clip'] = GradClipDecorator(base_clip,
@@ -271,6 +272,7 @@ class TestDistributedFusedLamb(unittest.TestCase):
             distutils.util.strtobool(os.getenv('CLIP_AFTER_ALLREDUCE', 'True')))
         max_global_norm = float(os.getenv('MAX_GLOBAL_NORM', -1.0))
         gm_steps = int(os.getenv('GRADIENT_MERGE_STEPS', 1))
+        use_master_acc_grad = bool(int(os.getenv('USE_MASTER_ACC_GRAD', '1')))
         print('clip_after_allreduce = {}, max_global_norm = {}'.format(
             clip_after_allreduce, max_global_norm))
         return {
@@ -281,9 +283,14 @@ class TestDistributedFusedLamb(unittest.TestCase):
             'grad_clip':
             paddle.nn.ClipGradByGlobalNorm(max_global_norm)
             if max_global_norm > 0 else None,
+            'use_master_acc_grad':
+            use_master_acc_grad,
         }
 
-    def run_main(self, use_fp16, use_master_param_norm=True):
+    def run_main(self,
+                 use_fp16,
+                 use_master_param_norm=True,
+                 use_master_acc_grad=True):
         if not paddle.is_compiled_with_cuda():
             return
 
diff --git a/python/paddle/fluid/tests/unittests/test_distributed_fused_lamb_op_with_clip.py b/python/paddle/fluid/tests/unittests/test_distributed_fused_lamb_op_with_clip.py
index 324da95f37d80e3a4acab56aab148e91bdf6a872..b2c2b6e31a5f2a6d42660e5235423d175b489142 100644
--- a/python/paddle/fluid/tests/unittests/test_distributed_fused_lamb_op_with_clip.py
+++ b/python/paddle/fluid/tests/unittests/test_distributed_fused_lamb_op_with_clip.py
@@ -36,7 +36,8 @@ def remove_file_if_exists(file_name):
 
 def run_test(clip_after_allreduce=True,
              max_global_norm=-1.0,
-             gradient_merge_steps=1):
+             gradient_merge_steps=1,
+             use_master_acc_grad=True):
     if not paddle.is_compiled_with_cuda():
         return
     if os.name == 'nt':
@@ -58,6 +59,7 @@ def run_test(clip_after_allreduce=True,
     os.environ['CLIP_AFTER_ALLREDUCE'] = str(clip_after_allreduce)
     os.environ['MAX_GLOBAL_NORM'] = str(max_global_norm)
     os.environ['GRADIENT_MERGE_STEPS'] = str(gradient_merge_steps)
+    os.environ['USE_MASTER_ACC_GRAD'] = str(1 if use_master_acc_grad else 0)
 
     touch_file_env = 'SUCCESS_TOUCH_FILE'
     touch_file_name = 'distributed_fused_lamb_touch_file_{}'.format(os.getpid())
diff --git a/python/paddle/fluid/tests/unittests/test_distributed_fused_lamb_op_with_gradient_merge.py b/python/paddle/fluid/tests/unittests/test_distributed_fused_lamb_op_with_gradient_merge.py
index c2089b1d97db6d39554f5256b8ff419ebe3cc204..01ca09916a1e65755d1c16ee2a9b6a941a54a5d9 100644
--- a/python/paddle/fluid/tests/unittests/test_distributed_fused_lamb_op_with_gradient_merge.py
+++ b/python/paddle/fluid/tests/unittests/test_distributed_fused_lamb_op_with_gradient_merge.py
@@ -23,6 +23,12 @@ class TestDistributedFusedLambGradientMerge(unittest.TestCase):
                  max_global_norm=-1.0,
                  gradient_merge_steps=2)
 
+    def test_gm_with_fp16_acc_grad(self):
+        run_test(clip_after_allreduce=True,
+                 max_global_norm=-1.0,
+                 gradient_merge_steps=2,
+                 use_master_acc_grad=False)
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/incubate/optimizer/distributed_fused_lamb.py b/python/paddle/incubate/optimizer/distributed_fused_lamb.py
index 4fddaff7ec9591d7325b4a443ab5d49e7eb24042..3029c3a294a00d3cd9863b7368368d00bd0a7e05 100644
--- a/python/paddle/incubate/optimizer/distributed_fused_lamb.py
+++ b/python/paddle/incubate/optimizer/distributed_fused_lamb.py
@@ -40,6 +40,7 @@ class DistributedFusedLamb(Optimizer):
                  alignment=128,
                  use_master_param_norm=True,
                  gradient_accumulation_steps=1,
+                 use_master_acc_grad=True,
                  name=None):
         assert not framework._non_static_mode(
         ), "DistributedFusedLamb does not support dygraph mode"
@@ -67,6 +68,7 @@ class DistributedFusedLamb(Optimizer):
         self._ring_id = 0
         self._use_master_param_norm = use_master_param_norm
         self._gradient_accumulation_steps = gradient_accumulation_steps
+        self._use_master_acc_grad = use_master_acc_grad
         assert self._gradient_accumulation_steps >= 1
 
         self.helper = LayerHelper('distributed_fused_lamb')
@@ -353,5 +355,6 @@ class DistributedFusedLamb(Optimizer):
                 'use_master_param_norm': self._use_master_param_norm,
                 'is_grad_scaled_by_nranks': self._is_grad_scaled_by_nranks,
                 'acc_steps': self._gradient_accumulation_steps,
+                'use_master_acc_grad': self._use_master_acc_grad,
             })
         return [lamb_op]
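Usage sketch (not part of the patch, for illustration only). The new use_master_acc_grad flag
only matters when gradient_accumulation_steps (acc_steps) > 1 and FP16 parameters exist:
setting it to False accumulates gradients directly in FP16 instead of keeping an extra FP32
master accumulator, so the FP16AccFusedGrad buffer shrinks from 3 * fp16_numel to fp16_numel.
Only the keyword arguments shown in the patch are taken from the diff; the model, the
learning_rate value, and the direct module import path are assumptions, and actually running
the program requires the multi-GPU distributed launch used by the unit tests.

import paddle
from paddle.incubate.optimizer.distributed_fused_lamb import DistributedFusedLamb

paddle.enable_static()  # the optimizer asserts that dygraph mode is disabled

# Hypothetical static-graph program; any model would do.
main_prog, startup_prog = paddle.static.Program(), paddle.static.Program()
with paddle.static.program_guard(main_prog, startup_prog):
    x = paddle.static.data(name='x', shape=[None, 16], dtype='float32')
    loss = paddle.mean(paddle.static.nn.fc(x, size=1))
    opt = DistributedFusedLamb(
        learning_rate=1e-3,              # assumed value, not from the diff
        gradient_accumulation_steps=2,   # acc_steps > 1 enables accumulation
        use_master_acc_grad=False)       # new flag added by this patch
    opt.minimize(loss)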