From 601d7a353da347418116dc4713cc221d68a77783 Mon Sep 17 00:00:00 2001
From: sneaxiy <32832641+sneaxiy@users.noreply.github.com>
Date: Tue, 7 Jun 2022 19:00:06 +0800
Subject: [PATCH] Add use_master_acc_grad for DistributedFusedLamb (#43266)

* add use_master_acc_grad

* add ut
---
 .../optimizers/distributed_fused_lamb_op.cc        |  3 +++
 .../optimizers/distributed_fused_lamb_op.cu        | 15 +++++++++++----
 .../unittests/distributed_fused_lamb_test_base.py  |  9 ++++++++-
 .../test_distributed_fused_lamb_op_with_clip.py    |  4 +++-
 ...stributed_fused_lamb_op_with_gradient_merge.py  |  6 ++++++
 .../incubate/optimizer/distributed_fused_lamb.py   |  3 +++
 6 files changed, 34 insertions(+), 6 deletions(-)

diff --git a/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cc b/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cc
index 0159e250d31..0f9bcc4c2d9 100644
--- a/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cc
+++ b/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cc
@@ -141,6 +141,9 @@ class DistributedFusedLambOpMaker : public framework::OpProtoAndCheckerMaker {
                   "NCCL communication data. If it is false, it would be less accurate "
                   "and be less NCCL communication data.")
         .SetDefault(true);
+    AddAttr<bool>("use_master_acc_grad",
+                  "Whether to use master gradient when acc_steps > 1.")
+        .SetDefault(true);
     AddAttr<bool>("is_grad_scaled_by_nranks",
                   "Whether the input gradient has been scaled by nranks.")
         .SetDefault(true);
diff --git a/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu b/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu
index eb354ef6d75..e7f6223968f 100644
--- a/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu
+++ b/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu
@@ -1193,7 +1193,9 @@ class DistributedFusedLambOpKernel
 
       platform::float16 *fp16_acc_grad = nullptr;
       float *master_acc_grad = nullptr;
+      bool use_master_acc_grad = false;
       if (has_fp16_param) {
+        use_master_acc_grad = ctx.Attr<bool>("use_master_acc_grad");
         auto *fp16_acc_grad_t =
             ctx.Output<framework::Tensor>("FP16AccFusedGrad");
         PADDLE_ENFORCE_NOT_NULL(
@@ -1201,13 +1203,18 @@
                                  "Output(FP16AccFusedGrad) cannot be nullptr "
                                  "when Attr(acc_steps) > 1."));
         if (!fp16_acc_grad_t->IsInitialized()) {
-          fp16_acc_grad_t->Resize({static_cast<int64_t>(3 * fp16_numel)});
+          auto acc_grad_size =
+              use_master_acc_grad ? (3 * fp16_numel) : fp16_numel;
+          fp16_acc_grad_t->Resize({static_cast<int64_t>(acc_grad_size)});
           fp16_acc_grad =
               fp16_acc_grad_t->mutable_data<platform::float16>(place);
         } else {
           fp16_acc_grad = fp16_acc_grad_t->data<platform::float16>();
         }
-        master_acc_grad = reinterpret_cast<float *>(fp16_acc_grad + fp16_numel);
+        if (use_master_acc_grad) {
+          master_acc_grad =
+              reinterpret_cast<float *>(fp16_acc_grad + fp16_numel);
+        }
       }
 
       // Inplace addto
@@ -1222,8 +1229,8 @@ class DistributedFusedLambOpKernel
       }
 
       if (has_fp16_param) {
-        if (acc_steps == 2) {
-          if (rounded_step == 0) {
+        if (acc_steps == 2 || !use_master_acc_grad) {
+          if (rounded_step != 1) {
             LaunchElementwiseAddWithCastKernel(dev_ctx, fp16_acc_grad,
                                                fp16_grad, fp16_acc_grad,
                                                fp16_numel, stream);
diff --git a/python/paddle/fluid/tests/unittests/distributed_fused_lamb_test_base.py b/python/paddle/fluid/tests/unittests/distributed_fused_lamb_test_base.py
index ee2b180586d..611d3d2891b 100644
--- a/python/paddle/fluid/tests/unittests/distributed_fused_lamb_test_base.py
+++ b/python/paddle/fluid/tests/unittests/distributed_fused_lamb_test_base.py
@@ -162,6 +162,7 @@ def run_model(use_distributed_lamb, use_fp16, use_master_param_norm, **kwargs):
         kwargs = dict(kwargs)
         kwargs.pop('clip_after_allreduce', None)
         kwargs.pop('alignment', None)
+        kwargs.pop('use_master_acc_grad', None)
         base_clip = grad_clip if grad_clip is not None else IdentityGradClip(
         )
         kwargs['grad_clip'] = GradClipDecorator(base_clip,
@@ -271,6 +272,7 @@ class TestDistributedFusedLamb(unittest.TestCase):
             distutils.util.strtobool(os.getenv('CLIP_AFTER_ALLREDUCE', 'True')))
         max_global_norm = float(os.getenv('MAX_GLOBAL_NORM', -1.0))
         gm_steps = int(os.getenv('GRADIENT_MERGE_STEPS', 1))
+        use_master_acc_grad = bool(int(os.getenv('USE_MASTER_ACC_GRAD', '1')))
         print('clip_after_allreduce = {}, max_global_norm = {}'.format(
             clip_after_allreduce, max_global_norm))
         return {
@@ -281,9 +283,14 @@
             'grad_clip':
             paddle.nn.ClipGradByGlobalNorm(max_global_norm)
             if max_global_norm > 0 else None,
+            'use_master_acc_grad':
+            use_master_acc_grad,
         }
 
-    def run_main(self, use_fp16, use_master_param_norm=True):
+    def run_main(self,
+                 use_fp16,
+                 use_master_param_norm=True,
+                 use_master_acc_grad=True):
         if not paddle.is_compiled_with_cuda():
             return
 
diff --git a/python/paddle/fluid/tests/unittests/test_distributed_fused_lamb_op_with_clip.py b/python/paddle/fluid/tests/unittests/test_distributed_fused_lamb_op_with_clip.py
index 324da95f37d..b2c2b6e31a5 100644
--- a/python/paddle/fluid/tests/unittests/test_distributed_fused_lamb_op_with_clip.py
+++ b/python/paddle/fluid/tests/unittests/test_distributed_fused_lamb_op_with_clip.py
@@ -36,7 +36,8 @@ def remove_file_if_exists(file_name):
 def run_test(clip_after_allreduce=True,
              max_global_norm=-1.0,
-             gradient_merge_steps=1):
+             gradient_merge_steps=1,
+             use_master_acc_grad=True):
     if not paddle.is_compiled_with_cuda():
         return
     if os.name == 'nt':
         return
@@ -58,6 +59,7 @@
     os.environ['CLIP_AFTER_ALLREDUCE'] = str(clip_after_allreduce)
     os.environ['MAX_GLOBAL_NORM'] = str(max_global_norm)
     os.environ['GRADIENT_MERGE_STEPS'] = str(gradient_merge_steps)
+    os.environ['USE_MASTER_ACC_GRAD'] = str(1 if use_master_acc_grad else 0)
 
     touch_file_env = 'SUCCESS_TOUCH_FILE'
     touch_file_name = 'distributed_fused_lamb_touch_file_{}'.format(os.getpid())
diff --git a/python/paddle/fluid/tests/unittests/test_distributed_fused_lamb_op_with_gradient_merge.py b/python/paddle/fluid/tests/unittests/test_distributed_fused_lamb_op_with_gradient_merge.py
index c2089b1d97d..01ca09916a1 100644
--- a/python/paddle/fluid/tests/unittests/test_distributed_fused_lamb_op_with_gradient_merge.py
+++ b/python/paddle/fluid/tests/unittests/test_distributed_fused_lamb_op_with_gradient_merge.py
@@ -23,6 +23,12 @@ class TestDistributedFusedLambGradientMerge(unittest.TestCase):
                  max_global_norm=-1.0,
                  gradient_merge_steps=2)
 
+    def test_gm_with_fp16_acc_grad(self):
+        run_test(clip_after_allreduce=True,
+                 max_global_norm=-1.0,
+                 gradient_merge_steps=2,
+                 use_master_acc_grad=False)
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/incubate/optimizer/distributed_fused_lamb.py b/python/paddle/incubate/optimizer/distributed_fused_lamb.py
index 4fddaff7ec9..3029c3a294a 100644
--- a/python/paddle/incubate/optimizer/distributed_fused_lamb.py
+++ b/python/paddle/incubate/optimizer/distributed_fused_lamb.py
@@ -40,6 +40,7 @@ class DistributedFusedLamb(Optimizer):
                  alignment=128,
                  use_master_param_norm=True,
                  gradient_accumulation_steps=1,
+                 use_master_acc_grad=True,
                  name=None):
         assert not framework._non_static_mode(
         ), "DistributedFusedLamb does not support dygraph mode"
@@ -67,6 +68,7 @@ class DistributedFusedLamb(Optimizer):
         self._ring_id = 0
         self._use_master_param_norm = use_master_param_norm
         self._gradient_accumulation_steps = gradient_accumulation_steps
+        self._use_master_acc_grad = use_master_acc_grad
         assert self._gradient_accumulation_steps >= 1
 
         self.helper = LayerHelper('distributed_fused_lamb')
@@ -353,5 +355,6 @@
                 'use_master_param_norm': self._use_master_param_norm,
                 'is_grad_scaled_by_nranks': self._is_grad_scaled_by_nranks,
                 'acc_steps': self._gradient_accumulation_steps,
+                'use_master_acc_grad': self._use_master_acc_grad,
             })
         return [lamb_op]
-- 
GitLab
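
Usage sketch (not part of the patch above). The new use_master_acc_grad flag only takes effect when gradient_accumulation_steps > 1 and there are FP16 parameters: with the default True, the kernel keeps an FP32 master copy of the accumulated gradient (FP16AccFusedGrad is sized 3 * fp16_numel); with False, it accumulates directly in FP16 and skips the master copy (FP16AccFusedGrad is sized fp16_numel). The snippet below is a minimal, hypothetical example of passing the flag; it assumes a CUDA build of Paddle and static-graph mode, and the non-flag arguments and surrounding setup are illustrative only, not taken from the patch.

    # Hypothetical usage sketch; actually training with DistributedFusedLamb
    # additionally needs a CUDA build and a distributed launch (not shown).
    import paddle
    from paddle.incubate.optimizer import DistributedFusedLamb

    paddle.enable_static()  # DistributedFusedLamb does not support dygraph mode

    opt = DistributedFusedLamb(
        learning_rate=1e-3,              # illustrative value
        gradient_accumulation_steps=2,   # acc_steps > 1 enables gradient accumulation
        use_master_acc_grad=False)       # new flag: accumulate FP16 gradients without
                                         # an FP32 master copy of the accumulated grad

The tests above exercise the same combination (gradient_merge_steps=2 with use_master_acc_grad=False) through the USE_MASTER_ACC_GRAD environment variable.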