Unverified commit 601d7a35, authored by sneaxiy and committed by GitHub

Add use_master_acc_grad for DistributedFusedLamb (#43266)

* add use_master_acc_grad

* add ut
Parent 5dcebb9b
@@ -141,6 +141,9 @@ class DistributedFusedLambOpMaker : public framework::OpProtoAndCheckerMaker {
               "NCCL communication data. If it is false, it would be less accurate "
               "and be less NCCL communication data.")
         .SetDefault(true);
+    AddAttr<bool>("use_master_acc_grad",
+                  "Whether to use master gradient when acc_steps > 1.")
+        .SetDefault(true);
     AddAttr<bool>("is_grad_scaled_by_nranks",
                   "Whether the input gradient has been scaled by nranks.")
         .SetDefault(true);
......
@@ -1193,7 +1193,9 @@ class DistributedFusedLambOpKernel<platform::CUDADeviceContext, T>
       platform::float16 *fp16_acc_grad = nullptr;
       float *master_acc_grad = nullptr;
+      bool use_master_acc_grad = false;
       if (has_fp16_param) {
+        use_master_acc_grad = ctx.Attr<bool>("use_master_acc_grad");
         auto *fp16_acc_grad_t =
             ctx.Output<framework::Tensor>("FP16AccFusedGrad");
         PADDLE_ENFORCE_NOT_NULL(
@@ -1201,13 +1203,18 @@ class DistributedFusedLambOpKernel<platform::CUDADeviceContext, T>
                 "Output(FP16AccFusedGrad) cannot be nullptr "
                 "when Attr(acc_steps) > 1."));
         if (!fp16_acc_grad_t->IsInitialized()) {
-          fp16_acc_grad_t->Resize({static_cast<int64_t>(3 * fp16_numel)});
+          auto acc_grad_size =
+              use_master_acc_grad ? (3 * fp16_numel) : fp16_numel;
+          fp16_acc_grad_t->Resize({static_cast<int64_t>(acc_grad_size)});
           fp16_acc_grad =
               fp16_acc_grad_t->mutable_data<platform::float16>(place);
         } else {
           fp16_acc_grad = fp16_acc_grad_t->data<platform::float16>();
         }
-        master_acc_grad = reinterpret_cast<float *>(fp16_acc_grad + fp16_numel);
+        if (use_master_acc_grad) {
+          master_acc_grad =
+              reinterpret_cast<float *>(fp16_acc_grad + fp16_numel);
+        }
       }
       // Inplace addto
@@ -1222,8 +1229,8 @@ class DistributedFusedLambOpKernel<platform::CUDADeviceContext, T>
         }
         if (has_fp16_param) {
-          if (acc_steps == 2) {
-            if (rounded_step == 0) {
+          if (acc_steps == 2 || !use_master_acc_grad) {
+            if (rounded_step != 1) {
               LaunchElementwiseAddWithCastKernel(dev_ctx, fp16_acc_grad,
                                                  fp16_grad, fp16_acc_grad,
                                                  fp16_numel, stream);
......
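Note on the kernel change above: when use_master_acc_grad is enabled, the FP16AccFusedGrad buffer holds the fp16 accumulated gradient followed by an fp32 master copy addressed via reinterpret_cast<float *>(fp16_acc_grad + fp16_numel); since one fp32 value occupies two fp16-sized slots, the buffer is resized to 3 * fp16_numel fp16 elements, and to just fp16_numel when the master copy is skipped. A minimal Python sketch of that sizing arithmetic (illustrative only, not Paddle code):

def acc_grad_buffer_elements(fp16_numel, use_master_acc_grad=True):
    # fp16 accumulator takes fp16_numel slots; the optional fp32 master
    # accumulator takes another 2 * fp16_numel fp16-sized slots.
    return 3 * fp16_numel if use_master_acc_grad else fp16_numel

assert acc_grad_buffer_elements(1024, True) == 3072   # 2 KiB fp16 + 4 KiB fp32
assert acc_grad_buffer_elements(1024, False) == 1024  # 2 KiB fp16 only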
@@ -162,6 +162,7 @@ def run_model(use_distributed_lamb, use_fp16, use_master_param_norm, **kwargs):
     kwargs = dict(kwargs)
     kwargs.pop('clip_after_allreduce', None)
     kwargs.pop('alignment', None)
+    kwargs.pop('use_master_acc_grad', None)
     base_clip = grad_clip if grad_clip is not None else IdentityGradClip(
     )
     kwargs['grad_clip'] = GradClipDecorator(base_clip,
@@ -271,6 +272,7 @@ class TestDistributedFusedLamb(unittest.TestCase):
             distutils.util.strtobool(os.getenv('CLIP_AFTER_ALLREDUCE', 'True')))
         max_global_norm = float(os.getenv('MAX_GLOBAL_NORM', -1.0))
         gm_steps = int(os.getenv('GRADIENT_MERGE_STEPS', 1))
+        use_master_acc_grad = bool(int(os.getenv('USE_MASTER_ACC_GRAD', '1')))
         print('clip_after_allreduce = {}, max_global_norm = {}'.format(
             clip_after_allreduce, max_global_norm))
         return {
@@ -281,9 +283,14 @@ class TestDistributedFusedLamb(unittest.TestCase):
             'grad_clip':
             paddle.nn.ClipGradByGlobalNorm(max_global_norm)
             if max_global_norm > 0 else None,
+            'use_master_acc_grad':
+            use_master_acc_grad,
         }

-    def run_main(self, use_fp16, use_master_param_norm=True):
+    def run_main(self,
+                 use_fp16,
+                 use_master_param_norm=True,
+                 use_master_acc_grad=True):
         if not paddle.is_compiled_with_cuda():
             return
......
@@ -36,7 +36,8 @@ def remove_file_if_exists(file_name):
 def run_test(clip_after_allreduce=True,
              max_global_norm=-1.0,
-             gradient_merge_steps=1):
+             gradient_merge_steps=1,
+             use_master_acc_grad=True):
     if not paddle.is_compiled_with_cuda():
         return
     if os.name == 'nt':
@@ -58,6 +59,7 @@ def run_test(clip_after_allreduce=True,
     os.environ['CLIP_AFTER_ALLREDUCE'] = str(clip_after_allreduce)
     os.environ['MAX_GLOBAL_NORM'] = str(max_global_norm)
     os.environ['GRADIENT_MERGE_STEPS'] = str(gradient_merge_steps)
+    os.environ['USE_MASTER_ACC_GRAD'] = str(1 if use_master_acc_grad else 0)
     touch_file_env = 'SUCCESS_TOUCH_FILE'
     touch_file_name = 'distributed_fused_lamb_touch_file_{}'.format(os.getpid())
......
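The launcher and the test base only communicate through environment variables, so use_master_acc_grad is serialized to '0'/'1' by run_test and parsed back to a bool before it reaches the optimizer kwargs. A tiny standalone sketch of that round trip (the helper names are illustrative, not part of the test code):

import os

def encode_flag(use_master_acc_grad=True):
    # mirrors run_test above
    os.environ['USE_MASTER_ACC_GRAD'] = str(1 if use_master_acc_grad else 0)

def decode_flag():
    # mirrors the test base: defaults to enabled when the variable is unset
    return bool(int(os.getenv('USE_MASTER_ACC_GRAD', '1')))

encode_flag(False)
assert decode_flag() is False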
@@ -23,6 +23,12 @@ class TestDistributedFusedLambGradientMerge(unittest.TestCase):
                  max_global_norm=-1.0,
                  gradient_merge_steps=2)

+    def test_gm_with_fp16_acc_grad(self):
+        run_test(clip_after_allreduce=True,
+                 max_global_norm=-1.0,
+                 gradient_merge_steps=2,
+                 use_master_acc_grad=False)
+

 if __name__ == "__main__":
     unittest.main()
@@ -40,6 +40,7 @@ class DistributedFusedLamb(Optimizer):
                  alignment=128,
                  use_master_param_norm=True,
                  gradient_accumulation_steps=1,
+                 use_master_acc_grad=True,
                  name=None):
         assert not framework._non_static_mode(
         ), "DistributedFusedLamb does not support dygraph mode"
@@ -67,6 +68,7 @@ class DistributedFusedLamb(Optimizer):
         self._ring_id = 0
         self._use_master_param_norm = use_master_param_norm
         self._gradient_accumulation_steps = gradient_accumulation_steps
+        self._use_master_acc_grad = use_master_acc_grad
         assert self._gradient_accumulation_steps >= 1
         self.helper = LayerHelper('distributed_fused_lamb')
@@ -353,5 +355,6 @@ class DistributedFusedLamb(Optimizer):
             'use_master_param_norm': self._use_master_param_norm,
             'is_grad_scaled_by_nranks': self._is_grad_scaled_by_nranks,
             'acc_steps': self._gradient_accumulation_steps,
+            'use_master_acc_grad': self._use_master_acc_grad,
         })
         return [lamb_op]
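For reference, a hedged usage sketch of the new option: use_master_acc_grad only matters when gradient_accumulation_steps > 1 and defaults to True; disabling it accumulates gradients in fp16 only, trading a little accuracy for a smaller accumulation buffer. The import path and the extra constructor arguments below are assumptions for illustration, not taken from this diff:

import paddle
from paddle.incubate import DistributedFusedLamb  # assumed import path

paddle.enable_static()  # the optimizer asserts that dygraph mode is off

opt = DistributedFusedLamb(
    learning_rate=1e-3,               # assumed standard Optimizer argument
    gradient_accumulation_steps=2,    # acc_steps > 1, so the flag takes effect
    use_master_acc_grad=False)        # skip the fp32 master accumulator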