From 8693e1d0467b000aa6f3900fa5236b53102e7eb2 Mon Sep 17 00:00:00 2001
From: Wenyu
Date: Mon, 20 Jun 2022 13:55:56 +0800
Subject: [PATCH] AdamWDL builder and VIT checkpoint functionality (#6232)

* upgrade adamw to adamwdl for transformer

* rename arg
---
 .../modeling/backbones/vision_transformer.py | 17 ++++--
 ppdet/optimizer/adamw.py                     | 59 ++++++++++---------
 ppdet/optimizer/optimizer.py                 |  7 +++
 3 files changed, 51 insertions(+), 32 deletions(-)

diff --git a/ppdet/modeling/backbones/vision_transformer.py b/ppdet/modeling/backbones/vision_transformer.py
index ffa9f3c56..8687b2c15 100644
--- a/ppdet/modeling/backbones/vision_transformer.py
+++ b/ppdet/modeling/backbones/vision_transformer.py
@@ -340,12 +340,15 @@ class VisionTransformer(nn.Layer):
                  use_abs_pos_emb=False,
                  use_sincos_pos_emb=True,
                  with_fpn=True,
+                 use_checkpoint=False,
                  **args):
         super().__init__()
         self.img_size = img_size
         self.embed_dim = embed_dim
         self.with_fpn = with_fpn
-
+        self.use_checkpoint = use_checkpoint
+        if use_checkpoint:
+            print('please set: FLAGS_allocator_strategy=naive_best_fit')
         self.patch_embed = PatchEmbed(
             img_size=img_size,
             patch_size=patch_size,
@@ -575,7 +578,7 @@ class VisionTransformer(nn.Layer):
 
     def forward(self, x):
         x = x['image'] if isinstance(x, dict) else x
-        _, _, w, h = x.shape
+        _, _, h, w = x.shape
 
         x = self.patch_embed(x)
 
@@ -586,7 +589,8 @@
         x = paddle.concat([cls_tokens, x], axis=1)
 
         if self.pos_embed is not None:
-            x = x + self.interpolate_pos_encoding(x, w, h)
+            # x = x + self.interpolate_pos_encoding(x, w, h)
+            x = x + self.interpolate_pos_encoding(x, h, w)
 
         x = self.pos_drop(x)
 
@@ -597,7 +601,12 @@
 
         feats = []
         for idx, blk in enumerate(self.blocks):
-            x = blk(x, rel_pos_bias)
+            if self.use_checkpoint:
+                x = paddle.distributed.fleet.utils.recompute(
+                    blk, x, rel_pos_bias, **{"preserve_rng_state": True})
+            else:
+                x = blk(x, rel_pos_bias)
+
             if idx in self.out_indices:
                 xp = paddle.reshape(
                     paddle.transpose(
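
Note on the new use_checkpoint path above: each transformer block is routed through paddle.distributed.fleet.utils.recompute, which discards the block's intermediate activations in the forward pass and recomputes them during backward, trading extra compute for lower peak memory; the backbone also prints a reminder to set FLAGS_allocator_strategy=naive_best_fit. The standalone sketch below shows the same call pattern in isolation; TinyBlock, the tensor shapes, and the variable names are illustrative stand-ins, not code from this patch.

import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from paddle.distributed.fleet.utils import recompute

class TinyBlock(nn.Layer):
    # Hypothetical stand-in for a ViT block; only the recompute call mirrors the patch.
    def __init__(self, dim=64):
        super().__init__()
        self.fc = nn.Linear(dim, dim)

    def forward(self, x):
        return F.gelu(self.fc(x))

block = TinyBlock()
x = paddle.randn([2, 16, 64])
x.stop_gradient = False
# Activations inside `block` are recomputed in the backward pass instead of being cached.
y = recompute(block, x, **{"preserve_rng_state": True})
y.sum().backward()
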
diff --git a/ppdet/optimizer/adamw.py b/ppdet/optimizer/adamw.py
index dcdea8c1c..821135da0 100644
--- a/ppdet/optimizer/adamw.py
+++ b/ppdet/optimizer/adamw.py
@@ -18,6 +18,7 @@ from __future__ import print_function
 
 from paddle.optimizer import AdamW
 from functools import partial
+import re
 
 
 def layerwise_lr_decay(decay_rate, name_dict, n_layers, param):
@@ -34,15 +35,20 @@
     """
     ratio = 1.0
     static_name = name_dict[param.name]
-    if "blocks" in static_name:
-        idx = static_name.find("blocks.")
-        layer = int(static_name[idx:].split(".")[1])
+    if 'blocks.' in static_name or 'layers.' in static_name:
+        idx_1 = static_name.find('blocks.')
+        idx_2 = static_name.find('layers.')
+        assert any([x >= 0 for x in [idx_1, idx_2]]), ''
+        idx = idx_1 if idx_1 >= 0 else idx_2
+        # idx = re.findall('[blocks|layers]\.(\d+)\.', static_name)[0]
+
+        layer = int(static_name[idx:].split('.')[1])
         ratio = decay_rate**(n_layers - layer)
-    elif "cls_token" in static_name or 'patch_embed' in static_name:
+    elif 'cls_token' in static_name or 'patch_embed' in static_name:
         ratio = decay_rate**(n_layers + 1)
 
-    param.optimize_attr["learning_rate"] *= ratio
+    param.optimize_attr['learning_rate'] *= ratio
 
 
 class AdamWDL(AdamW):
@@ -156,16 +162,16 @@
                  multi_precision=False,
                  layerwise_decay=1.0,
                  n_layers=12,
-                 set_param_lr_fun=None,
+                 set_param_lr_func=None,
                  name_dict=None,
                  name=None):
         if not isinstance(layerwise_decay, float):
             raise TypeError("coeff should be float or Tensor.")
         self.layerwise_decay = layerwise_decay
         self.n_layers = n_layers
-        self.set_param_lr_fun = partial(
-            set_param_lr_fun, layerwise_decay, name_dict,
-            n_layers) if set_param_lr_fun is not None else set_param_lr_fun
+        self.set_param_lr_func = partial(
+            set_param_lr_func, layerwise_decay, name_dict,
+            n_layers) if set_param_lr_func is not None else set_param_lr_func
         super(AdamWDL, self).__init__(
             learning_rate=learning_rate,
             parameters=parameters,
@@ -180,36 +186,35 @@
             multi_precision=multi_precision)
 
     def _append_optimize_op(self, block, param_and_grad):
-        if self.set_param_lr_fun is None:
+        if self.set_param_lr_func is None:
             return super(AdamWDL, self)._append_optimize_op(block,
                                                             param_and_grad)
 
         self._append_decoupled_weight_decay(block, param_and_grad)
         prev_lr = param_and_grad[0].optimize_attr["learning_rate"]
-        self.set_param_lr_fun(param_and_grad[0])
+        self.set_param_lr_func(param_and_grad[0])
         # excute Adam op
         res = super(AdamW, self)._append_optimize_op(block, param_and_grad)
         param_and_grad[0].optimize_attr["learning_rate"] = prev_lr
         return res
 
 
-def build_adamw(model,
-                lr=1e-4,
-                weight_decay=0.05,
-                betas=(0.9, 0.999),
-                layer_decay=0.65,
-                num_layers=None,
-                filter_bias_and_bn=True,
-                skip_decay_names=None,
-                set_param_lr_fun=None):
+def build_adamwdl(model,
+                  lr=1e-4,
+                  weight_decay=0.05,
+                  betas=(0.9, 0.999),
+                  layer_decay=0.65,
+                  num_layers=None,
+                  filter_bias_and_bn=True,
+                  skip_decay_names=None,
+                  set_param_lr_func='layerwise_lr_decay'):
 
     if skip_decay_names and filter_bias_and_bn:
         decay_dict = {
-            param.name: not (len(param.shape) == 1 or name.endswith(".bias") or
+            param.name: not (len(param.shape) == 1 or name.endswith('.bias') or
                              any([_n in name for _n in skip_decay_names]))
             for name, param in model.named_parameters()
         }
-
         parameters = [p for p in model.parameters()]
     else:
@@ -221,17 +226,15 @@
     if decay_dict is not None:
         opt_args['apply_decay_param_fun'] = lambda n: decay_dict[n]
 
-    if isinstance(set_param_lr_fun, str):
-        set_param_lr_fun = eval(set_param_lr_fun)
-    opt_args['set_param_lr_fun'] = set_param_lr_fun
+    if isinstance(set_param_lr_func, str):
+        set_param_lr_func = eval(set_param_lr_func)
+    opt_args['set_param_lr_func'] = set_param_lr_func
 
     opt_args['beta1'] = betas[0]
     opt_args['beta2'] = betas[1]
     opt_args['layerwise_decay'] = layer_decay
 
-    name_dict = dict()
-    for n, p in model.named_parameters():
-        name_dict[p.name] = n
+    name_dict = {p.name: n for n, p in model.named_parameters()}
 
     opt_args['name_dict'] = name_dict
     opt_args['n_layers'] = num_layers
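
For intuition about the layer-wise decay applied by AdamWDL: layerwise_lr_decay scales a parameter's learning rate by decay_rate ** (n_layers - layer) when its name sits under blocks.<i>. or layers.<i>., by decay_rate ** (n_layers + 1) for cls_token and patch_embed, and leaves every other parameter at the base rate. The snippet below simply replays that branch logic on a few hypothetical parameter names; the names and the 12-layer / 0.65 setting are illustrative, not taken from a config.

# Illustrative replay of the multipliers produced by layerwise_lr_decay above.
decay_rate, n_layers = 0.65, 12

def ratio_for(static_name):
    # Mirrors the branch structure of layerwise_lr_decay; the names used below are hypothetical.
    if 'blocks.' in static_name or 'layers.' in static_name:
        idx = static_name.find('blocks.')
        if idx < 0:
            idx = static_name.find('layers.')
        layer = int(static_name[idx:].split('.')[1])
        return decay_rate**(n_layers - layer)
    if 'cls_token' in static_name or 'patch_embed' in static_name:
        return decay_rate**(n_layers + 1)
    return 1.0

print(ratio_for('patch_embed.proj.weight'))   # 0.65**13 ~= 0.0037
print(ratio_for('blocks.0.attn.qkv.weight'))  # 0.65**12 ~= 0.0057 (earliest block, smallest lr)
print(ratio_for('blocks.11.mlp.fc1.weight'))  # 0.65**1  =  0.65   (last block, close to base lr)
print(ratio_for('head.weight'))               # 1.0 (base learning rate, no decay)
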
diff --git a/ppdet/optimizer/optimizer.py b/ppdet/optimizer/optimizer.py
index 7be94d85e..fed47e85a 100644
--- a/ppdet/optimizer/optimizer.py
+++ b/ppdet/optimizer/optimizer.py
@@ -27,6 +27,8 @@ import paddle.regularizer as regularizer
 from ppdet.core.workspace import register, serializable
 import copy
 
+from .adamw import AdamWDL, build_adamwdl
+
 __all__ = ['LearningRate', 'OptimizerBuilder']
 
 from ppdet.utils.logger import setup_logger
@@ -317,8 +319,13 @@ class OptimizerBuilder():
         optim_args = self.optimizer.copy()
         optim_type = optim_args['type']
         del optim_args['type']
+
+        if optim_type == 'AdamWDL':
+            return build_adamwdl(model, lr=learning_rate, **optim_args)
+
         if optim_type != 'AdamW':
             optim_args['weight_decay'] = regularization
+
         op = getattr(optimizer, optim_type)
 
         if 'param_groups' in optim_args:
-- 
GitLab
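
With this dispatch in place, an optimizer config whose type is 'AdamWDL' is routed through build_adamwdl instead of the generic paddle.optimizer lookup, and the remaining config keys are forwarded as keyword arguments. The sketch below imitates that branch with a toy model; the module, the argument values, and the assumption that the unchanged parts of build_adamwdl wire these options into AdamWDL as before are all illustrative, not taken from a shipped PaddleDetection config.

import paddle.nn as nn
from ppdet.optimizer.adamw import build_adamwdl

class TinyViTLike(nn.Layer):
    # Hypothetical stand-in: parameters named 'blocks.<i>....' pick up the
    # layer-wise decayed learning rate, as the real ViT backbone's do.
    def __init__(self, depth=4, dim=32):
        super().__init__()
        self.patch_embed = nn.Linear(dim, dim)
        self.blocks = nn.LayerList([nn.Linear(dim, dim) for _ in range(depth)])

    def forward(self, x):
        x = self.patch_embed(x)
        for blk in self.blocks:
            x = blk(x)
        return x

model = TinyViTLike()
optim_cfg = {                        # stands in for the OptimizerBuilder.optimizer config entry
    'type': 'AdamWDL',
    'weight_decay': 0.05,
    'layer_decay': 0.65,
    'num_layers': 4,
    'skip_decay_names': ['patch_embed'],
}
optim_type = optim_cfg.pop('type')
if optim_type == 'AdamWDL':          # mirrors the new branch in OptimizerBuilder.__call__
    optimizer = build_adamwdl(model, lr=1e-4, **optim_cfg)
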