Unverified commit c22803f2, authored by Wenyu, committed by GitHub

upgrade adamw for new paddle version (#7507)

Parent 1a336e5f
@@ -16,10 +16,15 @@
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import paddle
 from paddle.optimizer import AdamW
 from functools import partial
 import re
 
+IS_PADDLE_LATER_2_4 = (
+    int(paddle.version.major) >= 2 and
+    int(paddle.version.minor) >= 4) or int(paddle.version.major) == 0
+
 
 def layerwise_lr_decay(decay_rate, name_dict, n_layers, param):
     """
@@ -48,7 +53,10 @@ def layerwise_lr_decay(decay_rate, name_dict, n_layers, param):
     elif 'cls_token' in static_name or 'patch_embed' in static_name:
         ratio = decay_rate**(n_layers + 1)
-    param.optimize_attr['learning_rate'] *= ratio
+    if IS_PADDLE_LATER_2_4:
+        return ratio
+    else:
+        param.optimize_attr['learning_rate'] *= ratio
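The behavioural change gated by IS_PADDLE_LATER_2_4 is that layerwise_lr_decay now returns the per-parameter ratio (consumed through AdamW's lr_ratio argument, see the constructor hunk below) instead of scaling param.optimize_attr['learning_rate'] in place. The following standalone sketch is not code from this commit; it mirrors only the cls_token/patch_embed branch visible above, and toy_lr_decay with its 1.0 fallback ratio is an illustrative assumption.

decay_rate, n_layers = 0.75, 12

def toy_lr_decay(static_name, paddle_later_2_4=True, optimize_attr=None):
    # Mirror only the branch shown in the hunk above: cls_token / patch_embed
    # parameters get the strongest decay, decay_rate ** (n_layers + 1).
    if 'cls_token' in static_name or 'patch_embed' in static_name:
        ratio = decay_rate**(n_layers + 1)
    else:
        ratio = 1.0  # assumption: other parameters keep the base LR in this toy
    if paddle_later_2_4:
        return ratio  # Paddle >= 2.4: value is handed to AdamW via lr_ratio
    optimize_attr['learning_rate'] *= ratio  # older Paddle: in-place scaling

print(toy_lr_decay('vit.patch_embed.proj.weight'))  # 0.75 ** 13 ~= 0.0238
print(toy_lr_decay('head.weight'))                  # 1.0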
 class AdamWDL(AdamW):
@@ -172,31 +180,51 @@ class AdamWDL(AdamW):
         self.set_param_lr_func = partial(
             set_param_lr_func, layerwise_decay, name_dict,
             n_layers) if set_param_lr_func is not None else set_param_lr_func
-        super(AdamWDL, self).__init__(
-            learning_rate=learning_rate,
-            parameters=parameters,
-            beta1=beta1,
-            beta2=beta2,
-            epsilon=epsilon,
-            grad_clip=grad_clip,
-            name=name,
-            apply_decay_param_fun=apply_decay_param_fun,
-            weight_decay=weight_decay,
-            lazy_mode=lazy_mode,
-            multi_precision=multi_precision)
-
-    def _append_optimize_op(self, block, param_and_grad):
-        if self.set_param_lr_func is None:
-            return super(AdamWDL, self)._append_optimize_op(block,
-                                                            param_and_grad)
-
-        self._append_decoupled_weight_decay(block, param_and_grad)
-        prev_lr = param_and_grad[0].optimize_attr["learning_rate"]
-        self.set_param_lr_func(param_and_grad[0])
-        # execute Adam op
-        res = super(AdamW, self)._append_optimize_op(block, param_and_grad)
-        param_and_grad[0].optimize_attr["learning_rate"] = prev_lr
-        return res
+
+        if IS_PADDLE_LATER_2_4:
+            super(AdamWDL, self).__init__(
+                learning_rate=learning_rate,
+                parameters=parameters,
+                beta1=beta1,
+                beta2=beta2,
+                epsilon=epsilon,
+                grad_clip=grad_clip,
+                name=name,
+                apply_decay_param_fun=apply_decay_param_fun,
+                weight_decay=weight_decay,
+                lazy_mode=lazy_mode,
+                multi_precision=multi_precision,
+                lr_ratio=self.set_param_lr_func)
+        else:
+            super(AdamWDL, self).__init__(
+                learning_rate=learning_rate,
+                parameters=parameters,
+                beta1=beta1,
+                beta2=beta2,
+                epsilon=epsilon,
+                grad_clip=grad_clip,
+                name=name,
+                apply_decay_param_fun=apply_decay_param_fun,
+                weight_decay=weight_decay,
+                lazy_mode=lazy_mode,
+                multi_precision=multi_precision)
+
+
+def _append_optimize_op(self, block, param_and_grad):
+    if self.set_param_lr_func is None:
+        return super(AdamWDL, self)._append_optimize_op(block, param_and_grad)
+
+    self._append_decoupled_weight_decay(block, param_and_grad)
+    prev_lr = param_and_grad[0].optimize_attr["learning_rate"]
+    self.set_param_lr_func(param_and_grad[0])
+    # execute Adam op
+    res = super(AdamW, self)._append_optimize_op(block, param_and_grad)
+    param_and_grad[0].optimize_attr["learning_rate"] = prev_lr
+    return res
+
+
+if not IS_PADDLE_LATER_2_4:
+    AdamWDL._append_optimize_op = _append_optimize_op
 def build_adamwdl(model,
......
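On Paddle >= 2.4 the layer-wise decay is routed through AdamW's lr_ratio argument (see the constructor hunk above), so the _append_optimize_op override is only monkey-patched onto AdamWDL for older releases. Below is a hedged, self-contained sketch of that lr_ratio hook, not code from this repository; the Linear model, the toy_lr_ratio rule and the hyper-parameters are illustrative assumptions, and some Paddle builds may restrict lr_ratio to GPU.

import paddle
from paddle.optimizer import AdamW

linear = paddle.nn.Linear(4, 4)

def toy_lr_ratio(param):
    # Illustrative rule only: 1-D (bias-like) parameters train at 10% of the
    # base learning rate, everything else at the full rate.
    return 0.1 if len(param.shape) == 1 else 1.0

opt = AdamW(
    learning_rate=1e-3,
    parameters=linear.parameters(),
    weight_decay=0.01,
    lr_ratio=toy_lr_ratio)  # per-parameter LR multiplier, the hook AdamWDL passes

loss = paddle.mean(linear(paddle.rand([2, 4])))
loss.backward()
opt.step()
opt.clear_grad()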