Unverified commit a1f8ea54, authored by wuzewu, committed by GitHub

strategy support module v2 params_layer (#343)

* strategy support layer_params
Parent 2ef7c1e9
@@ -202,9 +202,13 @@ class CombinedStrategy(DefaultStrategy):
             "noam_decay": False,
             "discriminative": {
                 "blocks": 0,
+                "params_layer": None,
                 "factor": 2.6
             },
-            "gradual_unfreeze": 0,
+            "gradual_unfreeze": {
+                "blocks": 0,
+                "params_layer": None,
+            },
             "slanted_triangle": {
                 "cut_fraction": 0.0,
                 "ratio": 32
@@ -234,6 +238,38 @@ class CombinedStrategy(DefaultStrategy):
         for name in clip:
             self.check_assign(self.clip, name, clip[name])

+        # resolve conflicts between strategies
+        if self.scheduler["discriminative"]["params_layer"] and self.scheduler[
+                "discriminative"]["blocks"]:
+            logger.warning(
+                "Both params_layer and blocks have been set in discriminative, "
+                "only params_layer will take effect")
+            self.scheduler["discriminative"]["blocks"] = 0
+
+        if self.scheduler["gradual_unfreeze"]["params_layer"] and self.scheduler[
+                "gradual_unfreeze"]["blocks"]:
+            logger.warning(
+                "Both params_layer and blocks have been set in gradual_unfreeze, "
+                "only params_layer will take effect")
+            self.scheduler["gradual_unfreeze"]["blocks"] = 0
+
+        if self.scheduler["slanted_triangle"]["cut_fraction"] and (
+                self.scheduler["warmup"] or self.scheduler["noam_decay"]
+                or self.scheduler["linear_decay"]["start_point"] < 1):
+            logger.warning(
+                "You are using the slanted_triangle learning rate strategy, "
+                "which makes warmup, noam_decay and linear_decay useless")
+            self.scheduler["warmup"] = 0.0
+            self.scheduler["noam_decay"] = False
+            self.scheduler["linear_decay"]["start_point"] = 1
+
+        if self.scheduler["noam_decay"] and self.scheduler["linear_decay"][
+                "start_point"] < 1:
+            logger.warning(
+                "Both noam_decay and linear_decay have been set, "
+                "only noam_decay will take effect")
+            self.scheduler["linear_decay"]["start_point"] = 1
+
         self.epoch = 0
         self.main_program = None
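
The rule implemented above: an explicit params_layer map always wins over automatic block splitting, and incompatible schedulers are silenced with a warning rather than an error. A simplified, standalone sketch of the params_layer/blocks precedence:

def resolve(sub):
    # sub is one scheduler entry, e.g. scheduler["discriminative"]
    if sub["params_layer"] and sub["blocks"]:
        print("warning: only params_layer will take effect")
        sub["blocks"] = 0
    return sub

print(resolve({"params_layer": {"cls_out.w_0": 0}, "blocks": 3, "factor": 2.6}))
# {'params_layer': {'cls_out.w_0': 0}, 'blocks': 0, 'factor': 2.6}
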
@@ -244,9 +280,9 @@ class CombinedStrategy(DefaultStrategy):
                 sub_dict = dictionary[key]
                 for sub_name in value:
                     self.check_assign(sub_dict, sub_name, value[sub_name])
-            elif isinstance(dictionary[key],
-                            type(value)) or (isinstance(dictionary[key], float)
-                                             and isinstance(value, (float, int))):
+            elif isinstance(dictionary[key], type(value)) or (
+                    isinstance(dictionary[key], float)
+                    and isinstance(value, (float, int))) or dictionary[key] is None:
                 dictionary[key] = value
             else:
                 if isinstance(dictionary[key], dict):
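
The new `dictionary[key] is None` branch exists because the params_layer defaults above are None: a None default must accept a user-supplied value of any type, while typed defaults still reject mismatched values. A simplified re-implementation showing the effect:

def check_assign(d, key, value):
    # assignment is allowed when the types match or the default is None
    if isinstance(d[key], type(value)) or d[key] is None:
        d[key] = value
    else:
        raise ValueError("%s can not be assigned a %s" % (key, type(value)))

defaults = {"params_layer": None, "factor": 2.6}
check_assign(defaults, "params_layer", {"cls_out.w_0": 0})  # accepted: default was None
print(defaults)
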
@@ -283,52 +319,49 @@ class CombinedStrategy(DefaultStrategy):
             persistable=True,
             name="learning_rate")

-        if not self.scheduler["slanted_triangle"]["cut_fraction"]:
-            warmup_steps = int(max_train_steps * self.scheduler["warmup"])
-            linear_decay_start = int(
-                max_train_steps * self.scheduler["linear_decay"]["start_point"])
-            if linear_decay_start < warmup_steps:
-                logger.warning(
-                    "linear decay can not start during warmup process, "
-                    "it will start after warmup ends!")
-                linear_decay_start = warmup_steps
-            if self.scheduler["noam_decay"]:
-                if warmup_steps > 0:
-                    scheduled_lr = fluid.layers.learning_rate_scheduler \
-                        .noam_decay(1 / (warmup_steps * (self.learning_rate ** 2)),
-                                    warmup_steps)
-                else:
-                    logger.warning(
-                        "Noam decay learning rate scheduler should have positive "
-                        "warmup steps, using constant learning rate instead!")
-            if not self.scheduler["noam_decay"] and \
-                    (warmup_steps > 0 or self.scheduler["linear_decay"]["start_point"] < 1):
-                with self.main_program._lr_schedule_guard():
-                    global_step = lr_scheduler._decay_step_counter()
-                    with control_flow.Switch() as switch:
-                        if warmup_steps > 0:
-                            with switch.case(global_step < warmup_steps):
-                                decayed_lr = self.learning_rate * global_step * 1.0 / warmup_steps
-                                fluid.layers.assign(decayed_lr, scheduled_lr)
-                        if self.scheduler["linear_decay"]["start_point"] < 1:
-                            with switch.case(global_step >= linear_decay_start):
-                                decayed_lr = lr_scheduler.polynomial_decay(
-                                    learning_rate=self.learning_rate,
-                                    decay_steps=max_train_steps,
-                                    end_learning_rate=self.scheduler[
-                                        "linear_decay"]["end_learning_rate"],
-                                    power=1.0,
-                                    cycle=False)
-                                fluid.layers.assign(decayed_lr, scheduled_lr)
-        else:
-            if self.scheduler["warmup"] or self.scheduler[
-                    "noam_decay"] or self.scheduler["linear_decay"][
-                        "start_point"] < 1:
-                logger.warning(
-                    "You are using slanted_triangle learning rate "
-                    "which will make warmup, noam_decay and linear_decay unable")
+        warmup_steps = int(max_train_steps * self.scheduler["warmup"])
+
+        # noam_decay (based on warmup)
+        if self.scheduler["noam_decay"]:
+            if warmup_steps > 0:
+                scheduled_lr = fluid.layers.learning_rate_scheduler \
+                    .noam_decay(1 / (warmup_steps * (self.learning_rate ** 2)),
+                                warmup_steps)
+            else:
+                logger.warning(
+                    "Noam decay learning rate scheduler should have positive "
+                    "warmup steps, using constant learning rate instead!")
+
+        # warmup, linear_decay
+        if warmup_steps > 0 or self.scheduler["linear_decay"]["start_point"] < 1:
+            with self.main_program._lr_schedule_guard():
+                global_step = lr_scheduler._decay_step_counter()
+                with control_flow.Switch() as switch:
+                    if warmup_steps > 0:
+                        with switch.case(global_step < warmup_steps):
+                            decayed_lr = self.learning_rate * global_step * 1.0 / warmup_steps
+                            fluid.layers.assign(decayed_lr, scheduled_lr)
+                    if self.scheduler["linear_decay"]["start_point"] < 1:
+                        linear_decay_start = int(
+                            max_train_steps *
+                            self.scheduler["linear_decay"]["start_point"])
+                        if linear_decay_start < warmup_steps:
+                            logger.warning(
+                                "linear decay can not start during warmup process, "
+                                "it will start after warmup ends!")
+                            linear_decay_start = warmup_steps
+                        with switch.case(global_step >= linear_decay_start):
+                            decayed_lr = lr_scheduler.polynomial_decay(
+                                learning_rate=self.learning_rate,
+                                decay_steps=max_train_steps,
+                                end_learning_rate=self.scheduler["linear_decay"]
+                                ["end_learning_rate"],
+                                power=1.0,
+                                cycle=False)
+                            fluid.layers.assign(decayed_lr, scheduled_lr)
+
+        # slanted_triangle
+        if self.scheduler["slanted_triangle"]["cut_fraction"]:
             cut_step = int(max_train_steps *
                            self.scheduler["slanted_triangle"]["cut_fraction"])
             ratio = self.scheduler["slanted_triangle"]["ratio"]
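
To see what the rewritten scheduler builds, here is a plain-Python rendering of the warmup plus linear decay curve, with assumed values (the real schedule runs as Paddle ops inside the Switch above):

lr, max_steps = 5e-5, 1000
warmup_steps = int(max_steps * 0.1)   # scheduler["warmup"] = 0.1
decay_start = int(max_steps * 0.5)    # linear_decay["start_point"] = 0.5
end_lr = 0.0                          # linear_decay["end_learning_rate"]

def lr_at(step):
    if step < warmup_steps:                     # linear warmup from 0 to lr
        return lr * step / warmup_steps
    if step >= decay_start:                     # polynomial_decay with power=1.0
        return (lr - end_lr) * (1.0 - step / max_steps) + end_lr
    return lr                                   # constant in between

print(lr_at(50), lr_at(300), lr_at(750))        # 2.5e-05 5e-05 1.25e-05

Note that polynomial_decay counts the global step against decay_steps=max_train_steps, so the decayed value at decay_start is already below the base rate rather than decaying from it.
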
@@ -346,9 +379,25 @@ class CombinedStrategy(DefaultStrategy):
                                (ratio - 1)) / ratio
                 fluid.layers.assign(decayed_lr, scheduled_lr)

+        # set optimizer
         super(CombinedStrategy, self).__init__(
             optimizer_name=self._optimizer_name, learning_rate=scheduled_lr)

+        # discriminative learning rate
+        # based on layer
+        if self.scheduler["discriminative"]["params_layer"]:
+            max_layer = max(
+                self.scheduler["discriminative"]["params_layer"].values())
+            for param in self.main_program.global_block().iter_parameters():
+                if param.name in self.scheduler["discriminative"]["params_layer"]:
+                    param_layer = self.scheduler["discriminative"][
+                        "params_layer"][param.name]
+                    param.optimize_attr["learning_rate"] *= pow(
+                        1.0 / self.scheduler["discriminative"]["factor"],
+                        max_layer - param_layer)
+
+        # based on blocks
         if self.scheduler["discriminative"]["blocks"]:
             _block_layers = math.ceil(
                 len(self.sorted_depth) /
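
The params_layer branch just added scales each parameter's learning rate by (1/factor) raised to its distance from the top layer, the ULMFiT-style discriminative schedule. A standalone sketch of the multipliers it produces, with hypothetical names:

factor = 2.6
params_layer = {"embedding.w_0": 0, "encoder_layer_0.w_0": 1, "cls_out.w_0": 2}
max_layer = max(params_layer.values())
for name, layer in sorted(params_layer.items(), key=lambda kv: kv[1]):
    multiplier = (1.0 / factor) ** (max_layer - layer)
    print(name, round(multiplier, 4))
# embedding.w_0 0.1479
# encoder_layer_0.w_0 0.3846
# cls_out.w_0 1.0
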
@@ -426,13 +475,12 @@ class CombinedStrategy(DefaultStrategy):
             pass

         if self.scheduler["discriminative"]["blocks"] > 0 or self.scheduler[
-                "gradual_unfreeze"] > 0:
+                "gradual_unfreeze"]["blocks"] > 0:
             self.depth_params_dict = get_depth_parameter(self.main_program)
             self.sorted_depth = sorted(
                 self.depth_params_dict.keys(), reverse=True)
             self.max_depth = len(self.sorted_depth)

-        logger.info(self.__str__())
         # handle scheduler
         scheduled_lr = self.scheduler_handler(max_train_steps)

@@ -442,6 +490,8 @@ class CombinedStrategy(DefaultStrategy):
         # handle regularization
         self.regularization_handler(loss, scheduled_lr)

+        logger.info(self.__str__())
+
         return scheduled_lr, max_train_steps

     def exclude_from_weight_decay(self, name):
@@ -454,25 +504,64 @@ class CombinedStrategy(DefaultStrategy):
         return False

     def step(self):
-        if self.scheduler["gradual_unfreeze"] > 0:
+        if self.scheduler["gradual_unfreeze"]["blocks"] > 0:
             self.epoch += 1
             if self.max_depth > 0 and self.epoch <= self.scheduler[
-                    "gradual_unfreeze"]:
+                    "gradual_unfreeze"]["blocks"]:
                 set_gradual_unfreeze(
                     self.main_program,
                     unfreeze_depths=self.
                     sorted_depth[:self.max_depth * self.epoch //
-                                 self.scheduler["gradual_unfreeze"]])
+                                 self.scheduler["gradual_unfreeze"]["blocks"]])
             else:
                 logger.warning(
                     "The max op-depth in the network is %s, so the gradual "
                     "unfreeze finetune strategy cannot be used." % (self.max_depth))
+        elif self.scheduler["gradual_unfreeze"]["params_layer"]:
+            max_layer = max(
+                self.scheduler["gradual_unfreeze"]["params_layer"].values())
+            if self.epoch <= max_layer:
+                for param in self.main_program.global_block().iter_parameters():
+                    if param.name in self.scheduler["gradual_unfreeze"][
+                            "params_layer"]:
+                        param_layer = self.scheduler["gradual_unfreeze"][
+                            "params_layer"][param.name]
+                        # freeze every layer that has not been reached yet
+                        param.stop_gradient = param_layer < max_layer - self.epoch
+            self.epoch += 1
         else:
             pass
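
Under the params_layer branch, epoch e (counting from 0) trains every parameter whose layer index is at least max_layer - e, so the network thaws from the output layers down, one layer per epoch. A sketch of the resulting order, with hypothetical names:

params_layer = {"embedding.w_0": 0, "encoder_layer_0.w_0": 1, "cls_out.w_0": 2}
max_layer = max(params_layer.values())
for epoch in range(max_layer + 1):
    trainable = [n for n, l in params_layer.items() if l >= max_layer - epoch]
    print(epoch, sorted(trainable))
# 0 ['cls_out.w_0']
# 1 ['cls_out.w_0', 'encoder_layer_0.w_0']
# 2 ['cls_out.w_0', 'embedding.w_0', 'encoder_layer_0.w_0']
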
     def __str__(self):
-        return "Strategy with scheduler: %s, regularization: %s and clip: %s" % (
-            self.scheduler, self.regularization, self.clip)
+        self.clip = {"GlobalNorm": 0.0, "Norm": 0.0}
+
+        strategy_name = ""
+        strategy_name += "warmup, " if self.scheduler["warmup"] else ""
+        strategy_name += "linear decay, " if self.scheduler["linear_decay"][
+            "start_point"] < 1 else ""
+        strategy_name += "noam decay, " if self.scheduler["noam_decay"] else ""
+        strategy_name += "discriminative learning rate, " if self.scheduler[
+            "discriminative"]["blocks"] or self.scheduler["discriminative"][
+                "params_layer"] else ""
+        strategy_name += "gradual unfreeze, " if self.scheduler[
+            "gradual_unfreeze"]["blocks"] or self.scheduler["gradual_unfreeze"][
+                "params_layer"] else ""
+        strategy_name += "slanted triangle learning rate, " if self.scheduler[
+            "slanted_triangle"]["cut_fraction"] else ""
+        strategy_name += "L2 regularization, " if self.regularization[
+            "L2"] else ""
+        strategy_name += "L2SP regularization, " if self.regularization[
+            "L2SP"] else ""
+        strategy_name += "weight decay regularization, " if self.regularization[
+            "weight_decay"] else ""
+        strategy_name += "GlobalNorm clip, " if self.clip["GlobalNorm"] else ""
+        strategy_name += "Norm clip, " if self.clip["Norm"] else ""
+        return "Strategy with %s" % strategy_name
class AdamWeightDecayStrategy(CombinedStrategy):
@@ -544,17 +633,22 @@ class ULMFiTStrategy(CombinedStrategy):
                  ratio=32,
                  dis_blocks=3,
                  factor=2.6,
-                 frz_blocks=3):
+                 frz_blocks=3,
+                 params_layer=None):
         scheduler = {
             "slanted_triangle": {
                 "cut_fraction": cut_fraction,
                 "ratio": ratio
             },
-            "gradual_unfreeze": frz_blocks,
+            "gradual_unfreeze": {
+                "blocks": frz_blocks,
+                "params_layer": params_layer
+            },
             "discriminative": {
                 "blocks": dis_blocks,
-                "factor": factor
+                "factor": factor,
+                "params_layer": params_layer
             }
         }
         regularization = {}
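
A hypothetical end-to-end use of the extended signature (the PaddleHub finetune API and the learning_rate/cut_fraction keyword names are assumed; the map keys must match parameter names in the compiled program):

import paddlehub as hub

strategy = hub.ULMFiTStrategy(
    learning_rate=1e-4,
    cut_fraction=0.1,
    ratio=32,
    dis_blocks=0,   # explicit map below would take precedence anyway
    frz_blocks=0,
    factor=2.6,
    # hypothetical parameter names for illustration only
    params_layer={"embedding.w_0": 0, "encoder_layer_0.w_0": 1, "cls_out.w_0": 2})
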