# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import multiprocessing

import paddle.fluid as fluid

from .optimization import adam_weight_decay_optimization


class DefaultStrategy(object):
    def __init__(self, learning_rate=1e-4, optimizer_name="adam"):
        self.learning_rate = learning_rate
        self._optimizer_name = optimizer_name
        self.optimizer = None

    def execute(self, loss):
        # build the optimizer named in the constructor and minimize the loss
        if self._optimizer_name.lower() == "adam":
            self.optimizer = fluid.optimizer.Adam(
                learning_rate=self.learning_rate)
        elif self._optimizer_name.lower() == "sgd":
            self.optimizer = fluid.optimizer.SGD(
                learning_rate=self.learning_rate)

        if self.optimizer is not None:
            self.optimizer.minimize(loss)
        else:
            raise ValueError("DefaultStrategy's optimizer is None")


class BERTFinetuneStrategy(DefaultStrategy):
    def __init__(self,
                 learning_rate=1e-4,
                 warmup_strategy="linear_warmup_decay",
                 warmup_proportion=0.0,
                 weight_decay=0.01,
                 optimizer_name=None):
        super().__init__(
            learning_rate=learning_rate, optimizer_name=optimizer_name)
        # check that the warmup strategy is one of the supported schedules
        if warmup_strategy not in ["linear_warmup_decay", "noam_decay"]:
            raise ValueError("warmup strategy {} is not set up "
                             "correctly".format(warmup_strategy))
        self._warmup_strategy = warmup_strategy
        self._warmup_proportion = warmup_proportion
        self._weight_decay = weight_decay

    @property
    def warmup_strategy(self):
        return self._warmup_strategy

    @property
    def warmup_proportion(self):
        return self._warmup_proportion

    @property
    def weight_decay(self):
        return self._weight_decay

    def execute(self, loss, main_program, data_reader, config):
        # calculate the total number of training steps and, from the warmup
        # proportion, the number of warmup steps
        dev_count = self._get_dev_count(config)
        num_train_examples = data_reader.get_num_examples(phase='train')
        max_train_steps = config.num_epoch * num_train_examples // config.batch_size // dev_count
        warmup_steps = int(max_train_steps * self.warmup_proportion)

        scheduled_lr = adam_weight_decay_optimization(
            loss, warmup_steps, max_train_steps, self.learning_rate,
            main_program, self.weight_decay, self.warmup_strategy)

        return scheduled_lr

    def _get_dev_count(self, config):
        # count available devices: GPUs when use_cuda is set, otherwise the
        # CPU_NUM environment variable (falling back to the number of cores)
        if config.use_cuda:
            dev_count = fluid.core.get_cuda_device_count()
        else:
            dev_count = int(
                os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
        return dev_count
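
# Usage sketch (illustrative only, not part of the library). It assumes a
# `config` object exposing `use_cuda`, `batch_size`, and `num_epoch`, a
# `data_reader` providing `get_num_examples(phase='train')`, and a `loss`
# variable built in `main_program`, matching the interfaces used above:
#
#     strategy = BERTFinetuneStrategy(
#         learning_rate=5e-5,
#         warmup_strategy="linear_warmup_decay",
#         warmup_proportion=0.1,
#         weight_decay=0.01)
#     scheduled_lr = strategy.execute(loss, main_program, data_reader, config)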