diff --git a/README.md b/README.md
index 84b2fbe6ffe220cf68b8aa22a471f30c0b257f07..e57aafa43f55b61b18a9ca059d88cf1cf42699eb 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
 # PaddlePALM
 
-PaddlePALM (Paddle for Multi-task) is a powerful, fast, flexible, and easy-to-use framework for large-scale ***task-level*** multi-task learning. The framework implements a clean, understandable mechanism for creating and managing task instances, powerful and easy-to-use interfaces for parameter management and reuse, and an efficient, stable multi-task scheduling algorithm. With PaddlePALM, users can easily perform complex multi-task learning and parameter reuse, seamlessly combine the three training modes of **single-task training**, **multi-task auxiliary training**, and **multi-objective joint training** with flexible saving and prediction mechanisms, and launch high-performance single-machine, single-GPU, or distributed training and inference "in one click" with only a tiny amount of code .
+PaddlePALM (PAddle for Learning with Multi-tasks) is a powerful, general-purpose, richly pre-packaged, flexible, and easy-to-use framework for ***large-scale, task-level*** NLP model training and inference. The framework implements a clean, understandable mechanism for creating and managing task instances, powerful and easy-to-use interfaces for parameter management and reuse, and an efficient, stable multi-task scheduling algorithm. With just a few lines of code, users can run **supervised learning**, **self-supervised learning (pre-training)**, **transfer learning**, and **multi-task learning** over all kinds of NLP tasks (classification, matching, sequence labeling, machine reading comprehension, and so on). The multi-task mode in turn seamlessly integrates "main-auxiliary multi-task learning" and "joint multi-task learning". On top of this, the framework implements an automatic multi-GPU mode, so large-scale multi-GPU training and inference require no user code at all.
 
 The framework ships with a rich set of [backbone networks](#附录b内置主干网络backbone) and their [pre-trained models](#预训练模型) (BERT, ERNIE, etc.), common [task paradigms](#附录c内置任务范式paradigm) (classification, matching, machine reading comprehension, etc.), and corresponding [dataset loading and processing tools](#附录a内置数据集载入与处理工具reader). It also exposes customization interfaces: if the built-in tools, backbones, or task paradigms do not meet your needs, you can easily implement your own components. All components are zero-coupled by design, so a custom component integrates with the framework as soon as the component itself is implemented.
 
@@ -117,6 +117,23 @@
 python download_models.py -l
 ```
 
+Running the command above prints something like the following:
+
+```
+Available pretrain items:
+  => roberta-cn-base
+  => roberta-cn-large
+  => bert-cn-base
+  => bert-cn-large
+  => bert-en-uncased-base
+  => bert-en-uncased-large
+  => bert-en-cased-base
+  => bert-en-cased-large
+  => ernie-en-uncased-base
+  => ernie-en-uncased-large
+  ...
+```
+
 You can download the pre-trained model you need by running `python download_models.py -d <model_name>`. For example, to download the pre-trained BERT model (uncased large):
 
 ```shell
@@ -741,7 +758,7 @@ BERT takes the following input objects:
 ```yaml
 token_ids: a matrix of shape [batch_size, seq_len]; each row is one sample, and each element is the vocabulary id of the corresponding token in the text.
 position_ids: a matrix of shape [batch_size, seq_len]; each row is one sample, and each element is the position id of the corresponding token in the text.
-segment_ids: a 0/1 matrix of shape [batch_size, seq_len] that supports the inputs of models such as BERT and ERNIE; an element of 0 means the token belongs to text1 of a classification or matching task, and 1 means it belongs to text2 of a matching task . 
+segment_ids: a 0/1 matrix of shape [batch_size, seq_len] that supports the inputs of models such as BERT and ERNIE; an element of 0 means the token belongs to text1 of a classification or matching task, and 1 means it belongs to text2 of a matching task.
 input_mask: a matrix of shape [batch_size, seq_len]; each element is 0 or 1, marking whether the position is padding (1 for a real token, 0 for a padded one).
 ```
@@ -781,6 +798,7 @@ sentence_pair_embedding: a matrix of shape [batch_size, hidden_size], float
 
 ## Appendix C: Built-in task paradigms (paradigm)
 
+
 #### Classification paradigm: cls
 
 The classification paradigm has the following additional configuration fields:
@@ -788,6 +806,7 @@
 ```yaml
 n_classes(REQUIRED): int. The number of classes of the classification task.
 pred_output_path (OPTIONAL) : str. Path for saving prediction outputs; when left empty, outputs are saved to the task directory under the path given by the `save_path` field of the global config file.
+save_infermodel_every_n_steps (OPTIONAL) : int. Interval (in steps) at which the inference model is saved periodically; when unset or set to -1, the inference model is saved only once, when training of this task finishes. Defaults to -1.
 ```
 
 The classification paradigm takes the following input objects:
@@ -812,6 +831,7 @@ sentence_embedding: a matrix of shape [batch_size, hidden_size], float32
 ```yaml
 pred_output_path (OPTIONAL) : str. Path for saving prediction outputs; when left empty, outputs are saved to the task directory under the path given by the `save_path` field of the global config file.
+save_infermodel_every_n_steps (OPTIONAL) : int. Interval (in steps) at which the inference model is saved periodically; when unset or set to -1, the inference model is saved only once, when training of this task finishes. Defaults to -1.
 ```
 
 The matching paradigm takes the following input objects:
 
@@ -838,6 +858,7 @@ sentence_pair_embedding: a matrix of shape [batch_size, hidden_size], float
 max_answer_len(REQUIRED): int. Maximum answer length for prediction.
 n_best_size (OPTIONAL) : int, defaults to 20. The number of n-best answers kept per sample in the n-best answer file produced during prediction.
 pred_output_path (OPTIONAL) : str. Path for saving prediction outputs; when left empty, outputs are saved to the task directory under the path given by the `save_path` field of the global config file.
+save_infermodel_every_n_steps (OPTIONAL) : int. Interval (in steps) at which the inference model is saved periodically; when unset or set to -1, the inference model is saved only once, when training of this task finishes. Defaults to -1.
 ```
 
 The machine reading comprehension paradigm takes the following input objects:
@@ -885,7 +906,8 @@ do_lower_case (OPTIONAL): bool. Case-folding flag. Defaults to False, i.e.
 for_cn: bool. Chinese-mode flag. Defaults to False, i.e. inputs are assumed to be English; when set to True, tokenization, post-processing, and so on are performed for Chinese.
 
 print_every_n_steps (OPTIONAL): int. Defaults to 5. Frequency (in steps) of printing logs during training.
-save_every_n_steps (OPTIONAL): int. Defaults to -1. Frequency of saving checkpoint models during training; by default no checkpoints are saved.
+save_ckpt_every_n_steps (OPTIONAL): int. Defaults to -1. Frequency of saving full-graph checkpoints during training; with the default of -1, a checkpoint is saved automatically only at the last step.
+save_infermodel_every_n_steps (OPTIONAL) : int. Interval (in steps) at which the inference model is saved periodically; when unset or set to -1, the inference model is saved only once, when training of this task finishes. Defaults to -1.
 
 optimizer(REQUIRED): str. Optimizer name; the framework currently supports only adam, and more optimizers will be supported in the future.
 learning_rate(REQUIRED): str. Learning rate used during training.
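The README's "few lines of code" claim maps onto the `Controller` class that this diff modifies in `paddlepalm/mtl_controller.py`. As a minimal sketch of that flow (hedged: it assumes the package-level `Controller` entry point of the 0.2.x API and a config file like `demo/demo2/config.yaml` from this diff; method names should be checked against the installed version):

```python
# Minimal training sketch, assuming the 0.2.x Controller API; the config file
# and task_dir layout follow the demo referenced in this diff.
import paddlepalm as palm

if __name__ == '__main__':
    controller = palm.Controller('config.yaml', task_dir='tasks')  # global + per-task configs
    controller.load_pretrain('pretrain_model/bert/params')         # downloaded backbone weights
    controller.train()                                             # runs the multi-task loop
```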
diff --git a/demo/demo2/config.yaml b/demo/demo2/config.yaml
index 1cc55777a39c747f98dd4e89fcaae4af587592d5..fe00c1a7339b1be418e68fcd836b3d657eb8c708 100644
--- a/demo/demo2/config.yaml
+++ b/demo/demo2/config.yaml
@@ -12,6 +12,8 @@ do_lower_case: True
 max_seq_len: 512
 batch_size: 4
 
+save_ckpt_every_n_steps: 5
+save_infermodel_every_n_steps: 5
 num_epochs: 2
 optimizer: "adam"
 learning_rate: 3e-5
diff --git a/paddlepalm/README.md b/paddlepalm/README.md
deleted file mode 100644
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000
diff --git a/paddlepalm/_downloader.py b/paddlepalm/_downloader.py
index 1b8de4b629a491148e43b71f96cb70c0542d15d4..ae196696b3c3c192ad74760679dc1a10210fa6b0 100644
--- a/paddlepalm/_downloader.py
+++ b/paddlepalm/_downloader.py
@@ -34,6 +34,8 @@ _items = {
     'pretrain': {'ernie-en-uncased-large': 'https://ernie.bj.bcebos.com/ERNIE_Large_en_stable-2.0.0.tar.gz',
                  'bert-en-uncased-large': 'https://bert-models.bj.bcebos.com/uncased_L-24_H-1024_A-16.tar.gz',
                  'bert-en-uncased-base': 'https://bert-models.bj.bcebos.com/uncased_L-12_H-768_A-12.tar.gz',
+                 'roberta-cn-base': 'https://bert-models.bj.bcebos.com/chinese_roberta_wwm_ext_L-12_H-768_A-12.tar.gz',
+                 'roberta-cn-large': 'https://bert-models.bj.bcebos.com/chinese_roberta_wwm_large_ext_L-24_H-1024_A-16.tar.gz',
                  'utils': None},
     'reader': {'utils': None},
     'backbone': {'utils': None},
@@ -91,13 +93,14 @@ def _download(item, scope, path, silent=False):
         tar.extractall(path = data_dir)
         tar.close()
         os.remove(filename)
-        if scope.startswith('bert'):
-            source_path = data_dir + '/' + data_name.split('.')[0]
-            fileList = os.listdir(source_path)
-            for file in fileList:
-                filePath = os.path.join(source_path, file)
+
+        if len(os.listdir(data_dir)) == 1 and os.path.isdir(os.path.join(data_dir, os.listdir(data_dir)[0])):
+            temp = os.path.join(data_dir, os.listdir(data_dir)[0])
+            for file in os.listdir(temp):
+                filePath = os.path.join(temp, file)
                 shutil.move(filePath, data_dir)
-            os.removedirs(source_path)
+            os.removedirs(temp)
+
         if not silent:
             print ('done!')
     if not silent:
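The `_download` change above replaces a BERT-specific post-extraction step with a generic rule: if the archive unpacks into a single wrapper directory, hoist its contents one level up. A self-contained sketch of that rule (hypothetical helper name; the calls mirror the diff):

```python
# Hoist the contents of a single wrapper directory produced by tar extraction,
# so every pretrain item ends up with the same flat on-disk layout.
import os
import shutil

def flatten_single_subdir(data_dir):
    entries = os.listdir(data_dir)
    if len(entries) == 1 and os.path.isdir(os.path.join(data_dir, entries[0])):
        wrapper = os.path.join(data_dir, entries[0])
        for name in os.listdir(wrapper):
            shutil.move(os.path.join(wrapper, name), data_dir)
        os.removedirs(wrapper)  # delete the now-empty wrapper directory
```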
diff --git a/paddlepalm/mtl_controller.py b/paddlepalm/mtl_controller.py
index 98f8a97724b696d26db6ad9e2760664ddb889f84..5193d3423f13e5a819214e6e52e6b5e2033fd10d 100755
--- a/paddlepalm/mtl_controller.py
+++ b/paddlepalm/mtl_controller.py
@@ -35,6 +35,9 @@ from paddlepalm.utils.reader_helper import create_net_inputs, create_iterator_fn
 from paddlepalm.default_settings import *
 from task_instance import TaskInstance, check_instances
 
+import Queue
+from threading import Thread
+
 DEBUG=False
 VERBOSE=0
 
@@ -338,6 +341,7 @@ class Controller(object):
         main_conf = main_inst.config
         if not os.path.exists(main_conf['save_path']):
             os.makedirs(main_conf['save_path'])
+            os.makedirs(os.path.join(main_conf['save_path'], 'ckpt'))
 
         # prepare backbone
         train_backbone = Backbone(bb_conf, phase='train')
@@ -399,12 +403,15 @@ class Controller(object):
             prefixes.append(inst.name)
             mrs.append(inst.mix_ratio)
 
-        joint_iterator_fn = create_joint_iterator_fn(iterators, prefixes, joint_shape_and_dtypes, mrs, name_to_position, dev_count=dev_count, verbose=VERBOSE)
+        joint_iterator_fn = create_joint_iterator_fn(iterators, prefixes, joint_shape_and_dtypes, mrs, name_to_position, dev_count=dev_count, verbose=VERBOSE, return_type='dict')
+        self._joint_iterator_fn = joint_iterator_fn
 
         input_attrs = [[i, j, k] for i, (j,k) in zip(joint_input_names, joint_shape_and_dtypes)]
         pred_input_attrs = [[i, j, k] for i, (j,k) in zip(pred_joint_input_names, pred_joint_shape_and_dtypes)]
-        net_inputs = create_net_inputs(input_attrs, async=True, iterator_fn=joint_iterator_fn, dev_count=dev_count, n_prefetch=3)
-
+        # net_inputs = create_net_inputs(input_attrs, async=True, iterator_fn=joint_iterator_fn, dev_count=dev_count, n_prefetch=3)
+        net_inputs = create_net_inputs(input_attrs, async=False)
+        self._net_inputs = net_inputs
+
         # build backbone and task layers
         train_prog = fluid.default_main_program()
         train_init_prog = fluid.default_startup_program()
@@ -568,6 +575,18 @@ class Controller(object):
                     return False
             return True
 
+        def pack_multicard_feed(iterator, net_inputs, dev_count):
+            ret = []
+            mask = []
+            for i in range(dev_count):
+                temp = {}
+                content, flag = next(iterator)
+                for q, var in net_inputs.items():
+                    temp[var.name] = content[q]
+                ret.append(temp)
+                mask.append(1 if flag else 0)
+            return ret, mask
+
         # do training
         fetch_names, fetch_list = zip(*fetches.items())
 
@@ -576,8 +595,50 @@ class Controller(object):
         epoch = 0
         time_begin = time.time()
         backbone_buffer = []
+
+        def multi_dev_reader(reader, dev_count):
+            def worker(reader, dev_count, queue):
+                dev_batches = []
+                for index, data in enumerate(reader()):
+                    if len(dev_batches) < dev_count:
+                        dev_batches.append(data)
+                    if len(dev_batches) == dev_count:
+                        queue.put((dev_batches, 0))
+                        dev_batches = []
+                # If the last group cannot fill every device, pad it with
+                # copies of the final batch; padded entries are flagged so
+                # their outputs can be dropped downstream.
+                if len(dev_batches) > 0:
+                    num_pad = dev_count - len(dev_batches)
+                    for i in range(len(dev_batches), dev_count):
+                        dev_batches.append(dev_batches[-1])
+                    queue.put((dev_batches, num_pad))
+                queue.put(None)
+
+            queue = Queue.Queue(dev_count*2)
+            p = Thread(
+                target=worker, args=(reader, dev_count, queue))
+            p.daemon = True
+            p.start()
+            while True:
+                ret = queue.get()
+                if ret is not None:
+                    batches, num_pad = ret
+                    queue.task_done()
+                    for batch in batches:
+                        flag = num_pad == 0
+                        if num_pad > 0:
+                            num_pad -= 1
+                        yield batch, flag
+                else:
+                    break
+            queue.join()
+
+        joint_iterator = multi_dev_reader(self._joint_iterator_fn, self.dev_count)
+
         while not train_finish():
-            rt_outputs = self.exe.run(train_program, fetch_list=fetch_list)
+            feed, mask = pack_multicard_feed(joint_iterator, self._net_inputs, self.dev_count)
+            rt_outputs = self.exe.run(train_program, feed=feed, fetch_list=fetch_list)
             rt_outputs = {k:v for k,v in zip(fetch_names, rt_outputs)}
             rt_task_id = np.squeeze(rt_outputs['__task_id']).tolist()
             rt_task_id = rt_task_id[0] if isinstance(rt_task_id, list) else rt_task_id
@@ -592,8 +653,9 @@ class Controller(object):
             global_step += 1
             cur_task.cur_train_step += 1
 
-            if cur_task.save_infermodel_every_n_steps > 0 and cur_task.cur_train_step % cur_task.save_infermodel_every_n_steps == 0:
-                cur_task.save(suffix='.step'+str(cur_task.cur_train_step))
+            cur_task_global_step = cur_task.cur_train_step + cur_task.cur_train_epoch * cur_task.steps_pur_epoch
+            if cur_task.is_target and cur_task.save_infermodel_every_n_steps > 0 and cur_task_global_step % cur_task.save_infermodel_every_n_steps == 0:
+                cur_task.save(suffix='.step'+str(cur_task_global_step))
 
             if global_step % main_conf.get('print_every_n_steps', 5) == 0:
                 loss = rt_outputs[cur_task.name+'/loss']
@@ -611,10 +673,16 @@ class Controller(object):
                     print(cur_task.name+': train finished!')
                     cur_task.save()
 
-            if 'save_every_n_steps' in main_conf and global_step % main_conf['save_every_n_steps'] == 0:
-                save_path = os.path.join(main_conf['save_path'],
+            if 'save_ckpt_every_n_steps' in main_conf and global_step % main_conf['save_ckpt_every_n_steps'] == 0:
+                save_path = os.path.join(main_conf['save_path'], 'ckpt',
                                          "step_" + str(global_step))
                 fluid.io.save_persistables(self.exe, save_path, saver_program)
+                print('checkpoint has been saved at '+save_path)
+
+        save_path = os.path.join(main_conf['save_path'], 'ckpt',
+                                 "step_" + str(global_step))
+        fluid.io.save_persistables(self.exe, save_path, saver_program)
+        print('checkpoint has been saved at '+save_path)
 
         print("ALL tasks train finished, exiting...")
 
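The feed path above changes from an async double-buffered reader to an explicit per-device feed list: `multi_dev_reader` groups batches by device and pads the trailing group, and `pack_multicard_feed` turns each group into `feed` dicts for `exe.run`. The padding contract is easiest to see in a toy sketch (simplified: it flags the trailing padded copies directly rather than reproducing the counter juggling in the diff):

```python
# Toy version of the device-grouping contract: yield (batch, is_real) pairs,
# padding the final partial group with copies of its last batch so the total
# is always a multiple of dev_count; padded copies are flagged False.
def multi_dev_batches(batches, dev_count):
    group = []
    for data in batches:
        group.append(data)
        if len(group) == dev_count:
            for b in group:
                yield b, True              # full group: all real
            group = []
    if group:                              # trailing partial group
        num_pad = dev_count - len(group)
        group += [group[-1]] * num_pad
        for i, b in enumerate(group):
            yield b, i < dev_count - num_pad

for batch, is_real in multi_dev_batches(['b0', 'b1', 'b2'], dev_count=2):
    print(batch, is_real)  # b0 True / b1 True / b2 True / b2 False
```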
diff --git a/paddlepalm/utils/reader_helper.py b/paddlepalm/utils/reader_helper.py
index b7878c968bf5cc58433e5a6c98422c3fa853b0cd..c03e5fec04a62659f2252569c26ce27d0f01d63a 100644
--- a/paddlepalm/utils/reader_helper.py
+++ b/paddlepalm/utils/reader_helper.py
@@ -106,11 +106,13 @@ def create_iterator_fn(iterator, iterator_prefix, shape_and_dtypes, outname_to_p
     return iterator
 
 
-def create_joint_iterator_fn(iterators, iterator_prefixes, joint_shape_and_dtypes, mrs, outname_to_pos, dev_count=1, keep_one_task=True, verbose=0):
+def create_joint_iterator_fn(iterators, iterator_prefixes, joint_shape_and_dtypes, mrs, outname_to_pos, dev_count=1, keep_one_task=True, verbose=0, return_type='list'):
     """
       joint_shape_and_dtypes: in essence derived from the attrs of the backbone and the paradigms, with the -1
      (variable) dimensions filled in automatically from the reader's attrs; validating it against the iterator therefore checks batch correctness at runtime.
     """
+    pos_to_outname = {j:i for i,j in outname_to_pos.items()}
+
     task_ids = range(len(iterators))
     weights = [mr / float(sum(mrs)) for mr in mrs]
     if not keep_one_task:
@@ -203,7 +205,13 @@ def create_joint_iterator_fn(iterators, iterator_prefixes, joint_shape_and_dtype
                 print(np.shape(i))
             print('')
             v -= 1
-        yield results
+        if return_type == 'list':
+            yield results
+        elif return_type == 'dict':
+            temp = {}
+            for pos, i in enumerate(results):
+                temp[pos_to_outname[pos]] = i
+            yield temp
 
     return iterator
 
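`return_type='dict'` exists so the new synchronous feed path in `mtl_controller.py` can look inputs up by name rather than by position. A minimal sketch of the re-keying with toy names (the real map is built from the backbone and paradigm attrs):

```python
# Re-key a positional batch into a name-addressed dict by inverting the
# outname -> position map, exactly as the diff does with pos_to_outname.
outname_to_pos = {'token_ids': 0, 'position_ids': 1, 'segment_ids': 2}
pos_to_outname = {pos: name for name, pos in outname_to_pos.items()}

def rekey(results, return_type='list'):
    if return_type == 'list':
        return results                      # legacy positional output
    return {pos_to_outname[pos]: arr for pos, arr in enumerate(results)}

batch = rekey(['tok_arr', 'pos_arr', 'seg_arr'], return_type='dict')
print(batch['token_ids'])  # 'tok_arr', formerly results[0]
```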
diff --git a/script/convert_params.sh b/script/convert_params.sh
new file mode 100755
index 0000000000000000000000000000000000000000..e645d9ab9be0815ceb5423bc9c629184cca323f7
--- /dev/null
+++ b/script/convert_params.sh
@@ -0,0 +1,36 @@
+#!/bin/bash
+if [[ $# != 1 ]]; then
+    echo "usage: bash convert_params.sh <params_dir>"
+    exit 1
+fi
+
+if [[ -f $1/__palminfo__ ]]; then
+    echo "already converted."
+    exit 0
+fi
+
+echo "converting..."
+if [[ -d $1/params ]]; then
+    cd $1/params
+else
+    cd $1
+fi
+
+mkdir .palm.backup
+
+for file in $(ls *)
+    do cp $file .palm.backup; mv $file "__paddlepalm_"$file
+done
+tar -cf __rawmodel__ .palm.backup/*
+rm .palm.backup/*
+mv __rawmodel__ .palm.backup
+# find . ! -name '__rawmodel__' -exec rm {} +
+tar -cf __palmmodel__ __paddlepalm_*
+touch __palminfo__
+ls __paddlepalm_* > __palminfo__
+rm __paddlepalm_*
+
+cd - >/dev/null
+
+echo "done!"
+
diff --git a/script/download_pretrain_backbone.sh b/script/download_pretrain_backbone.sh
new file mode 100755
index 0000000000000000000000000000000000000000..bc64a428801cf08e3f184ad50955dd706187341e
--- /dev/null
+++ b/script/download_pretrain_backbone.sh
@@ -0,0 +1,43 @@
+#!/bin/bash
+
+set -e
+
+if [[ $# != 1 ]]; then
+    echo "Usage: bash download_pretrain_backbone.sh <bert|ernie>"
+    exit 1
+fi
+
+if [[ $1 == 'bert' ]]; then
+    name="bert"
+    link="https://bert-models.bj.bcebos.com/uncased_L-24_H-1024_A-16.tar.gz"
+    packname="uncased_L-24_H-1024_A-16.tar.gz"
+    dirname="uncased_L-24_H-1024_A-16"
+elif [[ $1 == 'ernie' ]]; then
+    name="ernie"
+    link="https://ernie.bj.bcebos.com/ERNIE_Large_en_stable-2.0.0.tar.gz"
+    packname="ERNIE_Large_en_stable-2.0.0.tar.gz"
+    dirname=""
+else
+    echo "$1 is currently not supported."
+    exit 1
+fi
+
+if [[ ! -d pretrain_model ]]; then
+    mkdir pretrain_model
+fi
+
+cd pretrain_model
+mkdir $name
+cd $name
+echo "downloading ${name}..."
+wget --no-check-certificate $link
+echo "decompressing..."
+tar -zxf $packname
+rm -rf $packname
+if [[ $dirname != "" ]]; then
+    mv $dirname/* .
+    rm -rf $dirname
+fi
+
+cd ../..
+
diff --git a/script/recover_params.sh b/script/recover_params.sh
new file mode 100755
index 0000000000000000000000000000000000000000..a99ceb500f5e9f70dfa8660d8c308ec1f0841c5b
--- /dev/null
+++ b/script/recover_params.sh
@@ -0,0 +1,32 @@
+#!/bin/bash
+if [[ $# != 1 ]]; then
+    echo "usage: bash recover_params.sh <params_dir>"
+    exit 1
+fi
+
+if [[ ! -d $1 ]]; then
+    echo "$1 not found."
+    exit 1
+fi
+
+if [[ ! -f $1/__palmmodel__ ]]; then
+    echo "paddlepalm model not found."
+    exit 1
+fi
+
+echo "recovering..."
+if [[ -d $1/params ]]; then
+    cd $1/params
+else
+    cd $1
+fi
+
+rm __palm*
+mv .palm.backup/__rawmodel__ .
+rm -rf .palm.backup
+tar -xf __rawmodel__
+mv .palm.backup/* .
+rm __rawmodel__
+
+rm -rf .palm.backup
+cd - >/dev/null
+
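The two conversion scripts define an on-disk contract: raw Paddle param files are backed up under `.palm.backup`, renamed with a `__paddlepalm_` prefix, archived into `__palmmodel__`, and listed in a `__palminfo__` manifest; `recover_params.sh` undoes all of this. A simplified Python sketch of the forward direction (hypothetical helper; the shell script additionally bundles the backup into `__rawmodel__` and deletes the renamed files after archiving):

```python
# Simplified sketch of script/convert_params.sh: back up raw param files,
# rename them with the "__paddlepalm_" prefix, archive the renamed files
# into __palmmodel__, and write their names to a __palminfo__ manifest.
import os
import shutil
import tarfile

def convert_params(params_dir):
    backup = os.path.join(params_dir, '.palm.backup')
    os.mkdir(backup)
    renamed = []
    for fname in sorted(os.listdir(params_dir)):
        src = os.path.join(params_dir, fname)
        if not os.path.isfile(src) or fname.startswith(('.', '__')):
            continue
        shutil.copy(src, backup)                      # kept for recover_params.sh
        new = '__paddlepalm_' + fname
        os.rename(src, os.path.join(params_dir, new))
        renamed.append(new)
    with tarfile.open(os.path.join(params_dir, '__palmmodel__'), 'w') as tar:
        for name in renamed:
            tar.add(os.path.join(params_dir, name), arcname=name)
    with open(os.path.join(params_dir, '__palminfo__'), 'w') as f:
        f.write('\n'.join(renamed) + '\n')
```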
diff --git a/setup.py b/setup.py
index bfeb6be5427166b2f2468af4f76457e9a11f7497..6c81d9e193a37e2cc2a480d841a463f3e5c294ef 100644
--- a/setup.py
+++ b/setup.py
@@ -18,7 +18,7 @@
 """
 Setup script.
 Authors: zhouxiangyang(zhouxiangyang@baidu.com)
-Date: 2019/09/29 21:00:01
+Date: 2019/12/05 13:24:01
 """
 import setuptools
 from io import open
@@ -27,10 +27,10 @@ with open("README.md", "r", encoding='utf-8') as fh:
 
 setuptools.setup(
     name="paddlepalm",
-    version="0.2.1",
+    version="0.2.2",
     author="PaddlePaddle",
     author_email="zhangyiming04@baidu.com",
-    description="A Multi-task Learning Lib for PaddlePaddle Users.",
+    description="A Lib for PaddlePaddle Users.",
     # long_description=long_description,
     # long_description_content_type="text/markdown",
     url="https://github.com/PaddlePaddle/PALM",
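After installing the bumped package (e.g. via `python setup.py install`), the version can be verified from setuptools metadata, which works whether or not the package exports `__version__` itself:

```python
# Sanity-check the installed version against the bump in this diff.
import pkg_resources
print(pkg_resources.get_distribution('paddlepalm').version)  # expected: 0.2.2
```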