diff --git a/demo1_config.yaml b/config_demo1.yaml
similarity index 90%
rename from demo1_config.yaml
rename to config_demo1.yaml
index 3033cc0b418e281ae7d92579167166c084eb8e30..3e26ff331be7bfee901198f0be48ae4888839664 100644
--- a/demo1_config.yaml
+++ b/config_demo1.yaml
@@ -12,9 +12,10 @@
 do_lower_case: True
 max_seq_len: 512
 batch_size: 5
-num_epochs: 2
+num_epochs: 3
 optimizer: "adam"
 learning_rate: 3e-5
 warmup_proportion: 0.1
 weight_decay: 0.1
+print_every_n_steps: 1
diff --git a/demo1.py b/demo1.py
index 505a6e0a68580cb888e077d164b7e9d370afe4cf..790b76c18b473e4d93ab879991f741bd043f20e9 100644
--- a/demo1.py
+++ b/demo1.py
@@ -1,10 +1,10 @@
 import paddlepalm as palm
 
 if __name__ == '__main__':
-    controller = palm.Controller('demo1_config.yaml', task_dir='demo1_tasks')
+    controller = palm.Controller('config_demo1.yaml', task_dir='demo1_tasks')
     controller.load_pretrain('pretrain_model/ernie/params')
     controller.train()
 
-    controller = palm.Controller(config='demo1_config.yaml', task_dir='demo1_tasks', for_train=False)
+    controller = palm.Controller(config='config_demo1.yaml', task_dir='demo1_tasks', for_train=False)
     controller.pred('mrqa', inference_model_dir='output_model/firstrun/infer_model')
diff --git a/demo2.py b/demo2.py
index d1c7b8762e2f384675fd63bee8d72bb8b8c353ed..c4f8675669cc8cd962722c48bc1573da2c62bc2d 100644
--- a/demo2.py
+++ b/demo2.py
@@ -3,8 +3,8 @@ import paddlepalm as palm
 if __name__ == '__main__':
     controller = palm.Controller('config_demo2.yaml', task_dir='demo2_tasks')
     controller.load_pretrain('pretrain_model/ernie/params')
-    controller.train()
+    # controller.train()
 
-    controller = palm.Controller(config='config_demo2.yaml', task_dir='demo2_tasks', for_train=False)
-    controller.pred('mrqa', inference_model_dir='output_model/secondrun/infer_model')
+    # controller = palm.Controller(config='config_demo2.yaml', task_dir='demo2_tasks', for_train=False)
+    # controller.pred('mrqa', inference_model_dir='output_model/secondrun/infer_model')
diff --git a/paddlepalm/backbone/ernie.py b/paddlepalm/backbone/ernie.py
index 4362334ab933ae16ed0917b90b3bb3444de82527..1e471537cf9485f533dbc4f048662f0a5bc30c60 100644
--- a/paddlepalm/backbone/ernie.py
+++ b/paddlepalm/backbone/ernie.py
@@ -76,7 +76,7 @@ class Model(backbone):
                 "sentence_embedding": [[-1, self._emb_size], 'float32'],
                 "sentence_pair_embedding": [[-1, self._emb_size], 'float32']}
 
-    def build(self, inputs):
+    def build(self, inputs, scope_name=""):
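+        # scope_name is prepended to every parameter name created below, so
+        # programs built with the same prefix (here both the train and the
+        # predict program) resolve to one shared set of backbone parameters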
 
         src_ids = inputs['token_ids']
         pos_ids = inputs['position_ids']
@@ -90,25 +90,25 @@ class Model(backbone):
             size=[self._voc_size, self._emb_size],
             dtype=self._emb_dtype,
             param_attr=fluid.ParamAttr(
-                name=self._word_emb_name, initializer=self._param_initializer),
+                name=scope_name+self._word_emb_name, initializer=self._param_initializer),
             is_sparse=False)
 
         # fluid.global_scope().find_var('backbone-word_embedding').get_tensor()
-        embedding_table = fluid.default_main_program().global_block().var(self._word_emb_name)
+        embedding_table = fluid.default_main_program().global_block().var(scope_name+self._word_emb_name)
 
         position_emb_out = fluid.layers.embedding(
             input=pos_ids,
             size=[self._max_position_seq_len, self._emb_size],
             dtype=self._emb_dtype,
             param_attr=fluid.ParamAttr(
-                name=self._pos_emb_name, initializer=self._param_initializer))
+                name=scope_name+self._pos_emb_name, initializer=self._param_initializer))
 
         sent_emb_out = fluid.layers.embedding(
             sent_ids,
             size=[self._sent_types, self._emb_size],
             dtype=self._emb_dtype,
             param_attr=fluid.ParamAttr(
-                name=self._sent_emb_name, initializer=self._param_initializer))
+                name=scope_name+self._sent_emb_name, initializer=self._param_initializer))
 
         emb_out = emb_out + position_emb_out
         emb_out = emb_out + sent_emb_out
@@ -118,13 +118,13 @@ class Model(backbone):
                 size=[self._task_types, self._emb_size],
                 dtype=self._emb_dtype,
                 param_attr=fluid.ParamAttr(
-                    name=self._task_emb_name,
+                    name=scope_name+self._task_emb_name,
                     initializer=self._param_initializer))
             emb_out = emb_out + task_emb_out
 
         emb_out = pre_process_layer(
-            emb_out, 'nd', self._prepostprocess_dropout, name='pre_encoder')
+            emb_out, 'nd', self._prepostprocess_dropout, name=scope_name+'pre_encoder')
 
         self_attn_mask = fluid.layers.matmul(
             x=input_mask, y=input_mask, transpose_y=True)
@@ -151,7 +151,7 @@ class Model(backbone):
             preprocess_cmd="",
             postprocess_cmd="dan",
             param_initializer=self._param_initializer,
-            name='encoder')
+            name=scope_name+'encoder')
 
         next_sent_feat = fluid.layers.slice(
@@ -162,8 +162,8 @@ class Model(backbone):
             size=self._emb_size,
             act="tanh",
             param_attr=fluid.ParamAttr(
-                name="pooled_fc.w_0", initializer=self._param_initializer),
-            bias_attr="pooled_fc.b_0")
+                name=scope_name+"pooled_fc.w_0", initializer=self._param_initializer),
+            bias_attr=scope_name+"pooled_fc.b_0")
 
         return {'embedding_table': embedding_table,
                 'word_embedding': emb_out,
diff --git a/paddlepalm/mtl_controller.py b/paddlepalm/mtl_controller.py
index 4086390f9bb98d9faf76f556b2ff49eafe2c546d..b4716d5850f1ff677a9fae1be0cac7c81d6213e3 100755
--- a/paddlepalm/mtl_controller.py
+++ b/paddlepalm/mtl_controller.py
@@ -430,23 +430,25 @@ class Controller(object):
 
         # build backbone and task layers
        # it crashes if no scope name is specified (a pitfall in the framework)
+        train_prog = fluid.default_main_program()
+        train_init_prog = fluid.default_startup_program()
+        # don't bother with unique_name.guard: it cannot affect the names set via param_attr
         with fluid.unique_name.guard("backbone-"):
-            bb_output_vars = train_backbone.build(net_inputs)
-            # bb_output_vars = train_backbone.build(net_inputs)
+            bb_output_vars = train_backbone.build(net_inputs, scope_name='__paddlepalm_')
         assert sorted(bb_output_vars.keys()) == sorted(train_backbone.outputs_attr.keys())
-
+        #for var in train_init_prog.blocks[0].vars:
+        #    print(var)
+        # this crashes
 
         # is a new program really needed here? yes, otherwise it breaks (a painful pitfall)
         pred_prog = fluid.Program()
         pred_init_prog = fluid.Program()
-        train_prog = fluid.default_main_program()
-        train_init_prog = fluid.default_startup_program()
-
         with fluid.program_guard(main_program = pred_prog, startup_program = pred_init_prog):
             pred_net_inputs = create_net_inputs(pred_input_attrs)
-            with fluid.unique_name.guard("backbone-"):
-                pred_bb_output_vars = pred_backbone.build(pred_net_inputs)
+            # don't bother with unique_name.guard: it cannot affect the names set via param_attr
+            # with fluid.unique_name.guard("backbone-"):
+            pred_bb_output_vars = pred_backbone.build(pred_net_inputs, scope_name='__paddlepalm_')
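+            # same '__paddlepalm_' prefix as the train program above, so the
+            # predict program maps onto the already-created backbone parameters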
 
         fluid.framework.switch_main_program(train_prog)
         fluid.framework.switch_startup_program(train_init_prog)
@@ -503,13 +505,13 @@ class Controller(object):
         num_examples = main_reader.num_examples
         for inst in instances:
-            max_train_steps = int(main_conf['num_epochs']* inst.mix_ratio * num_examples) // main_conf['batch_size'] // dev_count
+            max_train_steps = int(main_conf['num_epochs']* inst.mix_ratio * (num_examples // main_conf['batch_size'] // dev_count))
             if inst.is_target:
                 print('{}: expected train steps {}.'.format(inst.name, max_train_steps))
                 inst.steps_pur_epoch = inst.reader['train'].num_examples // main_conf['batch_size'] // dev_count
                 inst.expected_train_steps = max_train_steps
-        global_max_train_steps = int(main_conf['num_epochs'] * num_examples * sum(mrs)) // main_conf['batch_size'] // dev_count
+        global_max_train_steps = int(main_conf['num_epochs'] * sum(mrs) * (num_examples // main_conf['batch_size'] // dev_count))
         print('Estimated overall train steps {}.'.format(global_max_train_steps))
 
         if 'warmup_proportion' in main_conf and main_conf['warmup_proportion'] > 0:
diff --git a/paddlepalm/optimizer/adam.py b/paddlepalm/optimizer/adam.py
index 74a02463e96e3749537a58f3b7e411ec54f563cf..e08e45b145c23376cad262cf6bf012f0c5fd6ba0 100644
--- a/paddlepalm/optimizer/adam.py
+++ b/paddlepalm/optimizer/adam.py
@@ -90,11 +90,6 @@ def optimize(loss, config, max_train_steps=None, warmup_steps=0, train_program=N
 
     _, param_grads = optimizer.minimize(loss)
 
-    for block in fluid.default_main_program().blocks:
-        for var_name in block.vars:
-            if var_name.startswith("embedding"):
-                print(block.vars[var_name])
-
     if config.get('weight_decay', 0) > 0:
 
         for param, grad in param_grads:
diff --git a/paddlepalm/utils/saver.py b/paddlepalm/utils/saver.py
index ffc3f0da3c38d11a2aa770d8ad944509b6863a1d..9277f0d3ffd17e36cfd7ce3224aa88d18756f0bb 100644
--- a/paddlepalm/utils/saver.py
+++ b/paddlepalm/utils/saver.py
@@ -19,6 +19,8 @@
 import os
 import six
 import ast
 import copy
+import tarfile
+import shutil
 import numpy as np
 import paddle.fluid as fluid
@@ -48,18 +50,31 @@ def init_pretraining_params(exe,
     assert os.path.exists(pretraining_params_path
                           ), "[%s] cann't be found." % pretraining_params_path
+
+    assert os.path.exists(os.path.join(pretraining_params_path, '__palmmodel__')), "__palmmodel__ not found."
+    print("Loading pretraining parameters from {}...".format(
+        pretraining_params_path))
+
+    with tarfile.open(os.path.join(pretraining_params_path, '__palmmodel__'), 'r:') as f:
+        f.extractall(os.path.join(pretraining_params_path, '.temp'))
+
+    log_path = os.path.join(pretraining_params_path, '__palmmodel__')
+    pretraining_params_path = os.path.join(pretraining_params_path, '.temp')
+
     def existed_params(var):
         if not isinstance(var, fluid.framework.Parameter):
             return False
+        if not os.path.exists(os.path.join(pretraining_params_path, var.name)):
+            print('Warning: {} not found in {}.'.format(var.name, log_path))
         return os.path.exists(os.path.join(pretraining_params_path, var.name))
 
-    print("Load pretraining parameters from {}...\n".format(
-        pretraining_params_path))
-
     fluid.io.load_vars(
         exe,
         pretraining_params_path,
         main_program=main_program,
         predicate=existed_params)
+
+    shutil.rmtree(pretraining_params_path)
+    print('')
+
diff --git a/run_demo1.sh b/run_demo1.sh
index 3f3d8ecf634c9cfd8328132f75b6357c784bdc9a..a73cb1b974c0ef24308ca1299cb8f30eb0b82f90 100755
--- a/run_demo1.sh
+++ b/run_demo1.sh
@@ -1,4 +1,4 @@
-export CUDA_VISIBLE_DEVICES=0
+export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
 
 python demo1.py
diff --git a/run_demo2.sh b/run_demo2.sh
index a4c5471d827e6f2132ee9b996e064a48733b72bb..48850b1e4f06ce2fd88054c3e650bd431126fd1a 100755
--- a/run_demo2.sh
+++ b/run_demo2.sh
@@ -1,4 +1,4 @@
-export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+export CUDA_VISIBLE_DEVICES=0
 
 python demo2.py
diff --git a/script/convert_params.sh b/script/convert_params.sh
index 676f9d37f6cedaa51c6ea927a485caebe348ffab..e645d9ab9be0815ceb5423bc9c629184cca323f7 100755
--- a/script/convert_params.sh
+++ b/script/convert_params.sh
@@ -5,13 +5,32 @@ if [[ $# != 1 ]]; then
     exit 1
 fi
 
+if [[ -f $1/__palminfo__ ]]; then
+    echo "already converted."
+    exit 0
+fi
+
 echo "converting..."
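+# keep a copy of the original parameter files under .palm.backup, then repack the
+# renamed "__paddlepalm_"-prefixed files into a single __palmmodel__ archive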
-cd $1
+if [[ -d $1/params ]]; then
+    cd $1/params
+else
+    cd $1
+fi
+
 mkdir .palm.backup
 for file in $(ls *)
-    do cp $file "backbone-"$file; mv $file .palm.backup
+    do cp $file .palm.backup; mv $file "__paddlepalm_"$file
 done
+tar -cf __rawmodel__ .palm.backup/*
+rm .palm.backup/*
+mv __rawmodel__ .palm.backup
+# find . ! -name '__rawmodel__' -exec rm {} +
+tar -cf __palmmodel__ __paddlepalm_*
+touch __palminfo__
+ls __paddlepalm_* > __palminfo__
+rm __paddlepalm_*
+
 cd - >/dev/null
 echo "done!"
diff --git a/script/recover_params.sh b/script/recover_params.sh
index 33a836fb569863e28eff832be300c90a0b13a18e..a99ceb500f5e9f70dfa8660d8c308ec1f0841c5b 100755
--- a/script/recover_params.sh
+++ b/script/recover_params.sh
@@ -5,7 +5,29 @@ if [[ $# != 1 ]]; then
     exit 1
 fi
 
-rm $1/backbone-*
-mv $1/.palm.backup/* $1
-rm -rf $1/.palm.backup
+if [[ ! -d $1 ]]; then
+    echo "$1 not found."
+    exit 1
+fi
+
+if [[ ! -f $1/__palmmodel__ ]]; then
+    echo "paddlepalm model not found."
+    exit 1
+fi
+
+echo "recovering..."
+if [[ -d $1/params ]]; then
+    cd $1/params
+else
+    cd $1
+fi
+rm __palm*
+mv .palm.backup/__rawmodel__ .
+rm -rf .palm.backup
+tar -xf __rawmodel__
+mv .palm.backup/* .
+rm __rawmodel__
+
+rm -rf .palm.backup
+cd - >/dev/null