diff --git a/PaddleNLP/language_model/train.py b/PaddleNLP/language_model/train.py
index 9a5af4b7be9869d0e2a09d9959f82947279900b1..3cc04364d79e8c9f8b02fe028554054666fea32f 100644
--- a/PaddleNLP/language_model/train.py
+++ b/PaddleNLP/language_model/train.py
@@ -178,7 +178,7 @@ def main():
             print(args.init_from_pretrain_model)
             raise Warning("The pretrained params do not exist.")
             return
-        fluid.load(main_program, args.init_from_pretrain_model)
+        fluid.load(main_program, args.init_from_pretrain_model, exe)
         print("finish initing model from pretrained params from %s" %
               (args.init_from_pretrain_model))
 
diff --git a/PaddleNLP/shared_modules/models/language_model/lm_model.py b/PaddleNLP/shared_modules/models/language_model/lm_model.py
index c66b77b7dc6c37b32926fc697bb35e12a24d8850..decad759592e7867b1d85dc46049d8c42c092f78 100644
--- a/PaddleNLP/shared_modules/models/language_model/lm_model.py
+++ b/PaddleNLP/shared_modules/models/language_model/lm_model.py
@@ -241,8 +241,6 @@ def lm_model(hidden_size,
         name="init_cell",
         shape=[None, num_layers, hidden_size],
         dtype='float32')
-    init_cell.persistable = True
-    init_hidden.persistable = True
 
     init_hidden = layers.transpose(init_hidden, perm=[1, 0, 2])
     init_cell = layers.transpose(init_cell, perm=[1, 0, 2])
@@ -334,8 +332,6 @@ def lm_model(hidden_size,
     loss = layers.reduce_sum(loss)
     loss.persistable = True
 
-    last_cell.persistable = True
-    last_hidden.persistable = True
 
     # This will feed last_hidden, last_cell to init_hidden, init_cell, which
     # can be used directly in next batch. This can avoid the fetching of