Commit 207c1969 authored by Li Fuchen, committed by Yibing Liu

fix language_model: enable running multi-GPU tasks (#3861)

* fix language_model to enable running multi-GPU tasks

* fix a bug in model saving
Parent fa449c72
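Taken together, the diff below makes three coordinated changes: the recurrent-state feeds (init_hidden/init_cell) move to a batch-major layout with a variable (None) batch dimension, evidently so the multi-GPU executor can split every feed tensor along dimension 0; the reader batch size is scaled by device_count so each GPU still sees config.batch_size rows; and save directories are created with mkpath before fluid.save writes into them.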
@@ -6,7 +6,7 @@ function run_train() {
     python train.py \
         --data_path data/simple-examples/data/ \
         --model_type small \
-        --use_gpu True
+        --use_gpu True \
         #--init_from_pretrain_model models/0/params
 }
...
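A note on this shell fix: with the trailing backslash, the commented #--init_from_pretrain_model line is joined onto the same python command, where # still opens a comment, so the flag can later be re-enabled by deleting just the #; without the backslash, uncommenting it would leave the flag stranded on its own line as a separate command.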
@@ -22,7 +22,7 @@ import os
 import random
 import math
 import contextlib
-
+from distutils.dir_util import mkpath
 import paddle
 import paddle.fluid as fluid
 import paddle.fluid.framework as framework
@@ -111,6 +111,9 @@ def main():
     config = RNNConfig(args)
 
+    if not os.path.exists(args.save_model_dir):
+        mkpath(args.save_model_dir)
+
     # define train program
     main_program = fluid.Program()
     startup_program = fluid.Program()
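On the new import: distutils.dir_util.mkpath creates all missing intermediate directories (like mkdir -p), which a bare os.mkdir would not. A small sketch with an illustrative path; on modern Python the stdlib equivalent is os.makedirs, and distutils itself was removed in Python 3.12:

```python
import os
from distutils.dir_util import mkpath

# Creates intermediate directories as needed; "models/0" is illustrative.
mkpath("models/0")

# Equivalent stdlib call (preferred since distutils' removal in Python 3.12).
os.makedirs("models/0", exist_ok=True)
```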
@@ -206,11 +209,12 @@ def main():
     train_data, valid_data, test_data = ptb_data
 
     def generate_init_data():
+        batch_size = config.batch_size * device_count
         init_hidden = np.zeros(
-            (config.num_layers, config.batch_size, config.hidden_size),
+            (batch_size, config.num_layers, config.hidden_size),
             dtype='float32')
         init_cell = np.zeros(
-            (config.num_layers, config.batch_size, config.hidden_size),
+            (batch_size, config.num_layers, config.hidden_size),
             dtype='float32')
         return init_hidden, init_cell
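This shape change is the heart of the fix: initial states become batch-major, and dimension 0 now holds the total batch across all devices. A minimal NumPy sketch of the before/after layouts (the sizes are illustrative, not taken from the config):

```python
import numpy as np

num_layers, hidden_size = 2, 200   # illustrative sizes
batch_size, device_count = 20, 4

# Old layout: layer-major, sized for a single device.
old_init = np.zeros((num_layers, batch_size, hidden_size), dtype='float32')

# New layout: batch-major and scaled by device_count, so splitting
# dimension 0 across 4 GPUs hands each device a (20, 2, 200) slice.
new_init = np.zeros((batch_size * device_count, num_layers, hidden_size),
                    dtype='float32')
assert new_init.shape[0] // device_count == batch_size
```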
@@ -244,8 +248,8 @@ def main():
     def eval(data):
         # when eval the batch_size set to 1
-        eval_data_iter = reader.get_data_iter(data, config.batch_size,
-                                              config.num_steps)
+        eval_data_iter = reader.get_data_iter(data, config.batch_size *
+                                              device_count, config.num_steps)
         total_loss = 0.0
         iters = 0
         init_hidden, init_cell = generate_init_data()
@@ -277,8 +281,8 @@ def main():
     def train_an_epoch(epoch_id, batch_times):
         # get train epoch size
         log_interval = get_log_interval(len(train_data))
-        train_data_iter = reader.get_data_iter(train_data, config.batch_size,
-                                               config.num_steps)
+        train_data_iter = reader.get_data_iter(train_data, config.batch_size *
+                                               device_count, config.num_steps)
         total_loss = 0
         iters = 0
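Both eval and train_an_epoch now ask the reader for config.batch_size * device_count rows per step. Assuming the parallel executor splits each feed evenly along dimension 0 (the behavior this commit appears to rely on), the arithmetic works out as follows:

```python
# Hypothetical numbers standing in for config.batch_size and device_count.
config_batch_size = 20
device_count = 4

total_batch = config_batch_size * device_count  # rows the reader now yields
per_device = total_batch // device_count        # rows each GPU receives
assert per_device == config_batch_size          # per-GPU batch is unchanged
```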
@@ -307,7 +311,6 @@ def main():
             lr = np.array(fetch_outs[1])
             init_hidden = np.array(fetch_outs[2])
             init_cell = np.array(fetch_outs[3])
-
             total_loss += cost_train
             iters += config.num_steps
             if batch_id > 0 and batch_id % log_interval == 0:
@@ -379,7 +382,7 @@ def main():
     if args.use_dataloader:
 
         def data_gen():
-            data_iter_size = config.batch_size // device_count
+            data_iter_size = config.batch_size
            train_batches = reader.get_data_iter(train_data, data_iter_size,
                                                  config.num_steps)
             for batch in train_batches:
...@@ -444,8 +447,11 @@ def main(): ...@@ -444,8 +447,11 @@ def main():
format( format(
len(valid_data), config.batch_size, config.num_steps)) len(valid_data), config.batch_size, config.num_steps))
save_model_dir = os.path.join(args.save_model_dir, save_model_dir = os.path.join(args.save_model_dir, str(epoch_id))
str(epoch_id), "params") if not os.path.exists(save_model_dir):
mkpath(save_model_dir)
save_model_dir = os.path.join(save_model_dir, 'params')
fluid.save(main_program, save_model_dir) fluid.save(main_program, save_model_dir)
print("Saved model to: %s.\n" % save_model_dir) print("Saved model to: %s.\n" % save_model_dir)
......
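This reworked block is the "fix a bug in model saving" part: fluid.save writes checkpoint files under the given prefix but, at least in the Fluid versions this model targeted, does not create missing parent directories, so the per-epoch directory is now created explicitly before saving to .../params. A sketch with stand-in names:

```python
import os
from distutils.dir_util import mkpath

# save_root and epoch_id stand in for args.save_model_dir and the loop variable.
save_root, epoch_id = "models", 0
save_model_dir = os.path.join(save_root, str(epoch_id))    # e.g. models/0
if not os.path.exists(save_model_dir):
    mkpath(save_model_dir)                                 # ensure the dir exists
save_model_dir = os.path.join(save_model_dir, 'params')    # e.g. models/0/params
# fluid.save(main_program, save_model_dir) then has a valid prefix to write to.
```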
@@ -185,7 +185,6 @@ def lm_model(hidden_size,
             pre_cell = cell_array[k]
             weight_1 = weight_1_arr[k]
             bias = bias_arr[k]
-
             nn = layers.concat([input, pre_hidden], 1)
             gate_input = layers.matmul(x=nn, y=weight_1)
@@ -255,10 +254,8 @@ def lm_model(hidden_size,
         return real_res, last_hidden, last_cell
 
     batch_size_each = batch_size // fluid.core.get_cuda_device_count()
-    x = fluid.data(
-        name="x", shape=[batch_size_each, num_steps, 1], dtype='int64')
-    y = fluid.data(
-        name="y", shape=[batch_size_each * num_steps, 1], dtype='int64')
+    x = fluid.data(name="x", shape=[None, num_steps, 1], dtype='int64')
+    y = fluid.data(name="y", shape=[None, 1], dtype='int64')
 
     if use_dataloader:
         dataloader = fluid.io.DataLoader.from_generator(
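Replacing batch_size_each with None makes the leading dimension of x and y variable, so one program definition accepts either the full multi-device batch or a single device's slice; y's flattened length (batch × num_steps) is likewise absorbed into the variable dimension instead of being hard-coded.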
@@ -269,16 +266,18 @@ def lm_model(hidden_size,
     init_hidden = fluid.data(
         name="init_hidden",
-        shape=[num_layers, batch_size_each, hidden_size],
+        shape=[None, num_layers, hidden_size],
         dtype='float32')
     init_cell = fluid.data(
         name="init_cell",
-        shape=[num_layers, batch_size_each, hidden_size],
+        shape=[None, num_layers, hidden_size],
         dtype='float32')
 
     init_cell.persistable = True
     init_hidden.persistable = True
 
+    init_hidden = layers.transpose(init_hidden, perm=[1, 0, 2])
+    init_cell = layers.transpose(init_cell, perm=[1, 0, 2])
     init_hidden_reshape = layers.reshape(
         init_hidden, shape=[num_layers, -1, hidden_size])
     init_cell_reshape = layers.reshape(
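States now cross the graph boundary batch-major and are transposed back to the layer-major layout the LSTM math expects before the existing reshape. A NumPy stand-in for the added layers.transpose calls (sizes illustrative):

```python
import numpy as np

batch, num_layers, hidden = 8, 2, 16   # illustrative sizes
fed = np.zeros((batch, num_layers, hidden), dtype='float32')  # layout main() feeds

# perm=[1, 0, 2], as in the diff: batch-major -> layer-major for the cell math.
internal = fed.transpose(1, 0, 2)
assert internal.shape == (num_layers, batch, hidden)

# The matching output transpose restores batch-major before fetching.
returned = internal.transpose(1, 0, 2)
assert returned.shape == fed.shape
```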
@@ -373,9 +372,8 @@ def lm_model(hidden_size,
     # can be used directly in next batch. This can avoid the fetching of
     # last_hidden and last_cell and feeding of init_hidden and init_cell in
     # each training step.
-    layers.assign(input=last_cell, output=init_cell)
-    layers.assign(input=last_hidden, output=init_hidden)
+    last_hidden = layers.transpose(last_hidden, perm=[1, 0, 2])
+    last_cell = layers.transpose(last_cell, perm=[1, 0, 2])
 
     feeding_list = ['x', 'y', 'init_hidden', 'init_cell']
     if use_dataloader:
         return loss, last_hidden, last_cell, feeding_list, dataloader
...
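Why the layers.assign carry-over had to go: the in-graph assignment kept the recurrent state inside a single program's scope, which presumably breaks once the batch is split across several GPUs, each holding only its own slice. Instead, train.py now fetches last_hidden and last_cell every step (fetch_outs[2] and fetch_outs[3] above), and the two new transposes return them batch-major so they can be fed straight back as init_hidden and init_cell for the next batch.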