Commit 207c1969 authored by Li Fuchen, committed by Yibing Liu

Fix language_model to enable multi-GPU training (#3861)

* fix language_model to enable multi-GPU training

* fix a bug in model saving

* fix a bug in model saving
Parent fa449c72
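The core of the change: under data-parallel execution, the executor splits every fed tensor along axis 0 across the visible GPUs, so batch-shaped tensors must carry the batch on the leading axis and be sized for all devices at once. A minimal NumPy sketch of that splitting (all sizes are illustrative, not from the patch):

```python
import numpy as np

device_count = 4                  # hypothetical number of GPUs
total_batch = 20 * device_count   # what config.batch_size * device_count yields

# Batch-first layout: axis 0 is the axis that gets sliced per device.
init_hidden = np.zeros((total_batch, 2, 200), dtype='float32')
per_device = np.split(init_hidden, device_count, axis=0)
assert per_device[0].shape == (total_batch // device_count, 2, 200)
```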
......@@ -6,7 +6,7 @@ function run_train() {
     python train.py \
         --data_path data/simple-examples/data/ \
         --model_type small \
-        --use_gpu True
+        --use_gpu True \
         #--init_from_pretrain_model models/0/params
 }
......
......@@ -22,7 +22,7 @@ import os
 import random
 import math
 import contextlib
+from distutils.dir_util import mkpath
 import paddle
 import paddle.fluid as fluid
 import paddle.fluid.framework as framework
......@@ -111,6 +111,9 @@ def main():
     config = RNNConfig(args)
 
+    if not os.path.exists(args.save_model_dir):
+        mkpath(args.save_model_dir)
+
     # define train program
     main_program = fluid.Program()
     startup_program = fluid.Program()
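For context, distutils.dir_util.mkpath creates the target directory along with any missing parents, which is why a plain os.mkdir would not do here. A small sketch of the behavior (the literal path is a hypothetical stand-in for args.save_model_dir):

```python
from distutils.dir_util import mkpath
import os

save_model_dir = "models/lm"  # hypothetical stand-in for args.save_model_dir
if not os.path.exists(save_model_dir):
    mkpath(save_model_dir)    # creates "models" and "models/lm" as needed
# the standard-library equivalent: os.makedirs(save_model_dir, exist_ok=True)
```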
......@@ -206,11 +209,12 @@ def main():
     train_data, valid_data, test_data = ptb_data
 
     def generate_init_data():
+        batch_size = config.batch_size * device_count
         init_hidden = np.zeros(
-            (config.num_layers, config.batch_size, config.hidden_size),
+            (batch_size, config.num_layers, config.hidden_size),
             dtype='float32')
         init_cell = np.zeros(
-            (config.num_layers, config.batch_size, config.hidden_size),
+            (batch_size, config.num_layers, config.hidden_size),
             dtype='float32')
         return init_hidden, init_cell
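device_count is defined elsewhere in train.py (outside this diff); a plausible derivation, mirroring the fluid.core.get_cuda_device_count() call that appears further down in the model code:

```python
import paddle.fluid as fluid

use_gpu = True  # assumption: mirrors the --use_gpu flag
device_count = fluid.core.get_cuda_device_count() if use_gpu else 1
```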
......@@ -244,8 +248,8 @@ def main():
     def eval(data):
         # when evaluating, the batch_size is set to 1
-        eval_data_iter = reader.get_data_iter(data, config.batch_size,
-                                              config.num_steps)
+        eval_data_iter = reader.get_data_iter(data, config.batch_size *
+                                              device_count, config.num_steps)
         total_loss = 0.0
         iters = 0
         init_hidden, init_cell = generate_init_data()
......@@ -277,8 +281,8 @@ def main():
     def train_an_epoch(epoch_id, batch_times):
         # get train epoch size
         log_interval = get_log_interval(len(train_data))
-        train_data_iter = reader.get_data_iter(train_data, config.batch_size,
-                                               config.num_steps)
+        train_data_iter = reader.get_data_iter(train_data, config.batch_size *
+                                               device_count, config.num_steps)
 
         total_loss = 0
         iters = 0
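Both eval and train now draw batches sized for all devices at once. The real reader.get_data_iter lives outside this diff; the hypothetical simplification below just shows what passing config.batch_size * device_count means for the shape of each yielded batch:

```python
import numpy as np

def get_data_iter(data, batch_size, num_steps):
    # Hypothetical simplification: lay the corpus out as (batch_size, -1) and
    # yield (x, y) windows of num_steps tokens, with y shifted one step ahead.
    data = np.asarray(data)
    batch_len = len(data) // batch_size
    data = data[:batch_size * batch_len].reshape(batch_size, batch_len)
    for i in range((batch_len - 1) // num_steps):
        x = data[:, i * num_steps:(i + 1) * num_steps]
        y = data[:, i * num_steps + 1:(i + 1) * num_steps + 1]
        yield x, y

# With batch_size = config.batch_size * device_count, axis 0 of every batch
# is large enough to be split evenly across the GPUs.
for x, y in get_data_iter(list(range(1000)), batch_size=80, num_steps=10):
    assert x.shape == (80, 10) and y.shape == (80, 10)
```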
......@@ -307,7 +311,6 @@ def main():
             lr = np.array(fetch_outs[1])
             init_hidden = np.array(fetch_outs[2])
             init_cell = np.array(fetch_outs[3])
-
             total_loss += cost_train
             iters += config.num_steps
             if batch_id > 0 and batch_id % log_interval == 0:
......@@ -379,7 +382,7 @@ def main():
     if args.use_dataloader:
 
         def data_gen():
-            data_iter_size = config.batch_size // device_count
+            data_iter_size = config.batch_size
             train_batches = reader.get_data_iter(train_data, data_iter_size,
                                                  config.num_steps)
             for batch in train_batches:
......@@ -444,8 +447,11 @@ def main():
format(
len(valid_data), config.batch_size, config.num_steps))
save_model_dir = os.path.join(args.save_model_dir,
str(epoch_id), "params")
save_model_dir = os.path.join(args.save_model_dir, str(epoch_id))
if not os.path.exists(save_model_dir):
mkpath(save_model_dir)
save_model_dir = os.path.join(save_model_dir, 'params')
fluid.save(main_program, save_model_dir)
print("Saved model to: %s.\n" % save_model_dir)
......
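The checkpoint directory is now created per epoch before fluid.save is called with the 'params' prefix inside it. In Paddle 1.x, fluid.save(program, prefix) writes the persistable parameters to prefix + '.pdparams' and the optimizer state to prefix + '.pdopt', so the layout per epoch looks roughly like this sketch ('models' stands in for args.save_model_dir, epoch 3 is illustrative):

```python
import os

save_model_dir = os.path.join("models", str(3))    # models/3
os.makedirs(save_model_dir, exist_ok=True)         # what the mkpath guard does
save_model_dir = os.path.join(save_model_dir, "params")
print(save_model_dir)                              # models/3/params
# fluid.save(main_program, save_model_dir) then writes files such as
# models/3/params.pdparams and models/3/params.pdopt under that prefix.
```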
......@@ -185,7 +185,6 @@ def lm_model(hidden_size,
             pre_cell = cell_array[k]
             weight_1 = weight_1_arr[k]
             bias = bias_arr[k]
-
             nn = layers.concat([input, pre_hidden], 1)
             gate_input = layers.matmul(x=nn, y=weight_1)
......@@ -255,10 +254,8 @@ def lm_model(hidden_size,
         return real_res, last_hidden, last_cell
 
-    batch_size_each = batch_size // fluid.core.get_cuda_device_count()
-    x = fluid.data(
-        name="x", shape=[batch_size_each, num_steps, 1], dtype='int64')
-    y = fluid.data(
-        name="y", shape=[batch_size_each * num_steps, 1], dtype='int64')
+    x = fluid.data(name="x", shape=[None, num_steps, 1], dtype='int64')
+    y = fluid.data(name="y", shape=[None, 1], dtype='int64')
 
     if use_dataloader:
         dataloader = fluid.io.DataLoader.from_generator(
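Replacing the hard-coded batch_size_each with a None leading dimension removes the graph's dependence on how many GPUs are visible at build time: the actual batch size is taken from the fed tensor and sliced per device at run time. The old flattened y shape, [batch_size_each * num_steps, 1], is likewise absorbed by None. The pattern in isolation (num_steps fixed for illustration):

```python
import paddle.fluid as fluid

num_steps = 20  # illustrative value
# None marks an unknown batch dimension, resolved from the fed data at run time.
x = fluid.data(name="x", shape=[None, num_steps, 1], dtype='int64')
y = fluid.data(name="y", shape=[None, 1], dtype='int64')
```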
......@@ -269,16 +266,18 @@ def lm_model(hidden_size,
     init_hidden = fluid.data(
         name="init_hidden",
-        shape=[num_layers, batch_size_each, hidden_size],
+        shape=[None, num_layers, hidden_size],
         dtype='float32')
     init_cell = fluid.data(
         name="init_cell",
-        shape=[num_layers, batch_size_each, hidden_size],
+        shape=[None, num_layers, hidden_size],
         dtype='float32')
 
     init_cell.persistable = True
     init_hidden.persistable = True
 
+    init_hidden = layers.transpose(init_hidden, perm=[1, 0, 2])
+    init_cell = layers.transpose(init_cell, perm=[1, 0, 2])
+
     init_hidden_reshape = layers.reshape(
         init_hidden, shape=[num_layers, -1, hidden_size])
     init_cell_reshape = layers.reshape(
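The feed layout is now (batch, num_layers, hidden) so that axis 0 is the device-split axis, while the LSTM internals still expect (num_layers, batch, hidden); perm=[1, 0, 2] converts between the two. The NumPy equivalent of that transpose, with illustrative sizes:

```python
import numpy as np

feed = np.zeros((80, 2, 200), dtype='float32')  # (batch, num_layers, hidden)
internal = np.transpose(feed, (1, 0, 2))        # (num_layers, batch, hidden)
assert internal.shape == (2, 80, 200)
```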
......@@ -373,9 +372,8 @@ def lm_model(hidden_size,
     # can be used directly in the next batch. This avoids the fetching of
     # last_hidden and last_cell and feeding of init_hidden and init_cell in
     # each training step.
-    layers.assign(input=last_cell, output=init_cell)
-    layers.assign(input=last_hidden, output=init_hidden)
+    last_hidden = layers.transpose(last_hidden, perm=[1, 0, 2])
+    last_cell = layers.transpose(last_cell, perm=[1, 0, 2])
 
     feeding_list = ['x', 'y', 'init_hidden', 'init_cell']
     if use_dataloader:
         return loss, last_hidden, last_cell, feeding_list, dataloader
......
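Symmetrically, last_hidden and last_cell are transposed back to the batch-first feed layout before being returned, so train.py can fetch them and feed them straight in as the next batch's init states; this replaces the in-graph layers.assign shortcut, which the multi-device setup drops in favor of explicit fetch-and-feed. The two transposes are exact inverses, as this small check shows:

```python
import numpy as np

state = np.arange(2 * 3 * 4, dtype='float32').reshape(2, 3, 4)
round_trip = np.transpose(np.transpose(state, (1, 0, 2)), (1, 0, 2))
assert np.array_equal(state, round_trip)  # feed -> internal -> feed is lossless
```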