Unverified commit b91cf1fc, authored by Yiqun Liu, committed by GitHub

Update the optimization of PaddingRNN model in benchmark repo to models (#2413)

* Update the optimization of PaddingRNN model in benchmark repo.

* Move the configuration of the rnn model to a separate file.

* Fix the error of forgetting to feed learning_rate when using py_reader.

* Add an argument and some checks.

* Fix the feeding problem and remove some build options.

* Add some comments and refine the format.

* Fix an error when defining the inference program.

* Remove the setting of use_experimental_executor, and simplify some code.

* Refine the setting of device_count.
Parent 503ebb48
@@ -20,6 +20,15 @@ import argparse
import distutils.util
def str2bool(v):
if v.lower() in ('yes', 'true', 't', 'y', '1'):
return True
elif v.lower() in ('no', 'false', 'f', 'n', '0'):
return False
else:
raise argparse.ArgumentTypeError('Unsupported value encountered.')
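(Illustrative aside, not part of the diff: a minimal sketch of how str2bool behaves when wired into argparse, assuming args.py above is importable.)

# Minimal sketch: why --use_gpu switches from type=bool to type=str2bool.
import argparse
from args import str2bool  # the helper defined above

parser = argparse.ArgumentParser()
parser.add_argument('--use_gpu', type=str2bool, default=False)

assert parser.parse_args(['--use_gpu', 'False']).use_gpu is False
assert parser.parse_args(['--use_gpu', 'yes']).use_gpu is True
# With the previous type=bool, '--use_gpu False' would have parsed as True,
# because bool() of any non-empty string is True.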
def parse_args():
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
@@ -36,10 +45,30 @@ def parse_args():
"--data_path", type=str, help="all the data for train,valid,test")
parser.add_argument('--para_init', action='store_true')
parser.add_argument(
'--use_gpu', type=bool, default=False, help='whether using gpu')
'--use_gpu',
type=str2bool,
default=False,
help='Whether using gpu [True|False]')
parser.add_argument(
'--parallel',
type=str2bool,
default=True,
help='Whether using gpu in parallel [True|False]')
parser.add_argument(
'--use_py_reader',
type=str2bool,
default=False,
help='Whether using py_reader to feed data [True|False]')
parser.add_argument(
'--log_path',
help='path of the log file. If not set, logs are printed to console')
parser.add_argument(
'--save_model_dir',
type=str,
default="models",
help='dir of the saved model.')
parser.add_argument('--enable_ce', action='store_true')
parser.add_argument('--batch_size', type=int, default=0, help='batch size')
parser.add_argument('--max_epoch', type=int, default=0, help='max epoch')
args = parser.parse_args()
return args
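Illustrative usage of the new flags together (the entry-point name and data location are assumptions, not taken from this diff):

python train.py --data_path <path-to-PTB-data> --use_gpu True --parallel True --use_py_reader True --save_model_dir models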
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
class RNNConfig(object):
def __init__(self, args):
self.model_type = args.model_type
self.rnn_model = args.rnn_model
self.vocab_size = 10000
if self.model_type == "test":
self.num_layers = 1
self.batch_size = 2
self.hidden_size = 10
self.num_steps = 3
self.init_scale = 0.1
self.max_grad_norm = 5.0
self.epoch_start_decay = 1
self.max_epoch = 1
self.dropout = 0.0
self.lr_decay = 0.5
self.base_learning_rate = 1.0
elif self.model_type == "small":
self.num_layers = 2
self.batch_size = 20
self.hidden_size = 200
self.num_steps = 20
self.init_scale = 0.1
self.max_grad_norm = 5.0
self.epoch_start_decay = 4
self.max_epoch = 13
self.dropout = 0.0
self.lr_decay = 0.5
self.base_learning_rate = 1.0
elif self.model_type == "medium":
self.num_layers = 2
self.batch_size = 20
self.hidden_size = 650
self.num_steps = 35
self.init_scale = 0.05
self.max_grad_norm = 5.0
self.epoch_start_decay = 6
self.max_epoch = 39
self.dropout = 0.5
self.lr_decay = 0.8
self.base_learning_rate = 1.0
elif self.model_type == "large":
self.num_layers = 2
self.batch_size = 20
self.hidden_size = 1500
self.num_steps = 35
self.init_scale = 0.04
self.max_grad_norm = 10.0
self.epoch_start_decay = 14
self.max_epoch = 55
self.dropout = 0.65
self.lr_decay = 1.0 / 1.15
self.base_learning_rate = 1.0
else:
raise ValueError('Unsupported model_type.')
if args.rnn_model not in ('static', 'padding', 'cudnn'):
raise ValueError('Unsupported rnn_model.')
if args.batch_size > 0:
self.batch_size = args.batch_size
if args.max_epoch > 0:
self.max_epoch = args.max_epoch
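(Not part of the diff: a small sketch of how the new RNNConfig is meant to be consumed, assuming args.py and config.py sit next to each other as in this change.)

# Sketch: presets come from --model_type; --batch_size/--max_epoch > 0 override them.
from args import parse_args
from config import RNNConfig

args = parse_args()          # e.g. --model_type medium
config = RNNConfig(args)
print(config.hidden_size)    # 650 for the "medium" preset
print(config.batch_size)     # 20 unless --batch_size > 0 was passed
print(config.max_epoch)      # 39 unless --max_epoch > 0 was passed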
@@ -41,6 +41,7 @@ os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
from args import *
sys.path.append("../")
from models.language_model import lm_model
from config import RNNConfig
import logging
import pickle
@@ -74,16 +75,12 @@ def save_para_npz(train_prog, train_exe):
np.savez("mode_base", **vals)
def train():
def main():
args = parse_args()
model_type = args.model_type
rnn_model = args.rnn_model
logger = logging.getLogger("lm")
logger.setLevel(logging.INFO)
formatter = logging.Formatter(
'%(asctime)s - %(name)s - %(levelname)s - %(message)s')
if args.enable_ce:
fluid.default_startup_program().random_seed = SEED
if args.log_path:
file_handler = logging.FileHandler(args.log_path)
file_handler.setLevel(logging.INFO)
@@ -94,93 +91,90 @@ def train():
console_handler.setLevel(logging.INFO)
console_handler.setFormatter(formatter)
logger.addHandler(console_handler)
logger.info('Running with args : {}'.format(args))
vocab_size = 10000
if model_type == "test":
num_layers = 1
batch_size = 2
hidden_size = 10
num_steps = 3
init_scale = 0.1
max_grad_norm = 5.0
epoch_start_decay = 1
max_epoch = 1
dropout = 0.0
lr_decay = 0.5
base_learning_rate = 1.0
elif model_type == "small":
num_layers = 2
batch_size = 20
hidden_size = 200
num_steps = 20
init_scale = 0.1
max_grad_norm = 5.0
epoch_start_decay = 4
max_epoch = 13
dropout = 0.0
lr_decay = 0.5
base_learning_rate = 1.0
elif model_type == "medium":
num_layers = 2
batch_size = 20
hidden_size = 650
num_steps = 35
init_scale = 0.05
max_grad_norm = 5.0
epoch_start_decay = 6
max_epoch = 39
dropout = 0.5
lr_decay = 0.8
base_learning_rate = 1.0
elif model_type == "large":
num_layers = 2
batch_size = 20
hidden_size = 1500
num_steps = 35
init_scale = 0.04
max_grad_norm = 10.0
epoch_start_decay = 14
max_epoch = 55
dropout = 0.65
lr_decay = 1.0 / 1.15
base_learning_rate = 1.0
else:
print("model type not support")
return
# Training process
loss, last_hidden, last_cell, feed_order = lm_model.lm_model(
hidden_size,
vocab_size,
batch_size,
num_layers=num_layers,
num_steps=num_steps,
init_scale=init_scale,
dropout=dropout,
rnn_model=rnn_model)
# clone from default main program and use it as the validation program
main_program = fluid.default_main_program()
inference_program = fluid.default_main_program().clone(for_test=True)
fluid.clip.set_gradient_clip(clip=fluid.clip.GradientClipByGlobalNorm(
clip_norm=max_grad_norm))
learning_rate = fluid.layers.create_global_var(
name="learning_rate",
shape=[1],
value=1.0,
dtype='float32',
persistable=True)
optimizer = fluid.optimizer.SGD(learning_rate=learning_rate)
optimizer.minimize(loss)
config = RNNConfig(args)
# define train program
main_program = fluid.Program()
startup_program = fluid.Program()
if args.enable_ce:
startup_program.random_seed = SEED
with fluid.program_guard(main_program, startup_program):
with fluid.unique_name.guard():
res_vars = lm_model.lm_model(
config.hidden_size,
config.vocab_size,
config.batch_size,
num_layers=config.num_layers,
num_steps=config.num_steps,
init_scale=config.init_scale,
dropout=config.dropout,
rnn_model=config.rnn_model,
use_py_reader=args.use_py_reader)
if args.use_py_reader:
py_reader = res_vars[-1]
res_vars = res_vars[:-1]
loss, last_hidden, last_cell, feed_order = res_vars
fluid.clip.set_gradient_clip(
clip=fluid.clip.GradientClipByGlobalNorm(
clip_norm=config.max_grad_norm))
learning_rate = fluid.layers.create_global_var(
name="learning_rate",
shape=[1],
value=1.0,
dtype='float32',
persistable=True)
optimizer = fluid.optimizer.SGD(learning_rate=learning_rate)
optimizer.minimize(loss)
# define inference program
inference_program = fluid.Program()
inference_startup_program = fluid.Program()
with fluid.program_guard(inference_program, inference_startup_program):
with fluid.unique_name.guard():
lm_model.lm_model(
config.hidden_size,
config.vocab_size,
config.batch_size,
num_layers=config.num_layers,
num_steps=config.num_steps,
init_scale=config.init_scale,
dropout=config.dropout,
rnn_model=config.rnn_model,
use_py_reader=False)
# Some op behaves differently for train and inference, we need to call
# this clone function to ensure every op is right for inference.
inference_program = inference_program.clone(for_test=True)
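(Aside, not part of the diff: the reason for the clone is that some ops, e.g. dropout, behave differently between training and inference; a minimal sketch under the fluid 1.x API.)

# Sketch: dropout is random at training time but must be deterministic
# (an identity under 'upscale_in_train') when evaluating.
import paddle.fluid as fluid

prog = fluid.Program()
with fluid.program_guard(prog, fluid.Program()):
    data = fluid.layers.data(name='data', shape=[10], dtype='float32')
    out = fluid.layers.dropout(
        data, dropout_prob=0.5, dropout_implementation='upscale_in_train')

infer_prog = prog.clone(for_test=True)  # marks dropout (and similar ops) as test-mode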
place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()
exe = Executor(place)
exe.run(framework.default_startup_program())
exe.run(startup_program)
device_count = len(fluid.cuda_places()) if args.use_gpu else len(
fluid.cpu_places())
exec_strategy = fluid.ExecutionStrategy()
exec_strategy.num_threads = device_count
exec_strategy.num_iteration_per_drop_scope = 100
build_strategy = fluid.BuildStrategy()
build_strategy.enable_inplace = True
build_strategy.memory_optimize = False
build_strategy.fuse_all_optimizer_ops = True
if args.parallel:
train_program = fluid.compiler.CompiledProgram(
main_program).with_data_parallel(
loss_name=loss.name,
build_strategy=build_strategy,
exec_strategy=exec_strategy)
else:
train_program = fluid.compiler.CompiledProgram(main_program)
data_path = args.data_path
print("begin to load data")
@@ -188,127 +182,249 @@ def train():
print("finished load data")
train_data, valid_data, test_data, _ = raw_data
def prepare_input(batch, init_hidden, init_cell, epoch_id=0, with_lr=True):
def generate_init_data():
init_hidden = np.zeros(
(config.num_layers, config.batch_size, config.hidden_size),
dtype='float32')
init_cell = np.zeros(
(config.num_layers, config.batch_size, config.hidden_size),
dtype='float32')
return init_hidden, init_cell
def generate_new_lr(epoch_id=0, device_count=1):
new_lr = config.base_learning_rate * (config.lr_decay**max(
epoch_id + 1 - config.epoch_start_decay, 0.0))
lr = np.ones((device_count), dtype='float32') * new_lr
return lr
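(Worked example of the decay schedule: with the "medium" preset, base_learning_rate = 1.0, lr_decay = 0.8 and epoch_start_decay = 6, so epochs 0-5 keep lr = 1.0, epoch 6 uses 1.0 * 0.8**1 = 0.8, and epoch 8 uses 0.8**3 = 0.512. The value is replicated device_count times so every replica receives the same learning_rate.)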
def prepare_input(batch,
init_hidden=None,
init_cell=None,
epoch_id=0,
with_lr=True,
device_count=1):
x, y = batch
new_lr = base_learning_rate * (lr_decay**max(
epoch_id + 1 - epoch_start_decay, 0.0))
lr = np.ones((1), dtype='float32') * new_lr
res = {}
x = x.reshape((-1, num_steps, 1))
x = x.reshape((-1, config.num_steps, 1))
y = y.reshape((-1, 1))
res = {}
res['x'] = x
res['y'] = y
res['init_hidden'] = init_hidden
res['init_cell'] = init_cell
if init_hidden is not None:
res['init_hidden'] = init_hidden
if init_cell is not None:
res['init_cell'] = init_cell
if with_lr:
res['learning_rate'] = lr
res['learning_rate'] = generate_new_lr(epoch_id, device_count)
return res
def eval(data):
# when eval the batch_size set to 1
eval_data_iter = reader.get_data_iter(data, batch_size, num_steps)
eval_data_iter = reader.get_data_iter(data, config.batch_size,
config.num_steps)
total_loss = 0.0
iters = 0
init_hidden = np.zeros((num_layers, batch_size, hidden_size), dtype='float32')
init_cell = np.zeros((num_layers, batch_size, hidden_size), dtype='float32')
init_hidden, init_cell = generate_init_data()
for batch_id, batch in enumerate(eval_data_iter):
input_data_feed = prepare_input(
batch, init_hidden, init_cell, epoch_id, with_lr=False)
batch, init_hidden, init_cell, epoch_id=0, with_lr=False)
fetch_outs = exe.run(
inference_program,
program=inference_program,
feed=input_data_feed,
fetch_list=[loss.name, last_hidden.name, last_cell.name],
use_program_cache=True)
cost_train = np.array(fetch_outs[0])
cost_eval = np.array(fetch_outs[0])
init_hidden = np.array(fetch_outs[1])
init_cell = np.array(fetch_outs[2])
total_loss += cost_train
iters += num_steps
total_loss += cost_eval
iters += config.num_steps
ppl = np.exp(total_loss / iters)
return ppl
# get train epoch size
batch_len = len(train_data) // batch_size
epoch_size = (batch_len - 1) // num_steps
log_interval = epoch_size // 10
total_time = 0.0
for epoch_id in range(max_epoch):
start_time = time.time()
print("epoch id", epoch_id)
train_data_iter = reader.get_data_iter(train_data, batch_size,
num_steps)
def get_log_interval(data_len):
num_batchs = data_len // config.batch_size
epoch_size = (num_batchs - 1) // config.num_steps
log_interval = max(1, epoch_size // 10)
return log_interval
total_loss = 0
def train_an_epoch(epoch_id, batch_times):
# get train epoch size
log_interval = get_log_interval(len(train_data))
train_data_iter = reader.get_data_iter(train_data, config.batch_size,
config.num_steps)
init_hidden = None
init_cell = None
#debug_para(fluid.framework.default_main_program(), parallel_executor)
total_loss = 0
iters = 0
init_hidden = np.zeros(
(num_layers, batch_size, hidden_size), dtype='float32')
init_cell = np.zeros(
(num_layers, batch_size, hidden_size), dtype='float32')
for batch_id, batch in enumerate(train_data_iter):
if batch_id == 0:
init_hidden, init_cell = generate_init_data()
else:
init_hidden = None
init_cell = None
input_data_feed = prepare_input(
batch, init_hidden, init_cell, epoch_id=epoch_id)
fetch_outs = exe.run(feed=input_data_feed,
fetch_list=[
loss.name, last_hidden.name,
last_cell.name, 'learning_rate'
],
batch,
init_hidden=init_hidden,
init_cell=init_cell,
epoch_id=epoch_id,
with_lr=True,
device_count=device_count)
batch_start_time = time.time()
fetch_outs = exe.run(train_program,
feed=input_data_feed,
fetch_list=[loss.name, "learning_rate"],
use_program_cache=True)
batch_time = time.time() - batch_start_time
batch_times.append(batch_time)
cost_train = np.array(fetch_outs[0])
init_hidden = np.array(fetch_outs[1])
init_cell = np.array(fetch_outs[2])
lr = np.array(fetch_outs[3])
lr = np.array(fetch_outs[1])
total_loss += cost_train
iters += num_steps
iters += config.num_steps
if batch_id > 0 and batch_id % log_interval == 0:
ppl = np.exp(total_loss / iters)
print("ppl ", batch_id, ppl[0], lr[0])
print(
"-- Epoch:[%d]; Batch:[%d]; Time: %.5f s; ppl: %.5f, lr: %.5f"
% (epoch_id, batch_id, batch_time, ppl[0], lr[0]))
ppl = np.exp(total_loss / iters)
if epoch_id == 0 and ppl[0] > 1000:
# for bad init, after first epoch, the loss is over 1000
# no more need to continue
return
end_time = time.time()
total_time += end_time - start_time
print("train ppl", ppl[0])
if epoch_id == max_epoch - 1 and args.enable_ce:
card_num = get_cards()
print("ptblm\tlstm_language_model_%s_duration_card%d\t%s" %
(args.rnn_model, card_num, total_time / max_epoch))
print("ptblm\tlstm_language_model_%s_loss_card%d\t%s" % (args.rnn_model, card_num, ppl[0]))
model_path = os.path.join("model_new/", str(epoch_id))
if not os.path.isdir(model_path):
os.makedirs(model_path)
fluid.io.save_persistables(
executor=exe, dirname=model_path, main_program=main_program)
valid_ppl = eval(valid_data)
print("valid ppl", valid_ppl[0])
test_ppl = eval(test_data)
print("test ppl", test_ppl[0])
return ppl
def train_an_epoch_py_reader(epoch_id, batch_times):
# get train epoch size
log_interval = get_log_interval(len(train_data))
def get_cards():
num = 0
cards = os.environ.get('CUDA_VISIBLE_DEVICES', '')
if cards != '':
num = len(cards.split(","))
return num
init_hidden, init_cell = generate_init_data()
total_loss = 0
iters = 0
py_reader.start()
batch_id = 0
try:
while True:
data_feeds = {}
if batch_id == 0:
batch_time = 0
batch_start_time = time.time()
data_feeds["init_hidden"] = init_hidden
data_feeds["init_cell"] = init_cell
else:
batch_time = time.time() - batch_start_time
batch_times.append(batch_time)
batch_start_time = time.time()
new_lr = generate_new_lr(epoch_id, device_count)
data_feeds['learning_rate'] = new_lr
fetch_outs = exe.run(train_program,
feed=data_feeds,
fetch_list=[loss.name, "learning_rate"],
use_program_cache=True)
cost_train = np.array(fetch_outs[0])
lr = np.array(fetch_outs[1])
total_loss += cost_train
iters += config.num_steps
if batch_id > 0 and (log_interval == 0 or
batch_id % log_interval == 0):
ppl = np.exp(total_loss / iters)
print(
"-- Epoch:[%d]; Batch:[%d]; Time: %.5f s; ppl: %.5f, lr: %.5f"
% (epoch_id, batch_id, batch_time, ppl[0], lr[0]))
batch_id += 1
except fluid.core.EOFException:
py_reader.reset()
batch_times.append(time.time() - batch_start_time)
ppl = np.exp(total_loss / iters)
return ppl
def train():
if args.use_py_reader:
def data_gen():
data_iter_size = config.batch_size // device_count
train_batches = reader.get_data_iter(train_data, data_iter_size,
config.num_steps)
for batch in train_batches:
x, y = batch
x = x.reshape((-1, config.num_steps, 1))
y = y.reshape((-1, 1))
yield x, y
py_reader.decorate_tensor_provider(data_gen)
total_time = 0.0
for epoch_id in range(config.max_epoch):
batch_times = []
epoch_start_time = time.time()
if args.use_py_reader:
train_ppl = train_an_epoch_py_reader(epoch_id, batch_times)
else:
train_ppl = train_an_epoch(epoch_id, batch_times)
epoch_time = time.time() - epoch_start_time
total_time += epoch_time
print(
"\nTrain epoch:[%d]; epoch Time: %.5f; ppl: %.5f; avg_time: %.5f steps/s \n"
% (epoch_id, epoch_time, train_ppl[0],
len(batch_times) / sum(batch_times)))
# FIXME(zjl): ppl[0] increases as batch_size increases.
# We should find a better way to calculate ppl by normalizing batch_size.
if device_count == 1 and config.batch_size <= 20 and epoch_id == 0 and train_ppl[
0] > 1000:
# with a bad initialization, the loss stays above 1000 after the first
# epoch, so there is no need to continue
print(
"Parameters are randomly initialized and not good this time because the loss is over 1000 after the first epoch."
)
print("Abort this training process and please start again.")
return
if epoch_id == config.max_epoch - 1 and args.enable_ce:
# kpis
print("ptblm\tlstm_language_model_%s_duration_card%d\t%s" %
(args.rnn_model, device_count,
total_time / config.max_epoch))
print("ptblm\tlstm_language_model_%s_loss_card%d\t%s" %
(args.rnn_model, device_count, train_ppl[0]))
# NOTE(zjl): sometimes we have not enough data for eval if batch_size is large, i.e., 2100
# Just skip to avoid error
def is_valid_data(data, batch_size, num_steps):
data_len = len(data)
batch_len = data_len // batch_size
epoch_size = (batch_len - 1) // num_steps
return epoch_size >= 1
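(In other words, eval needs at least batch_size * (num_steps + 1) tokens: for the batch_size of 2100 mentioned in the note and num_steps = 35, that is 2100 * 36 = 75600 tokens, so a shorter validation split is skipped with a warning instead of failing.)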
valid_data_valid = is_valid_data(valid_data, config.batch_size,
config.num_steps)
if valid_data_valid:
valid_ppl = eval(valid_data)
print("Valid ppl: %.5f" % valid_ppl[0])
else:
print(
'WARNING: length of valid_data is {}, which is not enough for batch_size {} and num_steps {}'.
format(
len(valid_data), config.batch_size, config.num_steps))
save_model_dir = os.path.join(args.save_model_dir, str(epoch_id))
fluid.io.save_persistables(
executor=exe, dirname=save_model_dir, main_program=main_program)
print("Saved model to: %s.\n" % save_model_dir)
if __name__ == '__main__':
train()
test_ppl = eval(test_data)
print("Test ppl:", test_ppl[0])
if __name__ == '__main__':
main()
@@ -28,8 +28,9 @@ def lm_model(hidden_size,
num_layers=2,
num_steps=20,
init_scale=0.1,
dropout=None,
rnn_model='static'):
dropout=None,
rnn_model='static',
use_py_reader=False):
def padding_rnn(input_embedding, len=3, init_hidden=None, init_cell=None):
weight_1_arr = []
weight_2_arr = []
@@ -38,8 +39,12 @@ def lm_model(hidden_size,
cell_array = []
mask_array = []
for i in range(num_layers):
weight_1 = layers.create_parameter([hidden_size * 2, hidden_size*4], dtype="float32", name="fc_weight1_"+str(i), \
default_initializer=fluid.initializer.UniformInitializer(low=-init_scale, high=init_scale))
weight_1 = layers.create_parameter(
[hidden_size * 2, hidden_size * 4],
dtype="float32",
name="fc_weight1_" + str(i),
default_initializer=fluid.initializer.UniformInitializer(
low=-init_scale, high=init_scale))
weight_1_arr.append(weight_1)
bias_1 = layers.create_parameter(
[hidden_size * 4],
@@ -166,8 +171,12 @@ def lm_model(hidden_size,
cell_array = []
mask_array = []
for i in range(num_layers):
weight_1 = layers.create_parameter([hidden_size * 2, hidden_size*4], dtype="float32", name="fc_weight1_"+str(i), \
default_initializer=fluid.initializer.UniformInitializer(low=-init_scale, high=init_scale))
weight_1 = layers.create_parameter(
[hidden_size * 2, hidden_size * 4],
dtype="float32",
name="fc_weight1_" + str(i),
default_initializer=fluid.initializer.UniformInitializer(
low=-init_scale, high=init_scale))
weight_1_arr.append(weight_1)
bias_1 = layers.create_parameter(
[hidden_size * 4],
@@ -180,16 +189,20 @@ def lm_model(hidden_size,
init_hidden, axes=[0], starts=[i], ends=[i + 1])
pre_cell = layers.slice(
init_cell, axes=[0], starts=[i], ends=[i + 1])
pre_hidden = layers.reshape(pre_hidden, shape=[-1, hidden_size])
pre_cell = layers.reshape(pre_cell, shape=[-1, hidden_size])
pre_hidden = layers.reshape(
pre_hidden, shape=[-1, hidden_size], inplace=True)
pre_cell = layers.reshape(
pre_cell, shape=[-1, hidden_size], inplace=True)
hidden_array.append(pre_hidden)
cell_array.append(pre_cell)
res = []
sliced_inputs = layers.split(
input_embedding, num_or_sections=len, dim=1)
for index in range(len):
input = layers.slice(
input_embedding, axes=[1], starts=[index], ends=[index + 1])
input = layers.reshape(input, shape=[-1, hidden_size])
input = sliced_inputs[index]
input = layers.reshape(input, shape=[-1, hidden_size], inplace=True)
for k in range(num_layers):
pre_hidden = hidden_array[k]
pre_cell = cell_array[k]
@@ -202,9 +215,38 @@ def lm_model(hidden_size,
gate_input = layers.elementwise_add(gate_input, bias)
i, j, f, o = layers.split(gate_input, num_or_sections=4, dim=-1)
c = pre_cell * layers.sigmoid(f) + layers.sigmoid(
i) * layers.tanh(j)
m = layers.tanh(c) * layers.sigmoid(o)
try:
from paddle.fluid.contrib.layers import fused_elemwise_activation
# fluid.contrib.layers.fused_elemwise_activation can do a fused
# operation, like:
# 1) x + sigmoid(y); x + tanh(y)
# 2) tanh(x + y)
# Now the unary operations supported in this fused op are limited, and
# we will extend this operation to support more unary operations and
# do this kind of fusion automatically in future versions of paddle.fluid.
# layers.sigmoid(i) * layers.tanh(j)
tmp0 = fused_elemwise_activation(
x=layers.tanh(j),
y=i,
functor_list=['elementwise_mul', 'sigmoid'],
save_intermediate_out=False)
# pre_cell * layers.sigmoid(f)
tmp1 = fused_elemwise_activation(
x=pre_cell,
y=f,
functor_list=['elementwise_mul', 'sigmoid'],
save_intermediate_out=False)
c = tmp0 + tmp1
# layers.tanh(c) * layers.sigmoid(o)
m = fused_elemwise_activation(
x=layers.tanh(c),
y=o,
functor_list=['elementwise_mul', 'sigmoid'],
save_intermediate_out=False)
except ImportError:
c = pre_cell * layers.sigmoid(f) + layers.sigmoid(
i) * layers.tanh(j)
m = layers.tanh(c) * layers.sigmoid(o)
hidden_array[k] = m
cell_array[k] = c
@@ -216,29 +258,62 @@ def lm_model(hidden_size,
dropout_prob=dropout,
dropout_implementation='upscale_in_train')
res.append(layers.reshape(input, shape=[1, -1, hidden_size]))
real_res = layers.concat(res, 0)
real_res = layers.transpose(x=real_res, perm=[1, 0, 2])
res.append(input)
last_hidden = layers.concat(hidden_array, 1)
last_hidden = layers.reshape(
last_hidden, shape=[-1, num_layers, hidden_size])
last_hidden, shape=[-1, num_layers, hidden_size], inplace=True)
last_hidden = layers.transpose(x=last_hidden, perm=[1, 0, 2])
last_cell = layers.concat(cell_array, 1)
last_cell = layers.reshape(
last_cell, shape=[-1, num_layers, hidden_size])
last_cell = layers.transpose(x=last_cell, perm=[1, 0, 2])
real_res = layers.concat(res, 0)
real_res = layers.reshape(
real_res, shape=[len, -1, hidden_size], inplace=True)
real_res = layers.transpose(x=real_res, perm=[1, 0, 2])
return real_res, last_hidden, last_cell
x = layers.data(name="x", shape=[-1, 1, 1], dtype='int64')
y = layers.data(name="y", shape=[-1, 1], dtype='float32')
batch_size_each = batch_size // fluid.core.get_cuda_device_count()
if use_py_reader:
feed_shapes = [[batch_size_each, num_steps, 1],
[batch_size_each * num_steps, 1]]
py_reader = fluid.layers.py_reader(
capacity=16, shapes=feed_shapes, dtypes=['int64', 'int64'])
x, y = fluid.layers.read_file(py_reader)
else:
x = layers.data(
name="x",
shape=[batch_size_each, num_steps, 1],
dtype='int64',
append_batch_size=False)
y = layers.data(
name="y",
shape=[batch_size_each * num_steps, 1],
dtype='int64',
append_batch_size=False)
init_hidden = layers.data(
name="init_hidden",
shape=[num_layers, batch_size_each, hidden_size],
dtype='float32',
append_batch_size=False)
init_cell = layers.data(
name="init_cell",
shape=[num_layers, batch_size_each, hidden_size],
dtype='float32',
append_batch_size=False)
init_hidden = layers.data(name="init_hidden", shape=[1], dtype='float32')
init_cell = layers.data(name="init_cell", shape=[1], dtype='float32')
init_cell.persistable = True
init_hidden.persistable = True
init_hidden = layers.reshape(
init_hidden_reshape = layers.reshape(
init_hidden, shape=[num_layers, -1, hidden_size])
init_cell = layers.reshape(init_cell, shape=[num_layers, -1, hidden_size])
init_cell_reshape = layers.reshape(
init_cell, shape=[num_layers, -1, hidden_size])
x_emb = layers.embedding(
input=x,
@@ -250,50 +325,84 @@ def lm_model(hidden_size,
initializer=fluid.initializer.UniformInitializer(
low=-init_scale, high=init_scale)))
x_emb = layers.reshape(x_emb, shape=[-1, num_steps, hidden_size])
x_emb = layers.reshape(
x_emb, shape=[-1, num_steps, hidden_size], inplace=True)
if dropout != None and dropout > 0.0:
x_emb = layers.dropout(
x_emb,
dropout_prob=dropout,
dropout_implementation='upscale_in_train')
if rnn_model == "padding":
rnn_out, last_hidden, last_cell = padding_rnn(
x_emb, len=num_steps, init_hidden=init_hidden, init_cell=init_cell)
x_emb,
len=num_steps,
init_hidden=init_hidden_reshape,
init_cell=init_cell_reshape)
elif rnn_model == "static":
rnn_out, last_hidden, last_cell = encoder_static(
x_emb, len=num_steps, init_hidden=init_hidden, init_cell=init_cell)
x_emb,
len=num_steps,
init_hidden=init_hidden_reshape,
init_cell=init_cell_reshape)
elif rnn_model == "cudnn":
x_emb = layers.transpose( x_emb, perm=[1, 0, 2])
rnn_out, last_hidden, last_cell = layers.lstm( x_emb, init_hidden, init_cell, num_steps, hidden_size, num_layers, \
is_bidirec=False, \
default_initializer=fluid.initializer.UniformInitializer(low=-init_scale, high=init_scale) )
rnn_out = layers.transpose( rnn_out, perm=[1, 0, 2])
x_emb = layers.transpose(x_emb, perm=[1, 0, 2])
rnn_out, last_hidden, last_cell = layers.lstm(
x_emb,
init_hidden_reshape,
init_cell_reshape,
num_steps,
hidden_size,
num_layers,
is_bidirec=False,
default_initializer=fluid.initializer.UniformInitializer(
low=-init_scale, high=init_scale))
rnn_out = layers.transpose(rnn_out, perm=[1, 0, 2])
else:
print( "type not support")
print("type not support")
return
rnn_out = layers.reshape(rnn_out, shape=[-1, num_steps, hidden_size])
softmax_weight = layers.create_parameter([hidden_size, vocab_size], dtype="float32", name="softmax_weight", \
default_initializer=fluid.initializer.UniformInitializer(low=-init_scale, high=init_scale))
softmax_bias = layers.create_parameter([vocab_size], dtype="float32", name='softmax_bias', \
default_initializer=fluid.initializer.UniformInitializer(low=-init_scale, high=init_scale))
rnn_out = layers.reshape(
rnn_out, shape=[-1, num_steps, hidden_size], inplace=True)
softmax_weight = layers.create_parameter(
[hidden_size, vocab_size],
dtype="float32",
name="softmax_weight",
default_initializer=fluid.initializer.UniformInitializer(
low=-init_scale, high=init_scale))
softmax_bias = layers.create_parameter(
[vocab_size],
dtype="float32",
name='softmax_bias',
default_initializer=fluid.initializer.UniformInitializer(
low=-init_scale, high=init_scale))
projection = layers.matmul(rnn_out, softmax_weight)
projection = layers.elementwise_add(projection, softmax_bias)
projection = layers.reshape(projection, shape=[-1, vocab_size])
#y = layers.reshape( y, shape=[-1, vocab_size])
projection = layers.reshape(
projection, shape=[-1, vocab_size], inplace=True)
loss = layers.softmax_with_cross_entropy(
logits=projection, label=y, soft_label=False)
loss = layers.reshape(loss, shape=[-1, num_steps])
loss = layers.reshape(loss, shape=[-1, num_steps], inplace=True)
loss = layers.reduce_mean(loss, dim=[0])
loss = layers.reduce_sum(loss)
loss.permissions = True
loss.persistable = True
last_cell.persistable = True
last_hidden.persistable = True
# This will feed last_hidden, last_cell to init_hidden, init_cell, which
# can be used directly in next batch. This can avoid the fetching of
# last_hidden and last_cell and feeding of init_hidden and init_cell in
# each training step.
layers.assign(input=last_cell, output=init_cell)
layers.assign(input=last_hidden, output=init_hidden)
feeding_list = ['x', 'y', 'init_hidden', 'init_cell']
return loss, last_hidden, last_cell, feeding_list
if use_py_reader:
return loss, last_hidden, last_cell, feeding_list, py_reader
else:
return loss, last_hidden, last_cell, feeding_list