From ac7a2931692ddbba323292f2f70b6a055f0ff687 Mon Sep 17 00:00:00 2001
From: Yi Liu
Date: Tue, 24 Apr 2018 17:11:10 +0800
Subject: [PATCH] append a language model in fluid (#873)

* add a language model sample in fluid
* minor update
* fix code style
---
 fluid/language_model/README.md | 148 ++++++++++++++++++++++++++++++
 fluid/language_model/infer.py  |  65 +++++++++++++
 fluid/language_model/train.py  | 162 +++++++++++++++++++++++++++++++++
 fluid/language_model/utils.py  |  40 ++++++++
 4 files changed, 415 insertions(+)
 create mode 100644 fluid/language_model/README.md
 create mode 100644 fluid/language_model/infer.py
 create mode 100644 fluid/language_model/train.py
 create mode 100644 fluid/language_model/utils.py

diff --git a/fluid/language_model/README.md b/fluid/language_model/README.md
new file mode 100644
index 00000000..91ce2d7f
--- /dev/null
+++ b/fluid/language_model/README.md
@@ -0,0 +1,148 @@
# Language Model

The following is a brief overview of this example's directory structure:

```text
.
├── README.md # documentation
├── train.py  # training script
├── infer.py  # inference script
└── utils.py  # common utilities
```


## Introduction

For an introduction to recurrent neural network language models, see the paper [Recurrent Neural Network Regularization](https://arxiv.org/abs/1409.2329). This example implements a GRU-RNN language model.

## Training

Run `python train.py` to start training the model.

```shell
python train.py
```

The currently supported parameters can be found in the `train_net` function in [train.py](./train.py):

```python
vocab, train_reader, test_reader = utils.prepare_data(
    batch_size=20,          # batch size
    buffer_size=1000,       # shuffle buffer size; the default value is fine
    word_freq_threshold=0)  # words with frequency below this value are filtered out of the vocabulary

train(train_reader=train_reader,
      vocab=vocab,
      network=network,
      hid_size=200,          # embedding and hidden size
      base_lr=1.0,           # base learning rate
      batch_size=20,         # batch size, the same as in prepare_data
      pass_num=12,           # number of training passes (epochs)
      use_cuda=True,         # whether to use a GPU
      parallel=False,        # whether to train in parallel
      model_dir="model",     # directory in which to save the model
      init_low_bound=-0.1,   # lower bound for uniform parameter initialization
      init_high_bound=0.1)   # upper bound for uniform parameter initialization
```

## Customizing the Network

The network structure can be adjusted in the `network` function in [train.py](./train.py); the current structure is as follows:

```python
emb = fluid.layers.embedding(
    input=src, size=[vocab_size, hid_size],
    param_attr=fluid.ParamAttr(
        initializer=fluid.initializer.Uniform(
            low=init_low_bound, high=init_high_bound),
        learning_rate=emb_lr_x),
    is_sparse=True)

fc0 = fluid.layers.fc(
    input=emb, size=hid_size * 3,
    param_attr=fluid.ParamAttr(
        initializer=fluid.initializer.Uniform(
            low=init_low_bound, high=init_high_bound),
        learning_rate=gru_lr_x))
gru_h0 = fluid.layers.dynamic_gru(
    input=fc0, size=hid_size,
    param_attr=fluid.ParamAttr(
        initializer=fluid.initializer.Uniform(
            low=init_low_bound, high=init_high_bound),
        learning_rate=gru_lr_x))

fc = fluid.layers.fc(
    input=gru_h0, size=vocab_size, act='softmax',
    param_attr=fluid.ParamAttr(
        initializer=fluid.initializer.Uniform(
            low=init_low_bound, high=init_high_bound),
        learning_rate=fc_lr_x))

cost = fluid.layers.cross_entropy(input=fc, label=dst)
```
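The `ppl` value reported in the training and inference logs is the perplexity, i.e. the exponential of the average per-word cross-entropy cost. A minimal sketch of that conversion (the cost values here are invented for illustration):

```python
import math

# Hypothetical average per-word cross-entropy costs for a few steps.
avg_costs = [6.65, 6.11, 6.47]
for step, cost in enumerate(avg_costs, 1):
    # Mirrors how train.py derives the logged ppl value from avg_cost.
    print("step:%d ppl:%.3f" % (step, math.exp(cost)))
```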
## Sample Training Results

The log from training on a single Tesla K40m GPU is shown below:

```text
epoch_1 start
step:100 ppl:771.053
step:200 ppl:449.597
step:300 ppl:642.654
step:400 ppl:458.128
step:500 ppl:510.912
step:600 ppl:451.545
step:700 ppl:364.404
step:800 ppl:324.272
step:900 ppl:360.797
step:1000 ppl:275.761
step:1100 ppl:294.599
step:1200 ppl:335.877
step:1300 ppl:185.262
step:1400 ppl:241.744
step:1500 ppl:211.507
step:1600 ppl:233.431
step:1700 ppl:298.767
step:1800 ppl:203.403
step:1900 ppl:158.828
step:2000 ppl:171.148
step:2100 ppl:280.884
epoch:1 num_steps:2104 time_cost(s):47.478780
model saved in model/epoch_1
epoch_2 start
step:100 ppl:238.099
step:200 ppl:136.527
step:300 ppl:204.184
step:400 ppl:252.886
step:500 ppl:177.377
step:600 ppl:197.688
step:700 ppl:131.650
step:800 ppl:223.906
step:900 ppl:144.785
step:1000 ppl:176.286
step:1100 ppl:148.158
step:1200 ppl:203.581
step:1300 ppl:168.208
step:1400 ppl:159.412
step:1500 ppl:114.032
step:1600 ppl:157.985
step:1700 ppl:147.743
step:1800 ppl:88.676
step:1900 ppl:141.962
step:2000 ppl:106.087
step:2100 ppl:122.709
epoch:2 num_steps:2104 time_cost(s):47.583789
model saved in model/epoch_2
...
```

## Inference

Run `python infer.py model_dir start_epoch last_epoch(inclusive)` to run inference, where start_epoch specifies the first epoch to evaluate and last_epoch the last epoch (inclusive), e.g.:

```shell
python infer.py model 1 12 # prediction from epoch 1 to epoch 12
```

## Sample Inference Results

```text
model:model/epoch_1 ppl:254.540 time_cost(s):3.29
model:model/epoch_2 ppl:177.671 time_cost(s):3.27
model:model/epoch_3 ppl:156.251 time_cost(s):3.27
model:model/epoch_4 ppl:139.036 time_cost(s):3.27
model:model/epoch_5 ppl:132.661 time_cost(s):3.27
model:model/epoch_6 ppl:130.092 time_cost(s):3.28
model:model/epoch_7 ppl:128.751 time_cost(s):3.27
model:model/epoch_8 ppl:125.411 time_cost(s):3.27
model:model/epoch_9 ppl:124.604 time_cost(s):3.28
model:model/epoch_10 ppl:124.754 time_cost(s):3.29
model:model/epoch_11 ppl:125.421 time_cost(s):3.27
model:model/epoch_12 ppl:125.676 time_cost(s):3.27
```
diff --git a/fluid/language_model/infer.py b/fluid/language_model/infer.py
new file mode 100644
index 00000000..a183d548
--- /dev/null
+++ b/fluid/language_model/infer.py
@@ -0,0 +1,65 @@
import sys
import time
import math

import numpy as np

import paddle.fluid as fluid
import paddle.v2 as paddle

import utils


def infer(test_reader, use_cuda, model_path):
    """ inference function """
    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
    exe = fluid.Executor(place)

    with fluid.scope_guard(fluid.core.Scope()):
        infer_program, feed_target_names, fetch_vars = fluid.io.load_inference_model(
            model_path, exe)

        accum_cost = 0.0
        accum_words = 0
        t0 = time.time()
        for data in test_reader():
            src_wordseq = utils.to_lodtensor(map(lambda x: x[0], data), place)
            dst_wordseq = utils.to_lodtensor(map(lambda x: x[1], data), place)
            avg_cost = exe.run(
                infer_program,
                feed={"src_wordseq": src_wordseq,
                      "dst_wordseq": dst_wordseq},
                fetch_list=fetch_vars)

            # The last LoD offset is the total number of words in the batch.
            nwords = src_wordseq.lod()[0][-1]

            cost = np.array(avg_cost) * nwords
            accum_cost += cost
            accum_words += nwords

        ppl = math.exp(accum_cost / accum_words)
        t1 = time.time()
        print("model:%s ppl:%.3f time_cost(s):%.2f" %
              (model_path, ppl, t1 - t0))


if __name__ == "__main__":
    if len(sys.argv) != 4:
        print("Usage: %s model_dir start_epoch last_epoch(inclusive)" %
              sys.argv[0])
        exit(0)

    model_dir = sys.argv[1]
    try:
        start_index = int(sys.argv[2])
        last_index = int(sys.argv[3])
    except ValueError:
        print("Usage: %s model_dir start_epoch last_epoch(inclusive)" %
              sys.argv[0])
        exit(-1)

    vocab, train_reader, test_reader = utils.prepare_data(
        batch_size=20, buffer_size=1000, word_freq_threshold=0)

    for epoch in xrange(start_index, last_index + 1):
        epoch_path = model_dir + "/epoch_" + str(epoch)
        infer(test_reader=test_reader, use_cuda=True, model_path=epoch_path)
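Note how `infer` computes a corpus-level perplexity: each batch's average cost is weighted by its word count (the last LoD offset) before exponentiating, rather than averaging per-batch perplexities. A minimal sketch of that aggregation with invented numbers:

```python
import math

# Hypothetical (average cost, word count) pairs for three batches.
batches = [(5.1, 400), (4.9, 380), (5.0, 410)]
accum_cost = sum(cost * nwords for cost, nwords in batches)
accum_words = sum(nwords for _, nwords in batches)
# Perplexity over the whole corpus, as in infer.py above.
print("ppl:%.3f" % math.exp(accum_cost / accum_words))
```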
diff --git a/fluid/language_model/train.py b/fluid/language_model/train.py
new file mode 100644
index 00000000..59fc3a98
--- /dev/null
+++ b/fluid/language_model/train.py
@@ -0,0 +1,162 @@
import sys
import time
import math

import numpy as np

import paddle.fluid as fluid
import paddle.v2 as paddle

import utils


def network(src, dst, vocab_size, hid_size, init_low_bound, init_high_bound):
    """ network definition """
    emb_lr_x = 10.0
    gru_lr_x = 1.0
    fc_lr_x = 1.0
    emb = fluid.layers.embedding(
        input=src,
        size=[vocab_size, hid_size],
        param_attr=fluid.ParamAttr(
            initializer=fluid.initializer.Uniform(
                low=init_low_bound, high=init_high_bound),
            learning_rate=emb_lr_x),
        is_sparse=True)

    # dynamic_gru expects its input to already hold the concatenated
    # projections for its three gates, hence the hid_size * 3 width.
    fc0 = fluid.layers.fc(input=emb,
                          size=hid_size * 3,
                          param_attr=fluid.ParamAttr(
                              initializer=fluid.initializer.Uniform(
                                  low=init_low_bound, high=init_high_bound),
                              learning_rate=gru_lr_x))
    gru_h0 = fluid.layers.dynamic_gru(
        input=fc0,
        size=hid_size,
        param_attr=fluid.ParamAttr(
            initializer=fluid.initializer.Uniform(
                low=init_low_bound, high=init_high_bound),
            learning_rate=gru_lr_x))

    fc = fluid.layers.fc(input=gru_h0,
                         size=vocab_size,
                         act='softmax',
                         param_attr=fluid.ParamAttr(
                             initializer=fluid.initializer.Uniform(
                                 low=init_low_bound, high=init_high_bound),
                             learning_rate=fc_lr_x))

    cost = fluid.layers.cross_entropy(input=fc, label=dst)
    return cost


def train(train_reader,
          vocab,
          network,
          hid_size,
          base_lr,
          batch_size,
          pass_num,
          use_cuda,
          parallel,
          model_dir,
          init_low_bound=-0.04,
          init_high_bound=0.04):
    """ train network """
    vocab_size = len(vocab)

    src_wordseq = fluid.layers.data(
        name="src_wordseq", shape=[1], dtype="int64", lod_level=1)
    dst_wordseq = fluid.layers.data(
        name="dst_wordseq", shape=[1], dtype="int64", lod_level=1)

    avg_cost = None
    if not parallel:
        cost = network(src_wordseq, dst_wordseq, vocab_size, hid_size,
                       init_low_bound, init_high_bound)
        avg_cost = fluid.layers.mean(x=cost)
    else:
        places = fluid.layers.get_places()
        pd = fluid.layers.ParallelDo(places)
        with pd.do():
            cost = network(
                pd.read_input(src_wordseq),
                pd.read_input(dst_wordseq), vocab_size, hid_size,
                init_low_bound, init_high_bound)
            pd.write_output(cost)

        cost = pd()
        avg_cost = fluid.layers.mean(x=cost)

    sgd_optimizer = fluid.optimizer.SGD(
        learning_rate=fluid.layers.exponential_decay(
            learning_rate=base_lr,
            decay_steps=2100 * 4,
            decay_rate=0.5,
            staircase=True))
    sgd_optimizer.minimize(avg_cost)

    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
    exe = fluid.Executor(place)

    exe.run(fluid.default_startup_program())
    total_time = 0.0
    for pass_idx in xrange(pass_num):
        epoch_idx = pass_idx + 1
        print("epoch_%d start" % epoch_idx)

        t0 = time.time()
        i = 0
        for data in train_reader():
            i += 1
            lod_src_wordseq = utils.to_lodtensor(
                map(lambda x: x[0], data), place)
            lod_dst_wordseq = utils.to_lodtensor(
                map(lambda x: x[1], data), place)
            ret_avg_cost = exe.run(fluid.default_main_program(),
                                   feed={
                                       "src_wordseq": lod_src_wordseq,
                                       "dst_wordseq": lod_dst_wordseq
                                   },
                                   fetch_list=[avg_cost],
                                   use_program_cache=True)
            avg_ppl = math.exp(ret_avg_cost[0])
            if i % 100 == 0:
                print("step:%d ppl:%.3f" % (i, avg_ppl))

        t1 = time.time()
        total_time += t1 - t0
        print("epoch:%d num_steps:%d time_cost(s):%f" %
              (epoch_idx, i, total_time / epoch_idx))

        save_dir = "%s/epoch_%d" % (model_dir, epoch_idx)
        feed_var_names = ["src_wordseq", "dst_wordseq"]
        fetch_vars = [avg_cost]
        fluid.io.save_inference_model(save_dir, feed_var_names, fetch_vars, exe)
        print("model saved in %s" % save_dir)

    print("finish training")


def train_net():
    """ do training """
    batch_size = 20
    vocab, train_reader, test_reader = utils.prepare_data(
        batch_size=batch_size, buffer_size=1000, word_freq_threshold=0)
    train(
        train_reader=train_reader,
        vocab=vocab,
        network=network,
        hid_size=200,
        base_lr=1.0,
        batch_size=batch_size,
        pass_num=12,
        use_cuda=True,
        parallel=False,
        model_dir="model",
        init_low_bound=-0.1,
        init_high_bound=0.1)


if __name__ == "__main__":
    train_net()
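The staircase exponential decay configured above halves the learning rate every `decay_steps` batches (roughly four epochs of about 2100 batches each, judging from the training log). A minimal sketch of the schedule it produces, using the values from `train()` and hypothetical step counts:

```python
base_lr, decay_rate, decay_steps = 1.0, 0.5, 2100 * 4
for step in [0, 4200, 8400, 16800]:
    # Staircase decay: lr = base_lr * decay_rate ** floor(step / decay_steps)
    lr = base_lr * decay_rate ** (step // decay_steps)
    print("step:%d lr:%.4f" % (step, lr))
```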
diff --git a/fluid/language_model/utils.py b/fluid/language_model/utils.py
new file mode 100644
index 00000000..c5909046
--- /dev/null
+++ b/fluid/language_model/utils.py
@@ -0,0 +1,40 @@
import numpy as np

import paddle.fluid as fluid
import paddle.v2 as paddle


def to_lodtensor(data, place):
    """ convert a batch of sequences to a LoDTensor """
    # Build the LoD offsets: lod[i] is where the i-th sequence starts in
    # the flattened tensor, and lod[-1] is the total number of words.
    seq_lens = [len(seq) for seq in data]
    cur_len = 0
    lod = [cur_len]
    for l in seq_lens:
        cur_len += l
        lod.append(cur_len)
    flattened_data = np.concatenate(data, axis=0).astype("int64")
    flattened_data = flattened_data.reshape([len(flattened_data), 1])
    res = fluid.LoDTensor()
    res.set(flattened_data, place)
    res.set_lod([lod])
    return res


def prepare_data(batch_size, buffer_size=1000, word_freq_threshold=0):
    """ prepare the English Penn Treebank (PTB) data """
    vocab = paddle.dataset.imikolov.build_dict(word_freq_threshold)
    train_reader = paddle.batch(
        paddle.reader.shuffle(
            paddle.dataset.imikolov.train(
                vocab,
                buffer_size,
                data_type=paddle.dataset.imikolov.DataType.SEQ),
            buf_size=buffer_size),
        batch_size)
    test_reader = paddle.batch(
        paddle.dataset.imikolov.test(
            vocab, buffer_size, data_type=paddle.dataset.imikolov.DataType.SEQ),
        batch_size)
    return vocab, train_reader, test_reader
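The readers returned by `prepare_data` can be consumed as sketched below (parameter values mirror the defaults in train.py; in the SEQ layout each sample is assumed to be a (source, target) pair of word-id lists, with the target shifted one word ahead of the source):

```python
import utils

vocab, train_reader, test_reader = utils.prepare_data(
    batch_size=20, buffer_size=1000, word_freq_threshold=0)
print("vocab size: %d" % len(vocab))

for batch in train_reader():
    src, dst = batch[0]  # first sample of the first batch
    # src and dst have equal length; dst is src shifted by one word.
    print("sample lengths: %d %d" % (len(src), len(dst)))
    break
```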