diff --git a/fluid/language_model/gru/.run_ce.sh b/fluid/language_model/gru/.run_ce.sh deleted file mode 100644 index 5ee2d8aa0582b2b8504f9ba645b6252aa75f23bf..0000000000000000000000000000000000000000 --- a/fluid/language_model/gru/.run_ce.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/bin/bash - -export MKL_NUM_THREADS=1 -export OMP_NUM_THREADS=1 - -cudaid=${language_model:=0} # use 0-th card as default -export CUDA_VISIBLE_DEVICES=$cudaid - -FLAGS_benchmark=true python train.py --enable_ce | python _ce.py - -cudaid=${language_model_m:=0,1,2,3} # use 0,1,2,3 card as default -export CUDA_VISIBLE_DEVICES=$cudaid - -FLAGS_benchmark=true python train.py --enable_ce | python _ce.py diff --git a/fluid/language_model/gru/README.md b/fluid/language_model/gru/README.md deleted file mode 100644 index 91ce2d7f58085b56da2ac2dec03af2a05985ab8f..0000000000000000000000000000000000000000 --- a/fluid/language_model/gru/README.md +++ /dev/null @@ -1,148 +0,0 @@ -# 语言模型 - -以下是本例的简要目录结构及说明: - -```text -. -├── README.md # 文档 -├── train.py # 训练脚本 -├── infer.py # 预测脚本 -└── utils.py # 通用函数 -``` - - -## 简介 - -循环神经网络语言模型的介绍可以参阅论文[Recurrent Neural Network Regularization](https://arxiv.org/abs/1409.2329),在本例中,我们实现了GRU-RNN语言模型。 - -## 训练 - -运行命令 `python train.py` 开始训练模型。 -```python -python train.py -``` - -当前支持的参数可参见[train.py](./train.py) `train_net` 函数 -```python -vocab, train_reader, test_reader = utils.prepare_data( - batch_size=20, # batch size - buffer_size=1000, # buffer size, default value is OK - word_freq_threshold=0) # vocabulary related parameter, and words with frequency below this value will be filtered - -train(train_reader=train_reader, - vocab=vocab, - network=network, - hid_size=200, # embedding and hidden size - base_lr=1.0, # base learning rate - batch_size=20, # batch size, the same as that in prepare_data - pass_num=12, # the number of passes for training - use_cuda=True, # whether to use GPU card - parallel=False, # whether to be parallel - model_dir="model", # directory to save 
model - init_low_bound=-0.1, # uniform parameter initialization lower bound - init_high_bound=0.1) # uniform parameter initialization upper bound -``` - -## 自定义网络结构 - -可在[train.py](./train.py) `network` 函数中调整网络结构,当前的网络结构如下: -```python -emb = fluid.layers.embedding(input=src, size=[vocab_size, hid_size], - param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Uniform(low=init_low_bound, high=init_high_bound), - learning_rate=emb_lr_x), - is_sparse=True) - -fc0 = fluid.layers.fc(input=emb, size=hid_size * 3, - param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Uniform(low=init_low_bound, high=init_high_bound), - learning_rate=gru_lr_x)) -gru_h0 = fluid.layers.dynamic_gru(input=fc0, size=hid_size, - param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Uniform(low=init_low_bound, high=init_high_bound), - learning_rate=gru_lr_x)) - -fc = fluid.layers.fc(input=gru_h0, size=vocab_size, act='softmax', - param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Uniform(low=init_low_bound, high=init_high_bound), - learning_rate=fc_lr_x)) - -cost = fluid.layers.cross_entropy(input=fc, label=dst) -``` - -## 训练结果示例 - -我们在Tesla K40m单GPU卡上训练的日志如下所示 -```text -epoch_1 start -step:100 ppl:771.053 -step:200 ppl:449.597 -step:300 ppl:642.654 -step:400 ppl:458.128 -step:500 ppl:510.912 -step:600 ppl:451.545 -step:700 ppl:364.404 -step:800 ppl:324.272 -step:900 ppl:360.797 -step:1000 ppl:275.761 -step:1100 ppl:294.599 -step:1200 ppl:335.877 -step:1300 ppl:185.262 -step:1400 ppl:241.744 -step:1500 ppl:211.507 -step:1600 ppl:233.431 -step:1700 ppl:298.767 -step:1800 ppl:203.403 -step:1900 ppl:158.828 -step:2000 ppl:171.148 -step:2100 ppl:280.884 -epoch:1 num_steps:2104 time_cost(s):47.478780 -model saved in model/epoch_1 -epoch_2 start -step:100 ppl:238.099 -step:200 ppl:136.527 -step:300 ppl:204.184 -step:400 ppl:252.886 -step:500 ppl:177.377 -step:600 ppl:197.688 -step:700 ppl:131.650 -step:800 ppl:223.906 -step:900 ppl:144.785 -step:1000 ppl:176.286 -step:1100 
ppl:148.158 -step:1200 ppl:203.581 -step:1300 ppl:168.208 -step:1400 ppl:159.412 -step:1500 ppl:114.032 -step:1600 ppl:157.985 -step:1700 ppl:147.743 -step:1800 ppl:88.676 -step:1900 ppl:141.962 -step:2000 ppl:106.087 -step:2100 ppl:122.709 -epoch:2 num_steps:2104 time_cost(s):47.583789 -model saved in model/epoch_2 -... -``` - -## 预测 -运行命令 `python infer.py model_dir start_epoch last_epoch(inclusive)` 开始预测,其中,start_epoch指定开始预测的轮次,last_epoch指定结束的轮次,例如 -```python -python infer.py model 1 12 # prediction from epoch 1 to epoch 12 -``` - -## 预测结果示例 -```text -model:model/epoch_1 ppl:254.540 time_cost(s):3.29 -model:model/epoch_2 ppl:177.671 time_cost(s):3.27 -model:model/epoch_3 ppl:156.251 time_cost(s):3.27 -model:model/epoch_4 ppl:139.036 time_cost(s):3.27 -model:model/epoch_5 ppl:132.661 time_cost(s):3.27 -model:model/epoch_6 ppl:130.092 time_cost(s):3.28 -model:model/epoch_7 ppl:128.751 time_cost(s):3.27 -model:model/epoch_8 ppl:125.411 time_cost(s):3.27 -model:model/epoch_9 ppl:124.604 time_cost(s):3.28 -model:model/epoch_10 ppl:124.754 time_cost(s):3.29 -model:model/epoch_11 ppl:125.421 time_cost(s):3.27 -model:model/epoch_12 ppl:125.676 time_cost(s):3.27 -``` diff --git a/fluid/language_model/gru/_ce.py b/fluid/language_model/gru/_ce.py deleted file mode 100644 index d4999d7a1e14e333f1c7056b3dc2c5b506682ec6..0000000000000000000000000000000000000000 --- a/fluid/language_model/gru/_ce.py +++ /dev/null @@ -1,62 +0,0 @@ -# this file is only used for continuous evaluation test! 
def parse_log(log):
    """Yield ``(kpi_name, kpi_value)`` pairs found in a training log.

    Every line of ``log`` is stripped, split on tab characters and echoed
    to stdout. Only lines made of exactly three fields whose first field
    is the literal ``kpis`` produce a pair; the third field is converted
    with ``float``.

    Example of a line that is picked up::

        kpis<TAB>train_cost<TAB>1.0
    """
    for raw_line in log.split('\n'):
        fields = raw_line.strip().split('\t')
        print(fields)
        if len(fields) != 3 or fields[0] != 'kpis':
            continue
        _, name, value = fields
        yield name, float(value)


def log_to_ce(log):
    """Record every KPI parsed from ``log`` on its tracker and persist it.

    Trackers are looked up by name among ``tracking_kpis``; a parsed name
    that is not tracked raises ``KeyError``.
    """
    trackers = dict((kpi.name, kpi) for kpi in tracking_kpis)

    for name, value in parse_log(log):
        print(name, value)
        tracker = trackers[name]
        tracker.add_record(value)
        tracker.persist()


if __name__ == '__main__':
    log_to_ce(sys.stdin.read())
if __name__ == "__main__":
    # Command line: infer.py model_dir start_epoch last_epoch(inclusive)
    # The usage text is now formatted with the program name (the original
    # printed a literal "%s"), and both failure paths exit non-zero.
    usage = ("Usage: %s model_dir start_epoch last_epoch(inclusive)" %
             sys.argv[0])
    if len(sys.argv) != 4:
        print(usage)
        sys.exit(1)

    model_dir = sys.argv[1]
    try:
        start_index = int(sys.argv[2])
        last_index = int(sys.argv[3])
    except ValueError:
        # Only a non-integer epoch index is expected here; anything else
        # should surface as a real error instead of a usage message.
        print(usage)
        sys.exit(1)

    vocab, train_reader, test_reader = utils.prepare_data(
        batch_size=20, buffer_size=1000, word_freq_threshold=0)

    # Evaluate every saved epoch directory in the requested range.
    for epoch in six.moves.xrange(start_index, last_index + 1):
        epoch_path = model_dir + "/epoch_" + str(epoch)
        infer(test_reader=test_reader, use_cuda=True, model_path=epoch_path)
def network(src, dst, vocab_size, hid_size, init_low_bound, init_high_bound):
    """Build the GRU language-model graph and return the per-word cost.

    The graph is: sparse embedding -> fc (``hid_size * 3``) -> dynamic GRU
    -> softmax fc over the vocabulary -> cross entropy against ``dst``.
    All parameters are drawn uniformly from
    ``[init_low_bound, init_high_bound]``.
    """
    emb_lr_x = 10.0
    gru_lr_x = 1.0
    fc_lr_x = 1.0

    def uniform_attr(learning_rate):
        # A fresh attribute/initializer per layer, exactly as when written
        # out by hand for each layer.
        return fluid.ParamAttr(
            initializer=fluid.initializer.Uniform(
                low=init_low_bound, high=init_high_bound),
            learning_rate=learning_rate)

    emb = fluid.layers.embedding(
        input=src,
        size=[vocab_size, hid_size],
        param_attr=uniform_attr(emb_lr_x),
        is_sparse=True)

    fc0 = fluid.layers.fc(input=emb,
                          size=hid_size * 3,
                          param_attr=uniform_attr(gru_lr_x))
    gru_h0 = fluid.layers.dynamic_gru(
        input=fc0, size=hid_size, param_attr=uniform_attr(gru_lr_x))

    fc = fluid.layers.fc(input=gru_h0,
                         size=vocab_size,
                         act='softmax',
                         param_attr=uniform_attr(fc_lr_x))

    cost = fluid.layers.cross_entropy(input=fc, label=dst)
    return cost


def train(train_reader,
          vocab,
          network,
          hid_size,
          base_lr,
          batch_size,
          pass_num,
          use_cuda,
          parallel,
          model_dir,
          init_low_bound=-0.04,
          init_high_bound=0.04):
    """Train the language model and save an inference model per epoch.

    Args:
        train_reader: callable returning an iterable of batches; each
            sample is indexed as ``(source_ids, target_ids)``.
        vocab: vocabulary; only its length is used here.
        network: callable with the signature of ``network`` above.
        hid_size: embedding / hidden size forwarded to ``network``.
        base_lr: initial learning rate of the exponentially decayed SGD.
        batch_size: accepted for interface compatibility; not used here.
        pass_num: number of passes over ``train_reader``.
        use_cuda: run on ``CUDAPlace(0)`` and a CUDA ParallelExecutor when
            true, on CPU otherwise.
        parallel: accepted for interface compatibility; not used here.
        model_dir: directory under which ``epoch_<n>`` models are saved.
        init_low_bound: lower bound of the uniform initializer.
        init_high_bound: upper bound of the uniform initializer.
    """
    args = parse_args()
    if args.enable_ce:
        # random seed must set before configuring the network.
        fluid.default_startup_program().random_seed = SEED
    vocab_size = len(vocab)

    # Input data
    src_wordseq = fluid.layers.data(
        name="src_wordseq", shape=[1], dtype="int64", lod_level=1)
    dst_wordseq = fluid.layers.data(
        name="dst_wordseq", shape=[1], dtype="int64", lod_level=1)

    # Train program
    cost = network(src_wordseq, dst_wordseq, vocab_size, hid_size,
                   init_low_bound, init_high_bound)
    avg_cost = fluid.layers.mean(x=cost)

    # Optimization to minimize loss
    sgd_optimizer = fluid.optimizer.SGD(
        learning_rate=fluid.layers.exponential_decay(
            learning_rate=base_lr,
            decay_steps=2100 * 4,
            decay_rate=0.5,
            staircase=True))
    sgd_optimizer.minimize(avg_cost)

    # Initialize executor
    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
    exe = fluid.Executor(place)
    exe.run(fluid.default_startup_program())

    # Honor ``use_cuda`` instead of always requesting CUDA, so the executor
    # agrees with ``place`` chosen above.
    train_exe = fluid.ParallelExecutor(
        use_cuda=use_cuda, loss_name=avg_cost.name)

    total_time = 0.0
    fetch_list = [avg_cost.name]
    for pass_idx in six.moves.xrange(pass_num):
        epoch_idx = pass_idx + 1
        print("epoch_%d start" % epoch_idx)

        t0 = time.time()
        i = 0
        newest_ppl = 0
        for data in train_reader():
            i += 1
            lod_src_wordseq = utils.to_lodtensor([dat[0] for dat in data],
                                                 place)
            lod_dst_wordseq = utils.to_lodtensor([dat[1] for dat in data],
                                                 place)
            ret_avg_cost = train_exe.run(feed={
                "src_wordseq": lod_src_wordseq,
                "dst_wordseq": lod_dst_wordseq
            },
                                         fetch_list=fetch_list)
            avg_ppl = np.exp(ret_avg_cost[0])
            newest_ppl = np.mean(avg_ppl)
            if i % 100 == 0:
                print("step:%d ppl:%.3f" % (i, newest_ppl))

        t1 = time.time()
        total_time += t1 - t0
        print("epoch:%d num_steps:%d time_cost(s):%f" %
              (epoch_idx, i, total_time / epoch_idx))

        if pass_idx == pass_num - 1 and args.enable_ce:
            # Note: The following logs are special for CE monitoring.
            # Other situations do not need to care about these logs.
            # ``get_cards`` needs the whole namespace, not the bare flag.
            gpu_num = get_cards(args)
            # Fields are joined with explicit tabs because the CE log
            # parser splits each line on '\t'.
            if gpu_num == 1:
                print("kpis\timikolov_20_pass_duration\t%s" %
                      (total_time / epoch_idx))
                print("kpis\timikolov_20_avg_ppl\t%s" % newest_ppl)
            else:
                print("kpis\timikolov_20_pass_duration_card%s\t%s" %
                      (gpu_num, total_time / epoch_idx))
                print("kpis\timikolov_20_avg_ppl_card%s\t%s" %
                      (gpu_num, newest_ppl))

        save_dir = "%s/epoch_%d" % (model_dir, epoch_idx)
        feed_var_names = ["src_wordseq", "dst_wordseq"]
        fetch_vars = [avg_cost]
        fluid.io.save_inference_model(save_dir, feed_var_names, fetch_vars, exe)
        print("model saved in %s" % save_dir)

    print("finish training")


def get_cards(args):
    """Return the number of devices the run should account for.

    With ``args.enable_ce`` set, the count is the number of comma-separated
    entries in ``CUDA_VISIBLE_DEVICES``; when that variable is not set at
    all the function falls back to ``args.num_devices`` instead of failing.
    Without ``enable_ce`` it is always ``args.num_devices``.
    """
    if args.enable_ce:
        cards = os.environ.get('CUDA_VISIBLE_DEVICES')
        if cards is not None:
            return len(cards.split(","))
    return args.num_devices


def train_net():
    """Prepare the data and run training with the default hyper-parameters."""
    batch_size = 20
    args = parse_args()
    # The reader batch is scaled by the device count; ``train`` still
    # receives the per-device batch size.
    vocab, train_reader, test_reader = utils.prepare_data(
        batch_size=batch_size * get_cards(args),
        buffer_size=1000,
        word_freq_threshold=0,
        enable_ce=args.enable_ce)
    train(
        train_reader=train_reader,
        vocab=vocab,
        network=network,
        hid_size=200,
        base_lr=1.0,
        batch_size=batch_size,
        pass_num=12,
        use_cuda=True,
        parallel=True,
        model_dir="model",
        init_low_bound=-0.1,
        init_high_bound=0.1)


if __name__ == "__main__":
    train_net()
class DataType(object):
    """Sample layouts that ``reader_creator`` can produce."""
    NGRAM = 1  # fixed-length tuples of n consecutive word ids
    SEQ = 2  # (source id sequence, target id sequence) pairs


# NOTE(review): in the reviewed text every special-token literal appeared
# as an empty string, which makes the start, end and unknown markers
# collide on one key. They look like angle-bracket tokens lost in
# extraction and are restored here following the paddle.dataset.imikolov
# convention -- confirm against the original file.


def word_count(f, word_freq=None):
    """Accumulate word frequencies over an iterable of text lines.

    Args:
        f: iterable of lines (an open file or a list of strings).
        word_freq: optional mapping to update; a new
            ``collections.defaultdict(int)`` is created when omitted.

    Returns:
        The updated mapping. Besides the words themselves, the sentence
        start and end markers are counted once per line.
    """
    if word_freq is None:
        word_freq = collections.defaultdict(int)

    for line in f:
        for w in line.strip().split():
            word_freq[w] += 1
        word_freq['<s>'] += 1
        word_freq['<e>'] += 1

    return word_freq


def build_dict(min_word_freq=50):
    """Build the word -> id dictionary from the train and validation files.

    Words occurring more than ``min_word_freq`` times are kept and numbered
    by decreasing frequency (ties broken alphabetically). The unknown-word
    marker always receives the last id, so an empty vocabulary yields a
    dictionary holding only that marker.
    """
    train_filename = cluster_train_dir + train_file
    valid_filename = cluster_test_dir + valid_file
    # Count straight from the open files so the handles are closed
    # deterministically.
    with open(train_filename) as trainf:
        word_freq = word_count(trainf)
    with open(valid_filename) as validf:
        word_freq = word_count(validf, word_freq)

    # The corpus' own unknown marker must not compete for a regular id.
    word_freq.pop('<unk>', None)

    kept = [item for item in word_freq.items() if item[1] > min_word_freq]
    kept.sort(key=lambda item: (-item[1], item[0]))
    word_idx = dict((word, idx) for idx, (word, _) in enumerate(kept))
    word_idx['<unk>'] = len(kept)
    return word_idx


def reader_creator(filename, word_idx, n, data_type):
    """Create a reader over ``filename`` yielding id-encoded samples.

    Args:
        filename: path of a text file with one sentence per line.
        word_idx: word -> id mapping containing the start, end and unknown
            markers.
        n: for ``DataType.NGRAM`` the gram length; for ``DataType.SEQ`` the
            maximum source length (sequences longer than ``n`` are skipped
            when ``n > 0``).
        data_type: ``DataType.NGRAM`` or ``DataType.SEQ``.

    Returns:
        A zero-argument generator function. It raises ``AssertionError``
        for an unknown ``data_type`` or a negative gram length.
    """

    def reader():
        # Explicit raises keep the original exception type while still
        # firing under ``python -O``.
        if data_type not in (DataType.NGRAM, DataType.SEQ):
            raise AssertionError('Unknow data type')
        if data_type == DataType.NGRAM and not n > -1:
            raise AssertionError('Invalid gram length')

        unk = word_idx['<unk>']
        with open(filename) as f:
            for line in f:
                words = line.strip().split()
                if data_type == DataType.NGRAM:
                    words = ['<s>'] + words + ['<e>']
                    if len(words) >= n:
                        ids = [word_idx.get(w, unk) for w in words]
                        for i in range(n, len(ids) + 1):
                            yield tuple(ids[i - n:i])
                else:
                    ids = [word_idx.get(w, unk) for w in words]
                    src_seq = [word_idx['<s>']] + ids
                    trg_seq = ids + [word_idx['<e>']]
                    if n > 0 and len(src_seq) > n:
                        continue
                    yield src_seq, trg_seq

    return reader
flattened_data.reshape([len(flattened_data), 1]) - res = fluid.LoDTensor() - res.set(flattened_data, place) - res.set_lod([lod]) - return res - - -def prepare_data(batch_size, buffer_size=1000, word_freq_threshold=0): - """ prepare the English Pann Treebank (PTB) data """ - vocab = build_dict(word_freq_threshold) - train_reader = paddle.batch( - paddle.reader.shuffle( - reader_creator( - cluster_train_dir + train_file, - vocab, - buffer_size, - data_type=DataType.SEQ), - buf_size=buffer_size), - batch_size) - test_reader = paddle.batch( - reader_creator( - cluster_test_dir + test_file, - vocab, - buffer_size, - data_type=DataType.SEQ), - batch_size) - return vocab, train_reader, test_reader - - -def network(src, dst, vocab_size, hid_size, init_low_bound, init_high_bound): - """ network definition """ - emb_lr_x = 10.0 - gru_lr_x = 1.0 - fc_lr_x = 1.0 - emb = fluid.layers.embedding( - input=src, - size=[vocab_size, hid_size], - param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Uniform( - low=init_low_bound, high=init_high_bound), - learning_rate=emb_lr_x), - is_sparse=True) - - fc0 = fluid.layers.fc(input=emb, - size=hid_size * 3, - param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Uniform( - low=init_low_bound, high=init_high_bound), - learning_rate=gru_lr_x)) - gru_h0 = fluid.layers.dynamic_gru( - input=fc0, - size=hid_size, - param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Uniform( - low=init_low_bound, high=init_high_bound), - learning_rate=gru_lr_x)) - - fc = fluid.layers.fc(input=gru_h0, - size=vocab_size, - act='softmax', - param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Uniform( - low=init_low_bound, high=init_high_bound), - learning_rate=fc_lr_x)) - - cost = fluid.layers.cross_entropy(input=fc, label=dst) - return cost - - -def do_train(train_reader, - vocab, - network, - hid_size, - base_lr, - batch_size, - pass_num, - use_cuda, - parallel, - model_dir, - init_low_bound=-0.04, - init_high_bound=0.04): - """ train 
network """ - vocab_size = len(vocab) - - src_wordseq = fluid.layers.data( - name="src_wordseq", shape=[1], dtype="int64", lod_level=1) - dst_wordseq = fluid.layers.data( - name="dst_wordseq", shape=[1], dtype="int64", lod_level=1) - - avg_cost = None - if not parallel: - cost = network(src_wordseq, dst_wordseq, vocab_size, hid_size, - init_low_bound, init_high_bound) - avg_cost = fluid.layers.mean(x=cost) - else: - places = fluid.layers.device.get_places() - pd = fluid.layers.ParallelDo(places) - with pd.do(): - cost = network( - pd.read_input(src_wordseq), - pd.read_input(dst_wordseq), vocab_size, hid_size, - init_low_bound, init_high_bound) - pd.write_output(cost) - - cost = pd() - avg_cost = fluid.layers.mean(x=cost) - - sgd_optimizer = fluid.optimizer.SGD( - learning_rate=fluid.layers.exponential_decay( - learning_rate=base_lr, - decay_steps=2100 * 4, - decay_rate=0.5, - staircase=True)) - sgd_optimizer.minimize(avg_cost) - - place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() - exe = fluid.Executor(place) - - exe.run(fluid.default_startup_program()) - total_time = 0.0 - for pass_idx in six.moves.xrange(pass_num): - epoch_idx = pass_idx + 1 - print("epoch_%d start" % epoch_idx) - - t0 = time.time() - i = 0 - for data in train_reader(): - i += 1 - lod_src_wordseq = to_lodtensor([dat[0] for dat in data], place) - lod_dst_wordseq = to_lodtensor([dat[1] for dat in data], place) - ret_avg_cost = exe.run(fluid.default_main_program(), - feed={ - "src_wordseq": lod_src_wordseq, - "dst_wordseq": lod_dst_wordseq - }, - fetch_list=[avg_cost], - use_program_cache=True) - avg_ppl = math.exp(ret_avg_cost[0]) - if i % 100 == 0: - print("step:%d ppl:%.3f" % (i, avg_ppl)) - - t1 = time.time() - total_time += t1 - t0 - print("epoch:%d num_steps:%d time_cost(s):%f" % - (epoch_idx, i, total_time / epoch_idx)) - - save_dir = "%s/epoch_%d" % (model_dir, epoch_idx) - feed_var_names = ["src_wordseq", "dst_wordseq"] - fetch_vars = [avg_cost] - 
def to_lodtensor(data, place):
    """Pack a list of id sequences into a single-level ``fluid.LoDTensor``.

    The sequences are concatenated into one int64 column, and the LoD holds
    the cumulative offsets ``[0, len(s0), len(s0) + len(s1), ...]``.
    """
    offsets = [0]
    for seq in data:
        offsets.append(offsets[-1] + len(seq))

    column = np.concatenate(data, axis=0).astype("int64")
    column = column.reshape([len(column), 1])

    tensor = fluid.LoDTensor()
    tensor.set(column, place)
    tensor.set_lod([offsets])
    return tensor


def prepare_data(batch_size,
                 buffer_size=1000,
                 word_freq_threshold=0,
                 enable_ce=False):
    """Prepare the English Penn Treebank (PTB) data.

    Returns ``(vocab, train_reader, test_reader)``. The training reader is
    shuffled with a buffer of ``buffer_size`` samples unless ``enable_ce``
    is set, in which case samples keep their original order.

    NOTE(review): ``buffer_size`` is also passed as the second positional
    argument of the imikolov readers -- confirm that is intended.
    """
    imikolov = paddle.dataset.imikolov
    seq_type = imikolov.DataType.SEQ

    vocab = imikolov.build_dict(word_freq_threshold)

    train_samples = imikolov.train(vocab, buffer_size, data_type=seq_type)
    if not enable_ce:
        train_samples = paddle.reader.shuffle(
            train_samples, buf_size=buffer_size)
    train_reader = paddle.batch(train_samples, batch_size)

    test_samples = imikolov.test(vocab, buffer_size, data_type=seq_type)
    test_reader = paddle.batch(test_samples, batch_size)
    return vocab, train_reader, test_reader
-├── README.md # 文档 -├── train.py # 训练脚本 -├── reader.py # 数据读取 -└── lm_model.py # 模型定义文件 -``` - - -## 简介 - -循环神经网络语言模型的介绍可以参阅论文[Recurrent Neural Network Regularization](https://arxiv.org/abs/1409.2329),本文主要是说明基于lstm的语言的模型的实现,数据是采用ptb dataset,下载地址为 -http://www.fit.vutbr.cz/~imikolov/rnnlm/simple-examples.tgz - -## 数据下载 -用户可以自行下载数据,并解压, 也可以利用目录中的脚本 - -cd data; sh download_data.sh - -## 训练 - -运行命令 -`CUDA_VISIBLE_DEVICES=0 python train.py --data_path data/simple-examples/data/ --model_type small --use_gpu True` - 开始训练模型。 - -model_type 为模型配置的大小,目前支持 small,medium, large 三种配置形式 - -实现采用双层的lstm,具体的参数和网络配置 可以参考 train.py, lm_model.py 文件中的设置 - - -## 训练结果示例 - -p40中训练日志如下(small config), test 测试集仅在最后一个epoch完成后进行测试 -```text -epoch id 0 -ppl 232 865.86505 1.0 -ppl 464 632.76526 1.0 -ppl 696 510.47153 1.0 -ppl 928 437.60617 1.0 -ppl 1160 393.38422 1.0 -ppl 1392 353.05365 1.0 -ppl 1624 325.73267 1.0 -ppl 1856 305.488 1.0 -ppl 2088 286.3128 1.0 -ppl 2320 270.91504 1.0 -train ppl 270.86246 -valid ppl 181.867964379 -... -ppl 2320 40.975872 0.001953125 -train ppl 40.974102 -valid ppl 117.85741214 -test ppl 113.939103843 -``` -## 与tf结果对比 - -tf采用的版本是1.6 -```text -small config - train valid test -fluid 1.0 40.962 118.111 112.617 -tf 1.6 40.492 118.329 113.788 - -medium config - train valid test -fluid 1.0 45.620 87.398 83.682 -tf 1.6 45.594 87.363 84.015 - -large config - train valid test -fluid 1.0 37.221 82.358 78.137 -tf 1.6 38.342 82.311 78.121 -``` diff --git a/fluid/language_model/lstm/_ce.py b/fluid/language_model/lstm/_ce.py deleted file mode 100644 index f537f6aa62dd502c79174fadedda0da621c8eb7b..0000000000000000000000000000000000000000 --- a/fluid/language_model/lstm/_ce.py +++ /dev/null @@ -1,56 +0,0 @@ -# this file is only used for continuous evaluation test! 
def parse_log(log):
    """Yield ``(kpi_name, kpi_value)`` pairs found in a training log.

    Every line of ``log`` is stripped, split on tab characters and echoed
    to stdout. A pair is produced from the first two fields when the second
    one parses as a float. Lines with fewer than two fields (blank lines,
    the empty tail left by a trailing newline, untabbed progress output)
    and lines whose value is not numeric are skipped instead of raising.
    """
    for line in log.split('\n'):
        fs = line.strip().split('\t')
        print(fs)
        if len(fs) < 2:
            continue
        try:
            kpi_value = float(fs[1])
        except ValueError:
            # Not a "name<TAB>number" record; ordinary log output.
            continue
        yield fs[0], kpi_value


def log_to_ce(log):
    """Record every tracked KPI parsed from ``log`` and persist it.

    The whole training output is piped into this script, so parsed names
    that are not among ``tracking_kpis`` are ignored rather than treated
    as an error.
    """
    kpi_tracker = dict((kpi.name, kpi) for kpi in tracking_kpis)

    for (kpi_name, kpi_value) in parse_log(log):
        print(kpi_name, kpi_value)
        kpi = kpi_tracker.get(kpi_name)
        if kpi is None:
            continue
        kpi.add_record(kpi_value)
        kpi.persist()


if __name__ == '__main__':
    log = sys.stdin.read()
    log_to_ce(log)
def _str2bool(value):
    """Convert a command-line token such as ``True``/``false``/``0`` to bool."""
    if isinstance(value, bool):
        return value
    text = value.strip().lower()
    if text in ("1", "true", "t", "yes", "y", "on"):
        return True
    # The empty string stays falsy, as it was under ``type=bool``.
    if text in ("", "0", "false", "f", "no", "n", "off"):
        return False
    raise argparse.ArgumentTypeError(
        "expected a boolean value, got %r" % value)


def parse_args():
    """Parse the command-line options of the LSTM language-model trainer.

    Returns:
        The ``argparse`` namespace with ``model_type``, ``data_path``,
        ``para_init``, ``use_gpu``, ``log_path`` and ``enable_ce``.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    # NOTE(review): the help text lists [test|small|med|big]; confirm it
    # matches the values the training script actually accepts.
    parser.add_argument(
        "--model_type",
        type=str,
        default="small",
        help="model_type [test|small|med|big]")
    parser.add_argument(
        "--data_path", type=str, help="all the data for train,valid,test")
    parser.add_argument('--para_init', action='store_true')
    # ``type=bool`` would turn any non-empty string -- including "False" --
    # into True, so the value is parsed explicitly.
    parser.add_argument(
        '--use_gpu', type=_str2bool, default=False, help='whether using gpu')
    parser.add_argument(
        '--log_path',
        help='path of the log file. If not set, logs are printed to console')
    parser.add_argument('--enable_ce', action='store_true')
    args = parser.parse_args()
    return args
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import paddle.fluid.layers as layers -import paddle.fluid as fluid -from paddle.fluid.layers.control_flow import StaticRNN as PaddingRNN -import numpy as np - - -def lm_model(hidden_size, - vocab_size, - batch_size, - num_layers=2, - num_steps=20, - init_scale=0.1, - dropout=None): - def padding_rnn(input_embedding, len=3, init_hidden=None, init_cell=None): - weight_1_arr = [] - weight_2_arr = [] - bias_arr = [] - hidden_array = [] - cell_array = [] - mask_array = [] - for i in range(num_layers): - weight_1 = layers.create_parameter([hidden_size * 2, hidden_size*4], dtype="float32", name="fc_weight1_"+str(i), \ - default_initializer=fluid.initializer.UniformInitializer(low=-init_scale, high=init_scale)) - weight_1_arr.append(weight_1) - bias_1 = layers.create_parameter( - [hidden_size * 4], - dtype="float32", - name="fc_bias1_" + str(i), - default_initializer=fluid.initializer.Constant(0.0)) - bias_arr.append(bias_1) - - pre_hidden = layers.slice( - init_hidden, axes=[0], starts=[i], ends=[i + 1]) - pre_cell = layers.slice( - init_cell, axes=[0], starts=[i], ends=[i + 1]) - pre_hidden = layers.reshape(pre_hidden, shape=[-1, hidden_size]) - pre_cell = layers.reshape(pre_cell, shape=[-1, hidden_size]) - hidden_array.append(pre_hidden) - cell_array.append(pre_cell) - - input_embedding = layers.transpose(input_embedding, perm=[1, 0, 2]) - rnn = PaddingRNN() - - with rnn.step(): - input = rnn.step_input(input_embedding) - for k in 
range(num_layers): - pre_hidden = rnn.memory(init=hidden_array[k]) - pre_cell = rnn.memory(init=cell_array[k]) - weight_1 = weight_1_arr[k] - bias = bias_arr[k] - - nn = layers.concat([input, pre_hidden], 1) - gate_input = layers.matmul(x=nn, y=weight_1) - - gate_input = layers.elementwise_add(gate_input, bias) - #i, j, f, o = layers.split(gate_input, num_or_sections=4, dim=-1) - i = layers.slice( - gate_input, axes=[1], starts=[0], ends=[hidden_size]) - j = layers.slice( - gate_input, - axes=[1], - starts=[hidden_size], - ends=[hidden_size * 2]) - f = layers.slice( - gate_input, - axes=[1], - starts=[hidden_size * 2], - ends=[hidden_size * 3]) - o = layers.slice( - gate_input, - axes=[1], - starts=[hidden_size * 3], - ends=[hidden_size * 4]) - - c = pre_cell * layers.sigmoid(f) + layers.sigmoid( - i) * layers.tanh(j) - m = layers.tanh(c) * layers.sigmoid(o) - - rnn.update_memory(pre_hidden, m) - rnn.update_memory(pre_cell, c) - - rnn.step_output(m) - rnn.step_output(c) - - input = m - - if dropout != None and dropout > 0.0: - input = layers.dropout( - input, - dropout_prob=dropout, - dropout_implementation='upscale_in_train') - - rnn.step_output(input) - #real_res = layers.concat(res, 0) - rnnout = rnn() - - last_hidden_array = [] - last_cell_array = [] - real_res = rnnout[-1] - for i in range(num_layers): - m = rnnout[i * 2] - c = rnnout[i * 2 + 1] - m.stop_gradient = True - c.stop_gradient = True - last_h = layers.slice( - m, axes=[0], starts=[num_steps - 1], ends=[num_steps]) - last_hidden_array.append(last_h) - last_c = layers.slice( - c, axes=[0], starts=[num_steps - 1], ends=[num_steps]) - last_cell_array.append(last_c) - ''' - else: - real_res = rnnout[-1] - for i in range( num_layers ): - - m1, c1, m2, c2 = rnnout - real_res = m2 - m1.stop_gradient = True - c1.stop_gradient = True - c2.stop_gradient = True - ''' - - #layers.Print( first_hidden, message="22", summarize=10) - #layers.Print( rnnout[1], message="11", summarize=10) - #real_res = ( rnnout[1] + 
rnnout[2] + rnnout[3] + rnnout[4]) / 4.0 - real_res = layers.transpose(x=real_res, perm=[1, 0, 2]) - last_hidden = layers.concat(last_hidden_array, 0) - last_cell = layers.concat(last_cell_array, 0) - ''' - last_hidden = layers.concat( hidden_array, 1 ) - last_hidden = layers.reshape( last_hidden, shape=[-1, num_layers, hidden_size]) - last_hidden = layers.transpose( x = last_hidden, perm = [1, 0, 2]) - last_cell = layers.concat( cell_array, 1) - last_cell = layers.reshape( last_cell, shape=[ -1, num_layers, hidden_size]) - last_cell = layers.transpose( x = last_cell, perm = [1, 0, 2]) - ''' - - return real_res, last_hidden, last_cell - - def encoder_static(input_embedding, len=3, init_hidden=None, - init_cell=None): - - weight_1_arr = [] - weight_2_arr = [] - bias_arr = [] - hidden_array = [] - cell_array = [] - mask_array = [] - for i in range(num_layers): - weight_1 = layers.create_parameter([hidden_size * 2, hidden_size*4], dtype="float32", name="fc_weight1_"+str(i), \ - default_initializer=fluid.initializer.UniformInitializer(low=-init_scale, high=init_scale)) - weight_1_arr.append(weight_1) - bias_1 = layers.create_parameter( - [hidden_size * 4], - dtype="float32", - name="fc_bias1_" + str(i), - default_initializer=fluid.initializer.Constant(0.0)) - bias_arr.append(bias_1) - - pre_hidden = layers.slice( - init_hidden, axes=[0], starts=[i], ends=[i + 1]) - pre_cell = layers.slice( - init_cell, axes=[0], starts=[i], ends=[i + 1]) - pre_hidden = layers.reshape(pre_hidden, shape=[-1, hidden_size]) - pre_cell = layers.reshape(pre_cell, shape=[-1, hidden_size]) - hidden_array.append(pre_hidden) - cell_array.append(pre_cell) - - res = [] - for index in range(len): - input = layers.slice( - input_embedding, axes=[1], starts=[index], ends=[index + 1]) - input = layers.reshape(input, shape=[-1, hidden_size]) - for k in range(num_layers): - pre_hidden = hidden_array[k] - pre_cell = cell_array[k] - weight_1 = weight_1_arr[k] - bias = bias_arr[k] - - nn = 
layers.concat([input, pre_hidden], 1) - gate_input = layers.matmul(x=nn, y=weight_1) - - gate_input = layers.elementwise_add(gate_input, bias) - i, j, f, o = layers.split(gate_input, num_or_sections=4, dim=-1) - - c = pre_cell * layers.sigmoid(f) + layers.sigmoid( - i) * layers.tanh(j) - m = layers.tanh(c) * layers.sigmoid(o) - - hidden_array[k] = m - cell_array[k] = c - input = m - - if dropout != None and dropout > 0.0: - input = layers.dropout( - input, - dropout_prob=dropout, - dropout_implementation='upscale_in_train') - - res.append(layers.reshape(input, shape=[1, -1, hidden_size])) - real_res = layers.concat(res, 0) - real_res = layers.transpose(x=real_res, perm=[1, 0, 2]) - last_hidden = layers.concat(hidden_array, 1) - last_hidden = layers.reshape( - last_hidden, shape=[-1, num_layers, hidden_size]) - last_hidden = layers.transpose(x=last_hidden, perm=[1, 0, 2]) - last_cell = layers.concat(cell_array, 1) - last_cell = layers.reshape( - last_cell, shape=[-1, num_layers, hidden_size]) - last_cell = layers.transpose(x=last_cell, perm=[1, 0, 2]) - - return real_res, last_hidden, last_cell - - x = layers.data(name="x", shape=[-1, 1, 1], dtype='int64') - y = layers.data(name="y", shape=[-1, 1], dtype='float32') - - init_hidden = layers.data(name="init_hidden", shape=[1], dtype='float32') - init_cell = layers.data(name="init_cell", shape=[1], dtype='float32') - - init_hidden = layers.reshape( - init_hidden, shape=[num_layers, -1, hidden_size]) - init_cell = layers.reshape(init_cell, shape=[num_layers, -1, hidden_size]) - - x_emb = layers.embedding( - input=x, - size=[vocab_size, hidden_size], - dtype='float32', - is_sparse=True, - param_attr=fluid.ParamAttr( - name='embedding_para', - initializer=fluid.initializer.UniformInitializer( - low=-init_scale, high=init_scale))) - - x_emb = layers.reshape(x_emb, shape=[-1, num_steps, hidden_size]) - if dropout != None and dropout > 0.0: - x_emb = layers.dropout( - x_emb, - dropout_prob=dropout, - 
dropout_implementation='upscale_in_train') - - rnn_out, last_hidden, last_cell = padding_rnn( - x_emb, len=num_steps, init_hidden=init_hidden, init_cell=init_cell) - rnn_out = layers.reshape(rnn_out, shape=[-1, num_steps, hidden_size]) - - - softmax_weight = layers.create_parameter([hidden_size, vocab_size], dtype="float32", name="softmax_weight", \ - default_initializer=fluid.initializer.UniformInitializer(low=-init_scale, high=init_scale)) - softmax_bias = layers.create_parameter([vocab_size], dtype="float32", name='softmax_bias', \ - default_initializer=fluid.initializer.UniformInitializer(low=-init_scale, high=init_scale)) - - projection = layers.matmul(rnn_out, softmax_weight) - projection = layers.elementwise_add(projection, softmax_bias) - - projection = layers.reshape(projection, shape=[-1, vocab_size]) - #y = layers.reshape( y, shape=[-1, vocab_size]) - - loss = layers.softmax_with_cross_entropy( - logits=projection, label=y, soft_label=False) - - loss = layers.reshape(loss, shape=[-1, num_steps]) - loss = layers.reduce_mean(loss, dim=[0]) - loss = layers.reduce_sum(loss) - - loss.permissions = True - - feeding_list = ['x', 'y', 'init_hidden', 'init_cell'] - return loss, last_hidden, last_cell, feeding_list diff --git a/fluid/language_model/lstm/reader.py b/fluid/language_model/lstm/reader.py deleted file mode 100644 index 50e8835ec8b96bf37a7b972700a588034d41425c..0000000000000000000000000000000000000000 --- a/fluid/language_model/lstm/reader.py +++ /dev/null @@ -1,105 +0,0 @@ -# Copyright 2015 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Utilities for parsing PTB text files.""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import collections -import os -import sys -import numpy as np - -Py3 = sys.version_info[0] == 3 - - -def _read_words(filename): - data = [] - with open(filename, "r") as f: - return f.read().decode("utf-8").replace("\n", "").split() - - -def _build_vocab(filename): - data = _read_words(filename) - - counter = collections.Counter(data) - count_pairs = sorted(counter.items(), key=lambda x: (-x[1], x[0])) - - words, _ = list(zip(*count_pairs)) - - print("vocab word num", len(words)) - word_to_id = dict(zip(words, range(len(words)))) - - return word_to_id - - -def _file_to_word_ids(filename, word_to_id): - data = _read_words(filename) - return [word_to_id[word] for word in data if word in word_to_id] - - -def ptb_raw_data(data_path=None): - """Load PTB raw data from data directory "data_path". - - Reads PTB text files, converts strings to integer ids, - and performs mini-batching of the inputs. - - The PTB dataset comes from Tomas Mikolov's webpage: - - http://www.fit.vutbr.cz/~imikolov/rnnlm/simple-examples.tgz - - Args: - data_path: string path to the directory where simple-examples.tgz has - been extracted. - - Returns: - tuple (train_data, valid_data, test_data, vocabulary) - where each of the data objects can be passed to PTBIterator. 
- """ - - train_path = os.path.join(data_path, "ptb.train.txt") - #train_path = os.path.join(data_path, "train.fake") - valid_path = os.path.join(data_path, "ptb.valid.txt") - test_path = os.path.join(data_path, "ptb.test.txt") - - word_to_id = _build_vocab(train_path) - train_data = _file_to_word_ids(train_path, word_to_id) - valid_data = _file_to_word_ids(valid_path, word_to_id) - test_data = _file_to_word_ids(test_path, word_to_id) - vocabulary = len(word_to_id) - return train_data, valid_data, test_data, vocabulary - - -def get_data_iter(raw_data, batch_size, num_steps): - data_len = len(raw_data) - raw_data = np.asarray(raw_data, dtype="int64") - - #print( "raw", raw_data[:20] ) - - batch_len = data_len // batch_size - - data = raw_data[0:batch_size * batch_len].reshape((batch_size, batch_len)) - - #h = data.reshape( (-1)) - #print( "h", h[:20]) - - epoch_size = (batch_len - 1) // num_steps - for i in range(epoch_size): - start = i * num_steps - #print( i * num_steps ) - x = np.copy(data[:, i * num_steps:(i + 1) * num_steps]) - y = np.copy(data[:, i * num_steps + 1:(i + 1) * num_steps + 1]) - - yield (x, y) diff --git a/fluid/language_model/lstm/train.py b/fluid/language_model/lstm/train.py deleted file mode 100644 index fc058c6a0e80f4aeba76656fe505207846d66e2f..0000000000000000000000000000000000000000 --- a/fluid/language_model/lstm/train.py +++ /dev/null @@ -1,300 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import numpy as np -import time -import os -import random - -import math - -import paddle -import paddle.fluid as fluid -import paddle.fluid.core as core -import paddle.fluid.framework as framework -from paddle.fluid.executor import Executor - -import reader - -import sys -if sys.version[0] == '2': - reload(sys) - sys.setdefaultencoding("utf-8") -sys.path.append('..') -import os -os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3" - -from args import * -import lm_model -import logging -import pickle - -SEED = 123 - - -def get_current_model_para(train_prog, train_exe): - param_list = train_prog.block(0).all_parameters() - param_name_list = [p.name for p in param_list] - - vals = {} - for p_name in param_name_list: - p_array = np.array(fluid.global_scope().find_var(p_name).get_tensor()) - vals[p_name] = p_array - - return vals - - -def save_para_npz(train_prog, train_exe): - print("begin to save model to model_base") - param_list = train_prog.block(0).all_parameters() - param_name_list = [p.name for p in param_list] - - vals = {} - for p_name in param_name_list: - p_array = np.array(fluid.global_scope().find_var(p_name).get_tensor()) - vals[p_name] = p_array - - emb = vals["embedding_para"] - print("begin to save model to model_base") - np.savez("mode_base", **vals) - - -def train(): - args = parse_args() - model_type = args.model_type - logger = logging.getLogger("lm") - logger.setLevel(logging.INFO) - formatter = logging.Formatter( - '%(asctime)s - %(name)s - %(levelname)s - %(message)s') - if args.enable_ce: - fluid.default_startup_program().random_seed = SEED - if args.log_path: - file_handler = logging.FileHandler(args.log_path) - file_handler.setLevel(logging.INFO) - file_handler.setFormatter(formatter) - logger.addHandler(file_handler) - else: - 
console_handler = logging.StreamHandler() - console_handler.setLevel(logging.INFO) - console_handler.setFormatter(formatter) - logger.addHandler(console_handler) - - logger.info('Running with args : {}'.format(args)) - - vocab_size = 10000 - if model_type == "test": - num_layers = 1 - batch_size = 2 - hidden_size = 10 - num_steps = 3 - init_scale = 0.1 - max_grad_norm = 5.0 - epoch_start_decay = 1 - max_epoch = 1 - dropout = 0.0 - lr_decay = 0.5 - base_learning_rate = 1.0 - elif model_type == "small": - num_layers = 2 - batch_size = 20 - hidden_size = 200 - num_steps = 20 - init_scale = 0.1 - max_grad_norm = 5.0 - epoch_start_decay = 4 - max_epoch = 13 - dropout = 0.0 - lr_decay = 0.5 - base_learning_rate = 1.0 - elif model_type == "medium": - num_layers = 2 - batch_size = 20 - hidden_size = 650 - num_steps = 35 - init_scale = 0.05 - max_grad_norm = 5.0 - epoch_start_decay = 6 - max_epoch = 39 - dropout = 0.5 - lr_decay = 0.8 - base_learning_rate = 1.0 - elif model_type == "large": - num_layers = 2 - batch_size = 20 - hidden_size = 1500 - num_steps = 35 - init_scale = 0.04 - max_grad_norm = 10.0 - epoch_start_decay = 14 - max_epoch = 55 - dropout = 0.65 - lr_decay = 1.0 / 1.15 - base_learning_rate = 1.0 - else: - print("model type not support") - return - - # Training process - loss, last_hidden, last_cell, feed_order = lm_model.lm_model( - hidden_size, - vocab_size, - batch_size, - num_layers=num_layers, - num_steps=num_steps, - init_scale=init_scale, - dropout=dropout) - # clone from default main program and use it as the validation program - main_program = fluid.default_main_program() - inference_program = fluid.default_main_program().clone(for_test=True) - - fluid.clip.set_gradient_clip(clip=fluid.clip.GradientClipByGlobalNorm( - clip_norm=max_grad_norm)) - - learning_rate = fluid.layers.create_global_var( - name="learning_rate", - shape=[1], - value=1.0, - dtype='float32', - persistable=True) - - optimizer = fluid.optimizer.SGD(learning_rate=learning_rate) - - 
optimizer.minimize(loss) - - place = core.CUDAPlace(0) if args.use_gpu else core.CPUPlace() - exe = Executor(place) - exe.run(framework.default_startup_program()) - - data_path = args.data_path - print("begin to load data") - raw_data = reader.ptb_raw_data(data_path) - print("finished load data") - train_data, valid_data, test_data, _ = raw_data - - def prepare_input(batch, init_hidden, init_cell, epoch_id=0, with_lr=True): - x, y = batch - new_lr = base_learning_rate * (lr_decay**max( - epoch_id + 1 - epoch_start_decay, 0.0)) - lr = np.ones((1), dtype='float32') * new_lr - res = {} - x = x.reshape((-1, num_steps, 1)) - y = y.reshape((-1, 1)) - - res['x'] = x - res['y'] = y - res['init_hidden'] = init_hidden - res['init_cell'] = init_cell - if with_lr: - res['learning_rate'] = lr - - return res - - def eval(data): - # when eval the batch_size set to 1 - eval_data_iter = reader.get_data_iter(data, 1, num_steps) - total_loss = 0.0 - iters = 0 - init_hidden = np.zeros((num_layers, 1, hidden_size), dtype='float32') - init_cell = np.zeros((num_layers, 1, hidden_size), dtype='float32') - for batch_id, batch in enumerate(eval_data_iter): - input_data_feed = prepare_input( - batch, init_hidden, init_cell, epoch_id, with_lr=False) - fetch_outs = exe.run( - inference_program, - feed=input_data_feed, - fetch_list=[loss.name, last_hidden.name, last_cell.name]) - - cost_train = np.array(fetch_outs[0]) - init_hidden = np.array(fetch_outs[1]) - init_cell = np.array(fetch_outs[2]) - - total_loss += cost_train - iters += num_steps - - ppl = np.exp(total_loss / iters) - return ppl - - # get train epoch size - batch_len = len(train_data) // batch_size - epoch_size = (batch_len - 1) // num_steps - log_interval = epoch_size // 10 - total_time = 0.0 - for epoch_id in range(max_epoch): - start_time = time.time() - print("epoch id", epoch_id) - train_data_iter = reader.get_data_iter(train_data, batch_size, - num_steps) - - total_loss = 0 - - init_hidden = None - init_cell = None - 
#debug_para(fluid.framework.default_main_program(), parallel_executor) - total_loss = 0 - iters = 0 - init_hidden = np.zeros( - (num_layers, batch_size, hidden_size), dtype='float32') - init_cell = np.zeros( - (num_layers, batch_size, hidden_size), dtype='float32') - for batch_id, batch in enumerate(train_data_iter): - input_data_feed = prepare_input( - batch, init_hidden, init_cell, epoch_id=epoch_id) - fetch_outs = exe.run(feed=input_data_feed, - fetch_list=[ - loss.name, last_hidden.name, - last_cell.name, 'learning_rate' - ]) - - cost_train = np.array(fetch_outs[0]) - init_hidden = np.array(fetch_outs[1]) - init_cell = np.array(fetch_outs[2]) - - lr = np.array(fetch_outs[3]) - - total_loss += cost_train - iters += num_steps - if batch_id > 0 and batch_id % log_interval == 0: - ppl = np.exp(total_loss / iters) - print("ppl ", batch_id, ppl[0], lr[0]) - - ppl = np.exp(total_loss / iters) - if epoch_id == 0 and ppl[0] > 1000: - # for bad init, after first epoch, the loss is over 1000 - # no more need to continue - return - end_time = time.time() - total_time += end_time - start_time - print("train ppl", ppl[0]) - - if epoch_id == max_epoch - 1 and args.enable_ce: - print("lstm_language_model_duration\t%s" % (total_time / max_epoch)) - print("lstm_language_model_loss\t%s" % ppl[0]) - - model_path = os.path.join("model_new/", str(epoch_id)) - if not os.path.isdir(model_path): - os.makedirs(model_path) - fluid.io.save_persistables( - executor=exe, dirname=model_path, main_program=main_program) - valid_ppl = eval(valid_data) - print("valid ppl", valid_ppl[0]) - test_ppl = eval(test_data) - print("test ppl", test_ppl[0]) - - -if __name__ == '__main__': - train()