From d3688c2d9f77d9eab42ab0f0d3372208dc1d384d Mon Sep 17 00:00:00 2001 From: frankwhzhang Date: Tue, 27 Nov 2018 17:12:33 +0800 Subject: [PATCH] fix cluster_train.py v2 --- fluid/PaddleRec/gru4rec/README.md | 89 ++++-- fluid/PaddleRec/gru4rec/__init__.py | 0 fluid/PaddleRec/gru4rec/cluster_train.py | 142 +++++++++ fluid/PaddleRec/gru4rec/cluster_train.sh | 62 ++++ fluid/PaddleRec/gru4rec/infer.py | 56 ++-- fluid/PaddleRec/gru4rec/net.py | 50 ++++ fluid/PaddleRec/gru4rec/small_test.txt | 100 ------- fluid/PaddleRec/gru4rec/small_train.txt | 100 ------- .../gru4rec/test_data/small_test.txt | 100 +++++++ fluid/PaddleRec/gru4rec/text2paddle.py | 82 ++++++ fluid/PaddleRec/gru4rec/train.py | 277 +++++++----------- .../gru4rec/train_data/small_train.txt | 100 +++++++ fluid/PaddleRec/gru4rec/utils.py | 108 +++---- fluid/PaddleRec/gru4rec/vocab.txt | 1 + 14 files changed, 771 insertions(+), 496 deletions(-) delete mode 100644 fluid/PaddleRec/gru4rec/__init__.py create mode 100644 fluid/PaddleRec/gru4rec/cluster_train.py create mode 100644 fluid/PaddleRec/gru4rec/cluster_train.sh create mode 100644 fluid/PaddleRec/gru4rec/net.py delete mode 100644 fluid/PaddleRec/gru4rec/small_test.txt delete mode 100644 fluid/PaddleRec/gru4rec/small_train.txt create mode 100644 fluid/PaddleRec/gru4rec/test_data/small_test.txt create mode 100644 fluid/PaddleRec/gru4rec/text2paddle.py create mode 100644 fluid/PaddleRec/gru4rec/train_data/small_train.txt create mode 100644 fluid/PaddleRec/gru4rec/vocab.txt diff --git a/fluid/PaddleRec/gru4rec/README.md b/fluid/PaddleRec/gru4rec/README.md index 6b3c9c66..e88e6172 100644 --- a/fluid/PaddleRec/gru4rec/README.md +++ b/fluid/PaddleRec/gru4rec/README.md @@ -7,10 +7,15 @@ ├── README.md # 文档 ├── train.py # 训练脚本 ├── infer.py # 预测脚本 +├── net.py # 网络结构 +├── text2paddle.py # 文本数据转paddle数据 +├── cluster_train.py # 多机训练 +├── cluster_train.sh # 多机训练脚本 ├── utils # 通用函数 ├── convert_format.py # 转换数据格式 -├── small_train.txt # 小样本训练集 -└── small_test.txt # 小样本测试集 
+├── vocab.txt # 小样本字典 +├── train_data # 小样本训练目录 +└── test_data # 小样本测试目录 ``` @@ -25,7 +30,9 @@ GRU4REC模型的介绍可以参阅论文[Session-based Recommendations with Recu session-based推荐应用场景非常广泛,比如用户的商品浏览、新闻点击、地点签到等序列数据。 +运行样例程序可跳过'RSC15 数据下载及预处理'部分 ## RSC15 数据下载及预处理 + 运行命令 下载RSC15官网数据集 ``` curl -Lo yoochoose-data.7z https://s3-eu-west-1.amazonaws.com/yc-rdata/yoochoose-data.7z @@ -79,45 +86,49 @@ python convert_format.py 214717867 214717867 ``` +根据训练和测试文件生成字典和对应的paddle输入文件 + +注意需要将训练文件放到一个目录下面,测试文件放到一个目录下面,同时支持多训练文件 +``` +python text2paddle.py raw_train_data/ raw_test_data/ train_data test_data vocab.txt +``` + +转化后生成的格式如下,可参考train_data/small_train.txt +``` +197 196 198 236 +93 93 384 362 363 43 +336 364 407 +421 322 +314 388 +128 58 +138 138 +46 46 46 +34 34 57 57 57 342 228 321 346 357 59 376 +110 110 +``` + ## 训练 '--use_cuda 1' 表示使用gpu, 缺省表示使用cpu '--parallel 1' 表示使用多卡,缺省表示使用单卡 +具体的参数配置可运行 +``` +python train.py -h +``` + GPU 环境 -运行命令 `CUDA_VISIBLE_DEVICES=0 python train.py train_file test_file --use_cuda 1` 开始训练模型。 +运行命令开始训练模型。 ``` -CUDA_VISIBLE_DEVICES=0 python train.py small_train.txt small_test.txt --use_cuda 1 +CUDA_VISIBLE_DEVICES=0 python train.py --train_dir train_data/ --use_cuda 1 ``` CPU 环境 -运行命令 `python train.py train_file test_file` 开始训练模型。 -``` -python train.py small_train.txt small_test.txt +运行命令开始训练模型。 ``` - -当前支持的参数可参见[train.py](./train.py) `train_net` 函数 -```python - batch_size = 50 # batch大小 推荐500() - args = parse_args() - vocab, train_reader, test_reader = utils.prepare_data( - train_file, test_file,batch_size=batch_size * get_cards(args),\ - buffer_size=1000, word_freq_threshold=0) # buffer_size 局部序列长度排序 - train( - train_reader=train_reader, - vocab=vocab, - network=network, - hid_size=100, # embedding and hidden size - base_lr=0.01, # base learning rate - batch_size=batch_size, - pass_num=10, # the number of passed for training - use_cuda=use_cuda, # whether to use GPU card - parallel=parallel, # whether to be parallel - model_dir="model_recall20", # 
directory to save model - init_low_bound=-0.1, # uniform parameter initialization lower bound - init_high_bound=0.1) # uniform parameter initialization upper bound +python train.py --train_dir train_data/ ``` ## 自定义网络结构 -可在[train.py](./train.py) `network` 函数中调整网络结构,当前的网络结构如下: +可在[net.py](./net.py) `network` 函数中调整网络结构,当前的网络结构如下: ```python emb = fluid.layers.embedding( input=src, @@ -206,9 +217,10 @@ model saved in model_recall20/epoch_1 ``` ## 预测 -运行命令 `CUDA_VISIBLE_DEVICES=0 python infer.py model_dir start_epoch last_epoch(inclusive) train_file test_file` 开始预测.其中,start_epoch指定开始预测的轮次,last_epoch指定结束的轮次,例如 -```python -CUDA_VISIBLE_DEVICES=0 python infer.py model 1 10 small_train.txt small_test.txt +运行命令 开始预测. + +``` +CUDA_VISIBLE_DEVICES=0 python infer.py --test_dir test_data/ --model_dir model_recall20/ --start_index 1 --last_index 10 --use_cuda 1 ``` ## 预测结果示例 @@ -224,3 +236,16 @@ model:model_r@20/epoch_8 recall@20:0.679 time_cost(s):12.37 model:model_r@20/epoch_9 recall@20:0.680 time_cost(s):12.22 model:model_r@20/epoch_10 recall@20:0.681 time_cost(s):12.2 ``` + + +## 多机训练 +厂内用户可以参考[wiki](http://wiki.baidu.com/pages/viewpage.action?pageId=628300529)利用paddlecloud 配置多机环境 + +可参考cluster_train.py 配置其他多机环境 + +运行命令本地模拟多机场景 +``` +sh cluster_train.sh +``` + +注意本地模拟需要关闭代理 diff --git a/fluid/PaddleRec/gru4rec/__init__.py b/fluid/PaddleRec/gru4rec/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/fluid/PaddleRec/gru4rec/cluster_train.py b/fluid/PaddleRec/gru4rec/cluster_train.py new file mode 100644 index 00000000..18b6e0df --- /dev/null +++ b/fluid/PaddleRec/gru4rec/cluster_train.py @@ -0,0 +1,142 @@ +import os +import sys +import time +import six +import numpy as np +import math +import argparse +import paddle.fluid as fluid +import paddle +import time +import utils +import net + +SEED = 102 + + +def parse_args(): + parser = argparse.ArgumentParser("gru4rec benchmark.") + parser.add_argument( + '--train_dir', type=str, default='train_data', 
help='train file address') + parser.add_argument( + '--vocab_path', type=str, default='vocab.txt', help='vocab file address') + parser.add_argument( + '--is_local', type=int, default=1, help='whether local') + parser.add_argument( + '--hid_size', type=int, default=100, help='hid size') + parser.add_argument( + '--model_dir', type=str, default='model_recall20', help='model dir') + parser.add_argument( + '--batch_size', type=int, default=5, help='num of batch size') + parser.add_argument( + '--pass_num', type=int, default=10, help='num of epoch') + parser.add_argument( + '--print_batch', type=int, default=10, help='num of print batch') + parser.add_argument( + '--use_cuda', type=int, default=0, help='whether use gpu') + parser.add_argument( + '--base_lr', type=float, default=0.01, help='learning rate') + parser.add_argument( + '--num_devices', type=int, default=1, help='Number of GPU devices') + parser.add_argument( + '--role', type=str, default='pserver', help='trainer or pserver') + parser.add_argument( + '--endpoints', type=str, default='127.0.0.1:6000', help='The pserver endpoints, like: 127.0.0.1:6000, 127.0.0.1:6001') + parser.add_argument( + '--current_endpoint', type=str, default='127.0.0.1:6000', help='The current_endpoint') + parser.add_argument( + '--trainer_id', type=int, default=0, help='trainer id, only trainer_id=0 save model') + parser.add_argument( + '--trainers', type=int, default=1, help='The num of trainers, (default: 1)') + args = parser.parse_args() + return args + +def get_cards(args): + return args.num_devices + +def train(): + """ do training """ + args = parse_args() + hid_size = args.hid_size + train_dir = args.train_dir + vocab_path = args.vocab_path + use_cuda = True if args.use_cuda else False + print("use_cuda:", use_cuda) + batch_size = args.batch_size + vocab_size, train_reader = utils.prepare_data( + train_dir, vocab_path, batch_size=batch_size * get_cards(args),\ + buffer_size=1000, word_freq_threshold=0, is_train=True) + + # Train 
program + src_wordseq, dst_wordseq, avg_cost, acc = net.network(vocab_size=vocab_size, hid_size=hid_size) + + # Optimization to minimize lost + sgd_optimizer = fluid.optimizer.SGD(learning_rate=args.base_lr) + sgd_optimizer.minimize(avg_cost) + + + def train_loop(main_program): + """ train network """ + pass_num = args.pass_num + model_dir = args.model_dir + fetch_list = [avg_cost.name] + + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() + exe = fluid.Executor(place) + exe.run(fluid.default_startup_program()) + total_time = 0.0 + for pass_idx in six.moves.xrange(pass_num): + epoch_idx = pass_idx + 1 + print("epoch_%d start" % epoch_idx) + + t0 = time.time() + i = 0 + newest_ppl = 0 + for data in train_reader(): + i += 1 + lod_src_wordseq = utils.to_lodtensor([dat[0] for dat in data], + place) + lod_dst_wordseq = utils.to_lodtensor([dat[1] for dat in data], + place) + ret_avg_cost = exe.run(main_program, + feed={ "src_wordseq": lod_src_wordseq, + "dst_wordseq": lod_dst_wordseq}, + fetch_list=fetch_list) + avg_ppl = np.exp(ret_avg_cost[0]) + newest_ppl = np.mean(avg_ppl) + if i % args.print_batch == 0: + print("step:%d ppl:%.3f" % (i, newest_ppl)) + + t1 = time.time() + total_time += t1 - t0 + print("epoch:%d num_steps:%d time_cost(s):%f" % + (epoch_idx, i, total_time / epoch_idx)) + save_dir = "%s/epoch_%d" % (model_dir, epoch_idx) + feed_var_names = ["src_wordseq", "dst_wordseq"] + fetch_vars = [avg_cost, acc] + if args.trainer_id == 0: + fluid.io.save_inference_model(save_dir, feed_var_names, fetch_vars, exe) + print("model saved in %s" % save_dir) + print("finish training") + + if args.is_local: + print("run local training") + train_loop(fluid.default_main_program()) + + else: + print("run distribute training") + t = fluid.DistributeTranspiler() + t.transpile(args.trainer_id, pservers=args.endpoints, trainers=args.trainers) + if args.role == "pserver": + print("run psever") + pserver_prog = t.get_pserver_program(args.current_endpoint) + 
pserver_startup = t.get_startup_program(args.current_endpoint, + pserver_prog) + exe = fluid.Executor(fluid.CPUPlace()) + exe.run(pserver_startup) + exe.run(pserver_prog) + elif args.role == "trainer": + print("run trainer") + train_loop(t.get_trainer_program()) +if __name__ == "__main__": + train() diff --git a/fluid/PaddleRec/gru4rec/cluster_train.sh b/fluid/PaddleRec/gru4rec/cluster_train.sh new file mode 100644 index 00000000..2711ffad --- /dev/null +++ b/fluid/PaddleRec/gru4rec/cluster_train.sh @@ -0,0 +1,62 @@ +#!/bin/bash + +#export GLOG_v=30 +#export GLOG_logtostderr=1 + +# start pserver0 +python cluster_train.py \ + --train_dir train_data \ + --model_dir cluster_model \ + --vocab_path vocab.txt \ + --batch_size 5 \ + --is_local 0 \ + --role pserver \ + --endpoints 127.0.0.1:6000,127.0.0.1:6001 \ + --current_endpoint 127.0.0.1:6000 \ + --trainers 2 \ + > pserver0.log 2>&1 & + +# start pserver1 +python cluster_train.py \ + --train_dir train_data \ + --model_dir cluster_model \ + --vocab_path vocab.txt \ + --batch_size 5 \ + --is_local 0 \ + --role pserver \ + --endpoints 127.0.0.1:6000,127.0.0.1:6001 \ + --current_endpoint 127.0.0.1:6001 \ + --trainers 2 \ + > pserver1.log 2>&1 & + +# start trainer0 +#CUDA_VISIBLE_DEVICES=1 python cluster_train.py \ +python cluster_train.py \ + --train_dir train_data \ + --model_dir cluster_model \ + --vocab_path vocab.txt \ + --batch_size 5 \ + --print_batch 10 \ + --use_cuda 0 \ + --is_local 0 \ + --role trainer \ + --endpoints 127.0.0.1:6000,127.0.0.1:6001 \ + --trainers 2 \ + --trainer_id 0 \ + > trainer0.log 2>&1 & + +# start trainer1 +#CUDA_VISIBLE_DEVICES=2 python cluster_train.py \ +python cluster_train.py \ + --train_dir train_data \ + --model_dir cluster_model \ + --vocab_path vocab.txt \ + --batch_size 5 \ + --print_batch 10 \ + --use_cuda 0 \ + --is_local 0 \ + --role trainer \ + --endpoints 127.0.0.1:6000,127.0.0.1:6001 \ + --trainers 2 \ + --trainer_id 1 \ + > trainer1.log 2>&1 & diff --git 
a/fluid/PaddleRec/gru4rec/infer.py b/fluid/PaddleRec/gru4rec/infer.py index ac1a63b4..ccb2ce09 100644 --- a/fluid/PaddleRec/gru4rec/infer.py +++ b/fluid/PaddleRec/gru4rec/infer.py @@ -1,3 +1,4 @@ +import argparse import sys import time import math @@ -10,6 +11,22 @@ import paddle import utils +def parse_args(): + parser = argparse.ArgumentParser("gru4rec benchmark.") + parser.add_argument( + '--test_dir', type=str, default='test_data', help='test file address') + parser.add_argument( + '--start_index', type=int, default='1', help='start index') + parser.add_argument( + '--last_index', type=int, default='10', help='end index') + parser.add_argument( + '--model_dir', type=str, default='model_recall20', help='model dir') + parser.add_argument( + '--use_cuda', type=int, default='1', help='whether use cuda') + parser.add_argument( + '--batch_size', type=int, default='5', help='batch_size') + args = parser.parse_args() + return args def infer(test_reader, use_cuda, model_path): """ inference function """ @@ -41,7 +58,7 @@ def infer(test_reader, use_cuda, model_path): label_data, axis=0).astype("int64")) accum_num_sum += (data_length) accum_num_recall += (data_length * acc_) - if step_id % 100 == 0: + if step_id % 1 == 0: print("step:%d " % (step_id), accum_num_recall / accum_num_sum) t1 = time.time() print("model:%s recall@20:%.3f time_cost(s):%.2f" % @@ -49,31 +66,18 @@ def infer(test_reader, use_cuda, model_path): if __name__ == "__main__": - if len(sys.argv) != 6: - print( - "Usage: %s model_dir start_epoch last_epoch(inclusive) train_file test_file" - ) - exit(0) - train_file = "" - test_file = "" - model_dir = sys.argv[1] - try: - start_index = int(sys.argv[2]) - last_index = int(sys.argv[3]) - train_file = sys.argv[4] - test_file = sys.argv[5] - except: - iprint( - "Usage: %s model_dir start_ipoch last_epoch(inclusive) train_file test_file" - ) - exit(-1) - vocab, train_reader, test_reader = utils.prepare_data( - train_file, - test_file, - batch_size=5, - 
buffer_size=1000, - word_freq_threshold=0) + args = parse_args() + start_index = args.start_index + last_index = args.last_index + test_dir = args.test_dir + model_dir = args.model_dir + batch_size = args.batch_size + use_cuda = True if args.use_cuda else False + print("start index: ", start_index, " last_index:" ,last_index) + vocab_size, test_reader = utils.prepare_data( + test_dir, "", batch_size=batch_size, + buffer_size=1000, word_freq_threshold=0, is_train=False) for epoch in xrange(start_index, last_index + 1): epoch_path = model_dir + "/epoch_" + str(epoch) - infer(test_reader=test_reader, use_cuda=True, model_path=epoch_path) + infer(test_reader=test_reader, use_cuda=use_cuda, model_path=epoch_path) diff --git a/fluid/PaddleRec/gru4rec/net.py b/fluid/PaddleRec/gru4rec/net.py new file mode 100644 index 00000000..fea2b3e9 --- /dev/null +++ b/fluid/PaddleRec/gru4rec/net.py @@ -0,0 +1,50 @@ +import paddle.fluid as fluid + +def network(vocab_size, + hid_size=100, + init_low_bound=-0.04, + init_high_bound=0.04): + """ network definition """ + emb_lr_x = 10.0 + gru_lr_x = 1.0 + fc_lr_x = 1.0 + # Input data + src_wordseq = fluid.layers.data( + name="src_wordseq", shape=[1], dtype="int64", lod_level=1) + dst_wordseq = fluid.layers.data( + name="dst_wordseq", shape=[1], dtype="int64", lod_level=1) + + emb = fluid.layers.embedding( + input=src_wordseq, + size=[vocab_size, hid_size], + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.Uniform( + low=init_low_bound, high=init_high_bound), + learning_rate=emb_lr_x), + is_sparse=True) + fc0 = fluid.layers.fc(input=emb, + size=hid_size * 3, + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.Uniform( + low=init_low_bound, high=init_high_bound), + learning_rate=gru_lr_x)) + gru_h0 = fluid.layers.dynamic_gru( + input=fc0, + size=hid_size, + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.Uniform( + low=init_low_bound, high=init_high_bound), + learning_rate=gru_lr_x)) + + fc = 
fluid.layers.fc(input=gru_h0, + size=vocab_size, + act='softmax', + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.Uniform( + low=init_low_bound, high=init_high_bound), + learning_rate=fc_lr_x)) + + cost = fluid.layers.cross_entropy(input=fc, label=dst_wordseq) + acc = fluid.layers.accuracy(input=fc, label=dst_wordseq, k=20) + avg_cost = fluid.layers.mean(x=cost) + return src_wordseq, dst_wordseq, avg_cost, acc diff --git a/fluid/PaddleRec/gru4rec/small_test.txt b/fluid/PaddleRec/gru4rec/small_test.txt deleted file mode 100644 index 0cdaa7cc..00000000 --- a/fluid/PaddleRec/gru4rec/small_test.txt +++ /dev/null @@ -1,100 +0,0 @@ -214586805 214509260 -214857547 214857268 214857260 -214859848 214857787 -214687963 214531502 214687963 -214696532 214859034 214858850 -214857570 214857810 214857568 214857787 214857182 -214857562 214857570 214857562 214857568 -214859132 214545928 214859132 214551913 -214858843 214859859 214858912 214858691 214859900 -214561888 214561888 -214688430 214688435 214688430 -214536302 214531376 214531659 214531440 214531466 214513382 214550996 -214854930 214854930 -214858856 214690775 214859306 -214859872 214858912 214858689 -214859310 214859338 214859338 214859942 214859293 214859889 214859338 214859889 214859075 214859338 214859338 214859889 -214574906 214574906 -214859342 214859342 214858777 214851155 214851152 214572433 -214537127 214857257 -214857570 214857570 214857568 214857562 214857015 -214854352 214854352 214854354 -214738466 214855010 214857605 214856552 214574906 214857765 214849299 -214858365 214859900 214859126 214858689 214859126 214859126 214857759 214858850 214859895 214859300 -214857260 214561481 214848995 214849052 214865212 -214857596 214819412 214819412 -214849342 214849342 -214859902 214854845 214854845 214854825 -214859306 214859126 214859126 -214644962 214644960 214644958 -214696432 214696434 -214708372 214508287 214684093 -214857015 214857015 214858847 214690130 -214858787 214859855 -214858847 214696532 
214859304 214854845 -214586805 214586805 -214857568 214857570 -214696532 214858850 214859034 214569238 214568120 214854165 214684785 214854262 214567327 -214602729 214857568 214857596 -214859122 214858687 214859122 214859872 -214555607 214836225 214836225 214836223 -214849299 214829724 214855010 214829801 214574906 214586722 214684307 214857570 -214859872 214695525 -214845947 214586722 214829801 -214829312 214546123 -214849055 214849052 -214509260 214587932 214596435 214644960 214696432 214696434 214545928 214857030 214636329 214832604 214574906 -214586805 214586805 -214587932 214587932 -214857568 214857549 214854894 -214836819 214836819 214595855 214595855 -214858787 214858787 -214854860 214857701 -214848750 214643908 -214858847 214859872 214859038 214859855 214690130 -214847780 214696817 214717305 -214509260 214509260 -214853122 214853122 214853122 214853323 -214858847 214858631 214858691 -214859859 214819807 214853072 214853072 214819730 -214820450 214705115 214586805 -214858787 214859036 -214829842 214864967 -214846033 214850949 -214587932 214586805 214509260 214696432 214855110 214545928 -214858856 214859081 214859306 214858854 -214690839 214690839 214711277 214839607 214582942 214582942 -214857030 214832604 -214857570 214855046 214859870 214577475 214858687 214656380 -214854845 214854845 214854684 214859893 214854845 214854778 -214850630 214848159 214848159 214848159 214848159 214848159 214848159 214848159 -214856248 214856248 -214858365 214858905 214858905 -214712274 214855046 -214845947 214845947 214831946 214717511 214846014 214854729 -214561462 214561462 214561481 214561481 -214836819 214853250 -214858854 214859915 214859306 214854300 -214857660 214857787 214539307 214855010 214855046 214849299 214856981 214849055 -214855046 214854877 214568102 214539523 214579762 214539347 214641127 214600995 214833733 214600995 214684633 214645121 214658040 214712276 214857660 214687895 214854313 214857517 -214845962 214853165 214846119 -214854146 214859034 -214819412 
214819412 214819412 214819412 -214849747 214578350 214561991 -214854341 214854341 -214644855 214644857 214531153 -214644960 214862167 -214640490 214600918 214600922 -214854710 214857759 214859306 -214858843 214859297 214858631 214859117 214858689 214858912 214859902 214690127 -214586805 214586805 -214859306 214859306 214859126 -214859034 214696532 214858850 214859126 214859859 214859034 214859859 214858850 -214857782 214849048 214857787 -214854148 214857787 214854877 -214858631 214858631 214690127 214859034 214858850 214859117 214858631 214859300 214858843 214859859 214859859 -214646036 214646036 -214858847 214858631 214690127 214859297 -214861603 214700002 214700000 214835117 214700000 214857830 214700000 214712235 214700000 214700002 214510700 214835713 214712235 214853321 -214854855 214854815 214854815 -214857185 214854637 214829765 214848384 214829765 214856546 214848596 214835167 214563335 214553837 214536185 214855982 214845515 214550844 214712006 diff --git a/fluid/PaddleRec/gru4rec/small_train.txt b/fluid/PaddleRec/gru4rec/small_train.txt deleted file mode 100644 index a570fd29..00000000 --- a/fluid/PaddleRec/gru4rec/small_train.txt +++ /dev/null @@ -1,100 +0,0 @@ -214536502 214536500 214536506 214577561 -214662742 214662742 214825110 214757390 214757407 214551617 -214716935 214774687 214832672 -214836765 214706482 -214701242 214826623 -214826835 214826715 -214838855 214838855 -214576500 214576500 214576500 -214821275 214821275 214821371 214821371 214821371 214717089 214563337 214706462 214717436 214743335 214826837 214819762 -214717867 214717867 -214836761 214684513 214836761 -214577732 214587013 214577732 -214826897 214820441 -214684093 214684093 214684093 -214561790 214561790 214611457 214611457 -214577732 214577732 -214838503 214838503 214838503 214838503 214838503 214548744 -214718203 214718203 214718203 214718203 -214837485 214837485 214837485 214837487 214837487 214821315 214586711 214821305 214821307 214844357 214821341 214821309 214551617 214551617 
214612920 214837487 -214613743 214613743 214539110 214539110 -214827028 214827017 214537796 214840762 214707930 214707930 214585652 214536197 214536195 214646169 -214579288 214714790 214676070 214601407 -214532036 214700432 -214836789 214836789 214710804 -214537967 214537967 -214718246 214826835 -214835257 214835265 -214834865 214571188 214571188 214571188 214820225 214820225 214820225 214820225 214820225 214820225 214820225 214820225 214706441 214706441 214706441 214706441 -214652878 214716737 214652878 -214684721 214680356 -214551594 214586970 -214826769 214537967 -214819745 214819745 -214691587 214587915 -214821277 214821277 214821277 214821277 214821277 -214716932 214716932 214716932 214716932 214716932 214716932 -214712235 214581489 214602605 -214820441 214826897 214826702 214684513 214838100 214544357 214551626 214691484 -214545935 214819438 214839907 214835917 214836210 -214698491 214523692 -214695307 214695305 214538317 214677448 -214819468 214716977 214716977 214716977 214716977 214716939 -214544355 214601212 214601212 214601212 -214716982 214716984 -214844248 214844248 -214515834 214515830 -214717318 214717318 -214832557 214559660 214559660 214819520 214586540 -214587797 214835775 214844109 -214714794 214601407 214826619 214746427 214821300 214717562 214826927 214748334 214826908 214800262 -214709645 214709645 214709645 214709645 214709645 -214532072 214532070 -214827022 214840419 -214716984 214832657 -214662975 214537779 214840762 -214821277 214821277 214821277 -214748300 214748293 -214826955 214826606 214687642 -214832559 214832559 214832559 214821017 214821017 214572234 214826715 214826715 -214509135 214536853 214509133 214509135 214509135 214509135 214717877 214826615 214716982 -214819472 214687685 -214821285 214821285 214826801 214826801 -214826705 214826705 -214668590 214826872 -214652220 214840483 214840483 214717286 214558807 214821300 214826908 214826908 214826908 214554637 214819430 214819430 214826837 214826837 214820392 214820392 214586694 
214819376 214553844 214601229 214555500 214695127 214819760 214717850 214718385 214743369 214743369 -214648475 214648340 214648438 214648455 214712936 214712887 214696149 214717097 214534352 214534352 214717097 -214560099 214560099 214560099 214832750 214560099 -214685621 214684093 214546097 214685623 -214819685 214839907 214839905 214811752 -214717007 214717003 214716928 -214820842 214819490 -214555869 214537185 -214840599 214835735 -214838100 214706216 -214829737 214821315 -214748293 214748293 -214712272 214820450 -214821380 214821380 -214826799 214827005 214718390 214718396 214826627 -214841060 214841060 -214687768 214706445 -214811752 214811754 -214594678 214594680 214594680 -214821369 214821369 214697771 214697512 214697413 214697409 214652409 214537127 214537127 214820237 214820237 214709645 214699213 214820237 214820237 214820237 214709645 214537127 -214554358 214716950 -214821275 214829741 -214829741 214820842 214821279 214703790 -214716954 214838366 -214821022 214820814 -214684721 214821369 214826833 214819472 -214821315 214821305 -214826702 214821275 -214717847 214819719 214748336 -214536440 214536437 -214512416 214512416 -214839313 214839313 214839313 -214826705 214826705 -214510044 214510044 214510044 214582387 214537535 214584812 214537535 214584810 -214600989 214704180 -214705693 214696824 214705682 214696817 214705691 214705693 214711710 214705691 214705691 214687539 214705687 214744796 214681648 214717307 214577750 214650382 214744796 214696817 214705682 214711710 diff --git a/fluid/PaddleRec/gru4rec/test_data/small_test.txt b/fluid/PaddleRec/gru4rec/test_data/small_test.txt new file mode 100644 index 00000000..b4bf7189 --- /dev/null +++ b/fluid/PaddleRec/gru4rec/test_data/small_test.txt @@ -0,0 +1,100 @@ +0 16 +475 473 155 +491 21 +96 185 96 +29 14 13 +5 481 11 21 470 +70 5 70 11 +167 42 167 217 +72 15 73 161 172 +82 82 +97 297 97 +193 182 186 183 184 177 214 +152 152 +163 298 7 +39 73 71 +490 23 23 496 488 74 23 74 486 23 23 74 +17 17 +170 170 483 
444 443 234 +25 472 +5 5 11 70 69 +149 149 455 +356 68 477 468 17 479 66 +159 172 6 71 6 6 158 13 494 169 +155 44 438 144 500 +156 9 9 +146 146 +173 10 10 461 +7 6 6 +269 48 268 +50 100 +323 174 18 +69 69 22 98 +38 171 +22 29 489 10 +0 0 +11 5 +29 13 14 232 231 451 289 452 229 +260 11 156 +166 160 166 39 +223 134 134 420 +66 401 68 132 17 84 287 5 +39 304 +65 84 132 +400 211 +145 144 +16 28 254 48 50 100 42 154 262 133 17 +0 0 +28 28 +11 476 464 +61 61 86 86 +38 38 +463 478 +437 265 +22 39 485 171 98 +434 51 344 +16 16 +67 67 67 448 +22 12 161 +15 377 147 147 374 +119 317 0 +38 484 +403 499 +432 442 +28 0 16 50 465 42 +163 487 7 162 +99 99 325 423 83 83 +154 133 +5 37 492 235 160 279 +10 10 457 493 10 460 +441 4 4 4 4 4 4 4 +153 153 +159 164 164 +328 37 +65 65 404 347 431 459 +80 80 44 44 +61 446 +162 495 7 453 +157 21 204 68 37 66 469 145 +37 151 230 206 240 205 264 87 409 87 288 270 280 329 157 296 454 474 +430 445 433 +449 14 +9 9 9 9 +440 238 226 +148 148 +266 267 181 +48 498 +263 255 256 +458 158 7 +72 168 12 165 71 73 173 49 +0 0 +7 7 6 +14 29 13 6 15 14 15 13 +480 439 21 +450 21 151 +12 12 49 14 13 165 12 169 72 15 15 +91 91 +22 12 49 168 +497 101 30 411 30 482 30 53 30 101 176 415 53 447 +462 150 150 +471 456 131 435 131 467 436 412 227 218 190 466 429 213 326 diff --git a/fluid/PaddleRec/gru4rec/text2paddle.py b/fluid/PaddleRec/gru4rec/text2paddle.py new file mode 100644 index 00000000..563a8cad --- /dev/null +++ b/fluid/PaddleRec/gru4rec/text2paddle.py @@ -0,0 +1,82 @@ +import sys +import six +import collections +import os + +def word_count(input_file, word_freq=None): + """ + compute word count from corpus + """ + if word_freq is None: + word_freq = collections.defaultdict(int) + + for l in input_file: + for w in l.strip().split(): + word_freq[w] += 1 + + return word_freq + + +def build_dict(min_word_freq=0, train_dir="", test_dir=""): + """ + Build a word dictionary from the corpus, Keys of the dictionary are words, + and values are zero-based IDs of 
these words. + """ + word_freq = collections.defaultdict(int) + files = os.listdir(train_dir) + for fi in files: + with open(train_dir + '/' + fi, "r") as f: + word_freq = word_count(f, word_freq) + files = os.listdir(test_dir) + for fi in files: + with open(test_dir + '/' + fi, "r") as f: + word_freq = word_count(f, word_freq) + + word_freq = [x for x in six.iteritems(word_freq) if x[1] > min_word_freq] + word_freq_sorted = sorted(word_freq, key=lambda x: (-x[1], x[0])) + words, _ = list(zip(*word_freq_sorted)) + word_idx = dict(list(zip(words, six.moves.range(len(words))))) + return word_idx + + +def write_paddle(word_idx, train_dir, test_dir, output_train_dir, output_test_dir): + files = os.listdir(train_dir) + if not os.path.exists(output_train_dir): + os.mkdir(output_train_dir) + for fi in files: + with open(train_dir + '/' + fi, "r") as f: + with open(output_train_dir + '/' + fi, "w") as wf: + for l in f: + l = l.strip().split() + l = [word_idx.get(w) for w in l] + for w in l: + wf.write(str(w) + " ") + wf.write("\n") + + files = os.listdir(test_dir) + if not os.path.exists(output_test_dir): + os.mkdir(output_test_dir) + for fi in files: + with open(test_dir + '/' + fi, "r") as f: + with open(output_test_dir + '/' + fi, "w") as wf: + for l in f: + l = l.strip().split() + l = [word_idx.get(w) for w in l] + for w in l: + wf.write(str(w) + " ") + wf.write("\n") + +def text2paddle(train_dir, test_dir, output_train_dir, output_test_dir, output_vocab): + vocab = build_dict(0, train_dir, test_dir) + with open(output_vocab, "w") as wf: + wf.write(str(len(vocab)) + "\n") + #wf.write(str(vocab)) + write_paddle(vocab, train_dir, test_dir, output_train_dir, output_test_dir) + + +train_dir = sys.argv[1] +test_dir = sys.argv[2] +output_train_dir = sys.argv[3] +output_test_dir = sys.argv[4] +output_vocab = sys.argv[5] +text2paddle(train_dir, test_dir, output_train_dir, output_test_dir, output_vocab) diff --git a/fluid/PaddleRec/gru4rec/train.py 
b/fluid/PaddleRec/gru4rec/train.py index 12bc1bc2..bf35c4e7 100644 --- a/fluid/PaddleRec/gru4rec/train.py +++ b/fluid/PaddleRec/gru4rec/train.py @@ -9,201 +9,142 @@ import paddle.fluid as fluid import paddle import time import utils +import net SEED = 102 def parse_args(): parser = argparse.ArgumentParser("gru4rec benchmark.") - parser.add_argument('train_file') - parser.add_argument('test_file') - parser.add_argument('--use_cuda', help='whether use gpu') - parser.add_argument('--parallel', help='whether parallel') parser.add_argument( - '--enable_ce', - action='store_true', - help='If set, run \ - the task with continuous evaluation logs.') + '--train_dir', type=str, default='train_data', help='train file address') + parser.add_argument( + '--vocab_path', type=str, default='vocab.txt', help='vocab file address') + parser.add_argument( + '--is_local', type=int, default=1, help='whether local') + parser.add_argument( + '--hid_size', type=int, default=100, help='hid size') + parser.add_argument( + '--model_dir', type=str, default='model_recall20', help='model dir') + parser.add_argument( + '--batch_size', type=int, default=5, help='num of batch size') + parser.add_argument( + '--print_batch', type=int, default=10, help='num of print batch') + parser.add_argument( + '--pass_num', type=int, default=10, help='num of epoch') + parser.add_argument( + '--use_cuda', type=int, default=0, help='whether use gpu') + parser.add_argument( + '--parallel', type=int, default=0, help='whether parallel') + parser.add_argument( + '--base_lr', type=float, default=0.01, help='learning rate') parser.add_argument( '--num_devices', type=int, default=1, help='Number of GPU devices') args = parser.parse_args() return args +def get_cards(args): + return args.num_devices -def network(src, dst, vocab_size, hid_size, init_low_bound, init_high_bound): - """ network definition """ - emb_lr_x = 10.0 - gru_lr_x = 1.0 - fc_lr_x = 1.0 - emb = fluid.layers.embedding( - input=src, - size=[vocab_size, 
hid_size], - param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Uniform( - low=init_low_bound, high=init_high_bound), - learning_rate=emb_lr_x), - is_sparse=True) - - fc0 = fluid.layers.fc(input=emb, - size=hid_size * 3, - param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Uniform( - low=init_low_bound, high=init_high_bound), - learning_rate=gru_lr_x)) - gru_h0 = fluid.layers.dynamic_gru( - input=fc0, - size=hid_size, - param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Uniform( - low=init_low_bound, high=init_high_bound), - learning_rate=gru_lr_x)) - - fc = fluid.layers.fc(input=gru_h0, - size=vocab_size, - act='softmax', - param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Uniform( - low=init_low_bound, high=init_high_bound), - learning_rate=fc_lr_x)) - - cost = fluid.layers.cross_entropy(input=fc, label=dst) - acc = fluid.layers.accuracy(input=fc, label=dst, k=20) - return cost, acc - - -def train(train_reader, - vocab, - network, - hid_size, - base_lr, - batch_size, - pass_num, - use_cuda, - parallel, - model_dir, - init_low_bound=-0.04, - init_high_bound=0.04): - """ train network """ - +def train(): + """ do training """ args = parse_args() - if args.enable_ce: - # random seed must set before configuring the network. 
- fluid.default_startup_program().random_seed = SEED - - vocab_size = len(vocab) - - # Input data - src_wordseq = fluid.layers.data( - name="src_wordseq", shape=[1], dtype="int64", lod_level=1) - dst_wordseq = fluid.layers.data( - name="dst_wordseq", shape=[1], dtype="int64", lod_level=1) + hid_size = args.hid_size + train_dir = args.train_dir + vocab_path = args.vocab_path + use_cuda = True if args.use_cuda else False + parallel = True if args.parallel else False + print("use_cuda:", use_cuda, "parallel:", parallel) + batch_size = args.batch_size + vocab_size, train_reader = utils.prepare_data( + train_dir, vocab_path, batch_size=batch_size * get_cards(args),\ + buffer_size=1000, word_freq_threshold=0, is_train=True) # Train program - avg_cost = None - cost, acc = network(src_wordseq, dst_wordseq, vocab_size, hid_size, - init_low_bound, init_high_bound) - avg_cost = fluid.layers.mean(x=cost) + src_wordseq, dst_wordseq, avg_cost, acc = net.network(vocab_size=vocab_size, hid_size=hid_size) # Optimization to minimize lost - sgd_optimizer = fluid.optimizer.Adagrad(learning_rate=base_lr) + sgd_optimizer = fluid.optimizer.Adagrad(learning_rate=args.base_lr) sgd_optimizer.minimize(avg_cost) - + # Initialize executor place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() + training_role = os.getenv("TRAINING_ROLE", "TRAINER") + if training_role == "PSERVER": + place = fluid.CPUPlace() exe = fluid.Executor(place) - exe.run(fluid.default_startup_program()) if parallel: train_exe = fluid.ParallelExecutor( use_cuda=use_cuda, loss_name=avg_cost.name) else: train_exe = exe - total_time = 0.0 - fetch_list = [avg_cost.name] - for pass_idx in six.moves.xrange(pass_num): - epoch_idx = pass_idx + 1 - print("epoch_%d start" % epoch_idx) - - t0 = time.time() - i = 0 - newest_ppl = 0 - for data in train_reader(): - i += 1 - lod_src_wordseq = utils.to_lodtensor([dat[0] for dat in data], - place) - lod_dst_wordseq = utils.to_lodtensor([dat[1] for dat in data], - place) - 
ret_avg_cost = train_exe.run(feed={ - "src_wordseq": lod_src_wordseq, - "dst_wordseq": lod_dst_wordseq - }, - fetch_list=fetch_list) - avg_ppl = np.exp(ret_avg_cost[0]) - newest_ppl = np.mean(avg_ppl) - if i % 10 == 0: - print("step:%d ppl:%.3f" % (i, newest_ppl)) + + def train_loop(main_program): + """ train network """ + pass_num = args.pass_num + model_dir = args.model_dir + fetch_list = [avg_cost.name] + + exe.run(fluid.default_startup_program()) + total_time = 0.0 + for pass_idx in six.moves.xrange(pass_num): + epoch_idx = pass_idx + 1 + print("epoch_%d start" % epoch_idx) + + t0 = time.time() + i = 0 + newest_ppl = 0 + for data in train_reader(): + i += 1 + lod_src_wordseq = utils.to_lodtensor([dat[0] for dat in data], + place) + lod_dst_wordseq = utils.to_lodtensor([dat[1] for dat in data], + place) + ret_avg_cost = train_exe.run(main_program, + feed={ "src_wordseq": lod_src_wordseq, + "dst_wordseq": lod_dst_wordseq}, + fetch_list=fetch_list) + avg_ppl = np.exp(ret_avg_cost[0]) + newest_ppl = np.mean(avg_ppl) + if i % args.print_batch == 0: + print("step:%d ppl:%.3f" % (i, newest_ppl)) + + t1 = time.time() + total_time += t1 - t0 + print("epoch:%d num_steps:%d time_cost(s):%f" % + (epoch_idx, i, total_time / epoch_idx)) + save_dir = "%s/epoch_%d" % (model_dir, epoch_idx) + feed_var_names = ["src_wordseq", "dst_wordseq"] + fetch_vars = [avg_cost, acc] + fluid.io.save_inference_model(save_dir, feed_var_names, fetch_vars, exe) + print("model saved in %s" % save_dir) + #exe.close() + print("finish training") + + if args.is_local: + print("run local training") + train_loop(fluid.default_main_program()) - t1 = time.time() - total_time += t1 - t0 - print("epoch:%d num_steps:%d time_cost(s):%f" % - (epoch_idx, i, total_time / epoch_idx)) - - if pass_idx == pass_num - 1 and args.enable_ce: - #Note: The following logs are special for CE monitoring. - #Other situations do not need to care about these logs. 
- gpu_num = get_cards(args.enable_ce) - if gpu_num == 1: - print("kpis rsc15_pass_duration %s" % - (total_time / epoch_idx)) - print("kpis rsc15_avg_ppl %s" % newest_ppl) - else: - print("kpis rsc15_pass_duration_card%s %s" % \ - (gpu_num, total_time / epoch_idx)) - print("kpis rsc15_avg_ppl_card%s %s" % - (gpu_num, newest_ppl)) - save_dir = "%s/epoch_%d" % (model_dir, epoch_idx) - feed_var_names = ["src_wordseq", "dst_wordseq"] - fetch_vars = [avg_cost, acc] - fluid.io.save_inference_model(save_dir, feed_var_names, fetch_vars, exe) - print("model saved in %s" % save_dir) - - print("finish training") - - -def get_cards(args): - if args.enable_ce: - cards = os.environ.get('CUDA_VISIBLE_DEVICES') - num = len(cards.split(",")) - return num else: - return args.num_devices - - -def train_net(): - """ do training """ - args = parse_args() - train_file = args.train_file - test_file = args.test_file - use_cuda = True if args.use_cuda else False - parallel = True if args.parallel else False - print("use_cuda:", use_cuda, "parallel:", parallel) - batch_size = 50 - vocab, train_reader, test_reader = utils.prepare_data( - train_file, test_file,batch_size=batch_size * get_cards(args),\ - buffer_size=1000, word_freq_threshold=0) - train( - train_reader=train_reader, - vocab=vocab, - network=network, - hid_size=100, - base_lr=0.01, - batch_size=batch_size, - pass_num=10, - use_cuda=use_cuda, - parallel=parallel, - model_dir="model_recall20", - init_low_bound=-0.1, - init_high_bound=0.1) - - + print("run distribute training") + port = os.getenv("PADDLE_PORT", "6174") + pserver_ips = os.getenv("PADDLE_PSERVERS") # ip,ip... + eplist = [] + for ip in pserver_ips.split(","): + eplist.append(':'.join([ip, port])) + pserver_endpoints = ",".join(eplist) # ip:port,ip:port... 
+ trainers = int(os.getenv("PADDLE_TRAINERS_NUM", "0")) + current_endpoint = os.getenv("POD_IP") + ":" + port + trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0")) + t = fluid.DistributeTranspiler() + t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers) + if training_role == "PSERVER": + pserver_prog = t.get_pserver_program(current_endpoint) + pserver_startup = t.get_startup_program(current_endpoint, + pserver_prog) + exe.run(pserver_startup) + exe.run(pserver_prog) + elif training_role == "TRAINER": + train_loop(t.get_trainer_program()) if __name__ == "__main__": - train_net() + train() diff --git a/fluid/PaddleRec/gru4rec/train_data/small_train.txt b/fluid/PaddleRec/gru4rec/train_data/small_train.txt new file mode 100644 index 00000000..6252a52c --- /dev/null +++ b/fluid/PaddleRec/gru4rec/train_data/small_train.txt @@ -0,0 +1,100 @@ +197 196 198 236 +93 93 384 362 363 43 +336 364 407 +421 322 +314 388 +128 58 +138 138 +46 46 46 +34 34 57 57 57 342 228 321 346 357 59 376 +110 110 +135 94 135 +27 250 27 +129 118 +18 18 18 +81 81 89 89 +27 27 +20 20 20 20 20 212 +33 33 33 33 +62 62 62 63 63 55 248 124 381 428 383 382 43 43 261 63 +90 90 78 78 +399 397 202 141 104 104 245 192 191 271 +239 332 283 88 +187 313 +136 136 324 +41 41 +352 128 +413 414 +410 45 45 45 1 1 1 1 1 1 1 1 31 31 31 31 +92 334 92 +95 285 +215 249 +390 41 +116 116 +300 252 +2 2 2 2 2 +8 8 8 8 8 8 +53 241 259 +118 129 126 94 137 208 216 299 +209 368 139 418 419 +311 180 +303 302 203 284 +369 32 32 32 32 337 +207 47 47 47 +106 107 +143 143 +179 178 +109 109 +405 79 79 371 246 +251 417 427 +333 88 387 358 123 348 394 360 36 365 +3 3 3 3 3 +189 188 +398 425 +107 406 +281 201 141 +2 2 2 +359 54 +395 385 293 +60 60 60 121 121 233 58 58 +24 199 175 24 24 24 351 386 106 +115 294 +122 122 127 127 +35 35 +282 393 +277 140 140 343 225 123 36 36 36 221 114 114 59 59 117 117 247 367 219 258 222 301 375 350 353 111 111 +275 272 273 274 331 330 305 108 76 76 108 +26 26 26 408 26 +290 18 210 291 
+372 139 424 113 +341 340 335 +120 370 +224 200 +426 416 +137 319 +402 55 +54 54 +327 119 +125 125 +391 396 354 355 389 +142 142 +295 320 +113 366 +253 85 85 +56 56 310 309 308 307 278 25 25 19 19 3 312 19 19 19 3 25 +220 338 +34 130 +130 120 380 315 +339 422 +379 378 +95 56 392 115 +55 124 +126 34 +349 373 361 +195 194 +75 75 +64 64 64 +35 35 +40 40 40 242 77 244 77 243 +257 316 +103 306 102 51 52 103 105 52 52 292 318 112 286 345 237 276 112 51 102 105 diff --git a/fluid/PaddleRec/gru4rec/utils.py b/fluid/PaddleRec/gru4rec/utils.py index f6c28c9d..5dec9b75 100644 --- a/fluid/PaddleRec/gru4rec/utils.py +++ b/fluid/PaddleRec/gru4rec/utils.py @@ -5,7 +5,7 @@ import time import numpy as np import paddle.fluid as fluid import paddle - +import os def to_lodtensor(data, place): """ convert to LODtensor """ @@ -22,36 +22,36 @@ def to_lodtensor(data, place): res.set_lod([lod]) return res +def get_vocab_size(vocab_path): + with open(vocab_path, "r") as rf: + line = rf.readline() + return int(line.strip()) -def prepare_data(train_filename, - test_filename, +def prepare_data(file_dir, + vocab_path, batch_size, buffer_size=1000, word_freq_threshold=0, - enable_ce=False): + is_train=True): """ prepare the English Pann Treebank (PTB) data """ print("start constuct word dict") - vocab = build_dict(word_freq_threshold, train_filename, test_filename) - print("construct word dict done\n") - if enable_ce: - train_reader = paddle.batch( - train( - train_filename, vocab, buffer_size, data_type=DataType.SEQ), - batch_size) - else: - train_reader = sort_batch( + if is_train: + vocab_size = get_vocab_size(vocab_path) + reader = sort_batch( paddle.reader.shuffle( train( - train_filename, vocab, buffer_size, data_type=DataType.SEQ), + file_dir, buffer_size, data_type=DataType.SEQ), buf_size=buffer_size), batch_size, batch_size * 20) - test_reader = sort_batch( - test( - test_filename, vocab, buffer_size, data_type=DataType.SEQ), - batch_size, - batch_size * 20) - return vocab, 
train_reader, test_reader + else: + reader = sort_batch( + test( + file_dir, buffer_size, data_type=DataType.SEQ), + batch_size, + batch_size * 20) + vocab_size = 0 + return vocab_size, reader def sort_batch(reader, batch_size, sort_group_size, drop_last=False): @@ -103,57 +103,25 @@ def sort_batch(reader, batch_size, sort_group_size, drop_last=False): class DataType(object): SEQ = 2 - -def word_count(input_file, word_freq=None): - """ - compute word count from corpus - """ - if word_freq is None: - word_freq = collections.defaultdict(int) - - for l in input_file: - for w in l.strip().split(): - word_freq[w] += 1 - - return word_freq - - -def build_dict(min_word_freq=50, train_filename="", test_filename=""): - """ - Build a word dictionary from the corpus, Keys of the dictionary are words, - and values are zero-based IDs of these words. - """ - with open(train_filename) as trainf: - with open(test_filename) as testf: - word_freq = word_count(testf, word_count(trainf)) - - word_freq = [x for x in six.iteritems(word_freq) if x[1] > min_word_freq] - word_freq_sorted = sorted(word_freq, key=lambda x: (-x[1], x[0])) - words, _ = list(zip(*word_freq_sorted)) - word_idx = dict(list(zip(words, six.moves.range(len(words))))) - return word_idx - - -def reader_creator(filename, word_idx, n, data_type): +def reader_creator(file_dir, n, data_type): def reader(): - with open(filename) as f: - for l in f: - if DataType.SEQ == data_type: - l = l.strip().split() - l = [word_idx.get(w) for w in l] - src_seq = l[:len(l) - 1] - trg_seq = l[1:] - if n > 0 and len(src_seq) > n: continue - yield src_seq, trg_seq - else: - assert False, 'error data type' - + files = os.listdir(file_dir) + for fi in files: + with open(file_dir + '/' + fi, "r") as f: + for l in f: + if DataType.SEQ == data_type: + l = l.strip().split() + l = [w for w in l] + src_seq = l[:len(l) - 1] + trg_seq = l[1:] + if n > 0 and len(src_seq) > n: continue + yield src_seq, trg_seq + else: + assert False, 'error data type' 
return reader +def train(train_dir, n, data_type=DataType.SEQ): + return reader_creator(train_dir, n, data_type) -def train(filename, word_idx, n, data_type=DataType.SEQ): - return reader_creator(filename, word_idx, n, data_type) - - -def test(filename, word_idx, n, data_type=DataType.SEQ): - return reader_creator(filename, word_idx, n, data_type) +def test(test_dir, n, data_type=DataType.SEQ): + return reader_creator(test_dir, n, data_type) diff --git a/fluid/PaddleRec/gru4rec/vocab.txt b/fluid/PaddleRec/gru4rec/vocab.txt new file mode 100644 index 00000000..c15fb720 --- /dev/null +++ b/fluid/PaddleRec/gru4rec/vocab.txt @@ -0,0 +1 @@ +501 -- GitLab