From c3f68339085283973c0b236c108ddd5056c5bf8e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?yudongxu=28=E8=AE=B8=E7=85=9C=E4=B8=9C=29?= Date: Fri, 29 May 2020 14:55:37 +0800 Subject: [PATCH] fix some bugs --- PaddleRec/ctr/wide_deep/data_preparation.py | 4 ++-- PaddleRec/ctr/wide_deep/infer.py | 2 ++ PaddleRec/ctr/wide_deep/train.py | 2 ++ PaddleRec/dssm/README.md | 10 +++++++++ PaddleRec/dssm/dssm.py | 5 ++++- PaddleRec/dssm/infer.py | 2 ++ PaddleRec/multi_task/esmm/README.md | 3 +++ PaddleRec/multi_task/esmm/cpu_infer.sh | 1 + .../multi_task/esmm/dataset_generator.py | 4 ++-- PaddleRec/multi_task/esmm/gpu_infer.sh | 10 +++++---- PaddleRec/multi_task/esmm/gpu_train.sh | 8 +++---- PaddleRec/multi_task/esmm/infer.py | 9 +++++--- PaddleRec/multi_task/esmm/train.py | 8 +++++++ PaddleRec/multi_task/esmm/utils.py | 2 +- PaddleRec/ncf/evaluate.py | 2 +- PaddleRec/ncf/get_train_data.py | 2 +- PaddleRec/rerank/listwise/README.md | 14 ++++++++++++ PaddleRec/rerank/listwise/args.py | 2 +- PaddleRec/rerank/listwise/train.py | 22 +++++++++---------- PaddleRec/youtube_dnn/README.md | 10 +++++++++ PaddleRec/youtube_dnn/infer.py | 3 +++ PaddleRec/youtube_dnn/train.py | 3 +++ 22 files changed, 97 insertions(+), 31 deletions(-) diff --git a/PaddleRec/ctr/wide_deep/data_preparation.py b/PaddleRec/ctr/wide_deep/data_preparation.py index 928424b3..ca0e110c 100644 --- a/PaddleRec/ctr/wide_deep/data_preparation.py +++ b/PaddleRec/ctr/wide_deep/data_preparation.py @@ -81,11 +81,11 @@ def build_model_columns(train_data_path, test_data_path): train_df['label'] = train_df['income_bracket'].apply(lambda x : 1 if x == '>50K' else 0) test_df['label'] = test_df['income_bracket'].apply(lambda x : 1 if x == '>50K' else 0) - with io.open('train_data/columns.txt','w') as f: + with open('train_data/columns.txt','w') as f: write_str = str(len(wide_columns)) + '\n' + str(len(deep_columns)) + '\n' f.write(write_str) f.close() - with io.open('test_data/columns.txt','w') as f: + with open('test_data/columns.txt','w') as f: write_str = str(len(wide_columns)) + '\n' + str(len(deep_columns)) + '\n' f.write(write_str) f.close() diff --git a/PaddleRec/ctr/wide_deep/infer.py b/PaddleRec/ctr/wide_deep/infer.py index 68c0bfb9..104333c6 100644 --- a/PaddleRec/ctr/wide_deep/infer.py +++ b/PaddleRec/ctr/wide_deep/infer.py @@ -69,5 +69,7 @@ def run_infer(args,test_data_path): if __name__ == "__main__": args = args.parse_args() + logger.info("batch_size: {}, use_gpu: {}, test_epoch: {}, test_data_path: {}, model_dir:{}, hidden1_units: {}, hidden2_units: {}, hidden3_units: {}".format( + args.batch_size, args.use_gpu, args.test_epoch, args.test_data_path, args.model_dir, args.hidden1_units, args.hidden2_units, args.hidden3_units)) run_infer(args, args.test_data_path) \ No newline at end of file diff --git a/PaddleRec/ctr/wide_deep/train.py b/PaddleRec/ctr/wide_deep/train.py index 65a3a980..c2d9a927 100644 --- a/PaddleRec/ctr/wide_deep/train.py +++ b/PaddleRec/ctr/wide_deep/train.py @@ -47,4 +47,6 @@ def train(args, train_data_path): if __name__ == "__main__": args = args.parse_args() + logger.info("epoch:{}, batch_size: {}, use_gpu: {}, train_data_path: {}, model_dir: {}, hidden1_units: {}, hidden2_units: {}, hidden3_units: {}".format( + args.epoch, args.batch_size, args.use_gpu, args.train_data_path, args.model_dir, args.hidden1_units, args.hidden2_units, args.hidden3_units)) train(args, args.train_data_path) diff --git a/PaddleRec/dssm/README.md b/PaddleRec/dssm/README.md index e19fcbe4..ec71458d 100644 --- a/PaddleRec/dssm/README.md +++ b/PaddleRec/dssm/README.md @@ -23,6 +23,16 @@ DSSM[《Learning Deep Structured Semantic Models for Web Search using Clickthrou python3.7 +## 数据集说明 + +由于论文没有公开数据集,本项目构造数据验证网络的正确性,其说明如下: + +query:随机构造的query向量表示 + +doc_pos:随机构造doc正例向量表示 + +doc_neg_0~3为四个doc负例向量表示 + ## 单机训练 GPU环境 diff --git a/PaddleRec/dssm/dssm.py b/PaddleRec/dssm/dssm.py index b4a45f98..83309b98 100644 --- a/PaddleRec/dssm/dssm.py +++ b/PaddleRec/dssm/dssm.py @@ -57,7 +57,10 @@ def model(TRIGRAM_D = 1000, L1_N = 300, L2_N = 300, L3_N = 128, Neg = 4): return avg_loss, R_Q_D_p, [query] + [doc_pos] + doc_negs args = args.parse_args() -loss,R_Q_D_p, data_list = model(args.TRIGRAM_D,args.L1_N,args.L2_N,args.L3_N,args.Neg) +logger.info("use_gpu: {}, batch_size: {}, TRIGRAM_D: {}, L1_N:{}, L2_N: {}, L3_N: {}, Neg: {}, base_lr: {}, model_dir: {}".format( + args.use_gpu, args.batch_size, args.TRIGRAM_D, args.L1_N, args.L2_N, args.L3_N, args.Neg, args.base_lr, args.model_dir)) + +loss,R_Q_D_p, data_list = model(args.TRIGRAM_D, args.L1_N, args.L2_N, args.L3_N, args.Neg) sgd = fluid.optimizer.SGD(learning_rate=args.base_lr) sgd.minimize(loss) diff --git a/PaddleRec/dssm/infer.py b/PaddleRec/dssm/infer.py index b0cc9442..f91faf99 100644 --- a/PaddleRec/dssm/infer.py +++ b/PaddleRec/dssm/infer.py @@ -37,4 +37,6 @@ def infer(args): if __name__ == "__main__": args = args.parse_args() + logger.info("use_gpu: {}, model_dir: {}".format(args.use_gpu, args.model_dir)) + infer(args) \ No newline at end of file diff --git a/PaddleRec/multi_task/esmm/README.md b/PaddleRec/multi_task/esmm/README.md index 8c8d3991..2d17a732 100644 --- a/PaddleRec/multi_task/esmm/README.md +++ b/PaddleRec/multi_task/esmm/README.md @@ -96,6 +96,8 @@ GPU环境 ```sh python infer.py --use_gpu 1\ #是否使用gpu --batch_size 64\ #batch_size大小 + --cpu_num 2\ #cpu数量 + --model_dir ./model_dir \ #模型保存路径 --test_data_path ./test_data \ #训练数据路径 --vocab_path ./vocab_size.txt #embedding词汇表大小路径 ``` @@ -114,6 +116,7 @@ CPU环境 python infer.py --use_gpu 0\ #是否使用gpu --batch_size 64\ #batch_size大小 --cpu_num 2\ #cpu数量 + --model_dir ./model_dir \ #模型保存路径 --test_data_path ./test_data \ #训练数据路径 --vocab_path ./vocab_size.txt #embedding词汇表大小路径 ``` diff --git a/PaddleRec/multi_task/esmm/cpu_infer.sh b/PaddleRec/multi_task/esmm/cpu_infer.sh index 32cc1769..241075b4 100644 --- a/PaddleRec/multi_task/esmm/cpu_infer.sh +++ b/PaddleRec/multi_task/esmm/cpu_infer.sh @@ -1,5 +1,6 @@ python infer.py --use_gpu 0 \ --batch_size 64 \ --cpu_num 2 \ + --model_dir ./model_dir \ --test_data_path ./test_data \ --vocab_path ./vocab_size.txt \ No newline at end of file diff --git a/PaddleRec/multi_task/esmm/dataset_generator.py b/PaddleRec/multi_task/esmm/dataset_generator.py index 8e93977a..0731937b 100644 --- a/PaddleRec/multi_task/esmm/dataset_generator.py +++ b/PaddleRec/multi_task/esmm/dataset_generator.py @@ -9,7 +9,7 @@ all_field_id_dict = defaultdict(int) for i,field_id in enumerate(all_field_id): all_field_id_dict[field_id] = [False,i] -class CriteoDataset(dg.MultiSlotStringDataGenerator): +class Dataset(dg.MultiSlotStringDataGenerator): def generate_sample(self, line): @@ -40,5 +40,5 @@ class CriteoDataset(dg.MultiSlotStringDataGenerator): yield output return reader -d = CriteoDataset() +d = Dataset() d.run_from_stdin() \ No newline at end of file diff --git a/PaddleRec/multi_task/esmm/gpu_infer.sh b/PaddleRec/multi_task/esmm/gpu_infer.sh index 2236122d..ba979815 100644 --- a/PaddleRec/multi_task/esmm/gpu_infer.sh +++ b/PaddleRec/multi_task/esmm/gpu_infer.sh @@ -1,4 +1,6 @@ -python infer.py --use_gpu 1\ - --batch_size 64\ - --test_data_path ./test_data\ - --vocab_path ./vocab_size.txt \ No newline at end of file +CUDA_VISIBLE_DEVICES=0 python infer.py --use_gpu 1\ + --batch_size 64\ + --cpu_num 2 \ + --model_dir ./model_dir \ + --test_data_path ./test_data\ + --vocab_path ./vocab_size.txt \ No newline at end of file diff --git a/PaddleRec/multi_task/esmm/gpu_train.sh b/PaddleRec/multi_task/esmm/gpu_train.sh index 98d74c38..1b5d531d 100644 --- a/PaddleRec/multi_task/esmm/gpu_train.sh +++ b/PaddleRec/multi_task/esmm/gpu_train.sh @@ -1,8 +1,8 @@ -CUDA_VISIBLE_DEVICES=0 python train.py --use_gpu True\ +CUDA_VISIBLE_DEVICES=0 python train.py --use_gpu 1\ --epochs 100\ --batch_size 64\ --embed_size 12\ --cpu_num 2\ - --model_dir './model_dir'\ - --train_data_path './train_data'\ - --vocab_path './vocab/vocab_size.txt' \ No newline at end of file + --model_dir ./model_dir\ + --train_data_path ./train_data\ + --vocab_path ./vocab/vocab_size.txt \ No newline at end of file diff --git a/PaddleRec/multi_task/esmm/infer.py b/PaddleRec/multi_task/esmm/infer.py index 2cc5e62a..aa0c6784 100644 --- a/PaddleRec/multi_task/esmm/infer.py +++ b/PaddleRec/multi_task/esmm/infer.py @@ -33,7 +33,7 @@ def run_infer(args, model_path, test_data_path, vocab_size): inputs = esmm_model.input_data() avg_cost,auc_ctr,auc_ctcvr= esmm_model.net(inputs, vocab_size, args.embed_size) - dataset, file_list = utils.get_dataset(inputs, test_data_path, args.batch_size,args.cpu_num) + dataset, file_list = utils.get_dataset(inputs, test_data_path, args.batch_size, args.cpu_num) exe = fluid.Executor(place) fluid.load(fluid.default_main_program(), os.path.join(model_path, "checkpoint"), exe) @@ -51,6 +51,9 @@ def run_infer(args, model_path, test_data_path, vocab_size): if __name__ == "__main__": args = args.parse_args() + + logger.info("use_gpu: {}, epochs: {}, batch_size: {}, cpu_num: {}, model_dir: {}, test_data_path: {}, vocab_path: {}".format(args.use_gpu, args.epochs, + args.batch_size, args.cpu_num, args.model_dir, args.test_data_path, args.vocab_path)) model_list = [] for _, dir, _ in os.walk(args.model_dir): for model in dir: @@ -58,10 +61,10 @@ if __name__ == "__main__": path = os.path.join(args.model_dir, model) model_list.append(path) - vocab_size =utils.get_vocab_size(args.vocab_path) + vocab_size = utils.get_vocab_size(args.vocab_path) for model in model_list: logger.info("Test model {}".format(model)) - run_infer(args, model,args.test_data_path) + run_infer(args, model,args.test_data_path, vocab_size) \ No newline at end of file diff --git a/PaddleRec/multi_task/esmm/train.py b/PaddleRec/multi_task/esmm/train.py index fb3ec67e..321eb9e6 100644 --- a/PaddleRec/multi_task/esmm/train.py +++ b/PaddleRec/multi_task/esmm/train.py @@ -5,6 +5,11 @@ from net import ESMM import paddle import utils import args +import logging + +logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s') +logger = logging.getLogger("fluid") +logger.setLevel(logging.INFO) def train(args, vocab_size, train_data_path): esmm_model = ESMM() @@ -39,5 +44,8 @@ def train(args, vocab_size, train_data_path): if __name__ == "__main__": args = args.parse_args() + logger.info("use_gpu: {}, epochs: {}, batch_size: {}, embed_size: {}, cpu_num: {}, model_dir: {}, train_data_path: {}, vocab_path: {}".format(args.use_gpu, args.epochs, + args.batch_size, args.embed_size, args.cpu_num, args.model_dir, args.train_data_path, args.vocab_path)) + vocab_size =utils.get_vocab_size(args.vocab_path) train(args, vocab_size, args.train_data_path) diff --git a/PaddleRec/multi_task/esmm/utils.py b/PaddleRec/multi_task/esmm/utils.py index 7ab66dff..3bb3e9ff 100644 --- a/PaddleRec/multi_task/esmm/utils.py +++ b/PaddleRec/multi_task/esmm/utils.py @@ -14,7 +14,7 @@ all_field_id_dict = defaultdict(int) for i,field_id in enumerate(all_field_id): all_field_id_dict[field_id] = [False,i] -def get_dataset(inputs,files,batch_size,cpu_num): +def get_dataset(inputs, files,batch_size, cpu_num): dataset = fluid.DatasetFactory().create_dataset() dataset.set_use_var(inputs) dataset.set_pipe_command("python dataset_generator.py") diff --git a/PaddleRec/ncf/evaluate.py b/PaddleRec/ncf/evaluate.py index 1a655e45..e3cb6802 100644 --- a/PaddleRec/ncf/evaluate.py +++ b/PaddleRec/ncf/evaluate.py @@ -25,7 +25,7 @@ _model_path = None def run_infer(args, model_path, test_data_path): - test_data_generator = utils.CriteoDataset() + test_data_generator = utils.Dataset() with fluid.scope_guard(fluid.Scope()): test_reader = fluid.io.batch( diff --git a/PaddleRec/ncf/get_train_data.py b/PaddleRec/ncf/get_train_data.py index 44578dd1..e87bdcad 100644 --- a/PaddleRec/ncf/get_train_data.py +++ b/PaddleRec/ncf/get_train_data.py @@ -32,7 +32,7 @@ def get_train_data(filename, write_file, num_negatives): file = open(write_file, 'w') print("writing " + write_file) - for (u, i) in mat.keys(): + for (u, i) in mat: # positive instance user_input = str(u) item_input = str(i) diff --git a/PaddleRec/rerank/listwise/README.md b/PaddleRec/rerank/listwise/README.md index cdd23473..47e46beb 100644 --- a/PaddleRec/rerank/listwise/README.md +++ b/PaddleRec/rerank/listwise/README.md @@ -27,6 +27,20 @@ python3.7 +## 数据集说明 + +本项目构造数据集验证模型的正确性,字段说明如下: + +user_slot_name:用户端特征群id + +item_slot_name:item段特征群id + +lenght:item的长度 + +label:用户对给定的是否点击item的list + +注意:由于构造数据集的限制,本项目只用一个epoch,如果多个epoch,则多个epoch的数据是变化的,没有意义,因此只采用一个epoch。 + ## 单机训练 GPU环境 diff --git a/PaddleRec/rerank/listwise/args.py b/PaddleRec/rerank/listwise/args.py index d52791e2..5fc63297 100644 --- a/PaddleRec/rerank/listwise/args.py +++ b/PaddleRec/rerank/listwise/args.py @@ -22,7 +22,7 @@ import sys def parse_args(): parser = argparse.ArgumentParser(description=__doc__) - parser.add_argument("--epochs", type=int, default=20, help="epochs") + parser.add_argument("--epochs", type=int, default=1, help="epochs") parser.add_argument("--batch_size", type=int, default=32, help="batch_size") parser.add_argument("--test_epoch", type=int, default=1, help="test_epoch") parser.add_argument('--use_gpu', type=int, default=0, help='whether using gpu') diff --git a/PaddleRec/rerank/listwise/train.py b/PaddleRec/rerank/listwise/train.py index c8b5d310..78f9c5be 100644 --- a/PaddleRec/rerank/listwise/train.py +++ b/PaddleRec/rerank/listwise/train.py @@ -51,17 +51,17 @@ def train(args): train_reader = fluid.io.batch(train_data_generator.get_train_data(), batch_size=args.batch_size) loader = fluid.io.DataLoader.from_generator(feed_list=inputs, capacity=args.batch_size, iterable=True) loader.set_sample_list_generator(train_reader, places=place) - - for i in range(args.sample_size): - for batch_id, data in enumerate(loader()): - begin = time.time() - loss_val, auc = exe.run(program=fluid.default_main_program(), - feed=data, - fetch_list=[loss.name, auc_val], - return_numpy=True) - end = time.time() - logger.info("batch_id: {}, batch_time: {:.5f}s, loss: {:.5f}, auc: {:.5f}".format( - batch_id, end-begin, float(np.array(loss_val)), float(np.array(auc)))) + for epoch in range(args.epochs): + for i in range(args.sample_size): + for batch_id, data in enumerate(loader()): + begin = time.time() + loss_val, auc = exe.run(program=fluid.default_main_program(), + feed=data, + fetch_list=[loss.name, auc_val], + return_numpy=True) + end = time.time() + logger.info("epoch: {}, batch_id: {}, batch_time: {:.5f}s, loss: {:.5f}, auc: {:.5f}".format( + epoch, batch_id, end-begin, float(np.array(loss_val)), float(np.array(auc)))) #save model model_dir = os.path.join(args.model_dir, 'epoch_' + str(1), "checkpoint") diff --git a/PaddleRec/youtube_dnn/README.md b/PaddleRec/youtube_dnn/README.md index 60714ef4..4e12e457 100644 --- a/PaddleRec/youtube_dnn/README.md +++ b/PaddleRec/youtube_dnn/README.md @@ -34,6 +34,16 @@ python3.7 +## 数据集说明 + +由于原论文没有开源数据集,本项目随机构造数据,其字段如下: + +watch_vecs:随机构造用户历史观看视频的embedding表示 + +search_vec:随机构造用户搜索历史的embedding表示 + +other_feat:随机构造其他特征的tembedding表示 + ## 单机训练 GPU环境 diff --git a/PaddleRec/youtube_dnn/infer.py b/PaddleRec/youtube_dnn/infer.py index 45921739..009affee 100644 --- a/PaddleRec/youtube_dnn/infer.py +++ b/PaddleRec/youtube_dnn/infer.py @@ -48,6 +48,9 @@ def infer(args): if __name__ == "__main__": args = args.parse_args() + logger.info("use_gpu: {}, test_epoch: {}, model_dir: {}, user_vec_path: {}".format( + args.use_gpu, args.test_epoch, args.model_dir, args.user_vec_path)) + if(os.path.exists(args.user_vec_path)): os.system("rm " + args.user_vec_path) infer(args) \ No newline at end of file diff --git a/PaddleRec/youtube_dnn/train.py b/PaddleRec/youtube_dnn/train.py index 4eb90307..9ee33008 100644 --- a/PaddleRec/youtube_dnn/train.py +++ b/PaddleRec/youtube_dnn/train.py @@ -70,6 +70,9 @@ def train(args): if __name__ == "__main__": args = args.parse_args() + logger.info("use_gpu: {}, batch_size: {}, epochs: {}, watch_vec_size: {}, search_vec_size: {}, other_feat_size: {}, output_size: {}, model_dir: {}, test_epoch: {}, base_lr: {}, video_vec_path: {}".format( + args.use_gpu, args.batch_size, args.epochs, args.watch_vec_size, args.search_vec_size, args.other_feat_size, args.output_size, args.model_dir, args.test_epoch, args.base_lr, args.video_vec_path)) + if(os.path.exists(args.video_vec_path)): os.system("rm " + args.video_vec_path) train(args) -- GitLab