diff --git a/PaddleRec/dssm/dssm.py b/PaddleRec/dssm/dssm.py index 4d959f7b58a50ab4c6739b9e7c16e515300e9b66..b4a45f989244a39b6cd4f16df844aa80cf894931 100644 --- a/PaddleRec/dssm/dssm.py +++ b/PaddleRec/dssm/dssm.py @@ -108,12 +108,3 @@ for i in range(sample_size): feed_var_names = ["query", "doc_pos"] fetch_vars = [R_Q_D_p] fluid.io.save_inference_model(args.model_dir, feed_var_names, fetch_vars, exe) - - - - - - - - - diff --git a/PaddleRec/dssm/infer.py b/PaddleRec/dssm/infer.py index 55cfd43aa9cb8e888eb8aabd580e69a11f3d06a6..b0cc9442be4df8630a008b9756fe2e506675f24a 100644 --- a/PaddleRec/dssm/infer.py +++ b/PaddleRec/dssm/infer.py @@ -14,7 +14,6 @@ def infer(args): with fluid.scope_guard(fluid.Scope()): infer_program, feed_target_names, fetch_vars = fluid.io.load_inference_model(args.model_dir, exe) - #构造测试数据 sample_size = 100 l_Qs = [] pos_l_Ds = [] diff --git a/PaddleRec/dssm/infer_cpu.sh b/PaddleRec/dssm/infer_cpu.sh index ce3804962ab4012268eee1e5d7845f3e3d5cd6f7..bd473442fc3db90b194bcf55be1abd010cf5f6d0 100644 --- a/PaddleRec/dssm/infer_cpu.sh +++ b/PaddleRec/dssm/infer_cpu.sh @@ -1,2 +1,2 @@ python infer.py --use_gpu 0 \ - --model_dir 'model_dir' \ No newline at end of file + --model_dir ./model_dir \ No newline at end of file diff --git a/PaddleRec/dssm/infer_gpu.sh b/PaddleRec/dssm/infer_gpu.sh index dcce70b3001277e07b155f1e2de77457613a75c1..ce2367c6cfffd4daf01112b9a1bcef2747f56a6d 100644 --- a/PaddleRec/dssm/infer_gpu.sh +++ b/PaddleRec/dssm/infer_gpu.sh @@ -1,2 +1,2 @@ CUDA_VISIBLE_DEVICES=0 python infer.py --use_gpu 1 \ - --model_dir 'model_dir' \ No newline at end of file + --model_dir ./model_dir \ No newline at end of file diff --git a/PaddleRec/dssm/train_cpu.sh b/PaddleRec/dssm/train_cpu.sh index 1ac62be7dc42e03a93d42463e77201994160daf4..0c6951a521e602d07e2b69ad7d629436a4db01df 100644 --- a/PaddleRec/dssm/train_cpu.sh +++ b/PaddleRec/dssm/train_cpu.sh @@ -6,4 +6,4 @@ python dssm.py --use_gpu 0 \ --L3_N 128 \ --Neg 4 \ --base_lr 0.01 \ - --model_dir 'model_dir' \ No newline at end of file + --model_dir ./model_dir \ No newline at end of file diff --git a/PaddleRec/dssm/train_gpu.sh b/PaddleRec/dssm/train_gpu.sh index 7c80df1ba5adc05f639dccd8ca0688897608dbaf..4904aaea29a60e752fcdcf604c08e4490b05c6c8 100644 --- a/PaddleRec/dssm/train_gpu.sh +++ b/PaddleRec/dssm/train_gpu.sh @@ -6,4 +6,4 @@ CUDA_VISIBLE_DEVICES=0 python dssm.py --use_gpu 1 \ --L3_N 128 \ --Neg 4 \ --base_lr 0.01 \ - --model_dir 'model_dir' \ No newline at end of file + --model_dir ./model_dir \ No newline at end of file diff --git a/PaddleRec/multi_task/esmm/README.md b/PaddleRec/multi_task/esmm/README.md index e79a7323734de4df75d9450b15e4d32a30675230..8c8d39912003eebe2e6d7daf8ee7098a8abe9982 100644 --- a/PaddleRec/multi_task/esmm/README.md +++ b/PaddleRec/multi_task/esmm/README.md @@ -7,7 +7,6 @@ ├── net.py # ESMM网络结构 ├── train.py # ESMM模型训练脚本 ├── infer.py # ESMM模型预测脚本 -├── reader.py # 数据预处理文件 ├── utils.py # 通用函数 ├── args.py # 参数脚本 ├── get_data.sh # 生成训练数据脚本 @@ -16,6 +15,7 @@ ├── cpu_train.sh # cpu训练shell脚本 ├── gpu_infer.sh # gpu预测shell脚本 ├── cpu_infer.sh # cpu预测shell脚本 +├── vocab_size.txt #词汇表文件 ``` ## 简介 @@ -50,14 +50,14 @@ GPU环境 在gpu_train.sh脚本文件中设置好数据路径、参数。 ```shell -CUDA_VISIBLE_DEVICES=0 python train.py --use_gpu True\ #是否使用gpu +CUDA_VISIBLE_DEVICES=0 python train.py --use_gpu 1\ #是否使用gpu --epochs 100\ #训练轮次 --batch_size 64\ #batch_size大小 --embed_size 12\ #每个featsigns的embedding维度 --cpu_num 2\ #cpu数量 --model_dir ./model_dir \ #模型保存路径 --train_data_path ./train_data \ #训练数据路径 - --vocab_path 
./vocab/vocab_size.txt #embedding词汇表大小路径 + --vocab_path ./vocab_size.txt #embedding词汇表大小路径 ``` 修改脚本的可执行权限并运行 @@ -71,14 +71,14 @@ CPU环境 在cpu_train.sh脚本文件中设置好数据路径、参数。 ```shell -python train.py --use_gpu False\ #是否使用gpu +python train.py --use_gpu 0\ #是否使用gpu --epochs 100\ #训练轮次 --batch_size 64\ #batch_size大小 --embed_size 12\ #每个featsigns的embedding维度 --cpu_num 2\ #cpu数量 --model_dir ./model_dir \ #模型保存路径 --train_data_path ./train_data \ #训练数据路径 - --vocab_path ./vocab/vocab_size.txt #embedding词汇表大小路径 + --vocab_path ./vocab_size.txt #embedding词汇表大小路径 ``` 修改脚本的可执行权限并运行 @@ -94,10 +94,10 @@ GPU环境 在gpu_infer.sh脚本文件中设置好数据路径、参数。 ```sh -python infer.py --use_gpu True\ #是否使用gpu +python infer.py --use_gpu 1\ #是否使用gpu --batch_size 64\ #batch_size大小 --test_data_path ./test_data \ #训练数据路径 - --vocab_path ./vocab/vocab_size.txt #embedding词汇表大小路径 + --vocab_path ./vocab_size.txt #embedding词汇表大小路径 ``` 修改脚本的可执行权限并运行 @@ -111,11 +111,11 @@ CPU环境 在cpu_infer.sh脚本文件中设置好数据路径、参数。 ```shell -python infer.py --use_gpu False\ #是否使用gpu +python infer.py --use_gpu 0\ #是否使用gpu --batch_size 64\ #batch_size大小 --cpu_num 2\ #cpu数量 --test_data_path ./test_data \ #训练数据路径 - --vocab_path ./vocab/vocab_size.txt #embedding词汇表大小路径 + --vocab_path ./vocab_size.txt #embedding词汇表大小路径 ``` 修改脚本的可执行权限并运行 diff --git a/PaddleRec/multi_task/esmm/args.py b/PaddleRec/multi_task/esmm/args.py index 69e70bb3aafa5863eabef0573e8811544fa8e32e..aca10d9dc7bd48a0adc046f7cc8865467043bdc3 100644 --- a/PaddleRec/multi_task/esmm/args.py +++ b/PaddleRec/multi_task/esmm/args.py @@ -27,12 +27,12 @@ def parse_args(): parser.add_argument("--batch_size", type=int, default=64, help="batch_size") parser.add_argument("--embed_size", type=int, default=12, help="embed_size") parser.add_argument("--cpu_num", type=int, default=2, help="cpu_num") - parser.add_argument('--use_gpu', type=bool, default=False, help='whether using gpu') + parser.add_argument('--use_gpu', type=int, default=0, help='whether using gpu') parser.add_argument('--model_dir', type=str, default='./model_dir', help='whether using gpu') parser.add_argument('--train_data_path', type=str, default='./train_data', help='train_data_path') parser.add_argument('--test_data_path', type=str, default='./test_data', help='test_data_path') - parser.add_argument('--vocab_path', type=str, default='./vocab/vocab_size.txt', help='vocab_path') + parser.add_argument('--vocab_path', type=str, default='./vocab_size.txt', help='vocab_path') parser.add_argument("--train_sample_size", type=int, default=sys.maxsize, help="train_sample_size") parser.add_argument("--test_sample_size", type=int, default=sys.maxsize, help="test_sample_size") diff --git a/PaddleRec/multi_task/esmm/cpu_infer.sh b/PaddleRec/multi_task/esmm/cpu_infer.sh index 141280c5cf852d88401b85d0fa29e05775e1aad8..32cc17691f851bc4b6aca84299eb4da67a15eff0 100644 --- a/PaddleRec/multi_task/esmm/cpu_infer.sh +++ b/PaddleRec/multi_task/esmm/cpu_infer.sh @@ -1,5 +1,5 @@ -python infer.py --use_gpu False\ #是否使用gpu - --batch_size 64\ #batch_size大小 - --cpu_num 2\ #cpu数量 - --test_data_path ./test_data \ #训练数据路径 - --vocab_path ./vocab/vocab_size.txt #embedding词汇表大小路径 \ No newline at end of file +python infer.py --use_gpu 0 \ + --batch_size 64 \ + --cpu_num 2 \ + --test_data_path ./test_data \ + --vocab_path ./vocab_size.txt \ No newline at end of file diff --git a/PaddleRec/multi_task/esmm/cpu_train.sh b/PaddleRec/multi_task/esmm/cpu_train.sh index 5710b7e470ed84cf0b039576c58aa9703129c84e..c9b201c323215bfc819663c8f53c3ddbe1e62eb4 100644 --- a/PaddleRec/multi_task/esmm/cpu_train.sh 
+++ b/PaddleRec/multi_task/esmm/cpu_train.sh @@ -1,8 +1,8 @@ -python train.py --use_gpu False\ #是否使用gpu - --epochs 100\ #训练轮次 - --batch_size 64\ #batch_size大小 - --embed_size 12\ #每个featsigns的embedding维度 - --cpu_num 2\ #cpu数量 - --model_dir ./model_dir \ #模型保存路径 - --train_data_path ./train_data \ #训练数据路径 - --vocab_path ./vocab/vocab_size.txt #embedding词汇表大小路径 \ No newline at end of file +python train.py --use_gpu 0 \ + --epochs 100 \ + --batch_size 64 \ + --embed_size 12 \ + --cpu_num 2 \ + --model_dir ./model_dir \ + --train_data_path ./train_data \ + --vocab_path ./vocab_size.txt \ No newline at end of file diff --git a/PaddleRec/multi_task/esmm/get_data.sh b/PaddleRec/multi_task/esmm/get_data.sh index c5698ffa5a54da51a4bbb673c4bb8a7d728a4901..960f9f9cea43289ba40a06596e05b1e591464fa8 100644 --- a/PaddleRec/multi_task/esmm/get_data.sh +++ b/PaddleRec/multi_task/esmm/get_data.sh @@ -1,26 +1,5 @@ mkdir train_data mkdir test_data -mkdir vocab -mkdir data -train_source_path="./data/sample_train.tar.gz" -train_target_path="train_data" -test_source_path="./data/sample_test.tar.gz" -test_target_path="test_data" -cd data -echo "downloading sample_train.tar.gz......" -curl -# 'http://jupter-oss.oss-cn-hangzhou.aliyuncs.com/file/opensearch/documents/408/sample_train.tar.gz?Expires=1586435769&OSSAccessKeyId=LTAIGx40tjZWxj6q&Signature=ahUDqhvKT1cGjC4%2FIER2EWtq7o4%3D&response-content-disposition=attachment%3B%20' -H 'Proxy-Connection: keep-alive' -H 'Upgrade-Insecure-Requests: 1' -H 'User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36' -H 'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9' -H 'Accept-Language: zh-CN,zh;q=0.9' --compressed --insecure -o sample_train.tar.gz -cd .. -echo "unzipping sample_train.tar.gz......" -tar -xzvf ${train_source_path} -C ${train_target_path} && rm -rf ${train_source_path} -cd data -echo "downloading sample_test.tar.gz......" -curl -# 'http://jupter-oss.oss-cn-hangzhou.aliyuncs.com/file/opensearch/documents/408/sample_test.tar.gz?Expires=1586435821&OSSAccessKeyId=LTAIGx40tjZWxj6q&Signature=OwLMPjt1agByQtRVi8pazsAliNk%3D&response-content-disposition=attachment%3B%20' -H 'Proxy-Connection: keep-alive' -H 'Upgrade-Insecure-Requests: 1' -H 'User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36' -H 'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9' -H 'Accept-Language: zh-CN,zh;q=0.9' --compressed --insecure -o sample_test.tar.gz -cd .. -echo "unzipping sample_test.tar.gz......" -tar -xzvf ${test_source_path} -C ${test_target_path} && rm -rf ${test_source_path} -echo "preprocessing data......" 
-python reader.py --train_data_path ${train_target_path} \
-                 --test_data_path ${test_target_path} \
-                 --vocab_path vocab/vocab_size.txt \
-                 --train_sample_size 6400 \
-                 --test_sample_size 6400 \
+
+wget -P train_data/ https://paddlerec.bj.bcebos.com/esmm/traindata.csv
+wget -P test_data/ https://paddlerec.bj.bcebos.com/esmm/testdata.csv
diff --git a/PaddleRec/multi_task/esmm/gpu_infer.sh b/PaddleRec/multi_task/esmm/gpu_infer.sh
index 3707741e6ae1062046fd5795604a5cdd854ee550..2236122d6642342bf06a6b835534389c37b31748 100644
--- a/PaddleRec/multi_task/esmm/gpu_infer.sh
+++ b/PaddleRec/multi_task/esmm/gpu_infer.sh
@@ -1,4 +1,4 @@
-python infer.py --use_gpu True\  #是否使用gpu
-    --batch_size 64\  #batch_size大小
-    --test_data_path ./test_data \ #训练数据路径
-    --vocab_path ./vocab/vocab_size.txt  #embedding词汇表大小路径
\ No newline at end of file
+python infer.py --use_gpu 1 \
+    --batch_size 64 \
+    --test_data_path ./test_data \
+    --vocab_path ./vocab_size.txt
\ No newline at end of file
diff --git a/PaddleRec/multi_task/esmm/gpu_train.sh b/PaddleRec/multi_task/esmm/gpu_train.sh
index a9ba0c936c95c97260ff3f7f6d9d7c5d5f3545e3..98d74c38b2aec1c6badbb3789a1a7fa2845eb494 100644
--- a/PaddleRec/multi_task/esmm/gpu_train.sh
+++ b/PaddleRec/multi_task/esmm/gpu_train.sh
@@ -1,8 +1,8 @@
-CUDA_VISIBLE_DEVICES=0 python train.py --use_gpu True\  #是否使用gpu
-    --epochs 100\  #训练轮次
-    --batch_size 64\  #batch_size大小
-    --embed_size 12\  #每个featsigns的embedding维度
-    --cpu_num 2\  #cpu数量
-    --model_dir ./model_dir \  #模型保存路径
-    --train_data_path ./train_data \  #训练数据路径
-    --vocab_path ./vocab/vocab_size.txt  #embedding词汇表大小路径
\ No newline at end of file
+CUDA_VISIBLE_DEVICES=0 python train.py --use_gpu 1 \
+    --epochs 100 \
+    --batch_size 64 \
+    --embed_size 12 \
+    --cpu_num 2 \
+    --model_dir ./model_dir \
+    --train_data_path ./train_data \
+    --vocab_path ./vocab_size.txt
\ No newline at end of file
diff --git a/PaddleRec/multi_task/esmm/infer.py b/PaddleRec/multi_task/esmm/infer.py
index 4c7353374c53e2a531a9947a6433fc5d04977e32..2cc5e62a4d2ef334832fea5f5582f1a95e3094f0 100644
--- a/PaddleRec/multi_task/esmm/infer.py
+++ b/PaddleRec/multi_task/esmm/infer.py
@@ -21,7 +21,7 @@ def set_zero(place):
         param_array = np.zeros(param._get_dims()).astype("int64")
         param.set(param_array, place)
 
-def run_infer(args,model_path,test_data_path,vocab_size):
+def run_infer(args, model_path, test_data_path, vocab_size):
     place = fluid.CPUPlace()
     esmm_model = ESMM()
 
@@ -33,10 +33,10 @@
 
         inputs = esmm_model.input_data()
         avg_cost,auc_ctr,auc_ctcvr= esmm_model.net(inputs, vocab_size, args.embed_size)
-        dataset, file_list = utils.get_dataset(inputs, test_data_path,args.batch_size,args.cpu_num)
+        dataset, file_list = utils.get_dataset(inputs, test_data_path, args.batch_size,args.cpu_num)
 
         exe = fluid.Executor(place)
-        fluid.load(fluid.default_main_program(),os.path.join(model_path, "checkpoint"), exe)
+        fluid.load(fluid.default_main_program(), os.path.join(model_path, "checkpoint"), exe)
 
         set_zero(place)
diff --git a/PaddleRec/multi_task/esmm/net.py b/PaddleRec/multi_task/esmm/net.py
index b8a0092b600b16868e3d1c047dec49ff3c9ce2c1..0c71a60195b95b5e29899dd0faf3d8afc4d2b530 100644
--- a/PaddleRec/multi_task/esmm/net.py
+++ b/PaddleRec/multi_task/esmm/net.py
@@ -7,7 +7,7 @@ import args
 
 class ESMM(object):
 
-    def fc(self,tag, data, out_dim, active='prelu'):
+    def fc(self, tag, data, out_dim, active='prelu'):
         init_stddev = 1.0
         scales = 1.0 / np.sqrt(data.shape[1])
 
@@ -35,7 +35,7 @@
 
         return inputs
 
-    def 
net(self,inputs,vocab_size,embed_size): + def net(self, inputs, vocab_size, embed_size): emb = [] for data in inputs[0:-2]: @@ -47,7 +47,7 @@ class ESMM(object): ), is_sparse=True) - field_emb = fluid.layers.sequence_pool(input=feat_emb,pool_type='sum') + field_emb = fluid.layers.sequence_pool(input=feat_emb, pool_type='sum') emb.append(field_emb) concat_emb = fluid.layers.concat(emb, axis=1) @@ -60,7 +60,7 @@ class ESMM(object): # cvr cvr_fc1 = self.fc('cvr_fc1', concat_emb, 200, active) cvr_fc2 = self.fc('cvr_fc2', cvr_fc1, 80, active) - cvr_out = self.fc('cvr_out', cvr_fc2, 2,'softmax') + cvr_out = self.fc('cvr_out', cvr_fc2, 2, 'softmax') ctr_clk = inputs[-2] ctcvr_buy = inputs[-1] @@ -69,10 +69,10 @@ class ESMM(object): cvr_prop_one = fluid.layers.slice(cvr_out, axes=[1], starts=[1], ends=[2]) ctcvr_prop_one = fluid.layers.elementwise_mul(ctr_prop_one, cvr_prop_one) - ctcvr_prop = fluid.layers.concat(input=[1-ctcvr_prop_one,ctcvr_prop_one], axis = 1) + ctcvr_prop = fluid.layers.concat(input=[1 - ctcvr_prop_one, ctcvr_prop_one], axis = 1) - loss_ctr = paddle.fluid.layers.cross_entropy(input=ctr_out, label=ctr_clk) - loss_ctcvr = paddle.fluid.layers.cross_entropy(input=ctcvr_prop, label=ctcvr_buy) + loss_ctr = fluid.layers.cross_entropy(input=ctr_out, label=ctr_clk) + loss_ctcvr = fluid.layers.cross_entropy(input=ctcvr_prop, label=ctcvr_buy) cost = loss_ctr + loss_ctcvr avg_cost = fluid.layers.mean(cost) diff --git a/PaddleRec/multi_task/esmm/reader.py b/PaddleRec/multi_task/esmm/reader.py deleted file mode 100644 index 4ccd31c96a420e292dd4145ae2fef49b6a8d7121..0000000000000000000000000000000000000000 --- a/PaddleRec/multi_task/esmm/reader.py +++ /dev/null @@ -1,157 +0,0 @@ -import numpy as np -import pandas as pd -from collections import defaultdict -import args -import os - -def join_data(file1,file2,write_file,sample_size): - sample_list = [] - common_logs = defaultdict(lambda: '') - file = open(write_file, 'w') - - print("begin push sample_list!") - with open(file1,'r') as f: - for i, line in enumerate(f): - try: - sample_list.append(line) - except: - continue - - print("begin push common_logs!") - with open(file2,'r') as f: - for i, line in enumerate(f): - try: - common_feature_index,sample_str = line.strip().split('\t') - common_logs[common_feature_index] = sample_str - except: - continue - - print("begin join data!") - for i, sample in enumerate(sample_list): - try: - common_feature_index,sample_str = sample.strip().split('\t') - common_str = common_logs.get(common_feature_index) - if common_str: - sample = "{0},{1}".format(sample_str, common_str) - else: - sample = "{0}".format(sample_str) - file.write(sample + "\n") - except: - continue - if(i == sample_size): - break - - print("join data successfully!") - - -def read_data(file_name,write_file): - file = open(write_file, 'w') - print("begin to write!") - with open(file_name,'r') as f: - for i, line in enumerate(f): - try: - line = line.strip().split(',') - feat_len = len(line) - feat_lists = [] - #common_feature_index|feat_num|feat_list - if(feat_len == 3): - feat_strs = line[2] - for fstr in feat_strs.split('\x01'): - filed, feat_val = fstr.split('\x02') - feat, val = feat_val.split('\x03') - feat_lists.append('%s:%s' % (filed,feat)) - common_feature = "{0}\t{1}".format(line[0], ','.join(feat_lists)) + "\n" - file.write(common_feature) - - #sample_id|y|z|common_feature_index|feat_num|feat_list - elif(feat_len == 6): - # y=0 & z=1 filter - if(line[1] == '0' and line[2] == '1'): - continue - feat_strs = line[5] - for fstr in 
feat_strs.split('\x01'): - filed, feat_val = fstr.split('\x02') - feat, val = feat_val.split('\x03') - feat_lists.append('%s:%s' % (filed,feat)) - sample = "{0}\t{1},{2},{3},{4}".format(line[3], line[0], line[1], line[2], ','.join(feat_lists)) + "\n" - file.write(sample) - except: - continue - - file.close() - -def recode(file_path,writh_file,vocab_path): - all_feat_id_dict = defaultdict(int) - file1 = open(writh_file[0], 'w') - file2 = open(writh_file[1], 'w') - vocab_file = open(vocab_path, 'w') - id = 0 - with open(file_path[0], "r") as f: - for i, line in enumerate(f): - line = line.strip().split(',') - feat_lists = [] - for elem in line[3:]: - field_id,feat_id = elem.strip().split(':') - if feat_id not in all_feat_id_dict: - id += 1 - all_feat_id_dict[feat_id] = id - feat_lists.append('%s:%s' % (field_id,all_feat_id_dict[feat_id])) - sample = "{0},{1},{2},{3}".format(line[0], line[1], line[2], ','.join(feat_lists)) + "\n" - file1.write(sample) - with open(file_path[1], "r") as f: - for i, line in enumerate(f): - line = line.strip().split(',') - feat_lists = [] - for elem in line[3:]: - field_id,feat_id = elem.strip().split(':') - if feat_id not in all_feat_id_dict: - id += 1 - all_feat_id_dict[feat_id] = id - feat_lists.append('%s:%s' % (field_id,all_feat_id_dict[feat_id])) - sample = "{0},{1},{2},{3}".format(line[0], line[1], line[2], ','.join(feat_lists)) + "\n" - file2.write(sample) - vocab_size =len(all_feat_id_dict) - vocab_file.write(str(vocab_size)) - file1.close() - file2.close() - vocab_file.close() - -if __name__ == "__main__": - args = args.parse_args() - - read_data(args.train_data_path + '/sample_skeleton_train.csv',args.train_data_path + '/skeleton_train.csv') - print("write skeleton_train.csv successfully") - read_data(args.train_data_path + '/common_features_train.csv',args.train_data_path + '/features_train.csv') - print("write features_train.csv successfully") - - skeleton_train_path = args.train_data_path + '/skeleton_train.csv' - features_train_path = args.train_data_path + '/features_train.csv' - - write_file = args.train_data_path + '/train_data.csv' - join_data(skeleton_train_path,features_train_path,write_file,args.train_sample_size) - - os.system('rm -rf ' + skeleton_train_path) - os.system('rm -rf ' + features_train_path) - - - read_data(args.test_data_path + '/sample_skeleton_test.csv',args.test_data_path + '/skeleton_test.csv') - print("write skeleton_est.csv successfully") - read_data(args.test_data_path + '/common_features_test.csv',args.test_data_path + '/features_test.csv') - print("write features_test.csv successfully") - - skeleton_test_path = args.test_data_path + '/skeleton_test.csv' - features_test_path = args.test_data_path + '/features_test.csv' - - write_file = args.test_data_path + '/test_data.csv' - join_data(skeleton_test_path,features_test_path,write_file,args.test_sample_size) - - os.system('rm -rf ' + skeleton_test_path) - os.system('rm -rf ' + features_test_path) - - - file_path = [args.train_data_path + '/train_data.csv', args.test_data_path + '/test_data.csv'] - write_file = [args.train_data_path + '/traindata.csv',args.test_data_path + '/testdata.csv'] - recode(file_path,write_file,args.vocab_path) - - for file in file_path: - os.system('rm -rf ' + file_path) diff --git a/PaddleRec/multi_task/esmm/train.py b/PaddleRec/multi_task/esmm/train.py index 2cb5e781e6248894f99f514703dcb7b0c658dc65..fb3ec67e3f508af1c247de3a6cbbdda79c14338c 100644 --- a/PaddleRec/multi_task/esmm/train.py +++ b/PaddleRec/multi_task/esmm/train.py @@ -10,9 +10,9 @@ 
def train(args, vocab_size, train_data_path): esmm_model = ESMM() inputs = esmm_model.input_data() - dataset, file_list = utils.get_dataset(inputs, train_data_path,args.batch_size,args.cpu_num) + dataset, file_list = utils.get_dataset(inputs, train_data_path, args.batch_size,args.cpu_num) - avg_cost,auc_ctr,auc_ctcvr= esmm_model.net(inputs, vocab_size, args.embed_size) + avg_cost, auc_ctr, auc_ctcvr = esmm_model.net(inputs, vocab_size, args.embed_size) optimizer = fluid.optimizer.Adam() optimizer.minimize(avg_cost) @@ -29,11 +29,11 @@ def train(args, vocab_size, train_data_path): dataset.set_filelist(file_list) exe.train_from_dataset(program=fluid.default_main_program(), dataset=dataset, - fetch_list=[avg_cost,auc_ctr,auc_ctcvr], - fetch_info=['epoch %d batch loss' % (epoch), "auc_ctr","auc_ctcvr"], + fetch_list=[avg_cost, auc_ctr, auc_ctcvr], + fetch_info=['epoch %d batch loss' % (epoch), "auc_ctr", "auc_ctcvr"], print_period=20, debug=False) - model_dir = os.path.join(args.model_dir,'epoch_' + str(epoch + 1), "checkpoint") + model_dir = os.path.join(args.model_dir, 'epoch_' + str(epoch + 1), "checkpoint") main_program = fluid.default_main_program() fluid.io.save(main_program,model_dir) diff --git a/PaddleRec/multi_task/esmm/vocab_size.txt b/PaddleRec/multi_task/esmm/vocab_size.txt new file mode 100644 index 0000000000000000000000000000000000000000..454435205da5ea0710c28b01542df89854f70c68 --- /dev/null +++ b/PaddleRec/multi_task/esmm/vocab_size.txt @@ -0,0 +1 @@ +129590 \ No newline at end of file diff --git a/PaddleRec/rerank/listwise/README.md b/PaddleRec/rerank/listwise/README.md index 360283ef20f5d1adad76fbd67381697a934a6565..cdd2347369b18900363dea76867d9f837ccd5365 100644 --- a/PaddleRec/rerank/listwise/README.md +++ b/PaddleRec/rerank/listwise/README.md @@ -35,9 +35,8 @@ GPU环境 ```sh CUDA_VISIBLE_DEVICES=0 python train.py --use_gpu 1\ #使用gpu - --epochs 3\ --batch_size 32\ - --model_dir './model_dir'\ #模型保存路径 + --model_dir ./model_dir\ #模型保存路径 --embd_dim 16\ #embedding维度 --hidden_size 128\ #biRNN隐层大小 --item_vocab 200\ #item词典大小 @@ -60,9 +59,8 @@ CPU环境 ```sh python train.py --use_gpu 0\ #使用cpu - --epochs 3\ --batch_size 32\ - --model_dir './model_dir'\ #模型保存路径 + --model_dir ./model_dir\ #模型保存路径 --embd_dim 16\ #embedding维度 --hidden_size 128\ #biRNN隐层大小 --item_vocab 200\ #item词典大小 @@ -87,8 +85,8 @@ GPU环境 ```sh CUDA_VISIBLE_DEVICES=0 python infer.py --use_gpu 1 \ #使用gpu - --model_dir './model_dir'\ - --test_epoch 19 #选择哪一轮的模型参数 + --model_dir ./model_dir\ + --test_epoch 1 #选择哪一轮的模型参数 ``` diff --git a/PaddleRec/rerank/listwise/args.py b/PaddleRec/rerank/listwise/args.py index cad45797afa5ac0ea3b5ae4bd3bd83e492964c75..d52791e22ff69dc4b3d6b2a24b8e17f9e168fe98 100644 --- a/PaddleRec/rerank/listwise/args.py +++ b/PaddleRec/rerank/listwise/args.py @@ -24,7 +24,7 @@ def parse_args(): parser = argparse.ArgumentParser(description=__doc__) parser.add_argument("--epochs", type=int, default=20, help="epochs") parser.add_argument("--batch_size", type=int, default=32, help="batch_size") - parser.add_argument("--test_epoch", type=int, default=19, help="test_epoch") + parser.add_argument("--test_epoch", type=int, default=1, help="test_epoch") parser.add_argument('--use_gpu', type=int, default=0, help='whether using gpu') parser.add_argument('--model_dir', type=str, default='./model_dir', help='model_dir') parser.add_argument('--embd_dim', type=int, default=16, help='embd_dim') diff --git a/PaddleRec/rerank/listwise/evaluator.py b/PaddleRec/rerank/listwise/evaluator.py index 
d7d6c09a9950a870985b53a34d1e7b821db77568..a1e9d2781359bcb41750228b7188f190b3e9f302 100644
--- a/PaddleRec/rerank/listwise/evaluator.py
+++ b/PaddleRec/rerank/listwise/evaluator.py
@@ -16,9 +16,6 @@ class BiRNN(object):
     def default_normal_initializer(self, nf=128):
         return fluid.initializer.TruncatedNormal(loc=0.0, scale=np.sqrt(1.0/nf))
 
-    def default_param_clip(self):
-        return fluid.clip.GradientClipByValue(1.0)
-
     def default_regularizer(self):
         return None
 
@@ -27,22 +24,18 @@ class BiRNN(object):
                                size=size,
                                num_flatten_dims=num_flatten_dims,
                                param_attr=fluid.ParamAttr(initializer=self.default_normal_initializer(size),
-                                                          gradient_clip=self.default_param_clip(),
                                                           regularizer=self.default_regularizer()),
                                bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Constant(value=0.0),
-                                                         gradient_clip=self.default_param_clip(),
                                                          regularizer=self.default_regularizer()),
                                act=act,
                                name=name)
 
     def default_embedding(self, data, vocab_size, embed_size):
-        gradient_clip = self.default_param_clip()
        reg = fluid.regularizer.L2Decay(1e-5)  # IMPORTANT, to prevent overfitting.
 
         embed = fluid.embedding(input=data,
                                 size=[vocab_size, embed_size],
                                 param_attr=fluid.ParamAttr(initializer=fluid.initializer.Xavier(),
-                                                           gradient_clip=gradient_clip,
-                                                           regularizer=reg),
+                                                           regularizer=reg),
                                 is_sparse=True)
         return embed
 
@@ -51,10 +44,8 @@ class BiRNN(object):
         return fluid.layers.dynamic_gru(input=data,
                                         size=nf,
                                         param_attr=fluid.ParamAttr(initializer=self.default_normal_initializer(nf),
-                                                                   gradient_clip=self.default_param_clip(),
                                                                    regularizer=self.default_regularizer()),
                                         bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Constant(value=0.0),
-                                                                  gradient_clip=self.default_param_clip(),
                                                                   regularizer=self.default_regularizer()),
                                         is_reverse=is_reverse,
                                         h_0=h_0)
diff --git a/PaddleRec/rerank/listwise/infer.py b/PaddleRec/rerank/listwise/infer.py
index fb01f1b4dc8b4341a16776d5f78cc4d54ca15cf2..5c750d5050855ce580fa635802ab25cd6695bd2a 100644
--- a/PaddleRec/rerank/listwise/infer.py
+++ b/PaddleRec/rerank/listwise/infer.py
@@ -11,6 +11,30 @@
 logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s')
 logger = logging.getLogger("fluid")
 logger.setLevel(logging.INFO)
+user_id = 0
+class Dataset(object):
+    def _reader_creator(self):
+        def reader():
+            global user_id
+            user_slot_name = []
+            for j in range(args.batch_size):
+                user_slot_name.append([user_id])
+                user_id += 1
+
+            item_slot_name = np.random.randint(args.item_vocab, size=(args.batch_size, args.item_len)).tolist()
+            length = [args.item_len]*args.batch_size
+            label = np.random.randint(2, size=(args.batch_size, args.item_len)).tolist()
+            output = []
+            output.append(user_slot_name)
+            output.append(item_slot_name)
+            output.append(length)
+            output.append(label)
+
+            yield output
+        return reader
+    def get_test_data(self):
+        return self._reader_creator()
+
 def set_zero(var_name, scope=fluid.global_scope(), place=fluid.CPUPlace(), param_type="int64"):
     """
     Set tensor of a Variable to zero.
@@ -41,42 +65,24 @@ def run_infer(args):
         for var in auc_states:  # reset auc states
             set_zero(var.name, scope=inference_scope, place=place)
-        # Build a random data set.
-        user_slot_names = []
-        item_slot_names = []
-        lens = []
-        labels = []
-        user_id = 0
-        for i in range(args.sample_size):
-            user_slot_name = []
-            for j in range(args.batch_size):
-                user_slot_name.append(user_id)
-                user_id += 1
-            user_slot_names.append(user_slot_name)
-
-            item_slot_name = np.random.randint(args.item_vocab, size=(args.batch_size, args.item_len))
-            item_slot_names.append(item_slot_name)
-            lenght = np.array([args.item_len]*args.batch_size)
-            lens.append(lenght)
-            label = np.random.randint(2, size=(args.batch_size, args.item_len))
-            labels.append(label)
+        test_data_generator = Dataset()
+        test_reader = fluid.io.batch(test_data_generator.get_test_data(), batch_size=args.batch_size)
+        loader = fluid.io.DataLoader.from_generator(feed_list=inputs, capacity=args.batch_size, iterable=True)
+        loader.set_sample_list_generator(test_reader, places=place)
 
         for i in range(args.sample_size):
-            begin = time.time()
-            loss_val, auc = exe.run(test_program,
-                                    feed={
-                                        "user_slot_names": np.array(user_slot_names[i]).reshape(args.batch_size, 1),
-                                        "item_slot_names": item_slot_names[i].astype('int64'),
-                                        "lens": lens[i].astype('int64'),
-                                        "labels": labels[i].astype('int64')
-                                    },
-                                    return_numpy=True,
-                                    fetch_list=[loss.name, auc_val])
-            end = time.time()
-            logger.info("batch_time: {:.5f}s, loss: {:.5f}, auc: {:.5f}".format(
-                end-begin, float(np.array(loss_val)), float(np.array(auc))))
+            for batch_id, data in enumerate(loader()):
+                begin = time.time()
+                loss_val, auc = exe.run(program=fluid.default_main_program(),
+                                        feed=data,
+                                        fetch_list=[loss.name, auc_val],
+                                        return_numpy=True)
+                end = time.time()
+                logger.info("batch_id: {}, batch_time: {:.5f}s, loss: {:.5f}, auc: {:.5f}".format(
+                    batch_id, end-begin, float(np.array(loss_val)), float(np.array(auc))))
 
 if __name__ == "__main__":
     args = args.parse_args()
+    logger.info("use_gpu: {}, model_dir: {}, test_epoch: {}".format(args.use_gpu, args.model_dir, args.test_epoch))
     run_infer(args)
\ No newline at end of file
diff --git a/PaddleRec/rerank/listwise/infer_cpu.sh b/PaddleRec/rerank/listwise/infer_cpu.sh
index c75466da23724c43ad52b9e44912d231169d65a0..674e021bcec0e1b8a9482453843dca9854ec3fb9 100644
--- a/PaddleRec/rerank/listwise/infer_cpu.sh
+++ b/PaddleRec/rerank/listwise/infer_cpu.sh
@@ -1 +1 @@
-python infer.py --use_gpu 0 --model_dir './model_dir' --test_epoch 19
+python infer.py --use_gpu 0 --model_dir ./model_dir --test_epoch 1
diff --git a/PaddleRec/rerank/listwise/infer_gpu.sh b/PaddleRec/rerank/listwise/infer_gpu.sh
index 0e61306d3827b766d9ec107145e66d4c4a5d03cf..e447d78bef7947786f87f5b96ffcbd695bd13953 100644
--- a/PaddleRec/rerank/listwise/infer_gpu.sh
+++ b/PaddleRec/rerank/listwise/infer_gpu.sh
@@ -1 +1 @@
-CUDA_VISIBLE_DEVICES=0 python infer.py --use_gpu 1 --model_dir './model_dir' --test_epoch 19
+CUDA_VISIBLE_DEVICES=0 python infer.py --use_gpu 1 --model_dir ./model_dir --test_epoch 1
diff --git a/PaddleRec/rerank/listwise/train.py b/PaddleRec/rerank/listwise/train.py
index 314d53a9cf9c4597e205200914e1e3379567e39c..c8b5d3108d7fa7394858e38c77539c4c0b4eafb2 100644
--- a/PaddleRec/rerank/listwise/train.py
+++ b/PaddleRec/rerank/listwise/train.py
@@ -10,6 +10,29 @@ from evaluator import BiRNN
 logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s')
 logger = logging.getLogger("fluid")
 logger.setLevel(logging.INFO)
+user_id = 0
+class Dataset(object):
+    def _reader_creator(self):
+        def reader():
+            global user_id
+            user_slot_name = []
+            for j in range(args.batch_size):
+                user_slot_name.append([user_id])
+                user_id += 1
+
+            item_slot_name = np.random.randint(args.item_vocab, size=(args.batch_size, args.item_len)).tolist()
+            length = [args.item_len]*args.batch_size
+            label = np.random.randint(2, size=(args.batch_size, args.item_len)).tolist()
+            output = []
+            output.append(user_slot_name)
+            output.append(item_slot_name)
+            output.append(length)
+            output.append(label)
+
+            yield output
+        return reader
+    def get_train_data(self):
+        return self._reader_creator()
 
 
 def train(args):
@@ -23,48 +46,32 @@ def train(args):
     place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()
     exe = fluid.Executor(place)
     exe.run(fluid.default_startup_program())
+
+    train_data_generator = Dataset()
+    train_reader = fluid.io.batch(train_data_generator.get_train_data(), batch_size=args.batch_size)
+    loader = fluid.io.DataLoader.from_generator(feed_list=inputs, capacity=args.batch_size, iterable=True)
+    loader.set_sample_list_generator(train_reader, places=place)
 
-    # Build a random data set.
-    user_slot_names = []
-    item_slot_names = []
-    lens = []
-    labels = []
-    user_id = 0
     for i in range(args.sample_size):
-        user_slot_name = []
-        for j in range(args.batch_size):
-            user_slot_name.append(user_id)
-            user_id += 1
-        user_slot_names.append(user_slot_name)
-
-        item_slot_name = np.random.randint(args.item_vocab, size=(args.batch_size, args.item_len))
-        item_slot_names.append(item_slot_name)
-        lenght = np.array([args.item_len]*args.batch_size)
-        lens.append(lenght)
-        label = np.random.randint(2, size=(args.batch_size, args.item_len))
-        labels.append(label)
-
-    for epoch in range(args.epochs):
-        for i in range(args.sample_size):
+        for batch_id, data in enumerate(loader()):
             begin = time.time()
-            loss_val, auc = exe.run(fluid.default_main_program(),
-                                    feed={
-                                        "user_slot_names": np.array(user_slot_names[i]).reshape(args.batch_size, 1),
-                                        "item_slot_names": item_slot_names[i].astype('int64'),
-                                        "lens": lens[i].astype('int64'),
-                                        "labels": labels[i].astype('int64')
-                                    },
-                                    return_numpy=True,
-                                    fetch_list=[loss.name, auc_val])
+            loss_val, auc = exe.run(program=fluid.default_main_program(),
+                                    feed=data,
+                                    fetch_list=[loss.name, auc_val],
+                                    return_numpy=True)
             end = time.time()
-            logger.info("epoch_id: {}, batch_time: {:.5f}s, loss: {:.5f}, auc: {:.5f}".format(
-                epoch, end-begin, float(np.array(loss_val)), float(np.array(auc))))
+            logger.info("batch_id: {}, batch_time: {:.5f}s, loss: {:.5f}, auc: {:.5f}".format(
+                batch_id, end-begin, float(np.array(loss_val)), float(np.array(auc))))
 
-        #save model
-        model_dir = os.path.join(args.model_dir, 'epoch_' + str(epoch + 1), "checkpoint")
-        main_program = fluid.default_main_program()
-        fluid.save(main_program, model_dir)
+        #save model
+        model_dir = os.path.join(args.model_dir, 'epoch_' + str(1), "checkpoint")
+        main_program = fluid.default_main_program()
+        fluid.save(main_program, model_dir)
 
 if __name__ == "__main__":
     args = args.parse_args()
+    logger.info("use_gpu: {}, batch_size: {}, model_dir: {}, embd_dim: {}, hidden_size: {}, item_vocab: {}, user_vocab: {},\
+        item_len: {}, sample_size: {}, base_lr: {}".format(args.use_gpu, args.batch_size, args.model_dir, args.embd_dim,
+        args.hidden_size, args.item_vocab, args.user_vocab, args.item_len, args.sample_size, args.base_lr))
+
     train(args)
\ No newline at end of file
diff --git a/PaddleRec/rerank/listwise/train_cpu.sh b/PaddleRec/rerank/listwise/train_cpu.sh
index c09c47512aa4114142d3ebbe89323bb5cd626770..d792345ac0213b1ea16ec168cc2241ec95835e32 100644
--- a/PaddleRec/rerank/listwise/train_cpu.sh
+++ b/PaddleRec/rerank/listwise/train_cpu.sh
@@ -1 +1 @@
-CUDA_VISIBLE_DEVICES=0 python train.py --use_gpu 0 --epochs 20 --batch_size 32 --model_dir './model_dir' --embd_dim 16 --hidden_size 128 --item_vocab 200 --user_vocab 200 --item_len 5 --sample_size 100 --base_lr 0.01 +CUDA_VISIBLE_DEVICES=0 python train.py --use_gpu 0 --epochs 20 --batch_size 32 --model_dir ./model_dir --embd_dim 16 --hidden_size 128 --item_vocab 200 --user_vocab 200 --item_len 5 --sample_size 100 --base_lr 0.01 diff --git a/PaddleRec/rerank/listwise/train_gpu.sh b/PaddleRec/rerank/listwise/train_gpu.sh index 2bee22a3aff757a5558efe941fe63fc4687e07c5..c87aa5d311a663fe856a9afae163fd9266c567e0 100644 --- a/PaddleRec/rerank/listwise/train_gpu.sh +++ b/PaddleRec/rerank/listwise/train_gpu.sh @@ -1 +1 @@ -CUDA_VISIBLE_DEVICES=0 python train.py --use_gpu 1 --epochs 20 --batch_size 32 --model_dir './model_dir' --embd_dim 16 --hidden_size 128 --item_vocab 200 --user_vocab 200 --item_len 5 --sample_size 100 --base_lr 0.01 +CUDA_VISIBLE_DEVICES=0 python train.py --use_gpu 1 --epochs 20 --batch_size 32 --model_dir ./model_dir --embd_dim 16 --hidden_size 128 --item_vocab 200 --user_vocab 200 --item_len 5 --sample_size 100 --base_lr 0.01 diff --git a/PaddleRec/youbube_dnn/README.md b/PaddleRec/youtube_dnn/README.md similarity index 100% rename from PaddleRec/youbube_dnn/README.md rename to PaddleRec/youtube_dnn/README.md diff --git a/PaddleRec/youbube_dnn/args.py b/PaddleRec/youtube_dnn/args.py similarity index 100% rename from PaddleRec/youbube_dnn/args.py rename to PaddleRec/youtube_dnn/args.py diff --git a/PaddleRec/youbube_dnn/get_topk.py b/PaddleRec/youtube_dnn/get_topk.py similarity index 93% rename from PaddleRec/youbube_dnn/get_topk.py rename to PaddleRec/youtube_dnn/get_topk.py index 69025adb79d8c33db2278a77667b523a1de8eb64..47aa4cec92b0e37ae4ba65972e80e2092ff3ea09 100644 --- a/PaddleRec/youbube_dnn/get_topk.py +++ b/PaddleRec/youtube_dnn/get_topk.py @@ -12,7 +12,7 @@ def cos_sim(vector_a, vector_b): sim = 0.5 + 0.5 * cos return sim -def get_topK(args, K): +def get_topK(args): video_vec = pd.read_csv(args.video_vec_path, header=None) user_vec = pd.read_csv(args.user_vec_path, header=None) @@ -24,11 +24,11 @@ def get_topK(args, K): tmp_list=copy.deepcopy(user_video_sim_list) tmp_list.sort() - max_sim_index=[user_video_sim_list.index(one) for one in tmp_list[::-1][:K]] + max_sim_index=[user_video_sim_list.index(one) for one in tmp_list[::-1][:args.topk]] print("user:{0}, top K videos:{1}".format(i, max_sim_index)) user_video_sim_list = [] if __name__ == "__main__": args = args.parse_args() - get_topK(args, 5) \ No newline at end of file + get_topK(args) \ No newline at end of file diff --git a/PaddleRec/youbube_dnn/infer.py b/PaddleRec/youtube_dnn/infer.py similarity index 100% rename from PaddleRec/youbube_dnn/infer.py rename to PaddleRec/youtube_dnn/infer.py diff --git a/PaddleRec/youbube_dnn/infer_cpu.sh b/PaddleRec/youtube_dnn/infer_cpu.sh similarity index 100% rename from PaddleRec/youbube_dnn/infer_cpu.sh rename to PaddleRec/youtube_dnn/infer_cpu.sh diff --git a/PaddleRec/youbube_dnn/infer_gpu.sh b/PaddleRec/youtube_dnn/infer_gpu.sh similarity index 100% rename from PaddleRec/youbube_dnn/infer_gpu.sh rename to PaddleRec/youtube_dnn/infer_gpu.sh diff --git a/PaddleRec/youbube_dnn/rec_topk.sh b/PaddleRec/youtube_dnn/rec_topk.sh similarity index 100% rename from PaddleRec/youbube_dnn/rec_topk.sh rename to PaddleRec/youtube_dnn/rec_topk.sh diff --git a/PaddleRec/youbube_dnn/train.py b/PaddleRec/youtube_dnn/train.py similarity index 100% rename from 
PaddleRec/youbube_dnn/train.py rename to PaddleRec/youtube_dnn/train.py diff --git a/PaddleRec/youbube_dnn/train_cpu.sh b/PaddleRec/youtube_dnn/train_cpu.sh similarity index 100% rename from PaddleRec/youbube_dnn/train_cpu.sh rename to PaddleRec/youtube_dnn/train_cpu.sh diff --git a/PaddleRec/youbube_dnn/train_gpu.sh b/PaddleRec/youtube_dnn/train_gpu.sh similarity index 100% rename from PaddleRec/youbube_dnn/train_gpu.sh rename to PaddleRec/youtube_dnn/train_gpu.sh diff --git a/PaddleRec/youbube_dnn/youtubednn.py b/PaddleRec/youtube_dnn/youtubednn.py similarity index 100% rename from PaddleRec/youbube_dnn/youtubednn.py rename to PaddleRec/youtube_dnn/youtubednn.py